chadlinden's picture
Upload folder using huggingface_hub
8fd238c verified
raw
history blame
37.3 kB
import glob
import json
import os
import pickle
import subprocess
import sys
from collections import defaultdict
from pathlib import Path
import numpy as np
import pytest
import fsspec
from fsspec.implementations.ftp import FTPFileSystem
from fsspec.implementations.http import HTTPFileSystem
from fsspec.implementations.local import LocalFileSystem
from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
# Entries ("name", "type", "size") describing the file tree used by the glob
# tests in this module.  The tuple is passed as ``fs_content`` to DummyTestFS
# (see the ``glob_fs`` fixture) and its "file" entries are touched on disk by
# the ``glob_files_folder`` fixture.  The ``special_chars`` directory holds
# file names containing characters such as ``\``, ``+``, ``(`` or ``$``.
PATHS_FOR_GLOB_TESTS = (
{"name": "test0.json", "type": "file", "size": 100},
{"name": "test0.yaml", "type": "file", "size": 100},
{"name": "test0", "type": "directory", "size": 0},
{"name": "test0/test0.json", "type": "file", "size": 100},
{"name": "test0/test0.yaml", "type": "file", "size": 100},
{"name": "test0/test1", "type": "directory", "size": 0},
{"name": "test0/test1/test0.json", "type": "file", "size": 100},
{"name": "test0/test1/test0.yaml", "type": "file", "size": 100},
{"name": "test0/test1/test2", "type": "directory", "size": 0},
{"name": "test0/test1/test2/test0.json", "type": "file", "size": 100},
{"name": "test0/test1/test2/test0.yaml", "type": "file", "size": 100},
{"name": "test0/test2", "type": "directory", "size": 0},
{"name": "test0/test2/test0.json", "type": "file", "size": 100},
{"name": "test0/test2/test0.yaml", "type": "file", "size": 100},
{"name": "test0/test2/test1", "type": "directory", "size": 0},
{"name": "test0/test2/test1/test0.json", "type": "file", "size": 100},
{"name": "test0/test2/test1/test0.yaml", "type": "file", "size": 100},
{"name": "test0/test2/test1/test3", "type": "directory", "size": 0},
{"name": "test0/test2/test1/test3/test0.json", "type": "file", "size": 100},
{"name": "test0/test2/test1/test3/test0.yaml", "type": "file", "size": 100},
{"name": "test1.json", "type": "file", "size": 100},
{"name": "test1.yaml", "type": "file", "size": 100},
{"name": "test1", "type": "directory", "size": 0},
{"name": "test1/test0.json", "type": "file", "size": 100},
{"name": "test1/test0.yaml", "type": "file", "size": 100},
{"name": "test1/test0", "type": "directory", "size": 0},
{"name": "test1/test0/test0.json", "type": "file", "size": 100},
{"name": "test1/test0/test0.yaml", "type": "file", "size": 100},
{"name": "special_chars", "type": "directory", "size": 0},
{"name": "special_chars/f\\oo.txt", "type": "file", "size": 100},
{"name": "special_chars/f.oo.txt", "type": "file", "size": 100},
{"name": "special_chars/f+oo.txt", "type": "file", "size": 100},
{"name": "special_chars/f(oo.txt", "type": "file", "size": 100},
{"name": "special_chars/f)oo.txt", "type": "file", "size": 100},
{"name": "special_chars/f|oo.txt", "type": "file", "size": 100},
{"name": "special_chars/f^oo.txt", "type": "file", "size": 100},
{"name": "special_chars/f$oo.txt", "type": "file", "size": 100},
{"name": "special_chars/f{oo.txt", "type": "file", "size": 100},
{"name": "special_chars/f}oo.txt", "type": "file", "size": 100},
)
# Parametrization table shared by the glob tests in this module: "argnames"
# and "argvalues" are handed to ``pytest.mark.parametrize``.  Each argvalue
# pairs a glob pattern with the list of paths from PATHS_FOR_GLOB_TESTS it is
# expected to match.  Results are compared after ``_clean_paths`` (which
# sorts), so the order of the expected lists is not significant.
GLOB_POSIX_TESTS = {
"argnames": ("path", "expected"),
"argvalues": [
("nonexistent", []),
("test0.json", ["test0.json"]),
("test0", ["test0"]),
("test0/", ["test0"]),
("test1/test0.yaml", ["test1/test0.yaml"]),
("test0/test[1-2]", ["test0/test1", "test0/test2"]),
("test0/test[1-2]/", ["test0/test1", "test0/test2"]),
(
"test0/test[1-2]/*",
[
"test0/test1/test0.json",
"test0/test1/test0.yaml",
"test0/test1/test2",
"test0/test2/test0.json",
"test0/test2/test0.yaml",
"test0/test2/test1",
],
),
(
"test0/test[1-2]/*.[j]*",
["test0/test1/test0.json", "test0/test2/test0.json"],
),
("special_chars/f\\oo.*", ["special_chars/f\\oo.txt"]),
("special_chars/f.oo.*", ["special_chars/f.oo.txt"]),
("special_chars/f+oo.*", ["special_chars/f+oo.txt"]),
("special_chars/f(oo.*", ["special_chars/f(oo.txt"]),
("special_chars/f)oo.*", ["special_chars/f)oo.txt"]),
("special_chars/f|oo.*", ["special_chars/f|oo.txt"]),
("special_chars/f^oo.*", ["special_chars/f^oo.txt"]),
("special_chars/f$oo.*", ["special_chars/f$oo.txt"]),
("special_chars/f{oo.*", ["special_chars/f{oo.txt"]),
("special_chars/f}oo.*", ["special_chars/f}oo.txt"]),
(
"*",
[
"special_chars",
"test0.json",
"test0.yaml",
"test0",
"test1.json",
"test1.yaml",
"test1",
],
),
("*.yaml", ["test0.yaml", "test1.yaml"]),
(
"**",
[
"special_chars",
"special_chars/f$oo.txt",
"special_chars/f(oo.txt",
"special_chars/f)oo.txt",
"special_chars/f+oo.txt",
"special_chars/f.oo.txt",
"special_chars/f\\oo.txt",
"special_chars/f^oo.txt",
"special_chars/f{oo.txt",
"special_chars/f|oo.txt",
"special_chars/f}oo.txt",
"test0.json",
"test0.yaml",
"test0",
"test0/test0.json",
"test0/test0.yaml",
"test0/test1",
"test0/test1/test0.json",
"test0/test1/test0.yaml",
"test0/test1/test2",
"test0/test1/test2/test0.json",
"test0/test1/test2/test0.yaml",
"test0/test2",
"test0/test2/test0.json",
"test0/test2/test0.yaml",
"test0/test2/test1",
"test0/test2/test1/test0.json",
"test0/test2/test1/test0.yaml",
"test0/test2/test1/test3",
"test0/test2/test1/test3/test0.json",
"test0/test2/test1/test3/test0.yaml",
"test1.json",
"test1.yaml",
"test1",
"test1/test0.json",
"test1/test0.yaml",
"test1/test0",
"test1/test0/test0.json",
"test1/test0/test0.yaml",
],
),
("*/", ["special_chars", "test0", "test1"]),
(
"**/",
[
"special_chars",
"test0",
"test0/test1",
"test0/test1/test2",
"test0/test2",
"test0/test2/test1",
"test0/test2/test1/test3",
"test1",
"test1/test0",
],
),
("*/*.yaml", ["test0/test0.yaml", "test1/test0.yaml"]),
(
"**/*.yaml",
[
"test0.yaml",
"test0/test0.yaml",
"test0/test1/test0.yaml",
"test0/test1/test2/test0.yaml",
"test0/test2/test0.yaml",
"test0/test2/test1/test0.yaml",
"test0/test2/test1/test3/test0.yaml",
"test1.yaml",
"test1/test0.yaml",
"test1/test0/test0.yaml",
],
),
(
"*/test1/*",
["test0/test1/test0.json", "test0/test1/test0.yaml", "test0/test1/test2"],
),
("*/test1/*.yaml", ["test0/test1/test0.yaml"]),
(
"**/test1/*",
[
"test0/test1/test0.json",
"test0/test1/test0.yaml",
"test0/test1/test2",
"test0/test2/test1/test0.json",
"test0/test2/test1/test0.yaml",
"test0/test2/test1/test3",
"test1/test0.json",
"test1/test0.yaml",
"test1/test0",
],
),
(
"**/test1/*.yaml",
[
"test0/test1/test0.yaml",
"test0/test2/test1/test0.yaml",
"test1/test0.yaml",
],
),
("*/test1/*/", ["test0/test1/test2"]),
(
"**/test1/*/",
["test0/test1/test2", "test0/test2/test1/test3", "test1/test0"],
),
(
"*/test1/**",
[
"test0/test1",
"test0/test1/test0.json",
"test0/test1/test0.yaml",
"test0/test1/test2",
"test0/test1/test2/test0.json",
"test0/test1/test2/test0.yaml",
],
),
(
"**/test1/**",
[
"test0/test1",
"test0/test1/test0.json",
"test0/test1/test0.yaml",
"test0/test1/test2",
"test0/test1/test2/test0.json",
"test0/test1/test2/test0.yaml",
"test0/test2/test1",
"test0/test2/test1/test0.json",
"test0/test2/test1/test0.yaml",
"test0/test2/test1/test3",
"test0/test2/test1/test3/test0.json",
"test0/test2/test1/test3/test0.yaml",
"test1",
"test1/test0.json",
"test1/test0.yaml",
"test1/test0",
"test1/test0/test0.json",
"test1/test0/test0.yaml",
],
),
("*/test1/**/", ["test0/test1", "test0/test1/test2"]),
(
"**/test1/**/",
[
"test0/test1",
"test0/test1/test2",
"test0/test2/test1",
"test0/test2/test1/test3",
"test1",
"test1/test0",
],
),
(
"test0/*",
["test0/test0.json", "test0/test0.yaml", "test0/test1", "test0/test2"],
),
("test0/*.yaml", ["test0/test0.yaml"]),
(
"test0/**",
[
"test0",
"test0/test0.json",
"test0/test0.yaml",
"test0/test1",
"test0/test1/test0.json",
"test0/test1/test0.yaml",
"test0/test1/test2",
"test0/test1/test2/test0.json",
"test0/test1/test2/test0.yaml",
"test0/test2",
"test0/test2/test0.json",
"test0/test2/test0.yaml",
"test0/test2/test1",
"test0/test2/test1/test0.json",
"test0/test2/test1/test0.yaml",
"test0/test2/test1/test3",
"test0/test2/test1/test3/test0.json",
"test0/test2/test1/test3/test0.yaml",
],
),
("test0/*/", ["test0/test1", "test0/test2"]),
(
"test0/**/",
[
"test0",
"test0/test1",
"test0/test1/test2",
"test0/test2",
"test0/test2/test1",
"test0/test2/test1/test3",
],
),
("test0/*/*.yaml", ["test0/test1/test0.yaml", "test0/test2/test0.yaml"]),
(
"test0/**/*.yaml",
[
"test0/test0.yaml",
"test0/test1/test0.yaml",
"test0/test1/test2/test0.yaml",
"test0/test2/test0.yaml",
"test0/test2/test1/test0.yaml",
"test0/test2/test1/test3/test0.yaml",
],
),
(
"test0/*/test1/*",
[
"test0/test2/test1/test0.json",
"test0/test2/test1/test0.yaml",
"test0/test2/test1/test3",
],
),
("test0/*/test1/*.yaml", ["test0/test2/test1/test0.yaml"]),
(
"test0/**/test1/*",
[
"test0/test1/test0.json",
"test0/test1/test0.yaml",
"test0/test1/test2",
"test0/test2/test1/test0.json",
"test0/test2/test1/test0.yaml",
"test0/test2/test1/test3",
],
),
(
"test0/**/test1/*.yaml",
["test0/test1/test0.yaml", "test0/test2/test1/test0.yaml"],
),
("test0/*/test1/*/", ["test0/test2/test1/test3"]),
("test0/**/test1/*/", ["test0/test1/test2", "test0/test2/test1/test3"]),
(
"test0/*/test1/**",
[
"test0/test2/test1",
"test0/test2/test1/test0.json",
"test0/test2/test1/test0.yaml",
"test0/test2/test1/test3",
"test0/test2/test1/test3/test0.json",
"test0/test2/test1/test3/test0.yaml",
],
),
(
"test0/**/test1/**",
[
"test0/test1",
"test0/test1/test0.json",
"test0/test1/test0.yaml",
"test0/test1/test2",
"test0/test1/test2/test0.json",
"test0/test1/test2/test0.yaml",
"test0/test2/test1",
"test0/test2/test1/test0.json",
"test0/test2/test1/test0.yaml",
"test0/test2/test1/test3",
"test0/test2/test1/test3/test0.json",
"test0/test2/test1/test3/test0.yaml",
],
),
("test0/*/test1/**/", ["test0/test2/test1", "test0/test2/test1/test3"]),
(
"test0/**/test1/**/",
[
"test0/test1",
"test0/test1/test2",
"test0/test2/test1",
"test0/test2/test1/test3",
],
),
],
}
class DummyTestFS(AbstractFileSystem):
    """Mock filesystem (protocol ``mock``) backed by a static tuple of entries.

    The listing comes from ``_fs_contents``; a different set of entries can
    be supplied per instance through ``fs_content``.
    """

    protocol = "mock"
    _file_class = AbstractBufferedFile
    _fs_contents = (
        {"name": "top_level", "type": "directory"},
        {"name": "top_level/second_level", "type": "directory"},
        {"name": "top_level/second_level/date=2019-10-01", "type": "directory"},
        {
            "name": "top_level/second_level/date=2019-10-01/a.parquet",
            "type": "file",
            "size": 100,
        },
        {
            "name": "top_level/second_level/date=2019-10-01/b.parquet",
            "type": "file",
            "size": 100,
        },
        {"name": "top_level/second_level/date=2019-10-02", "type": "directory"},
        {
            "name": "top_level/second_level/date=2019-10-02/a.parquet",
            "type": "file",
            "size": 100,
        },
        {"name": "top_level/second_level/date=2019-10-04", "type": "directory"},
        {
            "name": "top_level/second_level/date=2019-10-04/a.parquet",
            "type": "file",
            "size": 100,
        },
        {"name": "misc", "type": "directory"},
        {"name": "misc/foo.txt", "type": "file", "size": 100},
    )

    def __init__(self, fs_content=None, **kwargs):
        """Optionally replace the class-level entries for this instance."""
        if fs_content is not None:
            self._fs_contents = fs_content
        super().__init__(**kwargs)

    def __getitem__(self, name):
        """Return the entry whose "name" equals *name*, else raise IndexError."""
        found = next(
            (entry for entry in self._fs_contents if entry["name"] == name), None
        )
        if found is None:
            raise IndexError(f"{name} not found!")
        return found

    def ls(self, path, detail=True, refresh=True, **kwargs):
        """List the entries whose parent is *path*.

        ``strip_proto=False`` (taken from ``kwargs``) skips protocol
        stripping.  With ``refresh=False`` the directory cache is consulted
        first; an empty or missing cache result falls back to a fresh listing,
        which is then stored in ``dircache``.
        """
        if kwargs.pop("strip_proto", True):
            path = self._strip_protocol(path)

        listing = None if refresh else self._ls_from_cache(path)
        if not listing:
            listing = sorted(
                (
                    entry
                    for entry in self._fs_contents
                    if path == self._parent(entry["name"])
                ),
                key=lambda entry: entry["name"],
            )
            self.dircache[path.rstrip("/")] = listing

        return listing if detail else [entry["name"] for entry in listing]

    @classmethod
    def get_test_paths(cls, start_with=""):
        """Helper to return directory and file paths with no details"""
        return [
            entry["name"]
            for entry in cls._fs_contents
            if entry["name"].startswith(start_with)
        ]

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        """Instantiate ``_file_class`` for *path*, forwarding all arguments."""
        file_cls = self._file_class
        return file_cls(
            self,
            path,
            mode,
            block_size,
            autocommit,
            cache_options=cache_options,
            **kwargs,
        )
@pytest.mark.parametrize(
    ("test_paths", "recursive", "maxdepth", "expected"),
    [
        (
            (
                "top_level/second_level",
                "top_level/sec*",
                "top_level/sec*vel",
                "top_level/*",
            ),
            True,
            None,
            [
                "top_level/second_level",
                "top_level/second_level/date=2019-10-01",
                "top_level/second_level/date=2019-10-01/a.parquet",
                "top_level/second_level/date=2019-10-01/b.parquet",
                "top_level/second_level/date=2019-10-02",
                "top_level/second_level/date=2019-10-02/a.parquet",
                "top_level/second_level/date=2019-10-04",
                "top_level/second_level/date=2019-10-04/a.parquet",
            ],
        ),
        (
            (
                "top_level/second_level",
                "top_level/sec*",
                "top_level/sec*vel",
                "top_level/*",
            ),
            False,
            None,
            [
                "top_level/second_level",
            ],
        ),
        (
            ("top_level/second_level",),
            True,
            1,
            [
                "top_level/second_level",
                "top_level/second_level/date=2019-10-01",
                "top_level/second_level/date=2019-10-02",
                "top_level/second_level/date=2019-10-04",
            ],
        ),
        (
            ("top_level/second_level",),
            True,
            2,
            [
                "top_level/second_level",
                "top_level/second_level/date=2019-10-01",
                "top_level/second_level/date=2019-10-01/a.parquet",
                "top_level/second_level/date=2019-10-01/b.parquet",
                "top_level/second_level/date=2019-10-02",
                "top_level/second_level/date=2019-10-02/a.parquet",
                "top_level/second_level/date=2019-10-04",
                "top_level/second_level/date=2019-10-04/a.parquet",
            ],
        ),
        (
            ("top_level/*", "top_level/sec*", "top_level/sec*vel", "top_level/*"),
            True,
            1,
            ["top_level/second_level"],
        ),
        (
            ("top_level/*", "top_level/sec*", "top_level/sec*vel", "top_level/*"),
            True,
            2,
            [
                "top_level/second_level",
                "top_level/second_level/date=2019-10-01",
                "top_level/second_level/date=2019-10-02",
                "top_level/second_level/date=2019-10-04",
            ],
        ),
        (
            ("top_level/**",),
            False,
            None,
            [
                "top_level",
                "top_level/second_level",
                "top_level/second_level/date=2019-10-01",
                "top_level/second_level/date=2019-10-01/a.parquet",
                "top_level/second_level/date=2019-10-01/b.parquet",
                "top_level/second_level/date=2019-10-02",
                "top_level/second_level/date=2019-10-02/a.parquet",
                "top_level/second_level/date=2019-10-04",
                "top_level/second_level/date=2019-10-04/a.parquet",
            ],
        ),
        (
            ("top_level/**",),
            True,
            None,
            [
                "top_level",
                "top_level/second_level",
                "top_level/second_level/date=2019-10-01",
                "top_level/second_level/date=2019-10-01/a.parquet",
                "top_level/second_level/date=2019-10-01/b.parquet",
                "top_level/second_level/date=2019-10-02",
                "top_level/second_level/date=2019-10-02/a.parquet",
                "top_level/second_level/date=2019-10-04",
                "top_level/second_level/date=2019-10-04/a.parquet",
            ],
        ),
        (("top_level/**",), True, 1, ["top_level", "top_level/second_level"]),
        (
            ("top_level/**",),
            True,
            2,
            [
                "top_level",
                "top_level/second_level",
                "top_level/second_level/date=2019-10-01",
                "top_level/second_level/date=2019-10-01/a.parquet",
                "top_level/second_level/date=2019-10-01/b.parquet",
                "top_level/second_level/date=2019-10-02",
                "top_level/second_level/date=2019-10-02/a.parquet",
                "top_level/second_level/date=2019-10-04",
                "top_level/second_level/date=2019-10-04/a.parquet",
            ],
        ),
        (
            ("top_level/**/a.*",),
            False,
            None,
            [
                "top_level/second_level/date=2019-10-01/a.parquet",
                "top_level/second_level/date=2019-10-02/a.parquet",
                "top_level/second_level/date=2019-10-04/a.parquet",
            ],
        ),
        (
            ("top_level/**/a.*",),
            True,
            None,
            [
                "top_level/second_level/date=2019-10-01/a.parquet",
                "top_level/second_level/date=2019-10-02/a.parquet",
                "top_level/second_level/date=2019-10-04/a.parquet",
            ],
        ),
        (
            ("top_level/**/second_level/date=2019-10-02",),
            False,
            2,
            [
                "top_level/second_level/date=2019-10-02",
            ],
        ),
        (
            ("top_level/**/second_level/date=2019-10-02",),
            True,
            2,
            [
                "top_level/second_level/date=2019-10-02",
                "top_level/second_level/date=2019-10-02/a.parquet",
            ],
        ),
        (("misc/foo.txt", "misc/*.txt"), False, None, ["misc/foo.txt"]),
        (("misc/foo.txt", "misc/*.txt"), True, None, ["misc/foo.txt"]),
        (
            ("",),
            False,
            None,
            [DummyTestFS.root_marker],
        ),
        (
            ("",),
            True,
            None,
            DummyTestFS.get_test_paths() + [DummyTestFS.root_marker],
        ),
        (
            (Path("misc/foo.txt"),),
            False,
            None,
            [f"misc{os.sep}foo.txt"],
        ),
    ],
)
def test_expand_path(test_paths, recursive, maxdepth, expected):
    """Expand every query alone and then all of them together.

    Each individual query, as well as the combined list of queries, must
    yield the same set of expanded paths.
    """
    fs = DummyTestFS()
    wanted = sorted(expected)

    # every query on its own
    for query in test_paths:
        expanded = fs.expand_path(query, recursive=recursive, maxdepth=maxdepth)
        assert sorted(expanded) == wanted

    # all queries in a single call
    combined = fs.expand_path(
        list(test_paths), recursive=recursive, maxdepth=maxdepth
    )
    assert sorted(combined) == wanted
def test_expand_paths_with_wrong_args():
    """expand_path is expected to raise for maxdepth=0 and for unmatched paths."""
    fs = DummyTestFS()

    # maxdepth=0 is expected to be rejected, with or without recursive=True
    for bad_kwargs in ({"recursive": True, "maxdepth": 0}, {"maxdepth": 0}):
        with pytest.raises(ValueError):
            fs.expand_path("top_level", **bad_kwargs)

    # patterns for which no path is expected to be found
    unmatched = (
        ("top_level/**/second_level/date=2019-10-02", {"maxdepth": 1}),
        ("nonexistent/*", {}),
    )
    for pattern, extra in unmatched:
        with pytest.raises(FileNotFoundError):
            fs.expand_path(pattern, **extra)
@pytest.mark.xfail
def test_find():
    """Test .find() method on debian server (ftp, https) with constant folder"""
    ftp_fs = FTPFileSystem("ftp.fau.de")
    filenames_ftp = ftp_fs.find("ftp://ftp.fau.de/debian-cd/current/amd64/log/success")
    assert filenames_ftp

    http_fs = HTTPFileSystem()
    filenames_http = http_fs.find(
        "https://ftp.fau.de/debian-cd/current/amd64/log/success"
    )

    # every file name found over FTP must also be found over HTTP
    http_basenames = [name.rsplit("/", 1)[-1] for name in filenames_http]
    assert all(name.rsplit("/", 1)[-1] in http_basenames for name in filenames_ftp)
def test_find_details():
    """find(detail=True) must map each found name to what info() reports."""
    fs = DummyTestFS()
    names = fs.find("/")
    infos = fs.find("/", detail=True)

    for name in names:
        expected_info = fs.info(name)
        assert infos[name] == expected_info
def test_find_file():
    """find() on a single file returns just that file."""
    target = "misc/foo.txt"
    fs = DummyTestFS()

    plain = fs.find(target)
    assert plain == [target]

    detailed = fs.find(target, detail=True)
    assert detailed == {target: {}}
def test_cache():
    """Instances built with the same arguments are shared via the class cache."""
    first = DummyTestFS()
    second = DummyTestFS()
    assert first is second
    assert DummyTestFS.current() is first
    assert len(first._cache) == 1

    del second
    assert len(first._cache) == 1

    del first
    # the class keeps an internal reference, so the instance is not collected
    assert len(DummyTestFS._cache) == 1

    DummyTestFS.clear_instance_cache()
    assert len(DummyTestFS._cache) == 0
def test_current():
    """current() follows the most recently requested instance."""
    plain_fs = DummyTestFS()
    arg_fs = DummyTestFS(arg=1)
    assert plain_fs is not arg_fs
    assert DummyTestFS.current() is arg_fs

    # asking for the argument-less instance again makes it current once more
    DummyTestFS()
    assert DummyTestFS.current() is plain_fs
def test_alias():
    """Passing add_aliases is expected to emit a FutureWarning."""
    expect_warning = pytest.warns(FutureWarning, match="add_aliases")
    with expect_warning:
        DummyTestFS(add_aliases=True)
def test_add_docs_warns():
    """Passing add_docs is expected to emit a FutureWarning."""
    expect_warning = pytest.warns(FutureWarning, match="add_docs")
    with expect_warning:
        AbstractFileSystem(add_docs=True)
def test_cache_options():
    """cache_options reach the cache, both directly and through fs.open."""
    fs = DummyTestFS()
    target = "misc/foo.txt"

    default_file = AbstractBufferedFile(fs, target, cache_type="bytes")
    assert default_file.cache.trim

    # TODO: dummy buffered file
    untrimmed_file = AbstractBufferedFile(
        fs, target, cache_type="bytes", cache_options={"trim": False}
    )
    assert untrimmed_file.cache.trim is False

    opened_file = fs.open(target, cache_type="bytes", cache_options={"trim": False})
    assert opened_file.cache.trim is False
def test_trim_kwarg_warns():
    """A bare ``trim`` keyword is expected to warn and point at cache_options."""
    fs = DummyTestFS()
    expect_warning = pytest.warns(FutureWarning, match="cache_options")
    with expect_warning:
        AbstractBufferedFile(fs, "misc/foo.txt", cache_type="bytes", trim=False)
def tests_file_open_error(monkeypatch):
    """Errors raised while initiating or uploading must reach the caller."""

    class InitiateError(ValueError):
        pass

    class UploadError(ValueError):
        pass

    class DummyBufferedFile(AbstractBufferedFile):
        # flipped to True by the test to let the initiation step succeed
        can_initiate = False

        def _initiate_upload(self):
            if self.can_initiate:
                return
            raise InitiateError

        def _upload_chunk(self, final=False):
            raise UploadError

    monkeypatch.setattr(DummyTestFS, "_file_class", DummyBufferedFile)
    fs = DummyTestFS()

    # initiation is refused: the write fails with InitiateError
    with pytest.raises(InitiateError):
        with fs.open("misc/foo.txt", "wb") as handle:
            handle.write(b"hello" * handle.blocksize * 2)

    # initiation is allowed: the failure now comes from the chunk upload
    with pytest.raises(UploadError):
        with fs.open("misc/foo.txt", "wb") as handle:
            handle.can_initiate = True
            handle.write(b"hello" * handle.blocksize * 2)
def test_eq():
    """Comparing a filesystem or a buffered file with an int gives False."""
    fs = DummyTestFS()
    assert (fs == 1) is False

    buffered = AbstractBufferedFile(fs, "misc/foo.txt", cache_type="bytes")
    assert (buffered == 1) is False
def test_pickle_multiple():
    """Pickled instances keep their own storage args and options."""
    fs_one = DummyTestFS(1)
    fs_two = DummyTestFS(2, bar=1)
    blob_one = pickle.dumps(fs_one)
    blob_two = pickle.dumps(fs_two)

    # drop the originals and the instance cache before restoring
    del fs_one, fs_two
    DummyTestFS.clear_instance_cache()

    restored = pickle.loads(blob_one)
    assert restored.storage_args == (1,)
    assert restored.storage_options == {}

    restored = pickle.loads(blob_two)
    assert restored.storage_args == (2,)
    assert restored.storage_options == {"bar": 1}
def test_json():
    """to_json / from_json round-trips back to the very same instances."""
    fs_one = DummyTestFS(1)
    fs_two = DummyTestFS(2, bar=1)
    json_one = fs_one.to_json()
    json_two = fs_two.to_json()

    assert json.loads(json_two)  # is valid JSON
    assert fs_one != fs_two
    assert "bar" in json_two

    assert DummyTestFS.from_json(json_one) is fs_one
    assert DummyTestFS.from_json(json_two) is fs_two
def test_ls_from_cache():
    """A cached listing must equal the listing that populated the cache."""
    fs = DummyTestFS()
    directory = "top_level/second_level/"

    fresh = fs.ls(directory, refresh=True)
    assert fs.ls(directory, refresh=False) == fresh

    # _strip_protocol removes everything by default though
    # for the sake of testing the _ls_from_cache interface
    # directly, we need run one time more without that call
    # to actually verify that our stripping in the client
    # function works.
    unstripped = fs.ls(directory, refresh=False, strip_proto=False)
    assert unstripped == fresh
@pytest.mark.parametrize(
    "dt",
    [
        np.int8,
        np.int16,
        np.int32,
        np.int64,
        np.uint8,
        np.uint16,
        np.uint32,
        np.uint64,
        np.float32,
        np.float64,
    ],
)
def test_readinto_with_numpy(tmpdir, dt):
    """readinto() on a local file fills a numpy array of the given dtype."""
    target = str(tmpdir / "test_arr.npy")
    written = np.arange(10, dtype=dt)
    written.tofile(target)

    loaded = np.empty_like(written)
    with fsspec.open(target, "rb") as handle:
        handle.readinto(loaded)

    assert np.array_equal(written, loaded)
@pytest.mark.parametrize(
    "dt",
    [
        np.int8,
        np.int16,
        np.int32,
        np.int64,
        np.uint8,
        np.uint16,
        np.uint32,
        np.uint64,
        np.float32,
        np.float64,
    ],
)
def test_readinto_with_multibyte(ftp_writable, tmpdir, dt):
    """readinto() over FTP fills a numpy array of the given dtype."""
    host, port, user, pw = ftp_writable
    ftp = FTPFileSystem(host=host, port=port, username=user, password=pw)

    written = np.arange(10, dtype=dt)
    with ftp.open("/out", "wb") as sink:
        sink.write(written.tobytes())

    loaded = np.empty_like(written)
    with ftp.open("/out", "rb") as source:
        source.readinto(loaded)

    assert np.array_equal(written, loaded)
class DummyOpenFS(DummyTestFS):
    """DummyTestFS variant whose files are real local files."""

    blocksize = 10

    def _open(self, path, mode="rb", **kwargs):
        """Open *path* with the builtin ``open`` and attach its on-disk size."""
        handle = open(path, mode)
        handle.size = os.stat(path).st_size
        return handle
class BasicCallback(fsspec.Callback):
    """Callback that logs every call as a ``(name, value)`` tuple in ``events``."""

    def __init__(self, **kwargs):
        # the log is created before handing over to the base initialiser
        self.events = []
        super().__init__(**kwargs)

    def set_size(self, size):
        """Log a ``("set_size", size)`` event."""
        event = ("set_size", size)
        self.events.append(event)

    def relative_update(self, inc=1):
        """Log a ``("relative_update", inc)`` event."""
        event = ("relative_update", inc)
        self.events.append(event)
def imitate_transfer(size, chunk, *, file=True):
    """Build the list of callback events expected for one transfer.

    The list starts with ``("set_size", size)`` and is followed by ``chunk``
    events ``("relative_update", size // chunk)``.  With ``file=True`` a
    final ``("relative_update", 0)`` is appended.
    """
    expected = [("set_size", size)]
    # the division stays inside the comprehension so it is only evaluated
    # when there is at least one chunk
    expected += [("relative_update", size // chunk) for _ in range(chunk)]
    if file:
        # The get_file/put_file implementations have no early exit: the
        # loop goes through the callback once more before its `while`
        # condition stops the transfer, hence a closing zero-sized update.
        expected += [("relative_update", 0)]
    return expected
def get_files(tmpdir, amount=10):
    """Create *amount* source files in *tmpdir* and return three path lists.

    Returns ``(src, dest, base)`` as lists of strings.  Only the ``src``
    files are written (50 ``"x"`` characters each); ``dest`` and ``base``
    are path names only.
    """
    indices = range(amount)

    src = []
    for index in indices:
        source_file = tmpdir / f"src_{index}.txt"
        source_file.write_text("x" * 50, "utf-8")
        src.append(str(source_file))

    dest = [str(tmpdir / f"dst_{index}.txt") for index in indices]
    base = [str(tmpdir / f"file_{index}.txt") for index in indices]
    return src, dest, base
def test_dummy_callbacks_file(tmpdir):
    """put_file / get_file report size and progress through the callback."""
    size = 100
    fs = DummyOpenFS()
    callback = BasicCallback()

    source = tmpdir / "tmp.txt"
    file = tmpdir / "file.txt"
    destination = tmpdir / "tmp2.txt"
    source.write_text("x" * 100, "utf-8")

    fs.put_file(source, file, callback=callback)
    # -1 here since put_file no longer has final zero-size put
    expected_put = imitate_transfer(size, 10)[:-1]
    assert callback.events == expected_put
    callback.events.clear()

    fs.get_file(file, destination, callback=callback)
    expected_get = imitate_transfer(size, 10)
    assert callback.events == expected_get
    callback.events.clear()

    assert destination.read_text("utf-8") == "x" * 100
def test_dummy_callbacks_files(tmpdir):
    """put / get of several files report one update per file."""
    fs = DummyOpenFS()
    callback = BasicCallback()
    src, dest, base = get_files(tmpdir)

    fs.put(src, base, callback=callback)
    expected_put = imitate_transfer(10, 10, file=False)
    assert callback.events == expected_put
    callback.events.clear()

    fs.get(base, dest, callback=callback)
    expected_get = imitate_transfer(10, 10, file=False)
    assert callback.events == expected_get
class BranchableCallback(BasicCallback):
    """Callback that logs events per transfer into one shared mapping.

    ``events`` maps a key to the list of events recorded under it.  The key
    is ``(source, dest)`` when *dest* is truthy and ``(source,)`` otherwise.
    Child callbacks created by ``branch`` receive the same mapping, so all
    events end up in a single place.

    Parameters
    ----------
    source, dest:
        Values used to build this callback's key.
    events:
        Mapping to record into; a new ``defaultdict(list)`` is created when
        it is ``None``.
    """

    def __init__(self, source, dest=None, events=None, **kwargs):
        super().__init__(**kwargs)
        if dest:
            self.key = source, dest
        else:
            self.key = (source,)
        # Test against None, not truthiness: an empty mapping handed down by
        # a parent is still the shared log and must not be replaced by a
        # private one (``events or defaultdict(list)`` did exactly that).
        self.events = defaultdict(list) if events is None else events

    def branch(self, path_1, path_2, kwargs):
        """Store a child callback keyed by both paths in ``kwargs["callback"]``."""
        from fsspec.implementations.local import make_path_posix

        path_1 = make_path_posix(path_1)
        path_2 = make_path_posix(path_2)
        kwargs["callback"] = BranchableCallback(path_1, path_2, events=self.events)

    def set_size(self, size):
        """Log a ``("set_size", size)`` event under this callback's key."""
        self.events[self.key].append(("set_size", size))

    def relative_update(self, inc=1):
        """Log a ``("relative_update", inc)`` event under this callback's key."""
        self.events[self.key].append(("relative_update", inc))
def test_dummy_callbacks_files_branched(tmpdir):
    """put / get with a branching callback log top-level and per-file events."""
    fs = DummyOpenFS()
    src, dest, base = get_files(tmpdir)

    callback = BranchableCallback("top-level")

    def check_events(lpaths, rpaths, per_file_events):
        """Check the top-level events and the events of every path pair."""
        from fsspec.implementations.local import make_path_posix

        # zip() is a one-shot iterator; it is materialised because the keys
        # are needed twice (set comparison and the loop below).  Previously
        # the set literal exhausted it and the loop never ran.
        base_keys = list(zip(make_path_posix(lpaths), make_path_posix(rpaths)))
        assert set(callback.events.keys()) == {("top-level",), *base_keys}
        assert callback.events[("top-level",)] == imitate_transfer(
            10, 10, file=False
        )

        for key in base_keys:
            assert callback.events[key] == per_file_events

    fs.put(src, base, callback=callback)
    # put_file has no final zero-size update (same expectation as in
    # test_dummy_callbacks_file), hence the trailing event is dropped
    check_events(src, base, imitate_transfer(50, 5)[:-1])
    callback.events.clear()

    fs.get(base, dest, callback=callback)
    check_events(base, dest, imitate_transfer(50, 5))
    callback.events.clear()
def _clean_paths(paths, prefix=""):
"""
Helper to cleanup paths results by doing the following:
- remove the prefix provided from all paths
- remove the trailing slashes from all paths
- remove duplicates paths
- sort all paths
"""
paths_list = paths
if isinstance(paths, dict):
paths_list = list(paths)
paths_list = [p.replace(prefix, "").strip("/") for p in sorted(set(paths_list))]
if isinstance(paths, dict):
return {p: paths[p] for p in paths_list}
return paths_list
@pytest.fixture(scope="function")
def glob_fs():
    """Mock filesystem populated with the entries of PATHS_FOR_GLOB_TESTS."""
    mock_fs = DummyTestFS(fs_content=PATHS_FOR_GLOB_TESTS)
    return mock_fs
@pytest.fixture(scope="function")
def glob_files_folder(tmp_path):
    """Touch every file of PATHS_FOR_GLOB_TESTS below tmp_path; return its path."""
    root = str(tmp_path)
    local_fs = LocalFileSystem(auto_mkdir=True)

    file_names = (
        entry["name"] for entry in PATHS_FOR_GLOB_TESTS if entry["type"] == "file"
    )
    for name in file_names:
        local_fs.touch(path=f"{root}/{name}")

    return root
@pytest.mark.skipif(
    sys.platform.startswith("win"),
    reason="no need to run python glob posix tests on windows",
)
@pytest.mark.parametrize(
    GLOB_POSIX_TESTS["argnames"],
    GLOB_POSIX_TESTS["argvalues"],
)
def test_posix_tests_python_glob(path, expected, glob_files_folder):
    """
    Tests against python glob to check if our posix tests are accurate.
    """
    # The patterns are relative, so glob has to run from inside the folder.
    # The previous working directory is restored afterwards so the change
    # does not leak into other tests.
    previous_cwd = os.getcwd()
    os.chdir(glob_files_folder)
    try:
        python_output = glob.glob(pathname=path, recursive=True)
    finally:
        os.chdir(previous_cwd)

    assert _clean_paths(python_output, glob_files_folder) == _clean_paths(expected)
@pytest.mark.skipif(
    sys.platform.startswith("win"),
    reason="no need to run bash stat posix tests on windows",
)
@pytest.mark.parametrize(
    GLOB_POSIX_TESTS["argnames"],
    GLOB_POSIX_TESTS["argvalues"],
)
def test_posix_tests_bash_stat(path, expected, glob_files_folder):
    """
    Tests against bash stat to check if our posix tests are accurate.
    """
    # Skip when bash, or its globstar option, is missing.
    try:
        subprocess.check_output(["bash", "-c", "shopt -s globstar"])
    except FileNotFoundError:
        pytest.skip("bash is not available")
    except subprocess.CalledProcessError:
        pytest.skip("globstar option is not available")

    # Backslash-escape these characters before the pattern is placed,
    # unquoted, into the bash command line.
    bash_path = (
        path.replace("\\", "\\\\")
        .replace("$", "\\$")
        .replace("(", "\\(")
        .replace(")", "\\)")
        .replace("|", "\\|")
    )

    # The folder is passed through ``cwd`` instead of an interpolated ``cd``
    # so that a path containing spaces or shell metacharacters cannot break
    # the command line.
    bash_output = subprocess.run(
        [
            "bash",
            "-c",
            f"shopt -s globstar && stat -c %N {bash_path}",
        ],
        cwd=glob_files_folder,
        capture_output=True,
        check=False,
    )

    # Remove the last element always empty
    bash_output = bash_output.stdout.decode("utf-8").replace("'", "").split("\n")[:-1]

    assert _clean_paths(bash_output, glob_files_folder) == _clean_paths(expected)
@pytest.mark.parametrize(
    GLOB_POSIX_TESTS["argnames"],
    GLOB_POSIX_TESTS["argvalues"],
)
def test_glob_posix_rules(path, expected, glob_fs):
    """glob() on the mock filesystem must return the expected paths."""
    url = f"mock://{path}"

    names = glob_fs.glob(path=url)
    assert _clean_paths(names) == _clean_paths(expected)

    # with detail=True every returned entry must equal the stored one
    detailed = _clean_paths(glob_fs.glob(path=url, detail=True))
    for name, info in detailed.items():
        assert info == glob_fs[name]