chadlinden's picture
Upload folder using huggingface_hub
8fd238c verified
raw
history blame
14.6 kB
"""Tests the spec, using memoryfs"""
import contextlib
import os
import pickle
import tempfile
from unittest.mock import Mock
import pytest
import fsspec
from fsspec.implementations.memory import MemoryFile, MemoryFileSystem
def test_idempotent():
    """Check that MemoryFileSystem instances are cached and reused.

    Two constructions in a row must give the very same object, which is
    also what ``current()`` reports. After the instance cache is cleared
    it must be empty, and a freshly created instance must still compare
    equal to the one made before clearing.
    """
    MemoryFileSystem.clear_instance_cache()
    first = MemoryFileSystem()
    second = MemoryFileSystem()
    assert first is second
    assert MemoryFileSystem.current() is second

    MemoryFileSystem.clear_instance_cache()
    assert not MemoryFileSystem._cache

    recreated = MemoryFileSystem().current()
    assert first == recreated
def test_pickle():
    """A MemoryFileSystem survives a pickle round trip and compares equal."""
    original = MemoryFileSystem()
    payload = pickle.dumps(original)
    restored = pickle.loads(payload)
    assert original == restored
def test_class_methods():
    """Exercise the URL-handling class methods of MemoryFileSystem."""
    # input URL -> path expected back from _strip_protocol
    stripped_by_url = {
        "memory://stuff": "/stuff",
        "stuff": "/stuff",
        "other://stuff": "other://stuff",
    }
    for url, stripped in stripped_by_url.items():
        assert MemoryFileSystem._strip_protocol(url) == stripped

    assert MemoryFileSystem._get_kwargs_from_urls("memory://user@thing") == {}
def test_multi(m):
    """Passing the same path twice must yield two entries, not one."""
    m.pipe("/afile", b"data")
    requested = ["/afile", "/afile"]
    _fs, _token, paths = fsspec.core.get_fs_token_paths(requested)
    assert len(paths) == 2
def test_get_put(tmpdir, m):
    """Round-trip single files and directory trees between disk and memory.

    Covers: putting one file, getting one file, recursive put of an
    absolute directory, recursive put of a relative directory name,
    recursive put of ``os.path.curdir``, and recursive get back to disk.

    All local files are opened through ``with`` so that their handles are
    closed before the files are removed further down (open handles would
    make ``os.remove`` fail on Windows and leak descriptors elsewhere).
    """
    tmpdir = str(tmpdir)
    fn = os.path.join(tmpdir, "one")
    with open(fn, "wb") as f:
        f.write(b"one")
    os.mkdir(os.path.join(tmpdir, "dir"))
    fn2 = os.path.join(tmpdir, "dir", "two")
    with open(fn2, "wb") as f:
        f.write(b"two")

    fs = MemoryFileSystem()
    fs.put(fn, "/afile")
    assert fs.cat("/afile") == b"one"

    fs.store["/bfile"] = MemoryFile(fs, "/bfile", b"data")
    fn3 = os.path.join(tmpdir, "three")
    fs.get("/bfile", fn3)
    with open(fn3, "rb") as f:
        assert f.read() == b"data"

    fs.put(tmpdir, "/more", recursive=True)
    assert fs.find("/more") == ["/more/dir/two", "/more/one", "/more/three"]

    @contextlib.contextmanager
    def tmp_chdir(path):
        """Temporarily switch the working directory to *path*."""
        curdir = os.getcwd()
        os.chdir(path)
        try:
            yield
        finally:
            # Always restore the previous cwd, even if the body raised.
            os.chdir(curdir)

    # Relative source: the directory's own name, seen from its parent.
    with tmp_chdir(os.path.join(tmpdir, os.path.pardir)):
        fs.put(os.path.basename(tmpdir), "/moretwo", recursive=True)
        assert fs.find("/moretwo") == [
            "/moretwo/dir/two",
            "/moretwo/one",
            "/moretwo/three",
        ]

    # Relative source: "." from inside the directory.
    with tmp_chdir(tmpdir):
        fs.put(os.path.curdir, "/morethree", recursive=True)
        assert fs.find("/morethree") == [
            "/morethree/dir/two",
            "/morethree/one",
            "/morethree/three",
        ]

    # Wipe the local copies so the recursive get has to recreate them.
    for path in [fn, fn2, fn3]:
        os.remove(path)
    os.rmdir(os.path.join(tmpdir, "dir"))

    fs.get("/more/", tmpdir + "/", recursive=True)
    with open(fn3, "rb") as f:
        assert f.read() == b"data"
    with open(fn, "rb") as f:
        assert f.read() == b"one"
def test_du(m):
    """Check ``du`` totals, per-path breakdowns, ``withdirs`` and ``maxdepth``.

    The store is populated directly with three files of 1, 2 and 3 bytes
    under ``/dir`` and ``/dir/dirb``.
    """
    fs = MemoryFileSystem()
    fs.store.update(
        {
            "/dir/afile": MemoryFile(fs, "/afile", b"a"),
            "/dir/dirb/afile": MemoryFile(fs, "/afile", b"bb"),
            "/dir/dirb/bfile": MemoryFile(fs, "/afile", b"ccc"),
        }
    )
    assert fs.du("/dir") == 6
    assert fs.du("/dir", total=False) == {
        "/dir/afile": 1,
        "/dir/dirb/afile": 2,
        "/dir/dirb/bfile": 3,
    }
    assert fs.du("/dir", withdirs=True) == 6
    assert fs.du("/dir", total=False, withdirs=True) == {
        "/dir": 0,
        "/dir/afile": 1,
        "/dir/dirb": 0,
        "/dir/dirb/afile": 2,
        "/dir/dirb/bfile": 3,
    }
    # maxdepth=0 is rejected. Only the call belongs inside the raises block:
    # a comparison on its result could never run once the call raises.
    with pytest.raises(ValueError):
        fs.du("/dir", maxdepth=0)
    assert fs.du("/dir", total=False, withdirs=True, maxdepth=1) == {
        "/dir": 0,
        "/dir/afile": 1,
        "/dir/dirb": 0,
    }

    # Size of file only.
    assert fs.du("/dir/afile") == 1
    assert fs.du("/dir/afile", withdirs=True) == 1
def test_head_tail(m):
    """``head`` returns the leading bytes and ``tail`` the trailing bytes."""
    fs = MemoryFileSystem()
    payload = b"I had a nice big cabbage"
    with fs.open("/myfile", "wb") as handle:
        handle.write(payload)

    leading = fs.head("/myfile", 5)
    trailing = fs.tail("/myfile", 7)
    assert leading == b"I had"
    assert trailing == b"cabbage"
def test_move(m):
    """After ``move`` the source is gone and the destination is usable."""
    fs = MemoryFileSystem()
    payload = b"I had a nice big cabbage"
    with fs.open("/myfile", "wb") as handle:
        handle.write(payload)

    fs.move("/myfile", "/otherfile")

    assert not fs.exists("/myfile")
    assert fs.info("/otherfile")
    # ukey must hand back a string for the moved file.
    assert isinstance(fs.ukey("/otherfile"), str)
def test_recursive_get_put(tmpdir, m):
    """Recursively put a local tree, then get it back with and without a slash.

    With a trailing slash on the source (``"test/"``) the files are expected
    directly under the target directory; without it (``"test"``) they are
    expected under a ``test`` subdirectory of the target.

    The content comparisons are real assertions: a bare ``f.read() == ...``
    expression would be evaluated and thrown away, verifying nothing.
    """
    fs = MemoryFileSystem()
    os.makedirs(f"{tmpdir}/nest")
    for file in ["one", "two", "nest/other"]:
        with open(f"{tmpdir}/{file}", "wb") as f:
            f.write(b"data")

    fs.put(str(tmpdir), "test", recursive=True)

    # get to directory with slash
    d = tempfile.mkdtemp()
    fs.get("test/", d, recursive=True)
    for file in ["one", "two", "nest/other"]:
        with open(f"{d}/{file}", "rb") as f:
            assert f.read() == b"data"

    # get to directory without slash
    d = tempfile.mkdtemp()
    fs.get("test", d, recursive=True)
    for file in ["test/one", "test/two", "test/nest/other"]:
        with open(f"{d}/{file}", "rb") as f:
            assert f.read() == b"data"
def test_pipe_cat(m):
    """``pipe`` writes one path or a dict of paths; ``cat`` reads them back."""
    fs = MemoryFileSystem()

    # Single path.
    fs.pipe("afile", b"contents")
    assert fs.cat("afile") == b"contents"

    # Several paths at once: cat on a list returns a matching dict.
    bulk = {"/bfile": b"more", "/cfile": b"stuff"}
    fs.pipe(bulk)
    wanted = list(bulk)
    assert fs.cat(wanted) == bulk
def test_read_block_delimiter(m):
    """``read_block`` with a newline delimiter returns the expected chunks."""
    fs = MemoryFileSystem()
    with fs.open("/myfile", "wb") as handle:
        handle.write(b"some\nlines\nof\ntext")

    # (offset, length, expected bytes) for delimiter b"\n"
    cases = [
        (0, 2, b"some\n"),
        (2, 6, b"lines\n"),
        (6, 2, b""),
        (2, 9, b"lines\nof\n"),
        (12, 6, b"text"),
    ]
    for offset, length, expected in cases:
        assert fs.read_block("/myfile", offset, length, b"\n") == expected

    # No length and no delimiter: same as reading the whole file.
    assert fs.read_block("/myfile", 0, None) == fs.cat("/myfile")
def test_open_text(m):
    """Opening in text mode honours the requested encoding.

    The text handle is opened in a ``with`` block so it is closed once the
    check is done instead of being left open for the rest of the session.
    """
    fs = MemoryFileSystem()
    with fs.open("/myfile", "wb") as f:
        f.write(b"some\nlines\nof\ntext")

    with fs.open("/myfile", "r", encoding="latin1") as f:
        assert f.encoding == "latin1"
def test_read_text(m):
    """Text written through a text-mode handle is returned by ``read_text``."""
    text = "some\nlines\nof\ntext"
    with m.open("/myfile", "w", encoding="utf-8") as handle:
        handle.write(text)
    assert m.read_text("/myfile", encoding="utf-8") == text
def test_write_text(m):
    """``write_text`` followed by ``read_text`` gives back the same string."""
    text = "some\nlines\nof\ntext"
    m.write_text("/myfile", text, encoding="utf-8")
    assert m.read_text("/myfile", encoding="utf-8") == text
def test_chained_fs():
    """Read a local file through ``simplecache::file://`` chaining.

    With ``same_names=True`` the cache directory must end up holding
    exactly one entry, named like the source file.
    """
    source_dir = tempfile.mkdtemp()
    cache_dir = tempfile.mkdtemp()
    source = os.path.join(source_dir, "f1")
    with open(source, "wb") as out:
        out.write(b"test")

    cache_options = {"cache_storage": cache_dir, "same_names": True}
    opened = fsspec.open(
        f"simplecache::file://{source}",
        simplecache=cache_options,
    )
    with opened as f:
        assert f.read() == b"test"

    assert os.listdir(cache_dir) == ["f1"]
@pytest.mark.xfail(reason="see issue #334", strict=True)
def test_multilevel_chained_fs():
    """This test reproduces fsspec/filesystem_spec#334

    Marked ``xfail(strict=True)``: the test is expected to fail, and an
    unexpected pass is reported as an error. Statement order is therefore
    kept exactly as written.
    """
    import zipfile

    workdir = tempfile.mkdtemp()
    archive = os.path.join(workdir, "f1.zip")
    with zipfile.ZipFile(archive, mode="w") as zf:
        # filename, content
        zf.writestr("foo.txt", "foo.txt")
        zf.writestr("bar.txt", "bar.txt")

    # We expected this to be the correct syntax
    with pytest.raises(IsADirectoryError):
        opened = fsspec.open_files(f"zip://*.txt::simplecache::file://{archive}")
        assert len(opened) == 2

    # But this is what is actually valid...
    opened = fsspec.open_files(f"zip://*.txt::simplecache://{archive}::file://")

    assert len(opened) == 2
    for open_file in opened:
        with open_file as f:
            # Each member was written with its own name as its content.
            assert f.read().decode("utf-8") == f.name
def test_multilevel_chained_fs_zip_zip_file():
    """This test reproduces fsspec/filesystem_spec#334

    Builds ``f1.zip`` with two text members, wraps it inside ``f2.zip``,
    and opens the inner members through a ``zip://...::zip://...::file://``
    chain.
    """
    import zipfile

    workdir = tempfile.mkdtemp()
    inner_zip = os.path.join(workdir, "f1.zip")
    outer_zip = os.path.join(workdir, "f2.zip")

    with zipfile.ZipFile(inner_zip, mode="w") as zf:
        # filename, content
        zf.writestr("foo.txt", "foo.txt")
        zf.writestr("bar.txt", "bar.txt")

    with zipfile.ZipFile(outer_zip, mode="w") as zf:
        with open(inner_zip, "rb") as raw:
            zf.writestr("f1.zip", raw.read())

    # We expected this to be the correct syntax
    opened = fsspec.open_files(f"zip://*.txt::zip://f1.zip::file://{outer_zip}")
    assert len(opened) == 2
    for open_file in opened:
        with open_file as f:
            # Each member was written with its own name as its content.
            assert f.read().decode("utf-8") == f.name
def test_chained_equivalent():
    """The chained-URL form and the explicit-kwargs form give equal filesystems.

    Both ``OpenFile`` objects are entered as context managers for the final
    content comparison so their underlying handles are closed afterwards,
    rather than being opened with ``.open()`` and abandoned.
    """
    d1 = tempfile.mkdtemp()
    d2 = tempfile.mkdtemp()
    f1 = os.path.join(d1, "f1")
    with open(f1, "wb") as f:
        f.write(b"test1")

    of = fsspec.open(
        f"simplecache::file://{f1}",
        simplecache={"cache_storage": d2, "same_names": True},
    )
    of2 = fsspec.open(
        f"simplecache://{f1}",
        cache_storage=d2,
        same_names=True,
        target_protocol="file",
        target_options={},
    )
    # the following line passes by fluke - they are not quite the same instance,
    # since the parameters don't quite match. Also, the url understood by the two
    # of s are not the same (path gets munged a bit differently)
    assert of.fs == of2.fs
    assert hash(of.fs) == hash(of2.fs)
    with of as chained, of2 as explicit:
        assert chained.read() == explicit.read()
def test_chained_fs_multi():
    """Open several files through simplecache, by glob and by explicit list.

    In both cases the files must read back in order and the cache
    directory must contain both names.
    """
    source_dir = tempfile.mkdtemp()
    cache_dir = tempfile.mkdtemp()
    f1 = os.path.join(source_dir, "f1")
    f2 = os.path.join(source_dir, "f2")
    with open(f1, "wb") as out:
        out.write(b"test1")
    with open(f2, "wb") as out:
        out.write(b"test2")
    contents = [b"test1", b"test2"]

    # Variant 1: a glob over the source directory.
    opened = fsspec.open_files(
        f"simplecache::file://{source_dir}/*",
        simplecache={"cache_storage": cache_dir, "same_names": True},
    )
    for index, expected in enumerate(contents):
        with opened[index] as f:
            assert f.read() == expected
    assert sorted(os.listdir(cache_dir)) == ["f1", "f2"]

    # Variant 2: an explicit list of URLs, with a fresh cache directory.
    cache_dir = tempfile.mkdtemp()
    opened = fsspec.open_files(
        [f"simplecache::file://{f1}", f"simplecache::file://{f2}"],
        simplecache={"cache_storage": cache_dir, "same_names": True},
    )
    for index, expected in enumerate(contents):
        with opened[index] as f:
            assert f.read() == expected
    assert sorted(os.listdir(cache_dir)) == ["f1", "f2"]
def test_chained_fo():
    """Read a zip member via chained URLs: directly, by glob, and cached."""
    import zipfile

    workdir = tempfile.mkdtemp()
    archive = os.path.join(workdir, "temp.zip")
    cache_dir = tempfile.mkdtemp()
    with zipfile.ZipFile(archive, mode="w") as zf:
        zf.writestr("afile", b"test")

    # Named member.
    single = fsspec.open(f"zip://afile::file://{archive}")
    with single as f:
        assert f.read() == b"test"

    # Glob over all members.
    globbed = fsspec.open_files(f"zip://*::file://{archive}")
    with globbed[0] as f:
        assert f.read() == b"test"

    # Same glob with a simplecache layer on top.
    cached = fsspec.open_files(
        f"simplecache::zip://*::file://{archive}",
        simplecache={"cache_storage": cache_dir, "same_names": True},
    )
    with cached[0] as f:
        assert f.read() == b"test"
    assert "afile" in os.listdir(cache_dir)
def test_url_to_fs():
    """``url_to_fs`` resolves a memory URL to the filesystem and bare path."""
    resolved_fs, resolved_path = fsspec.core.url_to_fs("memory://a.txt")
    assert isinstance(resolved_fs, MemoryFileSystem)
    assert resolved_path == "/a.txt"
def test_walk(m):
    """Exercise ``walk``: ordering, ``maxdepth``, in-place pruning, ``on_error``.

    A small tree is built under ``/dir1`` and walked top-down and bottom-up.
    The pruning and reordering sections edit the yielded ``dirs`` list in
    place and check how that affects the rest of the walk in each direction.
    """
    # depth = 0
    dir1 = "/dir1"
    # depth = 1 (2 dirs, 1 file)
    dir11 = f"{dir1}/dir11"
    dir12 = f"{dir1}/dir12"
    file11 = f"{dir1}/file11"
    # depth = 2
    dir111 = f"{dir11}/dir111"
    file111 = f"{dir11}/file111"
    file121 = f"{dir12}/file121"
    # depth = 3
    file1111 = f"{dir111}/file1111"

    m.mkdir(dir111)  # Creates parents too
    m.mkdir(dir12)  # Creates parents too
    for new_file in (file11, file111, file121, file1111):
        m.touch(new_file)

    # One expected (path, dirs, files) entry per directory in the tree.
    at_dir1 = (dir1, ["dir11", "dir12"], ["file11"])
    at_dir11 = (dir11, ["dir111"], ["file111"])
    at_dir111 = (dir111, [], ["file1111"])
    at_dir12 = (dir12, [], ["file121"])

    # No maxdepth
    assert list(m.walk(dir1, topdown=True)) == [
        at_dir1,
        at_dir11,
        at_dir111,
        at_dir12,
    ]
    assert list(m.walk(dir1, topdown=False)) == [
        at_dir111,
        at_dir11,
        at_dir12,
        at_dir1,
    ]

    # maxdepth=2
    assert list(m.walk(dir1, maxdepth=2, topdown=True)) == [
        at_dir1,
        at_dir11,
        at_dir12,
    ]
    assert list(m.walk(dir1, maxdepth=2, topdown=False)) == [
        at_dir11,
        at_dir12,
        at_dir1,
    ]

    # maxdepth=1
    assert list(m.walk(dir1, maxdepth=1, topdown=True)) == [at_dir1]
    assert list(m.walk(dir1, maxdepth=1, topdown=False)) == [at_dir1]

    # maxdepth=0
    with pytest.raises(ValueError):
        list(m.walk(dir1, maxdepth=0, topdown=True))
    with pytest.raises(ValueError):
        list(m.walk(dir1, maxdepth=0, topdown=False))

    # prune dir111
    def _walk_pruned(*args, **kwargs):
        """Walk, dropping "dir111" from ``dirs`` after each entry is yielded."""
        for path, dirs, files in m.walk(*args, **kwargs):
            # Yield a snapshot so the later in-place edit is not visible.
            yield (path, dirs.copy(), files)
            if "dir111" in dirs:
                dirs.remove("dir111")

    assert list(_walk_pruned(dir1, topdown=True)) == [
        at_dir1,
        at_dir11,
        at_dir12,
    ]
    assert list(_walk_pruned(dir1, topdown=False)) == [
        at_dir111,
        at_dir11,
        at_dir12,
        at_dir1,
    ]

    # reverse dirs order
    def _walk_reversed(*args, **kwargs):
        """Walk, reversing ``dirs`` in place after each entry is yielded."""
        for path, dirs, files in m.walk(*args, **kwargs):
            yield (path, dirs.copy(), files)
            dirs.reverse()

    assert list(_walk_reversed(dir1, topdown=True)) == [
        at_dir1,
        # Here dir12 comes before dir11
        at_dir12,
        at_dir11,
        at_dir111,
    ]
    assert list(_walk_reversed(dir1, topdown=False)) == [
        at_dir111,
        at_dir11,
        at_dir12,
        at_dir1,
    ]

    # on_error omit by default
    assert list(m.walk("do_not_exist")) == []
    # on_error omit
    assert list(m.walk("do_not_exist", on_error="omit")) == []
    # on_error raise
    with pytest.raises(FileNotFoundError):
        list(m.walk("do_not_exist", on_error="raise"))
    # on_error callable function
    recorder = Mock()
    assert list(m.walk("do_not_exist", on_error=recorder.onerror)) == []
    recorder.onerror.assert_called()
    last_call = recorder.onerror.call_args
    assert last_call.kwargs == {}
    assert len(last_call.args) == 1
    assert isinstance(last_call.args[0], FileNotFoundError)