reach-vb's picture
reach-vb HF staff
8643e0bff997343019440dbb643d796bcec3eb5d247e4cbf77173c5941ca163f
6017349
raw
history blame
4.11 kB
import logging
import tarfile
import fsspec
from fsspec.archive import AbstractArchiveFileSystem
from fsspec.compression import compr
from fsspec.utils import infer_compression
# Maps the raw tar header type flag (a single byte string) to the entry type
# name reported in listings. tarfile.REGTYPE is b"0" and tarfile.DIRTYPE is
# b"5"; any flag missing from this table is resolved by the caller's own
# default when it does the lookup.
typemap = {
    tarfile.REGTYPE: "file",
    tarfile.DIRTYPE: "directory",
}

# Module-wide logger, registered under the fixed name "tar".
logger = logging.getLogger("tar")
class TarFileSystem(AbstractArchiveFileSystem):
    """Compressed Tar archives as a file-system (read-only)

    Supports the following formats:
    tar.gz, tar.bz2, tar.xz

    When no compression is given and none can be inferred from the file
    name, the file object is handed to ``tarfile`` as-is.

    The archive is scanned once at construction time; ``self.index`` then
    maps each member name (trailing ``/`` removed) to a tuple of
    ``(info dict, data offset)``.
    """

    root_marker = ""
    protocol = "tar"
    cachable = False

    def __init__(
        self,
        fo="",
        index_store=None,
        target_options=None,
        target_protocol=None,
        compression=None,
        **kwargs,
    ):
        """Open a tar archive and build the member index.

        Parameters
        ----------
        fo : str or file-like
            Either a path/URL, which is opened with ``fsspec.open``, or an
            already opened binary file object, which is used directly.
        index_store : optional
            Stored on the instance; persisting the index is still a TODO, so
            it is not otherwise used here.
        target_options : dict, optional
            Extra keyword arguments forwarded to ``fsspec.open`` when ``fo``
            is a string.
        target_protocol : str, optional
            Protocol forwarded to ``fsspec.open`` when ``fo`` is a string.
        compression : str, optional
            Key into ``fsspec.compression.compr``. When ``None``, it is
            inferred from the file name if one can be determined.
        **kwargs
            Passed through to the parent class constructor.
        """
        super().__init__(**kwargs)
        target_options = target_options or {}

        # Remember whether the underlying file was opened here, so that a
        # failure below only releases resources this constructor owns.
        opened_here = False
        if isinstance(fo, str):
            self.of = fsspec.open(fo, protocol=target_protocol, **target_options)
            fo = self.of.open()  # keep the reference
            opened_here = True

        try:
            # Try to infer compression.
            if compression is None:
                compression = self._compression_from_fileobj(fo)

            if compression is not None:
                # TODO: tarfile already implements compression with modes like
                # "'r:gz'", but then would seek to offset in the file work?
                fo = compr[compression](fo)

            self._fo_ref = fo
            self.fo = fo  # the whole instance is a context
            self.tar = tarfile.TarFile(fileobj=self.fo)
            self.dir_cache = None

            self.index_store = index_store
            self.index = None
            self._index()
        except Exception:
            # Do not leak the target file if the archive cannot be set up
            # (unknown compression key, not a tar file, read error, ...).
            # A file object supplied by the caller remains the caller's
            # responsibility and is deliberately left open.
            if opened_here:
                self.of.close()
            raise

    @staticmethod
    def _compression_from_fileobj(fo):
        """Return the compression inferred from ``fo``'s file name, or None.

        ``fo`` might either be a `fsspec.LocalFileOpener`, an
        `io.BufferedReader` or an `fsspec.AbstractFileSystem` instance, so
        several attributes are probed in turn. Failing to determine a name
        is logged as a warning and results in ``None``.
        """
        name = None
        try:
            # Amended io.BufferedReader or similar.
            # This uses a "protocol extension" where original filenames are
            # propagated to archive-like filesystems in order to let them
            # infer the right compression appropriately.
            if hasattr(fo, "original"):
                name = fo.original
            # fsspec.LocalFileOpener
            elif hasattr(fo, "path"):
                name = fo.path
            # io.BufferedReader
            elif hasattr(fo, "name"):
                name = fo.name
            # fsspec.AbstractFileSystem
            elif hasattr(fo, "info"):
                name = fo.info()["name"]
        except Exception as ex:
            logger.warning(
                f"Unable to determine file name, not inferring compression: {ex}"
            )

        if name is None:
            return None

        compression = infer_compression(name)
        logger.info(f"Inferred compression {compression} from file name {name}")
        return compression

    def _index(self):
        """Scan the archive once and populate ``self.index``.

        Each entry maps the member name without trailing ``/`` to
        ``(info, offset_data)``, where ``info`` is the member's
        ``get_info()`` dict with ``"type"`` translated through ``typemap``
        (unknown type flags are reported as ``"file"``).
        """
        # TODO: load and set saved index, if exists
        out = {}
        for ti in self.tar:
            # One get_info() call per member: the same dict provides both
            # the stored details and the key.
            info = ti.get_info()
            info["type"] = typemap.get(info["type"], "file")
            name = info["name"].rstrip("/")
            out[name] = (info, ti.offset_data)

        self.index = out
        # TODO: save index to self.index_store here, if set

    def _get_dirs(self):
        """Populate ``self.dir_cache`` on first use; later calls are no-ops."""
        if self.dir_cache is not None:
            return

        # This enables ls to get directories as children as well as files
        self.dir_cache = {
            dirname: {"name": dirname, "size": 0, "type": "directory"}
            for dirname in self._all_dirnames(self.tar.getnames())
        }
        # Real archive members are written afterwards, so they replace any
        # synthesized directory entry that has the same name.
        for member in self.tar.getmembers():
            info = member.get_info()
            info["name"] = info["name"].rstrip("/")
            info["type"] = typemap.get(info["type"], "file")
            self.dir_cache[info["name"]] = info

    def _open(self, path, mode="rb", **kwargs):
        """Return a readable file object for the regular file at ``path``.

        Raises
        ------
        ValueError
            If ``mode`` is not ``"rb"``, or the entry is not a regular file.
        KeyError
            If ``path`` is not a key of ``self.index``.
        """
        if mode != "rb":
            raise ValueError("Read-only filesystem implementation")
        # Only the details are needed; the stored data offset is unused here.
        details, _ = self.index[path]
        if details["type"] != "file":
            raise ValueError("Can only handle regular files")
        return self.tar.extractfile(path)