
Load Kerchunked dataset with Xarray

Overview

Within this notebook, we will cover:

  1. How to load a Kerchunk pre-generated reference file into Xarray as if it were a Zarr store.

Prerequisites

Concepts        | Importance | Notes
Kerchunk Basics | Required   | Core
Xarray Tutorial | Required   | Core
  • Time to learn: 45 minutes

Opening Reference Dataset with Fsspec and Xarray

One way to use our reference dataset is to open it with Xarray. To do this, we will create an fsspec reference filesystem, build a mapper from it, and pass the mapper to Xarray.

# create an fsspec reference filesystem from the Kerchunk output
import fsspec
import xarray as xr

fs = fsspec.filesystem(
    "reference",
    fo="references/ARG_combined.json",
    remote_protocol="s3",
    remote_options={"anon": True},
    skip_instance_cache=True,
)
m = fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr", backend_kwargs={"consolidated": False})
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[1], line 13
      5 fs = fsspec.filesystem(
      6     "reference",
      7     fo="references/ARG_combined.json",
   (...)     10     skip_instance_cache=True,
     11 )
     12 m = fs.get_mapper("")
---> 13 ds = xr.open_dataset(m, engine="zarr", backend_kwargs={"consolidated": False})

[... intermediate frames in xarray, zarr, and fsspec elided ...]

File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/site-packages/fsspec/implementations/reference.py:770, in ReferenceFileSystem.__init__(self, fo, target, ref_storage_args, target_protocol, target_options, remote_protocol, remote_options, fs, template_overrides, simple_templates, max_gap, max_block, cache_size, **kwargs)
    768     self.fss[k] = AsyncFileSystemWrapper(f, asynchronous=self.asynchronous)
    769 elif self.asynchronous ^ f.asynchronous:
--> 770     raise ValueError(
    771         "Reference-FS's target filesystem must have same value "
    772         "of asynchronous"
    773     )

ValueError: Reference-FS's target filesystem must have same value of asynchronous
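
This ValueError comes from how Zarr v3 handles the fsspec mapper: the FSMap is converted into an FsspecStore, which re-creates the reference filesystem in asynchronous mode, while the s3 filesystem it targets is created synchronous, and fsspec requires the two to match. One possible workaround is to skip the mapper entirely and let Xarray build the reference filesystem itself from storage options via a "reference://" URL. The snippet below is a minimal sketch following the pattern in the Kerchunk documentation; it is not guaranteed to avoid the issue on every combination of Zarr and fsspec versions.

# a sketch of an alternative open pattern: pass the reference spec through
# storage_options and let Xarray/Zarr construct the reference filesystem
ds = xr.open_dataset(
    "reference://",
    engine="zarr",
    backend_kwargs={
        "consolidated": False,
        "storage_options": {
            "fo": "references/ARG_combined.json",
            "remote_protocol": "s3",
            "remote_options": {"anon": True},
        },
    },
)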

Opening Reference Dataset with Xarray and the Kerchunk Engine

As of writing, the latest version of Kerchunk supports opening a reference dataset with Xarray without explicitly creating an fsspec filesystem first. This gives the same behavior as the example above, just with a few fewer lines of code.

storage_options = {
    "remote_protocol": "s3",
    "skip_instance_cache": True,
    "remote_options": {"anon": True}
}  # options passed to fsspec
open_dataset_options = {"chunks": {}}  # options passed to xarray.open_dataset

ds = xr.open_dataset(
    "references/ARG_combined.json",
    engine="kerchunk",
    storage_options=storage_options,
    open_dataset_options=open_dataset_options,
)
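
Assuming the open succeeds, the returned dataset is lazy: only the metadata in the reference file has been read, and no chunk data is pulled from S3 until a computation or plot requests it. A quick way to confirm what the references describe is sketched below; no specific variable names are assumed, since they depend on the source files.

# print the lazily opened dataset; only reference metadata has been read
print(ds)

# list the data variables and coordinates described by the reference file
print(list(ds.data_vars))
print(list(ds.coords))

# total size implied by the references; this does not download any chunks
print(f"dataset size: {ds.nbytes / 1e9:.2f} GB")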