
Load Kerchunked dataset with Xarray

Overview

Within this notebook, we will cover:

  1. How to load a Kerchunk pre-generated reference file into Xarray as if it were a Zarr store.

Prerequisites

Concepts        | Importance | Notes
Kerchunk Basics | Required   | Core
Xarray Tutorial | Required   | Core
  • Time to learn: 45 minutes


Opening Reference Dataset with Fsspec and Xarray

One way to use our reference dataset is to open it with Xarray. To do this, we create an fsspec reference filesystem from the Kerchunk output, build a mapper from it, and pass that mapper to Xarray's Zarr engine.

# create an fsspec reference filesystem from the Kerchunk output
import fsspec
import xarray as xr

fs = fsspec.filesystem(
    "reference",
    fo="references/ARG_combined.json",
    remote_protocol="s3",
    remote_options={"anon": True},
    skip_instance_cache=True,
)
m = fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr", backend_kwargs={"consolidated": False})
With the zarr-python 3 stack installed here, this call fails. Xarray hands the mapper to Zarr, which tries to turn the underlying synchronous reference filesystem into an asynchronous one by serializing it to JSON and rebuilding it with asynchronous=True; the rebuilt reference filesystem and its remote (S3) target end up with different asynchronous settings, and the constructor rejects the mismatch:

ValueError: Reference-FS's target filesystem must have same value of asynchronous
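Because the failure comes from handing a pre-built synchronous mapper to the Zarr backend, one way around it is to skip the mapper and let the backend construct the reference filesystem itself. The sketch below uses the reference:// URL pattern with the same reference file, protocol, and anonymous-access options as above; whether it succeeds depends on the installed fsspec and zarr versions, so treat it as a starting point rather than a guaranteed fix. The engine="kerchunk" approach in the next section also avoids building the mapper by hand.

import xarray as xr

# Sketch of a workaround: pass the reference specification through
# storage_options and let the Zarr backend build the filesystem itself,
# instead of handing it a pre-built synchronous mapper.
ds = xr.open_dataset(
    "reference://",
    engine="zarr",
    backend_kwargs={
        "consolidated": False,
        "storage_options": {
            "fo": "references/ARG_combined.json",  # the Kerchunk reference file
            "remote_protocol": "s3",
            "remote_options": {"anon": True},
        },
    },
)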

Opening Reference Dataset with Xarray and the Kerchunk Engine

As of this writing, recent versions of Kerchunk let Xarray open a reference dataset directly, without explicitly creating an fsspec filesystem first. The behavior is the same as in the example above, just with a few fewer lines of code.

storage_options = {
    "remote_protocol": "s3",
    "skip_instance_cache": True,
    "remote_options": {"anon": True}
}  # options passed to fsspec
open_dataset_options = {"chunks": {}}  # options passed to xarray.open_dataset

ds = xr.open_dataset(
    "references/ARG_combined.json",
    engine="kerchunk",
    storage_options=storage_options,
    open_dataset_options=open_dataset_options,
)
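Because chunks={} was passed, the variables come back as lazy Dask arrays, and nothing is read from S3 until values are computed. A quick, dataset-agnostic check (a sketch that assumes the reference file points at publicly readable objects) is to print the dataset and pull a single element from one variable:

# Inspect the lazily opened dataset and read one small piece to confirm
# that the byte ranges in the reference file resolve against the remote store.
print(ds)

var = next(iter(ds.data_vars))  # pick an arbitrary data variable
sample = ds[var].isel({dim: 0 for dim in ds[var].dims})
print(sample.compute())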