Overview
Within this notebook, we will cover:
- How to load a Kerchunk pre-generated reference file into Xarray as if it were a Zarr store.
Prerequisites
Concepts | Importance | Notes |
---|---|---|
Kerchunk Basics | Required | Core |
Xarray Tutorial | Required | Core |
- Time to learn: 45 minutes
Opening Reference Dataset with Fsspec and Xarray
One way of using our reference dataset is opening it with `Xarray`. To do this, we will create an `fsspec` filesystem and pass it to `Xarray`.
# Open the Kerchunk output through fsspec's "reference" filesystem and load it
# with xarray's zarr engine.
import fsspec  # retained: other cells may rely on this name being imported
import xarray as xr

# zarr-python 3 rejects the fsspec ``FSMap`` returned by ``fs.get_mapper("")``
# with "TypeError: Unsupported type for store_like: 'FSMap'". Instead of
# building the mapper by hand, give xarray the ``reference://`` URL and let the
# zarr backend construct the reference filesystem from ``storage_options``.
reference_options = {
    "fo": "references/ARG_combined.json",  # Kerchunk reference file to read
    "remote_protocol": "s3",
    # zarr-python 3 drives fsspec asynchronously, so the remote (S3) filesystem
    # behind the references must be created in asynchronous mode as well.
    "remote_options": {"anon": True, "asynchronous": True},
    "skip_instance_cache": True,
}

ds = xr.open_dataset(
    "reference://",
    engine="zarr",
    backend_kwargs={
        # Kerchunk references carry no consolidated metadata.
        "consolidated": False,
        "storage_options": reference_options,
    },
)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[1], line 13
5 fs = fsspec.filesystem(
6 "reference",
7 fo="references/ARG_combined.json",
(...) 10 skip_instance_cache=True,
11 )
12 m = fs.get_mapper("")
---> 13 ds = xr.open_dataset(m, engine="zarr", backend_kwargs={"consolidated": False})
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/site-packages/xarray/backends/api.py:687, in open_dataset(filename_or_obj, engine, chunks, cache, decode_cf, mask_and_scale, decode_times, decode_timedelta, use_cftime, concat_characters, decode_coords, drop_variables, inline_array, chunked_array_type, from_array_kwargs, backend_kwargs, **kwargs)
675 decoders = _resolve_decoders_kwargs(
676 decode_cf,
677 open_backend_dataset_parameters=backend.open_dataset_parameters,
(...) 683 decode_coords=decode_coords,
684 )
686 overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None)
--> 687 backend_ds = backend.open_dataset(
688 filename_or_obj,
689 drop_variables=drop_variables,
690 **decoders,
691 **kwargs,
692 )
693 ds = _dataset_from_backend_dataset(
694 backend_ds,
695 filename_or_obj,
(...) 705 **kwargs,
706 )
707 return ds
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/site-packages/xarray/backends/zarr.py:1578, in ZarrBackendEntrypoint.open_dataset(self, filename_or_obj, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta, group, mode, synchronizer, consolidated, chunk_store, storage_options, zarr_version, zarr_format, store, engine, use_zarr_fill_value_as_mask, cache_members)
1576 filename_or_obj = _normalize_path(filename_or_obj)
1577 if not store:
-> 1578 store = ZarrStore.open_group(
1579 filename_or_obj,
1580 group=group,
1581 mode=mode,
1582 synchronizer=synchronizer,
1583 consolidated=consolidated,
1584 consolidate_on_close=False,
1585 chunk_store=chunk_store,
1586 storage_options=storage_options,
1587 zarr_version=zarr_version,
1588 use_zarr_fill_value_as_mask=None,
1589 zarr_format=zarr_format,
1590 cache_members=cache_members,
1591 )
1593 store_entrypoint = StoreBackendEntrypoint()
1594 with close_on_error(store):
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/site-packages/xarray/backends/zarr.py:664, in ZarrStore.open_group(cls, store, mode, synchronizer, group, consolidated, consolidate_on_close, chunk_store, storage_options, append_dim, write_region, safe_chunks, align_chunks, zarr_version, zarr_format, use_zarr_fill_value_as_mask, write_empty, cache_members)
638 @classmethod
639 def open_group(
640 cls,
(...) 657 cache_members: bool = True,
658 ):
659 (
660 zarr_group,
661 consolidate_on_close,
662 close_store_on_close,
663 use_zarr_fill_value_as_mask,
--> 664 ) = _get_open_params(
665 store=store,
666 mode=mode,
667 synchronizer=synchronizer,
668 group=group,
669 consolidated=consolidated,
670 consolidate_on_close=consolidate_on_close,
671 chunk_store=chunk_store,
672 storage_options=storage_options,
673 zarr_version=zarr_version,
674 use_zarr_fill_value_as_mask=use_zarr_fill_value_as_mask,
675 zarr_format=zarr_format,
676 )
678 return cls(
679 zarr_group,
680 mode,
(...) 689 cache_members=cache_members,
690 )
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/site-packages/xarray/backends/zarr.py:1815, in _get_open_params(store, mode, synchronizer, group, consolidated, consolidate_on_close, chunk_store, storage_options, zarr_version, use_zarr_fill_value_as_mask, zarr_format)
1811 if _zarr_v3():
1812 # we have determined that we don't want to use consolidated metadata
1813 # so we set that to False to avoid trying to read it
1814 open_kwargs["use_consolidated"] = False
-> 1815 zarr_group = zarr.open_group(store, **open_kwargs)
1817 close_store_on_close = zarr_group.store is not store
1819 # we use this to determine how to handle fill_value
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/site-packages/zarr/_compat.py:43, in _deprecate_positional_args.<locals>._inner_deprecate_positional_args.<locals>.inner_f(*args, **kwargs)
41 extra_args = len(args) - len(all_args)
42 if extra_args <= 0:
---> 43 return f(*args, **kwargs)
45 # extra_args > 0
46 args_msg = [
47 f"{name}={arg}"
48 for name, arg in zip(kwonly_args[:extra_args], args[-extra_args:], strict=False)
49 ]
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/site-packages/zarr/api/synchronous.py:529, in open_group(store, mode, cache_attrs, synchronizer, path, chunk_store, storage_options, zarr_version, zarr_format, meta_array, attributes, use_consolidated)
451 @_deprecate_positional_args
452 def open_group(
453 store: StoreLike | None = None,
(...) 465 use_consolidated: bool | str | None = None,
466 ) -> Group:
467 """Open a group using file-mode-like semantics.
468
469 Parameters
(...) 526 The new group.
527 """
528 return Group(
--> 529 sync(
530 async_api.open_group(
531 store=store,
532 mode=mode,
533 cache_attrs=cache_attrs,
534 synchronizer=synchronizer,
535 path=path,
536 chunk_store=chunk_store,
537 storage_options=storage_options,
538 zarr_version=zarr_version,
539 zarr_format=zarr_format,
540 meta_array=meta_array,
541 attributes=attributes,
542 use_consolidated=use_consolidated,
543 )
544 )
545 )
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/site-packages/zarr/core/sync.py:163, in sync(coro, loop, timeout)
160 return_result = next(iter(finished)).result()
162 if isinstance(return_result, BaseException):
--> 163 raise return_result
164 else:
165 return return_result
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/site-packages/zarr/core/sync.py:119, in _runner(coro)
114 """
115 Await a coroutine and return the result of running it. If awaiting the coroutine raises an
116 exception, the exception will be returned.
117 """
118 try:
--> 119 return await coro
120 except Exception as ex:
121 return ex
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/site-packages/zarr/api/asynchronous.py:819, in open_group(store, mode, cache_attrs, synchronizer, path, chunk_store, storage_options, zarr_version, zarr_format, meta_array, attributes, use_consolidated)
816 if chunk_store is not None:
817 warnings.warn("chunk_store is not yet implemented", RuntimeWarning, stacklevel=2)
--> 819 store_path = await make_store_path(store, mode=mode, storage_options=storage_options, path=path)
820 if attributes is None:
821 attributes = {}
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/site-packages/zarr/storage/_common.py:316, in make_store_path(store_like, path, mode, storage_options)
314 else:
315 msg = f"Unsupported type for store_like: '{type(store_like).__name__}'" # type: ignore[unreachable]
--> 316 raise TypeError(msg)
318 result = await StorePath.open(store, path=path_normalized, mode=mode)
320 if storage_options and not used_storage_options:
TypeError: Unsupported type for store_like: 'FSMap'
Opening Reference Dataset with Xarray and the Kerchunk Engine
As of writing, the latest version of Kerchunk supports opening a reference dataset with Xarray without explicitly creating an fsspec filesystem. This is equivalent to the example above, just with a few fewer lines of code.
# Options passed to fsspec: where the referenced chunks live and how to reach
# them.
storage_options = dict(
    remote_protocol="s3",
    skip_instance_cache=True,
    remote_options=dict(anon=True),
)

# Options passed to xarray.
open_dataset_options = dict(chunks={})

# The "kerchunk" engine takes the reference file path directly, so no fsspec
# filesystem has to be created by hand here.
ds = xr.open_dataset(
    "references/ARG_combined.json",
    engine="kerchunk",
    storage_options=storage_options,
    open_dataset_options=open_dataset_options,
)