Overview¶
Within this notebook, we will cover:
How to load a Kerchunk pre-generated reference file into Xarray as if it were a Zarr store.
Prerequisites¶
Concepts | Importance | Notes |
---|---|---|
Kerchunk Basics | Required | Core |
Xarray Tutorial | Required | Core |
Time to learn: 45 minutes
Opening Reference Dataset with Fsspec and Xarray¶
One way of using our reference dataset is to open it with Xarray. To do this, we will create an fsspec filesystem and pass it to Xarray.
# Create an fsspec reference filesystem from the Kerchunk output and open it
# with Xarray through the Zarr engine.
import fsspec
import xarray as xr

# zarr-python 3 needs an asynchronous filesystem. If it is handed a synchronous
# reference filesystem it rebuilds it with asynchronous=True, but the remote
# (S3) target is still created from `remote_options` without that flag, and
# fsspec raises:
#   ValueError: Reference-FS's target filesystem must have same value of asynchronous
# Creating BOTH the reference filesystem and its remote target as asynchronous
# up front keeps the two layers consistent.
fs = fsspec.filesystem(
    "reference",
    fo="references/ARG_combined.json",
    remote_protocol="s3",
    remote_options={"anon": True, "asynchronous": True},
    skip_instance_cache=True,
    asynchronous=True,
)

# Expose the reference filesystem as a key/value mapping rooted at "".
m = fs.get_mapper("")

# The store is opened with consolidated=False, so no consolidated metadata
# is requested.
ds = xr.open_dataset(m, engine="zarr", backend_kwargs={"consolidated": False})
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[1], line 13
5 fs = fsspec.filesystem(
6 "reference",
7 fo="references/ARG_combined.json",
(...) 10 skip_instance_cache=True,
11 )
12 m = fs.get_mapper("")
---> 13 ds = xr.open_dataset(m, engine="zarr", backend_kwargs={"consolidated": False})
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/site-packages/xarray/backends/api.py:760, in open_dataset(filename_or_obj, engine, chunks, cache, decode_cf, mask_and_scale, decode_times, decode_timedelta, use_cftime, concat_characters, decode_coords, drop_variables, create_default_indexes, inline_array, chunked_array_type, from_array_kwargs, backend_kwargs, **kwargs)
748 decoders = _resolve_decoders_kwargs(
749 decode_cf,
750 open_backend_dataset_parameters=backend.open_dataset_parameters,
(...) 756 decode_coords=decode_coords,
757 )
759 overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None)
--> 760 backend_ds = backend.open_dataset(
761 filename_or_obj,
762 drop_variables=drop_variables,
763 **decoders,
764 **kwargs,
765 )
766 ds = _dataset_from_backend_dataset(
767 backend_ds,
768 filename_or_obj,
(...) 779 **kwargs,
780 )
781 return ds
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/site-packages/xarray/backends/zarr.py:1654, in ZarrBackendEntrypoint.open_dataset(self, filename_or_obj, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta, group, mode, synchronizer, consolidated, chunk_store, storage_options, zarr_version, zarr_format, store, engine, use_zarr_fill_value_as_mask, cache_members)
1652 filename_or_obj = _normalize_path(filename_or_obj)
1653 if not store:
-> 1654 store = ZarrStore.open_group(
1655 filename_or_obj,
1656 group=group,
1657 mode=mode,
1658 synchronizer=synchronizer,
1659 consolidated=consolidated,
1660 consolidate_on_close=False,
1661 chunk_store=chunk_store,
1662 storage_options=storage_options,
1663 zarr_version=zarr_version,
1664 use_zarr_fill_value_as_mask=None,
1665 zarr_format=zarr_format,
1666 cache_members=cache_members,
1667 )
1669 store_entrypoint = StoreBackendEntrypoint()
1670 with close_on_error(store):
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/site-packages/xarray/backends/zarr.py:714, in ZarrStore.open_group(cls, store, mode, synchronizer, group, consolidated, consolidate_on_close, chunk_store, storage_options, append_dim, write_region, safe_chunks, align_chunks, zarr_version, zarr_format, use_zarr_fill_value_as_mask, write_empty, cache_members)
688 @classmethod
689 def open_group(
690 cls,
(...) 707 cache_members: bool = True,
708 ):
709 (
710 zarr_group,
711 consolidate_on_close,
712 close_store_on_close,
713 use_zarr_fill_value_as_mask,
--> 714 ) = _get_open_params(
715 store=store,
716 mode=mode,
717 synchronizer=synchronizer,
718 group=group,
719 consolidated=consolidated,
720 consolidate_on_close=consolidate_on_close,
721 chunk_store=chunk_store,
722 storage_options=storage_options,
723 zarr_version=zarr_version,
724 use_zarr_fill_value_as_mask=use_zarr_fill_value_as_mask,
725 zarr_format=zarr_format,
726 )
728 return cls(
729 zarr_group,
730 mode,
(...) 739 cache_members=cache_members,
740 )
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/site-packages/xarray/backends/zarr.py:1896, in _get_open_params(store, mode, synchronizer, group, consolidated, consolidate_on_close, chunk_store, storage_options, zarr_version, use_zarr_fill_value_as_mask, zarr_format)
1892 if _zarr_v3():
1893 # we have determined that we don't want to use consolidated metadata
1894 # so we set that to False to avoid trying to read it
1895 open_kwargs["use_consolidated"] = False
-> 1896 zarr_group = zarr.open_group(store, **open_kwargs)
1898 close_store_on_close = zarr_group.store is not store
1900 # we use this to determine how to handle fill_value
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/site-packages/zarr/api/synchronous.py:531, in open_group(store, mode, cache_attrs, synchronizer, path, chunk_store, storage_options, zarr_version, zarr_format, meta_array, attributes, use_consolidated)
454 def open_group(
455 store: StoreLike | None = None,
456 *,
(...) 467 use_consolidated: bool | str | None = None,
468 ) -> Group:
469 """Open a group using file-mode-like semantics.
470
471 Parameters
(...) 528 The new group.
529 """
530 return Group(
--> 531 sync(
532 async_api.open_group(
533 store=store,
534 mode=mode,
535 cache_attrs=cache_attrs,
536 synchronizer=synchronizer,
537 path=path,
538 chunk_store=chunk_store,
539 storage_options=storage_options,
540 zarr_version=zarr_version,
541 zarr_format=zarr_format,
542 meta_array=meta_array,
543 attributes=attributes,
544 use_consolidated=use_consolidated,
545 )
546 )
547 )
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/site-packages/zarr/core/sync.py:163, in sync(coro, loop, timeout)
160 return_result = next(iter(finished)).result()
162 if isinstance(return_result, BaseException):
--> 163 raise return_result
164 else:
165 return return_result
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/site-packages/zarr/core/sync.py:119, in _runner(coro)
114 """
115 Await a coroutine and return the result of running it. If awaiting the coroutine raises an
116 exception, the exception will be returned.
117 """
118 try:
--> 119 return await coro
120 except Exception as ex:
121 return ex
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/site-packages/zarr/api/asynchronous.py:845, in open_group(store, mode, cache_attrs, synchronizer, path, chunk_store, storage_options, zarr_version, zarr_format, meta_array, attributes, use_consolidated)
842 if chunk_store is not None:
843 warnings.warn("chunk_store is not yet implemented", ZarrRuntimeWarning, stacklevel=2)
--> 845 store_path = await make_store_path(store, mode=mode, storage_options=storage_options, path=path)
846 if attributes is None:
847 attributes = {}
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/site-packages/zarr/storage/_common.py:380, in make_store_path(store_like, path, mode, storage_options)
376 if path:
377 raise ValueError(
378 "'path' was provided but is not used for FSMap store_like objects. Specify the path when creating the FSMap instance instead."
379 )
--> 380 store = FsspecStore.from_mapper(store_like, read_only=_read_only)
381 else:
382 raise TypeError(f"Unsupported type for store_like: '{type(store_like).__name__}'")
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/site-packages/zarr/storage/_fsspec.py:204, in FsspecStore.from_mapper(cls, fs_map, read_only, allowed_exceptions)
180 @classmethod
181 def from_mapper(
182 cls,
(...) 185 allowed_exceptions: tuple[type[Exception], ...] = ALLOWED_EXCEPTIONS,
186 ) -> FsspecStore:
187 """
188 Create a FsspecStore from a FSMap object.
189
(...) 202 FsspecStore
203 """
--> 204 fs = _make_async(fs_map.fs)
205 return cls(
206 fs=fs,
207 path=fs_map.root,
208 read_only=read_only,
209 allowed_exceptions=allowed_exceptions,
210 )
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/site-packages/zarr/storage/_fsspec.py:58, in _make_async(fs)
56 fs_dict = json.loads(fs.to_json())
57 fs_dict["asynchronous"] = True
---> 58 return fsspec.AbstractFileSystem.from_json(json.dumps(fs_dict))
60 if fsspec_version < parse_version("2024.12.0"):
61 raise ImportError(
62 f"The filesystem '{fs}' is synchronous, and the required "
63 "AsyncFileSystemWrapper is not available. Upgrade fsspec to version "
64 "2024.12.0 or later to enable this functionality."
65 )
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/site-packages/fsspec/spec.py:1480, in AbstractFileSystem.from_json(blob)
1459 """
1460 Recreate a filesystem instance from JSON representation.
1461
(...) 1476 at import time.
1477 """
1478 from .json import FilesystemJSONDecoder
-> 1480 return json.loads(blob, cls=FilesystemJSONDecoder)
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/json/__init__.py:359, in loads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
357 if parse_constant is not None:
358 kw['parse_constant'] = parse_constant
--> 359 return cls(**kw).decode(s)
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/json/decoder.py:345, in JSONDecoder.decode(self, s, _w)
340 def decode(self, s, _w=WHITESPACE.match):
341 """Return the Python representation of ``s`` (a ``str`` instance
342 containing a JSON document).
343
344 """
--> 345 obj, end = self.raw_decode(s, idx=_w(s, 0).end())
346 end = _w(s, end).end()
347 if end != len(s):
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/json/decoder.py:361, in JSONDecoder.raw_decode(self, s, idx)
352 """Decode a JSON document from ``s`` (a ``str`` beginning with
353 a JSON document) and return a 2-tuple of the Python
354 representation and the index in ``s`` where the document ended.
(...) 358
359 """
360 try:
--> 361 obj, end = self.scan_once(s, idx)
362 except StopIteration as err:
363 raise JSONDecodeError("Expecting value", s, err.value) from None
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/site-packages/fsspec/json.py:97, in FilesystemJSONDecoder.custom_object_hook(self, dct)
95 if "cls" in dct:
96 if (obj_cls := self.try_resolve_fs_cls(dct)) is not None:
---> 97 return AbstractFileSystem.from_dict(dct)
98 if (obj_cls := self.try_resolve_path_cls(dct)) is not None:
99 return obj_cls(dct["str"])
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/site-packages/fsspec/spec.py:1556, in AbstractFileSystem.from_dict(dct)
1553 dct.pop("cls", None)
1554 dct.pop("protocol", None)
-> 1556 return cls(
1557 *json_decoder.unmake_serializable(dct.pop("args", ())),
1558 **json_decoder.unmake_serializable(dct),
1559 )
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/site-packages/fsspec/spec.py:81, in _Cached.__call__(cls, *args, **kwargs)
79 return cls._cache[token]
80 else:
---> 81 obj = super().__call__(*args, **kwargs)
82 # Setting _fs_token here causes some static linters to complain.
83 obj._fs_token_ = token
File ~/micromamba/envs/kerchunk-cookbook/lib/python3.13/site-packages/fsspec/implementations/reference.py:770, in ReferenceFileSystem.__init__(self, fo, target, ref_storage_args, target_protocol, target_options, remote_protocol, remote_options, fs, template_overrides, simple_templates, max_gap, max_block, cache_size, **kwargs)
768 self.fss[k] = AsyncFileSystemWrapper(f, asynchronous=self.asynchronous)
769 elif self.asynchronous ^ f.asynchronous:
--> 770 raise ValueError(
771 "Reference-FS's target filesystem must have same value "
772 "of asynchronous"
773 )
ValueError: Reference-FS's target filesystem must have same value of asynchronous
Opening Reference Dataset with Xarray and the Kerchunk Engine¶
As of writing, the latest version of Kerchunk supports opening a reference dataset with Xarray without explicitly creating an fsspec filesystem. This gives the same behavior as the example above, with a few fewer lines of code.
# Options passed to fsspec.
storage_options = dict(
    remote_protocol="s3",
    skip_instance_cache=True,
    remote_options=dict(anon=True),
)

# Options passed to xarray.
open_dataset_options = dict(chunks={})

# Open the Kerchunk reference file directly with the "kerchunk" engine,
# handing both option dictionaries to the backend.
ds = xr.open_dataset(
    "references/ARG_combined.json",
    engine="kerchunk",
    storage_options=storage_options,
    open_dataset_options=open_dataset_options,
)