Complex Searching with intake
and analysing employing xarray
¶
Overview¶
This tutorial we will present access multiple historical (as an example here) data available and analyze using intake
. Put them in a dictionary format employing xarray
and plotting simple area average time series over a specific region.
Imports¶
import warnings
import intake
from distributed import Client
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
import dask
xr.set_options(display_style='html')
warnings.filterwarnings("ignore")
cat_url = "https://storage.googleapis.com/cmip6/pangeo-cmip6.json"
col = intake.open_esm_datastore(cat_url)
col
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In[2], line 2
1 cat_url = "https://storage.googleapis.com/cmip6/pangeo-cmip6.json"
----> 2 col = intake.open_esm_datastore(cat_url)
3 col
File ~/micromamba/envs/esgf-cookbook-dev/lib/python3.11/site-packages/intake_esm/core.py:124, in esm_datastore.__init__(self, obj, progressbar, sep, registry, read_csv_kwargs, columns_with_iterables, storage_options, threaded, **intake_kwargs)
122 self.esmcat = ESMCatalogModel.from_dict(obj)
123 else:
--> 124 self.esmcat = ESMCatalogModel.load(
125 obj, storage_options=self.storage_options, read_csv_kwargs=read_csv_kwargs
126 )
128 self.derivedcat = registry or default_registry
129 self._entries = {}
File ~/micromamba/envs/esgf-cookbook-dev/lib/python3.11/site-packages/intake_esm/cat.py:255, in ESMCatalogModel.load(cls, json_file, storage_options, read_csv_kwargs)
253 if 'last_updated' not in data:
254 data['last_updated'] = None
--> 255 cat = cls.model_validate(data)
256 if cat.catalog_file:
257 cat._frames = cat._df_from_file(cat, _mapper, storage_options, read_csv_kwargs)
[... skipping hidden 1 frame]
File ~/micromamba/envs/esgf-cookbook-dev/lib/python3.11/site-packages/intake_esm/cat.py:76, in Assets._validate_data_format(cls, model)
74 @pydantic.model_validator(mode='after')
75 def _validate_data_format(cls, model):
---> 76 data_format, format_column_name = model.format, model.format_column_name
77 if data_format is not None and format_column_name is not None:
78 raise ValueError('Cannot set both format and format_column_name')
AttributeError: 'pydantic_core._pydantic_core.ValidationInfo' object has no attribute 'format'
cat = col.search(experiment_id=["historical"],
variable_id = ["tas"],
member_id = ["r1i1p1f1"],
table_id = ["Amon",],
source_id = [ "CMCC-ESM2", "CanESM5", "CESM2", "CESM2-FV2", ]
)
cat.df
dset_dict = cat.to_dataset_dict(zarr_kwargs={'consolidated': True})
list(dset_dict.keys())
ds = {}
for key in dset_dict.keys():
# Sort the dataset by time
sorted_dataset = dset_dict[key].sortby("time")
# Subset data for years 1900-2000
ds[key] = sorted_dataset.sel(time=slice("1900", "2000"))
# Optional: Print a message indicating dataset processing
print(f"Processing dataset: {key}")
ds
now contains subset of datasets for each key in dset_dict
Let’s check ds
ds
Calculate regional mean for each dataset and visualizing time series¶
regn_mean = {}
for key in dset_dict.keys():
regn_mean[key] = ds[key]['tas'].sel(lon=slice(65, 100), lat=slice(5, 25)).mean(dim=['lon', 'lat']).squeeze()
regn_mean
plt.rcParams['figure.figsize'] = [15, 4]
plt.rcParams['lines.linewidth'] = 2
plt.rcParams['font.size'] = 10
plt.rcParams['font.weight'] = 'bold'
Visualizing the regional mean for each dataset¶
for key, regm in regn_mean.items():
source_id = key.split('.')[2]
regm.plot(label=source_id)
plt.title(f"Mean Surface Air Temperature for {source_id}")
plt.xlabel('Time')
plt.ylabel('Temperature (K)')
plt.legend()
plt.show()
Calculating annual mean for each dataset and visualizing time series¶
annual_mean = {}
for key, regm in regn_mean.items():
annual_mean[key] = regm.resample(time='Y').mean()
annual_mean
Visualizing the regional annual mean for each dataset¶
for key, anmn in annual_mean.items():
source_id = key.split('.')[2]
anmn.plot(label=source_id)
plt.title(f"Mean Annual Surface Air Temperature for {source_id}")
plt.xlabel('Time')
plt.ylabel('Temperature (K)')
plt.legend()
plt.show()
Visualizing the regional annual mean for each dataset in a single panel¶
# Create the plot
plt.figure(figsize=(12, 6))
# Plotting the annual mean for each dataset on the same plot
for key, annm in annual_mean.items():
source_id = key.split('.')[2]
annm.plot(label=source_id)
plt.title("Annual Mean Surface Air Temperature (Regional)")
plt.xlabel('Time')
plt.ylabel('Temperature (K)')
plt.legend()
plt.show()