
Complex Searching with intake and Analysis Using xarray

ESGF logo


Overview

In this tutorial we will search for multiple historical simulations (as an example) using intake, load them into a dictionary of xarray datasets, and plot simple area-averaged time series over a specific region.

Imports

import warnings
import intake
from distributed import Client
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
import dask
xr.set_options(display_style='html')
warnings.filterwarnings("ignore")
# Open the Pangeo CMIP6 intake-esm catalog hosted on Google Cloud Storage
cat_url = "https://storage.googleapis.com/cmip6/pangeo-cmip6.json"
col = intake.open_esm_datastore(cat_url)
col
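Note that Client is imported above but never started. If you want Dask to parallelize the lazy zarr reads and the reductions later on, you can create a local cluster first. This is a minimal sketch; the worker and memory settings are illustrative assumptions, so adjust them to your machine.

# Start a local Dask cluster and connect a client to it
client = Client(n_workers=2, threads_per_worker=2, memory_limit="4GB")
client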
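Before searching, it helps to see what the catalog contains. The following is a small sketch that looks at the catalog's underlying DataFrame and the unique values per column (the exact return type of unique() varies slightly across intake-esm versions).

# First few rows of the catalog's underlying pandas DataFrame
print(col.df.head())

# Unique values per column, useful for building the search query below
print(col.unique())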
cat = col.search(
    experiment_id=["historical"],
    variable_id=["tas"],
    member_id=["r1i1p1f1"],
    table_id=["Amon"],
    source_id=["CMCC-ESM2", "CanESM5", "CESM2", "CESM2-FV2"],
)
cat.df
# Load the matching datasets into a dictionary of xarray Datasets (lazy reads from zarr)
dset_dict = cat.to_dataset_dict(zarr_kwargs={'consolidated': True})
list(dset_dict.keys())
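The dictionary keys are built from the catalog's aggregation attributes; for the Pangeo CMIP6 catalog this is typically activity_id.institution_id.source_id.experiment_id.table_id.grid_label, which is why source_id is recovered later with key.split('.')[2]. A quick check:

example_key = list(dset_dict.keys())[0]
print(example_key)
# The third dot-separated field should be the source_id (model name)
print("source_id:", example_key.split(".")[2])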
ds = {}

for key in dset_dict.keys():
    # Sort the dataset by time
    sorted_dataset = dset_dict[key].sortby("time")
    
    # Subset data for years 1900-2000
    ds[key] = sorted_dataset.sel(time=slice("1900", "2000"))
        
    # Optional: Print a message indicating dataset processing
    print(f"Processing dataset: {key}")

ds now contains a time-subset (1900-2000) dataset for each key in dset_dict

Let’s check ds

ds
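To confirm the time subsetting, a quick loop over the dictionary prints each dataset's dimension sizes and the span of its time axis (a small check, using only the ds built above):

for key, d in ds.items():
    # Dimension sizes plus the first and last time stamps of the 1900-2000 subset
    print(key, dict(d.sizes), d.time.values[0], d.time.values[-1])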

Calculating the regional mean for each dataset and visualizing the time series

regn_mean = {}
for key in dset_dict.keys():
    # Subset a box over lon 65-100E, lat 5-25N and average over lon/lat
    regn_mean[key] = (
        ds[key]["tas"]
        .sel(lon=slice(65, 100), lat=slice(5, 25))
        .mean(dim=["lon", "lat"])
        .squeeze()
    )
regn_mean
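A plain mean over lon and lat weights every grid cell equally. For a more physically meaningful area average you can weight by the cosine of latitude. This is an optional refinement, not part of the workflow above; here is a minimal sketch using xarray's weighted reductions:

weighted_mean = {}
for key in ds.keys():
    da = ds[key]["tas"].sel(lon=slice(65, 100), lat=slice(5, 25))
    # cos(latitude) approximates the relative area of each grid cell
    weights = np.cos(np.deg2rad(da.lat))
    weighted_mean[key] = da.weighted(weights).mean(dim=["lon", "lat"]).squeeze()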
plt.rcParams['figure.figsize'] = [15, 4]
plt.rcParams['lines.linewidth'] = 2
plt.rcParams['font.size'] = 10
plt.rcParams['font.weight'] = 'bold'

Visualizing the regional mean for each dataset

for key, regm in regn_mean.items():
    source_id = key.split('.')[2]
    regm.plot(label=source_id)
    plt.title(f"Mean Surface Air Temperature for {source_id}")
    plt.xlabel('Time')
    plt.ylabel('Temperature (K)')
    plt.legend()
    plt.show()

Calculating annual mean for each dataset and visualizing time series

annual_mean = {}
for key, regm in regn_mean.items():
    # Resample the monthly regional-mean series to annual (calendar-year) means
    annual_mean[key] = regm.resample(time='Y').mean()
annual_mean
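resample(time='Y') relies on a pandas-style frequency alias, and the accepted spelling has changed across pandas/xarray versions ('Y' vs 'YE'). An equivalent approach that sidesteps the alias entirely is to group by the year component of the time coordinate; a small sketch:

annual_mean_gb = {}
for key, regm in regn_mean.items():
    # Average the monthly values within each calendar year
    annual_mean_gb[key] = regm.groupby("time.year").mean()

Note that the result is indexed by an integer year coordinate rather than a datetime time coordinate, so the plotting calls below would plot against year instead.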

Visualizing the regional annual mean for each dataset

for key, anmn in annual_mean.items():
    source_id = key.split('.')[2]
    anmn.plot(label=source_id)
    plt.title(f"Mean Annual Surface Air Temperature for {source_id}")
    plt.xlabel('Time')
    plt.ylabel('Temperature (K)')
    plt.legend()
    plt.show()

Visualizing the regional annual mean for each dataset in a single panel

# Create the plot
plt.figure(figsize=(12, 6))

# Plotting the annual mean for each dataset on the same plot
for key, annm in annual_mean.items():
    source_id = key.split('.')[2]
    annm.plot(label=source_id)

plt.title("Annual Mean Surface Air Temperature (Regional)")
plt.xlabel('Time')
plt.ylabel('Temperature (K)')
plt.legend()
plt.show()