Overview¶
This notebook sets up a simple CNN-LSTM model to predict event-wise hurricane intensity using the preprocessed data from the era5_preprocessing.ipynb notebook.
- Data Preparation: Load and preprocess the data.
- Model Definition: Define a CNN-LSTM model.
- Model Training: Train the model on the prepared data.
- Model Evaluation: Evaluate the model’s performance on test data.
Imports¶
import numpy as np
import pandas as pd
import xarray as xr
import tensorflow as tf
import visualkeras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Dense, Flatten, TimeDistributed, LSTM
Load the hurricane-wise variable dataset¶
In this section, we load the preprocessed dataset containing hurricane-wise environmental variables. This dataset is essential for training our CNN-LSTM model to predict hurricane intensity.
# load the preprocessed dataset
model_input = xr.open_dataset('../test_folder/input_predictands.nc')
model_input
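Note: open_dataset needs a NetCDF IO backend such as netcdf4 or h5netcdf; if xarray reports that none of its installed backends match the file, installing one and selecting it explicitly via the engine argument is a straightforward fix. A minimal sketch, assuming h5netcdf is installed in the environment:
# explicitly select an installed NetCDF backend (assumes h5netcdf is available)
model_input = xr.open_dataset('../test_folder/input_predictands.nc', engine='h5netcdf')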
Input data preprocessing steps¶
- NaN and padded values are set to zero
- The data is split into training and test sets (80/20)
- Features and targets are normalized with MinMaxScaler
- Events are randomly shuffled for better generalization
# replace NaN (and padded) values with zero
model_input = model_input.fillna(0)
# select the predictors (X) and stack them along a new 'variable' dimension
X_data = model_input[['u','v','vo','speed_shear','sp','r','cor_params']].to_array(dim='variable')
print(f'Dimensions are , features: {X_data.shape[0]}, Event: {X_data.shape[1]}, time(lead): {X_data.shape[2]}, lat: {X_data.shape[3]}, lon: {X_data.shape[4]}')
X_data = X_data.transpose('id', 'lead', 'y','x','variable')
print(f'X_data dimensions are: Event: {X_data.shape[0]}, time(lead): {X_data.shape[1]}, lat: {X_data.shape[2]}, lon: {X_data.shape[3]}, features: {X_data.shape[4]}')
# select the target variable (y)
Y_data = model_input['target']
# (left disabled) expand Y_data with a trailing dimension to match the model's per-lead output
#Y_data = np.expand_dims(Y_data, axis=-1)
print(f'Target dimensions are: Event: {Y_data.shape[0]}, time(lead): {Y_data.shape[1]}')
## 80% train / 20% test split
## randomly shuffle the events and split the data into training and testing sets
X_train , X_test , Y_train, Y_test = train_test_split(X_data, Y_data, test_size=0.2, random_state=1)
X_train_data = X_train.values
X_test_data = X_test.values
Y_train_data = Y_train.values
Y_test_data = Y_test.values
# fit separate MinMax scalers for the features and targets of each split
x_train_scaler = MinMaxScaler()
x_test_scaler = MinMaxScaler()
y_train_scaler = MinMaxScaler()
y_test_scaler = MinMaxScaler()
# flatten to 2D for scaling, then restore the original array shapes
X_train_scaled = x_train_scaler.fit_transform(X_train_data.reshape(-1, X_train_data.shape[-1])).reshape(X_train_data.shape)
Y_train_scaled = y_train_scaler.fit_transform(Y_train_data.reshape(-1, 1)).reshape(Y_train_data.shape)
X_test_scaled = x_test_scaler.fit_transform(X_test_data.reshape(-1, X_test_data.shape[-1])).reshape(X_test_data.shape)
Y_test_scaled = y_test_scaler.fit_transform(Y_test_data.reshape(-1, 1)).reshape(Y_test_data.shape)
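As a quick sanity check (not part of the original workflow), the scaled arrays should now lie in the [0, 1] range:
# sanity check: MinMaxScaler should map the scaled features and targets into [0, 1]
print(f'X_train_scaled range: {X_train_scaled.min():.3f} to {X_train_scaled.max():.3f}')
print(f'Y_train_scaled range: {Y_train_scaled.min():.3f} to {Y_train_scaled.max():.3f}')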
def masked_mse(y_true, y_pred):
    # treat zeros in y_true as padding and exclude them from the loss
    mask = tf.cast(tf.not_equal(y_true, 0.0), tf.float32)
    squared_error = tf.square(y_true - y_pred)
    # average the squared error over the unmasked entries only (epsilon avoids division by zero)
    masked_loss = tf.reduce_sum(squared_error * mask) / (tf.reduce_sum(mask) + 1e-6)
    return masked_loss
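To illustrate how the masked loss behaves, the toy tensors below (made-up values, purely illustrative) show that padded zero targets do not contribute to the result:
# illustrative check: zero entries in y_true are excluded from the loss
y_true_demo = tf.constant([[1.0, 0.5, 0.0, 0.0]])   # trailing zeros represent padding
y_pred_demo = tf.constant([[0.9, 0.4, 0.7, 0.3]])   # predictions at padded steps are ignored
print(masked_mse(y_true_demo, y_pred_demo).numpy())  # averages over the two valid steps only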
# CNN-LSTM: a TimeDistributed CNN extracts spatial features at every lead time,
# and an LSTM models their temporal evolution
model = Sequential()
model.add(TimeDistributed(
    Conv2D(16, (3, 3), activation='relu', padding='same'),
    input_shape=(140, 5, 5, 7)   # (lead, lat, lon, features)
))
model.add(TimeDistributed(Flatten()))
model.add(LSTM(64, return_sequences=True))   # return the full sequence so intensity is predicted at every lead time
model.add(TimeDistributed(Dense(1)))
model.summary()
visualkeras.layered_view(model,scale_xy=0.6)
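Before training, it is worth confirming that the scaled arrays match the (lead, lat, lon, features) input shape the model was built for; a small check (the event count depends on the dataset):
# confirm the training arrays match the model's expected per-event shapes
print(X_train_scaled.shape)   # expected: (n_events, 140, 5, 5, 7)
print(Y_train_scaled.shape)   # expected: (n_events, 140)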
model.compile(optimizer='adam', loss=masked_mse, metrics=['mae'])
model.fit(X_train_scaled, Y_train_scaled, epochs=100, batch_size=32, validation_split=0.2)
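Optionally, training can be stopped once the validation loss stops improving; this is not part of the original run, just a possible variant using Keras's built-in EarlyStopping callback:
# optional variant: stop training early when the validation loss plateaus
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
model.fit(X_train_scaled, Y_train_scaled, epochs=100, batch_size=32,
          validation_split=0.2, callbacks=[early_stop])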
predict_x = model.predict(X_test_scaled)
def invert_add_meta(data, scaler, xr_data):
    # invert the MinMax scaling, then re-attach coordinates and dims from the reference DataArray
    data = scaler.inverse_transform(data.reshape(-1, 1)).reshape(xr_data.shape)
    added_meta = xr.DataArray(data, coords=xr_data.coords, dims=xr_data.dims)
    return added_meta
predicted_wind_speed = invert_add_meta(predict_x, y_test_scaler, Y_test)
observed_wind_speeds = Y_test
final_dset = predicted_wind_speed.to_dataset(name='predicted_wind_speed')
final_dset['observed_wind_speed'] = observed_wind_speeds
final_dft = final_dset.to_dataframe().reset_index()
# set 0.0 (padding) to NaN in observed_wind_speed
final_dft['observed_wind_speed'] = final_dft['observed_wind_speed'].replace(0.0, np.nan)
# drop every row that contains a NaN so only valid lead times remain
final_dft = final_dft.dropna(how='any', axis=0)
final_pivot_col = final_dft.drop(columns=['time','level'])
final_pivot_col
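Beyond the visual comparison below, a simple quantitative summary of test-set skill can be computed from the same dataframe (a minimal sketch using the columns created above):
# overall error statistics across all valid test lead times
errors = final_pivot_col['predicted_wind_speed'] - final_pivot_col['observed_wind_speed']
print(f'MAE:  {errors.abs().mean():.2f} m/s')
print(f'RMSE: {np.sqrt((errors ** 2).mean()):.2f} m/s')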
import seaborn as sns
import matplotlib.pyplot as plt
# Define two different palettes
palette1 = sns.color_palette("Reds") # for predicted
palette2 = sns.color_palette("Blues") # for observed
fig,ax = plt.subplots(figsize=(12, 6))
# First lineplot (predicted) with palette1
sns.lineplot(
data=final_pivot_col,
x='lead',
y='predicted_wind_speed',
hue='id',
style='id',
markers=True,
dashes=False,
palette=palette1,
    legend='brief',
    ax=ax)
# Second lineplot (observed) with palette2
sns.lineplot(
data=final_pivot_col,
x='lead',
y='observed_wind_speed',
hue='id',
style='id',
markers=True,
dashes=False,
palette=palette2,
    legend='brief',
    ax=ax)
# hide the crowded per-event legend
ax.legend([], [], frameon=False)
# add H to the end of xticks
xticks = ax.get_xticks()
ax.set_xticks(xticks)
ax.set_xticklabels([f'{int(tick)}H' for tick in xticks])
ax.set_xlim(0, 500)
ax.set_xlabel('Lead (Hours)',fontsize=15)
ax.set_title('Predicted (Reds) vs Observed Wind Speed (Blues)',fontsize=18)
ax.set_ylabel('Wind Speed (m/s)',fontsize=15)