"""
This module provides utilities to assist in statistics calculations related
to SMYLE analysis. Functions provide tools to perform linear detrending along
a particular axis, determine skill metrics based on model and observation
DataArrays, and generate a distribution of skill scores using a smaller
ensemble member size.
Authors
-------
- Steve Yeager
- E. Maroon
Use
---
Users wishing to utilize these tools may do so by importing
various functions, for example:
::
from esp-tools.utils.stat_utils import cor_ci_bootyears
Dependencies
------------
The user must have an activated conda environment which includes
xarray, numpy, sys, cftime, and xskillscore.
"""
import xarray as xr
import numpy as np
import sys
import cftime
import xskillscore as xs
[docs]def cor_ci_bootyears(ts1, ts2, seed=None, nboots=1000, conf=95):
"""
Determine confidence intervals for correlation scores.
Parameters
----------
ts1 : array
ts2 : array
seed : int (optional)
seed for random number generation, default None
nboots : int
number boots (optional, default 1000)
conf : float (optional)
confidence value; defaults to 95
Returns
-------
minci : float
minimum confidence interval
maxci : float
maximum confidence interval
"""
# calculate min and max percentile
ptilemin = (100. - conf) / 2.
ptilemax = conf + (100 - conf) / 2.
# ensure that the arrays have the same size
if (ts1.size != ts2.size):
print("The two arrays must have the same size")
sys.exit()
# if provided, use a particular seed for random number generation
if (seed):
np.random.seed(seed)
# retreive uniform random number using sample size
samplesize = ts1.size
ranu = np.random.uniform(0, samplesize, nboots * samplesize)
ranu = np.floor(ranu).astype(int)
bootdat1 = np.array(ts1[ranu])
bootdat2 = np.array(ts2[ranu])
bootdat1 = bootdat1.reshape([samplesize, nboots])
bootdat2 = bootdat2.reshape([samplesize, nboots])
# compute the Pearson correlation coefficient between datasets
bootcor = xr.corr(xr.DataArray(bootdat1),
xr.DataArray(bootdat2),
dim='dim_0')
# determine minimum and maximum confidence intervals
minci = np.percentile(bootcor, ptilemin)
maxci = np.percentile(bootcor, ptilemax)
return minci, maxci
[docs]def detrend_linear(dat, dim):
"""
Linear detrend dat along the axis dim.
Parameters
----------
dat : array
data which is to be detrended
dim : str
dimension along which linear detrending is performed
Returns
-------
dat : array
detrended array
"""
# determine parameters
params = dat.polyfit(dim=dim, deg=1)
# determine fit
fit = xr.polyval(dat[dim], params.polyfit_coefficients)
# linearly detrend data
dat = dat - fit
return dat
[docs]def leadtime_skill_seas(mod_da, mod_time, obs_da, detrend=False):
"""
Computes a suite of deterministic skill metrics given two DataArrays
corresponding to model and observations, which must share the same
lat/lon coordinates (if any). Assumes time coordinates are compatible
(can be aligned). Both DataArrays should represent 3-month seasonal
averages (DJF, MAM, JJA, SON).
Parameters
----------
mod_da: DataArray
a seasonally-averaged hindcast DataArray dimensioned (Y,L,M,...)
mod_time: DataArray
a hindcast time DataArray dimensioned (Y,L).
note: assumes mod_time.dt.month
obs_da: DataArray
an OBS DataArray dimensioned (season,year,...)
detrend (optional): bool
defaults to False; if True, skill scores computed after detrending
Returns
-------
xr_dataset : DataArray
the mid-month of a 3-month seasonal average (e.g., mon=1 ==> "DJF").
"""
# default seasons
seasons = {1: 'DJF', 4: 'MAM', 7: 'JJA', 10: 'SON'}
corr_list = []
pval_list = []
rmse_list = []
msss_list = []
rpc_list = []
# convert L to leadtime values:
leadtime = mod_da.L - 2
for i in mod_da.L.values:
# adjust ensemble time to correct format
ens_ts = mod_da.sel(L=i).rename({'Y': 'time'})
ens_time_year = mod_time.sel(L=i).dt.year.data
ens_time_month = mod_time.sel(L=i).dt.month.data[0]
obs_ts = obs_da.sel(season=seasons[ens_time_month]).rename({'year': 'time'})
ens_ts = ens_ts.assign_coords(time=("time", ens_time_year))
a, b = xr.align(ens_ts, obs_ts)
# perform linear detrending if detrend is set to True
if detrend:
a = detrend_linear(a, 'time')
b = detrend_linear(b, 'time')
# calculate statistics
amean = a.mean('M')
sigobs = b.std('time')
sigsig = amean.std('time')
sigtot = a.std('time').mean('M')
# compute Pearson's correlation coefficient
r = xs.pearson_r(amean, b, dim='time')
rpc = r / (sigsig / sigtot)
# append skill metrics to relevant lists
corr_list.append(r)
rpc_list.append(rpc.where(r > 0))
rmse_list.append(xs.rmse(amean, b, dim='time') / sigobs)
msss_list.append(1 - (xs.mse(amean, b, dim='time') / b.var('time')))
pval_list.append(xs.pearson_r_eff_p_value(amean, b, dim='time'))
# concatenate various lists along leadtime dimension
corr = xr.concat(corr_list, leadtime)
pval = xr.concat(pval_list, leadtime)
rmse = xr.concat(rmse_list, leadtime)
msss = xr.concat(msss_list, leadtime)
rpc = xr.concat(rpc_list, leadtime)
# create xarray dataset from lists
xr_dataset = xr.Dataset({'corr': corr, 'pval': pval, 'nrmse': rmse,
'msss': msss, 'rpc': rpc})
return xr_dataset
[docs]def leadtime_skill_seas_resamp(mod_da, mod_time, obs_da, sampsize, N, detrend=False):
"""
Computes a suite of deterministic skill metrics given two DataArrays
corresponding to model and observations, which must share the same
lat/lon coordinates (if any). Assumes time coordinates are compatible
(can be aligned). Both DataArrays should represent 3-month seasonal
averages (DJF, MAM, JJA, SON).
Unlike leadtime_skill_seas(), this version resamples the
mod_da member dimension (M) to generate a distribution of skill scores
using a smaller ensemble size (N, where N<M). Returns the mean of the
resampled skill score distribution.
Parameters
----------
mod_da: DataArray
a seasonally-averaged hindcast DataArray dimensioned (Y,L,M,...)
mod_time: DataArray
a hindcast time DataArray dimensioned (Y,L). Assumes mod_time.dt.month
obs_da: DataArray
an OBS DataArray dimensioned (season,year,...)
sampsize : int
sample size
N : int
maximum dimension for resampling
detrend : bool (optional)
defaults to False; if set to True, skill scores will be computed after detrending
Returns
-------
dsout : xarray
mean of resampled skill score distribution
"""
dslist = []
# default seasons
seasons = {1: 'DJF', 4: 'MAM', 7: 'JJA', 10: 'SON'}
# convert L to leadtime values:
leadtime = mod_da.L - 2
# Perform resampling
if (not N < mod_da.M.size):
raise ValueError('ERROR: expecting resampled ensemble size to be less than original')
mod_da_r = xs.resample_iterations(mod_da.chunk(), sampsize, 'M', dim_max=N)
for l in mod_da_r.iteration.values:
# create lists for skill metrics
corr_list = []
pval_list = []
rmse_list = []
msss_list = []
rpc_list = []
# loop through leadtime values
for i in mod_da.L.values:
# adjust ensemble time to correct format
ens_ts = mod_da_r.sel(iteration=l).sel(L=i).rename({'Y': 'time'})
ens_time_year = mod_time.sel(L=i).dt.year.data
ens_time_month = mod_time.sel(L=i).dt.month.data[0]
obs_ts = obs_da.sel(season=seasons[ens_time_month]).rename({'year': 'time'})
ens_ts = ens_ts.assign_coords(time=("time", ens_time_year))
a, b = xr.align(ens_ts, obs_ts)
# perform linear detrending if detrend is set to True
if detrend:
a = detrend_linear(a, 'time')
b = detrend_linear(b, 'time')
# calculate statistics
amean = a.mean('M')
sigobs = b.std('time')
sigsig = amean.std('time')
sigtot = a.std('time').mean('M')
# compute Pearson's correlation coefficient
r = xs.pearson_r(amean, b, dim='time')
rpc = r / (sigsig / sigtot)
# append skill metrics to relevant lists
corr_list.append(r)
rpc_list.append(rpc.where(r > 0))
rmse_list.append(xs.rmse(amean, b, dim='time') / sigobs)
msss_list.append(1 - (xs.mse(amean, b, dim='time') / b.var('time')))
pval_list.append(xs.pearson_r_eff_p_value(amean, b, dim='time'))
# concatenate various lists along leadtime dimension
corr = xr.concat(corr_list, leadtime)
pval = xr.concat(pval_list, leadtime)
rmse = xr.concat(rmse_list, leadtime)
msss = xr.concat(msss_list, leadtime)
rpc = xr.concat(rpc_list, leadtime)
# create xarray dataset from lists and append to dslist
dslist.append(xr.Dataset({'corr': corr, 'pval': pval, 'rmse': rmse,
'msss': msss, 'rpc': rpc}))
# concatenate dslist along iteration dimension
dsout = xr.concat(dslist, dim='iteration').mean('iteration').compute()
return dsout
[docs]def remove_drift(da, da_time, y1, y2):
"""
Function to convert raw DP DataArray into anomaly DP DataArray
with leadtime-dependent climatology removed.
Author
------
E. Maroon (modified by S. Yeager)
Parameters
----------
da : DP DataArray
Raw DP DataArray with dimensions (Y,L,M,...)
da_time : DP DataArray
Verification time of DP DataArray (Y,L)
y1 : int
Start year of climatology
y2 : int
End year of climatology
Returns
-------
da_anom : DP DataArray
De-drifted DP DataArray
da_climo : DP DataArray
Leadtime-dependent climatology
"""
# gather first and last second of first and last year, respectively
d1 = cftime.DatetimeNoLeap(y1, 1, 1, 0, 0, 0)
d2 = cftime.DatetimeNoLeap(y2, 12, 31, 23, 59, 59)
# mask data array outside of selected time
masked_period = da.where((da_time > d1) & (da_time < d2))
da_climo = masked_period.mean('M').mean('Y')
# De-drifted DP data array is data array with
# leadtime-dependent climotology subtracted
da_anom = da - da_climo
return da_anom, da_climo