Source code for caloutils.distances.ecdf

"""
ecdf: Calculate an approximation to the cumulative distribution function (CDF).
ecdf_distance: Calculate a distance metric between two 1D samples using ecdf.
calc_ecdf_dist: Convenience function to run ecdf_distance over multiple dimensions and convert the result to a numpy array.
"""

from typing import Optional

import numpy as np
import torch


def ecdf(
    arr: torch.Tensor, weights: Optional[torch.Tensor] = None
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
    """
    Build a normalised cumulative curve from the sorted values of ``arr``.

    The values are sorted in ascending order along the last dimension,
    accumulated with a running sum, and divided by the final running sum so
    that every curve ends at 1. ``weights`` do not enter the curve itself;
    they are only reordered to follow the sorted data points.

    Args:
        arr (torch.Tensor): Input tensor containing data points. The last
            dimension is the one that is sorted and accumulated.
        weights (Optional[torch.Tensor]): Weights for each data point (optional).

    Returns:
        tuple[torch.Tensor, Optional[torch.Tensor]]: The normalised cumulative
        curve and the weights reordered to match the sort (``None`` if no
        weights were provided).

    Note:
        If the values along the last dimension sum to zero, the division
        produces ``nan``/``inf`` entries.
    """
    val, sortidx = arr.sort()
    cdf = val.cumsum(-1)
    # Normalise along the same (last) dimension that was sorted and summed.
    # Indexing with ``[-1]`` would pick the last *row* for batched input;
    # ``[..., -1:]`` keeps a broadcastable per-row total and is identical for 1D.
    cdf = cdf / cdf[..., -1:]

    if weights is None:
        return cdf, None
    if weights.shape == arr.shape:
        # Same-shaped weights: reorder along the sorted dimension.
        # For 1D input this is equivalent to ``weights[sortidx]``.
        return cdf, weights.gather(-1, sortidx)
    # Differently shaped weights: keep plain index-based reordering.
    return cdf, weights[sortidx]
def ecdf_distance(
    r: torch.Tensor,
    f: torch.Tensor,
    rw: Optional[torch.Tensor] = None,
    fw: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """
    Calculate a distance metric between two 1D samples using the ecdf function.

    Both samples are turned into cumulative curves with ``ecdf``; the distance
    is the mean (over the first dimension) of the absolute difference between
    the two curves. When weights are supplied for both samples, each
    difference is scaled by the product of the sorted weights, normalised to
    sum to one, before taking the mean.

    Args:
        r (torch.Tensor): Reference sample.
        f (torch.Tensor): Comparison sample.
        rw (Optional[torch.Tensor]): Weights for the reference sample (optional).
        fw (Optional[torch.Tensor]): Weights for the comparison sample (optional).

    Returns:
        torch.Tensor: Calculated distance metric between the two samples.

    Note:
        Weighting is applied only when both ``rw`` and ``fw`` are given. A
        single weight tensor on its own is ignored.
    """
    real_cdf, w_real = ecdf(r, rw)
    cdf_fake, w_fake = ecdf(f, fw)
    dist = cdf_fake - real_cdf

    # Both sorted weight tensors are needed for the product below; guarding on
    # only one of them would end up multiplying a tensor by ``None``.
    if w_real is not None and w_fake is not None:
        ww = w_fake * w_real
        # Out-of-place arithmetic so integer-typed weights are promoted to
        # float instead of failing on an in-place division.
        ww = ww / ww.sum()
        dist = dist * ww

    return dist.abs().mean(0)
def calc_ecdf_dist(
    r: torch.Tensor,
    f: torch.Tensor,
    rw: Optional[torch.Tensor] = None,
    fw: Optional[torch.Tensor] = None,
) -> np.ndarray:
    """
    Calculate distance metrics between two samples over multiple dimensions
    using ecdf_distance and convert to a numpy array.

    The last dimension of ``r`` and ``f`` is iterated over; for every index
    along it the corresponding slices are compared with ``ecdf_distance``.
    The same ``rw`` and ``fw`` are passed unchanged to every comparison.

    Args:
        r (torch.Tensor): Reference sample.
        f (torch.Tensor): Comparison sample; must have the same shape as ``r``.
        rw (Optional[torch.Tensor]): Weights for the reference sample (optional).
        fw (Optional[torch.Tensor]): Weights for the comparison sample (optional).

    Returns:
        np.ndarray: Array of distance metrics calculated for each dimension,
        stacked along axis 0.

    Raises:
        AssertionError: If the shapes of ``r`` and ``f`` differ, or if only
            one of ``rw`` / ``fw`` is provided.
    """
    assert r.shape == f.shape, "r and f must have the same shape"
    assert (rw is None) == (
        fw is None
    ), "rw and fw must be provided together or not at all"

    dists = [
        ecdf_distance(r[..., iftx], f[..., iftx], rw, fw)
        for iftx in range(r.shape[-1])
    ]
    # Detach before converting: ``Tensor.numpy()`` refuses tensors that are
    # part of an autograd graph, which happens whenever an input requires grad.
    return torch.stack(dists, dim=0).detach().cpu().numpy()