Source code for utils.stats

"""
.. module: stats

Created on 03/07/18

:author: Lukas Malina

Provides statistical methods to compute:
    various weighted averages along specified axis and their errors
    unbiased error estimator of infinite normal distribution from finite-sized sample

TODO use weighted average and its error in circular calculations
TODO write tests
TODO LOGGER or Raising error and warnings?
TODO if zeros or nans occur in errors, fallback to uniform weights only in affected cases

"""
import numpy as np
from scipy.special import erf
from scipy.stats import t

PI2 = 2 * np.pi
PI2I = PI2 * 1j
CONFIDENCE_LEVEL = (1 + erf(1 / np.sqrt(2))) / 2


[docs]def circular_mean(data, period=PI2, errors=None, axis=None):
    """
    Computes weighted circular average along the specified axis.

    Parameters:
        data: array-like
            Contains the data to be averaged
        period: scalar, optional, default (2 * np.pi)
            Periodicity period of data
        errors: array-like, optional
            Contains errors associated with the values in data, it is used to calculated weights
        axis: int or tuple of ints, optional
            Axis or axes along which to average data

    Returns:
        Returns the weighted circular average along the specified axis.
    """

    phases = data * PI2I / period
    weights = weights_from_errors(errors, period=period)

    return np.angle(np.average(np.exp(phases), axis=axis, weights=weights)) * period / PI2


[docs]def circular_error(data, period=PI2, errors=None, axis=None, t_value_corr=True):
    """
    Computes error of weighted circular average along the specified axis.

    Parameters:
        data: array-like
            Contains the data to be averaged
        period: scalar, optional
            Periodicity period of data, default is (2 * np.pi)
        errors: array-like, optional
            Contains errors associated with the values in data, it is used to calculated weights
        axis: int or tuple of ints, optional
            Axis or axes along which to average data
        t_value_corr: bool, optional
            Species if the error is corrected for small sample size, default True

    Returns:
        Returns the error of weighted circular average along the specified axis.
        """
    phases = data * PI2I / period
    weights = weights_from_errors(errors, period=period)
    complex_phases = np.exp(phases)
    complex_average = np.average(complex_phases, axis=axis, weights=weights)

    (sample_variance, sum_of_weights) = np.average(np.square(np.abs(complex_phases - complex_average.reshape(_get_shape(complex_phases.shape, axis)))),
                                                   weights=weights, axis=axis, returned=True)
    if weights is not None:
        sample_variance = sample_variance + 1. / sum_of_weights
    error_of_complex_average = np.sqrt(sample_variance * unbias_variance(data, weights, axis=axis))
    phase_error = error_of_complex_average / np.abs(complex_average)
    if t_value_corr:
        phase_error = phase_error * t_value_correction(effective_sample_size(data, weights, axis=axis))
    return np.where(phase_error > 0.25 * PI2, 0.3 * period, phase_error * period / PI2)


[docs]def weighted_mean(data, errors=None, axis=None):
    """
    Computes weighted average along the specified axis.

    Parameters:
        data: array-like
            Contains the data to be averaged
        errors: array-like, optional
            Contains errors associated with the values in data, it is used to calculated weights
        axis: int or tuple of ints, optional
            Axis or axes along which to average data

    Returns:
        Returns the weighted average along the specified axis.
    """
    weights = weights_from_errors(errors)
    return np.average(data, axis=axis, weights=weights)


def _get_shape(orig_shape, axis):
    new_shape = np.array(orig_shape)
    if axis is None:
        new_shape[:] = 1
    else:
        new_shape[np.array(axis)] = 1
    return tuple(new_shape)


[docs]def weighted_error(data, errors=None, axis=None, t_value_corr=True):
    """
    Computes error of weighted average along the specified axis.

    Parameters:
        data: array-like
            Contains the data to be averaged
        errors: array-like, optional
            Contains errors associated with the values in data, it is used to calculated weights
        axis: int or tuple of ints, optional
            Axis or axes along which to average data
        t_value_corr: bool, optional
            Species if the error is corrected for small sample size, default True

    Returns:
        Returns the error of weighted average along the specified axis.
    """
    weights = weights_from_errors(errors)
    weighted_average = np.average(data, axis=axis, weights=weights)
    (sample_variance, sum_of_weights) = np.average(np.square(np.abs(data - weighted_average.reshape(_get_shape(data.shape, axis)))),
                                                   weights=weights, axis=axis, returned=True)
    if weights is not None:
        sample_variance = sample_variance + 1 / sum_of_weights
    error = np.sqrt(sample_variance * unbias_variance(data, weights, axis=axis))
    if t_value_corr:
        error = error * t_value_correction(effective_sample_size(data, weights, axis=axis))
    return error


[docs]def weighted_rms(data, errors=None, axis=None):
    """
    Computes weighted root mean square along the specified axis.

    Parameters:
        data: array-like
            Contains the data to be averaged
        errors: array-like, optional
            Contains errors associated with the values in data, it is used to calculated weights
        axis: int or tuple of ints, optional
            Axis or axes along which to average data

    Returns:
        Returns weighted root mean square along the specified axis.
    """
    weights = weights_from_errors(errors)
    return np.sqrt(np.average(np.square(data), weights=weights, axis=axis))



[docs]def weights_from_errors(errors, period=PI2):
    """
    Computes weights from measurement errors, weights are not output if errors contain zeros or nans

    Parameters:
        errors: array-like
            Contains errors which are used to calculated weights
        period: scalar, optional
            Periodicity period of data, default is (2 * np.pi)

    Returns:
        Returns the error of weighted circular average along the specified axis.
    """
    if errors is None:
        return None
    if np.any(np.isnan(errors)):
        # LOGGER.warning("Nans found, weights are not used.")
        return None
    if np.any(np.logical_not(errors)):
        # LOGGER.warning("Zeros found, weights are not used.")
        return None
    return 1 / np.square(errors * PI2 / period)



[docs]def effective_sample_size(data, weights, axis=None):
    """
    Computes effective sample size of weighted data along specifies axis

    Parameters:
        data: array-like
        weights: array-like
            Contains weights associated with the values in data
        axis: int or tuple of ints, optional
            Axis or axes along which the effective sample size is computed

    Returns:
        Returns the error of weighted circular average along the specified axis.
    """
    if weights is None:
        return np.sum(np.ones(data.shape), axis=axis)
    return np.square(np.sum(weights, axis=axis)) / np.sum(np.square(weights), axis=axis)


[docs]def unbias_variance(data, weights, axis=None):
    """
    Computes a correction factor to unbias variance of weighted average of data along specified axis

    Parameters:
        data: array-like
        weights: array-like
            Contains weights associated with the values in data
        axis: int or tuple of ints, optional
            Axis or axes along which the effective sample size is computed

    Returns:
        Returns the error of weighted circular average along the specified axis.
    """
    sample_size = effective_sample_size(data, weights, axis=axis)
    try:
        return sample_size / (sample_size - 1)
    except ZeroDivisionError:
        return np.zeros(sample_size.shape)


[docs]def t_value_correction(sample_size):
    """
    Calculates the multiplicative correction factor to determine standard deviation of normally
    distributed quantity from standard deviation of its finite-sized sample

    Args:
        sample_size: array-like

    Returns:
        multiplicative correction factor(s) of same shape as sample_size
            can contain nans
    """
    return t.ppf(CONFIDENCE_LEVEL, sample_size - 1)