# Source code for strapvizpy.bootstrap

import numpy as np
import pandas as pd
import warnings

# Lookup table of supported estimators: maps each estimator name accepted by
# the bootstrap functions to the NumPy reduction that computes it.
SUPPORTED_ESTIMATORS = dict(
    mean=np.mean,
    median=np.median,
    var=np.var,
    sd=np.std,
)

def bootstrap_distribution(sample, rep, n="auto", estimator="mean",
                           random_seed=None):
    """Bootstraps a sampling distribution for a sample.

    A sampling distribution of `rep` replicates is generated for the
    specified `estimator` by drawing, with replacement, bootstrap samples
    of size `n` from `sample`.

    Parameters
    ----------
    sample : list or numpy.ndarray or pandas.core.series.Series
        sample to bootstrap
    rep : int
        number of replicates of the distribution
    n : str or int, default="auto"
        bootstrap sample size, "auto" specifies using the same size as the
        sample
    estimator : {"mean", "median", "var", "sd"}
        sampling distributor's estimator
    random_seed : None or int, default=None
        seed for random state. Any non-negative int (including 0) is passed
        to ``np.random.seed``, which reseeds NumPy's global random state;
        None leaves the random state untouched.

    Returns
    -------
    numpy.ndarray
        bootstrapped sampling distribution, one estimate per replicate

    Raises
    ------
    TypeError
        If `sample`, `rep`, `n`, `estimator` or `random_seed` has an
        unsupported type.
    ValueError
        If `rep` or `n` is smaller than 1, `n` is a string other than
        "auto", `estimator` is not supported, or `random_seed` is negative.

    Examples
    --------
    The output is random unless `random_seed` is given.

    >>> bootstrap_distribution([1, 2, 3], 3, 3)  # doctest: +SKIP
    array([1.66, 2, 2.66])
    """
    if not isinstance(sample, (list, np.ndarray, pd.Series)):
        raise TypeError("sample should be one of the types "
                        "[list, numpy.ndarray, pandas.core.series.Series]")

    if not isinstance(rep, int):
        raise TypeError("rep should be of type 'int'")
    if rep < 1:
        raise ValueError("Invalid value for rep")

    if not isinstance(n, (str, int)):
        raise TypeError("n should be of type 'str' or 'int'")
    if isinstance(n, str) and n != "auto":
        raise ValueError("Invalid value for n. \nDid you intend n='auto'?")
    if isinstance(n, int) and n < 1:
        raise ValueError("Invalid value for n")

    if not isinstance(estimator, str):
        raise TypeError("estimator should be of type 'str'")
    if estimator not in SUPPORTED_ESTIMATORS:
        raise ValueError("Supported estimators are mean, median, var, sd")

    if not (random_seed is None or isinstance(random_seed, int)):
        raise TypeError("random_seed should be None or of type 'int'")
    if random_seed is not None and random_seed < 0:
        raise ValueError("Invalid value for random_seed")

    # Compare against None rather than relying on truthiness: 0 is a valid
    # seed and must still make the draw reproducible.
    if random_seed is not None:
        np.random.seed(random_seed)

    if n == "auto":
        n = len(sample)

    # One row per replicate; the estimator reduces each row to one value.
    resamples = np.random.choice(sample, size=(rep, n), replace=True)
    return SUPPORTED_ESTIMATORS[estimator](resamples, axis=1)
def calculate_boot_stats(sample, rep, n="auto", level=0.95,
                         estimator="mean", random_seed=None,
                         pass_dist=False):
    """Calculates a bootstrapped confidence interval for a sample.

    A bootstrapped confidence interval for the desired estimator for the
    provided sample is calculated for a confidence level `level`. Other
    stats and parameters of the distribution and sample are also returned.

    Parameters
    ----------
    sample : list or numpy.ndarray or pandas.core.series.Series
        sample to bootstrap
    rep : int
        number of replicates of the distribution
    n : str or int, default="auto"
        bootstrap sample size, "auto" specifies using the same size as the
        sample
    level : float, default=0.95
        confidence level, strictly between 0 and 1
    estimator : {"mean", "median", "var", "sd"}
        sampling distributor's estimator
    random_seed : None or int, default=None
        seed for random state
    pass_dist : bool, default=False
        whether to also return the bootstrapped sampling distribution

    Returns
    -------
    dict or tuple of (dict, numpy.ndarray)
        Dictionary with the keys "lower" and "upper" (percentile bounds of
        the bootstrapped confidence interval), "sample_<estimator>" (the
        estimator evaluated on `sample`), "std_err" (standard deviation of
        the bootstrapped distribution), "level", "sample_size", "n", "rep"
        and "estimator". "n" echoes the argument as given, so it stays
        "auto" when the default is used. When `pass_dist` is True the
        bootstrapped distribution is returned as a second element.

    Raises
    ------
    TypeError
        If `level` is not a float or `pass_dist` is not a bool.
    ValueError
        If `level` is not strictly between 0 and 1.

    Warns
    -----
    UserWarning
        If `level` is below 0.7, since a significance level may have been
        passed by mistake.

    Examples
    --------
    >>> calculate_boot_stats([1, 2, 3, 4], 1000, level=0.95,
    ...                      random_seed=123)  # doctest: +SKIP
    {'lower': 1.5, 'upper': 3.5, 'sample_mean': 2.5,
     'std_err': 0.5414773771820943, 'level': 0.95, 'sample_size': 4,
     'n': 'auto', 'rep': 1000, 'estimator': 'mean'}
    """
    if not isinstance(level, float):
        raise TypeError("level should be of type 'float'")
    if not 0 < level < 1:
        raise ValueError("level should be between 0 and 1")
    if not isinstance(pass_dist, bool):
        raise TypeError("pass_dist should be of type 'bool'")

    if level < 0.7:
        # Adjacent literals instead of a backslash continuation, so the
        # source indentation does not leak into the message text.
        warnings.warn(
            "Warning: chosen level is quite low--level is a confidence "
            "level, not a significance level",
            stacklevel=2,
        )

    # Bootstrapped distribution of the estimator; the remaining arguments
    # are validated by bootstrap_distribution.
    dist = bootstrap_distribution(sample=sample,
                                  rep=rep,
                                  n=n,
                                  estimator=estimator,
                                  random_seed=random_seed)

    stats_dict = {
        "lower": np.percentile(dist, 100 * (1 - level) / 2),
        "upper": np.percentile(dist, 100 * (1 - (1 - level) / 2)),
        "sample_" + estimator: SUPPORTED_ESTIMATORS[estimator](sample),
        "std_err": np.std(dist),
        "level": level,
        "sample_size": len(sample),
        "n": n,
        "rep": rep,
        "estimator": estimator,
    }

    if pass_dist:
        return stats_dict, dist
    return stats_dict