import numpy as np
import pandas as pd
import warnings
# Lookup table of supported estimators: maps the public estimator name to the
# NumPy reduction that computes it. "var" and "sd" are called with NumPy's
# defaults, i.e. the population (ddof=0) variance and standard deviation.
SUPPORTED_ESTIMATORS = {
    "mean": np.mean,
    "median": np.median,
    "var": np.var,
    "sd": np.std,
}
def bootstrap_distribution(sample, rep, n="auto", estimator="mean", random_seed=None):
    """Bootstrap a sampling distribution for a sample.

    A sampling distribution of `rep` replicates is generated for the
    specified `estimator` by resampling `sample` with replacement, using a
    bootstrap sample size of `n`.

    Parameters
    ----------
    sample : list or numpy.ndarray or pandas.core.series.Series
        sample to bootstrap
    rep : int
        number of replicates of the distribution, must be at least 1
    n : str or int, default="auto"
        bootstrap sample size, "auto" specifies using the same size as the
        sample; an int must be at least 1
    estimator : {"mean", "median", "var", "sd"}
        sampling distribution's estimator
    random_seed : None or int, default=None
        non-negative seed passed to ``numpy.random.seed`` (this reseeds
        NumPy's global random state); None leaves the global state untouched

    Returns
    -------
    numpy.ndarray
        bootstrapped sampling distribution, one estimate per replicate

    Raises
    ------
    TypeError
        If `sample`, `rep`, `n`, `estimator` or `random_seed` has an
        unsupported type.
    ValueError
        If `rep` or an integer `n` is smaller than 1, `n` is a string other
        than "auto", `estimator` is not supported, or `random_seed` is
        negative.

    Examples
    --------
    >>> dist = bootstrap_distribution([1, 2, 3], rep=3, n=3, random_seed=0)
    >>> dist.shape
    (3,)
    """
    if not isinstance(sample, (list, np.ndarray, pd.Series)):
        raise TypeError("sample should be one of the types "
                        "[list, numpy.ndarray, pandas.core.series.Series]")

    if not isinstance(rep, int):
        raise TypeError("rep should be of type 'int'")
    if rep < 1:
        raise ValueError("Invalid value for rep")

    if not isinstance(n, (str, int)):
        raise TypeError("n should be of type 'str' or 'int'")
    if isinstance(n, str) and n != "auto":
        raise ValueError("Invalid value for n. Did you intend n='auto'?")
    if isinstance(n, int) and n < 1:
        raise ValueError("Invalid value for n")

    if not isinstance(estimator, str):
        raise TypeError("estimator should be of type 'str'")
    if estimator not in SUPPORTED_ESTIMATORS:
        raise ValueError("Supported estimators are mean, median, var, sd")

    if not (random_seed is None or isinstance(random_seed, int)):
        raise TypeError("random_seed should be None or of type 'int'")
    if random_seed is not None and random_seed < 0:
        raise ValueError("Invalid value for random_seed")

    # Compare against None rather than relying on truthiness: 0 is a valid
    # seed and must produce reproducible draws like any other seed.
    if random_seed is not None:
        np.random.seed(random_seed)

    if n == "auto":
        n = len(sample)

    # Draw all replicates at once as a (rep, n) matrix, then reduce every row
    # to a single estimate.
    resamples = np.random.choice(sample, size=(rep, n), replace=True)
    return SUPPORTED_ESTIMATORS[estimator](resamples, axis=1)
def calculate_boot_stats(sample, rep, n="auto", level=0.95, estimator="mean",
                         random_seed=None, pass_dist=False):
    """Calculate a bootstrapped confidence interval for a sample.

    A bootstrapped confidence interval for the desired estimator for
    the provided sample is calculated for a confidence level `level`.
    Other stats and parameters of the distribution and sample are
    also returned.

    Parameters
    ----------
    sample : list or numpy.ndarray or pandas.core.series.Series
        sample to bootstrap
    rep : int
        number of replicates of the distribution
    n : str or int, default="auto"
        bootstrap sample size, "auto" specifies using the same size as the
        sample
    level : float, default=0.95
        confidence level, strictly between 0 and 1
    estimator : {"mean", "median", "var", "sd"}
        sampling distribution's estimator
    random_seed : None or int, default=None
        seed for random state
    pass_dist : bool, default=False
        whether to also return the bootstrapped sampling distribution

    Returns
    -------
    dict or tuple of (dict, numpy.ndarray)
        Dictionary with the keys ``lower`` and ``upper`` (percentile bounds
        of the bootstrapped confidence interval), ``sample_<estimator>``
        (the estimator applied to the original sample), ``std_err``
        (standard deviation of the bootstrap distribution), ``level``,
        ``sample_size``, ``n``, ``rep`` and ``estimator``.
        If `pass_dist` is True, a tuple of that dictionary and the
        bootstrapped sampling distribution is returned instead.

    Raises
    ------
    TypeError
        If `level` is not a float or `pass_dist` is not a bool, or if
        `bootstrap_distribution` rejects the type of another argument.
    ValueError
        If `level` is not strictly between 0 and 1, or if
        `bootstrap_distribution` rejects the value of another argument.

    Warns
    -----
    UserWarning
        If `level` is below 0.7, since a significance level may have been
        passed by mistake.

    Examples
    --------
    >>> calculate_boot_stats([1, 2, 3, 4], 1000, level=0.95, random_seed=123)
    {'lower': 1.5,
     'upper': 3.5,
     'sample_mean': 2.5,
     'std_err': 0.5414773771820943,
     'level': 0.95,
     'sample_size': 4,
     'n': 'auto',
     'rep': 1000,
     'estimator': 'mean'}
    """
    if not isinstance(level, float):
        raise TypeError("level should be of type 'float'")
    if not 0 < level < 1:
        raise ValueError("level should be between 0 and 1")
    if not isinstance(pass_dist, bool):
        raise TypeError("pass_dist should be of type 'bool'")

    if level < 0.7:
        # Implicit concatenation keeps source indentation out of the message;
        # stacklevel=2 attributes the warning to the caller's line.
        warnings.warn(
            "Warning: chosen level is quite low--level is a "
            "confidence level, not a significance level",
            stacklevel=2,
        )

    # Bootstrapped sampling distribution of the chosen estimator; the
    # remaining arguments are validated by bootstrap_distribution.
    dist = bootstrap_distribution(sample=sample,
                                  rep=rep,
                                  n=n,
                                  estimator=estimator,
                                  random_seed=random_seed)

    # Percentile interval: cut (1 - level) / 2 of the probability mass from
    # each tail of the bootstrap distribution.
    stats_dict = {
        "lower": np.percentile(dist, 100 * (1 - level) / 2),
        "upper": np.percentile(dist, 100 * (1 - (1 - level) / 2)),
        "sample_" + estimator: SUPPORTED_ESTIMATORS[estimator](sample),
        "std_err": np.std(dist),
        "level": level,
        "sample_size": len(sample),
        # Reported exactly as passed in, so it stays "auto" when not given.
        "n": n,
        "rep": rep,
        "estimator": estimator,
    }

    if pass_dist:
        return stats_dict, dist
    return stats_dict