import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from strapvizpy.bootstrap import calculate_boot_stats
def plot_ci(sample, rep, bin_size=30, n="auto", ci_level=0.95,
            ci_random_seed=None, title="", x_axis="Bootstrap Sample Mean",
            y_axis="Count", path=None):
    """Plot a histogram of a bootstrapped sampling distribution.

    The histogram is annotated with vertical lines and text labels for the
    lower and upper confidence-interval bounds and for the observed sample
    mean (shown together with its standard error).

    Parameters
    ----------
    sample : list or numpy.ndarray or pandas.core.series.Series
        sample to bootstrap
    rep : int
        number of replicates of the distribution
    bin_size : int, default=30
        a number of bins representing intervals of equal size
        over the range
    n : str or int, default="auto"
        bootstrap sample size, "auto" specifies using the
        same size as the sample
    ci_level : float, default=0.95
        confidence level
    ci_random_seed : None or int, default=None
        seed for random state
    title : str, default=""
        title of the histogram
    x_axis : str, default="Bootstrap Sample Mean"
        name of the x axis
    y_axis : str, default="Count"
        name of the y axis
    path : None or str, default=None
        existing directory in which the figure is saved as
        ``bootstrap_histogram.png``; nothing is saved when None

    Returns
    -------
    module
        the ``matplotlib.pyplot`` module, whose current figure holds the
        histogram of the bootstrap distribution with the confidence
        interval and observed mean

    Raises
    ------
    TypeError
        if `title`, `x_axis` or `y_axis` is not a str, or if `path` is
        neither a str nor None
    NameError
        if `path` is given but is not an existing directory

    Examples
    --------
    >>> plot_ci([1, 2, 3, 4, 5, 6, 7], 1000, n=100, ci_level=0.95,
    ...         ci_random_seed=123, title="Bootstrap")
    """
    # Validate the label arguments before doing any computation or plotting.
    for arg_name, arg_value in (("title", title),
                                ("x_axis", x_axis),
                                ("y_axis", y_axis)):
        if not isinstance(arg_value, str):
            raise TypeError(
                f"The value of the argument '{arg_name}' must be type of str."
            )

    if not (path is None or isinstance(path, str)):
        raise TypeError(
            "The value of the argument 'path' must be type of str or None."
        )
    if path is not None and not os.path.isdir(path):
        raise NameError("The folder path you specified is invalid.")

    # With pass_dist=True the result is indexed as (statistics mapping,
    # bootstrap distribution); only those two positions are used here.
    boot_result = calculate_boot_stats(sample, rep, level=ci_level,
                                       n=n, random_seed=ci_random_seed,
                                       pass_dist=True)
    stats = boot_result[0]
    boot_dist = boot_result[1]

    lower = stats["lower"]
    upper = stats["upper"]
    sample_mean = stats["sample_mean"]

    plt.hist(boot_dist, density=False, bins=bin_size)
    plt.axvline(lower, color='k', linestyle='--')
    plt.axvline(sample_mean, color='r', linestyle='-')
    plt.axvline(upper, color='k', linestyle='--')

    # Place all labels at 90% of the current y-axis upper limit.
    _, y_max = plt.gca().get_ylim()
    label_height = y_max * 0.9

    mean_label = (str(round(sample_mean, 2)) +
                  '(' + "\u00B1" + str(round(stats['std_err'], 2)) + ')')
    labels = (
        (sample_mean, mean_label),
        (upper, str(round(upper, 2))),
        (lower, str(round(lower, 2))),
    )
    for x_pos, text in labels:
        plt.text(x_pos, label_height, text,
                 ha='center', va='center', rotation='horizontal',
                 color="k", bbox={'facecolor': 'white', 'pad': 5})

    plt.title(title)
    plt.xlabel(x_axis)
    plt.ylabel(y_axis)

    if path is not None:
        # Join with the OS separator so a directory given without a trailing
        # slash still receives the file inside it.
        plt.savefig(os.path.join(path, "bootstrap_histogram.png"))

    return plt
[docs]def tabulate_stats(stat, precision=2, estimator=True, alpha=True, path=None):
"""Makes two tables that summerize the statistics from the bootstrapped
samples and the parameters for creating the bootstrapped samples. It also allows you
to save the tables in html format.
Parameters
----------
stat : dict or tuple
summary statistics produced by the `calculate_boot_stats()` function
precision : int, default=2
the precision of the table values
estimator : boolean, default=True
include the bootstrap estimate in the summary statistics table
alpha : boolean, default=True
include the significance level in the summary statistics table
path : str, default = None
specify a path to where the tex files of tables should be saved.
Returns
-------
tuple :
summary statistics: style object
table summerizing the lower bound and upper bound of the confidence
interval,the standard error, the sampling statitic (if estimator = True),
and the significance level (if alpha = True). Style objects do not display
well in a python shell.
bootstrap parameters: style object
table summerizing the parameters of the bootstrap sampling spficiying
the original sample size, number of repititions, the significance level,
and the number of samples in each bootstrap if its different from the
original sample size. Style objects do not display well in a python shell.
Examples
--------
>>> st = calculate_boot_stats([1, 2, 3, 4], 1000, level=0.95, random_seed=123)
>>> stats_table, parameter_table = tabulate_stats(st)
>>> stats_table
>>> parameter_table
"""
if not(isinstance(stat, tuple) | isinstance(stat, dict)):
raise TypeError(
"The stats parameter must be created from "
"calculate_boot_stats() function."
)
if not isinstance(precision, int):
raise TypeError("The precision parameter must be of type int.")
if not (isinstance(estimator, bool) & isinstance(alpha, bool)):
raise TypeError(
"The estimator and alpha parameters must be of type boolean."
)
if not (isinstance(path, str) or path is None):
raise TypeError("The path parameter must be a character string.")
if path is not None :
if os.path.isdir(path) is False:
raise NameError("The folder path you specified is invalid.")
if isinstance(stat, tuple):
stat = stat[0]
dic_keys = stat.keys()
if not (("lower" in dic_keys) &
("upper" in dic_keys) &
("std_err" in dic_keys) &
("estimator" in dic_keys) &
("level" in dic_keys) &
("sample_size" in dic_keys) &
("n" in dic_keys) &
("rep" in dic_keys)):
raise TypeError(
"The statistics dictionary is missing a key. "
"Please rerun calculate_boot_stats() function"
)
# define the statistics table
df = pd.DataFrame(data=np.array([(stat["lower"], stat["upper"],
stat["std_err"])]),
columns=["Lower Bound CI", "Upper Bound CI",
"Standard Error"])
if estimator is True:
s_name = "Sample " + stat["estimator"]
df[s_name] = stat["sample_" + stat["estimator"]]
if alpha is True:
df["Significance Level"] = 1 - stat["level"]
stats_table = df.style.format(
precision=precision, formatter={("Significance Level"): "{:.3f}"}
)
else:
stats_table = df.style.format(precision=precision)
stats_table = stats_table.hide(axis="index")
# set formatting and caption for table
stats_table.set_caption(
"Bootstrapping sample statistics from sample with "+
str(stat["sample_size"]) + " records"
).set_table_styles(
[{"selector": "caption",
"props": "caption-side: bottom; font-size: 1.00em;"}],
overwrite=False)
# create bootstrapping parameter summary table
df_bs = pd.DataFrame(
data=np.array(
[(stat["sample_size"], stat["rep"], (1 - stat["level"]))]),
columns=["Sample Size", "Repetition", "Significance Level"])
if stat["n"] != "auto":
df_bs["Samples per bootstrap"] = round(stat["n"], 0)
# set formatting and caption for table
bs_params = df_bs.style.format(
precision=0,
formatter={("Significance Level"): "{:.3f}"}
).hide(axis="index")
(bs_params
.set_caption("Parameters used for bootstrapping")
.set_table_styles(
[{"selector": "caption",
"props": "caption-side: bottom;font-size:1.00em;"}],
overwrite=False)
)
if path is not None:
with open(f"{path}sampling_statistics.tex", "w") as tf:
tf.write(stats_table.to_latex())
with open(f"{path}bootstrap_params.tex", "w") as tf:
tf.write(bs_params.to_latex())
return stats_table, bs_params