Source code for strapvizpy.display

import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from strapvizpy.bootstrap import calculate_boot_stats


def plot_ci(sample, rep, bin_size=30, n="auto", ci_level=0.95,
            ci_random_seed=None, title="", x_axis="Bootstrap Sample Mean",
            y_axis="Count", path=None):
    """Plot a histogram of a bootstrapped sampling distribution.

    The histogram is annotated with vertical lines and text labels for the
    lower and upper confidence bounds and for the observed sample mean
    (labelled together with its standard error).

    Parameters
    ----------
    sample : list or numpy.ndarray or pandas.core.series.Series
        sample to bootstrap
    rep : int
        number of replicates of the distribution
    bin_size : int, default=30
        number of equal-width bins used for the histogram
    n : str or int, default="auto"
        bootstrap sample size, "auto" specifies using the same size as the
        sample
    ci_level : float, default=0.95
        confidence level
    ci_random_seed : None or int, default=None
        seed for random state
    title : str, default=""
        title of the histogram
    x_axis : str, default="Bootstrap Sample Mean"
        name of the x axis
    y_axis : str, default="Count"
        name of the y axis
    path : None or str, default=None
        existing directory in which the figure is saved as
        ``bootstrap_histogram.png``; nothing is saved when None

    Returns
    -------
    module
        the ``matplotlib.pyplot`` module, whose current figure holds the
        histogram with the confidence interval and observed mean drawn on it

    Raises
    ------
    TypeError
        if `title`, `x_axis` or `y_axis` is not a str, or if `path` is
        neither a str nor None
    NameError
        if `path` is given but is not an existing directory

    Examples
    --------
    >>> plot_ci([1, 2, 3, 4, 5, 6, 7], 1000, n=100, ci_level=0.95,
    ...         ci_random_seed=123, title="Bootstrap")
    """
    for arg_name, arg_value in (("title", title),
                                ("x_axis", x_axis),
                                ("y_axis", y_axis)):
        if not isinstance(arg_value, str):
            raise TypeError(
                f"The value of the argument '{arg_name}' must be type of str."
            )

    # Identity check: `path == None` would broadcast on array-likes and make
    # the surrounding boolean expression raise ValueError instead of TypeError.
    if not (path is None or isinstance(path, str)):
        raise TypeError(
            "The value of the argument 'path' must be type of str or None."
        )

    if path is not None and not os.path.isdir(path):
        raise NameError("The folder path you specified is invalid.")

    # With pass_dist=True the result is indexed as
    # (summary statistics mapping, bootstrap distribution).
    boot_result = calculate_boot_stats(sample, rep, level=ci_level, n=n,
                                       random_seed=ci_random_seed,
                                       pass_dist=True)
    stats = boot_result[0]
    boot_dist = boot_result[1]

    lower = stats["lower"]
    upper = stats["upper"]
    sample_mean = stats["sample_mean"]

    plt.hist(boot_dist, density=False, bins=bin_size)
    plt.axvline(lower, color='k', linestyle='--')
    plt.axvline(sample_mean, color='r', linestyle='-')
    plt.axvline(upper, color='k', linestyle='--')

    # Place the labels near the top of the axes (90% of the current y limit).
    _, y_max = plt.gca().get_ylim()
    mean_label = (str(round(sample_mean, 2)) + '(' + u"\u00B1"
                  + str(round(stats['std_err'], 2)) + ')')
    labels = (
        (sample_mean, mean_label),
        (upper, str(round(upper, 2))),
        (lower, str(round(lower, 2))),
    )
    for x_pos, text in labels:
        plt.text(x_pos, y_max * 0.9, text,
                 ha='center', va='center', rotation='horizontal', color="k",
                 bbox={'facecolor': 'white', 'pad': 5})

    plt.title(title)
    plt.xlabel(x_axis)
    plt.ylabel(y_axis)

    if path is not None:
        # os.path.join inserts the separator when `path` has no trailing
        # slash, so the file always lands inside the requested directory.
        plt.savefig(os.path.join(path, "bootstrap_histogram.png"))

    return plt
def tabulate_stats(stat, precision=2, estimator=True, alpha=True, path=None):
    """Build two styled tables summarizing a bootstrap run.

    The first table summarizes the statistics of the bootstrapped samples and
    the second the parameters used to create them. Optionally both tables are
    also written to disk as LaTeX (``.tex``) files.

    Parameters
    ----------
    stat : dict or tuple
        summary statistics produced by the `calculate_boot_stats()` function;
        when a tuple is given its first element is used as the statistics
        dictionary
    precision : int, default=2
        the precision of the table values
    estimator : boolean, default=True
        include the bootstrap estimate in the summary statistics table
    alpha : boolean, default=True
        include the significance level in the summary statistics table
    path : str, default=None
        existing directory in which ``sampling_statistics.tex`` and
        ``bootstrap_params.tex`` are written; nothing is written when None

    Returns
    -------
    tuple
        summary statistics : pandas Styler object
            table summarizing the lower bound and upper bound of the
            confidence interval, the standard error, the sample statistic
            (if estimator = True), and the significance level
            (if alpha = True). Style objects do not display well in a
            python shell.
        bootstrap parameters : pandas Styler object
            table summarizing the parameters of the bootstrap sampling,
            specifying the original sample size, the number of repetitions,
            the significance level, and the number of samples in each
            bootstrap if it differs from "auto". Style objects do not
            display well in a python shell.

    Raises
    ------
    TypeError
        if an argument has the wrong type, or if the statistics dictionary
        is missing a key this function reads
    NameError
        if `path` is given but is not an existing directory

    Examples
    --------
    >>> st = calculate_boot_stats([1, 2, 3, 4], 1000, level=0.95,
    ...                           random_seed=123)
    >>> stats_table, parameter_table = tabulate_stats(st)
    >>> stats_table
    >>> parameter_table
    """
    bad_stat_msg = ("The stats parameter must be created from "
                    "calculate_boot_stats() function.")

    if not isinstance(stat, (tuple, dict)):
        raise TypeError(bad_stat_msg)

    if not isinstance(precision, int):
        raise TypeError("The precision parameter must be of type int.")

    if not (isinstance(estimator, bool) and isinstance(alpha, bool)):
        raise TypeError(
            "The estimator and alpha parameters must be of type boolean."
        )

    if not (path is None or isinstance(path, str)):
        raise TypeError("The path parameter must be a character string.")

    if path is not None and not os.path.isdir(path):
        raise NameError("The folder path you specified is invalid.")

    # A tuple carries the statistics dictionary as its first element.
    if isinstance(stat, tuple):
        stat = stat[0] if stat else None
    # Guard against an empty tuple or a tuple that does not start with a
    # dict, which would otherwise fail later with IndexError/AttributeError.
    if not isinstance(stat, dict):
        raise TypeError(bad_stat_msg)

    required_keys = ("lower", "upper", "std_err", "estimator", "level",
                     "sample_size", "n", "rep")
    has_all_keys = all(key in stat for key in required_keys)
    # The estimate itself is stored under "sample_<estimator>" and is only
    # read when the estimator column is requested.
    if has_all_keys and estimator:
        has_all_keys = ("sample_" + stat["estimator"]) in stat
    if not has_all_keys:
        raise TypeError(
            "The statistics dictionary is missing a key. "
            "Please rerun calculate_boot_stats() function"
        )

    # define the statistics table
    df = pd.DataFrame(
        data=np.array([(stat["lower"], stat["upper"], stat["std_err"])]),
        columns=["Lower Bound CI", "Upper Bound CI", "Standard Error"])

    if estimator:
        s_name = "Sample " + stat["estimator"]
        df[s_name] = stat["sample_" + stat["estimator"]]

    if alpha:
        df["Significance Level"] = 1 - stat["level"]
        stats_table = df.style.format(
            precision=precision,
            formatter={"Significance Level": "{:.3f}"}
        )
    else:
        stats_table = df.style.format(precision=precision)

    stats_table = stats_table.hide(axis="index")

    # set formatting and caption for table
    stats_table.set_caption(
        "Bootstrapping sample statistics from sample with "
        + str(stat["sample_size"]) + " records"
    ).set_table_styles(
        [{"selector": "caption",
          "props": "caption-side: bottom; font-size: 1.00em;"}],
        overwrite=False)

    # create bootstrapping parameter summary table
    df_bs = pd.DataFrame(
        data=np.array(
            [(stat["sample_size"], stat["rep"], (1 - stat["level"]))]),
        columns=["Sample Size", "Repetition", "Significance Level"])

    if stat["n"] != "auto":
        df_bs["Samples per bootstrap"] = round(stat["n"], 0)

    # set formatting and caption for table
    bs_params = df_bs.style.format(
        precision=0,
        formatter={"Significance Level": "{:.3f}"}
    ).hide(axis="index")

    bs_params.set_caption(
        "Parameters used for bootstrapping"
    ).set_table_styles(
        [{"selector": "caption",
          "props": "caption-side: bottom;font-size:1.00em;"}],
        overwrite=False)

    if path is not None:
        # os.path.join inserts the separator when `path` has no trailing
        # slash, so both files always land inside the requested directory.
        with open(os.path.join(path, "sampling_statistics.tex"), "w") as tf:
            tf.write(stats_table.to_latex())

        with open(os.path.join(path, "bootstrap_params.tex"), "w") as tf:
            tf.write(bs_params.to_latex())

    return stats_table, bs_params