feat: adding binning feature that handles summary statistics

This commit is contained in:
Deepak Mallubhotla 2024-08-05 05:24:59 -05:00
parent 07457ba0eb
commit a20f0c2069
Signed by: deepak
GPG Key ID: BEBAEBF28083E022
2 changed files with 147 additions and 9 deletions

View File

@ -65,28 +65,38 @@ class Bin:
summary_dict = {k: _summarise_values(v) for k, v in self.point_y_dict.items()} summary_dict = {k: _summarise_values(v) for k, v in self.point_y_dict.items()}
return BinSummary(mean_x, summary_dict) return BinSummary(mean_x, summary_dict)
def stdev_ys(self) -> typing.Dict[str, float]:
    """
    Compute the standard deviation of every y-series stored in this bin.

    Returns a dict with the same keys as self.point_y_dict; each value is
    std(axis=0, ddof=1) of the corresponding array, unwrapped with .item().
    """
    stdevs = {}
    for name, values in self.point_y_dict.items():
        # ddof=1 makes the divisor (N - 1) instead of N.
        stdevs[name] = values.std(axis=0, ddof=1).item()
    return stdevs
def _construct_bins(xs: numpy.ndarray, bin_config: BinConfig) -> numpy.ndarray:
    """
    Build the array of bin edges for the x values in xs.

    The edges start at the smallest x (or at bin_config.bin_min when that is
    set) and advance in steps of bin_config.bin_width until the last edge is at
    or beyond the largest x. When bin_config.log_scale is true, the stepping is
    done on log10 of the bounds and the edges are mapped back with 10**edge, so
    bin_width is then measured in decades.

    :param xs: the x values that the bins have to cover.
    :param bin_config: supplies bin_min, bin_width and log_scale.
    :return: array of bin edges, in the same units as xs.
    :raises ValueError: if bin_config.bin_min is larger than the smallest x, or
        if log_scale is requested while the lower bound is not strictly positive.
    """
    min_x_raw = numpy.min(xs)

    # if the bin config requested bin_min is None, then we can ignore it.
    if bin_config.bin_min is not None:
        _logger.debug(f"Received a desired bin_min={bin_config.bin_min}")
        if bin_config.bin_min > min_x_raw:
            raise ValueError(
                f"The lowest x value of {xs=} was {min_x_raw=}, which is lower than the requested bin_min={bin_config.bin_min}"
            )
        _logger.debug(f"Setting minimum to {bin_config.bin_min}")
        min_x_raw = bin_config.bin_min

    max_x_raw = numpy.max(xs)

    if bin_config.log_scale:
        # log10 of a non-positive number is nan or -inf, which would only
        # surface later as an opaque numpy.arange failure; fail early with a
        # message that names the actual problem instead.
        if min_x_raw <= 0:
            raise ValueError(
                f"Cannot build log_scale bins: the lower bound {min_x_raw=} must be strictly positive"
            )
        min_x = numpy.log10(min_x_raw)
        max_x = numpy.log10(max_x_raw)
    else:
        min_x = min_x_raw
        max_x = max_x_raw

    # One edge at min_x plus enough further steps to reach or pass max_x.
    num_points = numpy.ceil(1 + (max_x - min_x) / bin_config.bin_width)
    bins = min_x + (numpy.arange(0, num_points) * bin_config.bin_width)

    # Edges were laid out in log10 space; convert back to the units of xs.
    return 10**bins if bin_config.log_scale else bins
def _populate_bins( def _populate_bins(

View File

@ -3,6 +3,11 @@ import tantri.binning.binning as binning
import numpy import numpy
def test_bin_config_validation():
    """Constructing a BinConfig with min_points_required=1 must raise ValueError."""
    rejected_kwargs = {
        "log_scale": False,
        "bin_width": 1,
        "min_points_required": 1,
    }
    with pytest.raises(ValueError):
        binning.BinConfig(**rejected_kwargs)
def test_bin_construction_faulty_min(): def test_bin_construction_faulty_min():
x_list = numpy.array([5, 6, 7, 8]) x_list = numpy.array([5, 6, 7, 8])
@ -109,3 +114,126 @@ def test_group_x_bins_summary(snapshot):
summary = [bin.summary_point() for bin in binned] summary = [bin.summary_point() for bin in binned]
assert summary == snapshot assert summary == snapshot
def test_bin_construction_faulty_min_log_scale():
    """A bin_min above the smallest x value must be rejected when log_scale is on."""
    config = binning.BinConfig(log_scale=True, bin_width=0.8, bin_min=5.5)
    xs = numpy.array([5, 6, 7, 8])

    # 5.5 is larger than the smallest x (5), so construction has to fail.
    with pytest.raises(ValueError):
        binning._construct_bins(xs, config)
def test_bin_construction_force_min_log():
    """
    Show the main use of the bin_min parameter: when we want the bins to line
    up nicely with decades, for example, we can force that by overriding the
    minimum x found in the data.
    """
    config = binning.BinConfig(log_scale=True, bin_width=1, bin_min=10)
    xs = numpy.array([1500, 5000, 10000, 33253, 400000])

    # One edge per decade, starting at the forced minimum of 10.
    decade_edges = numpy.array([10, 100, 1000, 10000, 100000, 1000000])

    numpy.testing.assert_allclose(
        binning._construct_bins(xs, bin_config=config),
        decade_edges,
        err_msg="The bins were not as expected",
    )
def test_bin_construction_even_log_scale():
    """Log-scale bins with no forced minimum start at the smallest x value."""
    # A bin width of 0.3 corresponds to a factor of 10^0.3 ~= 2, so each bin
    # edge is roughly double the previous one.
    config = binning.BinConfig(log_scale=True, bin_width=0.3)
    xs = numpy.array([1, 2.8, 8, 12.2, 13.6, 17, 19.71, 20, 24, 33])

    doubling_edges = numpy.array(
        [
            1.00000000000,
            1.99526231497,
            3.98107170553,
            7.94328234724,
            15.8489319246,
            31.6227766017,
            63.0957344480,
        ]
    )

    numpy.testing.assert_allclose(
        binning._construct_bins(xs, bin_config=config),
        doubling_edges,
        err_msg="The bins were not as expected",
    )
def test_group_x_bins_log(snapshot):
    """Snapshot test of bin_lists on log-spaced data with log_scale bins of width 2."""
    xs = numpy.array(
        [
            0.00158489,
            0.00363078,
            0.0398107,
            0.275423,
            0.524807,
            2.51189,
            8.74984,
            10.0,
            63.0957,
            3981.07,
        ]
    )
    lorentzian_ys = numpy.array(
        [
            0.159154,
            0.15915,
            0.158535,
            0.134062,
            0.0947588,
            0.00960602,
            0.000838084,
            0.000642427,
            0.0000162008,
            4.06987e-9,
        ]
    )
    ys_by_name = {"basic_lorentzian": lorentzian_ys}

    # No expected bin edges are spelled out here; the stored snapshot is the
    # reference for the binned output.
    config = binning.BinConfig(log_scale=True, bin_width=2)
    result = binning.bin_lists(xs, ys_by_name, config)

    assert result == snapshot
def test_group_x_bins_mean_log(snapshot):
    """Snapshot test of the per-bin mean points produced from log_scale bins."""
    xs = numpy.array(
        [
            0.0158489,
            0.0316228,
            0.0794328,
            0.158489,
            0.17378,
            0.316228,
            0.944061,
            0.977237,
            0.988553,
            3.16228,
            5.01187,
            15.8489,
            25.1189,
            31.6228,
            158.489,
            630.957,
        ]
    )
    lorentzian_ys = numpy.array(
        [
            0.159056,
            0.158763,
            0.156715,
            0.149866,
            0.148118,
            0.127657,
            0.0497503,
            0.0474191,
            0.0466561,
            0.00619907,
            0.00252714,
            0.000256378,
            0.000102165,
            0.0000644769,
            2.56787e-6,
            1.62024e-7,
        ]
    )
    ys_by_name = {"basic_lorentzian": lorentzian_ys}

    # NOTE(review): bin_min=-2 looks like a log10 exponent, but
    # test_bin_construction_force_min_log passes bin_min in raw x units
    # (bin_min=10 gives a first edge of 10). Confirm whether 10**-2 was
    # intended here -- TODO verify against bin_lists.
    config = binning.BinConfig(log_scale=True, bin_width=1, bin_min=-2)

    groups = binning.bin_lists(xs, ys_by_name, config)
    means = [group.mean_point() for group in groups]

    assert means == snapshot
# def test_group_x_bins_summary(snapshot):
# x_list = numpy.array([1, 2.8, 8, 12.2, 13.6, 17, 19.71, 20, 24, 33])
# y_dict = {
# "identity_plus_one": (
# numpy.array([1, 2.8, 8, 12.2, 13.6, 17, 19.71, 20, 24, 33]) + 2
# )
# }
# bin_config = binning.BinConfig(log_scale=False, bin_width=8)
# # expected_bins = numpy.array([1, 9, 17, 25, 33])
# binned = binning.bin_lists(x_list, y_dict, bin_config)
# summary = [bin.summary_point() for bin in binned]
# assert summary == snapshot