feat: adding binning feature that handles summary statistics

2024-08-05 05:24:59 -05:00 · 2024-08-05 05:24:59 -05:00 · a20f0c2069
commit a20f0c2069
parent 07457ba0eb
2 changed files with 147 additions and 9 deletions
--- a/tantri/binning/binning.py
+++ b/tantri/binning/binning.py
@ -65,28 +65,38 @@ class Bin:
 		summary_dict = {k: _summarise_values(v) for k, v in self.point_y_dict.items()}
 		return BinSummary(mean_x, summary_dict)
 	def stdev_ys(self) -> typing.Dict[str, float]:
 		return {k: v.std(axis=0, ddof=1).item() for k, v in self.point_y_dict.items()}
 def _construct_bins(xs: numpy.ndarray, bin_config: BinConfig) -> numpy.ndarray:
-	min_x = numpy.min(xs)
+	min_x_raw = numpy.min(xs)
 	# if the bin config requested bin_min is None, then we can ignore it.
 	if bin_config.bin_min is not None:
 		_logger.debug(f"Received a desired bin_min={bin_config.bin_min}")
-		if bin_config.bin_min > min_x:
+		if bin_config.bin_min > min_x_raw:
 			raise ValueError(
-				f"The lowest x value of {xs=} was {min_x=}, which is lower than the requested bin_min={bin_config.bin_min}"
+				f"The lowest x value of {xs=} was {min_x_raw=}, which is lower than the requested bin_min={bin_config.bin_min}"
 			)
 		else:
 			_logger.debug(f"Setting minimum to {bin_config.bin_min}")
-			min_x = bin_config.bin_min
+			min_x_raw = bin_config.bin_min
 	max_x_raw = numpy.max(xs)
 	if bin_config.log_scale:
 		min_x = numpy.log10(min_x_raw)
 		max_x = numpy.log10(max_x_raw)
 	else:
 		min_x = min_x_raw
 		max_x = max_x_raw
 	max_x = numpy.max(xs)
 	num_points = numpy.ceil(1 + (max_x - min_x) / bin_config.bin_width)
-	return min_x + (numpy.arange(0, num_points) * bin_config.bin_width)
+	bins = min_x + (numpy.arange(0, num_points) * bin_config.bin_width)
 	if bin_config.log_scale:
 		return 10**bins
 	else:
 		return bins
 def _populate_bins(
--- a/tests/binning/test_binning.py
+++ b/tests/binning/test_binning.py
@ -3,6 +3,11 @@ import tantri.binning.binning as binning
 import numpy
 def test_bin_config_validation():
 	with pytest.raises(ValueError):
 		binning.BinConfig(log_scale=False, bin_width=1, min_points_required=1)
 def test_bin_construction_faulty_min():
 	x_list = numpy.array([5, 6, 7, 8])
@ -109,3 +114,126 @@ def test_group_x_bins_summary(snapshot):
 	summary = [bin.summary_point() for bin in binned]
 	assert summary == snapshot
 def test_bin_construction_faulty_min_log_scale():
 	x_list = numpy.array([5, 6, 7, 8])
 	bin_config = binning.BinConfig(log_scale=True, bin_width=0.8, bin_min=5.5)
 	with pytest.raises(ValueError):
 		binning._construct_bins(x_list, bin_config)
 def test_bin_construction_force_min_log():
 	"""
 	This test shows the main use ofthe bin_min parameter, if we want our bins to nicely line up with decades for example,
 	then we can force it by ignoring the provided minimum x.
 	"""
 	x_list = numpy.array([1500, 5000, 10000, 33253, 400000])
 	bin_config = binning.BinConfig(log_scale=True, bin_width=1, bin_min=10)
 	expected_bins = numpy.array([10, 100, 1000, 10000, 100000, 1000000])
 	actual_bins = binning._construct_bins(x_list, bin_config=bin_config)
 	numpy.testing.assert_allclose(
 		actual_bins, expected_bins, err_msg="The bins were not as expected"
 	)
 def test_bin_construction_even_log_scale():
 	x_list = numpy.array([1, 2.8, 8, 12.2, 13.6, 17, 19.71, 20, 24, 33])
 	# bin width of 0.3 corresponds to 10^0.3 ~= 2, so we're roughly looking at
 	bin_config = binning.BinConfig(log_scale=True, bin_width=0.3)
 	expected_bins = numpy.array(
 		[
 			1.00000000000,
 			1.99526231497,
 			3.98107170553,
 			7.94328234724,
 			15.8489319246,
 			31.6227766017,
 			63.0957344480,
 		]
 	)
 	actual_bins = binning._construct_bins(x_list, bin_config=bin_config)
 	numpy.testing.assert_allclose(
 		actual_bins, expected_bins, err_msg="The bins were not as expected"
 	)
 def test_group_x_bins_log(snapshot):
 	x_list = numpy.array(
 		[
 			0.00158489,
 			0.00363078,
 			0.0398107,
 			0.275423,
 			0.524807,
 			2.51189,
 			8.74984,
 			10.0,
 			63.0957,
 			3981.07,
 		]
 	)
 	y_dict = {
 		"basic_lorentzian": numpy.array(
 			[
 				0.159154,
 				0.15915,
 				0.158535,
 				0.134062,
 				0.0947588,
 				0.00960602,
 				0.000838084,
 				0.000642427,
 				0.0000162008,
 				4.06987e-9,
 			]
 		)
 	}
 	bin_config = binning.BinConfig(log_scale=True, bin_width=2)
 	# expected_bins = numpy.array([1, 9, 17, 25, 33])
 	binned = binning.bin_lists(x_list, y_dict, bin_config)
 	assert binned == snapshot
 def test_group_x_bins_mean_log(snapshot):
 	x_list = numpy.array([0.0158489, 0.0316228, 0.0794328, 0.158489, 0.17378, 0.316228, 0.944061, 0.977237, 0.988553, 3.16228, 5.01187, 15.8489, 25.1189, 31.6228, 158.489, 630.957])
 	y_dict = {
 		"basic_lorentzian": (
 			numpy.array([0.159056, 0.158763, 0.156715, 0.149866, 0.148118, 0.127657, 0.0497503, 0.0474191, 0.0466561, 0.00619907, 0.00252714, 0.000256378, 0.000102165, 0.0000644769, 2.56787e-6, 1.62024e-7])
 		)
 	}
 	bin_config = binning.BinConfig(log_scale=True, bin_width=1, bin_min=-2)
 	# expected_bins = numpy.array([1, 9, 17, 25, 33])
 	binned = binning.bin_lists(x_list, y_dict, bin_config)
 	mean_binned = [bin.mean_point() for bin in binned]
 	assert mean_binned == snapshot
 # def test_group_x_bins_summary(snapshot):
 # 	x_list = numpy.array([1, 2.8, 8, 12.2, 13.6, 17, 19.71, 20, 24, 33])
 # 	y_dict = {
 # 		"identity_plus_one": (
 # 			numpy.array([1, 2.8, 8, 12.2, 13.6, 17, 19.71, 20, 24, 33]) + 2
 # 		)
 # 	}
 # 	bin_config = binning.BinConfig(log_scale=False, bin_width=8)
 # 	# expected_bins = numpy.array([1, 9, 17, 25, 33])
 # 	binned = binning.bin_lists(x_list, y_dict, bin_config)
 # 	summary = [bin.summary_point() for bin in binned]
 # 	assert summary == snapshot