feat: add subset simulation implementation
deepdog/subset_simulation/subset_simulation_impl.py (new file, +310 lines)
@@ -0,0 +1,310 @@
import logging
import numpy
import pdme.measurement
import pdme.measurement.input_types
import pdme.subspace_simulation
from typing import Sequence, Tuple, Optional

from dataclasses import dataclass

_logger = logging.getLogger(__name__)


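# Result container for a subset simulation run. probs_list holds
# (probability, cost, level) tuples for the samples seen along the way; the
# over_/under_ fields bracket the target cost when a run stops early at a
# target, and are left as None otherwise.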
@dataclass
class SubsetSimulationResult:
    probs_list: Sequence[Tuple]
    over_target_cost: Optional[float]
    over_target_likelihood: Optional[float]
    under_target_cost: Optional[float]
    under_target_likelihood: Optional[float]


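# Subset simulation driver for dipole models. Level 0 draws n_c * n_s samples
# directly from the model; every later level seeds n_c MCMC chains from the
# n_c lowest-cost samples of the previous level and runs each chain for n_s
# steps against a tightening cost threshold, so samples surviving level i
# carry probability weight (1 / n_s)^i.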
class SubsetSimulation:
    def __init__(
        self,
        model_name_pair,
        dot_inputs,
        actual_measurements: Sequence[pdme.measurement.DotMeasurement],
        n_c: int,
        n_s: int,
        m_max: int,
        target_cost: Optional[float] = None,
        level_0_seed: int = 200,
        mcmc_seed: int = 20,
        use_adaptive_steps=True,
        default_phi_step=0.01,
        default_theta_step=0.01,
        default_r_step=0.01,
        default_w_log_step=0.01,
        default_upper_w_log_step=4,
    ):
        name, model = model_name_pair
        self.model_name = name
        self.model = model
        _logger.info(f"got model {self.model_name}")

        self.dot_inputs_array = pdme.measurement.input_types.dot_inputs_to_array(
            dot_inputs
        )
        # _logger.debug(f"actual measurements: {actual_measurements}")
        self.actual_measurement_array = numpy.array([m.v for m in actual_measurements])

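        # Batch cost of candidate dipole configurations against the actual
        # measurements; used throughout as a lower-is-better figure of merit.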
        def cost_function_to_use(dipoles_to_test):
            return pdme.subspace_simulation.proportional_costs_vs_actual_measurement(
                self.dot_inputs_array, self.actual_measurement_array, dipoles_to_test
            )

        self.cost_function_to_use = cost_function_to_use

        self.n_c = n_c
        self.n_s = n_s
        self.m_max = m_max

        self.level_0_seed = level_0_seed
        self.mcmc_seed = mcmc_seed

        self.use_adaptive_steps = use_adaptive_steps
        self.default_phi_step = default_phi_step
        self.default_theta_step = default_theta_step
        self.default_r_step = default_r_step
        self.default_w_log_step = default_w_log_step
        self.default_upper_w_log_step = default_upper_w_log_step

        _logger.info("using params:")
        _logger.info(f"\tn_c: {self.n_c}")
        _logger.info(f"\tn_s: {self.n_s}")
        _logger.info(f"\tm_max: {self.m_max}")
        _logger.info("let's do level 0...")

        self.target_cost = target_cost
        _logger.info(f"will stop at target cost {target_cost}")

    def execute(self) -> SubsetSimulationResult:

        probs_list = []

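        # Level 0: plain Monte Carlo sampling of n_c * n_s dipole configurations
        # from the model, with its own seeded RNG for reproducibility.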
        sample_dipoles = self.model.get_monte_carlo_dipole_inputs(
            self.n_c * self.n_s,
            -1,
            rng_to_use=numpy.random.default_rng(self.level_0_seed),
        )
        # _logger.debug(sample_dipoles)
        # _logger.debug(sample_dipoles.shape)
        costs = self.cost_function_to_use(sample_dipoles)

        _logger.debug(f"costs: {costs}")
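        # Sort by cost in descending order so the n_c best (lowest-cost) samples
        # end up at the tail; the level threshold is the n_c-th lowest cost.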
        sorted_indexes = costs.argsort()[::-1]

        _logger.debug(costs[sorted_indexes])
        _logger.debug(sample_dipoles[sorted_indexes])

        sorted_costs = costs[sorted_indexes]
        sorted_dipoles = sample_dipoles[sorted_indexes]

        threshold_cost = sorted_costs[-self.n_c]

        all_dipoles = numpy.array(
            [
                pdme.subspace_simulation.sort_array_of_dipoles_by_frequency(samp)
                for samp in sorted_dipoles
            ]
        )
        all_chains = list(zip(sorted_costs, all_dipoles))

        mcmc_rng = numpy.random.default_rng(self.mcmc_seed)

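        # Main loop: one iteration per conditional level, up to m_max levels.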
        for i in range(self.m_max):
            next_seeds = all_chains[-self.n_c :]

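            # Record probability estimates for the samples not carried forward as
            # seeds: the sample ranked cost_index at level i gets weight
            # ((n_c * n_s - cost_index) / (n_c * n_s)) * (1 / n_s)^i.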
            for cost_index, cost_chain in enumerate(all_chains[: -self.n_c]):
                probs_list.append(
                    (
                        ((self.n_c * self.n_s - cost_index) / (self.n_c * self.n_s))
                        / (self.n_s ** (i)),
                        cost_chain[0],
                        i + 1,
                    )
                )

            next_seeds_as_array = numpy.array([s for _, s in next_seeds])

            stdevs = self.get_stdevs_from_arrays(next_seeds_as_array)
            _logger.info(f"got stdevs: {stdevs.stdevs}")

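            # Rebuild the population: each of the n_c seeds spawns an MCMC chain of
            # n_s states via the model's get_mcmc_chain, conditioned on the current
            # threshold cost.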
            all_chains = []
            for c, s in next_seeds:
                # chain = mcmc(s, threshold_cost, n_s, model, dot_inputs_array, actual_measurement_array, mcmc_rng, curr_cost=c, stdevs=stdevs)
                # until a new version makes that possible, go through the model:
                chain = self.model.get_mcmc_chain(
                    s,
                    self.cost_function_to_use,
                    self.n_s,
                    threshold_cost,
                    stdevs,
                    initial_cost=c,
                    rng_arg=mcmc_rng,
                )
                for cost, chained in chain:
                    try:
                        filtered_cost = cost[0]
                    except IndexError:
                        filtered_cost = cost
                    all_chains.append((filtered_cost, chained))

            # _logger.debug(all_chains)

            all_chains.sort(key=lambda c: c[0], reverse=True)

            threshold_cost = all_chains[-self.n_c][0]
            _logger.info(
                f"current threshold cost: {threshold_cost}, at P = (1 / {self.n_s})^{i + 1}"
            )
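            # Early exit once the threshold cost has dropped below the requested
            # target: reverse_bisect_right locates the target in the descending cost
            # list, and the neighbouring entries give the bracketing
            # (cost, likelihood) pairs reported in the result.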
            if (self.target_cost is not None) and (threshold_cost < self.target_cost):
                _logger.info(
                    f"got a threshold cost {threshold_cost}, less than {self.target_cost}. will leave early"
                )

                cost_list = [c[0] for c in all_chains]
                over_index = reverse_bisect_right(cost_list, self.target_cost)

                shorter_probs_list = []
                for cost_index, cost_chain in enumerate(all_chains):
                    probs_list.append(
                        (
                            ((self.n_c * self.n_s - cost_index) / (self.n_c * self.n_s))
                            / (self.n_s ** (i)),
                            cost_chain[0],
                            i + 1,
                        )
                    )
                    shorter_probs_list.append(
                        (
                            cost_chain[0],
                            ((self.n_c * self.n_s - cost_index) / (self.n_c * self.n_s))
                            / (self.n_s ** (i)),
                        )
                    )
                # _logger.info(shorter_probs_list)
                result = SubsetSimulationResult(
                    probs_list=probs_list,
                    over_target_cost=shorter_probs_list[over_index - 1][0],
                    over_target_likelihood=shorter_probs_list[over_index - 1][1],
                    under_target_cost=shorter_probs_list[over_index][0],
                    under_target_likelihood=shorter_probs_list[over_index][1],
                )
                return result

            # _logger.debug([c[0] for c in all_chains[-n_c:]])
            _logger.info(f"doing level {i + 1}")

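        # All m_max levels ran without reaching a target cost: record the final
        # population at weight (1 / n_s)^m_max and return without target bracketing.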
        for cost_index, cost_chain in enumerate(all_chains):
            probs_list.append(
                (
                    ((self.n_c * self.n_s - cost_index) / (self.n_c * self.n_s))
                    / (self.n_s ** (self.m_max)),
                    cost_chain[0],
                    self.m_max + 1,
                )
            )
        threshold_cost = all_chains[-self.n_c][0]
        _logger.info(
            f"final threshold cost: {threshold_cost}, at P = (1 / {self.n_s})^{self.m_max + 1}"
        )
        for a in all_chains[-10:]:
            _logger.info(a)
        # for prob, prob_cost in probs_list:
        #     _logger.info(f"\t{prob}: {prob_cost}")
        probs_list.sort(key=lambda c: c[0], reverse=True)
        result = SubsetSimulationResult(
            probs_list=probs_list,
            over_target_cost=None,
            over_target_likelihood=None,
            under_target_cost=None,
            under_target_likelihood=None,
        )
        return result

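    # Per-dipole proposal step sizes for the MCMC chains. With adaptive steps the
    # sizes come from the spread of the current seed population, floored so they
    # cannot collapse to zero and capped (for the log-frequency step) at
    # default_upper_w_log_step; otherwise the fixed defaults are used for every
    # dipole.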
    def get_stdevs_from_arrays(
        self, array
    ) -> pdme.subspace_simulation.MCMCStandardDeviation:
        # stdevs = get_stdevs_from_arrays(next_seeds_as_array, model)
        if self.use_adaptive_steps:
            stdev_array = []
            count = array.shape[1]
            for dipole_index in range(count):
                selected = array[:, dipole_index]
                pxs = selected[:, 0]
                pys = selected[:, 1]
                pzs = selected[:, 2]
                thetas = numpy.arccos(pzs / self.model.pfixed)
                phis = numpy.arctan2(pys, pxs)

                rstdevs = numpy.maximum(
                    numpy.std(selected, axis=0)[3:6],
                    self.default_r_step / (self.n_s * 10),
                )
                frequency_stdevs = numpy.minimum(
                    numpy.maximum(
                        numpy.std(numpy.log(selected[:, -1])),
                        self.default_w_log_step / (self.n_s * 10),
                    ),
                    self.default_upper_w_log_step,
                )
                stdev_array.append(
                    pdme.subspace_simulation.DipoleStandardDeviation(
                        p_theta_step=max(
                            numpy.std(thetas), self.default_theta_step / (self.n_s * 10)
                        ),
                        p_phi_step=max(
                            numpy.std(phis), self.default_phi_step / (self.n_s * 10)
                        ),
                        rx_step=rstdevs[0],
                        ry_step=rstdevs[1],
                        rz_step=rstdevs[2],
                        w_log_step=frequency_stdevs,
                    )
                )
        else:
            default_stdev = pdme.subspace_simulation.DipoleStandardDeviation(
                self.default_phi_step,
                self.default_theta_step,
                self.default_r_step,
                self.default_r_step,
                self.default_r_step,
                self.default_w_log_step,
            )
            stdev_array = [default_stdev]
        stdevs = pdme.subspace_simulation.MCMCStandardDeviation(stdev_array)
        return stdevs


def reverse_bisect_right(a, x, lo=0, hi=None):
    """Return the index at which to insert item x in list a, assuming a is sorted in descending order.

    The return value i is such that all e in a[:i] have e >= x, and all e in
    a[i:] have e < x. So if x already appears in the list, a.insert(i, x) will
    insert just after the rightmost x already there.

    Optional args lo (default 0) and hi (default len(a)) bound the
    slice of a to be searched.

    Essentially, the function returns the number of elements in a which are >= x.

    >>> a = [8, 6, 5, 4, 2]
    >>> reverse_bisect_right(a, 5)
    3
    >>> a[:reverse_bisect_right(a, 5)]
    [8, 6, 5]
    """
    if lo < 0:
        raise ValueError("lo must be non-negative")
    if hi is None:
        hi = len(a)
    while lo < hi:
        mid = (lo + hi) // 2
        if x > a[mid]:
            hi = mid
        else:
            lo = mid + 1
    return lo