diff --git a/deepdog/__init__.py b/deepdog/__init__.py index 8dc5987..5db1f84 100644 --- a/deepdog/__init__.py +++ b/deepdog/__init__.py @@ -4,7 +4,7 @@ from deepdog.bayes_run import BayesRun from deepdog.bayes_run_simulpairs import BayesRunSimulPairs from deepdog.real_spectrum_run import RealSpectrumRun from deepdog.temp_aware_real_spectrum_run import TempAwareRealSpectrumRun - +from deepdog.bayes_run_with_ss import BayesRunWithSubspaceSimulation def get_version(): return __version__ @@ -16,6 +16,7 @@ __all__ = [ "BayesRunSimulPairs", "RealSpectrumRun", "TempAwareRealSpectrumRun", + "BayesRunWithSubspaceSimulation", ] diff --git a/deepdog/bayes_run_with_ss.py b/deepdog/bayes_run_with_ss.py new file mode 100644 index 0000000..ad73e00 --- /dev/null +++ b/deepdog/bayes_run_with_ss.py @@ -0,0 +1,229 @@ +import deepdog.subset_simulation +import pdme.inputs +import pdme.model +import pdme.measurement.input_types +import pdme.measurement.oscillating_dipole +import pdme.util.fast_v_calc +import pdme.util.fast_nonlocal_spectrum +from typing import Sequence, Tuple, List, Optional +import datetime +import csv +import multiprocessing +import logging +import numpy +import numpy.typing + + +# TODO: remove hardcode +CHUNKSIZE = 50 + +# TODO: It's garbage to have this here duplicated from pdme. +DotInput = Tuple[numpy.typing.ArrayLike, float] + + +_logger = logging.getLogger(__name__) + + +class BayesRunWithSubspaceSimulation: + """ + A single Bayes run for a given set of dots. + + Parameters + ---------- + dot_inputs : Sequence[DotInput] + The dot inputs for this bayes run. + + models_with_names : Sequence[Tuple(str, pdme.model.DipoleModel)] + The models to evaluate. + + actual_model : pdme.model.DipoleModel + The model which is actually correct. + + filename_slug : str + The filename slug to include. + + run_count: int + The number of runs to do. + """ + + def __init__( + self, + dot_positions: Sequence[numpy.typing.ArrayLike], + frequency_range: Sequence[float], + models_with_names: Sequence[Tuple[str, pdme.model.DipoleModel]], + actual_model: pdme.model.DipoleModel, + filename_slug: str, + max_frequency: float = 20, + end_threshold: float = None, + run_count = 100, + chunksize: int = CHUNKSIZE, + ss_n_c: int = 500, + ss_n_s: int = 100, + ss_m_max: int = 15, + ss_target_cost: Optional[float] = None, + ss_level_0_seed: int = 200, + ss_mcmc_seed: int = 20, + ss_use_adaptive_steps=True, + ss_default_phi_step=0.01, + ss_default_theta_step=0.01, + ss_default_r_step=0.01, + ss_default_w_log_step=0.01, + ss_default_upper_w_log_step=4, + ) -> None: + self.dot_inputs = pdme.inputs.inputs_with_frequency_range( + dot_positions, frequency_range + ) + self.dot_inputs_array = pdme.measurement.input_types.dot_inputs_to_array( + self.dot_inputs + ) + + self.models_with_names = models_with_names + self.models = [model for (_, model) in models_with_names] + self.model_names = [name for (name, _) in models_with_names] + self.actual_model = actual_model + + self.n: int + try: + self.n = self.actual_model.n # type: ignore + except AttributeError: + self.n = 1 + + self.model_count = len(self.models) + + self.csv_fields = [] + for i in range(self.n): + self.csv_fields.extend( + [ + f"dipole_moment_{i+1}", + f"dipole_location_{i+1}", + f"dipole_frequency_{i+1}", + ] + ) + self.compensate_zeros = True + self.chunksize = chunksize + for name in self.model_names: + self.csv_fields.extend([f"{name}_likelihood", f"{name}_prob"]) + + self.probabilities = [1 / self.model_count] * self.model_count + + timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + self.filename = f"{timestamp}-{filename_slug}.bayesrunwithss.csv" + self.max_frequency = max_frequency + + if end_threshold is not None: + if 0 < end_threshold < 1: + self.end_threshold: float = end_threshold + self.use_end_threshold = True + _logger.info(f"Will abort early, at {self.end_threshold}.") + else: + raise ValueError( + f"end_threshold should be between 0 and 1, but is actually {end_threshold}" + ) + + self.ss_n_c = ss_n_c + self.ss_n_s = ss_n_s + self.ss_m_max = ss_m_max + self.ss_target_cost = ss_target_cost + self.ss_level_0_seed = ss_level_0_seed + self.ss_mcmc_seed = ss_mcmc_seed + self.ss_use_adaptive_steps = ss_use_adaptive_steps + self.ss_default_phi_step = ss_default_phi_step + self.ss_default_theta_step = ss_default_theta_step + self.ss_default_r_step = ss_default_r_step + self.ss_default_w_log_step = ss_default_w_log_step + self.ss_default_upper_w_log_step = ss_default_upper_w_log_step + + self.run_count = run_count + + def go(self) -> None: + with open(self.filename, "a", newline="") as outfile: + writer = csv.DictWriter(outfile, fieldnames=self.csv_fields, dialect="unix") + writer.writeheader() + + for run in range(1, self.run_count + 1): + + # Generate the actual dipoles + actual_dipoles = self.actual_model.get_dipoles(self.max_frequency) + + measurements = actual_dipoles.get_dot_measurements( + self.dot_inputs + ) + + + _logger.info(f"Going to work on dipole at {actual_dipoles.dipoles}") + + # define a new seed sequence for each run + + results = [] + _logger.debug("Going to iterate over models now") + for model_count, model in enumerate(self.models_with_names): + _logger.debug(f"Doing model #{model_count}, {model[0]}") + subset_run = deepdog.subset_simulation.SubsetSimulation( + model, + self.dot_inputs, + measurements, + self.ss_n_c, + self.ss_n_s, + self.ss_m_max, + self.ss_target_cost, + self.ss_level_0_seed, + self.ss_mcmc_seed, + self.ss_use_adaptive_steps, + self.ss_default_phi_step, + self.ss_default_theta_step, + self.ss_default_r_step, + self.ss_default_w_log_step, + self.ss_default_upper_w_log_step, + ) + results.append(subset_run.execute()) + + _logger.debug("Done, constructing output now") + row = { + "dipole_moment_1": actual_dipoles.dipoles[0].p, + "dipole_location_1": actual_dipoles.dipoles[0].s, + "dipole_frequency_1": actual_dipoles.dipoles[0].w, + } + for i in range(1, self.n): + try: + current_dipoles = actual_dipoles.dipoles[i] + row[f"dipole_moment_{i+1}"] = current_dipoles.p + row[f"dipole_location_{i+1}"] = current_dipoles.s + row[f"dipole_frequency_{i+1}"] = current_dipoles.w + except IndexError: + _logger.info(f"Not writing anymore, saw end after {i}") + break + + likelihoods: List[float] = [] + + for (name, result) in zip(self.model_names, results): + likelihoods.append(result.over_target_likelihood) + row[f"{name}_likelihood"] = result.over_target_likelihood + + success_weight = sum( + [ + likelihood * prob + for likelihood, prob in zip(likelihoods, self.probabilities) + ] + ) + new_probabilities = [ + likelihood * old_prob / success_weight + for likelihood, old_prob in zip(likelihoods, self.probabilities) + ] + self.probabilities = new_probabilities + for name, probability in zip(self.model_names, self.probabilities): + row[f"{name}_prob"] = probability + _logger.info(row) + + with open(self.filename, "a", newline="") as outfile: + writer = csv.DictWriter( + outfile, fieldnames=self.csv_fields, dialect="unix" + ) + writer.writerow(row) + + if self.use_end_threshold: + max_prob = max(self.probabilities) + if max_prob > self.end_threshold: + _logger.info( + f"Aborting early, because {max_prob} is greater than {self.end_threshold}" + ) + break diff --git a/deepdog/subset_simulation/__init__.py b/deepdog/subset_simulation/__init__.py new file mode 100644 index 0000000..68b2fef --- /dev/null +++ b/deepdog/subset_simulation/__init__.py @@ -0,0 +1,3 @@ +from deepdog.subset_simulation.subset_simulation_impl import SubsetSimulation + +__all__ = ["SubsetSimulation"] diff --git a/deepdog/subset_simulation/subset_simulation_impl.py b/deepdog/subset_simulation/subset_simulation_impl.py new file mode 100644 index 0000000..377ce43 --- /dev/null +++ b/deepdog/subset_simulation/subset_simulation_impl.py @@ -0,0 +1,310 @@ +import logging +import numpy +import pdme.measurement +import pdme.measurement.input_types +import pdme.subspace_simulation +import bisect +from typing import Sequence, Tuple, Optional + +from dataclasses import dataclass + +_logger = logging.getLogger(__name__) + + +@dataclass +class SubsetSimulationResult: + probs_list: Sequence[Tuple] + over_target_cost: Optional[float] + over_target_likelihood: Optional[float] + under_target_cost: Optional[float] + under_target_likelihood: Optional[float] + + +class SubsetSimulation: + def __init__( + self, + model_name_pair, + dot_inputs, + actual_measurements: Sequence[pdme.measurement.DotMeasurement], + n_c: int, + n_s: int, + m_max: int, + target_cost: Optional[float] = None, + level_0_seed: int = 200, + mcmc_seed: int = 20, + use_adaptive_steps=True, + default_phi_step=0.01, + default_theta_step=0.01, + default_r_step=0.01, + default_w_log_step=0.01, + default_upper_w_log_step=4, + ): + name, model = model_name_pair + self.model_name = name + self.model = model + _logger.info(f"got model {self.model_name}") + + self.dot_inputs_array = pdme.measurement.input_types.dot_inputs_to_array( + dot_inputs + ) + # _logger.debug(f"actual measurements: {actual_measurements}") + self.actual_measurement_array = numpy.array([m.v for m in actual_measurements]) + + def cost_function_to_use(dipoles_to_test): + return pdme.subspace_simulation.proportional_costs_vs_actual_measurement( + self.dot_inputs_array, self.actual_measurement_array, dipoles_to_test + ) + + self.cost_function_to_use = cost_function_to_use + + self.n_c = n_c + self.n_s = n_s + self.m_max = m_max + + self.level_0_seed = level_0_seed + self.mcmc_seed = mcmc_seed + + self.use_adaptive_steps = use_adaptive_steps + self.default_phi_step = default_phi_step + self.default_theta_step = default_theta_step + self.default_r_step = default_r_step + self.default_w_log_step = default_w_log_step + self.default_upper_w_log_step = default_upper_w_log_step + + _logger.info(f"using params:") + _logger.info(f"\tn_c: {self.n_c}") + _logger.info(f"\tn_s: {self.n_s}") + _logger.info(f"\tm: {self.m_max}") + _logger.info(f"let's do level 0...") + + self.target_cost = target_cost + _logger.info(f"will stop at target cost {target_cost}") + + def execute(self) -> SubsetSimulationResult: + + probs_list = [] + + sample_dipoles = self.model.get_monte_carlo_dipole_inputs( + self.n_c * self.n_s, + -1, + rng_to_use=numpy.random.default_rng(self.level_0_seed), + ) + # _logger.debug(sample_dipoles) + # _logger.debug(sample_dipoles.shape) + costs = self.cost_function_to_use(sample_dipoles) + + _logger.debug(f"costs: {costs}") + sorted_indexes = costs.argsort()[::-1] + + _logger.debug(costs[sorted_indexes]) + _logger.debug(sample_dipoles[sorted_indexes]) + + sorted_costs = costs[sorted_indexes] + sorted_dipoles = sample_dipoles[sorted_indexes] + + threshold_cost = sorted_costs[-self.n_c] + + all_dipoles = numpy.array( + [ + pdme.subspace_simulation.sort_array_of_dipoles_by_frequency(samp) + for samp in sorted_dipoles + ] + ) + all_chains = list(zip(sorted_costs, all_dipoles)) + + mcmc_rng = numpy.random.default_rng(self.mcmc_seed) + + for i in range(self.m_max): + next_seeds = all_chains[-self.n_c :] + + for cost_index, cost_chain in enumerate(all_chains[: -self.n_c]): + probs_list.append( + ( + ((self.n_c * self.n_s - cost_index) / (self.n_c * self.n_s)) + / (self.n_s ** (i)), + cost_chain[0], + i + 1, + ) + ) + + next_seeds_as_array = numpy.array([s for _, s in next_seeds]) + + stdevs = self.get_stdevs_from_arrays(next_seeds_as_array) + _logger.info(f"got stdevs: {stdevs.stdevs}") + + all_chains = [] + for c, s in next_seeds: + # chain = mcmc(s, threshold_cost, n_s, model, dot_inputs_array, actual_measurement_array, mcmc_rng, curr_cost=c, stdevs=stdevs) + # until new version gotta do + chain = self.model.get_mcmc_chain( + s, + self.cost_function_to_use, + self.n_s, + threshold_cost, + stdevs, + initial_cost=c, + rng_arg=mcmc_rng, + ) + for cost, chained in chain: + try: + filtered_cost = cost[0] + except IndexError: + filtered_cost = cost + all_chains.append((filtered_cost, chained)) + + # _logger.debug(all_chains) + + all_chains.sort(key=lambda c: c[0], reverse=True) + + threshold_cost = all_chains[-self.n_c][0] + _logger.info( + f"current threshold cost: {threshold_cost}, at P = (1 / {self.n_s})^{i + 1}" + ) + if (self.target_cost is not None) and (threshold_cost < self.target_cost): + _logger.info( + f"got a threshold cost {threshold_cost}, less than {self.target_cost}. will leave early" + ) + + cost_list = [c[0] for c in all_chains] + over_index = reverse_bisect_right(cost_list, self.target_cost) + + shorter_probs_list = [] + for cost_index, cost_chain in enumerate(all_chains): + probs_list.append( + ( + ((self.n_c * self.n_s - cost_index) / (self.n_c * self.n_s)) + / (self.n_s ** (i)), + cost_chain[0], + i + 1, + ) + ) + shorter_probs_list.append( + ( + cost_chain[0], + ((self.n_c * self.n_s - cost_index) / (self.n_c * self.n_s)) + / (self.n_s ** (i)), + ) + ) + # _logger.info(shorter_probs_list) + result = SubsetSimulationResult( + probs_list=probs_list, + over_target_cost=shorter_probs_list[over_index - 1][0], + over_target_likelihood=shorter_probs_list[over_index - 1][1], + under_target_cost=shorter_probs_list[over_index][0], + under_target_likelihood=shorter_probs_list[over_index][1], + ) + return result + + # _logger.debug([c[0] for c in all_chains[-n_c:]]) + _logger.info(f"doing level {i + 1}") + + for cost_index, cost_chain in enumerate(all_chains): + probs_list.append( + ( + ((self.n_c * self.n_s - cost_index) / (self.n_c * self.n_s)) + / (self.n_s ** (self.m_max)), + cost_chain[0], + self.m_max + 1, + ) + ) + threshold_cost = all_chains[-self.n_c][0] + _logger.info( + f"final threshold cost: {threshold_cost}, at P = (1 / {self.n_s})^{self.m_max + 1}" + ) + for a in all_chains[-10:]: + _logger.info(a) + # for prob, prob_cost in probs_list: + # _logger.info(f"\t{prob}: {prob_cost}") + probs_list.sort(key=lambda c: c[0], reverse=True) + result = SubsetSimulationResult( + probs_list=probs_list, + over_target_cost=None, + over_target_likelihood=None, + under_target_cost=None, + under_target_likelihood=None, + ) + return result + + def get_stdevs_from_arrays( + self, array + ) -> pdme.subspace_simulation.MCMCStandardDeviation: + # stdevs = get_stdevs_from_arrays(next_seeds_as_array, model) + if self.use_adaptive_steps: + + stdev_array = [] + count = array.shape[1] + for dipole_index in range(count): + selected = array[:, dipole_index] + pxs = selected[:, 0] + pys = selected[:, 1] + pzs = selected[:, 2] + thetas = numpy.arccos(pzs / self.model.pfixed) + phis = numpy.arctan2(pys, pxs) + + rstdevs = numpy.maximum( + numpy.std(selected, axis=0)[3:6], + self.default_r_step / (self.n_s * 10), + ) + frequency_stdevs = numpy.minimum( + numpy.maximum( + numpy.std(numpy.log(selected[:, -1])), + self.default_w_log_step / (self.n_s * 10), + ), + self.default_upper_w_log_step, + ) + stdev_array.append( + pdme.subspace_simulation.DipoleStandardDeviation( + p_theta_step=max( + numpy.std(thetas), self.default_theta_step / (self.n_s * 10) + ), + p_phi_step=max( + numpy.std(phis), self.default_phi_step / (self.n_s * 10) + ), + rx_step=rstdevs[0], + ry_step=rstdevs[1], + rz_step=rstdevs[2], + w_log_step=frequency_stdevs, + ) + ) + else: + default_stdev = pdme.subspace_simulation.DipoleStandardDeviation( + self.default_phi_step, + self.default_theta_step, + self.default_r_step, + self.default_r_step, + self.default_r_step, + self.default_w_log_step, + ) + stdev_array = [default_stdev] + stdevs = pdme.subspace_simulation.MCMCStandardDeviation(stdev_array) + return stdevs + + +def reverse_bisect_right(a, x, lo=0, hi=None): + """Return the index where to insert item x in list a, assuming a is sorted in descending order. + + The return value i is such that all e in a[:i] have e >= x, and all e in + a[i:] have e < x. So if x already appears in the list, a.insert(x) will + insert just after the rightmost x already there. + + Optional args lo (default 0) and hi (default len(a)) bound the + slice of a to be searched. + + Essentially, the function returns number of elements in a which are >= than x. + >>> a = [8, 6, 5, 4, 2] + >>> reverse_bisect_right(a, 5) + 3 + >>> a[:reverse_bisect_right(a, 5)] + [8, 6, 5] + """ + if lo < 0: + raise ValueError("lo must be non-negative") + if hi is None: + hi = len(a) + while lo < hi: + mid = (lo + hi) // 2 + if x > a[mid]: + hi = mid + else: + lo = mid + 1 + return lo diff --git a/poetry.lock b/poetry.lock index 42ad5ec..3fa9092 100644 --- a/poetry.lock +++ b/poetry.lock @@ -360,7 +360,7 @@ python-versions = ">=3.7" [[package]] name = "pdme" -version = "0.8.9" +version = "0.9.1" description = "Python dipole model evaluator" category = "main" optional = false @@ -730,7 +730,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "flake8 (<5)", "pytest-co [metadata] lock-version = "1.1" python-versions = ">=3.8.1,<3.10" -content-hash = "7a708c21744e80b6b5e94f7ef2d5b58f9539179d2c0dd1d9f17b3daa4151d52f" +content-hash = "0161af7edf18c16819f1ce083ab491c17c9809f2770219725131451b1a16a970" [metadata.files] black = [] diff --git a/pyproject.toml b/pyproject.toml index 6f375fc..f252cbd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ authors = ["Deepak Mallubhotla "] [tool.poetry.dependencies] python = ">=3.8.1,<3.10" -pdme = "^0.8.9" +pdme = "^0.9.1" numpy = "1.22.3" scipy = "1.10"