Compare commits

..

17 Commits

Author SHA1 Message Date
c288e42e5f chore(deps): update dependency numpy to v1.26.4
Some checks failed
renovate/artifacts Artifact file update failure
gitea-physics/deepdog/pipeline/pr-master There was a failure building this commit
2024-05-21 01:30:41 +00:00
9cfd484d7c chore(release): 1.3.0
All checks were successful
gitea-physics/deepdog/pipeline/tag This commit looks good
gitea-physics/deepdog/pipeline/head This commit looks good
2024-05-19 22:13:39 -05:00
09fad2e102 feat: improve initial cost calculation to allow multiprocessing, adds ability to specify a number of levels to do with direct mc instead of subset simulation 2024-05-19 22:11:50 -05:00
24ac65bf9c fix: fix seeding to avoid recreating seed combinations across multi runs 2024-05-19 22:10:40 -05:00
8fbae32111 doc: some commenting and logging changes 2024-05-19 22:09:52 -05:00
b1c01b25c8 fix: Adds ugly hack for stdevs for this uniform range to multiply by root3, proper fix would be in pdme 2024-05-19 22:08:44 -05:00
a14d9834e5 doc: note on refactoring for subset sim probs 2024-05-19 22:01:42 -05:00
8d04803eb3 fmt: formatting, nicer log, removing comment 2024-05-19 02:29:59 -05:00
92b49fce7c feat: add multi run to wrap multi model and repeat runs 2024-05-19 02:27:11 -05:00
8845b2875f feat: adds a filter that works with cost functions 2024-05-19 02:26:00 -05:00
72791f2d0f deps: update pdme 2024-05-19 02:25:29 -05:00
d258cfbec7 chore(release): 1.2.1
All checks were successful
gitea-physics/deepdog/pipeline/head This commit looks good
gitea-physics/deepdog/pipeline/tag This commit looks good
2024-05-11 20:51:05 -05:00
b3bf4cde97 perf: precompile the magic regexes for probs parsing 2024-05-11 20:49:45 -05:00
60f29b0b2f perf: avoid recalculating product dict in indexifier to improve performance for probs 2024-05-11 20:49:26 -05:00
093a3fb5c4 chore(release): 1.2.0
All checks were successful
gitea-physics/deepdog/pipeline/head This commit looks good
gitea-physics/deepdog/pipeline/tag This commit looks good
2024-05-08 22:24:28 -05:00
dc1d2d45a3 feat: adds additional matching regexes
All checks were successful
gitea-physics/deepdog/pipeline/head This commit looks good
2024-05-08 22:23:57 -05:00
f0e2fa3da9 feat: adds magnitude enabled parsing option 2024-05-03 10:44:06 -05:00
12 changed files with 655 additions and 178 deletions

View File

@@ -2,6 +2,31 @@
All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines. All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines.
## [1.3.0](https://gitea.deepak.science:2222/physics/deepdog/compare/1.2.1...1.3.0) (2024-05-20)
### Features
* add multi run to wrap multi model and repeat runs ([92b49fc](https://gitea.deepak.science:2222/physics/deepdog/commit/92b49fce7c86f14484deb1c4aaaa810a6f69c08a))
* adds a filter that works with cost functions ([8845b28](https://gitea.deepak.science:2222/physics/deepdog/commit/8845b2875f2c91c91dd3988fabda26400c59b2d7))
* improve initial cost calculation to allow multiprocessing, adds ability to specify a number of levels to do with direct mc instead of subset simulation ([09fad2e](https://gitea.deepak.science:2222/physics/deepdog/commit/09fad2e1024d9237a6a4f7931f51cb4c84b83bf8))
### Bug Fixes
* Adds ugly hack for stdevs for this uniform range to multiply by root3, proper fix would be in pdme ([b1c01b2](https://gitea.deepak.science:2222/physics/deepdog/commit/b1c01b25c8f2c3947be23f5b2c656c37437dab17))
* fix seeding to avoid recreating seed combinations across multi runs ([24ac65b](https://gitea.deepak.science:2222/physics/deepdog/commit/24ac65bf9c74c454fec826ca9de640fe095f5a17))
### [1.2.1](https://gitea.deepak.science:2222/physics/deepdog/compare/1.2.0...1.2.1) (2024-05-12)
## [1.2.0](https://gitea.deepak.science:2222/physics/deepdog/compare/1.1.0...1.2.0) (2024-05-09)
### Features
* adds additional matching regexes ([dc1d2d4](https://gitea.deepak.science:2222/physics/deepdog/commit/dc1d2d45a3e631c5efccce80f8a24fa87c6089e0))
* adds magnitude enabled parsing option ([f0e2fa3](https://gitea.deepak.science:2222/physics/deepdog/commit/f0e2fa3da9f5a5136908d691137a904fda4e3a9a))
## [1.1.0](https://gitea.deepak.science:2222/physics/deepdog/compare/1.0.1...1.1.0) (2024-05-03) ## [1.1.0](https://gitea.deepak.science:2222/physics/deepdog/compare/1.0.1...1.1.0) (2024-05-03)

View File

@@ -72,6 +72,7 @@ def main(args: argparse.Namespace):
for f in tqdm.tqdm(out_files, desc="reading files", leave=False) for f in tqdm.tqdm(out_files, desc="reading files", leave=False)
] ]
# Refactor here to allow for arbitrary likelihood file sources
_logger.info("building uncoalesced dict") _logger.info("building uncoalesced dict")
uncoalesced_dict = deepdog.cli.probs.dicts.build_model_dict(parsed_output_files) uncoalesced_dict = deepdog.cli.probs.dicts.build_model_dict(parsed_output_files)

View File

@@ -0,0 +1,24 @@
from deepdog.direct_monte_carlo.direct_mc import DirectMonteCarloFilter
from typing import Callable
import numpy
class CostFunctionTargetFilter(DirectMonteCarloFilter):
def __init__(
self,
cost_function: Callable[[numpy.ndarray], numpy.ndarray],
target_cost: float,
):
"""
Filters dipoles by cost, only leaving dipoles with cost below target_cost
"""
self.cost_function = cost_function
self.target_cost = target_cost
def filter_samples(self, samples: numpy.ndarray) -> numpy.ndarray:
current_sample = samples
costs = self.cost_function(current_sample)
current_sample = current_sample[costs < self.target_cost]
return current_sample

View File

@@ -31,10 +31,10 @@ class Indexifier:
def __init__(self, list_dict: typing.Dict[str, typing.Sequence]): def __init__(self, list_dict: typing.Dict[str, typing.Sequence]):
self.dict = list_dict self.dict = list_dict
self.product_dict = _dict_product(self.dict)
def indexify(self, n: int) -> typing.Dict[str, typing.Any]: def indexify(self, n: int) -> typing.Dict[str, typing.Any]:
product_dict = _dict_product(self.dict) return self.product_dict[n]
return product_dict[n]
def _indexify_indices(self, n: int) -> typing.Sequence[int]: def _indexify_indices(self, n: int) -> typing.Sequence[int]:
""" """

View File

@@ -8,17 +8,30 @@ import csv
_logger = logging.getLogger(__name__) _logger = logging.getLogger(__name__)
FILENAME_REGEX = r"(?P<timestamp>\d{8}-\d{6})-(?P<filename_slug>.*)\.realdata\.fast_filter\.bayesrun\.csv" FILENAME_REGEX = re.compile(
r"(?P<timestamp>\d{8}-\d{6})-(?P<filename_slug>.*)\.realdata\.fast_filter\.bayesrun\.csv"
)
MODEL_REGEXES = [ MODEL_REGEXES = [
r"geom_(?P<xmin>-?\d+)_(?P<xmax>-?\d+)_(?P<ymin>-?\d+)_(?P<ymax>-?\d+)_(?P<zmin>-?\d+)_(?P<zmax>-?\d+)-orientation_(?P<orientation>free|fixedxy|fixedz)-dipole_count_(?P<avg_filled>\d+)_(?P<field_name>\w*)" re.compile(pattern)
for pattern in [
r"geom_(?P<xmin>-?\d+)_(?P<xmax>-?\d+)_(?P<ymin>-?\d+)_(?P<ymax>-?\d+)_(?P<zmin>-?\d+)_(?P<zmax>-?\d+)-orientation_(?P<orientation>free|fixedxy|fixedz)-dipole_count_(?P<avg_filled>\d+)_(?P<field_name>\w*)",
r"geom_(?P<xmin>-?\d+)_(?P<xmax>-?\d+)_(?P<ymin>-?\d+)_(?P<ymax>-?\d+)_(?P<zmin>-?\d+)_(?P<zmax>-?\d+)-magnitude_(?P<log_magnitude>\d*\.?\d+)-orientation_(?P<orientation>free|fixedxy|fixedz)-dipole_count_(?P<avg_filled>\d+)_(?P<field_name>\w*)",
r"geom_(?P<xmin>-?\d*\.?\d+)_(?P<xmax>-?\d*\.?\d+)_(?P<ymin>-?\d*\.?\d+)_(?P<ymax>-?\d*\.?\d+)_(?P<zmin>-?\d*\.?\d+)_(?P<zmax>-?\d*\.?\d+)-magnitude_(?P<log_magnitude>\d*\.?\d+)-orientation_(?P<orientation>free|fixedxy|fixedz)-dipole_count_(?P<avg_filled>\d+)_(?P<field_name>\w*)",
]
] ]
FILE_SLUG_REGEXES = [ FILE_SLUG_REGEXES = [
re.compile(pattern)
for pattern in [
r"(?P<tag>\w+)-(?P<job_index>\d+)",
r"mock_tarucha-(?P<job_index>\d+)", r"mock_tarucha-(?P<job_index>\d+)",
r"(?:(?P<mock>mock)_)?tarucha(?:_(?P<tarucha_run_id>\d+))?-(?P<job_index>\d+)", r"(?:(?P<mock>mock)_)?tarucha(?:_(?P<tarucha_run_id>\d+))?-(?P<job_index>\d+)",
]
] ]
SIMPLE_TAG_REGEX = re.compile(r"\w+-\d+")
@dataclasses.dataclass @dataclasses.dataclass
class BayesrunOutputFilename: class BayesrunOutputFilename:
@@ -27,7 +40,6 @@ class BayesrunOutputFilename:
path: pathlib.Path path: pathlib.Path
@dataclasses.dataclass
class BayesrunColumnParsed: class BayesrunColumnParsed:
""" """
class for parsing a bayesrun while pulling certain special fields out class for parsing a bayesrun while pulling certain special fields out
@@ -38,10 +50,21 @@ class BayesrunColumnParsed:
self.model_field_dict = { self.model_field_dict = {
k: v for k, v in groupdict.items() if k != "field_name" k: v for k, v in groupdict.items() if k != "field_name"
} }
self._groupdict_str = repr(groupdict)
def __str__(self): def __str__(self):
return f"BayesrunColumnParsed[{self.column_field}: {self.model_field_dict}]" return f"BayesrunColumnParsed[{self.column_field}: {self.model_field_dict}]"
def __repr__(self):
return f"BayesrunColumnParsed({self._groupdict_str})"
def __eq__(self, other):
if isinstance(other, BayesrunColumnParsed):
return (self.column_field == other.column_field) and (
self.model_field_dict == other.model_field_dict
)
return NotImplemented
@dataclasses.dataclass @dataclasses.dataclass
class BayesrunModelResult: class BayesrunModelResult:
@@ -73,7 +96,7 @@ def _parse_bayesrun_column(
Returns the groupdict for the first match, or None if no match found. Returns the groupdict for the first match, or None if no match found.
""" """
for pattern in MODEL_REGEXES: for pattern in MODEL_REGEXES:
match = re.match(pattern, column) match = pattern.match(column)
if match: if match:
return BayesrunColumnParsed(match.groupdict()) return BayesrunColumnParsed(match.groupdict())
else: else:
@@ -112,7 +135,7 @@ def _parse_bayesrun_row(
def _parse_output_filename(file: pathlib.Path) -> BayesrunOutputFilename: def _parse_output_filename(file: pathlib.Path) -> BayesrunOutputFilename:
filename = file.name filename = file.name
match = re.match(FILENAME_REGEX, filename) match = FILENAME_REGEX.match(filename)
if not match: if not match:
raise ValueError(f"{filename} was not a valid bayesrun output") raise ValueError(f"{filename} was not a valid bayesrun output")
groups = match.groupdict() groups = match.groupdict()
@@ -123,7 +146,7 @@ def _parse_output_filename(file: pathlib.Path) -> BayesrunOutputFilename:
def _parse_file_slug(slug: str) -> typing.Optional[typing.Dict[str, str]]: def _parse_file_slug(slug: str) -> typing.Optional[typing.Dict[str, str]]:
for pattern in FILE_SLUG_REGEXES: for pattern in FILE_SLUG_REGEXES:
match = re.match(pattern, slug) match = pattern.match(slug)
if match: if match:
return match.groupdict() return match.groupdict()
else: else:

View File

@@ -1,9 +1,11 @@
import logging import logging
import multiprocessing
import numpy import numpy
import pdme.measurement import pdme.measurement
import pdme.measurement.input_types import pdme.measurement.input_types
import pdme.model
import pdme.subspace_simulation import pdme.subspace_simulation
from typing import Sequence, Tuple, Optional from typing import Sequence, Tuple, Optional, Callable, Union, List
from dataclasses import dataclass from dataclasses import dataclass
@@ -18,47 +20,63 @@ class SubsetSimulationResult:
under_target_cost: Optional[float] under_target_cost: Optional[float]
under_target_likelihood: Optional[float] under_target_likelihood: Optional[float]
lowest_likelihood: Optional[float] lowest_likelihood: Optional[float]
messages: Sequence[str]
@dataclass
class MultiSubsetSimulationResult:
child_results: Sequence[SubsetSimulationResult]
model_name: str
estimated_likelihood: float
arithmetic_mean_estimated_likelihood: float
num_children: int
num_finished_children: int
clean_estimate: bool
class SubsetSimulation: class SubsetSimulation:
def __init__( def __init__(
self, self,
model_name_pair, model_name_pair,
dot_inputs, # actual_measurements: Sequence[pdme.measurement.DotMeasurement],
actual_measurements: Sequence[pdme.measurement.DotMeasurement], cost_function: Callable[[numpy.ndarray], numpy.ndarray],
n_c: int, n_c: int,
n_s: int, n_s: int,
m_max: int, m_max: int,
target_cost: Optional[float] = None, target_cost: Optional[float] = None,
level_0_seed: int = 200, level_0_seed: Union[int, Sequence[int]] = 200,
mcmc_seed: int = 20, mcmc_seed: Union[int, Sequence[int]] = 20,
use_adaptive_steps=True, use_adaptive_steps=True,
default_phi_step=0.01, default_phi_step=0.01,
default_theta_step=0.01, default_theta_step=0.01,
default_r_step=0.01, default_r_step=0.01,
default_w_log_step=0.01, default_w_log_step=0.01,
default_upper_w_log_step=4, default_upper_w_log_step=4,
num_initial_dmc_gens=1,
keep_probs_list=True, keep_probs_list=True,
dump_last_generation_to_file=False, dump_last_generation_to_file=False,
initial_cost_chunk_size=100, initial_cost_chunk_size=100,
initial_cost_multiprocess=True,
cap_core_count: int = 0, # 0 means cap at num cores - 1
): ):
name, model = model_name_pair name, model = model_name_pair
self.model_name = name self.model_name = name
self.model = model self.model = model
_logger.info(f"got model {self.model_name}") _logger.info(f"got model {self.model_name}")
self.dot_inputs_array = pdme.measurement.input_types.dot_inputs_to_array( # dot_inputs = [(meas.r, meas.f) for meas in actual_measurements]
dot_inputs # self.dot_inputs_array = pdme.measurement.input_types.dot_inputs_to_array(
) # dot_inputs
# )
# _logger.debug(f"actual measurements: {actual_measurements}") # _logger.debug(f"actual measurements: {actual_measurements}")
self.actual_measurement_array = numpy.array([m.v for m in actual_measurements]) # self.actual_measurement_array = numpy.array([m.v for m in actual_measurements])
def cost_function_to_use(dipoles_to_test): # def cost_function_to_use(dipoles_to_test):
return pdme.subspace_simulation.proportional_costs_vs_actual_measurement( # return pdme.subspace_simulation.proportional_costs_vs_actual_measurement(
self.dot_inputs_array, self.actual_measurement_array, dipoles_to_test # self.dot_inputs_array, self.actual_measurement_array, dipoles_to_test
) # )
self.cost_function_to_use = cost_function_to_use self.cost_function_to_use = cost_function
self.n_c = n_c self.n_c = n_c
self.n_s = n_s self.n_s = n_s
@@ -68,16 +86,25 @@ class SubsetSimulation:
self.mcmc_seed = mcmc_seed self.mcmc_seed = mcmc_seed
self.use_adaptive_steps = use_adaptive_steps self.use_adaptive_steps = use_adaptive_steps
self.default_phi_step = default_phi_step self.default_phi_step = (
default_phi_step * 1.73
) # this is a hack to fix a missing sqrt 3 in the proposal function code.
self.default_theta_step = default_theta_step self.default_theta_step = default_theta_step
self.default_r_step = default_r_step self.default_r_step = (
self.default_w_log_step = default_w_log_step default_r_step * 1.73
) # this is a hack to fix a missing sqrt 3 in the proposal function code.
self.default_w_log_step = (
default_w_log_step * 1.73
) # this is a hack to fix a missing sqrt 3 in the proposal function code.
self.default_upper_w_log_step = default_upper_w_log_step self.default_upper_w_log_step = default_upper_w_log_step
_logger.info("using params:") _logger.info("using params:")
_logger.info(f"\tn_c: {self.n_c}") _logger.info(f"\tn_c: {self.n_c}")
_logger.info(f"\tn_s: {self.n_s}") _logger.info(f"\tn_s: {self.n_s}")
_logger.info(f"\tm: {self.m_max}") _logger.info(f"\tm: {self.m_max}")
_logger.info(f"\t{num_initial_dmc_gens=}")
_logger.info(f"\t{mcmc_seed=}")
_logger.info(f"\t{level_0_seed=}")
_logger.info("let's do level 0...") _logger.info("let's do level 0...")
self.target_cost = target_cost self.target_cost = target_cost
@@ -87,44 +114,91 @@ class SubsetSimulation:
self.dump_last_generations = dump_last_generation_to_file self.dump_last_generations = dump_last_generation_to_file
self.initial_cost_chunk_size = initial_cost_chunk_size self.initial_cost_chunk_size = initial_cost_chunk_size
self.initial_cost_multiprocess = initial_cost_multiprocess
self.cap_core_count = cap_core_count
self.num_dmc_gens = num_initial_dmc_gens
def _single_chain_gen(self, args: Tuple):
threshold_cost, stdevs, rng_seed, (c, s) = args
rng = numpy.random.default_rng(rng_seed)
return self.model.get_repeat_counting_mcmc_chain(
s,
self.cost_function_to_use,
self.n_s,
threshold_cost,
stdevs,
initial_cost=c,
rng_arg=rng,
)
def execute(self) -> SubsetSimulationResult: def execute(self) -> SubsetSimulationResult:
probs_list = [] probs_list = []
output_messages = []
# If we have n_s = 10 and n_c = 100, then our big N = 1000 and p = 1/10
# The DMC stage would normally generate 1000, then pick the best 100 and start counting prob = p/10.
# Let's say we want our DMC stage to go down to level 2.
# Then we need to filter out p^2, so our initial has to be N_0 = N / p = n_c * n_s^2
initial_dmc_n = self.n_c * (self.n_s**self.num_dmc_gens)
initial_level = (
self.num_dmc_gens - 1
) # This is perfunctory but let's label it here really explicitly
_logger.info(f"Generating {initial_dmc_n} for DMC stage")
sample_dipoles = self.model.get_monte_carlo_dipole_inputs( sample_dipoles = self.model.get_monte_carlo_dipole_inputs(
self.n_c * self.n_s, initial_dmc_n,
-1, -1,
rng_to_use=numpy.random.default_rng(self.level_0_seed), rng_to_use=numpy.random.default_rng(self.level_0_seed),
) )
# _logger.debug(sample_dipoles) # _logger.debug(sample_dipoles)
# _logger.debug(sample_dipoles.shape) # _logger.debug(sample_dipoles.shape)
raw_costs = [] _logger.debug("Finished dipole generation")
_logger.debug( _logger.debug(
f"Using iterated cost function thing with chunk size {self.initial_cost_chunk_size}" f"Using iterated multiprocessing cost function thing with chunk size {self.initial_cost_chunk_size}"
) )
for x in range(0, len(sample_dipoles), self.initial_cost_chunk_size): # core count etc. logic here
_logger.debug(f"doing chunk {x}") core_count = multiprocessing.cpu_count() - 1 or 1
raw_costs.extend( if (self.cap_core_count >= 1) and (self.cap_core_count < core_count):
self.cost_function_to_use( core_count = self.cap_core_count
sample_dipoles[x : x + self.initial_cost_chunk_size] _logger.info(f"Using {core_count} cores")
)
)
costs = numpy.array(raw_costs)
_logger.debug(f"costs: {costs}") with multiprocessing.Pool(core_count) as pool:
# Do the initial DMC calculation in a multiprocessing
chunks = numpy.array_split(
sample_dipoles,
range(
self.initial_cost_chunk_size,
len(sample_dipoles),
self.initial_cost_chunk_size,
),
)
if self.initial_cost_multiprocess:
_logger.debug("Multiprocessing initial costs")
raw_costs = pool.map(self.cost_function_to_use, chunks)
else:
_logger.debug("Single process initial costs")
raw_costs = []
for chunk_idx, chunk in enumerate(chunks):
_logger.debug(f"doing chunk #{chunk_idx}")
raw_costs.append(self.cost_function_to_use(chunk))
costs = numpy.concatenate(raw_costs)
_logger.debug("finished initial dmc cost calculation")
# _logger.debug(f"costs: {costs}")
sorted_indexes = costs.argsort()[::-1] sorted_indexes = costs.argsort()[::-1]
_logger.debug(costs[sorted_indexes]) # _logger.debug(costs[sorted_indexes])
_logger.debug(sample_dipoles[sorted_indexes]) # _logger.debug(sample_dipoles[sorted_indexes])
sorted_costs = costs[sorted_indexes] sorted_costs = costs[sorted_indexes]
sorted_dipoles = sample_dipoles[sorted_indexes] sorted_dipoles = sample_dipoles[sorted_indexes]
threshold_cost = sorted_costs[-self.n_c]
all_dipoles = numpy.array( all_dipoles = numpy.array(
[ [
pdme.subspace_simulation.sort_array_of_dipoles_by_frequency(samp) pdme.subspace_simulation.sort_array_of_dipoles_by_frequency(samp)
@@ -132,10 +206,36 @@ class SubsetSimulation:
] ]
) )
all_chains = list(zip(sorted_costs, all_dipoles)) all_chains = list(zip(sorted_costs, all_dipoles))
for dmc_level in range(initial_level):
# if initial level is 1, we want to print out what the level 0 threshold would have been?
_logger.debug(f"Get the pseudo statistics for level {dmc_level}")
_logger.debug(f"Whole chain has length {len(all_chains)}")
pseudo_threshold_index = -(
self.n_c * (self.n_s ** (self.num_dmc_gens - dmc_level - 1))
)
_logger.debug(
f"Have a pseudo_threshold_index of {pseudo_threshold_index}, or {len(all_chains) + pseudo_threshold_index}"
)
pseudo_threshold_cost = all_chains[-pseudo_threshold_index][0]
_logger.info(
f"Pseudo-level {dmc_level} threshold cost {pseudo_threshold_cost}, at P = (1 / {self.n_s})^{dmc_level + 1}"
)
all_chains = all_chains[pseudo_threshold_index:]
mcmc_rng = numpy.random.default_rng(self.mcmc_seed) long_mcmc_rng = numpy.random.default_rng(self.mcmc_seed)
mcmc_rng_seed_sequence = numpy.random.SeedSequence(self.mcmc_seed)
for i in range(self.m_max): threshold_cost = all_chains[-self.n_c][0]
_logger.info(
f"Finishing DMC threshold cost {threshold_cost} at level {initial_level}, at P = (1 / {self.n_s})^{initial_level + 1}"
)
_logger.debug(f"Executing the MCMC with chains of length {len(all_chains)}")
# Now we move on to the MCMC part of the algorithm
# This is important, we want to allow some extra initial levels so we need to account for that here!
for i in range(self.num_dmc_gens, self.m_max):
_logger.info(f"Starting level {i}")
next_seeds = all_chains[-self.n_c :] next_seeds = all_chains[-self.n_c :]
if self.dump_last_generations: if self.dump_last_generations:
@@ -158,7 +258,9 @@ class SubsetSimulation:
): ):
# chain = mcmc(s, threshold_cost, n_s, model, dot_inputs_array, actual_measurement_array, mcmc_rng, curr_cost=c, stdevs=stdevs) # chain = mcmc(s, threshold_cost, n_s, model, dot_inputs_array, actual_measurement_array, mcmc_rng, curr_cost=c, stdevs=stdevs)
# until new version gotta do # until new version gotta do
_logger.debug(f"\t{seed_index}: doing long chain on the next seed") _logger.debug(
f"\t{seed_index}: doing long chain on the next seed"
)
long_chain = self.model.get_mcmc_chain( long_chain = self.model.get_mcmc_chain(
s, s,
@@ -167,7 +269,7 @@ class SubsetSimulation:
threshold_cost, threshold_cost,
stdevs, stdevs,
initial_cost=c, initial_cost=c,
rng_arg=mcmc_rng, rng_arg=long_mcmc_rng,
) )
for _, chained in long_chain: for _, chained in long_chain:
all_long_chains.append(chained) all_long_chains.append(chained)
@@ -184,7 +286,10 @@ class SubsetSimulation:
for cost_index, cost_chain in enumerate(all_chains[: -self.n_c]): for cost_index, cost_chain in enumerate(all_chains[: -self.n_c]):
probs_list.append( probs_list.append(
( (
((self.n_c * self.n_s - cost_index) / (self.n_c * self.n_s)) (
(self.n_c * self.n_s - cost_index)
/ (self.n_c * self.n_s)
)
/ (self.n_s ** (i)), / (self.n_s ** (i)),
cost_chain[0], cost_chain[0],
i + 1, i + 1,
@@ -194,31 +299,42 @@ class SubsetSimulation:
next_seeds_as_array = numpy.array([s for _, s in next_seeds]) next_seeds_as_array = numpy.array([s for _, s in next_seeds])
stdevs = self.get_stdevs_from_arrays(next_seeds_as_array) stdevs = self.get_stdevs_from_arrays(next_seeds_as_array)
_logger.info(f"got stdevs: {stdevs.stdevs}") _logger.debug(f"got stdevs, begin: {stdevs.stdevs[:10]}")
_logger.debug("Starting the MCMC") _logger.debug("Starting the MCMC")
all_chains = [] all_chains = []
for seed_index, (c, s) in enumerate(next_seeds):
# chain = mcmc(s, threshold_cost, n_s, model, dot_inputs_array, actual_measurement_array, mcmc_rng, curr_cost=c, stdevs=stdevs) seeds = mcmc_rng_seed_sequence.spawn(len(next_seeds))
# until new version gotta do pool_results = pool.imap_unordered(
_logger.debug( self._single_chain_gen,
f"\t{seed_index}: getting another chain from the next seed" [
) (threshold_cost, stdevs, rng_seed, test_seed)
chain = self.model.get_mcmc_chain( for rng_seed, test_seed in zip(seeds, next_seeds)
s, ],
self.cost_function_to_use, chunksize=50,
self.n_s,
threshold_cost,
stdevs,
initial_cost=c,
rng_arg=mcmc_rng,
) )
# count for ergodicity analysis
samples_generated = 0
samples_rejected = 0
for rejected_count, chain in pool_results:
for cost, chained in chain: for cost, chained in chain:
try: try:
filtered_cost = cost[0] filtered_cost = cost[0]
except (IndexError, TypeError): except (IndexError, TypeError):
filtered_cost = cost filtered_cost = cost
all_chains.append((filtered_cost, chained)) all_chains.append((filtered_cost, chained))
samples_generated += self.n_s
samples_rejected += rejected_count
_logger.debug("finished mcmc") _logger.debug("finished mcmc")
_logger.debug(f"{samples_rejected=} out of {samples_generated=}")
if samples_rejected * 2 > samples_generated:
reject_ratio = samples_rejected / samples_generated
rejectionmessage = f"On level {i}, rejected {samples_rejected} out of {samples_generated}, {reject_ratio=} is too high and may indicate ergodicity problems"
output_messages.append(rejectionmessage)
_logger.warning(rejectionmessage)
# _logger.debug(all_chains) # _logger.debug(all_chains)
all_chains.sort(key=lambda c: c[0], reverse=True) all_chains.sort(key=lambda c: c[0], reverse=True)
@@ -228,7 +344,9 @@ class SubsetSimulation:
_logger.info( _logger.info(
f"current threshold cost: {threshold_cost}, at P = (1 / {self.n_s})^{i + 1}" f"current threshold cost: {threshold_cost}, at P = (1 / {self.n_s})^{i + 1}"
) )
if (self.target_cost is not None) and (threshold_cost < self.target_cost): if (self.target_cost is not None) and (
threshold_cost < self.target_cost
):
_logger.info( _logger.info(
f"got a threshold cost {threshold_cost}, less than {self.target_cost}. will leave early" f"got a threshold cost {threshold_cost}, less than {self.target_cost}. will leave early"
) )
@@ -236,6 +354,8 @@ class SubsetSimulation:
cost_list = [c[0] for c in all_chains] cost_list = [c[0] for c in all_chains]
over_index = reverse_bisect_right(cost_list, self.target_cost) over_index = reverse_bisect_right(cost_list, self.target_cost)
winner = all_chains[over_index][1]
_logger.info(f"Winner obtained: {winner}")
shorter_probs_list = [] shorter_probs_list = []
for cost_index, cost_chain in enumerate(all_chains): for cost_index, cost_chain in enumerate(all_chains):
if self.keep_probs_list: if self.keep_probs_list:
@@ -253,7 +373,10 @@ class SubsetSimulation:
shorter_probs_list.append( shorter_probs_list.append(
( (
cost_chain[0], cost_chain[0],
((self.n_c * self.n_s - cost_index) / (self.n_c * self.n_s)) (
(self.n_c * self.n_s - cost_index)
/ (self.n_c * self.n_s)
)
/ (self.n_s ** (i)), / (self.n_s ** (i)),
) )
) )
@@ -265,6 +388,7 @@ class SubsetSimulation:
under_target_cost=shorter_probs_list[over_index][0], under_target_cost=shorter_probs_list[over_index][0],
under_target_likelihood=shorter_probs_list[over_index][1], under_target_likelihood=shorter_probs_list[over_index][1],
lowest_likelihood=shorter_probs_list[-1][1], lowest_likelihood=shorter_probs_list[-1][1],
messages=output_messages,
) )
return result return result
@@ -285,8 +409,8 @@ class SubsetSimulation:
_logger.info( _logger.info(
f"final threshold cost: {threshold_cost}, at P = (1 / {self.n_s})^{self.m_max + 1}" f"final threshold cost: {threshold_cost}, at P = (1 / {self.n_s})^{self.m_max + 1}"
) )
for a in all_chains[-10:]: # for a in all_chains[-10:]:
_logger.info(a) # _logger.info(a)
# for prob, prob_cost in probs_list: # for prob, prob_cost in probs_list:
# _logger.info(f"\t{prob}: {prob_cost}") # _logger.info(f"\t{prob}: {prob_cost}")
probs_list.sort(key=lambda c: c[0], reverse=True) probs_list.sort(key=lambda c: c[0], reverse=True)
@@ -300,6 +424,7 @@ class SubsetSimulation:
under_target_cost=None, under_target_cost=None,
under_target_likelihood=None, under_target_likelihood=None,
lowest_likelihood=min_likelihood, lowest_likelihood=min_likelihood,
messages=output_messages,
) )
return result return result
@@ -358,6 +483,116 @@ class SubsetSimulation:
return stdevs return stdevs
class MultiSubsetSimulations:
def __init__(
self,
model_name_pairs: Sequence[Tuple[str, pdme.model.DipoleModel]],
# actual_measurements: Sequence[pdme.measurement.DotMeasurement],
cost_function: Callable[[numpy.ndarray], numpy.ndarray],
num_runs: int,
n_c: int,
n_s: int,
m_max: int,
target_cost: float,
num_initial_dmc_gens: int = 1,
level_0_seed_seed: int = 200,
mcmc_seed_seed: int = 20,
use_adaptive_steps=True,
default_phi_step=0.01,
default_theta_step=0.01,
default_r_step=0.01,
default_w_log_step=0.01,
default_upper_w_log_step=4,
initial_cost_chunk_size=100,
cap_core_count: int = 0, # 0 means cap at num cores - 1
):
self.model_name_pairs = model_name_pairs
self.cost_function = cost_function
self.num_runs = num_runs
self.n_c = n_c
self.n_s = n_s
self.m_max = m_max
self.target_cost = target_cost # This is not optional here!
self.num_dmc_gens = num_initial_dmc_gens
self.level_0_seed_seed = level_0_seed_seed
self.mcmc_seed_seed = mcmc_seed_seed
self.use_adaptive_steps = use_adaptive_steps
self.default_phi_step = default_phi_step
self.default_theta_step = default_theta_step
self.default_r_step = default_r_step
self.default_w_log_step = default_w_log_step
self.default_upper_w_log_step = default_upper_w_log_step
self.initial_cost_chunk_size = initial_cost_chunk_size
self.cap_core_count = cap_core_count
def execute(self) -> Sequence[MultiSubsetSimulationResult]:
output: List[MultiSubsetSimulationResult] = []
for model_index, model_name_pair in enumerate(self.model_name_pairs):
ss_results = [
SubsetSimulation(
model_name_pair,
self.cost_function,
self.n_c,
self.n_s,
self.m_max,
self.target_cost,
num_initial_dmc_gens=self.num_dmc_gens,
level_0_seed=[model_index, run_index, self.level_0_seed_seed],
mcmc_seed=[model_index, run_index, self.mcmc_seed_seed],
use_adaptive_steps=self.use_adaptive_steps,
default_phi_step=self.default_phi_step,
default_theta_step=self.default_theta_step,
default_r_step=self.default_r_step,
default_w_log_step=self.default_w_log_step,
default_upper_w_log_step=self.default_upper_w_log_step,
keep_probs_list=False,
dump_last_generation_to_file=False,
initial_cost_chunk_size=self.initial_cost_chunk_size,
cap_core_count=self.cap_core_count,
).execute()
for run_index in range(self.num_runs)
]
output.append(coalesce_ss_results(model_name_pair[0], ss_results))
return output
def coalesce_ss_results(
model_name: str, results: Sequence[SubsetSimulationResult]
) -> MultiSubsetSimulationResult:
num_finished = sum(1 for res in results if res.under_target_likelihood is not None)
estimated_likelihoods = numpy.array(
[
res.under_target_likelihood
if res.under_target_likelihood is not None
else res.lowest_likelihood
for res in results
]
)
_logger.info(estimated_likelihoods)
geometric_mean_estimated_likelihoods = numpy.exp(
numpy.log(estimated_likelihoods).mean()
)
_logger.info(geometric_mean_estimated_likelihoods)
arithmetic_mean_estimated_likelihoods = estimated_likelihoods.mean()
result = MultiSubsetSimulationResult(
child_results=results,
model_name=model_name,
estimated_likelihood=geometric_mean_estimated_likelihoods,
arithmetic_mean_estimated_likelihood=arithmetic_mean_estimated_likelihoods,
num_children=len(results),
num_finished_children=num_finished,
clean_estimate=num_finished == len(results),
)
return result
def reverse_bisect_right(a, x, lo=0, hi=None): def reverse_bisect_right(a, x, lo=0, hi=None):
"""Return the index where to insert item x in list a, assuming a is sorted in descending order. """Return the index where to insert item x in list a, assuming a is sorted in descending order.

8
poetry.lock generated
View File

@@ -786,13 +786,13 @@ files = [
[[package]] [[package]]
name = "pdme" name = "pdme"
version = "1.2.0" version = "1.5.0"
description = "Python dipole model evaluator" description = "Python dipole model evaluator"
optional = false optional = false
python-versions = "<3.10,>=3.8.1" python-versions = "<3.10,>=3.8.1"
files = [ files = [
{file = "pdme-1.2.0-py3-none-any.whl", hash = "sha256:602710a053f22921b4adbc03d46d284149fe2367a65455cde56608708e01c84b"}, {file = "pdme-1.5.0-py3-none-any.whl", hash = "sha256:1b4fa30ba98a336957b3029563552d73286a3a5f932809ac1330e65a1f61c363"},
{file = "pdme-1.2.0.tar.gz", hash = "sha256:412806d7ae384c048515e0f2cba70252778bf153800829a1d3265a0596872263"}, {file = "pdme-1.5.0.tar.gz", hash = "sha256:cc0ac4ffab2994e08b4efde2991c6d9dccb2942c7e33c4be3b52e068366526d1"},
] ]
[package.dependencies] [package.dependencies]
@@ -1275,4 +1275,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = ">=3.8.1,<3.10" python-versions = ">=3.8.1,<3.10"
content-hash = "918b6736766a9c1b6732a56e1ef2e7a53241f2e25babb884881e49c299801fc9" content-hash = "85114054176aa164964acea6fdc085581ee7fc2f94c1cd03ad77611b82e52c79"

View File

@@ -1,13 +1,13 @@
[tool.poetry] [tool.poetry]
name = "deepdog" name = "deepdog"
version = "1.1.0" version = "1.3.0"
description = "" description = ""
authors = ["Deepak Mallubhotla <dmallubhotla+github@gmail.com>"] authors = ["Deepak Mallubhotla <dmallubhotla+github@gmail.com>"]
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = ">=3.8.1,<3.10" python = ">=3.8.1,<3.10"
pdme = "^1.2.0" pdme = "^1.5.0"
numpy = "1.22.3" numpy = "1.26.4"
scipy = "1.10" scipy = "1.10"
tqdm = "^4.66.2" tqdm = "^4.66.2"

View File

@@ -0,0 +1,42 @@
import deepdog.direct_monte_carlo.cost_function_filter
import numpy
def test_px_cost_function_filter_example():
dipoles_1 = [
[1, 2, 3, 4, 5, 6, 7],
[2, 3, 2, 5, 4, 7, 6],
]
dipoles_2 = [
[15, 9, 8, 7, 6, 5, 3],
[30, 4, 4, 7, 3, 1, 4],
]
dipoleses = numpy.array([dipoles_1, dipoles_2])
def cost_function(dipoleses: numpy.ndarray) -> numpy.ndarray:
return dipoleses[:, :, 0].max(axis=-1)
expected_costs = numpy.array([2, 30])
numpy.testing.assert_array_equal(cost_function(dipoleses), expected_costs)
filter = deepdog.direct_monte_carlo.cost_function_filter.CostFunctionTargetFilter(
cost_function, 5
)
actual_filtered = filter.filter_samples(dipoleses)
expected_filtered = numpy.array([dipoles_1])
assert actual_filtered.size != 0
numpy.testing.assert_array_equal(actual_filtered, expected_filtered)
filter_stricter = (
deepdog.direct_monte_carlo.cost_function_filter.CostFunctionTargetFilter(
cost_function, 0.5
)
)
actual_filtered_stricter = filter_stricter.filter_samples(dipoleses)
assert actual_filtered_stricter.size == 0

View File

@@ -7,6 +7,7 @@ def test_parse_groupdict():
) )
parsed = deepdog.results._parse_bayesrun_column(example_column_name) parsed = deepdog.results._parse_bayesrun_column(example_column_name)
assert parsed is not None
expected = deepdog.results.BayesrunColumnParsed( expected = deepdog.results.BayesrunColumnParsed(
{ {
"xmin": "-20", "xmin": "-20",
@@ -23,6 +24,30 @@ def test_parse_groupdict():
assert parsed == expected assert parsed == expected
def test_parse_groupdict_with_magnitude():
example_column_name = (
"geom_-20_20_-10_10_0_5-magnitude_3.5-orientation_free-dipole_count_100_success"
)
parsed = deepdog.results._parse_bayesrun_column(example_column_name)
assert parsed is not None
expected = deepdog.results.BayesrunColumnParsed(
{
"xmin": "-20",
"xmax": "20",
"ymin": "-10",
"ymax": "10",
"zmin": "0",
"zmax": "5",
"orientation": "free",
"avg_filled": "100",
"log_magnitude": "3.5",
"field_name": "success",
}
)
assert parsed == expected
# def test_parse_no_match_column_name(): # def test_parse_no_match_column_name():
# parsed = deepdog.results.parse_bayesrun_column("There's nothing here") # parsed = deepdog.results.parse_bayesrun_column("There's nothing here")
# assert parsed is None # assert parsed is None

View File

@@ -0,0 +1,10 @@
# serializer version: 1
# name: test_subset_simulation_multi_result_coalescing_easy_arithmetic
MultiSubsetSimulationResult(child_results=[SubsetSimulationResult(probs_list=(), over_target_cost=1, over_target_likelihood=1, under_target_cost=0.99, under_target_likelihood=0.8, lowest_likelihood=0.5, messages=[]), SubsetSimulationResult(probs_list=(), over_target_cost=1, over_target_likelihood=1, under_target_cost=0.99, under_target_likelihood=0.6, lowest_likelihood=0.01, messages=[])], model_name='test', estimated_likelihood=0.6928203230275509, arithmetic_mean_estimated_likelihood=0.7, num_children=2, num_finished_children=2, clean_estimate=True)
# ---
# name: test_subset_simulation_multi_result_coalescing_easy_geometric
MultiSubsetSimulationResult(child_results=[SubsetSimulationResult(probs_list=(), over_target_cost=1, over_target_likelihood=1, under_target_cost=0.99, under_target_likelihood=0.1, lowest_likelihood=0.5, messages=[]), SubsetSimulationResult(probs_list=(), over_target_cost=1, over_target_likelihood=1, under_target_cost=0.99, under_target_likelihood=0.001, lowest_likelihood=0.01, messages=[])], model_name='test', estimated_likelihood=0.010000000000000004, arithmetic_mean_estimated_likelihood=0.0505, num_children=2, num_finished_children=2, clean_estimate=True)
# ---
# name: test_subset_simulation_multi_result_coalescing_include_dirty
MultiSubsetSimulationResult(child_results=[SubsetSimulationResult(probs_list=(), over_target_cost=1, over_target_likelihood=1, under_target_cost=0.99, under_target_likelihood=0.8, lowest_likelihood=0.5, messages=[]), SubsetSimulationResult(probs_list=(), over_target_cost=1, over_target_likelihood=1, under_target_cost=0.99, under_target_likelihood=0.08, lowest_likelihood=0.01, messages=[]), SubsetSimulationResult(probs_list=(), over_target_cost=None, over_target_likelihood=None, under_target_cost=None, under_target_likelihood=None, lowest_likelihood=0.0001, messages=[])], model_name='test', estimated_likelihood=0.01856635533445112, arithmetic_mean_estimated_likelihood=0.29336666666666666, num_children=3, num_finished_children=2, clean_estimate=False)
# ---

View File

@@ -0,0 +1,92 @@
import deepdog.subset_simulation.subset_simulation_impl as impl
import numpy
def test_subset_simulation_multi_result_coalescing_include_dirty(snapshot):
res1 = impl.SubsetSimulationResult(
probs_list=(),
over_target_cost=1,
over_target_likelihood=1,
under_target_cost=0.99,
under_target_likelihood=0.8,
lowest_likelihood=0.5,
messages=[],
)
res2 = impl.SubsetSimulationResult(
probs_list=(),
over_target_cost=1,
over_target_likelihood=1,
under_target_cost=0.99,
under_target_likelihood=0.08,
lowest_likelihood=0.01,
messages=[],
)
res3 = impl.SubsetSimulationResult(
probs_list=(),
over_target_cost=None,
over_target_likelihood=None,
under_target_cost=None,
under_target_likelihood=None,
lowest_likelihood=0.0001,
messages=[],
)
combined = impl.coalesce_ss_results("test", [res1, res2, res3])
assert combined == snapshot
def test_subset_simulation_multi_result_coalescing_easy_arithmetic(snapshot):
res1 = impl.SubsetSimulationResult(
probs_list=(),
over_target_cost=1,
over_target_likelihood=1,
under_target_cost=0.99,
under_target_likelihood=0.8,
lowest_likelihood=0.5,
messages=[],
)
res2 = impl.SubsetSimulationResult(
probs_list=(),
over_target_cost=1,
over_target_likelihood=1,
under_target_cost=0.99,
under_target_likelihood=0.6,
lowest_likelihood=0.01,
messages=[],
)
combined = impl.coalesce_ss_results("test", [res1, res2])
assert combined.arithmetic_mean_estimated_likelihood == 0.7
assert combined == snapshot
def test_subset_simulation_multi_result_coalescing_easy_geometric(snapshot):
res1 = impl.SubsetSimulationResult(
probs_list=(),
over_target_cost=1,
over_target_likelihood=1,
under_target_cost=0.99,
under_target_likelihood=0.1,
lowest_likelihood=0.5,
messages=[],
)
res2 = impl.SubsetSimulationResult(
probs_list=(),
over_target_cost=1,
over_target_likelihood=1,
under_target_cost=0.99,
under_target_likelihood=0.001,
lowest_likelihood=0.01,
messages=[],
)
combined = impl.coalesce_ss_results("test", [res1, res2])
numpy.testing.assert_allclose(combined.estimated_likelihood, 0.01)
assert combined == snapshot