From 2c5c1228209e51d17253f07470e2f1e6dc6872d7 Mon Sep 17 00:00:00 2001 From: Deepak Mallubhotla Date: Sat, 11 Jun 2022 21:06:27 -0500 Subject: [PATCH] feat: adds fast filter variant --- deepdog/__init__.py | 2 - deepdog/real_spectrum_run.py | 35 +++- deepdog/real_spectrum_run_fast_filter.py | 227 ----------------------- 3 files changed, 33 insertions(+), 231 deletions(-) delete mode 100644 deepdog/real_spectrum_run_fast_filter.py diff --git a/deepdog/__init__.py b/deepdog/__init__.py index 97e8061..723a7a6 100644 --- a/deepdog/__init__.py +++ b/deepdog/__init__.py @@ -3,7 +3,6 @@ from deepdog.meta import __version__ from deepdog.bayes_run import BayesRun from deepdog.bayes_run_simulpairs import BayesRunSimulPairs from deepdog.real_spectrum_run import RealSpectrumRun -from deepdog.real_spectrum_run_fast_filter import FastFilterCompareRun def get_version(): @@ -15,7 +14,6 @@ __all__ = [ "BayesRun", "BayesRunSimulPairs", "RealSpectrumRun", - "FastFilterCompareRun", ] diff --git a/deepdog/real_spectrum_run.py b/deepdog/real_spectrum_run.py index 5180b6d..03bf147 100644 --- a/deepdog/real_spectrum_run.py +++ b/deepdog/real_spectrum_run.py @@ -32,6 +32,28 @@ def get_a_result(input) -> int: return numpy.count_nonzero(pdme.util.fast_v_calc.between(vals, lows, highs)) +def get_a_result_fast_filter(input) -> int: + model, dot_inputs, lows, highs, monte_carlo_count, seed = input + + rng = numpy.random.default_rng(seed) + # TODO: A long term refactor is to pull the frequency stuff out from here. The None stands for max_frequency, which is unneeded in the actually useful models. + sample_dipoles = model.get_monte_carlo_dipole_inputs( + monte_carlo_count, None, rng_to_use=rng + ) + + current_sample = sample_dipoles + for di, low, high in zip(dot_inputs, lows, highs): + + if len(current_sample) < 1: + break + vals = pdme.util.fast_v_calc.fast_vs_for_dipoleses( + numpy.array([di]), current_sample + ) + + current_sample = current_sample[numpy.all((vals > low) & (vals < high), axis=1)] + return len(current_sample) + + class RealSpectrumRun: """ A bayes run given some real data. @@ -65,6 +87,7 @@ class RealSpectrumRun: max_monte_carlo_cycles_steps: int = 10, chunksize: int = CHUNKSIZE, initial_seed: int = 12345, + use_fast_filter: bool = True, ) -> None: self.measurements = measurements self.dot_inputs = [(measure.r, measure.f) for measure in self.measurements] @@ -93,7 +116,11 @@ class RealSpectrumRun: self.probabilities = [1 / self.model_count] * self.model_count timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - self.filename = f"{timestamp}-{filename_slug}.realdata.bayesrun.csv" + self.use_fast_filter = use_fast_filter + ff_string = "no_fast_filter" + if self.use_fast_filter: + ff_string = "fast_filter" + self.filename = f"{timestamp}-{filename_slug}.realdata.{ff_string}.bayesrun.csv" self.initial_seed = initial_seed def go(self) -> None: @@ -133,9 +160,13 @@ class RealSpectrumRun: # that way we get more stuff. seeds = seed_sequence.spawn(self.monte_carlo_cycles) + if self.use_fast_filter: + result_func = get_a_result_fast_filter + else: + result_func = get_a_result current_success = sum( pool.imap_unordered( - get_a_result, + result_func, [ ( model, diff --git a/deepdog/real_spectrum_run_fast_filter.py b/deepdog/real_spectrum_run_fast_filter.py deleted file mode 100644 index 93d4bf6..0000000 --- a/deepdog/real_spectrum_run_fast_filter.py +++ /dev/null @@ -1,227 +0,0 @@ -import pdme.inputs -import pdme.model -import pdme.measurement -import pdme.measurement.input_types -import pdme.measurement.oscillating_dipole -import pdme.util.fast_v_calc -import pdme.util.fast_nonlocal_spectrum -from typing import Sequence, Tuple, List, Dict, Union -import datetime -import csv -import multiprocessing -import logging -import numpy - - -# TODO: remove hardcode -CHUNKSIZE = 50 - - -_logger = logging.getLogger(__name__) - - -def get_a_result(input) -> int: - model, dot_inputs, lows, highs, monte_carlo_count, seed = input - - rng = numpy.random.default_rng(seed) - # TODO: A long term refactor is to pull the frequency stuff out from here. The None stands for max_frequency, which is unneeded in the actually useful models. - sample_dipoles = model.get_monte_carlo_dipole_inputs( - monte_carlo_count, None, rng_to_use=rng - ) - vals = pdme.util.fast_v_calc.fast_vs_for_dipoleses(dot_inputs, sample_dipoles) - return numpy.count_nonzero(pdme.util.fast_v_calc.between(vals, lows, highs)) - - -def get_a_result_fast_filter(input) -> int: - model, dot_inputs, lows, highs, monte_carlo_count, seed = input - - rng = numpy.random.default_rng(seed) - # TODO: A long term refactor is to pull the frequency stuff out from here. The None stands for max_frequency, which is unneeded in the actually useful models. - sample_dipoles = model.get_monte_carlo_dipole_inputs( - monte_carlo_count, None, rng_to_use=rng - ) - - current_sample = sample_dipoles - for di in dot_inputs: - vals = pdme.util.fast_v_calc.fast_vs_for_dipoleses(numpy.array([di]), current_sample) - current_sample = current_sample[ - pdme.util.fast_v_calc.between(vals, lows, highs) - ] - return len(current_sample) - - -class FastFilterCompareRun: - """ - A bayes run given some real data. - - Parameters - ---------- - measurements : Sequence[pdme.measurement.DotRangeMeasurement] - The dot inputs for this bayes run. - - models_with_names : Sequence[Tuple(str, pdme.model.DipoleModel)] - The models to evaluate. - - actual_model : pdme.model.DipoleModel - The model which is actually correct. - - filename_slug : str - The filename slug to include. - - run_count: int - The number of runs to do. - """ - - def __init__( - self, - measurements: Sequence[pdme.measurement.DotRangeMeasurement], - models_with_names: Sequence[Tuple[str, pdme.model.DipoleModel]], - filename_slug: str, - monte_carlo_count: int = 10000, - monte_carlo_cycles: int = 10, - target_success: int = 100, - max_monte_carlo_cycles_steps: int = 10, - chunksize: int = CHUNKSIZE, - initial_seed: int = 12345, - ) -> None: - self.measurements = measurements - self.dot_inputs = [(measure.r, measure.f) for measure in self.measurements] - - self.dot_inputs_array = pdme.measurement.input_types.dot_inputs_to_array( - self.dot_inputs - ) - - self.models = [model for (_, model) in models_with_names] - self.model_names = [name for (name, _) in models_with_names] - self.model_count = len(self.models) - - self.monte_carlo_count = monte_carlo_count - self.monte_carlo_cycles = monte_carlo_cycles - self.target_success = target_success - self.max_monte_carlo_cycles_steps = max_monte_carlo_cycles_steps - - self.csv_fields = [] - - self.compensate_zeros = True - self.chunksize = chunksize - for name in self.model_names: - self.csv_fields.extend([f"{name}_success", f"{name}_count", f"{name}_prob"]) - - # for now initialise priors as uniform. - self.probabilities = [1 / self.model_count] * self.model_count - - timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - self.filename = f"{timestamp}-{filename_slug}.realdata.bayesrun.csv" - self.initial_seed = initial_seed - - def go(self) -> None: - with open(self.filename, "a", newline="") as outfile: - writer = csv.DictWriter(outfile, fieldnames=self.csv_fields, dialect="unix") - writer.writeheader() - - ( - lows, - highs, - ) = pdme.measurement.input_types.dot_range_measurements_low_high_arrays( - self.measurements - ) - - # define a new seed sequence for each run - seed_sequence = numpy.random.SeedSequence(self.initial_seed) - - results = [] - _logger.debug("Going to iterate over models now") - for model_count, model in enumerate(self.models): - _logger.debug(f"Doing model #{model_count}") - core_count = multiprocessing.cpu_count() - 1 or 1 - with multiprocessing.Pool(core_count) as pool: - cycle_count = 0 - cycle_success = 0 - cycle_success_ff = 0 - cycles = 0 - while (cycles < self.max_monte_carlo_cycles_steps) and ( - cycle_success <= self.target_success - ): - _logger.debug(f"Starting cycle {cycles}") - cycles += 1 - current_success = 0 - cycle_count += self.monte_carlo_count * self.monte_carlo_cycles - - # generate a seed from the sequence for each core. - # note this needs to be inside the loop for monte carlo cycle steps! - # that way we get more stuff. - seeds = seed_sequence.spawn(self.monte_carlo_cycles) - - current_success = sum( - pool.imap_unordered( - get_a_result, - [ - ( - model, - self.dot_inputs_array, - lows, - highs, - self.monte_carlo_count, - seed, - ) - for seed in seeds - ], - self.chunksize, - ) - ) - current_success_ff = sum( - pool.imap_unordered( - get_a_result_fast_filter, - [ - ( - model, - self.dot_inputs_array, - lows, - highs, - self.monte_carlo_count, - seed, - ) - for seed in seeds - ], - self.chunksize, - ) - ) - - cycle_success += current_success - cycle_success_ff += current_success_ff - _logger.debug(f"current running successes: {cycle_success}") - _logger.debug(f"current running successes: {cycle_success_ff}") - results.append((cycle_count, cycle_success)) - - _logger.debug("Done, constructing output now") - row: Dict[str, Union[int, float, str]] = {} - - successes: List[float] = [] - counts: List[int] = [] - for model_index, (name, (count, result)) in enumerate( - zip(self.model_names, results) - ): - - row[f"{name}_success"] = result - row[f"{name}_count"] = count - successes.append(max(result, 0.5)) - counts.append(count) - - success_weight = sum( - [ - (succ / count) * prob - for succ, count, prob in zip(successes, counts, self.probabilities) - ] - ) - new_probabilities = [ - (succ / count) * old_prob / success_weight - for succ, count, old_prob in zip(successes, counts, self.probabilities) - ] - self.probabilities = new_probabilities - for name, probability in zip(self.model_names, self.probabilities): - row[f"{name}_prob"] = probability - _logger.info(row) - - with open(self.filename, "a", newline="") as outfile: - writer = csv.DictWriter(outfile, fieldnames=self.csv_fields, dialect="unix") - writer.writerow(row)