Compare commits

..

15 Commits

Author SHA1 Message Date
71dc906a96 chore(release): 1.7.0
All checks were successful
gitea-physics/deepdog/pipeline/head This commit looks good
gitea-physics/deepdog/pipeline/tag This commit looks good
2025-02-26 21:57:13 -06:00
24c6e311c1 feat: adds configurable skip if file exists
All checks were successful
gitea-physics/deepdog/pipeline/head This commit looks good
2025-02-26 21:55:12 -06:00
4dd3004a7b chore(release): 1.6.0
All checks were successful
gitea-physics/deepdog/pipeline/head This commit looks good
gitea-physics/deepdog/pipeline/tag This commit looks good
2025-02-26 21:08:00 -06:00
46f6b6cdf1 feat: Adds ability to parse bayesruns without timestamps
All checks were successful
gitea-physics/deepdog/pipeline/head This commit looks good
2025-02-26 21:01:19 -06:00
c8435b4b2a feat: allows negative log magnitude strings in models
All checks were successful
gitea-physics/deepdog/pipeline/head This commit looks good
2025-02-24 08:34:11 -06:00
c2375e6f5c chore(release): 1.5.0
All checks were successful
gitea-physics/deepdog/pipeline/head This commit looks good
gitea-physics/deepdog/pipeline/tag This commit looks good
2024-12-29 21:23:30 -06:00
a1b59cd18b feat: add configurable max number of dipoles to write
All checks were successful
gitea-physics/deepdog/pipeline/head This commit looks good
2024-12-29 21:14:59 -06:00
53f8993f2b feat: add configurable max number of dipoles to write 2024-12-29 21:13:34 -06:00
700f32ea58 chore(release): 1.4.0
All checks were successful
gitea-physics/deepdog/pipeline/head This commit looks good
gitea-physics/deepdog/pipeline/tag This commit looks good
2024-09-04 13:58:56 -05:00
3737252c4b log: adds additional logging of dipole count
All checks were successful
gitea-physics/deepdog/pipeline/head This commit looks good
2024-09-04 13:56:09 -05:00
6f79a49e59 log: adds additional logging of dipole count
All checks were successful
gitea-physics/deepdog/pipeline/head This commit looks good
2024-09-04 13:54:50 -05:00
d962ecb11e feat: indexifier now has len 2024-08-26 03:34:57 -05:00
7beca501bf fmt: ran formatter 2024-08-26 03:34:50 -05:00
5425ce1362 feat: allows some betetr matching for single_dipole runs 2024-08-26 03:31:15 -05:00
6a5c5931d4 fix: update log file arg names in cli scripts 2024-05-21 16:10:02 -05:00
13 changed files with 265 additions and 29 deletions

View File

@@ -2,6 +2,43 @@
All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines.
## [1.7.0](https://gitea.deepak.science:2222/physics/deepdog/compare/1.6.0...1.7.0) (2025-02-27)
### Features
* adds configurable skip if file exists ([24c6e31](https://gitea.deepak.science:2222/physics/deepdog/commit/24c6e311c1d3067eb98cc60e6ca38d76373bf08e))
## [1.6.0](https://gitea.deepak.science:2222/physics/deepdog/compare/1.5.0...1.6.0) (2025-02-27)
### Features
* Adds ability to parse bayesruns without timestamps ([46f6b6c](https://gitea.deepak.science:2222/physics/deepdog/commit/46f6b6cdf15c67aedf0c871d201b8db320bccbdf))
* allows negative log magnitude strings in models ([c8435b4](https://gitea.deepak.science:2222/physics/deepdog/commit/c8435b4b2a6e4b89030f53b5734eb743e2003fb7))
## [1.5.0](https://gitea.deepak.science:2222/physics/deepdog/compare/1.4.0...1.5.0) (2024-12-30)
### Features
* add configurable max number of dipoles to write ([a1b59cd](https://gitea.deepak.science:2222/physics/deepdog/commit/a1b59cd18b30359328a09210d9393f211aab30c2))
* add configurable max number of dipoles to write ([53f8993](https://gitea.deepak.science:2222/physics/deepdog/commit/53f8993f2b155228fff5cbee84f10c62eb149a1f))
## [1.4.0](https://gitea.deepak.science:2222/physics/deepdog/compare/1.3.0...1.4.0) (2024-09-04)
### Features
* add subset sim probs command for bayes for subset simulation results ([c881da2](https://gitea.deepak.science:2222/physics/deepdog/commit/c881da28370a1e51d062e1a7edaa62af6eb98d0a))
* allows some betetr matching for single_dipole runs ([5425ce1](https://gitea.deepak.science:2222/physics/deepdog/commit/5425ce1362919af4cc4dbd5813df3be8d877b198))
* indexifier now has len ([d962ecb](https://gitea.deepak.science:2222/physics/deepdog/commit/d962ecb11e929de1d9aa458b5d8e82270eff0039))
### Bug Fixes
* update log file arg names in cli scripts ([6a5c593](https://gitea.deepak.science:2222/physics/deepdog/commit/6a5c5931d4fc849d0d6a0f2b971523a0f039d559))
## [1.3.0](https://gitea.deepak.science:2222/physics/deepdog/compare/1.2.1...1.3.0) (2024-05-20)

View File

@@ -13,7 +13,7 @@ def parse_args() -> argparse.Namespace:
"probs", description="Calculating probability from finished bayesrun"
)
parser.add_argument(
"--log_file",
"--log-file",
type=str,
help="A filename for logging to, if not provided will only log to stderr",
default=None,

View File

@@ -14,7 +14,7 @@ def parse_args() -> argparse.Namespace:
description="Calculating probability from finished subset sim run",
)
parser.add_argument(
"--log_file",
"--log-file",
type=str,
help="A filename for logging to, if not provided will only log to stderr",
default=None,

View File

@@ -1,3 +1,5 @@
import re
import pathlib
import csv
import pdme.model
import pdme.measurement
@@ -36,9 +38,35 @@ class DirectMonteCarloConfig:
tag: str = ""
cap_core_count: int = 0 # 0 means cap at num cores - 1
chunk_size: int = 50
write_bayesrun_file = True
bayesrun_file_timestamp = True
# chunk size of some kind
write_bayesrun_file: bool = True
bayesrun_file_timestamp: bool = True
skip_if_exists: bool = False
def get_filename(self) -> str:
"""
Generate a filename for the output of this run.
"""
# set starting execution timestamp
timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
if self.bayesrun_file_timestamp:
timestamp_str = f"{timestamp}-"
else:
timestamp_str = ""
filename = f"{timestamp_str}{self.tag}.realdata.fast_filter.bayesrun.csv"
_logger.debug(f"Got filename {filename}")
return filename
def get_filename_regex(self) -> str:
"""
Generate a regex for the output of this run.
"""
# having both timestamp and the hyphen separately optional is a bit of a hack
# too loose, but will never matter
pattern = rf"(?P<timestamp>\d{{8}}-\d{{6}})?-?{self.tag}\.realdata\.fast_filter\.bayesrun\.csv"
return pattern
# Aliasing dict as a generic data container
@@ -145,15 +173,21 @@ class DirectMonteCarloRun:
single run wrapped up for multiprocessing call.
takes in a tuple of arguments corresponding to
(model_name_pair, seed)
(model_name_pair, seed, return_configs)
return_configs is a boolean, if true then will return tuple of (count, [matching configs])
if false, return (count, [])
"""
# here's where we do our work
model_name_pair, seed = args
model_name_pair, seed, return_configs = args
cycle_success_configs = self._single_run(model_name_pair, seed)
cycle_success_count = len(cycle_success_configs)
return cycle_success_count
if return_configs:
return (cycle_success_count, cycle_success_configs)
else:
return (cycle_success_count, [])
def execute_no_multiprocessing(self) -> Sequence[DirectMonteCarloResult]:
@@ -198,9 +232,11 @@ class DirectMonteCarloRun:
)
dipole_count = numpy.array(cycle_success_configs).shape[1]
for n in range(dipole_count):
number_dipoles_to_write = self.config.target_success * 5
_logger.info(f"Limiting to {number_dipoles_to_write=}")
numpy.savetxt(
f"{self.config.tag}_{step_count}_{cycle_i}_dipole_{n}.csv",
sorted_by_freq[:, n],
sorted_by_freq[:number_dipoles_to_write, n],
delimiter=",",
)
total_success += cycle_success_count
@@ -222,8 +258,27 @@ class DirectMonteCarloRun:
def execute(self) -> Sequence[DirectMonteCarloResult]:
# set starting execution timestamp
timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
filename = self.config.get_filename()
if self.config.skip_if_exists:
_logger.info(f"Checking if {filename} exists")
cwd = pathlib.Path.cwd()
if (cwd / filename).exists():
_logger.info(f"File {filename} exists, skipping")
return []
if self.config.bayesrun_file_timestamp:
_logger.info(
"Also need to check file endings because of possible past or current timestamps, check only occurs if writing timestamp is set"
)
pattern = self.config.get_filename_regex()
for file in cwd.iterdir():
match = re.match(pattern, file.name)
if match is not None:
_logger.info(f"Matched {file.name} to {pattern}")
_logger.info(f"File {filename} exists, skipping")
return []
_logger.info(
f"Finished checking against pattern {pattern}, hopefully didn't take too long!"
)
count_per_step = (
self.config.monte_carlo_count_per_cycle * self.config.monte_carlo_cycles
@@ -259,15 +314,71 @@ class DirectMonteCarloRun:
seeds = seed_sequence.spawn(self.config.monte_carlo_cycles)
pool_results = sum(
raw_pool_results = list(
pool.imap_unordered(
self._wrapped_single_run,
[(model_name_pair, seed) for seed in seeds],
[
(
model_name_pair,
seed,
self.config.write_successes_to_file,
)
for seed in seeds
],
self.config.chunk_size,
)
)
pool_results = sum(result[0] for result in raw_pool_results)
_logger.debug(f"Pool results: {pool_results}")
if self.config.write_successes_to_file:
_logger.info("Writing dipole results")
cycle_success_configs = numpy.concatenate(
[result[1] for result in raw_pool_results]
)
dipole_count = numpy.array(cycle_success_configs).shape[1]
max_number_dipoles_to_write = self.config.target_success * 5
_logger.debug(
f"Limiting to {max_number_dipoles_to_write=}, have {len(cycle_success_configs)}"
)
if len(cycle_success_configs):
sorted_by_freq = numpy.array(
[
pdme.subspace_simulation.sort_array_of_dipoles_by_frequency(
dipole_config
)
for dipole_config in cycle_success_configs[
:max_number_dipoles_to_write
]
]
)
for n in range(dipole_count):
dipole_filename = (
f"{self.config.tag}_{step_count}_dipole_{n}.csv"
)
_logger.debug(
f"Writing {min(len(cycle_success_configs), max_number_dipoles_to_write)} to {dipole_filename}"
)
numpy.savetxt(
dipole_filename,
sorted_by_freq[:, n],
delimiter=",",
)
else:
_logger.debug(
"Instructed to write results, but none obtained"
)
total_success += pool_results
total_count += count_per_step
_logger.debug(
@@ -285,14 +396,6 @@ class DirectMonteCarloRun:
if self.config.write_bayesrun_file:
if self.config.bayesrun_file_timestamp:
timestamp_str = f"{timestamp}-"
else:
timestamp_str = ""
filename = (
f"{timestamp_str}{self.config.tag}.realdata.fast_filter.bayesrun.csv"
)
_logger.info(f"Going to write to file [{filename}]")
# row: Dict[str, Union[int, float, str]] = {}
row = {}

View File

@@ -36,6 +36,10 @@ class Indexifier:
def indexify(self, n: int) -> typing.Dict[str, typing.Any]:
return self.product_dict[n]
def __len__(self) -> int:
weights = [len(v) for v in self.dict.values()]
return math.prod(weights)
def _indexify_indices(self, n: int) -> typing.Sequence[int]:
"""
legacy indexify from old scripts, copypast.

View File

@@ -19,6 +19,11 @@ FILENAME_REGEX = re.compile(
r"(?P<timestamp>\d{8}-\d{6})-(?P<filename_slug>.*)\.realdata\.fast_filter\.bayesrun\.csv"
)
# probably a better way but who cares
NO_TIMESTAMP_FILENAME_REGEX = re.compile(
r"(?P<filename_slug>.*)\.realdata\.fast_filter\.bayesrun\.csv"
)
SUBSET_SIM_FILENAME_REGEX = re.compile(
r"(?P<filename_slug>.*)-(?:no_adaptive_steps_)?(?P<num_ss_runs>\d+)-nc_(?P<n_c>\d+)-ns_(?P<n_s>\d+)-mmax_(?P<mmax>\d+)\.multi\.subsetsim\.csv"
@@ -46,15 +51,23 @@ class GeneralOutput:
results: typing.Sequence[GeneralModelResult]
def _parse_string_output_filename(
filename: str,
) -> typing.Tuple[typing.Optional[str], str]:
if match := FILENAME_REGEX.match(filename):
groups = match.groupdict()
return (groups["timestamp"], groups["filename_slug"])
elif match := NO_TIMESTAMP_FILENAME_REGEX.match(filename):
groups = match.groupdict()
return (None, groups["filename_slug"])
else:
raise ValueError(f"Could not parse {filename} as a bayesrun output filename")
def _parse_output_filename(file: pathlib.Path) -> BayesrunOutputFilename:
filename = file.name
match = FILENAME_REGEX.match(filename)
if not match:
raise ValueError(f"{filename} was not a valid bayesrun output")
groups = match.groupdict()
return BayesrunOutputFilename(
timestamp=groups["timestamp"], filename_slug=groups["filename_slug"], path=file
)
timestamp, slug = _parse_string_output_filename(filename)
return BayesrunOutputFilename(timestamp=timestamp, filename_slug=slug, path=file)
def _parse_ss_output_filename(file: pathlib.Path) -> BayesrunOutputFilename:

View File

@@ -8,6 +8,7 @@ FILE_SLUG_REGEXES = [
r"(?P<tag>\w+)-(?P<job_index>\d+)",
r"mock_tarucha-(?P<job_index>\d+)",
r"(?:(?P<mock>mock)_)?tarucha(?:_(?P<tarucha_run_id>\d+))?-(?P<job_index>\d+)",
r"(?P<tag>\w+)-(?P<included_dots>[\w,]+)-(?P<target_cost>\d*\.?\d+)-(?P<job_index>\d+)",
]
]

View File

@@ -8,6 +8,8 @@ MODEL_REGEXES = [
r"geom_(?P<xmin>-?\d+)_(?P<xmax>-?\d+)_(?P<ymin>-?\d+)_(?P<ymax>-?\d+)_(?P<zmin>-?\d+)_(?P<zmax>-?\d+)-orientation_(?P<orientation>free|fixedxy|fixedz)-dipole_count_(?P<avg_filled>\d+)_(?P<field_name>\w*)",
r"geom_(?P<xmin>-?\d+)_(?P<xmax>-?\d+)_(?P<ymin>-?\d+)_(?P<ymax>-?\d+)_(?P<zmin>-?\d+)_(?P<zmax>-?\d+)-magnitude_(?P<log_magnitude>\d*\.?\d+)-orientation_(?P<orientation>free|fixedxy|fixedz)-dipole_count_(?P<avg_filled>\d+)_(?P<field_name>\w*)",
r"geom_(?P<xmin>-?\d*\.?\d+)_(?P<xmax>-?\d*\.?\d+)_(?P<ymin>-?\d*\.?\d+)_(?P<ymax>-?\d*\.?\d+)_(?P<zmin>-?\d*\.?\d+)_(?P<zmax>-?\d*\.?\d+)-magnitude_(?P<log_magnitude>\d*\.?\d+)-orientation_(?P<orientation>free|fixedxy|fixedz)-dipole_count_(?P<avg_filled>\d+)_(?P<field_name>\w*)",
r"geom_(?P<xmin>-?\d+)_(?P<xmax>-?\d+)_(?P<ymin>-?\d+)_(?P<ymax>-?\d+)_(?P<zmin>-?\d+)_(?P<zmax>-?\d+)-magnitude_(?P<log_magnitude>-?\d*\.?\d+)-orientation_(?P<orientation>free|fixedxy|fixedz)-dipole_count_(?P<avg_filled>\d+)_(?P<field_name>\w*)",
r"geom_(?P<xmin>-?\d*\.?\d+)_(?P<xmax>-?\d*\.?\d+)_(?P<ymin>-?\d*\.?\d+)_(?P<ymax>-?\d*\.?\d+)_(?P<zmin>-?\d*\.?\d+)_(?P<zmax>-?\d*\.?\d+)-magnitude_(?P<log_magnitude>-?\d*\.?\d+)-orientation_(?P<orientation>free|fixedxy|fixedz)-dipole_count_(?P<avg_filled>\d+)_(?P<field_name>\w*)",
]
]

View File

@@ -1,13 +1,13 @@
[tool.poetry]
name = "deepdog"
version = "1.3.0"
version = "1.7.0"
description = ""
authors = ["Deepak Mallubhotla <dmallubhotla+github@gmail.com>"]
[tool.poetry.dependencies]
python = ">=3.8.1,<3.10"
pdme = "^1.5.0"
numpy = "2.0.0"
numpy = "1.22.3"
scipy = "1.10"
tqdm = "^4.66.2"

View File

@@ -0,0 +1,26 @@
import re
import deepdog.direct_monte_carlo
def test_config_check_self():
config = deepdog.direct_monte_carlo.DirectMonteCarloConfig(
tag="test_tag",
bayesrun_file_timestamp=False,
)
expected_filename = "test_tag.realdata.fast_filter.bayesrun.csv"
actual_filename = config.get_filename()
assert actual_filename == expected_filename
regex = config.get_filename_regex()
assert re.match(regex, actual_filename) is not None
def test_config_check_self_with_timestamp():
config = deepdog.direct_monte_carlo.DirectMonteCarloConfig(
tag="test_tag",
bayesrun_file_timestamp=True,
)
expected_filename_ending = "test_tag.realdata.fast_filter.bayesrun.csv"
actual_filename = config.get_filename()
assert actual_filename.endswith(expected_filename_ending)
regex = config.get_filename_regex()
assert re.match(regex, actual_filename) is not None

View File

@@ -10,3 +10,12 @@ def test_indexifier():
_logger.debug(f"setting up indexifier {indexifier}")
assert indexifier.indexify(0) == {"key_1": 1, "key_2": "a"}
assert indexifier.indexify(5) == {"key_1": 2, "key_2": "c"}
assert len(indexifier) == 9
def test_indexifier_length_short():
weight_dict = {"key_1": [1, 2, 3], "key_2": ["b", "c"]}
indexifier = deepdog.indexify.Indexifier(weight_dict)
_logger.debug(f"setting up indexifier {indexifier}")
assert len(indexifier) == 6

View File

@@ -48,6 +48,28 @@ def test_parse_groupdict_with_magnitude():
assert parsed == expected
def test_parse_groupdict_with_negative_magnitude():
example_column_name = "geom_-20_20_-10_10_0_5-magnitude_-3.5-orientation_free-dipole_count_100_success"
parsed = deepdog.results.read_csv._parse_bayesrun_column(example_column_name)
assert parsed is not None
expected = deepdog.results.read_csv.BayesrunColumnParsed(
{
"xmin": "-20",
"xmax": "20",
"ymin": "-10",
"ymax": "10",
"zmin": "0",
"zmax": "5",
"orientation": "free",
"avg_filled": "100",
"log_magnitude": "-3.5",
"field_name": "success",
}
)
assert parsed == expected
# def test_parse_no_match_column_name():
# parsed = deepdog.results.parse_bayesrun_column("There's nothing here")
# assert parsed is None

View File

@@ -0,0 +1,19 @@
import deepdog.results
import pytest
def test_parse_bayesrun_filename():
valid1 = "20250226-204120-dot1-dot1-2-0.realdata.fast_filter.bayesrun.csv"
timestamp, slug = deepdog.results._parse_string_output_filename(valid1)
assert timestamp == "20250226-204120"
assert slug == "dot1-dot1-2-0"
valid2 = "dot1-dot1-2-0.realdata.fast_filter.bayesrun.csv"
timestamp, slug = deepdog.results._parse_string_output_filename(valid2)
assert timestamp is None
assert slug == "dot1-dot1-2-0"
with pytest.raises(ValueError):
deepdog.results._parse_string_output_filename("not_a_valid_filename")