perf: precompile the magic regexes for probs parsing

This commit is contained in:
Deepak Mallubhotla 2024-05-11 20:49:45 -05:00
parent 60f29b0b2f
commit b3bf4cde97
Signed by: deepak
GPG Key ID: BEBAEBF28083E022

View File

@@ -8,20 +8,30 @@ import csv
_logger = logging.getLogger(__name__) _logger = logging.getLogger(__name__)
FILENAME_REGEX = r"(?P<timestamp>\d{8}-\d{6})-(?P<filename_slug>.*)\.realdata\.fast_filter\.bayesrun\.csv" FILENAME_REGEX = re.compile(
r"(?P<timestamp>\d{8}-\d{6})-(?P<filename_slug>.*)\.realdata\.fast_filter\.bayesrun\.csv"
)
MODEL_REGEXES = [ MODEL_REGEXES = [
r"geom_(?P<xmin>-?\d+)_(?P<xmax>-?\d+)_(?P<ymin>-?\d+)_(?P<ymax>-?\d+)_(?P<zmin>-?\d+)_(?P<zmax>-?\d+)-orientation_(?P<orientation>free|fixedxy|fixedz)-dipole_count_(?P<avg_filled>\d+)_(?P<field_name>\w*)", re.compile(pattern)
r"geom_(?P<xmin>-?\d+)_(?P<xmax>-?\d+)_(?P<ymin>-?\d+)_(?P<ymax>-?\d+)_(?P<zmin>-?\d+)_(?P<zmax>-?\d+)-magnitude_(?P<log_magnitude>\d*\.?\d+)-orientation_(?P<orientation>free|fixedxy|fixedz)-dipole_count_(?P<avg_filled>\d+)_(?P<field_name>\w*)", for pattern in [
r"geom_(?P<xmin>-?\d*\.?\d+)_(?P<xmax>-?\d*\.?\d+)_(?P<ymin>-?\d*\.?\d+)_(?P<ymax>-?\d*\.?\d+)_(?P<zmin>-?\d*\.?\d+)_(?P<zmax>-?\d*\.?\d+)-magnitude_(?P<log_magnitude>\d*\.?\d+)-orientation_(?P<orientation>free|fixedxy|fixedz)-dipole_count_(?P<avg_filled>\d+)_(?P<field_name>\w*)" r"geom_(?P<xmin>-?\d+)_(?P<xmax>-?\d+)_(?P<ymin>-?\d+)_(?P<ymax>-?\d+)_(?P<zmin>-?\d+)_(?P<zmax>-?\d+)-orientation_(?P<orientation>free|fixedxy|fixedz)-dipole_count_(?P<avg_filled>\d+)_(?P<field_name>\w*)",
r"geom_(?P<xmin>-?\d+)_(?P<xmax>-?\d+)_(?P<ymin>-?\d+)_(?P<ymax>-?\d+)_(?P<zmin>-?\d+)_(?P<zmax>-?\d+)-magnitude_(?P<log_magnitude>\d*\.?\d+)-orientation_(?P<orientation>free|fixedxy|fixedz)-dipole_count_(?P<avg_filled>\d+)_(?P<field_name>\w*)",
r"geom_(?P<xmin>-?\d*\.?\d+)_(?P<xmax>-?\d*\.?\d+)_(?P<ymin>-?\d*\.?\d+)_(?P<ymax>-?\d*\.?\d+)_(?P<zmin>-?\d*\.?\d+)_(?P<zmax>-?\d*\.?\d+)-magnitude_(?P<log_magnitude>\d*\.?\d+)-orientation_(?P<orientation>free|fixedxy|fixedz)-dipole_count_(?P<avg_filled>\d+)_(?P<field_name>\w*)",
]
] ]
FILE_SLUG_REGEXES = [ FILE_SLUG_REGEXES = [
r"mock_tarucha-(?P<job_index>\d+)", re.compile(pattern)
r"(?:(?P<mock>mock)_)?tarucha(?:_(?P<tarucha_run_id>\d+))?-(?P<job_index>\d+)", for pattern in [
r"(?P<tag>\w+)-(?P<job_index>\d+)", r"(?P<tag>\w+)-(?P<job_index>\d+)",
r"mock_tarucha-(?P<job_index>\d+)",
r"(?:(?P<mock>mock)_)?tarucha(?:_(?P<tarucha_run_id>\d+))?-(?P<job_index>\d+)",
]
] ]
SIMPLE_TAG_REGEX = re.compile(r"\w+-\d+")
@dataclasses.dataclass @dataclasses.dataclass
class BayesrunOutputFilename: class BayesrunOutputFilename:
@@ -86,7 +96,7 @@ def _parse_bayesrun_column(
Returns the groupdict for the first match, or None if no match found. Returns the groupdict for the first match, or None if no match found.
""" """
for pattern in MODEL_REGEXES: for pattern in MODEL_REGEXES:
match = re.match(pattern, column) match = pattern.match(column)
if match: if match:
return BayesrunColumnParsed(match.groupdict()) return BayesrunColumnParsed(match.groupdict())
else: else:
@@ -125,7 +135,7 @@ def _parse_bayesrun_row(
def _parse_output_filename(file: pathlib.Path) -> BayesrunOutputFilename: def _parse_output_filename(file: pathlib.Path) -> BayesrunOutputFilename:
filename = file.name filename = file.name
match = re.match(FILENAME_REGEX, filename) match = FILENAME_REGEX.match(filename)
if not match: if not match:
raise ValueError(f"{filename} was not a valid bayesrun output") raise ValueError(f"{filename} was not a valid bayesrun output")
groups = match.groupdict() groups = match.groupdict()
@@ -136,7 +146,7 @@ def _parse_output_filename(file: pathlib.Path) -> BayesrunOutputFilename:
def _parse_file_slug(slug: str) -> typing.Optional[typing.Dict[str, str]]: def _parse_file_slug(slug: str) -> typing.Optional[typing.Dict[str, str]]:
for pattern in FILE_SLUG_REGEXES: for pattern in FILE_SLUG_REGEXES:
match = re.match(pattern, slug) match = pattern.match(slug)
if match: if match:
return match.groupdict() return match.groupdict()
else: else: