From 797c4be580142662e5b66740e9231494fa9c71d4 Mon Sep 17 00:00:00 2001 From: RyanAugust Date: Thu, 30 Apr 2026 13:04:54 +0000 Subject: [PATCH 1/2] central random state --- .github/workflows/documentation.yaml | 5 ++++- src/pysimmmulator/geos.py | 10 ++++++---- src/pysimmmulator/simulate.py | 3 ++- src/pysimmmulator/study.py | 5 ++++- tests/test_edge_cases.py | 19 +++++++++++++++++++ 5 files changed, 35 insertions(+), 7 deletions(-) diff --git a/.github/workflows/documentation.yaml b/.github/workflows/documentation.yaml index 49525ba..a3838c1 100644 --- a/.github/workflows/documentation.yaml +++ b/.github/workflows/documentation.yaml @@ -1,6 +1,9 @@ name: documentation -on: [pull_request, workflow_dispatch] +on: + release: + types: [published] # Runs only on official releases + workflow_dispatch: permissions: contents: write diff --git a/src/pysimmmulator/geos.py b/src/pysimmmulator/geos.py index 93fcefd..3a94fb3 100644 --- a/src/pysimmmulator/geos.py +++ b/src/pysimmmulator/geos.py @@ -4,9 +4,9 @@ class Geos: """Provides randomized generation of population subsets""" - def __init__(self, total_population: int, random_seed: Optional[int] = None) -> None: + def __init__(self, total_population: int, random_seed: Optional[int] = None, rng: Optional[np.random.Generator] = None) -> None: self.total_population = total_population - self.rng = self._create_random_factory(seed=random_seed) + self.rng = rng if rng is not None else self._create_random_factory(seed=random_seed) def __call__(self, geo_specs: Optional[dict] = None, @@ -18,7 +18,7 @@ def __call__(self, geo_specs (Optional[dict]): Geography names coupled with a dict of parameters for the normal distribution of that geos population (ie {"California":{"loc": 3.0, "scale": 0.5}}). 'loc' in this case is the multiplicative bias relative to an equal apportionment of the total population. - universal_scale (Optional[flaot]): Scale parameter to be used universally for all geographies. Increased value means increased + universal_scale (Optional[float]): Scale parameter to be used universally for all geographies. Increased value means increased spread in the distribution of all geos count (int): in the absense of specified geographies, this is the number of geos to be created using the `create_random_geos` function. Returns: @@ -116,6 +116,7 @@ def distribute_to_geos( mmm_input: 'pd.DataFrame', geo_details: dict, random_seed: Optional[int] = None, + rng: Optional[np.random.Generator] = None, dist_spec: tuple[float, float] = (0.0, 0.25), media_cost_spec: tuple[float, float] = (0.0, 0.069), perf_spec: tuple[float, float] = (0.0, 0.069) @@ -126,6 +127,7 @@ def distribute_to_geos( mmm_input (pd.DataFrame): simulated MMM data that was generated as part of a prior process geo_details (dict): formulated dict or output of the `geos` creation call (ie `geos(count=50)`) random_seed (int): random seed for rng--if needed + rng (np.random.Generator): optional random number generator dist_spec (tuple[float, float]): Parameters to control the normal distribution function for populations of the geographies media_cost_spec (tuple[float, float]): Parameters to control the normal distribution function for allocation of spend across geographies perf_spec (tuple[float, float]): Parameters to control the normal distribution function for allocation of performance across geographies @@ -137,7 +139,7 @@ def distribute_to_geos( geo_dataframes = [] total_population: int = sum(geo_details.values()) - rng = np.random.default_rng(seed=random_seed) + rng = rng if rng is not None else np.random.default_rng(seed=random_seed) media_cols = [w for w in mmm_input.columns if "impressions" in w or "clicks" in w] for geo_name, geo_pop in geo_details.items(): pop_pct = geo_pop / total_population diff --git a/src/pysimmmulator/simulate.py b/src/pysimmmulator/simulate.py index 1eec7c3..a62515a 100644 --- a/src/pysimmmulator/simulate.py +++ b/src/pysimmmulator/simulate.py @@ -355,12 +355,13 @@ def simulate_geos(self, mmm_df: pd.DataFrame, params: GeoParameters) -> pd.DataF params (GeoParameters): Parameters for geographic distribution. Returns: pd.DataFrame: MMM DataFrame with geographic distribution""" - geos = Geos(total_population=params.total_population, random_seed=None) + geos = Geos(total_population=params.total_population, random_seed=None, rng=self.rng) geo_details = geos(geo_specs=params.geo_specs, universal_scale=params.universal_scale, count=params.count) mmm_df = distribute_to_geos( mmm_input=mmm_df, geo_details=geo_details, + rng=self.rng, dist_spec=params.dist_spec, media_cost_spec=params.media_cost_spec, perf_spec=params.perf_spec diff --git a/src/pysimmmulator/study.py b/src/pysimmmulator/study.py index f1d2120..36fd70f 100644 --- a/src/pysimmmulator/study.py +++ b/src/pysimmmulator/study.py @@ -11,11 +11,12 @@ def __init__(self, channel_name: str, true_roi: float, random_seed: int = None, + rng: Optional[np.random.Generator] = None, bias: float = DEFAULT_STUDY_BIAS, stdev: float = DEFAULT_STUDY_SCALE) -> None: self.channel_name = channel_name self._true_roi = true_roi - self.rng = self._create_random_factory(seed=random_seed) + self.rng = rng if rng is not None else self._create_random_factory(seed=random_seed) self._bias = bias self._stdev = stdev @@ -94,12 +95,14 @@ def __init__(self, channel_rois: dict, channel_distributions: dict[str, dict] = dict(), random_seed: int = None, + rng: Optional[np.random.Generator] = None, bias: float = DEFAULT_STUDY_BIAS, stdev: float = DEFAULT_STUDY_SCALE) -> None: self._study_hold = { k: Study(channel_name=k, true_roi=v, random_seed=random_seed, + rng=rng, bias=channel_distributions.get(k, {}).get("bias", bias), stdev=channel_distributions.get(k, {}).get("stdev", stdev)) for k, v in channel_rois.items() diff --git a/tests/test_edge_cases.py b/tests/test_edge_cases.py index b2b08ec..aea25cc 100644 --- a/tests/test_edge_cases.py +++ b/tests/test_edge_cases.py @@ -156,3 +156,22 @@ def test_multisim_get_data_coverage(): ms = Multisim() ms.data = "test_data" assert ms.get_data == "test_data" + +def test_reproducibility(): + with open("examples/example_config.yaml", "r") as f: + config = yaml.safe_load(f) + + config["geo_params"] = { + "total_population": 1000000, + "count": 5 + } + + seed = 42 + sim1 = Simulate(random_seed=seed) + df1, roi1 = sim1.run_with_config(config) + + sim2 = Simulate(random_seed=seed) + df2, roi2 = sim2.run_with_config(config) + + pd.testing.assert_frame_equal(df1, df2) + assert roi1 == roi2 From 607adcd910897ef9a26110a7e24db4906ca5142b Mon Sep 17 00:00:00 2001 From: RyanAugust Date: Thu, 30 Apr 2026 13:05:49 +0000 Subject: [PATCH 2/2] some cleanup --- tests/test_edge_cases.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_edge_cases.py b/tests/test_edge_cases.py index aea25cc..1703b16 100644 --- a/tests/test_edge_cases.py +++ b/tests/test_edge_cases.py @@ -160,18 +160,18 @@ def test_multisim_get_data_coverage(): def test_reproducibility(): with open("examples/example_config.yaml", "r") as f: config = yaml.safe_load(f) - + config["geo_params"] = { "total_population": 1000000, "count": 5 } - + seed = 42 sim1 = Simulate(random_seed=seed) df1, roi1 = sim1.run_with_config(config) - + sim2 = Simulate(random_seed=seed) df2, roi2 = sim2.run_with_config(config) - + pd.testing.assert_frame_equal(df1, df2) assert roi1 == roi2