Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 56 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,18 @@ PySiMMMulator's simulator can either be run on a step-by-step basis, or can be r

### Run via config

Run using this method, you'll be returned both a dataframe for MMM input and the "True ROI" values for each of your channels. These true values are critical to validating your MMM model.
When run this way, the simulator returns a `SimulationResult` object containing a dataframe for MMM input, the "True ROI" values for each of your channels, and associated metadata. These true values are critical to validating your MMM model.

```python
from pysimmmulator import load_config, Simulate

cfg = load_config(config_path="./my_config.yaml")
simmm = Simulate()
mmm_input_df, channel_roi = simmm.run_with_config(config=cfg)
result = simmm.run_with_config(config=cfg)

# Access results
mmm_input_df = result.df
channel_roi = result.channel_roi
```

### Run via CLI
Expand All @@ -47,21 +51,61 @@ pysimmm -i example_config.yaml -o .
Alternatively you may run each of the stages independently, which allows for easier debugging and in-run adjustments. Due to the stateless architecture, each stage returns its results which are then passed to the next stage.

```python
from pysimmmulator import load_config, Simulate, define_basic_params
from pysimmmulator import load_config, Simulate, define_basic_params, create_all_parameters

cfg = load_config("./my_config.yaml")
basic_params = define_basic_params(**cfg["basic_params"])
simmm = Simulate(basic_params)

baseline_df = simmm.simulate_baseline(**cfg["baseline_params"])
spend_df = simmm.simulate_ad_spend(baseline_sales_df=baseline_df, **cfg["ad_spend_params"])
spend_df = simmm.simulate_media(spend_df=spend_df, **cfg["media_params"])
spend_df = simmm.simulate_cvr(spend_df=spend_df, **cfg["cvr_params"])
mmm_df = simmm.simulate_decay_returns(spend_df=spend_df, **cfg["adstock_params"])
params = create_all_parameters(cfg)
simmm = Simulate(params["basic_params"])

baseline_df = simmm.simulate_baseline(params["baseline_params"])
spend_df = simmm.simulate_ad_spend(baseline_sales_df=baseline_df, params=params["ad_spend_params"])
spend_df = simmm.simulate_media(spend_df=spend_df, params=params["media_params"])
spend_df = simmm.simulate_cvr(spend_df=spend_df, params=params["cvr_params"])
mmm_df = simmm.simulate_decay_returns(spend_df=spend_df, params=params["adstock_params"])
mmm_df = simmm.calculate_conversions(mmm_df=mmm_df)
mmm_df = simmm.consolidate_dataframe(mmm_df=mmm_df, baseline_sales_df=baseline_df)
channel_roi = simmm.calculate_channel_roi(mmm_df=mmm_df)
final_df = simmm.finalize_output(mmm_df=mmm_df, **cfg["output_params"])
final_df = simmm.finalize_output(mmm_df=mmm_df, params=params["output_params"])
```

### Exogenous Factors

PySiMMMulator supports the inclusion of external shocks, holidays, and promotions. These can be specified as either multipliers or additive impacts within the `baseline_params` block.

```yaml
baseline_params:
  ...
  exogenous_factors:
    - name: "Black Friday"
      dates: ["2023-11-24"]
      impact: 3.5
      type: "multiplier"
    - name: "Christmas Peak"
      start_date: "2023-12-20"
      end_date: "2023-12-24"
      impact: 2.0
      type: "multiplier"

### Automated Sensitivity Analysis (Monte Carlo)

The `Multisim` class enables Monte Carlo simulations by allowing you to define uncertainty ranges for any configuration parameter. This helps researchers understand how sensitive an MMM is to data volatility.

```python
from pysimmmulator import Multisim, load_config

base_cfg = load_config("my_config.yaml")
sensitivity_config = {
"baseline_params": {
"error_std": [20.0, 150.0] # sample noise level for each run
}
}

msim = Multisim(random_seed=42)
msim.run(config=base_cfg, runs=100, sensitivity_config=sensitivity_config)

# results is a list of SimulationResult objects
results = msim.get_data
```

### Geographic distribution
Expand Down
6 changes: 3 additions & 3 deletions src/pysimmmulator/command_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@ def run_with_config(config_path, output_path):
cfg = load_config(config_path)
logger.debug("config loaded successfully")
sim = Simulate()
(mmm_input_df, channel_roi) = sim.run_with_config(config=cfg)
result = sim.run_with_config(config=cfg)
logger.debug("sim run successfully, saving results")

# Write both outputs under output_path, creating the directory if it does not exist.
os.makedirs(output_path, exist_ok=True)
mmm_input_df.to_csv(os.path.join(output_path, "mmm_input_df.csv"), index=False)
pd.DataFrame.from_dict(channel_roi, orient="index", columns=["true_roi"]).to_csv(os.path.join(output_path, "channel_roi.csv"))
result.df.to_csv(os.path.join(output_path, "mmm_input_df.csv"), index=True)
pd.DataFrame.from_dict(result.channel_roi, orient="index", columns=["true_roi"]).to_csv(os.path.join(output_path, "channel_roi.csv"))

def main():
arg_parser = argparse.ArgumentParser()
Expand Down
19 changes: 19 additions & 0 deletions src/pysimmmulator/param_handlers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Optional, Union
from dataclasses import dataclass
import datetime
import pandas as pd

@dataclass
class BasicParameters:
Expand Down Expand Up @@ -283,3 +284,21 @@ class GeoParameters:

def __post_init__(self):
assert self.total_population > 0, "Total population must be greater than 0"

@dataclass
class SimulationResult:
    """Object for holding the results and metadata of a simulation run.

    Args:
        df (pd.DataFrame): Final simulation DataFrame.
        channel_roi (dict): Ground-truth ROI values per channel.
        config (dict): The configuration dictionary used for the run.
        random_state (object): The bit generator state of the RNG used.
    """

    df: pd.DataFrame
    channel_roi: dict
    config: dict
    random_state: object

    def __repr__(self) -> str:
        # Override the dataclass-generated repr, which would include every
        # field; this one reports only the row count and the channel names.
        return f"SimulationResult(rows={len(self.df)}, channels={list(self.channel_roi.keys())})"
74 changes: 53 additions & 21 deletions src/pysimmmulator/simulate.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
AdstockParameters,
OutputParameters,
GeoParameters,
SimulationResult,
)

from .visualize import Visualize
Expand Down Expand Up @@ -503,7 +504,7 @@ def finalize_output(self, mmm_df: pd.DataFrame, params: OutputParameters) -> pd.
logger.info(f"You have completed running step 9: Finalization of output dataframe at the {params.aggregation_level} level")
return final_df

def run_with_config(self, config: dict) -> tuple[pd.DataFrame, dict]:
def run_with_config(self, config: dict) -> SimulationResult:
"""Orchestrates the full simulation pipeline using a configuration dictionary.

This method handles parameter instantiation, baseline simulation, media and CVR
Expand All @@ -513,8 +514,8 @@ def run_with_config(self, config: dict) -> tuple[pd.DataFrame, dict]:
Args:
config (dict): Complete configuration dictionary.
Returns:
tuple[pd.DataFrame, dict]: Finalized simulation DataFrame and a dictionary
of ground-truth ROI values per channel."""
SimulationResult: Object containing the output DataFrame, ground-truth ROI,
configuration used, and random state metadata."""
from .load_parameters import create_all_parameters
params = create_all_parameters(config)
self.basic_params = params["basic_params"]
Expand All @@ -533,41 +534,72 @@ def run_with_config(self, config: dict) -> tuple[pd.DataFrame, dict]:
channel_roi = self.calculate_channel_roi(mmm_df=mmm_df)
final_df = self.finalize_output(mmm_df=mmm_df, params=params["output_params"])

return (final_df, channel_roi)
return SimulationResult(
df=final_df,
channel_roi=channel_roi,
config=config,
random_state=self._report_random_state()
)

class Multisim(Simulate):
"""Provides capability to generate multiple runs on a single configuration"""
def __init__(self):
super(Multisim, self).__init__()
self.final_frames = []
self.rois = []
def __init__(self, random_seed=None):
super(Multisim, self).__init__(random_seed=random_seed)
self.results = []

def stash_outputs(self, final_df: pd.DataFrame, channel_roi: dict):
def stash_outputs(self, result: SimulationResult):
"""Stores the outputs of a single simulation run.

Args:
final_df (pd.DataFrame): Final simulation DataFrame.
channel_roi (dict): Ground-truth ROI values."""
self.final_frames.append(final_df)
self.rois.append(channel_roi)
result (SimulationResult): The result object from run_with_config."""
self.results.append(result)

@property
def get_data(self):
"""Provides the iterable generator for simulation final dataframes and channel ground truth ROI values
"""Provides the list of SimulationResult objects generated.

Returns:
data (iterable): iterable of final sim dataframes and channel ROI values"""
return self.data
results (list[SimulationResult]): List of simulation results."""
return self.results

def _apply_sensitivity(self, config: dict, sensitivity_config: dict) -> dict:
"""Recursively applies sensitivity ranges to a configuration.

def run(self, config: dict, runs: int) -> None:
Args:
config (dict): The base configuration to copy and update.
sensitivity_config (dict): Configuration specifying ranges [low, high] for parameters.
Returns:
dict: A new configuration with sampled values."""
import copy
new_config = copy.deepcopy(config)

def recursive_update(target, source):
    """Overlay ``source`` onto ``target`` in place, sampling numeric ranges.

    Args:
        target (dict): Configuration (or sub-section) that is mutated.
        source (dict): Overrides to apply. A dict value is merged into an
            existing dict under the same key; a two-element list of numbers
            is read as ``[low, high]`` and replaced by one uniform draw from
            ``self.rng``; any other value is assigned as-is.
    """
    for key, value in source.items():
        if isinstance(value, dict) and key in target and isinstance(target[key], dict):
            # Descend so sibling keys in the target section are preserved.
            recursive_update(target[key], value)
            continue

        # bool is a subclass of int, so it has to be excluded explicitly;
        # otherwise an override such as [True, False] would be mistaken for
        # a numeric range and replaced by a random float.
        is_range = (
            isinstance(value, list)
            and len(value) == 2
            and all(
                isinstance(x, (int, float)) and not isinstance(x, bool)
                for x in value
            )
        )
        if is_range:
            # Sample from Uniform distribution
            target[key] = self.rng.uniform(low=value[0], high=value[1])
        else:
            target[key] = value

recursive_update(new_config, sensitivity_config)
return new_config

def run(self, config: dict, runs: int, sensitivity_config: dict = None) -> None:
"""Executes multiple simulation runs.

Args:
config (dict): Simulation configuration.
runs (int): Number of runs to execute."""
runs (int): Number of runs to execute.
sensitivity_config (dict): Optional configuration for Monte Carlo sensitivity analysis.
"""
for run in range(runs):
frame, roi = self.run_with_config(config=config)
self.stash_outputs(final_df=frame, channel_roi=roi)
current_config = config
if sensitivity_config:
current_config = self._apply_sensitivity(config, sensitivity_config)
result = self.run_with_config(config=current_config)
self.stash_outputs(result=result)
logger.info(f"{run + 1}/{runs} completed")
self.data = zip(self.final_frames, self.rois)
logger.info(f"{runs} runs complete and stored")

11 changes: 6 additions & 5 deletions tests/test_edge_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ def test_visualize_empty_columns():
def test_multisim_get_data_coverage():
from pysimmmulator.simulate import Multisim
ms = Multisim()
ms.data = "test_data"
ms.results = "test_data"
assert ms.get_data == "test_data"

def test_reproducibility():
Expand All @@ -168,10 +168,11 @@ def test_reproducibility():

seed = 42
sim1 = Simulate(random_seed=seed)
df1, roi1 = sim1.run_with_config(config)
result1 = sim1.run_with_config(config)

sim2 = Simulate(random_seed=seed)
df2, roi2 = sim2.run_with_config(config)
result2 = sim2.run_with_config(config)

pd.testing.assert_frame_equal(result1.df, result2.df)
assert result1.channel_roi == result2.channel_roi

pd.testing.assert_frame_equal(df1, df2)
assert roi1 == roi2
6 changes: 3 additions & 3 deletions tests/test_exogenous_factors.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,9 @@ def test_us_retail_example_run():
from pysimmmulator.load_parameters import load_config, create_all_parameters
cfg = load_config("examples/us_retail_exogenous_config.yaml")
sim = Simulate()
df, roi = sim.run_with_config(cfg)
assert len(df) > 0
assert "baseline_sales" not in df.columns # it's aggregated in total_revenue
result = sim.run_with_config(cfg)
assert len(result.df) > 0
assert "baseline_sales" not in result.df.columns # it's aggregated in total_revenue

# We can check specific dates in the internal baseline if we run it manually
params = create_all_parameters(cfg)
Expand Down
38 changes: 19 additions & 19 deletions tests/test_geo_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@ def base_config():

def test_run_with_config_no_geo(base_config):
sim = Simulate()
final_df, channel_roi = sim.run_with_config(base_config)
result = sim.run_with_config(base_config)

assert final_df.index.name == "date"
assert "geo_name" not in final_df.index.names
assert len(final_df) > 0
assert result.df.index.name == "date"
assert "geo_name" not in result.df.index.names
assert len(result.df) > 0

def test_run_with_config_with_geo(base_config):
base_config["geo_params"] = {
Expand All @@ -24,11 +24,11 @@ def test_run_with_config_with_geo(base_config):
}

sim = Simulate()
final_df, channel_roi = sim.run_with_config(base_config)
result = sim.run_with_config(base_config)

assert "geo_name" in final_df.index.names
assert "date" in final_df.index.names
geos = final_df.index.get_level_values("geo_name").unique()
assert "geo_name" in result.df.index.names
assert "date" in result.df.index.names
geos = result.df.index.get_level_values("geo_name").unique()
assert len(geos) == 5

def test_run_with_config_weekly_geo(base_config):
Expand All @@ -39,11 +39,11 @@ def test_run_with_config_weekly_geo(base_config):
}

sim = Simulate()
final_df, channel_roi = sim.run_with_config(base_config)
result = sim.run_with_config(base_config)

assert "geo_name" in final_df.index.names
assert "week_start" in final_df.index.names
geos = final_df.index.get_level_values("geo_name").unique()
assert "geo_name" in result.df.index.names
assert "week_start" in result.df.index.names
geos = result.df.index.get_level_values("geo_name").unique()
assert len(geos) == 3

def test_run_with_config_single_geo(base_config):
Expand All @@ -53,11 +53,11 @@ def test_run_with_config_single_geo(base_config):
}

sim = Simulate()
final_df, channel_roi = sim.run_with_config(base_config)
result = sim.run_with_config(base_config)

assert "geo_name" in final_df.index.names
assert "date" in final_df.index.names
geos = final_df.index.get_level_values("geo_name").unique()
assert "geo_name" in result.df.index.names
assert "date" in result.df.index.names
geos = result.df.index.get_level_values("geo_name").unique()
assert len(geos) == 1

def test_geo_visualization(base_config):
Expand All @@ -67,14 +67,14 @@ def test_geo_visualization(base_config):
}

sim = Simulate()
final_df, _ = sim.run_with_config(base_config)
result = sim.run_with_config(base_config)

# Test plotting with multi-indexed geo data
try:
sim.plot_spend(final_df, agg="weekly")
sim.plot_spend(result.df, agg="weekly")
assert os.path.exists("Spend_by_channel.png")

sim.plot_revenue(final_df, agg="monthly")
sim.plot_revenue(result.df, agg="monthly")
assert os.path.exists("Revenue_by_channel.png")
finally:
# Cleanup
Expand Down
2 changes: 1 addition & 1 deletion tests/test_multisim.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ def test_multiple_runs():
cfg = load_parameters.load_config(config_path="./examples/example_config.yaml")
msim = Multisim()
msim.run(config=cfg, runs=10)
assert len(msim.final_frames) == 10
assert len(msim.get_data) == 10
Loading
Loading