diff --git a/examples/us_retail_exogenous_config.yaml b/examples/us_retail_exogenous_config.yaml new file mode 100644 index 0000000..feb0f47 --- /dev/null +++ b/examples/us_retail_exogenous_config.yaml @@ -0,0 +1,101 @@ +basic_params: + years: 3 + channels_impressions: ["Amazon", "TV", "Meta"] + channels_clicks: ["Search"] + frequency_of_campaigns: 1 + start_date: "2023/1/1" + true_cvr: + Amazon: 0.006 + TV: 0.001 + Meta: 0.008 + Search: 0.012 + revenue_per_conv: 15.0 + +baseline_params: + base_p: 1000 + trend_p: 200 + temp_var: 50 + temp_coef_mean: 10 + temp_coef_sd: 2 + error_std: 50 + exogenous_factors: + - name: "New Year's Day" + dates: ["2023-01-01", "2024-01-01", "2025-01-01"] + impact: 1.5 + type: "multiplier" + - name: "Independence Day" + dates: ["2023-07-04", "2024-07-04", "2025-07-04"] + impact: 1.3 + type: "multiplier" + - name: "Labor Day Weekend" + start_date: "2023-09-02" + end_date: "2023-09-04" + impact: 1.4 + type: "multiplier" + - name: "Black Friday" + dates: ["2023-11-24", "2024-11-29", "2025-11-28"] + impact: 3.5 + type: "multiplier" + - name: "Cyber Monday" + dates: ["2023-11-27", "2024-12-02", "2025-12-01"] + impact: 3.0 + type: "multiplier" + - name: "Christmas Peak" + start_date: "2023-12-20" + end_date: "2023-12-24" + impact: 2.0 + type: "multiplier" + - name: "Economic Stimulus" + start_date: "2023-03-01" + end_date: "2023-03-15" + impact: 500 + type: "additive" + +ad_spend_params: + campaign_spend_mean: 500000 + campaign_spend_std: 100000 + max_min_proportion_on_each_channel: + Amazon: + min: 0.2 + max: 0.4 + TV: + min: 0.1 + max: 0.3 + Meta: + min: 0.1 + max: 0.2 + +media_params: + true_cpm: + Amazon: 4.0 + TV: 15.0 + Meta: 5.0 + true_cpc: + Search: 3.0 + noisy_cpm_cpc: + Amazon: {loc: 0.0, scale: 0.5} + TV: {loc: 0.0, scale: 2.0} + Meta: {loc: 0.0, scale: 1.0} + Search: {loc: 0.0, scale: 0.3} + +cvr_params: + noisy_cvr: + Amazon: {loc: 0.0, scale: 0.01} + TV: {loc: 0.0, scale: 0.005} + Meta: {loc: 0.0, scale: 0.02} + Search: {loc: 
0.0, scale: 0.01} + +adstock_params: + adstock: + Amazon: {type: "geometric", params: {lambda: 0.1}} + TV: {type: "geometric", params: {lambda: 0.3}} + Meta: {type: "geometric", params: {lambda: 0.15}} + Search: {type: "geometric", params: {lambda: 0.05}} + saturation: + Amazon: {type: "scurve", params: {alpha: 3.0, gamma: 0.2}} + TV: {type: "scurve", params: {alpha: 2.0, gamma: 0.3}} + Meta: {type: "scurve", params: {alpha: 4.0, gamma: 0.25}} + Search: {type: "scurve", params: {alpha: 1.0, gamma: 0.5}} + +output_params: + aggregation_level: "daily" diff --git a/src/pysimmmulator/geos.py b/src/pysimmmulator/geos.py index 3a94fb3..9976b11 100644 --- a/src/pysimmmulator/geos.py +++ b/src/pysimmmulator/geos.py @@ -121,13 +121,13 @@ def distribute_to_geos( media_cost_spec: tuple[float, float] = (0.0, 0.069), perf_spec: tuple[float, float] = (0.0, 0.069) ) -> 'pd.DataFrame': - """Distributes MMM data to supplied geographies. Allows randomization in the scale of the distributon + """Distributes MMM data to supplied geographies. Allows randomization in the scale of the distribution. 
Args: mmm_input (pd.DataFrame): simulated MMM data that was generated as part of a prior process geo_details (dict): formulated dict or output of the `geos` creation call (ie `geos(count=50)`) random_seed (int): random seed for rng--if needed - rng (np.random.Generator): optional random number generator + rng (np.random.Generator): optional pre-instantiated random number generator dist_spec (tuple[float, float]): Parameters to control the normal distribution function for populations of the geographies media_cost_spec (tuple[float, float]): Parameters to control the normal distribution function for allocation of spend across geographies perf_spec (tuple[float, float]): Parameters to control the normal distribution function for allocation of performance across geographies diff --git a/src/pysimmmulator/param_handlers.py b/src/pysimmmulator/param_handlers.py index 6715838..c5273b8 100644 --- a/src/pysimmmulator/param_handlers.py +++ b/src/pysimmmulator/param_handlers.py @@ -80,7 +80,9 @@ class BaselineParameters: (the larger this number, the more important seasonality is for sales) temp_coef_sd (int): The standard deviation of how important seasonality is in our data (the larger this number, the more variable the importance of seasonality is for sales) - error_std (int): Amount of statistical noise added to baseline sales (the larger this number, the noisier baseline sales will be).""" + error_std (int): Amount of statistical noise added to baseline sales (the larger this number, the noisier baseline sales will be). + exogenous_factors (Optional[list[dict]]): List of external factors like holidays or shocks. 
+ """ basic_params: BasicParameters base_p: int @@ -89,6 +91,7 @@ class BaselineParameters: temp_coef_mean: int temp_coef_sd: int error_std: int + exogenous_factors: Optional[list[dict]] = None def __post_init__(self): assert self.error_std < self.base_p, "Error std can not exceed base sales value" diff --git a/src/pysimmmulator/simulate.py b/src/pysimmmulator/simulate.py index a62515a..f3a4d68 100644 --- a/src/pysimmmulator/simulate.py +++ b/src/pysimmmulator/simulate.py @@ -67,6 +67,7 @@ def simulate_baseline(self, params: BaselineParameters) -> pd.DataFrame: - Trend: Linear growth over the period (total growth of trend_p) - Seasonality: Modeled via a sine function (height temp_var) scaled by a random importance coefficient (mean temp_coef_mean, std temp_coef_sd) + - Exogenous Factors: Holidays or shocks applied as multipliers or additions. - Error: Gaussian noise (std error_std) If the combined terms result in negative sales, they are clamped to zero. @@ -86,21 +87,50 @@ def simulate_baseline(self, params: BaselineParameters) -> pd.DataFrame: temp = self.baseline_params.temp_var * np.sin(days * 3.14 / 182.5) seasonality = self.rng.normal(loc=self.baseline_params.temp_coef_mean, scale=self.baseline_params.temp_coef_sd, size=1) * temp + # Calculate Exogenous Impacts + multiplier_impact = np.ones(len(days)) + additive_impact = np.zeros(len(days)) + + if self.baseline_params.exogenous_factors: + date_backbone = pd.Series(pd.date_range(start=self.basic_params.start_date, periods=len(days), freq="D")) + for factor in self.baseline_params.exogenous_factors: + f_type = factor.get("type", "multiplier") + impact = factor.get("impact", 1.0 if f_type == "multiplier" else 0.0) + + # Identify target indices + mask = np.zeros(len(days), dtype=bool) + if "dates" in factor: + event_dates = pd.to_datetime(factor["dates"]) + mask = date_backbone.isin(event_dates) + elif "start_date" in factor and "end_date" in factor: + start = pd.to_datetime(factor["start_date"]) + end = pd.to_datetime(factor["end_date"]) + mask = 
(date_backbone >= start) & (date_backbone <= end) + + if f_type == "multiplier": + multiplier_impact[mask] *= impact + else: + additive_impact[mask] += impact + error = self._truncated_normal(loc=0, scale=self.baseline_params.error_std, size=self.basic_params.years * 365, low=-np.inf) - baseline_sales = base + trend + seasonality + error + baseline_sales = (base + trend + seasonality) * multiplier_impact + additive_impact + error if np.any(baseline_sales < 0): baseline_sales = np.where(baseline_sales < 0, 0, baseline_sales) return pd.DataFrame({ "days": days, + "date": date_backbone if self.baseline_params.exogenous_factors else pd.date_range(start=self.basic_params.start_date, periods=len(days), freq="D"), "baseline_sales": baseline_sales, "base": base, "trend": trend, "temp": temp, "seasonality": seasonality, + "multiplier_impact": multiplier_impact, + "additive_impact": additive_impact, "error": error, }) + def simulate_ad_spend( self, baseline_sales_df: pd.DataFrame, params: AdSpendParameters) -> pd.DataFrame: """Simulation of ad spend based on normal distribution parameters for campaign spend. Overall campaign spend is then divided amongst each channel based on passed @@ -291,7 +321,12 @@ def _reformat_for_mmm(self, spend_df: pd.DataFrame) -> pd.DataFrame: def _simulate_decay(self, mmm_df: pd.DataFrame, adstock_config: dict) -> pd.DataFrame: """Helper function for the simulation of adstocking. Ad stocking is the idea that an ad has a lasting effect for some amount of time in the future. - """ + + Args: + mmm_df (pd.DataFrame): MMM DataFrame containing media metrics. + adstock_config (dict): Nested dictionary mapping channels to adstock types and parameters. 
+ Returns: + pd.DataFrame: Updated mmm_df with adstocked media columns.""" from .transforms import geometric_adstock, weibull_adstock for channel, config in adstock_config.items(): metric = ("impressions" if channel in self.basic_params.channels_impressions else "clicks") @@ -312,7 +347,13 @@ def _simulate_decay(self, mmm_df: pd.DataFrame, adstock_config: dict) -> pd.Data return mmm_df def _simulate_diminishing_returns(self, mmm_df: pd.DataFrame, saturation_config: dict) -> pd.DataFrame: - """Helper function for the simulation of diminishing returns.""" + """Helper function for the simulation of diminishing returns. + + Args: + mmm_df (pd.DataFrame): MMM DataFrame containing adstocked media metrics. + saturation_config (dict): Nested dictionary mapping channels to saturation types and parameters. + Returns: + pd.DataFrame: Updated mmm_df with saturated media columns.""" from .transforms import scurve_saturation, hill_saturation for channel, config in saturation_config.items(): metric = ("impressions" if channel in self.basic_params.channels_impressions else "clicks") @@ -463,6 +504,17 @@ def finalize_output(self, mmm_df: pd.DataFrame, params: OutputParameters) -> pd. return final_df def run_with_config(self, config: dict) -> tuple[pd.DataFrame, dict]: + """Orchestrates the full simulation pipeline using a configuration dictionary. + + This method handles parameter instantiation, baseline simulation, media and CVR + simulation, adstock/saturation, conversion calculation, and optional + geographic distribution. + + Args: + config (dict): Complete configuration dictionary. 
+ Returns: + tuple[pd.DataFrame, dict]: Finalized simulation DataFrame and a dictionary + of ground-truth ROI values per channel.""" from .load_parameters import create_all_parameters params = create_all_parameters(config) self.basic_params = params["basic_params"] @@ -491,23 +543,28 @@ def __init__(self): self.rois = [] def stash_outputs(self, final_df: pd.DataFrame, channel_roi: dict): - """Stores the final simulation dataframe as well as the ground truth channel ROI values - for each run of the multiple simulations. - """ + """Stores the outputs of a single simulation run. + + Args: + final_df (pd.DataFrame): Final simulation DataFrame. + channel_roi (dict): Ground-truth ROI values.""" self.final_frames.append(final_df) self.rois.append(channel_roi) @property def get_data(self): - """Provies the iterable generator for simulaton final dataframes and channel ground truth ROI values + """Provides the iterable generator for simulation final dataframes and channel ground truth ROI values - Args: - None Returns: - data (iterable): iterable of final sim dataframes and channel ROI values""" + data (iterable): iterable of final sim dataframes and channel ROI values""" return self.data def run(self, config: dict, runs: int) -> None: + """Executes multiple simulation runs. + + Args: + config (dict): Simulation configuration. 
+ runs (int): Number of runs to execute.""" for run in range(runs): frame, roi = self.run_with_config(config=config) self.stash_outputs(final_df=frame, channel_roi=roi) diff --git a/src/pysimmmulator/study.py b/src/pysimmmulator/study.py index 36fd70f..1a6adf7 100644 --- a/src/pysimmmulator/study.py +++ b/src/pysimmmulator/study.py @@ -70,7 +70,7 @@ def generate(self, count: int = 1) -> 'np.array': Args: count (int): number of study results to return (default is 1) - Retuns: + Returns: study_results (iterable[float]): an array of study results """ return self.rng.normal(loc=self._true_roi + self._bias, scale=self._stdev, size=count) @@ -113,7 +113,7 @@ def generate(self, count: int = 1) -> dict[str, 'np.array']: Args: count (int): number of study results to return (default is 1) - Retuns: + Returns: study_results (dict[iterable[float]]): an array of study results""" return {k: v.generate(count) for k, v in self._study_hold.items()} diff --git a/tests/test_exogenous_factors.py b/tests/test_exogenous_factors.py new file mode 100644 index 0000000..526d990 --- /dev/null +++ b/tests/test_exogenous_factors.py @@ -0,0 +1,110 @@ +from pysimmmulator.simulate import Simulate +from pysimmmulator.param_handlers import BasicParameters, BaselineParameters + +def test_exogenous_multiplier(): + basic_params = BasicParameters( + years=1, + channels_impressions=["TV"], + channels_clicks=[], + frequency_of_campaigns=1, + start_date="2023/01/01", + true_cvr={"TV": 0.01}, + revenue_per_conv=10.0 + ) + sim = Simulate(basic_params) + + # Event on Jan 1st with 2.0 multiplier + exogenous_factors = [ + {"name": "New Year", "dates": ["2023-01-01"], "impact": 2.0, "type": "multiplier"} + ] + + params = BaselineParameters( + basic_params=basic_params, + base_p=100, trend_p=0, temp_var=0, + temp_coef_mean=0, temp_coef_sd=0, error_std=0, + exogenous_factors=exogenous_factors + ) + + df = sim.simulate_baseline(params) + + # Jan 1st should be exactly 200 (100 * 2.0) + assert df.loc[df["date"] == 
"2023-01-01", "baseline_sales"].values[0] == 200.0 + # Other days should be exactly 100 + assert df.loc[df["date"] == "2023-01-02", "baseline_sales"].values[0] == 100.0 + +def test_exogenous_additive(): + basic_params = BasicParameters( + years=1, + channels_impressions=["TV"], + channels_clicks=[], + frequency_of_campaigns=1, + start_date="2023/01/01", + true_cvr={"TV": 0.01}, + revenue_per_conv=10.0 + ) + sim = Simulate(basic_params) + + # Additive impact of 500 on Jan 5th + exogenous_factors = [ + {"name": "Promo", "dates": ["2023-01-05"], "impact": 500.0, "type": "additive"} + ] + + params = BaselineParameters( + basic_params=basic_params, + base_p=100, trend_p=0, temp_var=0, + temp_coef_mean=0, temp_coef_sd=0, error_std=0, + exogenous_factors=exogenous_factors + ) + + df = sim.simulate_baseline(params) + + # Jan 5th should be 600 (100 + 500) + assert df.loc[df["date"] == "2023-01-05", "baseline_sales"].values[0] == 600.0 + +def test_exogenous_range(): + basic_params = BasicParameters( + years=1, + channels_impressions=["TV"], + channels_clicks=[], + frequency_of_campaigns=1, + start_date="2023/01/01", + true_cvr={"TV": 0.01}, + revenue_per_conv=10.0 + ) + sim = Simulate(basic_params) + + # Multiplier of 0.5 for first week + exogenous_factors = [ + {"name": "Lockdown", "start_date": "2023-01-01", "end_date": "2023-01-07", "impact": 0.5, "type": "multiplier"} + ] + + params = BaselineParameters( + basic_params=basic_params, + base_p=100, trend_p=0, temp_var=0, + temp_coef_mean=0, temp_coef_sd=0, error_std=0, + exogenous_factors=exogenous_factors + ) + + df = sim.simulate_baseline(params) + + # Jan 1st to 7th should be 50 + assert (df.loc[(df["date"] >= "2023-01-01") & (df["date"] <= "2023-01-07"), "baseline_sales"] == 50.0).all() + # Jan 8th should be 100 + assert df.loc[df["date"] == "2023-01-08", "baseline_sales"].values[0] == 100.0 + +def test_us_retail_example_run(): + from pysimmmulator.load_parameters import load_config, create_all_parameters + cfg = 
load_config("examples/us_retail_exogenous_config.yaml") + sim = Simulate() + df, roi = sim.run_with_config(cfg) + assert len(df) > 0 + assert "baseline_sales" not in df.columns # it's aggregated in total_revenue + + # We can check specific dates in the internal baseline if we run it manually + params = create_all_parameters(cfg) + b_df = sim.simulate_baseline(params["baseline_params"]) + + # Black Friday 2023-11-24 multiplier was 3.5 + # Base is 1000, trend is growing. + bf_row = b_df.loc[b_df["date"] == "2023-11-24"] + assert bf_row["multiplier_impact"].values[0] == 3.5