DAI-Lab · pmfeen · May 6, 2026 · Oct 3, 2025 · Oct 3, 2025 · Oct 7, 2025
diff --git a/.gitignore b/.gitignore
@@ -106,8 +106,10 @@ ENV/
 .*.swp
 
 # Repository Specific
+runs/
 cents/data/*
 cents/data/pecanstreet/*
+cents/data/commercial/*
 cents/data/custom/
 .DS_Store
 .ipynb_checkpoints

diff --git a/cents/config/config.yaml b/cents/config/config.yaml
@@ -1,31 +1,5 @@
-defaults:
-  - model: null
-  - dataset: pecanstreet
-  - evaluator: default
-  - trainer: null
-  - _self_
-
 device: auto
-job_name: ${model.name}_${dataset.name}_${dataset.user_group}
-run_dir: outputs/${job_name}/${now:%Y-%m-%d_%H-%M-%S}
 model_ckpt: null
-hydra:
-  job_logging:
-    version: 1
-    formatters:
-      simple:
-        format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-    handlers:
-      console:
-        class: logging.StreamHandler
-        formatter: simple
-        level: INFO
-    root:
-      handlers: [console]
-      level: INFO
-  run:
-    dir: ${run_dir}
-
 
 wandb:
   enabled: false

diff --git a/cents/config/context/default.yaml b/cents/config/context/default.yaml
@@ -0,0 +1,20 @@
+# Context configuration
+# This file defines the context modules used across the codebase
+
+static_context:
+  type: mlp  # Options: "mlp", "sep_mlp", "transformer"
+  # TransformerStaticContextModule hyperparameters (ignored by mlp/sep_mlp):
+  # n_heads: 4
+  # n_layers: 2
+  # dropout: 0.1
+  # dim_feedforward: 256
+
+# Normalizer: configuration of the normalizer's stats head
+normalizer:
+  stats_head_type: mlp  # Stats head type (e.g., "mlp")
+  n_layers: 5
+  # hidden_dim: 512
+
+# Dynamic context: context module used by the normalizer for time series context variables
+dynamic_context:
+  type: null  # Context module type for dynamic context (e.g., "cnn")
diff --git a/cents/config/dataset/airquality.yaml b/cents/config/dataset/airquality.yaml
@@ -0,0 +1,59 @@
+name: airquality
+geography: null
+normalize: True
+scale: False
+use_learned_normalizer: True
+threshold: 8
+seq_len: 24
+shuffle: True
+skip_heavy_processing: False
+max_samples: null
+path: "./data/airquality"
+numeric_context_bins: 1
+reduce_cardinality: False
+time_series_dims: 1
+normalizer_stats_mode: group
+# Normalizer conditions only on these (e.g. per-station); diffusion still gets full context_vars
+normalizer_group_vars: ["station"]
+
+# Targets (what becomes the merged "timeseries" dims)
+# NOTE: use PMcoarse instead of PM10
+time_series_columns: ["PM2.5"]
+
+# Raw CSV columns to load
+# Keep wd/WSPM because we need them to engineer wind_u/wind_v
+# Keep PM10 because we need it to engineer PMcoarse
+data_columns:
+  - "No"
+  - "year"
+  - "month"
+  - "day"
+  - "hour"
+  - "PM2.5"
+  - "PM10"
+  - "SO2"
+  - "NO2"
+  - "CO"
+  - "TEMP"
+  - "DEWP"
+  - "PRES"
+  - "RAIN"
+  - "WSPM"
+  - "wd"
+  - "station"
+
+context_vars:
+  # static categorical
+  year: ["categorical", 5]
+  month: ["categorical", 12]
+  weekday: ["categorical", 7]
+  station: ["categorical", 12]
+
+  # dynamic time-series context
+  TEMP: ["time_series", null]
+  DEWP: ["time_series", null]
+  PRES: ["time_series", null]
+  RAIN: ["time_series", null]
+  wind_u: ["time_series", null]
+  wind_v: ["time_series", null]
+  wd_valid: ["time_series", null]
diff --git a/cents/config/dataset/commercial.yaml b/cents/config/dataset/commercial.yaml
@@ -0,0 +1,30 @@
+name: commercial
+geography: null
+user_group: all
+normalize: True
+scale: False
+use_learned_normalizer: True
+threshold: 8
+seq_len: 24
+time_series_dims: 1
+shuffle: True
+skip_heavy_processing: False  # Skip rarity computation (for faster loading/DDP)
+max_samples: null  # Limit dataset size (null = use all data)
+path: "./data/commercial/csv"
+time_series_columns: ["energy_meter"]  # single target column; list form as in the other dataset configs
+data_columns: ["dataid","energy_meter","timestamp"]  # NOTE(review): id column is "dataid" here but "building_id" in metadata_columns; confirm the join key
+metadata_columns: ["building_id", "site_id", "primaryspaceusage", "sqft", "yearbuilt", "sub_primaryspaceusage"]
+numeric_context_bins: 5
+reduce_cardinality: False
+normalizer_stats_mode: group
+normalizer_group_vars: null
+
+context_vars:  # each entry: [variable type, number of categories]
+  year: ["categorical", 2]
+  month: ["categorical", 12]
+  weekday: ["categorical", 7]
+  site_id: ["categorical", 19]
+  primaryspaceusage: ["categorical", 16]
+  sqft: ["categorical", null]  # NOTE(review): no explicit count; presumably binned via numeric_context_bins (5), confirm
+  yearbuilt: ["categorical", null]  # NOTE(review): same as sqft, confirm
+  sub_primaryspaceusage: ["categorical", 104]
diff --git a/cents/config/dataset/default.yaml b/cents/config/dataset/default.yaml
@@ -1,13 +1,15 @@
-name: default
-normalize: True
-scale: True
-use_learned_normalizer: True
-shuffle: True
-threshold: 6
-time_series_dims: 1
-time_series_columns: []
-seq_len: 8
-user_group: null
+# name: default
+# normalize: True
+# scale: True
+# use_learned_normalizer: True
+# shuffle: True
+# threshold: 6
+# time_series_dims: 1
+# time_series_columns: []
+# seq_len: 8
+# user_group: null
 
-numeric_context_bins: 5
-context_vars: {}
+# numeric_context_bins: 5
+# context_vars: {}  # Dict mapping variable names to category counts (for categorical) or placeholders (for continuous)
+# continuous_context_vars: []  # Optional: list of variable names that should be kept as continuous (not binned)
+# stats_head_type: mlp
diff --git a/cents/config/dataset/metraq.yaml b/cents/config/dataset/metraq.yaml
@@ -0,0 +1,54 @@
+name: metraq
+geography: null
+normalize: True
+scale: False
+use_learned_normalizer: True
+threshold: 8
+seq_len: 24
+shuffle: True
+skip_heavy_processing: False
+max_samples: null
+path: "./data/metraq"
+numeric_context_bins: 1
+reduce_cardinality: False
+time_series_dims: 1
+normalizer_stats_mode: group
+# Normalizer conditions only on these (e.g. per-station); diffusion still gets full context_vars
+normalizer_group_vars: ["sensor_name"]
+max_z_threshold: 15.0
+
+# Targets (what becomes the merged "timeseries" dims)
+# NOTE: use PMcoarse instead of PM10
+time_series_columns: ["PM2.5"]
+
+# Raw CSV columns to load
+# NOTE(review): wd/WSPM/PM10 (the columns named in airquality.yaml) are not loaded here;
+# variables presumably arrive in long form via magnitude_name/value. Confirm and update.
+data_columns:
+  - "entry_date"
+  - "magnitude_name"
+  - "sensor_name"
+  - "value"
+  # - "utm_x"
+  # - "utm_y"
+
+context_vars:
+  # static categorical
+  year: ["categorical", 6]
+  month: ["categorical", 12]
+  weekday: ["categorical", 7]
+  sensor_name: ["categorical", 24]
+  # utm_x: ["continuous", null]
+  # utm_y: ["continuous", null]
+
+  # dynamic time-series context
+  # WS and WD are decomposed into wind_u/wind_v in preprocessing to handle
+  # the circularity of wind direction (WD=355° ≈ WD=5°, but z-score would give opposite signs).
+  T: ["time_series", null]
+  # wind_u: ["time_series", null]
+  # wind_v: ["time_series", null]
+  # wd_valid: ["time_series", null]
+  # RH, AP, R dropped — per-sample correlation with PM2.5 < 0.025 across all stations
+  # Traffic: TI = vehicles/hour (Kriging interpolation); SP = avg speed km/h
+  TI: ["time_series", null]
+  SP: ["time_series", null]
diff --git a/cents/config/dataset/pecanstreet.yaml b/cents/config/dataset/pecanstreet.yaml
@@ -7,20 +7,24 @@ threshold: 8
 seq_len: 96
 time_series_dims: 1
 shuffle: True
+skip_heavy_processing: False  # Skip rarity computation (for faster loading/DDP)
+max_samples: null  # Limit dataset size (null = use all data)
 path: "./data/pecanstreet/csv"
-time_series_columns: ["grid", "solar"]
+time_series_columns: ["grid"]
 data_columns: ["dataid","local_15min","car1","grid","solar"]
 metadata_columns: ["dataid","building_type","solar","car1","city","state","total_square_footage","house_construction_year"]
 user_group: all # non_pv_users, all, pv_users
 numeric_context_bins: 5
+normalizer_stats_mode: group
 
-context_vars: # for each desired context variable, add the name and number of categories
-  month: 12
-  weekday: 7
-  building_type: 3
-  has_solar: 2 # note that the metadata csv file column name is 'solar', which is renamed to avoid conflicts with the 'solar' column in the data csv.
-  car1: 2
-  city: 7
-  state: 3
-  total_square_footage: 5
-  house_construction_year: 5
+
+context_vars:  # each entry: [variable type, number of categories]
+  month: ["categorical", 12]
+  weekday: ["categorical", 7]
+  building_type: ["categorical", 3]
+  has_solar: ["categorical", 2]  # metadata csv column 'solar', renamed to avoid conflicts with the 'solar' column in the data csv
+  car1: ["categorical", 2]
+  city: ["categorical", 7]
+  state: ["categorical", 3]
+  total_square_footage: ["categorical", null]  # NOTE(review): no explicit count; presumably binned via numeric_context_bins (5), confirm
+  house_construction_year: ["categorical", null]  # NOTE(review): same as total_square_footage, confirm
diff --git a/cents/config/dataset/walmart.yaml b/cents/config/dataset/walmart.yaml
@@ -0,0 +1,37 @@
+name: walmart
+geography: null
+normalize: True
+scale: False
+use_learned_normalizer: True
+threshold: 8
+seq_len: 28
+shuffle: True
+skip_heavy_processing: False
+max_samples: null
+path: "./data/walmart"
+numeric_context_bins: 1
+reduce_cardinality: False
+time_series_dims: 1
+normalizer_stats_mode: group
+# Normalizer conditions on category × store to capture per-group sales distributions
+normalizer_group_vars: ["cat_id", "store_id"]
+max_z_threshold: 15.0
+
+# Target: daily unit sales
+time_series_columns: ["sales"]
+
+context_vars:
+  # Static categorical — characterise the window by when it starts
+  year: ["categorical", 6]       # 2011–2016
+  month: ["categorical", 12]
+  # Static categorical — item / store identity
+  cat_id: ["categorical", 3]     # FOODS, HOBBIES, HOUSEHOLD
+  dept_id: ["categorical", 7]    # e.g. FOODS_1 … HOUSEHOLD_2
+  store_id: ["categorical", 10]  # CA_1 … WI_3
+  state_id: ["categorical", 3]   # CA, TX, WI
+
+  # Dynamic time-series context (co-occurring with target, length = seq_len)
+  sell_price: ["time_series", null]    # weekly price broadcast to daily, z-scored
+  snap: ["time_series", null]          # binary SNAP eligibility for the item's state
+  event_binary: ["time_series", null]  # 1 if a named calendar event falls on that day
+  weekday: ["time_series", null]        # day of week encoded as 0 (Mon) – 6 (Sun), z-scored
diff --git a/cents/config/evaluator/airquality.yaml b/cents/config/evaluator/airquality.yaml
@@ -0,0 +1,32 @@
+model:
+  name: diffusion_ts
+dataset:
+  name: airquality
+eval_pv_shift: False
+eval_metrics: True
+eval_context_sparse: True
+save_results: False
+eval_disentanglement: True
+eval_context_recovery: True
+job_name: diffusion_ts_airquality
+save_dir: outputs/diffusion_ts_airquality/eval
+
+# Context Faithfulness Score (CFS) and Granger Causality Preservation (GCP).
+# Runs only when enabled=True AND either:
+#   - the generated signal has multiple dimensions (multivariate), OR
+#   - the dataset uses dynamic (time-series) context variables.
+#
+# pairs: list of {x, c} dicts specifying which time series to evaluate against each other.
+#   x — name of a generated output dimension (must match time_series_columns in dataset config)
+#   c — name of a dynamic context variable (from context_vars with type "time_series")
+#         OR another generated output dimension (multivariate case, GCP only)
+#
+# CFS is computed only when c is a dynamic context variable (shared context).
+# GCP is computed for all pairs.
+#
+eval_context_faithfulness:
+  enabled: true
+  gcp_max_lag: 5
+  pairs:
+    - {x: "PM2.5", c: "TEMP"}
+    - {x: "PM2.5", c: "DEWP"}
diff --git a/cents/config/evaluator/default.yaml b/cents/config/evaluator/default.yaml
@@ -1,7 +1,40 @@
-model_name: ${model.name}
+model:
+  name: diffusion_ts  # Set this to your model name
+dataset:
+  name: commercial  # Set this to your dataset name (e.g., "commercial")
 eval_pv_shift: False
 eval_metrics: True
+pred_score_trtr: True  # If True, also trains on real data (TRTR) and reports MAE delta alongside TSTR MAE
 eval_context_sparse: True
 save_results: False
 eval_disentanglement: True
-save_dir: ${run_dir}/eval
+eval_context_recovery: True
+job_name: diffusion_ts_commercial
+save_dir: outputs/diffusion_ts_commercial/eval
+
+# Context Faithfulness Score (CFS) and Granger Causality Preservation (GCP).
+# Runs only when enabled=True AND either:
+#   - the generated signal has multiple dimensions (multivariate), OR
+#   - the dataset uses dynamic (time-series) context variables.
+#
+# pairs: list of {x, c} dicts specifying which time series to evaluate against each other.
+#   x — name of a generated output dimension (must match time_series_columns in dataset config)
+#   c — name of a dynamic context variable (from context_vars with type "time_series")
+#         OR another generated output dimension (multivariate case, GCP only)
+#
+# CFS is computed only when c is a dynamic context variable (shared context).
+# GCP is computed for all pairs.
+#
+# Example for airquality dataset (PM2.5 generated, TEMP/DEWP as context):
+#   pairs:
+#     - {x: "PM2.5", c: "TEMP"}
+#     - {x: "PM2.5", c: "DEWP"}
+#
+eval_context_faithfulness:
+  enabled: true
+  gcp_max_lag: 5
+  pairs:  # NOTE(review): x/c below are metraq names (PM2.5, T, TI, SP) but dataset.name above is commercial (target: energy_meter); confirm
+    - {x: "PM2.5", c: "T"}
+    - {x: "PM2.5", c: "TI"}
+    - {x: "PM2.5", c: "SP"}
+  #   - {x: "PM2.5", c: "TEMP"}