diff --git a/config/config_cerra_crps_decode.yml b/config/config_cerra_crps_decode.yml
new file mode 100644
index 000000000..ae4f24258
--- /dev/null
+++ b/config/config_cerra_crps_decode.yml
@@ -0,0 +1,248 @@
+# (C) Copyright 2025 WeatherGenerator contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+# =====================================================================================
+# CERRA CRPS decoding from the frozen pretrained encoder `yqo6nvmy-pretrain`.
+#
+# Goal: attach a CRPS decoder to the pretrained encoder and probe how well CERRA can be
+# predicted at the *same* (forecast) step as the input window.
+#
+# - Encoder loaded from yqo6nvmy-pretrain and FROZEN (linear-probe style).
+# - Inputs = the pretrain_multi_data_all_years streams, now all marked `forcing`.
+# - CERRA = `diagnostic` output stream (contributes no input tokens), trained with CRPS.
+# - Reconstruction / masking training mode; student-teacher (SSL) loss disabled.
+#
+# IMPORTANT: every architecture parameter below is copied verbatim from the
+# yqo6nvmy-pretrain checkpoint config so that load_chkpt (strict=False) can match the
+# encoder/forecast-engine weights. Do NOT change these unless you also retrain the
+# encoder — a shape mismatch will crash load_state_dict.
+# =====================================================================================
+
+
+decoder_type: PerceiverIOCoordConditioning
+pred_adapter_kv: False
+pred_self_attention: True
+pred_dyadic_dims: False
+pred_mlp_adaln: True
+num_class_tokens: 0
+num_register_tokens: 64
+
+fe_num_blocks: 0
+fe_layer_norm_after_blocks: []
+fe_impute_latent_noise_std: 0.0
+forecast_att_dense_rate: 1.0
+
+healpix_level: 5
+
+rope_2D: True
+
+mlp_type: swiglu
+qk_norm_type: RMSNorm
+use_xsa: True
+with_step_conditioning: True
+noise_pre_predictor_std: 0
+
+with_mixed_precision: True
+with_flash_attention: True
+compile_model: False
+# The CRPS decoder decodes all M ensemble members in a SINGLE varlen call
+# (model.py predict_decoders): each member's B*H query groups pair with its own KV groups,
+# so the decoder params are used exactly once per forward — no per-member reuse. Because
+# there is no reuse, per-block activation checkpointing is DDP-safe (it is the combination
+# reuse + checkpointing that trips the reducer's "marked ready twice"), and it keeps peak
+# memory low. The single call launches M*B*H attention groups, which must stay under the
+# CUDA grid-dim cap (65535) — true for M<=4 at healpix-5 with B=1, hence members capped at 4.
+# static_graph is NOT usable here: predict_decoders has data-dependent branches (P==0
+# skip, NaN guard) so the participating-param set varies per iteration.
+# (config_forecasting_crps.yml uses FSDP instead, which has none of these constraints.)
+with_fsdp: False
+decoder_checkpointing: True          # per-block checkpoint on — safe: members batched, no reuse
+ddp_find_unused_parameters: False    # every trainable param must participate in the loss —
+                                     # freeze leftover modules rather than masking them
+attention_dtype: bf16
+mixed_precision_dtype: bf16
+mlp_norm_eps: 1e-5
+norm_eps: 1e-4
+
+latent_noise_kl_weight: 0.0
+latent_noise_gamma: 2.0
+latent_noise_saturate_encodings: 5
+latent_noise_use_additive_noise: False
+latent_noise_deterministic_latents: True
+
+# ---- CRPS decoder (the merged work) ----------------------------------------------
+# Ensemble members are produced by perturbing the latent tokens before decoding.
+# kernel_crps requires >1 member. pred_head.ens_size stays 1 in the stream config.
+latent_perturbation_num_members: 4     # capped at 4: the single decode call launches M*B*H attention groups, which must stay under the CUDA grid-dim cap (65535)
+latent_perturbation_sigma_init: 0.05   # ASSUMPTION: initial latent noise std
+latent_perturbation_sigma_learnable: True
+
+# ASSUMPTION: freeze the whole encoder (probe the frozen representation). Set to "" to
+# fine-tune end-to-end, or narrow the regex to unfreeze parts.
+freeze_modules: ".*encoder.*|.*deep_ssl_fusion.*"  # optional extra alternative that would also freeze every module whose name does not contain CERRA: |^(?!.*CERRA).*$
+# freeze_modules: "^(encoder|forecast_engine|latent_heads|latent_pre_norm|deep_ssl_fusion|deep_ssl_level_projections|embed_target_coords_(?!CERRA$).+|TargetPredictionEngine_(?!CERRA$).+|TargetPredictionEngineClassic_(?!CERRA$).+|BilinearDecoder_(?!CERRA$).+|EnsPredictionHead_(?!CERRA$).+)$"
+
+norm_type: "LayerNorm"
+
+#####################################
+
+streams_directory: "./config/streams/pretrain_multi_data_all_years_cerra/"
+streams: ???
+
+# type of zarr_store
+zarr_store: "zip"
+
+general:
+
+  # mutable parameters
+  istep: 0
+  rank: ???
+  world_size: ???
+
+  with_ddp: True
+
+  # zarr-v3's asyncio event loop is not fork-safe; forked DataLoader workers abort
+  # during CERRA validation. "spawn" gives each worker a fresh interpreter + event loop.
+  multiprocessing_method: "spawn"
+
+  desc: ""
+  run_id: ???
+  run_history: []
+
+# logging frequency in the training loop (in number of batches)
+train_logging:
+  terminal: 10
+  metrics: 20
+  checkpoint: 250
+
+# parameters for data loading
+data_loading :
+
+  num_workers: 12
+  rng_seed: ???
+  repeat_data_in_mini_epoch : False
+  memory_pinning: True
+  # keep spawned workers alive across mini-epochs / validation calls to avoid paying
+  # the (expensive) spawn startup + reader re-open every time.
+  persistent_workers: True
+
+
+# config for training
+training_config:
+
+  # reconstruction / masking mode (NOT student_teacher, NOT latent_loss)
+  training_mode: ["masking"]
+
+  # ASSUMPTION: training budget. Tune to taste; encoder is frozen so only the decoder
+  # (+ latent perturbation sigma) is optimised.
+  num_mini_epochs: 32
+  samples_per_mini_epoch: 4096
+  shuffle: True
+
+  # ASSUMPTION: CERRA is available 1985-2023; reuse the era5_cerra train/val split.
+  start_date: 1985-01-01T00:00
+  end_date: 2022-12-31T00:00
+
+  time_window_step: 06:00:00
+  time_window_len: 06:00:00
+
+  learning_rate_scheduling :
+    lr_start: 1e-6
+    lr_max: 5e-5
+    lr_final_decay: 2e-6
+    lr_final: 0.0
+    num_steps_warmup: 256
+    num_steps_cooldown: 512
+    policy_warmup: "cosine"
+    policy_decay: "constant"
+    policy_cooldown: "linear"
+    parallel_scaling_policy: "sqrt"
+
+  optimizer:
+    grad_clip: 1.0
+    weight_decay: 0.1
+    log_grad_norms: False
+    adamw :
+      beta1 : 0.98125
+      beta2 : 0.9875
+      eps : 2e-08
+
+  # CERRA decoded with the CRPS (kernel_crps) loss; SSL student-teacher disabled.
+  losses : {
+    "student-teacher": {
+        enabled: False,
+        type: Disabled,
+        },
+    "physical": {
+        type: LossPhysical,
+        # "mse": null removes the mse loss inherited from default_config so only CRPS trains
+        loss_fcts: { "mse": null, "kernel_crps": { }, },
+        },
+    }
+
+  # NOTE: when launched via `train_continue --from-run-id yqo6nvmy-pretrain`, the SSL
+  # pretrain config is the base, so its `random_easy` model_input / `random_easy_target`
+  # target_input are inherited and MUST be disabled here. Otherwise model_input and
+  # target_input end up with mismatched lengths and mask building asserts. (Harmless on
+  # the plain `train` path, where these keys are absent.)
+  model_input: {
+    "random_easy" : {
+      enabled: False,
+      num_samples: 0,
+      },
+    "forecasting" : {
+      masking_strategy: "forecast",
+      },
+    }
+
+  target_input: {
+    "random_easy_target" : {
+      enabled: False,
+      },
+    }
+
+  # ASSUMPTION: "same forecast step" => offset 0 + num_steps 0, i.e. decode CERRA at the
+  # input window (auto-encoder/diagnostic), matching the pretrain encoder (num_steps=0).
+  forecast :
+      time_step: 06:00:00
+      offset: 0
+      num_steps: 0
+      policy: "fixed"
+
+
+# validation config; full validation config is merge of training and validation config
+validation_config:
+
+  samples_per_mini_epoch: 256
+  shuffle: False
+
+  start_date: 2023-10-01T00:00
+  end_date: 2023-12-31T00:00
+
+  # no SSL here, so no EMA teacher needed for validation
+  validate_with_ema:
+    enabled : False
+    ema_ramp_up_ratio: 0.09
+    ema_halflife_in_thousands: 1e-3
+
+  output : {
+    num_samples: 0,
+    normalized_samples: False,
+    streams: null,
+    }
+
+  validate_before_training: False
+
+
+# Tags for experiment tracking
+wgtags:
+  org: null
+  issue: null
+  exp: "cerra_crps_decode"
+  grid: null
diff --git a/config/config_forecasting.yml b/config/config_forecasting.yml
index 2f91f8097..31becbe33 100644
--- a/config/config_forecasting.yml
+++ b/config/config_forecasting.yml
@@ -66,7 +66,6 @@ forecast_att_dense_rate: 1.0
 
 healpix_level: 5
 
-<<<<<<< HEAD
 # Generalized RoPE selector.
 rope_mode: none  # one of: none, 2d, spherical
 # Optional spherical harmonic band for spherical RoPE. If null, the model picks one
@@ -74,9 +73,6 @@ rope_mode: none  # one of: none, 2d, spherical
 rope_spherical_band: null
 mlp_type: swiglu
 use_xsa: True
-=======
-rope_2D: False
->>>>>>> origin/develop
 
 with_mixed_precision: True
 with_flash_attention: True
diff --git a/config/config_forecasting_crps.yml b/config/config_forecasting_crps.yml
new file mode 100644
index 000000000..ee8d59104
--- /dev/null
+++ b/config/config_forecasting_crps.yml
@@ -0,0 +1,256 @@
+# (C) Copyright 2025 WeatherGenerator contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+embed_orientation: "channels"
+embed_unembed_mode: "block"
+embed_dropout_rate: 0.1
+
+ae_local_dim_embed: 2048
+ae_local_num_blocks: 2
+ae_local_num_heads: 16
+ae_local_dropout_rate: 0.1
+ae_local_with_qk_lnorm: True
+
+ae_local_num_queries: 1
+ae_local_queries_per_cell: False
+ae_adapter_num_heads: 16
+ae_adapter_embed: 128
+ae_adapter_with_qk_lnorm: True
+ae_adapter_with_residual: True
+ae_adapter_dropout_rate: 0.1
+
+ae_global_dim_embed: 2048
+ae_global_num_blocks: 4
+ae_global_num_heads: 32
+ae_global_dropout_rate: 0.1
+ae_global_with_qk_lnorm: True
+# TODO: switching to < 1 triggers triton-related issues.
+# See https://github.com/ecmwf/WeatherGenerator/issues/1050
+ae_global_att_dense_rate: 1.0
+ae_global_block_factor: 64
+ae_global_mlp_hidden_factor: 2
+ae_global_trailing_layer_norm: False
+
+ae_aggregation_num_blocks: 2
+ae_aggregation_num_heads: 32
+ae_aggregation_dropout_rate: 0.1
+ae_aggregation_with_qk_lnorm: True
+ae_aggregation_att_dense_rate: 1.0
+ae_aggregation_block_factor: 64
+ae_aggregation_mlp_hidden_factor: 2
+
+decoder_type: PerceiverIOCoordConditioning #  Main options PerceiverIOCoordConditioning or Linear
+pred_adapter_kv: False
+pred_self_attention: True
+pred_dyadic_dims: False
+pred_mlp_adaln: True
+num_class_tokens: 0
+num_register_tokens: 0
+
+# NOTE: training_config.forecast.offset is the number of steps by which the first target window is
+# offset; if it is set to zero and forecast.num_steps=0 then one is training an auto-encoder
+fe_num_blocks: 16
+fe_num_heads: 16
+fe_dropout_rate: 0.1
+fe_with_qk_lnorm: True
+fe_layer_norm_after_blocks: [7]  # Index starts at 0. Thus, [3] adds a LayerNorm after the fourth layer
+fe_impute_latent_noise_std: 1e-4
+# currently fixed to 1.0 (due to limitations with flex_attention and triton)
+forecast_att_dense_rate: 1.0
+
+healpix_level: 5
+
+rope_2D: False
+
+with_mixed_precision: True
+with_flash_attention: True
+compile_model: False
+with_fsdp: True
+attention_dtype: bf16
+mixed_precision_dtype: bf16
+mlp_norm_eps: 1e-5
+norm_eps: 1e-4
+
+latent_noise_kl_weight: 0.0 # 1e-5
+latent_noise_gamma: 2.0
+latent_noise_saturate_encodings: 5 
+latent_noise_use_additive_noise: False
+latent_noise_deterministic_latents: True 
+
+# Latent Gaussian perturbation for probabilistic forecasting (CRPS training)
+# Set num_members > 1 to enable; use kernel_crps loss.
+latent_perturbation_num_members: 5    # 0 / 1 = disabled; >1 = number of ensemble members
+latent_perturbation_sigma_init: 0.05  # initial noise standard deviation
+latent_perturbation_sigma_learnable: True  # true = nn.Parameter; false = fixed buffer
+
+freeze_modules: ""
+load_chkpt: {}
+
+norm_type: "LayerNorm"
+
+#####################################
+
+streams_directory: "./config/streams/era5_1deg_forecasting/"
+streams: ???
+
+# type of zarr_store
+zarr_store: "zip" # "zarr" for LocalStore, "zip" for ZipStore
+
+general:
+
+  # mutable parameters
+  istep: 0
+  rank: ???
+  world_size: ???
+
+  # local_rank, 
+  with_ddp: False
+  # data_path_*, 
+  # model_path, 
+  # run_path, 
+  # path_shared_
+  
+  multiprocessing_method: "fork"
+
+  desc: ""
+  run_id: ???
+  run_history: []
+
+# logging frequency in the training loop (in number of batches)
+train_logging:
+  terminal: 10
+  metrics: 20
+  checkpoint: 250
+
+# parameters for data loading
+data_loading :
+
+  num_workers: 12
+  rng_seed: ???
+  repeat_data_in_mini_epoch : False
+
+
+# config for training
+training_config:
+  
+  # training_mode: "masking", "student_teacher", "latent_loss"
+  training_mode: ["masking"]
+
+  num_mini_epochs: 100
+  samples_per_mini_epoch: 1
+  shuffle: False
+
+  start_date: 1979-01-01T00:00
+  end_date: 2022-12-31T00:00
+
+  time_window_step: 06:00:00
+  time_window_len: 06:00:00
+  
+  learning_rate_scheduling :
+    lr_start: 1e-6
+    lr_max: 5e-5
+    lr_final_decay: 2e-6
+    lr_final: 0.0
+    num_steps_warmup: 256
+    num_steps_cooldown: 512
+    policy_warmup: "cosine"
+    policy_decay: "constant"
+    policy_cooldown: "linear"
+    parallel_scaling_policy: "sqrt"
+
+  optimizer:
+    grad_clip: 1.0
+    weight_decay: 0.1
+    log_grad_norms: False
+    adamw :
+      # parameters are scaled by number of DDP workers
+      beta1 : 0.98125 # == 0.85 on 2 nodes x 4 gpus
+      beta2 : 0.9875  # == 0.90 on 2 nodes x 4 gpus
+      eps : 2e-08
+
+  losses : {
+    "physical": {
+        type: LossPhysical,
+        loss_fcts: {"kernel_crps": { }, },
+        },
+    }
+  
+  model_input: {
+    "forecasting" : {
+      # masking strategy: "random", "healpix", "forecast"
+      masking_strategy: "forecast",
+      },
+    }
+
+  forecast :
+      time_step: 06:00:00
+      offset: 1
+      num_steps: 3
+      policy: "fixed"
+
+
+# validation config; full validation config is merge of training and validation config
+validation_config: 
+
+  samples_per_mini_epoch: 8
+  shuffle: False
+
+  start_date: 2023-10-01T00:00
+  end_date: 2023-12-31T00:00
+  
+  # whether to track the exponential moving average of weights for validation
+  validate_with_ema: 
+    enabled : False
+    ema_ramp_up_ratio: 0.09
+    ema_halflife_in_thousands: 1e-3
+
+  # parameters for validation samples that are written to disk
+  output : {
+    # number of samples that are written
+    num_samples: 0,
+    # write samples in normalized model space
+    normalized_samples: False,
+    # output streams to write; default all
+    streams: null,
+    }
+
+  # run validation before training starts (mainly for model development)
+  validate_before_training: False
+
+
+# test config; full test config is merge of validation and test config
+# test config is used by default when running inference
+
+# Tags for experiment tracking
+# These tags will be logged in MLFlow along with completed runs for train, eval, val
+# The tags are free-form, with the following rules:
+# - tags should be primitive types (strings, numbers, booleans). NO lists or dictionaries
+# - tags should not duplicate existing config entries.
+# - try to reuse existing tags where possible. MLFlow does not like having too many unique tags
+# - do not use long strings in values (less than 20 characters is a good rule of thumb, we may enforce this in the future)
+wgtags:
+  # The name of the organization of the person running the experiment.
+  # This may be autofilled in the future. Expected values are lowercase strings 
+  # e.g. "ecmwf", "cmcc", "metnor", "jsc", "escience"
+  org: null
+  # The Github issue corresponding to this run (number such as 1234)
+  # Github issues are the central point when running experiment and contain 
+  # links to hedgedocs, code branches, pull requests etc.
+  # It is recommended to associate a run with a Github issue.
+  issue: null
+  # The name of the experiment. This is a distinctive codename for the experiment campaign being run.
+  # This is expected to be the primary tag for comparing experiments in MLFlow, along with the
+  # issue number.
+  # Expected values are lowercase strings with no spaces, just underscores:
+  # Examples: "rollout_ablation_grid"  
+  exp: null
+  # *** Experiment-specific tags ***
+  # All extra tags (including lists, dictionaries, etc.) are treated 
+  # as strings by mlflow, so treat all extra tags as simple string key: value pairs.
+  grid: null
diff --git a/config/config_forecasting_crps_debug.yml b/config/config_forecasting_crps_debug.yml
new file mode 100644
index 000000000..83939f4fc
--- /dev/null
+++ b/config/config_forecasting_crps_debug.yml
@@ -0,0 +1,259 @@
+# (C) Copyright 2025 WeatherGenerator contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+embed_orientation: "channels"
+embed_unembed_mode: "block"
+embed_dropout_rate: 0.1
+
+ae_local_dim_embed: 2048
+ae_local_num_blocks: 2
+ae_local_num_heads: 16
+ae_local_dropout_rate: 0.1
+ae_local_with_qk_lnorm: True
+
+ae_local_num_queries: 1
+ae_local_queries_per_cell: False
+ae_adapter_num_heads: 16
+ae_adapter_embed: 128
+ae_adapter_with_qk_lnorm: True
+ae_adapter_with_residual: True
+ae_adapter_dropout_rate: 0.1
+
+ae_global_dim_embed: 2048
+ae_global_num_blocks: 4
+ae_global_num_heads: 32
+ae_global_dropout_rate: 0.1
+ae_global_with_qk_lnorm: True
+# TODO: switching to < 1 triggers triton-related issues.
+# See https://github.com/ecmwf/WeatherGenerator/issues/1050
+ae_global_att_dense_rate: 1.0
+ae_global_block_factor: 64
+ae_global_mlp_hidden_factor: 2
+ae_global_trailing_layer_norm: False
+
+ae_aggregation_num_blocks: 2
+ae_aggregation_num_heads: 32
+ae_aggregation_dropout_rate: 0.1
+ae_aggregation_with_qk_lnorm: True
+ae_aggregation_att_dense_rate: 1.0
+ae_aggregation_block_factor: 64
+ae_aggregation_mlp_hidden_factor: 2
+
+decoder_type: PerceiverIOCoordConditioning #  Main options PerceiverIOCoordConditioning or Linear
+pred_adapter_kv: False
+pred_self_attention: True
+pred_dyadic_dims: False
+pred_mlp_adaln: True
+num_class_tokens: 0
+num_register_tokens: 0
+
+# NOTE: training_config.forecast.offset is the number of steps by which the first target window is
+# offset; if it is set to zero and forecast.num_steps=0 then one is training an auto-encoder
+fe_num_blocks: 16
+fe_num_heads: 16
+fe_dropout_rate: 0.1
+fe_with_qk_lnorm: True
+fe_layer_norm_after_blocks: [7]  # Index starts at 0. Thus, [3] adds a LayerNorm after the fourth layer
+fe_impute_latent_noise_std: 1e-4
+# currently fixed to 1.0 (due to limitations with flex_attention and triton)
+forecast_att_dense_rate: 1.0
+
+healpix_level: 5
+
+rope_2D: False
+
+with_mixed_precision: True
+with_flash_attention: True
+compile_model: False
+with_fsdp: True
+attention_dtype: bf16
+mixed_precision_dtype: bf16
+mlp_norm_eps: 1e-5
+norm_eps: 1e-4
+
+latent_noise_kl_weight: 0.0 # 1e-5
+latent_noise_gamma: 2.0
+latent_noise_saturate_encodings: 5 
+latent_noise_use_additive_noise: False
+latent_noise_deterministic_latents: True 
+
+# Latent Gaussian perturbation for probabilistic forecasting (CRPS training)
+# Set num_members > 1 to enable; use kernel_crps loss.
+latent_perturbation_num_members: 5    # 0 / 1 = disabled; >1 = number of ensemble members
+latent_perturbation_sigma_init: 0.1  # initial noise standard deviation
+latent_perturbation_sigma_learnable: False  # true = nn.Parameter; false = fixed buffer
+latent_perturbation_sigma_lr: 0.0 #1.0e-2  # separate LR for log_sigma (~200x main peak LR); 0.0 = disabled
+latent_perturbation_sigma_override: null
+
+freeze_modules: ""
+load_chkpt: {}
+
+norm_type: "LayerNorm"
+
+#####################################
+
+streams_directory: "./config/streams/era5_1deg/"
+streams: ???
+
+# type of zarr_store
+zarr_store: "zip" # "zarr" for LocalStore, "zip" for ZipStore
+
+general:
+
+  # mutable parameters
+  istep: 0
+  rank: ???
+  world_size: ???
+
+  # local_rank, 
+  with_ddp: True
+  # data_path_*, 
+  # model_path, 
+  # run_path, 
+  # path_shared_
+  
+  multiprocessing_method: "fork"
+
+  desc: ""
+  run_id: ???
+  run_history: []
+
+# logging frequency in the training loop (in number of batches)
+train_logging:
+  terminal: 10
+  metrics: 20
+  checkpoint: 1000
+
+# parameters for data loading
+data_loading :
+
+  num_workers: 12
+  rng_seed: ???
+  repeat_data_in_mini_epoch : False
+
+
+# config for training
+training_config:
+  
+  # training_mode: "masking", "student_teacher", "latent_loss"
+  training_mode: ["masking"]
+
+  num_mini_epochs: 64
+  samples_per_mini_epoch: 4096
+  shuffle: True
+
+  start_date: 1979-01-01T00:00
+  end_date: 2022-12-31T00:00
+
+  time_window_step: 06:00:00
+  time_window_len: 06:00:00
+  
+  learning_rate_scheduling :
+    lr_start: 1e-6
+    lr_max: 5e-5
+    lr_final_decay: 2e-6
+    lr_final: 0.0
+    num_steps_warmup: 2048
+    num_steps_cooldown: 6144
+    policy_warmup: "cosine"
+    policy_decay: "constant"
+    policy_cooldown: "linear"
+    parallel_scaling_policy: "sqrt"
+
+  optimizer:
+    grad_clip: 1.0
+    weight_decay: 0.1
+    log_grad_norms: False
+    adamw :
+      # parameters are scaled by number of DDP workers
+      beta1 : 0.9604  # == 0.85 on 1 node x 4 gpus (0.85^(1/4))
+      beta2 : 0.9747  # == 0.90 on 1 node x 4 gpus (0.90^(1/4))
+      eps : 2e-08
+
+  losses : {
+    "physical": {
+        type: LossPhysical,
+        loss_fcts: {"mse": null, "kernel_crps": {}, },
+        },
+    }
+  
+  model_input: {
+    "forecasting" : {
+      # masking strategy: "random", "healpix", "forecast"
+      masking_strategy: "forecast",
+      },
+    }
+
+  forecast :
+      time_step: 06:00:00
+      offset: 1
+      num_steps: 3
+      policy: "fixed"
+
+
+# validation config; full validation config is merge of training and validation config
+validation_config: 
+
+  samples_per_mini_epoch: 256
+  shuffle: False
+
+  start_date: 2023-10-01T00:00
+  end_date: 2023-12-31T00:00
+  
+  # whether to track the exponential moving average of weights for validation
+  validate_with_ema: 
+    enabled : False
+    ema_ramp_up_ratio: 0.09
+    ema_halflife_in_thousands: 1e-3
+
+  # parameters for validation samples that are written to disk
+  output : {
+    # number of samples that are written
+    num_samples: 0,
+    # write samples in normalized model space
+    normalized_samples: False,
+    # output streams to write; default all
+    streams: null,
+    }
+
+  # run validation before training starts (mainly for model development)
+  validate_before_training: False
+
+
+# test config; full test config is merge of validation and test config
+# test config is used by default when running inference
+
+# Tags for experiment tracking
+# These tags will be logged in MLFlow along with completed runs for train, eval, val
+# The tags are free-form, with the following rules:
+# - tags should be primitive types (strings, numbers, booleans). NO lists or dictionaries
+# - tags should not duplicate existing config entries.
+# - try to reuse existing tags where possible. MLFlow does not like having too many unique tags
+# - do not use long strings in values (less than 20 characters is a good rule of thumb, we may enforce this in the future)
+wgtags:
+  # The name of the organization of the person running the experiment.
+  # This may be autofilled in the future. Expected values are lowercase strings 
+  # e.g. "ecmwf", "cmcc", "metnor", "jsc", "escience"
+  org: null
+  # The Github issue corresponding to this run (number such as 1234)
+  # Github issues are the central point when running experiment and contain 
+  # links to hedgedocs, code branches, pull requests etc.
+  # It is recommended to associate a run with a Github issue.
+  issue: null
+  # The name of the experiment. This is a distinctive codename for the experiment campaign being run.
+  # This is expected to be the primary tag for comparing experiments in MLFlow, along with the
+  # issue number.
+  # Expected values are lowercase strings with no spaces, just underscores:
+  # Examples: "rollout_ablation_grid"  
+  exp: "crps_perturbation_v1"
+  # *** Experiment-specific tags ***
+  # All extra tags (including lists, dictionaries, etc.) are treated 
+  # as strings by mlflow, so treat all extra tags as simple string key: value pairs.
+  # Override per-run: m<num_members>_s<sigma_init>_l<t|f>
+  grid: null
diff --git a/config/config_forecasting_era5_cerra.yml b/config/config_forecasting_era5_cerra.yml
new file mode 100644
index 000000000..fd3f1fa9e
--- /dev/null
+++ b/config/config_forecasting_era5_cerra.yml
@@ -0,0 +1,250 @@
+# (C) Copyright 2025 WeatherGenerator contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+embed_orientation: "channels"
+embed_unembed_mode: "block"
+embed_dropout_rate: 0.1
+
+ae_local_dim_embed: 2048
+ae_local_num_blocks: 2 
+ae_local_num_heads: 16
+ae_local_dropout_rate: 0.1
+ae_local_with_qk_lnorm: True
+
+ae_local_num_queries: 1
+ae_local_queries_per_cell: False
+ae_adapter_num_heads: 16
+ae_adapter_embed: 128
+ae_adapter_with_qk_lnorm: True
+ae_adapter_with_residual: True
+ae_adapter_dropout_rate: 0.1
+
+ae_global_dim_embed: 2048
+ae_global_num_blocks: 4
+ae_global_num_heads: 32
+ae_global_dropout_rate: 0.1
+ae_global_with_qk_lnorm: True
+# TODO: switching to < 1 triggers triton-related issues.
+# See https://github.com/ecmwf/WeatherGenerator/issues/1050
+ae_global_att_dense_rate: 1.0
+ae_global_block_factor: 64
+ae_global_mlp_hidden_factor: 2
+ae_global_trailing_layer_norm: False
+
+ae_aggregation_num_blocks: 0
+ae_aggregation_num_heads: 32
+ae_aggregation_dropout_rate: 0.1
+ae_aggregation_with_qk_lnorm: True
+ae_aggregation_att_dense_rate: 1.0
+ae_aggregation_block_factor: 64
+ae_aggregation_mlp_hidden_factor: 2
+
+decoder_type: PerceiverIOCoordConditioning #  Main options PerceiverIOCoordConditioning or Linear
+pred_adapter_kv: False
+pred_self_attention: True
+pred_dyadic_dims: False
+pred_mlp_adaln: True
+num_class_tokens: 0
+num_register_tokens: 0
+
+# NOTE: training_config.forecast.offset is the number of steps by which the first target window is
+# offset; if it is set to zero and forecast.num_steps=0 then one is training an auto-encoder
+fe_num_blocks: 16
+fe_num_heads: 16
+fe_dropout_rate: 0.1
+fe_with_qk_lnorm: True
+fe_layer_norm_after_blocks: [7]  # Index starts at 0. Thus, [3] adds a LayerNorm after the fourth layer
+fe_impute_latent_noise_std: 1e-4
+# currently fixed to 1.0 (due to limitations with flex_attention and triton)
+forecast_att_dense_rate: 1.0
+
+healpix_level: 5
+
+rope_2D: False
+
+with_mixed_precision: True
+with_flash_attention: True
+compile_model: False
+with_fsdp: True
+attention_dtype: bf16
+mixed_precision_dtype: bf16
+mlp_norm_eps: 1e-5
+norm_eps: 1e-4
+
+latent_noise_kl_weight: 0.0 # 1e-5
+latent_noise_gamma: 2.0
+latent_noise_saturate_encodings: 5 
+latent_noise_use_additive_noise: False
+latent_noise_deterministic_latents: True 
+
+freeze_modules: ""
+#load_chkpt: {'run_id': 'qgnvcrky', 'mini_epoch': -1}
+
+norm_type: "LayerNorm"
+
+#####################################
+
+# streams_directory: "./config/streams/era5_cerra/"  # use "./config/streams/era5_1deg/" for ERA5-only training
+streams: ???
+
+# type of zarr_store
+zarr_store: "zip" # "zarr" for LocalStore, "zip" for ZipStore
+
+general:
+
+  # mutable parameters
+  istep: 0
+  rank: ???
+  world_size: ???
+
+  # local_rank, 
+  with_ddp: True
+  # data_path_*, 
+  # model_path, 
+  # run_path, 
+  # path_shared_
+  
+  multiprocessing_method: "fork"
+
+  desc: ""
+  run_id: ???
+  run_history: []
+
+# logging frequency in the training loop (in number of batches)
+train_logging:
+  terminal: 10
+  metrics: 20
+  checkpoint: 250
+
+# parameters for data loading
+data_loading :
+
+  num_workers: 6
+  rng_seed: ???
+  repeat_data_in_mini_epoch : False
+
+
+# config for training
+training_config:
+  
+  # training_mode: "masking", "student_teacher", "latent_loss"
+  training_mode: ["masking"]
+
+  num_mini_epochs: 64
+  samples_per_mini_epoch: 4096
+  shuffle: True
+
+  start_date: 1985-01-01T00:00
+  end_date: 2022-12-31T00:00
+
+  time_window_step: 06:00:00
+  time_window_len: 06:00:00
+  
+  learning_rate_scheduling :
+    lr_start: 1e-6
+    lr_max: 5e-5
+    lr_final_decay: 2e-6
+    lr_final: 0.0
+    num_steps_warmup: 256
+    num_steps_cooldown: 512
+    policy_warmup: "cosine"
+    policy_decay: "constant"
+    policy_cooldown: "linear"
+    parallel_scaling_policy: "sqrt"
+
+  optimizer:
+    grad_clip: 1.0
+    weight_decay: 0.1
+    log_grad_norms: False
+    adamw :
+      # parameters are scaled by number of DDP workers
+      beta1 : 0.98125 # == 0.85 on 2 nodes x 4 gpus
+      beta2 : 0.9875  # == 0.90 on 2 nodes x 4 gpus
+      eps : 2e-08
+
+  losses : {
+    "physical": {
+        type: LossPhysical,
+        loss_fcts: { "mse": { }, },
+        },
+    }
+  
+  model_input: {
+    "forecasting" : {
+      # masking strategy: "random", "healpix", "forecast"
+      masking_strategy: "forecast",
+      },
+    }
+
+  forecast :
+      time_step: 06:00:00
+      offset: 1
+      num_steps: 3
+      policy: "fixed"
+
+
+# validation config; full validation config is merge of training and validation config
+validation_config: 
+
+  samples_per_mini_epoch: 256
+  shuffle: False
+
+  start_date: 2023-10-01T00:00
+  end_date: 2023-12-31T00:00
+  
+  # whether to track the exponential moving average of weights for validation
+  validate_with_ema: 
+    enabled : True
+    ema_ramp_up_ratio: 0.09
+    ema_halflife_in_thousands: 1e-3
+
+  # parameters for validation samples that are written to disk
+  output : {
+    # number of samples that are written
+    num_samples: 0,
+    # write samples in normalized model space
+    normalized_samples: False,
+    # output streams to write; default all
+    streams: null,
+    }
+
+  # run validation before training starts (mainly for model development)
+  validate_before_training: False
+
+
+# test config; full test config is merge of validation and test config
+# test config is used by default when running inference
+
+# Tags for experiment tracking
+# These tags will be logged in MLFlow along with completed runs for train, eval, val
+# The tags are free-form, with the following rules:
+# - tags should be primitive types (strings, numbers, booleans). NO lists or dictionaries
+# - tags should not duplicate existing config entries.
+# - try to reuse existing tags where possible. MLFlow does not like having too many unique tags
+# - do not use long strings in values (less than 20 characters is a good rule of thumb, we may enforce this in the future)
+wgtags:
+  # The name of the organization of the person running the experiment.
+  # This may be autofilled in the future. Expected values are lowercase strings 
+  # e.g. "ecmwf", "cmcc", "metnor", "jsc", "escience"
+  org: "knmi-mf"
+  # The Github issue corresponding to this run (number such as 1234)
+  # Github issues are the central point when running experiment and contain 
+  # links to hedgedocs, code branches, pull requests etc.
+  # It is recommended to associate a run with a Github issue.
+  issue: null
+  # The name of the experiment. This is a distinctive codename for the experiment campaign being run.
+  # This is expected to be the primary tag for comparing experiments in MLFlow, along with the
+  # issue number.
+  # Expected values are lowercase strings with no spaces, just underscores:
+  # Examples: "rollout_ablation_grid"  
+  exp: ""
+  # *** Experiment-specific tags ***
+  # All extra tags (including lists, dictionaries, etc.) are treated 
+  # as strings by mlflow, so treat all extra tags as simple string key: value pairs.
+  grid: null
diff --git a/config/config_jepa_multi_data.yml b/config/config_jepa_multi_data.yml
index c14f957d9..8790e80cc 100644
--- a/config/config_jepa_multi_data.yml
+++ b/config/config_jepa_multi_data.yml
@@ -69,7 +69,7 @@ healpix_level: 5
 # Use 2D RoPE instead of traditional global positional encoding
 # When True: uses 2D RoPE based on healpix cell coordinates (lat/lon)
 # When False: uses traditional pe_global positional encoding
-rope_2D: True 
+rope_2D: True
 
 with_mixed_precision: True
 with_flash_attention: True
diff --git a/config/default_config.yml b/config/default_config.yml
index 67bcc3e76..655d0720b 100644
--- a/config/default_config.yml
+++ b/config/default_config.yml
@@ -89,6 +89,12 @@ latent_noise_saturate_encodings: 5
 latent_noise_use_additive_noise: False
 latent_noise_deterministic_latents: True
 
+# Latent Gaussian perturbation for probabilistic forecasting (CRPS training)
+# Set latent_perturbation_num_members > 1 to enable; use with the kernel_crps loss.
+latent_perturbation_num_members: 0    # 0 / 1 = disabled; >1 = number of ensemble members
+latent_perturbation_sigma_init: 0.01  # initial noise standard deviation
+latent_perturbation_sigma_learnable: true  # true = nn.Parameter; false = fixed buffer
+
 freeze_modules: ""
 load_chkpt: {}
 
diff --git a/config/evaluate/config_crps_debug.yml b/config/evaluate/config_crps_debug.yml
new file mode 100644
index 000000000..a23e15ed8
--- /dev/null
+++ b/config/evaluate/config_crps_debug.yml
@@ -0,0 +1,24 @@
+evaluation:
+  metrics: ["rmse", "bias", "crps", "spread", "ssr", "rank_histogram"]
+  regions: ["global"]
+  summary_plots: true
+  plot_ensemble: "members"   # visualise individual members
+  summary_dir: "./plots/crps_epoch2/"
+
+default_streams:
+  ERA5:
+    channels: ["2t", "10u", "10v", "z_500", "t_850", "u_850", "v_850", "q_850"]  # a few key variables
+    evaluation:
+      forecast_step: "all"
+      sample: "all"
+      ensemble: "all"      # required for all probabilistic metrics
+    plotting:
+      sample: [0]
+      forecast_step: "all" #supported: "all", [1,2,3,...], "1-50" (equivalent of [1,2,3,...50])
+      plot_maps: true
+      plot_histograms: true
+      plot_animations: true
+
+run_ids:
+  zt0vcpho:
+    label: "CRPS_s0.05_learnable_5members"
\ No newline at end of file
diff --git a/config/evaluate/config_crps_spread.yml b/config/evaluate/config_crps_spread.yml
new file mode 100644
index 000000000..f2e61227d
--- /dev/null
+++ b/config/evaluate/config_crps_spread.yml
@@ -0,0 +1,50 @@
+evaluation:
+  metrics: ["rmse", "spread"]
+  regions: ["global"]
+  summary_plots: true
+  plot_score_maps: true
+  plot_ensemble: "members"   # visualise individual members
+  summary_dir: "./plots/crps_spread_v3/"
+
+default_streams:
+  ERA5:
+    channels: ["2t", "10u", "10v", "z_500", "t_850", "u_850", "v_850", "q_850"]  # a few key variables
+    evaluation:
+      forecast_step: "all"
+      sample: "all"
+      ensemble: "all"      # required for all probabilistic metrics
+    plotting:
+      sample: [0,1,2,3]
+      forecast_step: "all" #supported: "all", [1,2,3,...], "1-50" (equivalent of [1,2,3,...50])
+      ensemble: "std"      #supported: "all", "mean", "std", [0,1,2]
+      plot_maps: true
+      plot_histograms: true
+      plot_animations: true
+
+run_ids:
+  xf74qbhg:
+    label: "CRPS_s0.05_learnable_5members"
+  bg93vcya:
+   label: "CRPS_s0.01_learnable_5members"
+  b7p4dbak:
+   label: "CRPS_s0.1_learnable_5members"
+  l6ji4cvl:
+    label: "CRPS_s0.1_learnable_5members_10_more_epochs"
+  xhl8j3wa:
+   label: "CRPS_s0.05_learnable_2members"
+  g2amz7ou:
+    label: "CRPS_s0.05_learnable_10members"
+  a9wa1rvq:
+    label: "CRPS_s0.01_fixed_5members"
+  ne54k2fa:
+    label: "CRPS_s0.05_fixed_5members"
+  g9z7j8ei:
+    label: "CRPS_s0.1_fixed_5members"
+  ra7qm4bc:
+    label: "CRPS_s0.2_fixed_5members"
+  ha23u84m:
+    label: "CRPS_s0.5_fixed_5members"
+  si91kfmq:
+    label: "CRPS_s0.05_learnable_5members_sigma+lr_1e-2"
+  o5scxlnr:
+    label: "CRPS_s0.5_learnable_5members_sigma+lr_1e-2"
\ No newline at end of file
diff --git a/config/evaluate/eval_cerra_europe_config.yml b/config/evaluate/eval_cerra_europe_config.yml
new file mode 100644
index 000000000..466d3738d
--- /dev/null
+++ b/config/evaluate/eval_cerra_europe_config.yml
@@ -0,0 +1,47 @@
+
+global_plotting_options:
+ regions: ["europe"]
+ image_format : "png" 
+ dpi_val : 300
+ fps: 2
+ ERA5:
+   marker_size: 2
+   scale_marker_size: 1
+   marker: "o"
+
+evaluation:
+  metrics  : ["mae"]
+  regions: ["europe"]
+  summary_plots : true
+  ratio_plots : false
+  heat_maps : false
+  summary_dir: "./plots_zfhwspzc_17_4gpu"
+  plot_ensemble: false #supported: false, "std", "minmax", "members"
+  plot_score_maps: false #plot scores on 2D maps; this slows down score computation
+  print_summary: false #print score values on screen; this can be verbose
+  log_scale: false
+  add_grid: false
+  score_cards: false
+  bar_plots: false
+  num_processes: 0  #options: int, "auto", 0 means no parallelism (default)
+
+default_streams:
+  CERRA:
+    channels: ["2t", "10si", "10wdir", "r_850", "t_850", "u_850", "v_850", "z_500"] #, "blah"]
+    evaluation:
+      forecast_step: "all"
+      sample: "all"
+      ensemble: "all"
+    # plotting:
+    #  sample: [0]
+    #  forecast_step: "all"
+    #  ensemble: "mean"
+    #  plot_maps: true
+    #  plot_bias: true
+    #  plot_target: true
+    #  plot_histograms: true
+    #  plot_animations: true
+      
+run_ids:
+  xxx:
+    label: "xxx"
diff --git a/config/inference/inference_era5_1deg.yml b/config/inference/inference_era5_1deg.yml
new file mode 100644
index 000000000..641130fe2
--- /dev/null
+++ b/config/inference/inference_era5_1deg.yml
@@ -0,0 +1,8 @@
+test_config:
+  start_date: 2023-10-01T00:00
+  output:
+    num_samples: 50
+  samples_per_mini_epoch: 50
+  forecast:
+    num_steps: 2
+streams_directory: ./config/inference/streams/era5_1deg/
\ No newline at end of file
diff --git a/config/inference/inference_era5_o96_cerra.yml b/config/inference/inference_era5_o96_cerra.yml
new file mode 100644
index 000000000..6a54dc658
--- /dev/null
+++ b/config/inference/inference_era5_o96_cerra.yml
@@ -0,0 +1,8 @@
+test_config:
+  start_date: 2023-10-01T00:00
+  output:
+    num_samples: 16
+  samples_per_mini_epoch: 16
+  forecast:
+    num_steps: 9
+streams_directory: ./config/inference/streams/era5_o96_cerra/
\ No newline at end of file
diff --git a/config/inference/streams/era5_1deg/era5.yml b/config/inference/streams/era5_1deg/era5.yml
new file mode 100644
index 000000000..39f655024
--- /dev/null
+++ b/config/inference/streams/era5_1deg/era5.yml
@@ -0,0 +1,40 @@
+# (C) Copyright 2024 WeatherGenerator contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+ERA5 :
+  type : anemoi
+  filenames : ['aifs-ea-an-oper-0001-mars-o96-1979-2023-6h-v8.zarr']
+  stream_id : 0
+  source_exclude : ['z', 'w_10', 'w_50', 'w_100', 'w_150', 'w_200', 'w_250', 'w_300', 'w_400', 'w_500', 'w_600', 'w_700', 'w_850', 'w_925', 'w_1000', 'skt', 'tcw', 'cp', 'tp', 'q_50', 'q_100']
+  target_exclude : ['z', 'w_10', 'w_50', 'w_100', 'w_150', 'w_200', 'w_250', 'w_300', 'w_400', 'w_500', 'w_600', 'w_700', 'w_850', 'w_925', 'w_1000', 'slor', 'sdor', 'tcw', 'cp', 'tp', 'q_50', 'q_100']
+  geoinfo_channels : ['z', 'lsm', 'slor', 'sdor', 'insolation', 'cos_local_time', 'sin_local_time', 'cos_julian_day', 'sin_julian_day']
+  loss_weight : 1.
+  location_weight : cosine_latitude
+  masking_rate : 0.6
+  masking_rate_none : 0.05
+  token_size : 8
+  tokenize_spacetime : True
+  max_num_targets: -1
+  frequency : 06:00:00
+  embed : 
+    net : transformer
+    num_tokens : 1
+    num_heads : 8
+    dim_embed : 256
+    num_blocks : 2
+  embed_target_coords :
+    net : linear
+    dim_embed : 256
+  target_readout :
+    num_layers : 2
+    num_heads : 4
+    # sampling_rate : 0.2
+  pred_head :
+    ens_size : 1
+    num_layers : 1
diff --git a/config/inference/streams/era5_o96_cerra/cerra.yml b/config/inference/streams/era5_o96_cerra/cerra.yml
new file mode 100644
index 000000000..ccaf25139
--- /dev/null
+++ b/config/inference/streams/era5_o96_cerra/cerra.yml
@@ -0,0 +1,40 @@
+# (C) Copyright 2024 WeatherGenerator contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+CERRA :
+  type : anemoi
+  filenames : ['cerra-rr-an-oper-se-al-ec-mars-5p5km-1985-2023-3h-v2.zarr']
+  frequency : 6h
+  stream_id : 1
+  source_exclude : ['skt','tciwv','tp','al','rsn','sde','sf']
+  target_exclude : ['tciwv','tp','al','rsn','sde','sf']
+  geoinfo_channels : ['orog', 'lsm']
+  loss_weight : 1.
+  location_weight : cosine_latitude
+  masking_rate : 0.6
+  masking_rate_none : 0.05
+  token_size : 512
+  tokenize_spacetime : True
+  #max_num_targets: 570000
+  max_num_targets: -1 
+  embed : 
+    net : transformer
+    num_tokens : 1
+    num_heads : 8
+    dim_embed : 512
+    num_blocks : 2
+  embed_target_coords :
+    net : linear
+    dim_embed : 512
+  target_readout :
+    num_layers : 2
+    num_heads : 4
+  pred_head :
+    ens_size : 1
+    num_layers : 1
\ No newline at end of file
diff --git a/config/inference/streams/era5_o96_cerra/era5.yml b/config/inference/streams/era5_o96_cerra/era5.yml
new file mode 100644
index 000000000..c6317a2d6
--- /dev/null
+++ b/config/inference/streams/era5_o96_cerra/era5.yml
@@ -0,0 +1,40 @@
+# (C) Copyright 2024 WeatherGenerator contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+ERA5 :
+  type : anemoi
+  filenames : ['aifs-ea-an-oper-0001-mars-o96-1979-2023-6h-v8.zarr']
+  stream_id : 0
+  source_exclude : ['w_', 'skt', 'tcw', 'cp', 'tp']
+  target_exclude : ['w_', 'slor', 'sdor', 'tcw', 'cp', 'tp']
+  geoinfo_channels : ['z', 'lsm']
+  loss_weight : 1.
+  location_weight : cosine_latitude
+  masking_rate : 0.6
+  masking_rate_none : 0.05
+  token_size : 8
+  tokenize_spacetime : True
+  max_num_targets: -1
+  #max_num_targets: -1 
+  embed : 
+    net : transformer
+    num_tokens : 1
+    num_heads : 8
+    dim_embed : 512
+    num_blocks : 2
+  embed_target_coords :
+    net : linear
+    dim_embed : 512
+  target_readout :
+    num_layers : 2
+    num_heads : 4
+    # sampling_rate : 0.2
+  pred_head :
+    ens_size : 1
+    num_layers : 1
\ No newline at end of file
diff --git a/config/streams/era5_1deg/era5.yml b/config/streams/era5_1deg/era5.yml
index 6d6737d44..9e14050e9 100644
--- a/config/streams/era5_1deg/era5.yml
+++ b/config/streams/era5_1deg/era5.yml
@@ -9,7 +9,7 @@
 
 ERA5 :
   type : anemoi
-  filenames : ['aifs-ea-an-oper-0001-mars-o96-1979-2024-1h-v3-with-era51.zarr']
+  filenames : ['aifs-ea-an-oper-0001-mars-o96-1979-2023-6h-v8.zarr']
   stream_id : 0
   source_exclude : ['z', 'w_10', 'w_50', 'w_100', 'w_150', 'w_200', 'w_250', 'w_300', 'w_400', 'w_500', 'w_600', 'w_700', 'w_850', 'w_925', 'w_1000', 'skt', 'tcw', 'cp', 'tp', 'q_50', 'q_100']
   target_exclude : ['z', 'w_10', 'w_50', 'w_100', 'w_150', 'w_200', 'w_250', 'w_300', 'w_400', 'w_500', 'w_600', 'w_700', 'w_850', 'w_925', 'w_1000', 'slor', 'sdor', 'tcw', 'cp', 'tp', 'q_50', 'q_100']
@@ -18,7 +18,7 @@ ERA5 :
   location_weight : cosine_latitude
   token_size : 8
   tokenize_spacetime : True
-  max_num_targets: -1 
+  max_num_targets: 20000
   frequency : 06:00:00
   embed : 
     net : transformer
diff --git a/config/streams/era5_cerra/cerra.yml b/config/streams/era5_cerra/cerra.yml
new file mode 100644
index 000000000..f5b27dd63
--- /dev/null
+++ b/config/streams/era5_cerra/cerra.yml
@@ -0,0 +1,40 @@
+# (C) Copyright 2024 WeatherGenerator contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+CERRA :
+  type : anemoi
+  filenames : ['cerra-rr-an-oper-se-al-ec-mars-5p5km-1985-2023-3h-v2.zarr']
+  frequency : 6h
+  stream_id : 1
+  source_exclude : ['skt','tciwv','tp','al','rsn','sde','sf']
+  target_exclude : ['tciwv','tp','al','rsn','sde','sf']
+  geoinfo_channels : ['orog', 'lsm']
+  loss_weight : 1.
+  location_weight : cosine_latitude
+  masking_rate : 0.6
+  masking_rate_none : 0.05
+  token_size : 64
+  tokenize_spacetime : True
+  max_num_targets: 570000
+  #max_num_targets: -1 
+  embed : 
+    net : transformer
+    num_tokens : 1
+    num_heads : 8
+    dim_embed : 512
+    num_blocks : 2
+  embed_target_coords :
+    net : linear
+    dim_embed : 512
+  target_readout :
+    num_layers : 2
+    num_heads : 4
+  pred_head :
+    ens_size : 1
+    num_layers : 1
diff --git a/config/streams/era5_cerra/era5.yml b/config/streams/era5_cerra/era5.yml
new file mode 100644
index 000000000..3f990eb23
--- /dev/null
+++ b/config/streams/era5_cerra/era5.yml
@@ -0,0 +1,40 @@
+# (C) Copyright 2024 WeatherGenerator contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+ERA5 :
+  type : anemoi
+  filenames : ['aifs-ea-an-oper-0001-mars-o96-1979-2023-6h-v8.zarr']
+  stream_id : 0
+  source_exclude : ['w_', 'skt', 'tcw', 'cp', 'tp']
+  target_exclude : ['w_', 'slor', 'sdor', 'tcw', 'cp', 'tp']
+  geoinfo_channels : ['z', 'lsm']
+  loss_weight : 1.
+  location_weight : cosine_latitude
+  masking_rate : 0.6
+  masking_rate_none : 0.05
+  token_size : 8
+  tokenize_spacetime : True
+  max_num_targets: 20000
+  #max_num_targets: -1 
+  embed : 
+    net : transformer
+    num_tokens : 1
+    num_heads : 8
+    dim_embed : 512
+    num_blocks : 2
+  embed_target_coords :
+    net : linear
+    dim_embed : 512
+  target_readout :
+    num_layers : 2
+    num_heads : 4
+    # sampling_rate : 0.2
+  pred_head :
+    ens_size : 1
+    num_layers : 1
diff --git a/config/streams/era5_crps/era5.yml b/config/streams/era5_crps/era5.yml
new file mode 100644
index 000000000..31895423b
--- /dev/null
+++ b/config/streams/era5_crps/era5.yml
@@ -0,0 +1,38 @@
+# (C) Copyright 2024 WeatherGenerator contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+ERA5 :
+  type : anemoi
+  filenames : ['aifs-ea-an-oper-0001-mars-o96-1979-2023-6h-v8.zarr']
+  stream_id : 0
+  source_exclude : ['w_', 'skt', 'tcw', 'cp', 'tp']
+  target_exclude : ['w_', 'slor', 'sdor', 'tcw', 'cp', 'tp']
+  loss_weight : 1.
+  location_weight : cosine_latitude
+  masking_rate : 0.6
+  masking_rate_none : 0.05
+  token_size : 8
+  tokenize_spacetime : True
+  max_num_targets: 20000
+  embed : 
+    net : transformer
+    num_tokens : 1
+    num_heads : 8
+    dim_embed : 256
+    num_blocks : 2
+  embed_target_coords :
+    net : linear
+    dim_embed : 256
+  target_readout :
+    num_layers : 2
+    num_heads : 4
+    # sampling_rate : 0.2
+  pred_head :
+    ens_size : 1
+    num_layers : 1
\ No newline at end of file
diff --git a/config/streams/era5_n320_cerra/cerra.yml b/config/streams/era5_n320_cerra/cerra.yml
new file mode 100644
index 000000000..a0e888ea7
--- /dev/null
+++ b/config/streams/era5_n320_cerra/cerra.yml
@@ -0,0 +1,40 @@
+# (C) Copyright 2024 WeatherGenerator contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+CERRA :
+  type : anemoi
+  filenames : ['cerra-rr-an-oper-se-al-ec-mars-5p5km-1985-2023-3h-v2.zarr']
+  frequency : 6h
+  stream_id : 1
+  source_exclude : ['skt','tciwv','tp','al','rsn','sde','sf']
+  target_exclude : ['tciwv','tp','al','rsn','sde','sf']
+  geoinfo_channels : ['orog', 'lsm']
+  loss_weight : 1.
+  location_weight : cosine_latitude
+  masking_rate : 0.6
+  masking_rate_none : 0.05
+  token_size : 64
+  tokenize_spacetime : True
+  max_num_targets: 228000
+  #max_num_targets: -1 
+  embed : 
+    net : transformer
+    num_tokens : 1
+    num_heads : 8
+    dim_embed : 512
+    num_blocks : 2
+  embed_target_coords :
+    net : linear
+    dim_embed : 512
+  target_readout :
+    num_layers : 2
+    num_heads : 4
+  pred_head :
+    ens_size : 1
+    num_layers : 1
diff --git a/config/streams/era5_n320_cerra/era5.yml b/config/streams/era5_n320_cerra/era5.yml
new file mode 100644
index 000000000..580173b04
--- /dev/null
+++ b/config/streams/era5_n320_cerra/era5.yml
@@ -0,0 +1,40 @@
+# (C) Copyright 2024 WeatherGenerator contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+ERA5 :
+  type : anemoi
+  filenames : ['aifs-ea-an-oper-0001-mars-n320-1979-2023-6h-v8.zarr']
+  stream_id : 0
+  source_exclude : ['w_', 'skt', 'tcw', 'cp', 'tp']
+  target_exclude : ['w_', 'slor', 'sdor', 'tcw', 'cp', 'tp']
+  geoinfo_channels : ['orog', 'lsm']
+  loss_weight : 1.
+  location_weight : cosine_latitude
+  masking_rate : 0.6
+  masking_rate_none : 0.05
+  token_size : 32
+  tokenize_spacetime : True
+  max_num_targets: 108000
+  #max_num_targets: -1 
+  embed : 
+    net : transformer
+    num_tokens : 1
+    num_heads : 8
+    dim_embed : 512
+    num_blocks : 2
+  embed_target_coords :
+    net : linear
+    dim_embed : 512
+  target_readout :
+    num_layers : 2
+    num_heads : 4
+    # sampling_rate : 0.2
+  pred_head :
+    ens_size : 1
+    num_layers : 1
\ No newline at end of file
diff --git a/evaluation_slurm.sh b/evaluation_slurm.sh
new file mode 100755
index 000000000..263324806
--- /dev/null
+++ b/evaluation_slurm.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+#SBATCH --job-name=wg-eval
+#SBATCH --nodes=1
+#SBATCH --mem=368G
+#SBATCH --cpus-per-task=8
+#SBATCH --ntasks-per-node=4
+#SBATCH --gpus-per-node=0
+#SBATCH --time=11:59:59
+#SBATCH --output=logs/%x.%j.out
+#SBATCH --error=logs/%x.%j.err
+#SBATCH --switches=1
+
+# Evaluate one run: swap the config's run_ids section for <run_id>/<label>, then run evaluate.
+module load gcc/12.2.0
+module load cuda/12.2
+
+if [ "$#" -lt 3 ]; then
+    echo "Usage: $0 <config_file.yml> <run_id> <label>"
+    exit 1
+fi
+
+CONFIG_FILE="$1"
+RUN_ID="$2"
+LABEL="$3"
+TMP_CONFIG=$(mktemp)
+trap 'rm -f "$TMP_CONFIG"' EXIT  # remove the temporary config when the job ends
+
+# Copy everything up to (but not including) the first 'run_ids:' line
+awk '/^run_ids:/ {exit} {print}' "$CONFIG_FILE" > "$TMP_CONFIG"
+
+# Add the new run_ids section
+printf 'run_ids:\n  %s:\n    label: "%s"\n' "$RUN_ID" "$LABEL" >> "$TMP_CONFIG"
+
+# Append the top-level sections that follow the original run_ids block (if any).
+# The old entries are dropped up to the next unindented key, however many runs were listed.
+awk 'found && /^[^ \t#]/ {rest=1} rest {print} /^run_ids:/ {found=1}' "$CONFIG_FILE" >> "$TMP_CONFIG"
+
+echo "Modified config:"
+cat "$TMP_CONFIG"
+
+echo "Starting Evaluation Job at $(date)"
+uv run --offline evaluate --config "$TMP_CONFIG"
+echo "Finished Evaluation Job at $(date)"
\ No newline at end of file
diff --git a/packages/evaluate/src/weathergen/evaluate/io/data/dataarray_builders.py b/packages/evaluate/src/weathergen/evaluate/io/data/dataarray_builders.py
index 0ee01959f..7c1787f32 100644
--- a/packages/evaluate/src/weathergen/evaluate/io/data/dataarray_builders.py
+++ b/packages/evaluate/src/weathergen/evaluate/io/data/dataarray_builders.py
@@ -26,13 +26,15 @@
 class EnsembleSelect:
     """Pre-resolved ensemble selection.
 
-    Use :meth:`mean` for the ensemble-mean sentinel, or :meth:`from_names`
-    to resolve requested member names against the full list stored in zarr.
+    Use :meth:`mean` for the ensemble-mean sentinel, :meth:`std` for the
+    ensemble-std sentinel, or :meth:`from_names` to resolve requested member
+    names against the full list stored in zarr.
     """
 
     labels: list[str]
     indices: list[int]
     is_mean: bool = False
+    is_std: bool = False
 
     # ------ factories ------
 
@@ -41,6 +43,11 @@ def mean(cls) -> EnsembleSelect:
         """Sentinel: average over the ensemble axis and drop it."""
         return cls(labels=[], indices=[], is_mean=True)
 
+    @classmethod
+    def std(cls) -> EnsembleSelect:
+        """Sentinel: take the standard deviation over the ensemble axis and drop it."""
+        return cls(labels=[], indices=[], is_std=True)
+
     @classmethod
     def from_names(
         cls,
@@ -54,6 +61,7 @@ def from_names(
         requested : list[str]
             Requested ensemble members (e.g. ``["ens0", "ens2"]``).
             Pass ``["mean"]`` to get the mean sentinel.
+            Pass ``["std"]`` to get the std sentinel.
         all_ens : list[str] | None
             All ensemble member names from the zarr store.
 
@@ -63,6 +71,8 @@ def from_names(
         """
         if requested == ["mean"]:
             return cls.mean()
+        if requested == ["std"]:
+            return cls.std()
         if all_ens is not None:
             indices = [all_ens.index(e) for e in requested]
         else:
@@ -308,6 +318,9 @@ def _build_dataarray(
         if ens_select is None or ens_select.is_mean:
             # Average over ensemble axis, drop ens coordinate
             data = data.mean(axis=-1)
+        elif ens_select.is_std:
+            # Standard deviation over ensemble axis, drop ens coordinate
+            data = data.std(axis=-1)
         else:
             idx = tuple([slice(None)] * n_base + [ens_select.indices])
             data = data[idx]
diff --git a/packages/evaluate/src/weathergen/evaluate/io/io_reader.py b/packages/evaluate/src/weathergen/evaluate/io/io_reader.py
index ec1f909dd..8965f6a1a 100644
--- a/packages/evaluate/src/weathergen/evaluate/io/io_reader.py
+++ b/packages/evaluate/src/weathergen/evaluate/io/io_reader.py
@@ -239,9 +239,11 @@ def check_availability(
             if not requested[name] <= reader_data[name]:
                 missing = requested[name] - reader_data[name]
 
-                # Special handling for ensemble mean
+                # Special handling for ensemble mean and std
                 if name == "ensemble" and "mean" in missing:
                     missing.remove("mean")
+                if name == "ensemble" and "std" in missing:
+                    missing.remove("std")
 
                 # Derivable channels (e.g. 10ff) will be computed later by
                 # DeriveChannels — keep them in the requested set.
@@ -353,6 +355,8 @@ def normalize(val):
 
         if ensemble == "mean":
             ensemble = ["mean"]
+        elif ensemble == "std":
+            ensemble = ["std"]
 
         fsteps = _parse_range_list(fsteps, "forecast_step")
         samples = _parse_range_list(samples, "sample")
diff --git a/packages/evaluate/src/weathergen/evaluate/plotting/plot_orchestration.py b/packages/evaluate/src/weathergen/evaluate/plotting/plot_orchestration.py
index f129c34eb..f5d79c8df 100644
--- a/packages/evaluate/src/weathergen/evaluate/plotting/plot_orchestration.py
+++ b/packages/evaluate/src/weathergen/evaluate/plotting/plot_orchestration.py
@@ -412,12 +412,14 @@ def _plot_single_sample(
     plot_histograms: bool | str,
     maps_config: dict,
     bias_config: dict,
+    std_config: dict,
 ) -> None:
     """Plot all maps/histograms for a single (fstep, sample) pair (loky worker)."""
     matplotlib.use("Agg")
 
     maps_cfg = oc.OmegaConf.create(maps_config)
     bias_cfg = oc.OmegaConf.create(bias_config)
+    std_cfg = oc.OmegaConf.create(std_config)
     plotter = Plotter(plotter_cfg, Path(output_basedir))
 
     data_selection = {"sample": sample, "stream": stream, "forecast_step": fstep}
@@ -433,18 +435,36 @@ def _plot_single_sample(
             plotter.create_maps_per_sample(bias_data, plot_chs, data_selection, "bias", bias_cfg)
 
     for ens in ensemble:
-        has_ens = "ens" in preds.dims and ens != "mean"
-        preds_ens = preds.sel(ens=ens) if has_ens else preds
-        preds_tag = "" if "ens" not in preds.dims else f"ens_{ens}"
+        if "ens" in preds.dims:
+            if ens == "std":
+                preds_ens = preds.std(dim="ens")
+            elif ens == "mean":
+                preds_ens = preds.mean(dim="ens")
+            else:
+                preds_ens = preds.sel(ens=ens)
+        else:
+            preds_ens = preds
+        preds_tag = f"ens_{ens}" if ens in ("mean", "std") else (
+            "" if "ens" not in preds.dims else f"ens_{ens}"
+        )
         preds_name = "_".join(filter(None, ["preds", preds_tag]))
 
         if plot_maps:
+            cfg_to_use = std_cfg if preds_tag == "ens_std" else maps_cfg
             plotter.create_maps_per_sample(
-                preds_ens, plot_chs, data_selection, preds_name, maps_cfg
+                preds_ens, plot_chs, data_selection, preds_name, cfg_to_use
             )
 
             if plot_bias and bias_has_ens:
-                bias_ens = bias_data.sel(ens=ens) if ens != "mean" else bias_data
+                if "ens" in bias_data.dims:
+                    if ens == "std":
+                        bias_ens = bias_data.std(dim="ens")
+                    elif ens == "mean":
+                        bias_ens = bias_data.mean(dim="ens")
+                    else:
+                        bias_ens = bias_data.sel(ens=ens)
+                else:
+                    bias_ens = bias_data
                 bias_tag = "_".join(filter(None, ["bias", preds_tag]))
                 plotter.create_maps_per_sample(
                     bias_ens, plot_chs, data_selection, bias_tag, bias_cfg
@@ -491,9 +511,18 @@ def _plot_all_samples(
     data_selection = {"sample": "all_samples", "stream": stream, "forecast_step": fstep}
 
     for ens in ensemble:
-        has_ens = "ens" in preds.dims and ens != "mean"
-        preds_ens = preds.sel(ens=ens) if has_ens else preds
-        preds_tag = "" if "ens" not in preds.dims else f"ens_{ens}"
+        if "ens" in preds.dims:
+            if ens == "std":
+                preds_ens = preds.std(dim="ens")
+            elif ens == "mean":
+                preds_ens = preds.mean(dim="ens")
+            else:
+                preds_ens = preds.sel(ens=ens)
+        else:
+            preds_ens = preds
+        preds_tag = f"ens_{ens}" if ens in ("mean", "std") else (
+            "" if "ens" not in preds.dims else f"ens_{ens}"
+        )
         preds_name = "_".join(filter(None, ["preds", preds_tag]))
 
         plotter.create_histograms(
@@ -623,6 +652,23 @@ def plot_data(
     maps_config_dict = oc.OmegaConf.to_container(common_ranges(*_range_args), resolve=True)
     bias_config_dict = oc.OmegaConf.to_container(bias_ranges(*_range_args), resolve=True)
 
+    has_ens = any("ens" in da.dims for da in da_preds.values())
+    if has_ens:
+        std_range_args = (
+        {fs: da.std(dim="ens") for fs, da in da_preds.items()},
+        {fs: da.std(dim="ens") for fs, da in da_preds.items()},
+        available_data.channels,
+        global_plotting_opts[stream],
+        )
+        std_config_dict = oc.OmegaConf.to_container(
+            common_ranges(*std_range_args), resolve=True
+        )
+        for var in std_config_dict:
+            if isinstance(std_config_dict[var], dict):
+                std_config_dict[var]["colormap"] = "YlOrRd"
+    else:
+        std_config_dict = maps_config_dict
+
     num_plot_workers = get_num_workers(
         check_process_headroom=True,
         max_workers=reader.eval_cfg.get("max_workers", None),
@@ -687,6 +733,7 @@ def plot_data(
                     "plot_histograms": plot_histograms,
                     "maps_config": maps_config_dict,
                     "bias_config": bias_config_dict,
+                    "std_config": std_config_dict, 
                 }
             )
 
@@ -735,12 +782,18 @@ def plot_data(
 
         tags: list[str] = []
         for ens in available_data.ensemble:
-            tags.append("preds" if not has_ens else f"preds_ens_{ens}")
+            if ens in ("mean", "std"):
+                tags.append(f"preds_ens_{ens}")
+            else:
+                tags.append("preds" if not has_ens else f"preds_ens_{ens}")
         if plot_target:
             tags.append("targets")
         if plot_bias:
             for ens in available_data.ensemble:
-                tags.append("bias" if not has_ens else f"bias_ens_{ens}")
+                if ens in ("mean", "std"):
+                    tags.append(f"bias_ens_{ens}")
+                else:
+                    tags.append("bias" if not has_ens else f"bias_ens_{ens}")
 
         for tag in tags:
             _dispatch_animations(**anim_kw, tag=tag)
diff --git a/packages/evaluate/src/weathergen/evaluate/plotting/plot_utils.py b/packages/evaluate/src/weathergen/evaluate/plotting/plot_utils.py
index 362ff0814..693036534 100644
--- a/packages/evaluate/src/weathergen/evaluate/plotting/plot_utils.py
+++ b/packages/evaluate/src/weathergen/evaluate/plotting/plot_utils.py
@@ -370,6 +370,7 @@ def plot_metric_region(
             selected_data, labels, run_ids, colors = [], [], [], []
 
             for run_id, data in scores_dict[metric][region].get(stream, {}).items():
+                print(f"{run_id}: dims={data.dims}, sizes={dict(data.sizes)}")
                 # skip if channel is missing or contains NaN
                 if ch not in np.atleast_1d(data.channel.values) or data.isnull().all():
                     continue
diff --git a/packages/evaluate/src/weathergen/evaluate/scores/score.py b/packages/evaluate/src/weathergen/evaluate/scores/score.py
index afee96d9a..2cba7933d 100755
--- a/packages/evaluate/src/weathergen/evaluate/scores/score.py
+++ b/packages/evaluate/src/weathergen/evaluate/scores/score.py
@@ -256,11 +256,12 @@ def get_score(
             f = self.det_metrics_dict[score_name]
             _logger.debug(f"Using deterministic metric: {score_name}")
         elif score_name in self.prob_metrics_dict.keys():
-            assert self.ens_dim in data.prediction.dims, (
-                f"Probablistic score {score_name} chosen, but ensemble dimension {self.ens_dim} "
-                "not found in prediction data. Skipping score calculation."
-            )
-            return None
+            if self._ens_dim not in data.prediction.dims:
+                _logger.warning(
+                    f"Probablistic score {score_name} chosen, but ensemble dimension "
+                    f"{self._ens_dim} not found in prediction data. Skipping score calculation."
+                )
+                return None
             f = self.prob_metrics_dict[score_name]
         else:
             raise ValueError(
@@ -1546,7 +1547,7 @@ def calc_rank_histogram(
                 )
         # preserve the other coordinates
         preserved_coords = {
-            c: obs_stacked[c].values
+            c: obs_stacked[c]
             for c in obs_stacked.coords
             if all(dim not in {self._ens_dim, "npoints"} for dim in obs_stacked[c].dims)
         }
@@ -1563,11 +1564,15 @@ def calc_rank_histogram(
         )
 
         # Reattach preserved coordinates by broadcasting
-        for coord_name, coord_values in preserved_coords.items():
+        for coord_name, coord_da in preserved_coords.items():
             # Only keep unique values along npoints if necessary
             if coord_name in rank_counts.coords:
                 continue
-            rank_counts = rank_counts.assign_coords({coord_name: coord_values})
+            # Skip coords whose dimensions are not a subset of rank_counts dimensions
+            if not all(dim in rank_counts.dims for dim in coord_da.dims):
+                continue
+            # Pass DataArray (not bare numpy array) so xarray uses correct dimension metadata
+            rank_counts = rank_counts.assign_coords({coord_name: coord_da})
 
         # provide normalized rank counts if desired
         if norm:
diff --git a/packages/evaluate/src/weathergen/evaluate/scores/score_orchestration.py b/packages/evaluate/src/weathergen/evaluate/scores/score_orchestration.py
index 82bc8a026..cc1756f44 100644
--- a/packages/evaluate/src/weathergen/evaluate/scores/score_orchestration.py
+++ b/packages/evaluate/src/weathergen/evaluate/scores/score_orchestration.py
@@ -9,10 +9,46 @@
 
 """Score orchestration: per-fstep scoring, stream aggregation, and JSON output."""
 
+import datetime
 import json
 import logging
+import math
 
 import numpy as np
+
+
+def _json_default(obj):
+    """Fallback JSON serialiser for types the stdlib encoder rejects (non-finite → None)."""
+    if isinstance(obj, (datetime.datetime, datetime.date)):
+        return obj.isoformat()
+    if isinstance(obj, datetime.timedelta):
+        return obj.total_seconds()
+    if isinstance(obj, (np.datetime64, np.timedelta64)):
+        return str(obj)
+    if isinstance(obj, np.integer):
+        return int(obj)
+    if isinstance(obj, np.floating):
+        # float(NaN/±Inf) would be emitted as a bare token that strict JSON parsers reject.
+        return float(obj) if math.isfinite(obj) else None
+    if isinstance(obj, np.ndarray):
+        # Floats nested in the list never reach this hook again, so sanitise them here.
+        return _sanitize_json(obj.tolist())
+    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")
+
+
+def _sanitize_json(obj):
+    """Recursively replace NaN/±Inf with None so the output is strict, portable JSON.
+
+    json.dump writes NaN/Infinity as bare tokens, which are not valid JSON (RFC 8259) and
+    are rejected by strict parsers. NumPy floats/arrays and tuples are sanitised as well.
+    """
+    if isinstance(obj, np.ndarray):
+        obj = obj.tolist()
+    if isinstance(obj, (float, np.floating)) and not math.isfinite(obj):
+        return None
+    if isinstance(obj, dict):
+        return {k: _sanitize_json(v) for k, v in obj.items()}
+    return [_sanitize_json(v) for v in obj] if isinstance(obj, (list, tuple)) else obj
 import xarray as xr
 from joblib import delayed
 
@@ -106,19 +142,33 @@ def _score_single_fstep(
         if score.attrs:
             metric_attrs[(int(fstep), metric_name)] = score.attrs.copy()
 
-    combined = xr.concat(
-        valid_scores,
-        dim="metric",
-        coords="minimal",
-        combine_attrs="drop_conflicts",
-    )
-    combined = combined.assign_coords(metric=valid_metric_names)
-    combined = combined.compute()
-
-    for coord in ["channel", "sample", "ens"]:
-        combined = scalar_coord_to_dim(combined, coord)
+    # Separate scores with extra dimensions (e.g. rank_bin from rank_histogram)
+    # from standard scores that fit in the fixed-shape metric_stream.
+    standard_dims = frozenset({"sample", "channel", "ens"})
+    std_scores, std_names = [], []
+    extra_dim_scores: dict[str, xr.DataArray] = {}
+    for metric_name, score in zip(valid_metric_names, valid_scores, strict=True):
+        if set(score.dims) - standard_dims:
+            extra_dim_scores[metric_name] = score.compute()
+        else:
+            std_scores.append(score)
+            std_names.append(metric_name)
+
+    if std_scores:
+        combined = xr.concat(
+            std_scores,
+            dim="metric",
+            coords="minimal",
+            combine_attrs="drop_conflicts",
+        )
+        combined = combined.assign_coords(metric=std_names)
+        combined = combined.compute()
+        for coord in ["channel", "sample", "ens"]:
+            combined = scalar_coord_to_dim(combined, coord)
+    else:
+        combined = None
 
-    return fstep, combined, metric_attrs
+    return fstep, combined, extra_dim_scores, metric_attrs
 
 
 def calc_scores_per_stream(
@@ -310,7 +360,7 @@ def compute_scores_for_region(
     )
 
     all_metric_attrs: dict = {}
-    for _, _, fstep_attrs in fstep_results:
+    for _, _, _, fstep_attrs in fstep_results:
         all_metric_attrs.update(fstep_attrs)
 
     return fstep_results, all_metric_attrs
@@ -353,16 +403,24 @@ def store_metrics_for_region(
     da_preds : dict
         Prediction dict (used only to check for ``lead_time`` coord).
     """
+    # Determine which metrics have extra dimensions (e.g. rank_bin) from the results.
+    # These cannot be stored in the fixed-shape metric_stream.
+    extra_dim_metric_names: set[str] = set()
+    for _, _, extra_dim_dict, _ in fstep_results:
+        extra_dim_metric_names.update(extra_dim_dict.keys())
+    std_metrics = {k: v for k, v in metrics.items() if k not in extra_dim_metric_names}
+    extra_dim_metrics = {k: v for k, v in metrics.items() if k in extra_dim_metric_names}
+
     metric_stream = xr.DataArray(
         np.full(
-            (len(samples), len(fsteps), len(channels), len(metrics), len(ensemble)),
+            (len(samples), len(fsteps), len(channels), len(std_metrics), len(ensemble)),
             np.nan,
         ),
         coords={
             "sample": samples,
             "forecast_step": fsteps,
             "channel": channels,
-            "metric": list(metrics.keys()),
+            "metric": list(std_metrics.keys()),
             "ens": ensemble,
         },
     )
@@ -372,7 +430,9 @@ def store_metrics_for_region(
             lead_time=("forecast_step", np.full(len(fsteps), -1, dtype=int))
         )
 
-    for fstep, combined_metrics, _fstep_attrs in fstep_results:
+    for fstep, combined_metrics, _extra_dim_dict, _fstep_attrs in fstep_results:
+        if combined_metrics is None:
+            continue
         criteria = {
             "forecast_step": int(fstep),
             "sample": combined_metrics.sample.values,
@@ -415,10 +475,28 @@ def store_metrics_for_region(
                 indexers = {dim: criteria[dim] for dim in coord_dims if dim in criteria}
                 metric_stream.coords[coord_name].loc[indexers] = combined_metrics.coords[coord_name]
 
+    # Build and store extra-dim metrics (e.g. rank_histogram with rank_bin dimension).
+    # Each forecast step contributes one slice; the slices are concatenated along a new
+    # "forecast_step" dimension and stored in local_scores[metric][region][stream][run_id].
+    for metric_name, parameters in extra_dim_metrics.items():
+        fstep_das = []
+        for fstep, _, extra_dim_dict, _ in fstep_results:
+            if metric_name not in extra_dim_dict:
+                continue
+            da = extra_dim_dict[metric_name]
+            for coord in ["channel", "sample", "ens"]:
+                da = scalar_coord_to_dim(da, coord)
+            fstep_das.append(da.expand_dims({"forecast_step": [fstep]}))
+        if fstep_das:
+            metric_da = xr.concat(fstep_das, dim="forecast_step")
+            local_scores.setdefault(metric_name, {}).setdefault(region, {}).setdefault(stream, {})[
+                run_id
+            ] = metric_da.assign_attrs(parameters)
+
     _logger.info(f"Scores for run {run_id} - {stream} calculated successfully.")
     _logger.debug(f"all_metric_attrs keys: {list(all_metric_attrs.keys())}")
 
-    for metric, parameters in metrics.items():
+    for metric, parameters in std_metrics.items():
         metric_data = metric_stream.sel({"metric": metric}).assign_attrs(parameters)
         for (_stored_fstep, stored_metric), attrs in all_metric_attrs.items():
             if stored_metric == metric and attrs:
@@ -483,7 +561,7 @@ def metric_list_to_json(
                     data_dict = {"scores": [metric_data_dict]}
 
                 with open(save_path, "w") as f:
-                    json.dump(data_dict, f, indent=4)
+                    json.dump(_sanitize_json(data_dict), f, indent=4, default=_json_default)
 
     _logger.info(
         f"Saved all results of inference run {reader.run_id} - mini_epoch {reader.mini_epoch:d} "
diff --git a/src/weathergen/datasets/masking.py b/src/weathergen/datasets/masking.py
index 4fc063bcb..2db315627 100644
--- a/src/weathergen/datasets/masking.py
+++ b/src/weathergen/datasets/masking.py
@@ -260,6 +260,8 @@ def parse_src_target_correspondence(self, losses, target_cfgs, source_cfgs) -> d
         corrs = []
         for _, loss_term in losses.items():
             for loss_name, loss_fct in loss_term.loss_fcts.items():
+                if loss_fct is None:
+                    continue
                 corr = loss_fct.get("target_source_correspondence", None)
 
                 # correspondence not specified; falling back to default 1-to-1 correspondence
diff --git a/src/weathergen/datasets/multi_stream_data_sampler.py b/src/weathergen/datasets/multi_stream_data_sampler.py
index b523a375c..c08dd5aae 100644
--- a/src/weathergen/datasets/multi_stream_data_sampler.py
+++ b/src/weathergen/datasets/multi_stream_data_sampler.py
@@ -93,6 +93,15 @@ def __init__(self, cf: Config, mode_cfg: dict, stage: Stage):
         self._stage = stage
 
         self.mini_epoch = 0
+        # Effective epoch as seen *inside a worker process*. Under persistent_workers the
+        # template's mini_epoch (bumped by advance() in the main process) never reaches the
+        # live workers, so we cannot key the seed / curriculum on self.mini_epoch there.
+        # Instead each worker tracks its own counter: initialised from the inherited
+        # mini_epoch on its first __iter__, then +1 per subsequent __iter__ (one __iter__ ==
+        # one mini_epoch for every loader call site). Matches mini_epoch exactly in the
+        # non-persistent case, and stays correct when workers outlive a mini_epoch.
+        self._effective_epoch = 0
+        self._worker_iter_started = False
         self.mask_value = 0.0
         self.streams = cf.streams
         self.rank = cf.rank
@@ -156,7 +165,11 @@ def __init__(self, cf: Config, mode_cfg: dict, stage: Stage):
         # RNG seed setup
         rs = cf.data_loading.rng_seed
         nw = cf.data_loading.num_workers
+        # Immutable base seed. The per-(rank, worker, epoch) seed is derived from this in
+        # worker_workset() into self._worker_rng_seed; this base is never mutated so the
+        # derivation cannot compound across mini_epochs under persistent_workers.
         self.data_loader_rng_seed = rs if rs > nw else rs * 97
+        self._worker_rng_seed = self.data_loader_rng_seed
 
         self.rng = None
 
@@ -293,7 +306,7 @@ def reset(self) -> tuple[Sequence[int], Sequence[int]]:
 
         Returns: permutation index, forecast steps index
         """
-        self.rng = np.random.default_rng(self.data_loader_rng_seed)
+        self.rng = np.random.default_rng(self._worker_rng_seed)
         fsm = self._get_fsm()
         self.check_samples(fsm)
         perms = self._calc_baseperms(fsm)
@@ -339,7 +352,7 @@ def _get_fsm(self) -> int:
         """Obtain maximum number of forecast steps for current mini epoch."""
         # fixed number of forecast steps for this run
         if self.forecast_policy != "random":
-            idx = min(self.mini_epoch, len(self.list_num_forecast_steps) - 1)
+            idx = min(self._effective_epoch, len(self.list_num_forecast_steps) - 1)
             fsm = self.list_num_forecast_steps[idx]
         else:
             fsm = self.list_num_forecast_steps.max()
@@ -811,6 +824,16 @@ def __iter__(self) -> ModelBatch:
         Return :
             batch of data
         """
+        # Track the effective epoch *within this process*. The first __iter__ adopts the
+        # inherited mini_epoch (correct at worker-birth); each later __iter__ (one per
+        # mini_epoch) advances it by one, so persistent workers stay in step with the
+        # main-process advance() they never observe directly.
+        if self._worker_iter_started:
+            self._effective_epoch += 1
+        else:
+            self._effective_epoch = self.mini_epoch
+            self._worker_iter_started = True
+
         iter_start, iter_end = self.worker_workset()
         logger.info(f"iter_start={iter_start}, iter_end={iter_end}, len={self.len}")
 
@@ -860,19 +883,21 @@ def worker_workset(self):
             # assert self.world_size == 1, self.world_size
             iter_start = 0
             iter_end = len(self)
+            # single-process (e.g. num_workers=0 debug): no per-worker fan-out, seed off base.
+            self._worker_rng_seed = self.data_loader_rng_seed
 
         else:
-            # ensure the rng seed is fully unique across workers and mini_epochs
-            # the worker processes are generated as bit-wise copy of the "template" (the actual
-            # instance of the present class that is created) whenever __iter__ is started. This
-            # happens for each mini_epoch, for train and validation, and independently for each DDP
-            # worker. After the bit-wise copy, the rng seed needs to be made unique for
-            # DDP workers, loader process, mini_epoch.
+            # Make the rng seed unique across DDP ranks, loader workers, and mini_epochs.
+            # Derive (do NOT mutate the base) so this cannot compound when a persistent
+            # worker runs worker_workset() once per mini_epoch. _effective_epoch (not
+            # mini_epoch) is the per-epoch component so it advances even in live workers.
+            # Numerically identical to the previous base *= f(...) in the non-persistent
+            # case, where _effective_epoch == mini_epoch.
             dist = torch.distributed
-            self.data_loader_rng_seed *= (
+            self._worker_rng_seed = self.data_loader_rng_seed * (
                 (((dist.get_rank() + 1) * 73) if dist.is_initialized() else 1)
                 * ((worker_info.id + 1) * 37)
-                * (self.mini_epoch + 13)
+                * (self._effective_epoch + 13)
                 * 7
             )
             # split workload
diff --git a/src/weathergen/model/engines.py b/src/weathergen/model/engines.py
index 3a5842b04..51205d63b 100644
--- a/src/weathergen/model/engines.py
+++ b/src/weathergen/model/engines.py
@@ -816,18 +816,23 @@ def forward(self, latent, output, latent_lens, output_lens, coordinates):
         tokens_lens = latent_lens
         tcs_aux = coordinates
 
+        # Per-block activation checkpointing must be OFF when this decoder is reused per
+        # ensemble member under DDP: each checkpoint recompute re-fires the reducer's grad
+        # hooks for the reused params → "marked ready twice". Plain (un-checkpointed) reuse
+        # is DDP-safe. Gated by decoder_checkpointing (default True preserves old behaviour).
+        use_ckpt = self.cf.get("decoder_checkpointing", True)
+
+        def run_block(block, *args):
+            if use_ckpt:
+                return checkpoint(block, *args, use_reentrant=False)
+            return block(*args)
+
         for ib, block in enumerate(self.tte):
             if self.cf.pred_self_attention and ib % 3 == 1:
-                tc_tokens = checkpoint(block, tc_tokens, tcs_lens, tcs_aux, use_reentrant=False)
+                tc_tokens = run_block(block, tc_tokens, tcs_lens, tcs_aux)
             else:
-                tc_tokens = checkpoint(
-                    block,
-                    tc_tokens,
-                    tokens_stream,
-                    tcs_lens,
-                    tokens_lens,
-                    tcs_aux,
-                    use_reentrant=False,
+                tc_tokens = run_block(
+                    block, tc_tokens, tokens_stream, tcs_lens, tokens_lens, tcs_aux
                 )
         return tc_tokens
 
diff --git a/src/weathergen/model/model.py b/src/weathergen/model/model.py
index bfeceabe1..d1de039be 100644
--- a/src/weathergen/model/model.py
+++ b/src/weathergen/model/model.py
@@ -345,6 +345,7 @@ def __init__(self, cf: Config, sources_size, targets_num_channels, targets_coord
         self.num_register_tokens = cf.num_register_tokens
         self.latent_heads = None
         self.latent_pre_norm = None
+        self.register_buffer("latent_perturbation_log_sigma", None)
         self.deep_ssl_fusion: DeepSSLFusion | None = None
         self.deep_ssl_level_projections: nn.ModuleDict | None = None
         # auxiliary tokens
@@ -355,6 +356,7 @@ def __init__(self, cf: Config, sources_size, targets_num_channels, targets_coord
         self.aux_token_idxs = list(range(cf.num_register_tokens + cf.num_class_tokens))
         self.num_aux_tokens = cf.num_register_tokens + cf.num_class_tokens
 
+
     def _create_latent_pred_head(
         self, global_cfg, name, loss_cfg, use_class_token, use_patch_token
     ):
@@ -609,6 +611,15 @@ def create(self) -> "Model":
                         use_patch_token=False,
                     )
 
+        # Latent-perturbation noise scale (learnable or fixed)
+        # torch.zeros() is used (not torch.tensor) so this respects the meta-device context used by FSDP. The actual value is filled in reset_parameters().
+        num_mem = cf.get("latent_perturbation_num_members", 0)
+        if num_mem > 1:
+            if cf.get("latent_perturbation_sigma_learnable", True):
+                self.latent_perturbation_log_sigma = nn.Parameter(torch.zeros(1))
+            else:
+                self.register_buffer("latent_perturbation_log_sigma", torch.zeros(1))
+
         # Deep SSL fusion and per-level projections (student only)
         deep_ssl_cfg = cf.training_config.get("deep_ssl", None)
         if deep_ssl_cfg and deep_ssl_cfg.get("enabled", False) and deep_ssl_cfg.get("tap_after"):
@@ -651,6 +662,11 @@ def _reset_params(module):
 
         self.apply(_reset_params)
 
+        if self.latent_perturbation_log_sigma is not None:
+            sigma_init = self.cf.get("latent_perturbation_sigma_init", 0.01)
+            with torch.no_grad():
+                self.latent_perturbation_log_sigma.fill_(math.log(sigma_init))
+
     def print_num_parameters(self) -> None:
         """Print number of parameters for entire model and each module used to build the model"""
 
@@ -876,80 +892,164 @@ def predict_decoders(
         if not self.pred_heads:
             return output
 
-        # remove register  and class tokens
-        tokens = tokens[:, self.num_aux_tokens :]
-
-        # get 1-ring neighborhood for prediction
-        batch_size = len(batch)
-        s = [batch_size, self.num_healpix_cells, self.cf.ae_local_num_queries, tokens.shape[-1]]
-        idxs = model_params.hp_nbours.unsqueeze(0).repeat((batch_size, 1, 1)).flatten(0, 1)
-        tokens_nbors = tokens.reshape(s).flatten(0, 1)[idxs.flatten()].flatten(0, 1)
-        # TODO: precompute in model_params?
-        tokens_nbors_lens = torch.full(
-            (s[0] * s[1] + 1,), fill_value=9, dtype=torch.int32, device=tokens_nbors.device
-        )
-        tokens_nbors_lens[0] = 0
+        # ── Strip aux tokens ─────────────────────────────────────────────
+        # tokens: [B, num_aux + H*Q, D] → [B, H*Q, D]
+        tokens = tokens[:, self.num_aux_tokens :]
+
+        B = len(batch)
+        H = self.num_healpix_cells
+        Q = self.cf.ae_local_num_queries  # 1 per default config
+        D = tokens.shape[-1]
+        assert tokens.shape == (B, H * Q, D), f"unexpected token shape {tokens.shape}"
+        # NOTE(review): the gather and tokens_nbors_lens below already scale with Q (9 * Q),
+        # so this guard may be stricter than necessary — confirm before relaxing it.
+        assert Q == 1, (
+            "predict_decoders assumes ae_local_num_queries==1; "
+            "update tokens_nbors_lens fill_value if Q>1"
+        )
+
+        # ── Latent Gaussian perturbation ────────────────────────────────
+        # Sample M independent members by treating each as an extra batch element, so the
+        # entire decoder forward pass runs once, fully vectorised.
+        # Layout convention (used throughout): M is the OUTER axis, B is the INNER axis.
+        #   tokens_tiled[m*B + b]  belongs to member m, batch item b
+        # Keep this path free of .item() calls and debug statistics: each one forces a
+        # device→host sync or an extra reduction on every forward pass.
+        M = self.cf.get("latent_perturbation_num_members", 0)
+        if M > 1 and self.latent_perturbation_log_sigma is not None:
+            override = self.cf.get("latent_perturbation_sigma_override", None)
+            if override is not None:
+                # a configured override takes precedence over the stored log-sigma
+                sigma = torch.tensor(override, device=tokens.device, dtype=tokens.dtype)
+            else:
+                sigma = torch.exp(self.latent_perturbation_log_sigma).to(tokens.dtype)
+            eps = torch.randn(M, B, H * Q, D, device=tokens.device, dtype=tokens.dtype)
+            tokens_tiled = (tokens.unsqueeze(0) + sigma * eps).reshape(M * B, H * Q, D)
+        else:
+            M = 1
+            tokens_tiled = tokens
+        # tokens_tiled: [M*B, H*Q, D]
+        assert tokens_tiled.shape == (M * B, H * Q, D)
 
-        # pair with tokens from assimilation engine to obtain target tokens
+        # ── Flatten to cell level ────────────────────────────────────────
+        # tokens_flat[i*H + h]  =  D-dim token for (member i//B, batch i%B, cell h)
+        tokens_flat = tokens_tiled.reshape(M * B, H, Q, D).flatten(0, 1)  # [M*B*H, Q, D]
+        assert tokens_flat.shape == (M * B * H, Q, D)
+
+        # ── Neighbour token gather ───────────────────────────────────────
+        # hp_nbours[h, k] is the k-th neighbour cell index of cell h; values in [0, H).
+        #
+        # For the i-th (m, b) pair (i = m*B + b) the neighbour of cell h lives at flat row
+        #   i*H + nbr   in tokens_flat   (NOT just at `nbr`)
+        # We therefore add a per-(m,b) offset of i*H to every cell index.
+        MB = M * B
+        cell_offsets = torch.arange(MB, device=tokens.device).view(MB, 1, 1) * H
+        idxs = (model_params.hp_nbours.unsqueeze(0) + cell_offsets).flatten(0, 1)  # [M*B*H, 9]
+        assert idxs.shape == (MB * H, 9)
+        assert idxs.min() >= 0 and idxs.max() < MB * H, \
+            f"neighbour indices out of range [0, {MB * H}): min={idxs.min()}, max={idxs.max()}"
+
+        # Gather neighbours: [M*B*H, 9, Q, D] → flatten (9, Q) → [M*B*H*9*Q, D]
+        tokens_nbors = tokens_flat[idxs.flatten()].flatten(0, 1)  # [M*B*H * 9*Q, D]
+        assert tokens_nbors.shape == (MB * H * 9 * Q, D)
+
+        # KV sequence lengths for varlen attention.
+        # attention.py does cumsum(x_kv_lens) before flash_attn, so these are INDIVIDUAL counts
+        # (not already cumulative).  M*B*H groups × 9 neighbour cells (each with Q=1 token).
+        tokens_nbors_lens = tokens_flat.new_zeros(MB * H + 1, dtype=torch.int32)
+        tokens_nbors_lens[1:] = 9 * Q
+
+        # ── Per-stream decoding ──────────────────────────────────────────
         for stream_name in self.stream_names:
-            # extract target coords for current stream and fstep and convert to one tensor
+            # Collect target coordinates for this stream + step across the batch.
             t_coords = [
                 batch.samples[i_b].streams_data[stream_name].target_coords[step]
-                for i_b in range(batch_size)
+                for i_b in range(B)
             ]
-            t_coords_lens = [len(t) for t in t_coords]
-            t_coords = torch.cat(t_coords)
+            t_coords_lens = [len(t) for t in t_coords]  # per-batch-item point counts
+            t_coords = torch.cat(t_coords)  # [P, coord_dim]
+            P = t_coords.shape[0]
 
-            if len(t_coords) == 0:
+            if P == 0:
                 continue
 
-            # embed token coords
-            tc_embed = self.embed_target_coords[stream_name]
-            tc_tokens = checkpoint(tc_embed, t_coords, use_reentrant=False)
+            tc_tokens = checkpoint(
+                self.embed_target_coords[stream_name], t_coords, use_reentrant=False
+            )  # [P, embed_dim]
+            assert tc_tokens.shape[0] == P
 
-            # skip when coordinate embeddings yields nan (i.e. the coord embedding network diverged)
             if torch.isnan(tc_tokens).any():
                 logger.warning(
-                    (
-                        f"Skipping prediction for {stream_name} because",
-                        f" of {torch.isnan(tc_tokens).sum()} NaN in tc_tokens.",
-                    )
+                    f"Skipping prediction for {stream_name} because"
+                    f" of {torch.isnan(tc_tokens).sum()} NaN in tc_tokens."
                 )
                 pred = torch.tensor([], device=tc_tokens.device)
 
-            # skip empty lengths
-            elif tc_tokens.shape[0] == 0:
-                pred = torch.tensor([], device=tc_tokens.device)
-
             else:
-                # lens for varlen attention
-                tcls = torch.cat(
-                    [
-                        sample.streams_data[stream_name].target_coords_lens[step]
-                        for sample in batch.samples
-                    ]
-                )
+                # Build query-group lens for varlen attention.
+                # tcs_lens = [0, c_0, ..., c_{N-1}] where N = B*H, individual counts.
+                tcls = torch.cat([
+                    sample.streams_data[stream_name].target_coords_lens[step]
+                    for sample in batch.samples
+                ])
                 tcs_lens = torch.cat([torch.zeros(1, dtype=torch.int32, device=tcls.device), tcls])
+                N = tcs_lens.shape[0] - 1  # must equal B*H for the KV pairing to be valid
+                assert N == B * H, \
+                    f"expected N={B * H} query groups (B*H), got N={N}"
+
+                # Tile M times: M-major ordering matches tokens_tiled / tokens_nbors layout.
+                #   tc_tokens_in: [M*P, embed_dim]  — [P pts for m=0 | P pts for m=1 | ...]
+                #   tcs_lens_in:  [0, c_0,..,c_{N-1}, c_0,..,c_{N-1}, ...] (M repetitions)
+                tc_tokens_in = tc_tokens.repeat(M, 1)
+                tcs_lens_in = torch.cat([
+                    torch.zeros(1, dtype=torch.int32, device=tcs_lens.device),
+                    tcs_lens[1:].repeat(M),
+                ])
+                assert tc_tokens_in.shape == (M * P, tc_tokens.shape[-1])
+                assert tcs_lens_in.shape[0] == M * N + 1
 
                 if self.cf.decoder_type == "Linear":
                     pred = self.target_token_engines[stream_name](
-                        tc_tokens,
-                        tokens.reshape(-1, s[-1]),  # collapse the batch and token dimensions
-                        tcs_lens,
-                    ).unsqueeze(0)  # add ensemble dim: shape is then [1, preds_per_coord, channels]
+                        tc_tokens_in,
+                        tokens_tiled.reshape(-1, D),  # [M*B*H*Q, D]  all latent tokens flat
+                        tcs_lens_in,
+                    )
+                    # [M*P, C] → [M, P, C]
+                    assert pred.shape[0] == M * P
+                    pred = pred.reshape(M, P, pred.shape[-1])
                 else:
-                    tc_tokens = self.target_token_engines[stream_name](
+                    # Decode all M members in a SINGLE varlen call. Both query groups
+                    # (tcs_lens_in) and KV groups (tokens_nbors_lens) are M-major with N=B*H
+                    # groups per member, so member m's queries pair with member m's KVs. The
+                    # decoder params are therefore used exactly ONCE per forward — no per-member
+                    # reuse — which keeps the per-block activation checkpointing inside the
+                    # engine DDP-safe (reuse + checkpointing is what trips "marked ready twice").
+                    # The single call launches M*B*H attention groups; this must stay under the
+                    # CUDA grid-dim cap (65535), which holds for M<=4 at healpix-5 with B=1.
+                    assert M * N <= 65535, (
+                        f"M*B*H={M * N} exceeds the CUDA grid-dim limit (65535); "
+                        f"lower latent_perturbation_num_members"
+                    )
+                    tc_tokens_out = self.target_token_engines[stream_name](
                         latent=tokens_nbors,
-                        output=tc_tokens,
+                        output=tc_tokens_in,
                         latent_lens=tokens_nbors_lens,
-                        output_lens=tcs_lens,
-                        coordinates=t_coords,
+                        output_lens=tcs_lens_in,
+                        coordinates=t_coords.repeat(M, 1),
                     )
-
-                    # final prediction head to map back to physical space
-                    pred = self.pred_heads[stream_name](tc_tokens)
-
-            # recover batch dimension (ragged, so as list)
+                    # tc_tokens_out: [M*P, embed_dim]; final head maps back to physical space
+                    pred = self.pred_heads[stream_name](tc_tokens_out)
+                    # pred: [E, M*P, C]
+                    # The M blocks in dim-1 are contiguous (repeat(M,1) ordering), so we can
+                    # split on M and permute: [E, M*P, C] → [M, E, P, C] → [M*E, P, C]
+                    E, pts_M, C = pred.shape
+                    assert pts_M == M * P, f"expected {M * P} pts in pred, got {pts_M}"
+                    pred = pred.reshape(E, M, P, C).permute(1, 0, 2, 3).reshape(M * E, P, C)
+
+                assert pred.shape[1] == P, f"expected {P} points in dim=1, got {pred.shape[1]}"
+
+            # ── Unpack into per-batch-item list ─────────────────────────
+            # pred: [M or M*E, P, C]  →  list of B tensors, each [M or M*E, pts_b, C]
             pred = torch.split(pred, t_coords_lens, dim=1)
             output.add_physical_prediction(step, stream_name, pred)
 
diff --git a/src/weathergen/model/model_interface.py b/src/weathergen/model/model_interface.py
index 255903ce1..3c6bed812 100644
--- a/src/weathergen/model/model_interface.py
+++ b/src/weathergen/model/model_interface.py
@@ -187,6 +187,12 @@ def init_model_and_shard(
             model.to_empty(device="cuda")
             if with_fsdp:
                 model.reset_parameters()
+        else:
+            # Non-FSDP path: model weights are real tensors (not meta), but
+            # reset_parameters must still be called to apply sigma_init etc.
+            # Unwrap DDP if present — reset_parameters is a custom method not forwarded by DDP.
+            reset_target = model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model
+            reset_target.reset_parameters()
 
     # model params
     model_params = ModelParams(cf).create(cf)
@@ -222,13 +228,19 @@ def load_model(cf, model, device, run_id: str, mini_epoch=-1):
             if sharded_meta_param is None:
                 logger.warning(f"Parameter {param_name} from checkpoint not found in model.")
                 continue
-            sharded_tensor = distribute_tensor(
-                full_tensor,
-                sharded_meta_param.device_mesh,
-                sharded_meta_param.placements,
-            )
-            # maybe_sharded_sd[param_name.replace("module.", "")] = nn.Parameter(sharded_tensor)
-            maybe_sharded_sd[param_name] = torch.nn.Parameter(sharded_tensor)
+            if hasattr(sharded_meta_param, "device_mesh"):
+                # FSDP-sharded parameter (DTensor) — distribute and wrap as Parameter
+                sharded_tensor = distribute_tensor(
+                    full_tensor,
+                    sharded_meta_param.device_mesh,
+                    sharded_meta_param.placements,
+                )
+                maybe_sharded_sd[param_name] = torch.nn.Parameter(sharded_tensor)
+            else:
+                # Plain buffer (e.g. register_buffer) — FSDP does not shard buffers,
+                # so load directly without distribute_tensor. Move to the correct
+                # device since the checkpoint was loaded onto CPU.
+                maybe_sharded_sd[param_name] = full_tensor.to(device)
         # choose `assign=True` for sharded model since we cannot call `copy_` on meta tensor
         mkeys, ukeys = model.load_state_dict(maybe_sharded_sd, strict=False, assign=True)
 
diff --git a/src/weathergen/train/loss_modules/loss_module_physical.py b/src/weathergen/train/loss_modules/loss_module_physical.py
index 7deb727d6..d3df8c6c2 100644
--- a/src/weathergen/train/loss_modules/loss_module_physical.py
+++ b/src/weathergen/train/loss_modules/loss_module_physical.py
@@ -58,6 +58,7 @@ def __init__(
         self.name = "LossPhysical"
 
         # dynamically load loss functions based on configuration and stage
+        # A null entry (params is None) means "remove this loss from the parent config"
         self.loss_fcts = [
             [
                 getattr(loss_fns, name),
@@ -65,6 +66,7 @@ def __init__(
                 name,
             ]
             for name, params in loss_fcts.items()
+            if params is not None
         ]
 
     def _get_weights(self, stream_info):
diff --git a/src/weathergen/train/trainer.py b/src/weathergen/train/trainer.py
index 51e22d992..7cf5a2382 100644
--- a/src/weathergen/train/trainer.py
+++ b/src/weathergen/train/trainer.py
@@ -369,11 +369,16 @@ def run(self, cf, devices, run_id_contd=None, mini_epoch_contd=None):
             "batch_sampler": None,
             "shuffle": False,
             "num_workers": cf.data_loading.num_workers,
+            # persistent_workers requires num_workers > 0; guard so num_workers=0 (debug) works
+            "persistent_workers": (
+                cf.data_loading.get("persistent_workers", False) and cf.data_loading.num_workers > 0
+            ),
         }
         self.data_loader = torch.utils.data.DataLoader(self.dataset, **loader_params, sampler=None)
         # loader_params["num_workers"]=  0
+        val_loader_params = loader_params | {"num_workers": min(cf.data_loading.num_workers, 6)}
         self.data_loader_validation = torch.utils.data.DataLoader(
-            self.dataset_val, **loader_params, sampler=None
+            self.dataset_val, **val_loader_params, sampler=None
         )
 
         for stage_label, extra_cfg in self.extra_val_cfgs.items():
@@ -383,7 +388,7 @@ def run(self, cf, devices, run_id_contd=None, mini_epoch_contd=None):
             # cap workers: each loader spawns its own processes, each re-opening all
             # stream readers
             extra_loader_params = loader_params | {
-                "num_workers": min(cf.data_loading.num_workers, 2)
+                "num_workers": min(cf.data_loading.num_workers, 6)
             }
             self.data_loaders_val_extra[stage_label] = torch.utils.data.DataLoader(
                 self.datasets_val_extra[stage_label], **extra_loader_params, sampler=None
@@ -438,6 +443,17 @@ def run(self, cf, devices, run_id_contd=None, mini_epoch_contd=None):
         if run_id_contd is not None: # and self.cf.general.istep != 0: # To be tested
             self._load_ema_teacher_state(run_id_contd, mini_epoch_contd)
 
+        # --- Point 1: verify sigma is initialised correctly ---
+        if is_root():
+            _log_sigma = getattr(self.model, "latent_perturbation_log_sigma", None)
+            if _log_sigma is not None:
+                _s = _log_sigma.exp().item()
+                logger.info(
+                    f"[latent_perturbation] sigma at init: {_s:.6f}  "
+                    f"(log_sigma={_log_sigma.item():.4f}, "
+                    f"learnable={_log_sigma.requires_grad})"
+                )
+
         # if with_fsdp then parameter count is unreliable
         if is_root():
             # ddp-wrapped model does not expose this function
@@ -454,8 +470,28 @@ def run(self, cf, devices, run_id_contd=None, mini_epoch_contd=None):
         beta2 = 1.0 - kappa * (1.0 - self.training_cfg.optimizer.adamw.beta2)
         eps = self.training_cfg.optimizer.adamw.get("eps", 2e-08) / np.sqrt(kappa)
 
+        # Optionally give log_sigma a separate, higher LR to counteract decoder collapse.
+        # Set latent_perturbation_sigma_lr in config (e.g. 1e-2); 0.0 disables the feature.
+        self._sigma_lr = cf.get("latent_perturbation_sigma_lr", 0.0)
+        if self._sigma_lr > 0.0:
+            sigma_param_names = {n for n, _ in self.model.named_parameters() if "log_sigma" in n}
+            sigma_params = [p for n, p in self.model.named_parameters() if n in sigma_param_names]
+            other_params = [p for n, p in self.model.named_parameters() if n not in sigma_param_names]
+            param_groups = [
+                {"params": other_params},
+                {"params": sigma_params, "name": "log_sigma"},
+            ]
+            if is_root():
+                logger.info(
+                    f"[latent_perturbation] Separate sigma LR={self._sigma_lr:.2e} "
+                    f"(main lr_max={self.training_cfg.learning_rate_scheduling.lr_max:.2e}x"
+                    f"{cf.world_size ** 0.5:.1f}sqrt_scale)"
+                )
+        else:
+            param_groups = list(self.model.parameters())
+
         self.optimizer = torch.optim.AdamW(
-            self.model.parameters(),
+            param_groups,
             lr=self.training_cfg.learning_rate_scheduling.lr_start,
             weight_decay=self.training_cfg.optimizer.weight_decay,
             betas=(beta1, beta2),
@@ -651,6 +687,26 @@ def train(self, mini_epoch):
             self.optimizer.zero_grad()
             self.grad_scaler.scale(loss).backward()
 
+            for idx, (name, param) in enumerate(self.model.named_parameters()):
+                if param.requires_grad and param.grad is None:
+                    logger.warning(f"[rank={self.cf.rank}] no grad: {idx} {name} {tuple(param.shape)}")
+
+            # --- Point 2: verify sigma gradient on the very first backward pass ---
+            if self.cf.general.istep == 0 and is_root():
+                _log_sigma = getattr(self.model, "latent_perturbation_log_sigma", None)
+                if _log_sigma is not None:
+                    _g = _log_sigma.grad
+                    if _g is None:
+                        logger.warning(
+                            "[latent_perturbation] log_sigma.grad is None after first backward! "
+                            "Sigma will not be learned. Check that kernel_crps is the active loss "
+                            "and that num_members > 1."
+                        )
+                    else:
+                        logger.info(
+                            f"[latent_perturbation] log_sigma.grad after first backward: {_g.item():.6f}"
+                        )
+
             # gradient clipping
             self.grad_scaler.unscale_(self.optimizer)
             total_norm = torch.nn.utils.clip_grad_norm_(
@@ -670,6 +726,11 @@ def train(self, mini_epoch):
 
             # update learning rate
             self.lr_scheduler.step()
+            # Re-apply sigma's independent LR; the scheduler overwrites all param groups
+            if self._sigma_lr > 0.0:
+                for g in self.optimizer.param_groups:
+                    if g.get("name") == "log_sigma":
+                        g["lr"] = self._sigma_lr
 
             batch_size_total = self.get_batch_size_total(self.batch_size_per_gpu)
             step = batch_size_total * self.cf.general.istep
@@ -710,6 +771,32 @@ def train(self, mini_epoch):
                 # Log collapse metrics
                 if self.collapse_monitor.should_log(self.cf.general.istep):
                     self._log_collapse_metrics(TRAIN)
+                # --- Points 3 & 4: log sigma value and its gradient norm ---
+                if is_root():
+                    _log_sigma = getattr(self.model, "latent_perturbation_log_sigma", None)
+                    if _log_sigma is not None:
+                        _metrics = {
+                            "latent_perturbation/sigma": _log_sigma.exp().item(),
+                            "latent_perturbation/log_sigma": _log_sigma.item(),
+                        }
+                        if _log_sigma.grad is not None:
+                            _metrics["latent_perturbation/log_sigma_grad_norm"] = (
+                                _log_sigma.grad.norm().item()
+                            )
+                        self.train_logger.log_metrics(
+                            TRAIN, _metrics, step=self.cf.general.istep
+                        )
+                        _gstr = (
+                            f", grad_norm={_metrics['latent_perturbation/log_sigma_grad_norm']:.6f}"
+                            if "latent_perturbation/log_sigma_grad_norm" in _metrics
+                            else ""
+                        )
+                        logger.info(
+                            f"[latent_perturbation] step={self.cf.general.istep}: "
+                            f"sigma={_metrics['latent_perturbation/sigma']:.6f}, "
+                            f"log_sigma={_metrics['latent_perturbation/log_sigma']:.4f}"
+                            f"{_gstr}"
+                        )
 
             # save model checkpoint (with designation _latest)
             if bidx % self.train_logging.checkpoint == 0 and bidx > 0:
@@ -746,10 +833,10 @@ def validate(self, mini_epoch, mode_cfg, batch_size, stage: Stage = VAL):
         with torch.no_grad():
             # print progress bar but only in interactive mode, i.e. when without ddp
             with tqdm.tqdm(total=len(data_loader), disable=self.cf.with_ddp) as pbar:
+                # --- Point 5: accumulators for spread-skill ratio ---
+                _num_members = self.cf.get("latent_perturbation_num_members", 0)
+                _spread_skill_acc = {}  # "<stream_name>/step<idx>" -> {"spread", "rmse", "n"}
                 for bidx, batch in enumerate(dataset_val_iter):
-                    if cf.data_loading.get("memory_pinning", False):
-                        # pin memory for faster CPU-GPU transfer
-                        batch = batch.pin_memory()
 
                     batch.to_device(self.device)
 
@@ -787,6 +874,39 @@ def validate(self, mini_epoch, mode_cfg, batch_size, stage: Stage = VAL):
                         istep=self.cf.general.istep,
                     )
 
+                    # --- Point 5: accumulate spread and RMSE for spread-skill ratio ---
+                    if _num_members > 1 and is_root():
+                        for loss_name, t_aux in targets_and_auxs.items():
+                            if not hasattr(t_aux, "physical"):
+                                continue
+                            for step_idx, (pred_step, tgt_step) in enumerate(
+                                zip(preds.physical, t_aux.physical, strict=False)
+                            ):
+                                for stream_name, pred_list in pred_step.items():
+                                    tgt_dict = tgt_step.get(stream_name)
+                                    if tgt_dict is None:
+                                        continue
+                                    tgt_list = tgt_dict.get("target", [])
+                                    for pred_s, tgt_s in zip(pred_list, tgt_list, strict=False):
+                                        # pred_s: [ens, pts, C], tgt_s: [pts, C]
+                                        if pred_s.numel() == 0 or tgt_s.numel() == 0:
+                                            continue
+                                        p = pred_s.float()
+                                        t = tgt_s.float()
+                                        ens_mean = p.mean(0)  # [pts, C]
+                                        rmse = (ens_mean - t).pow(2).mean(0).sqrt()  # [C]
+                                        spread = p.std(0).mean(0)  # [C]
+                                        key = f"{stream_name}/step{step_idx}"
+                                        if key not in _spread_skill_acc:
+                                            _spread_skill_acc[key] = {
+                                                "spread": torch.zeros_like(spread),
+                                                "rmse": torch.zeros_like(rmse),
+                                                "n": 0,
+                                            }
+                                        _spread_skill_acc[key]["spread"] += spread.detach()
+                                        _spread_skill_acc[key]["rmse"] += rmse.detach()
+                                        _spread_skill_acc[key]["n"] += 1
+
                     # log output
                     if bidx < num_samples_write:
                         # denormalization function for data
@@ -816,6 +936,27 @@ def validate(self, mini_epoch, mode_cfg, batch_size, stage: Stage = VAL):
                 self._log_terminal(0, mini_epoch, stage)
                 self._log(stage)
 
+                # --- Point 5: log spread-skill ratio ---
+                if _num_members > 1 and is_root() and _spread_skill_acc:
+                    ss_metrics = {}
+                    for key, acc in _spread_skill_acc.items():
+                        n = acc["n"]
+                        mean_spread = (acc["spread"] / n).mean().item()
+                        mean_rmse = (acc["rmse"] / n).mean().item()
+                        ratio = mean_spread / (mean_rmse + 1e-8)
+                        ss_metrics[f"spread_skill/{key}/spread"] = mean_spread
+                        ss_metrics[f"spread_skill/{key}/rmse"] = mean_rmse
+                        ss_metrics[f"spread_skill/{key}/ratio"] = ratio
+                        logger.info(
+                            f"[spread-skill] {key}: spread={mean_spread:.4f}, "
+                            f"rmse={mean_rmse:.4f}, ratio={ratio:.3f} "
+                            f"(target≈1.0)"
+                        )
+                    samples = self.cf.general.istep * self.get_batch_size_total(
+                        self.batch_size_per_gpu
+                    )
+                    self.train_logger.log_metrics(VAL, ss_metrics, step=samples)
+
         # avoid that there is a systematic bias in the validation subset
         dataset.advance()
 
@@ -826,7 +967,10 @@ def _get_full_model_state_dict(self):
         if self.cf.with_ddp and self.cf.with_fsdp:
             cpu_state_dict = {}
             for param_name, sharded_param in maybe_sharded_sd.items():
-                full_param = sharded_param.full_tensor()
+                if isinstance(sharded_param, DTensor):
+                    full_param = sharded_param.full_tensor()
+                else:
+                    full_param = sharded_param
                 if is_root():
                     cpu_state_dict[param_name] = full_param.cpu()
                 else:
diff --git a/src/weathergen/train/utils.py b/src/weathergen/train/utils.py
index 4566b9ad7..ee6daa5e9 100644
--- a/src/weathergen/train/utils.py
+++ b/src/weathergen/train/utils.py
@@ -154,7 +154,7 @@ def get_target_idxs_from_cfg(cfg, loss_name) -> list[int] | None:
     Extract target idxs from training/validation/test config
     """
 
-    tc = [v.get("target_source_correspondence") for _, v in cfg.losses[loss_name].loss_fcts.items()]
+    tc = [v.get("target_source_correspondence") for _, v in cfg.losses[loss_name].loss_fcts.items() if v is not None]
     tc = [list(t.keys()) for t in tc if t is not None]
     target_idxs = list(set([int(i) for t in tc for i in t])) if len(tc) > 0 else None
 
diff --git a/src/weathergen/utils/train_logger.py b/src/weathergen/utils/train_logger.py
index 46c847e8a..02c089918 100644
--- a/src/weathergen/utils/train_logger.py
+++ b/src/weathergen/utils/train_logger.py
@@ -156,7 +156,7 @@ def read(
 
         # define cols for training
         cols_train = ["dtime", "samples", "mse", "lr"]
-        cols1 = [_weathergen_timestamp, "num_samples", "loss_avg_mean", "learning_rate"]
+        cols1 = [_weathergen_timestamp, "weathergen.step", "num_samples", "loss_avg_mean", "learning_rate"]
         cols1_patterns = ["loss_avg"] + cols_patterns
 
         # read training log data
@@ -200,7 +200,7 @@ def read(
 
         # define cols for validation
         cols_val = ["dtime", "samples"]
-        cols2 = [_weathergen_timestamp, "num_samples"]
+        cols2 = [_weathergen_timestamp, "weathergen.step", "num_samples"]
         cols2_patterns = ["loss_avg"] + cols_patterns
 
         # read validation log data