diff --git a/src/application/common/monitor_utils.py b/src/application/common/monitor_utils.py index 721d7cf9..4f5e0110 100644 --- a/src/application/common/monitor_utils.py +++ b/src/application/common/monitor_utils.py @@ -15,7 +15,7 @@ from src.application.common import logger from src.application.contracts import IMonitoringStorageService, IAzureCostService from src.application.dtos import CostConfiguration -from src.domain.enums import BlobOperationType, StopReason +from src.domain.enums import BlobOperationType, DatasetSize, StopReason from src.infra.infrastructure import Containers @@ -118,7 +118,9 @@ def _save_run_cost_analytics( monitoring_storage_service: IMonitoringStorageService = Provide[ Containers.monitoring_storage_service ], + dataset_size_value: str = Provide[Containers.config.dataset_size], ) -> None: + dataset_size = DatasetSize(dataset_size_value) benchmark_run = _get_benchmark_run() if cost_configuration.include_aci: aci_cost = azure_cost_service.compute_aci_cost(query_id, start_time, end_time) @@ -138,7 +140,9 @@ def _save_run_cost_analytics( ) if cost_configuration.include_blob_storage and is_blob_params_present: blob_cost = azure_cost_service.compute_blob_storage_cost( - start_time, end_time, bytes_ingress, bytes_egress, operation_type + start_time, end_time, bytes_ingress, bytes_egress, operation_type, + dataset_size=dataset_size, + is_cross_region=cost_configuration.is_cross_region_blob, ) logger.debug("Computed Blob Storage cost: %s", blob_cost.to_dict()) monitoring_storage_service.write_cost_analytics_to_blob_storage( diff --git a/src/application/contracts/azure_cost_service_interface.py b/src/application/contracts/azure_cost_service_interface.py index 422e6b20..e1b2796b 100644 --- a/src/application/contracts/azure_cost_service_interface.py +++ b/src/application/contracts/azure_cost_service_interface.py @@ -2,7 +2,7 @@ from abc import abstractmethod, ABC from src.application.dtos import Cost -from src.domain.enums import BlobOperationType +from src.domain.enums import BlobOperationType, DatasetSize class IAzureCostService(ABC): @@ -35,6 +35,8 @@ def compute_blob_storage_cost( bytes_ingress: float, bytes_egress: float, operation_type: BlobOperationType, + dataset_size: DatasetSize = DatasetSize.SMALL, + is_cross_region: bool = False, ) -> Cost: """ Computes the blob storage cost for the benchmark window. The cost includes prorated storage @@ -45,6 +47,8 @@ def compute_blob_storage_cost( :param bytes_ingress: Bytes uploaded to blob storage during the benchmark. :param bytes_egress: Bytes downloaded from blob storage during the benchmark. :param operation_type: Whether the benchmark performs READ or WRITE operations against blob storage. + :param dataset_size: Dataset size tier used to count blobs for operation cost estimation. + :param is_cross_region: When True, uses cross-region egress pricing instead of intra-region. :return: Cost DTO with compute, storage, network, operations, and total cost. Compute cost is 0 for blob storage. :rtype: Cost diff --git a/src/application/contracts/azure_metric_service_interface.py b/src/application/contracts/azure_metric_service_interface.py index fe7e371d..1b66a0d8 100644 --- a/src/application/contracts/azure_metric_service_interface.py +++ b/src/application/contracts/azure_metric_service_interface.py @@ -4,7 +4,7 @@ from azure.monitor.querymetrics import MetricAggregationType, MetricsQueryResult from src.application.dtos import AciUsage, BlobStorageUsage, DatabaseUsage, DatabricksUsage -from src.domain.enums import AzureMetricNamespace, AzureResourceMetrics, BlobOperationType +from src.domain.enums import AzureMetricNamespace, AzureResourceMetrics, BlobOperationType, DatasetSize class IAzureMetricService(ABC): @@ -66,7 +66,8 @@ def get_blob_storage_usage( end_time: datetime.datetime, bytes_ingress: float, bytes_egress: float, - operation_type: BlobOperationType + operation_type: BlobOperationType, + dataset_size: DatasetSize = DatasetSize.SMALL, ) -> BlobStorageUsage: """ Returns the blob storage usage for the benchmark window. The transaction counts are derived from @@ -78,6 +79,7 @@ def get_blob_storage_usage( :param bytes_ingress: Bytes uploaded to blob storage during the benchmark. :param bytes_egress: Bytes downloaded from blob storage during the benchmark. :param operation_type: Whether the benchmark performs READ or WRITE operations against blob storage. + :param dataset_size: Dataset size tier used to resolve the blob path for counting blobs. :return: BlobStorageUsage DTO with transaction counts, network bytes, and storage size in bytes. :rtype: BlobStorageUsage """ diff --git a/src/application/dtos/blob_storage.py b/src/application/dtos/blob_storage.py index a6aa7050..261a5c13 100644 --- a/src/application/dtos/blob_storage.py +++ b/src/application/dtos/blob_storage.py @@ -26,6 +26,7 @@ class BlobStoragePricing: storage_gb_per_month: float ingress_per_gb: float = 0.0 egress_per_gb: float = 0.0 + cross_region_egress_per_gb: float = 0.0 def to_dict(self) -> dict[str, float]: return asdict(self) diff --git a/src/application/dtos/cost.py b/src/application/dtos/cost.py index cf423fdb..f5402f47 100644 --- a/src/application/dtos/cost.py +++ b/src/application/dtos/cost.py @@ -24,3 +24,4 @@ class CostConfiguration: include_postgres: bool = False include_databricks: bool = False num_workers: int = 0 + is_cross_region_blob: bool = False diff --git a/src/config.py b/src/config.py index de3d22c2..12bfd4f0 100644 --- a/src/config.py +++ b/src/config.py @@ -31,19 +31,6 @@ class Config: f"https://{AZURE_RESOURCE_LOCATION}.metrics.monitor.azure.com" ) - AZURE_ACI_VCPU_PRICE_PER_SECOND: float = 0.0002 - AZURE_ACI_MEMORY_GB_PRICE_PER_SECOND: float = 0.0002 - - AZURE_BLOB_READ_OPERATION_COST: float = 0 - AZURE_BLOB_WRITE_OPERATION_COST: float = 0 - AZURE_BLOB_LIST_OPERATION_COST: float = 0 - AZURE_BLOB_STORAGE_GB_PER_MONTH_COST: float = 0 - AZURE_BLOB_INGRESS_PER_GB_COST: float = 0 - AZURE_BLOB_EGRESS_PER_GB_COST: float = 0 - - AZURE_DATABASE_COMPUTE_PRICE_PER_SECOND: float = 0 - AZURE_DATABASE_STORAGE_GB_PER_MONTH_COST: float = 0 - # POSTGRESQL POSTGRES_USERNAME: str = os.getenv("POSTGRES_USERNAME") POSTGRES_PASSWORD: str = os.getenv("POSTGRES_PASSWORD") @@ -138,9 +125,9 @@ class Config: ) DATABRICKS_POLL_INTERVAL_SECONDS: int = 30 DATABRICKS_HTTP_TIMEOUT_SECONDS: int = 30 - DATABRICKS_DRIVER_MEMORY: str = "14g" + DATABRICKS_DRIVER_MEMORY: str = "9g" DATABRICKS_DRIVER_MEMORY_OVERHEAD: str = "1g" - DATABRICKS_DRIVER_MAX_RESULT_SIZE: str = "8g" + DATABRICKS_DRIVER_MAX_RESULT_SIZE: str = "4g" DATABRICKS_SEDONA_MAVEN_COORDINATES: str = ( "org.apache.sedona:sedona-spark-shaded-3.5_2.12:1.7.1" ) diff --git a/src/infra/infrastructure/services/azure_cost_service.py b/src/infra/infrastructure/services/azure_cost_service.py index a40c767d..2646757e 100644 --- a/src/infra/infrastructure/services/azure_cost_service.py +++ b/src/infra/infrastructure/services/azure_cost_service.py @@ -2,7 +2,7 @@ from src.application.contracts import IAzureCostService, IAzureMetricService, IAzurePricingService from src.application.dtos import Cost -from src.domain.enums import BlobOperationType +from src.domain.enums import BlobOperationType, DatasetSize class AzureCostService(IAzureCostService): @@ -52,6 +52,8 @@ def compute_blob_storage_cost( bytes_ingress: float, bytes_egress: float, operation_type: BlobOperationType, + dataset_size: DatasetSize = DatasetSize.SMALL, + is_cross_region: bool = False, ) -> Cost: usage = self.__azure_metric_service.get_blob_storage_usage( start_time=start_time, @@ -59,6 +61,7 @@ def compute_blob_storage_cost( bytes_ingress=bytes_ingress, bytes_egress=bytes_egress, operation_type=operation_type, + dataset_size=dataset_size, ) pricing = self.__azure_pricing_service.get_blob_storage_pricing() @@ -73,11 +76,15 @@ def compute_blob_storage_cost( + usage.list_transactions * pricing.list_operation_cost ) + egress_rate = ( + pricing.cross_region_egress_per_gb if is_cross_region + else pricing.egress_per_gb + ) ingress_gb = usage.bytes_ingress / (1024 ** 3) egress_gb = usage.bytes_egress / (1024 ** 3) network_cost = ( ingress_gb * pricing.ingress_per_gb - + egress_gb * pricing.egress_per_gb + + egress_gb * egress_rate ) total = storage_cost + operations_cost + network_cost @@ -106,8 +113,9 @@ def compute_databricks_cost( pricing = self.__azure_pricing_service.get_databricks_pricing() duration_hours = usage.duration_seconds / 3600 - dbu_cost = usage.num_workers * pricing.dbu_per_node_per_hour * pricing.dbu_price_per_hour * duration_hours - vm_cost = usage.num_workers * pricing.vm_cost_per_node_per_hour * duration_hours + total_nodes = usage.num_workers + 1 # workers + driver (same VM type) + dbu_cost = total_nodes * pricing.dbu_per_node_per_hour * pricing.dbu_price_per_hour * duration_hours + vm_cost = total_nodes * pricing.vm_cost_per_node_per_hour * duration_hours compute_cost = dbu_cost + vm_cost egress_gb = usage.bytes_egress / (1024 ** 3) diff --git a/src/infra/infrastructure/services/azure_metric_service.py b/src/infra/infrastructure/services/azure_metric_service.py index b729127b..966265cb 100644 --- a/src/infra/infrastructure/services/azure_metric_service.py +++ b/src/infra/infrastructure/services/azure_metric_service.py @@ -110,13 +110,14 @@ def get_blob_storage_usage( bytes_ingress: float, bytes_egress: float, operation_type: BlobOperationType, + dataset_size: DatasetSize = DatasetSize.SMALL, ) -> BlobStorageUsage: path = self.__file_path_service.create_dataset_blob_path( release=Config.BENCHMARK_DOPPA_DATA_RELEASE, theme=Theme.BUILDINGS, region="*", file_name="*.parquet", - dataset_size=DatasetSize.SMALL, + dataset_size=dataset_size, ) blob_count, storage_size = self.__blob_storage_service.get_blob_summary( diff --git a/src/infra/infrastructure/services/azure_pricing_service.py b/src/infra/infrastructure/services/azure_pricing_service.py index e81a495f..5c98384e 100644 --- a/src/infra/infrastructure/services/azure_pricing_service.py +++ b/src/infra/infrastructure/services/azure_pricing_service.py @@ -12,14 +12,20 @@ class AzurePricingService(IAzurePricingService): Hard-coded Azure Norway East pricing (USD) as of 2026. All rates are per-second or per-GB unless noted. - All resources (ACI, Blob Storage, PostgreSQL) are deployed in the same - region (Norway East) and under the same tenant/subscription. Intra-region, - intra-tenant data transfers are free, so all network egress rates are $0.00. + ACI, Blob Storage, and PostgreSQL are deployed in Norway East under the + same tenant/subscription. Intra-region, intra-tenant data transfers are + free, so their network egress rates are $0.00. + + Databricks is deployed in Sweden Central (Norway East does not offer + Databricks). When Databricks executors read GeoParquet from Blob Storage + in Norway East, the transfer is inter-region egress billed on the blob + storage side at the intra-Europe cross-region rate (~$0.02/GB). + See: https://azure.microsoft.com/pricing/details/bandwidth/ + Section "Intra-continental data transfer" → Zone 1 ↔ Zone 1. If resources are later moved to different regions, update the egress rates to the applicable Azure Bandwidth tier. Norway East is in Zone 1. Zone 1 internet egress (Premium Global Network): $0.087/GB (first 10 TB/month). - See: https://azure.microsoft.com/pricing/details/bandwidth/ Sources: ACI compute: https://azure.microsoft.com/pricing/details/container-instances/ @@ -54,6 +60,7 @@ class AzurePricingService(IAzurePricingService): __BLOB_LIST_PER_10K: float = 0.065 # $0.065 per 10 000 list ops __BLOB_INGRESS_PER_GB: float = 0.0 # Free inbound (always free on Azure) __BLOB_EGRESS_PER_GB: float = 0.0 # Free — intra-region, same tenant + __BLOB_CROSS_REGION_EGRESS_PER_GB: float = 0.02 # Intra-Europe cross-region (Zone 1 ↔ Zone 1) # ------------------------------------------------------------------ # Azure Databricks — Sweden Central, Standard tier, Jobs Compute @@ -104,6 +111,7 @@ def get_blob_storage_pricing(self) -> BlobStoragePricing: storage_gb_per_month=self.__BLOB_STORAGE_GB_PER_MONTH, ingress_per_gb=self.__BLOB_INGRESS_PER_GB, egress_per_gb=self.__BLOB_EGRESS_PER_GB, + cross_region_egress_per_gb=self.__BLOB_CROSS_REGION_EGRESS_PER_GB, ) def get_databricks_pricing(self) -> DatabricksPricing: diff --git a/src/presentation/entrypoints/_databricks_benchmark_runner.py b/src/presentation/entrypoints/_databricks_benchmark_runner.py index e96a8268..ee5aa7aa 100644 --- a/src/presentation/entrypoints/_databricks_benchmark_runner.py +++ b/src/presentation/entrypoints/_databricks_benchmark_runner.py @@ -62,7 +62,11 @@ def _build_benchmark_fn( query_id=query_id, benchmark_iteration=BenchmarkIteration.NATIONAL_SCALE_SPATIAL_JOIN, cost_configuration=CostConfiguration( - include_aci=True, include_databricks=True, num_workers=num_workers + include_aci=True, + include_databricks=True, + include_blob_storage=True, + num_workers=num_workers, + is_cross_region_blob=True, ), skip_warmup=False, elapsed_from_result=True,