From fe0eb403f48707d8e0e7053cc93bf4d53983d417 Mon Sep 17 00:00:00 2001 From: aguest-kc Date: Thu, 11 Dec 2025 08:41:04 -0600 Subject: [PATCH 01/59] [DEV-14145] Use isinstance() instead of type is --- .../download/delta_downloads/object_class_program_activity.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/usaspending_api/download/delta_downloads/object_class_program_activity.py b/usaspending_api/download/delta_downloads/object_class_program_activity.py index 00636bbdde..3ae6cf38aa 100644 --- a/usaspending_api/download/delta_downloads/object_class_program_activity.py +++ b/usaspending_api/download/delta_downloads/object_class_program_activity.py @@ -32,7 +32,7 @@ class ObjectClassProgramActivityMixin: def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - if type(self.spark) is DuckDBSparkSession: + if isinstance(self.spark, DuckDBSparkSession): from duckdb.experimental.spark.sql import functions else: from pyspark.sql import functions From 2a226bd5826c6447273bd6e1971543602083d62a Mon Sep 17 00:00:00 2001 From: aguest-kc Date: Thu, 11 Dec 2025 14:10:01 -0600 Subject: [PATCH 02/59] [DEV-14145] Add DuckDB for file A --- .../delta_downloads/account_balances.py | 323 ++++++++++-------- 1 file changed, 178 insertions(+), 145 deletions(-) diff --git a/usaspending_api/download/delta_downloads/account_balances.py b/usaspending_api/download/delta_downloads/account_balances.py index da07f58611..3170df19dd 100644 --- a/usaspending_api/download/delta_downloads/account_balances.py +++ b/usaspending_api/download/delta_downloads/account_balances.py @@ -1,7 +1,10 @@ -from pyspark.sql import functions as sf, Column, DataFrame, SparkSession -from usaspending_api.config import CONFIG +from duckdb.experimental.spark.sql import SparkSession as DuckDBSparkSession +from duckdb.experimental.spark.sql.column import Column as DuckDBSparkColumn +from duckdb.experimental.spark.sql.dataframe import DataFrame as DuckDBSparkDataFrame +from pyspark.sql import 
Column, DataFrame, SparkSession from usaspending_api.common.spark.utils import collect_concat +from usaspending_api.config import CONFIG from usaspending_api.download.delta_downloads.abstract_downloads.account_download import ( AbstractAccountDownload, AccountLevel, @@ -18,27 +21,39 @@ class AccountBalancesMixin: """Shared code between concrete implementations of the AbstractAccountDownload""" - spark: SparkSession + spark: SparkSession | DuckDBSparkSession filters: AccountDownloadFilters - dynamic_filters: Column + dynamic_filters: Column | DuckDBSparkColumn group_by_cols: list[str] - agg_cols: list[Column] - select_cols: list[Column] + agg_cols: list[Column | DuckDBSparkColumn] + select_cols: list[Column | DuckDBSparkColumn] + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if isinstance(self.spark, DuckDBSparkSession): + from duckdb.experimental.spark.sql import functions + else: + from pyspark.sql import functions + + self.sf = functions @property - def download_table(self) -> DataFrame: - # TODO: This should be reverted back after Spark downloads are migrated to EMR - # return self.spark.table("rpt.account_balances_download") - return self.spark.read.format("delta").load( - f"s3a://{CONFIG.SPARK_S3_BUCKET}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/account_balances_download" - ) + def download_table(self) -> DataFrame | DuckDBSparkDataFrame: + if isinstance(self.spark, DuckDBSparkSession): + return self.spark.table("rpt.account_balances_download") + else: + # TODO: This should be reverted back after Spark downloads are migrated to EMR + return self.spark.read.format("delta").load( + f"s3a://{CONFIG.SPARK_S3_BUCKET}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/account_balances_download" + ) - def _build_dataframe(self) -> DataFrame: + def _build_dataframe(self) -> DataFrame | DuckDBSparkDataFrame: return ( self.download_table.filter( - sf.col("submission_id").isin( + self.sf.col("submission_id").isin( get_submission_ids_for_periods( 
self.filters.reporting_fiscal_year, self.filters.reporting_fiscal_quarter, @@ -54,7 +69,6 @@ def _build_dataframe(self) -> DataFrame: class FederalAccountDownload(AccountBalancesMixin, AbstractAccountDownload): - @property def account_level(self) -> AccountLevel: return AccountLevel.FEDERAL_ACCOUNT @@ -68,81 +82,92 @@ def group_by_cols(self) -> list[str]: return ["federal_account_symbol", "owning_agency_name", "federal_account_name", "submission_period"] @property - def agg_cols(self) -> list[Column]: + def agg_cols(self) -> list[Column | DuckDBSparkColumn]: return [ collect_concat("reporting_agency_name", spark=self.spark), collect_concat("agency_identifier_name", spark=self.spark), collect_concat("budget_function", spark=self.spark), collect_concat("budget_subfunction", spark=self.spark), - sf.sum(sf.col("budget_authority_unobligated_balance_brought_forward")).alias( + self.sf.sum(self.sf.col("budget_authority_unobligated_balance_brought_forward")).alias( "budget_authority_unobligated_balance_brought_forward" ), - sf.sum(sf.col("adjustments_to_unobligated_balance_brought_forward_cpe")).alias( + self.sf.sum(self.sf.col("adjustments_to_unobligated_balance_brought_forward_cpe")).alias( "adjustments_to_unobligated_balance_brought_forward_cpe" ), - sf.sum(sf.col("budget_authority_appropriated_amount")).alias("budget_authority_appropriated_amount"), - sf.sum(sf.col("borrowing_authority_amount")).alias("borrowing_authority_amount"), - sf.sum(sf.col("contract_authority_amount")).alias("contract_authority_amount"), - sf.sum(sf.col("spending_authority_from_offsetting_collections_amount")).alias( + self.sf.sum(self.sf.col("budget_authority_appropriated_amount")).alias( + "budget_authority_appropriated_amount" + ), + self.sf.sum(self.sf.col("borrowing_authority_amount")).alias("borrowing_authority_amount"), + self.sf.sum(self.sf.col("contract_authority_amount")).alias("contract_authority_amount"), + 
self.sf.sum(self.sf.col("spending_authority_from_offsetting_collections_amount")).alias( "spending_authority_from_offsetting_collections_amount" ), - sf.sum(sf.col("total_other_budgetary_resources_amount")).alias("total_other_budgetary_resources_amount"), - sf.sum(sf.col("total_budgetary_resources")).alias("total_budgetary_resources"), - sf.sum(sf.col("obligations_incurred")).alias("obligations_incurred"), - sf.sum(sf.col("deobligations_or_recoveries_or_refunds_from_prior_year")).alias( + self.sf.sum(self.sf.col("total_other_budgetary_resources_amount")).alias( + "total_other_budgetary_resources_amount" + ), + self.sf.sum(self.sf.col("total_budgetary_resources")).alias("total_budgetary_resources"), + self.sf.sum(self.sf.col("obligations_incurred")).alias("obligations_incurred"), + self.sf.sum(self.sf.col("deobligations_or_recoveries_or_refunds_from_prior_year")).alias( "deobligations_or_recoveries_or_refunds_from_prior_year" ), - sf.sum(sf.col("unobligated_balance")).alias("unobligated_balance"), - sf.sum( - sf.when( + self.sf.sum(self.sf.col("unobligated_balance")).alias("unobligated_balance"), + self.sf.sum( + self.sf.when( ( ( - sf.col("quarter_format_flag") - & (sf.col("reporting_fiscal_quarter") == self.filters.reporting_fiscal_quarter) + self.sf.col("quarter_format_flag") + & (self.sf.col("reporting_fiscal_quarter") == self.filters.reporting_fiscal_quarter) ) | ( - ~sf.col("quarter_format_flag") - & (sf.col("reporting_fiscal_period") == self.filters.reporting_fiscal_period) + ~self.sf.col("quarter_format_flag") + & (self.sf.col("reporting_fiscal_period") == self.filters.reporting_fiscal_period) ) ) - & (sf.col("reporting_fiscal_year") == self.filters.reporting_fiscal_year), - sf.col("gross_outlay_amount"), + & (self.sf.col("reporting_fiscal_year") == self.filters.reporting_fiscal_year), + self.sf.col("gross_outlay_amount"), ).otherwise(0) ).alias("gross_outlay_amount"), - 
sf.sum(sf.col("status_of_budgetary_resources_total")).alias("status_of_budgetary_resources_total"), - sf.max(sf.date_format("last_modified_date", "yyyy-MM-dd")).alias("last_modified_date"), + self.sf.sum(self.sf.col("status_of_budgetary_resources_total")).alias( + "status_of_budgetary_resources_total" + ), + ( + self.sf.max(self.sf.call_function("strptime", "last_modified_date", "yyyy-MM-dd")).alias( + "max_last_modified_date" + ) + if isinstance(self.spark, DuckDBSparkSession) + else self.sf.max(self.sf.date_format("last_modified_date", "yyyy-MM-dd")).alias("last_modified_date"), + ), ] @property def select_cols(self) -> list[Column]: return [ - sf.col("owning_agency_name"), - sf.col("reporting_agency_name"), - sf.col("submission_period"), - sf.col("federal_account_symbol"), - sf.col("federal_account_name"), - sf.col("agency_identifier_name"), - sf.col("budget_function"), - sf.col("budget_subfunction"), - sf.col("budget_authority_unobligated_balance_brought_forward"), - sf.col("adjustments_to_unobligated_balance_brought_forward_cpe"), - sf.col("budget_authority_appropriated_amount"), - sf.col("borrowing_authority_amount"), - sf.col("contract_authority_amount"), - sf.col("spending_authority_from_offsetting_collections_amount"), - sf.col("total_other_budgetary_resources_amount"), - sf.col("total_budgetary_resources"), - sf.col("obligations_incurred"), - sf.col("deobligations_or_recoveries_or_refunds_from_prior_year"), - sf.col("unobligated_balance"), - sf.col("gross_outlay_amount"), - sf.col("status_of_budgetary_resources_total"), - sf.col("last_modified_date"), + self.sf.col("owning_agency_name"), + self.sf.col("reporting_agency_name"), + self.sf.col("submission_period"), + self.sf.col("federal_account_symbol"), + self.sf.col("federal_account_name"), + self.sf.col("agency_identifier_name"), + self.sf.col("budget_function"), + self.sf.col("budget_subfunction"), + self.sf.col("budget_authority_unobligated_balance_brought_forward"), + 
self.sf.col("adjustments_to_unobligated_balance_brought_forward_cpe"), + self.sf.col("budget_authority_appropriated_amount"), + self.sf.col("borrowing_authority_amount"), + self.sf.col("contract_authority_amount"), + self.sf.col("spending_authority_from_offsetting_collections_amount"), + self.sf.col("total_other_budgetary_resources_amount"), + self.sf.col("total_budgetary_resources"), + self.sf.col("obligations_incurred"), + self.sf.col("deobligations_or_recoveries_or_refunds_from_prior_year"), + self.sf.col("unobligated_balance"), + self.sf.col("gross_outlay_amount"), + self.sf.col("status_of_budgetary_resources_total"), + self.sf.col("last_modified_date"), ] class TreasuryAccountDownload(AccountBalancesMixin, AbstractAccountDownload): - @property def account_level(self) -> AccountLevel: return AccountLevel.TREASURY_ACCOUNT @@ -152,98 +177,106 @@ def submission_type(self) -> SubmissionType: return SubmissionType.ACCOUNT_BALANCES @property - def group_by_cols(self) -> list[Column]: + def group_by_cols(self) -> list[Column | DuckDBSparkColumn]: return [ - sf.col("data_source"), - sf.col("appropriation_account_balances_id"), - sf.col("budget_authority_unobligated_balance_brought_forward"), - sf.col("adjustments_to_unobligated_balance_brought_forward_cpe"), - sf.col("budget_authority_appropriated_amount"), - sf.col("borrowing_authority_amount"), - sf.col("contract_authority_amount"), - sf.col("spending_authority_from_offsetting_collections_amount"), - sf.col("total_other_budgetary_resources_amount"), - sf.col("total_budgetary_resources"), - sf.col("gross_outlay_amount"), - sf.col("deobligations_or_recoveries_or_refunds_from_prior_year"), - sf.col("unobligated_balance"), - sf.col("status_of_budgetary_resources_total"), - sf.col("obligations_incurred"), - sf.col("drv_appropriation_availability_period_start_date"), - sf.col("drv_appropriation_availability_period_end_date"), - sf.col("drv_appropriation_account_expired_status"), - sf.col("drv_obligations_unpaid_amount"), - 
sf.col("drv_other_obligated_amount"), - sf.col("reporting_period_start"), - sf.col("reporting_period_end"), - sf.col("appropriation_account_last_modified"), - sf.col("certified_date"), - sf.col("create_date"), - sf.col("update_date"), - sf.col("final_of_fy"), - sf.col("submission_id"), - sf.col("treasury_account_identifier"), - sf.col("owning_agency_name"), - sf.col("reporting_agency_name"), - sf.col("allocation_transfer_agency_identifier_code"), - sf.col("agency_identifier_code"), - sf.col("beginning_period_of_availability"), - sf.col("ending_period_of_availability"), - sf.col("availability_type_code"), - sf.col("main_account_code"), - sf.col("sub_account_code"), - sf.col("treasury_account_symbol"), - sf.col("treasury_account_name"), - sf.col("budget_function"), - sf.col("budget_subfunction"), - sf.col("federal_account_symbol"), - sf.col("federal_account_name"), - sf.col("agency_identifier_name"), - sf.col("allocation_transfer_agency_identifier_name"), - sf.col("submission_period"), + self.sf.col("data_source"), + self.sf.col("appropriation_account_balances_id"), + self.sf.col("budget_authority_unobligated_balance_brought_forward"), + self.sf.col("adjustments_to_unobligated_balance_brought_forward_cpe"), + self.sf.col("budget_authority_appropriated_amount"), + self.sf.col("borrowing_authority_amount"), + self.sf.col("contract_authority_amount"), + self.sf.col("spending_authority_from_offsetting_collections_amount"), + self.sf.col("total_other_budgetary_resources_amount"), + self.sf.col("total_budgetary_resources"), + self.sf.col("gross_outlay_amount"), + self.sf.col("deobligations_or_recoveries_or_refunds_from_prior_year"), + self.sf.col("unobligated_balance"), + self.sf.col("status_of_budgetary_resources_total"), + self.sf.col("obligations_incurred"), + self.sf.col("drv_appropriation_availability_period_start_date"), + self.sf.col("drv_appropriation_availability_period_end_date"), + self.sf.col("drv_appropriation_account_expired_status"), + 
self.sf.col("drv_obligations_unpaid_amount"), + self.sf.col("drv_other_obligated_amount"), + self.sf.col("reporting_period_start"), + self.sf.col("reporting_period_end"), + self.sf.col("appropriation_account_last_modified"), + self.sf.col("certified_date"), + self.sf.col("create_date"), + self.sf.col("update_date"), + self.sf.col("final_of_fy"), + self.sf.col("submission_id"), + self.sf.col("treasury_account_identifier"), + self.sf.col("owning_agency_name"), + self.sf.col("reporting_agency_name"), + self.sf.col("allocation_transfer_agency_identifier_code"), + self.sf.col("agency_identifier_code"), + self.sf.col("beginning_period_of_availability"), + self.sf.col("ending_period_of_availability"), + self.sf.col("availability_type_code"), + self.sf.col("main_account_code"), + self.sf.col("sub_account_code"), + self.sf.col("treasury_account_symbol"), + self.sf.col("treasury_account_name"), + self.sf.col("budget_function"), + self.sf.col("budget_subfunction"), + self.sf.col("federal_account_symbol"), + self.sf.col("federal_account_name"), + self.sf.col("agency_identifier_name"), + self.sf.col("allocation_transfer_agency_identifier_name"), + self.sf.col("submission_period"), ] @property - def agg_cols(self) -> list[Column]: - return [ - sf.max(sf.date_format("last_modified_date", "yyyy-MM-dd")).alias("max_last_modified_date"), - ] + def agg_cols(self) -> list[Column | DuckDBSparkColumn]: + if isinstance(self.spark, DuckDBSparkSession): + # DuckDB's Spark implementation doesn't include the `date_format()` function so we have to use Python's `strptime` + return [ + self.sf.max(self.sf.call_function("strptime", "last_modified_date", "yyyy-MM-dd")).alias( + "max_last_modified_date" + ) + ] + else: + return [ + self.sf.max(self.sf.date_format("last_modified_date", "yyyy-MM-dd")).alias("max_last_modified_date"), + ] @property - def select_cols(self) -> list[Column]: + def select_cols(self) -> list[Column | DuckDBSparkColumn]: return [ - sf.col("owning_agency_name"), - 
sf.col("reporting_agency_name"), - sf.col("submission_period"), - sf.col("allocation_transfer_agency_identifier_code"), - sf.col("agency_identifier_code"), - sf.col("beginning_period_of_availability"), - sf.col("ending_period_of_availability"), - sf.col("availability_type_code"), - sf.col("main_account_code"), - sf.col("sub_account_code"), - sf.col("treasury_account_symbol"), - sf.col("treasury_account_name"), - sf.col("agency_identifier_name"), - sf.col("allocation_transfer_agency_identifier_name"), - sf.col("budget_function"), - sf.col("budget_subfunction"), - sf.col("federal_account_symbol"), - sf.col("federal_account_name"), - sf.col("budget_authority_unobligated_balance_brought_forward"), - sf.col("adjustments_to_unobligated_balance_brought_forward_cpe"), - sf.col("budget_authority_appropriated_amount"), - sf.col("borrowing_authority_amount"), - sf.col("contract_authority_amount"), - sf.col("spending_authority_from_offsetting_collections_amount"), - sf.col("total_other_budgetary_resources_amount"), - sf.col("total_budgetary_resources"), - sf.col("obligations_incurred"), - sf.col("deobligations_or_recoveries_or_refunds_from_prior_year"), - sf.col("unobligated_balance"), - sf.col("gross_outlay_amount"), - sf.col("status_of_budgetary_resources_total"), - sf.col("max_last_modified_date").alias("last_modified_date"), + self.sf.col("owning_agency_name"), + self.sf.col("reporting_agency_name"), + self.sf.col("submission_period"), + self.sf.col("allocation_transfer_agency_identifier_code"), + self.sf.col("agency_identifier_code"), + self.sf.col("beginning_period_of_availability"), + self.sf.col("ending_period_of_availability"), + self.sf.col("availability_type_code"), + self.sf.col("main_account_code"), + self.sf.col("sub_account_code"), + self.sf.col("treasury_account_symbol"), + self.sf.col("treasury_account_name"), + self.sf.col("agency_identifier_name"), + self.sf.col("allocation_transfer_agency_identifier_name"), + self.sf.col("budget_function"), + 
self.sf.col("budget_subfunction"), + self.sf.col("federal_account_symbol"), + self.sf.col("federal_account_name"), + self.sf.col("budget_authority_unobligated_balance_brought_forward"), + self.sf.col("adjustments_to_unobligated_balance_brought_forward_cpe"), + self.sf.col("budget_authority_appropriated_amount"), + self.sf.col("borrowing_authority_amount"), + self.sf.col("contract_authority_amount"), + self.sf.col("spending_authority_from_offsetting_collections_amount"), + self.sf.col("total_other_budgetary_resources_amount"), + self.sf.col("total_budgetary_resources"), + self.sf.col("obligations_incurred"), + self.sf.col("deobligations_or_recoveries_or_refunds_from_prior_year"), + self.sf.col("unobligated_balance"), + self.sf.col("gross_outlay_amount"), + self.sf.col("status_of_budgetary_resources_total"), + self.sf.col("max_last_modified_date").alias("last_modified_date"), ] From 920cbc95d59c8d0b8ca634baa484bebaf0a93458 Mon Sep 17 00:00:00 2001 From: aguest-kc Date: Tue, 16 Dec 2025 14:17:44 -0600 Subject: [PATCH 03/59] [DEV-14145] black fixes --- .../download/delta_downloads/account_balances.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/usaspending_api/download/delta_downloads/account_balances.py b/usaspending_api/download/delta_downloads/account_balances.py index 3170df19dd..cec64a6ced 100644 --- a/usaspending_api/download/delta_downloads/account_balances.py +++ b/usaspending_api/download/delta_downloads/account_balances.py @@ -131,11 +131,15 @@ def agg_cols(self) -> list[Column | DuckDBSparkColumn]: "status_of_budgetary_resources_total" ), ( - self.sf.max(self.sf.call_function("strptime", "last_modified_date", "yyyy-MM-dd")).alias( - "max_last_modified_date" - ) - if isinstance(self.spark, DuckDBSparkSession) - else self.sf.max(self.sf.date_format("last_modified_date", "yyyy-MM-dd")).alias("last_modified_date"), + ( + self.sf.max(self.sf.call_function("strptime", "last_modified_date", "yyyy-MM-dd")).alias( + 
"max_last_modified_date" + ) + if isinstance(self.spark, DuckDBSparkSession) + else self.sf.max(self.sf.date_format("last_modified_date", "yyyy-MM-dd")).alias( + "last_modified_date" + ) + ), ), ] From 34190b7cae194362945a3a35b8592f5def2ddbb9 Mon Sep 17 00:00:00 2001 From: Seth Stoudenmier Date: Wed, 31 Dec 2025 09:33:18 -0500 Subject: [PATCH 04/59] [DEV-14094] initial work for EMR downloads --- .../commands/download_sqs_worker.py | 22 ++++++++++++++++--- usaspending_api/settings.py | 9 ++++++++ 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/usaspending_api/download/management/commands/download_sqs_worker.py b/usaspending_api/download/management/commands/download_sqs_worker.py index 879cea7688..a8261897c3 100644 --- a/usaspending_api/download/management/commands/download_sqs_worker.py +++ b/usaspending_api/download/management/commands/download_sqs_worker.py @@ -5,6 +5,8 @@ import traceback from typing import Callable +import boto3 + # Third-party library imports from opentelemetry.trace import SpanKind, Status, StatusCode @@ -14,7 +16,7 @@ # Application imports from usaspending_api.common.logging import configure_logging -from usaspending_api.common.spark.jobs import SparkJobs, LocalStrategy, DatabricksStrategy +from usaspending_api.common.spark.jobs import SparkJobs, LocalStrategy, EmrServerlessStrategy from usaspending_api.common.sqs.sqs_handler import DownloadLogic, get_sqs_queue from usaspending_api.common.sqs.sqs_job_logging import log_job_message from usaspending_api.common.sqs.sqs_work_dispatcher import ( @@ -165,9 +167,23 @@ def _run_spark_download(download_job_id: int, job_name: str) -> None: command_options = [f"--skip-local-cleanup"] extra_options = {"run_as_container": True} else: - strategy = DatabricksStrategy() + strategy = EmrServerlessStrategy() command_options = [] - extra_options = {} + + ssm_client = boto3.client("ssm", settings.USASPENDING_AWS_REGION) + param_resp = ssm_client.get_parameters( + 
Names=[settings.EMR_DOWNLOAD_APP_PARAM_NAME, settings.EMR_DOWNLOAD_ROLE_PARAM_NAME], WithDecryption=True + ) + if param_resp.get("InvalidParameters"): + logger.error(f"Invalid parameters: {param_resp['InvalidParameters']}") + raise ValueError("Invalid parameters") + param_values = {param["Name"]: param["Value"] for param in param_resp["Parameters"]} + + extra_options = { + "application_id": param_values[settings.EMR_DOWNLOAD_APP_PARAM_NAME], + "execution_role_arn": param_values[settings.EMR_DOWNLOAD_ROLE_PARAM_NAME], + } + spark_jobs = SparkJobs(strategy) spark_jobs.start( job_name=job_name, diff --git a/usaspending_api/settings.py b/usaspending_api/settings.py index 52a231f915..59a4733d6c 100644 --- a/usaspending_api/settings.py +++ b/usaspending_api/settings.py @@ -91,6 +91,15 @@ BROKER_AGENCY_BUCKET_NAME = "" UNLINKED_AWARDS_DOWNLOAD_REDIRECT_DIR = "unlinked_awards_downloads" +# AWS parameter store key names +EMR_DOWNLOAD_APP_PARAM_NAME = "" +if not EMR_DOWNLOAD_APP_PARAM_NAME: + EMR_DOWNLOAD_APP_PARAM_NAME = os.environ.get("EMR_DOWNLOAD_APP_PARAM_NAME") + +EMR_DOWNLOAD_ROLE_PARAM_NAME = "" +if not EMR_DOWNLOAD_ROLE_PARAM_NAME: + EMR_DOWNLOAD_ROLE_PARAM_NAME = os.environ.get("EMR_DOWNLOAD_ROLE_PARAM_NAME") + # This list contains any abnormal characters in agency names # This list is important to track which characters we need to replace in # the agency name before the name can be used in a file name From 5f49a1d35ac0b9ca898c5513a0b40d3cc5808744 Mon Sep 17 00:00:00 2001 From: Seth Stoudenmier Date: Fri, 2 Jan 2026 15:00:37 -0500 Subject: [PATCH 05/59] [DEV-14094] set retry policy to 2 --- usaspending_api/common/spark/jobs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/usaspending_api/common/spark/jobs.py b/usaspending_api/common/spark/jobs.py index fe397c2667..14e7b8c6e2 100644 --- a/usaspending_api/common/spark/jobs.py +++ b/usaspending_api/common/spark/jobs.py @@ -169,6 +169,7 @@ def handle_start(self, job_name: str, command_name: str, command_options: 
list[s "entryPointArguments": command_options, } }, + retryPolicy={"maxAttempts": 2}, ) return response From 766ffb0265b3ba5e62b85a8388a585a2f8858d79 Mon Sep 17 00:00:00 2001 From: Lorelei Trimberger Date: Mon, 5 Jan 2026 11:16:07 -0600 Subject: [PATCH 06/59] [DEV-14110] initial_report_date and last_modified_date changed to DateTimeField --- ...ransactionnormalized_last_modified_date.py | 15 ++ .../awards/models/transaction_normalized.py | 2 +- .../dataframes/transaction_search.py | 4 +- ...tionsearch_initial_report_date_and_more.py | 166 ++++++++++++++++++ .../search/models/transaction_search.py | 4 +- .../delta_models/transaction_search.py | 4 +- 6 files changed, 188 insertions(+), 7 deletions(-) create mode 100644 usaspending_api/awards/migrations/0115_alter_transactionnormalized_last_modified_date.py create mode 100644 usaspending_api/search/migrations/0059_alter_transactionsearch_initial_report_date_and_more.py diff --git a/usaspending_api/awards/migrations/0115_alter_transactionnormalized_last_modified_date.py b/usaspending_api/awards/migrations/0115_alter_transactionnormalized_last_modified_date.py new file mode 100644 index 0000000000..63cac462c4 --- /dev/null +++ b/usaspending_api/awards/migrations/0115_alter_transactionnormalized_last_modified_date.py @@ -0,0 +1,15 @@ +from django.db import migrations, models + +class Migration(migrations.Migration): + + dependencies = [ + ("awards", "0114_alter_ctodlinkageupdates_award_id"), + ] + + operations = [ + migrations.AlterField( + model_name="transactionnormalized", + name="last_modified_date", + field=models.DateTimeField(null=True), + ), + ] \ No newline at end of file diff --git a/usaspending_api/awards/models/transaction_normalized.py b/usaspending_api/awards/models/transaction_normalized.py index 7ab1180ebf..b98a516667 100644 --- a/usaspending_api/awards/models/transaction_normalized.py +++ b/usaspending_api/awards/models/transaction_normalized.py @@ -91,7 +91,7 @@ class 
TransactionNormalized(models.Model): help_text="The agency which is funding this transaction", ) description = models.TextField(null=True, help_text="The description of this transaction") - last_modified_date = models.DateField( + last_modified_date = models.DateTimeField( blank=True, null=True, help_text="The date this transaction was last modified" ) certified_date = models.DateField(blank=True, null=True, help_text="The date this transaction was certified") diff --git a/usaspending_api/search/delta_models/dataframes/transaction_search.py b/usaspending_api/search/delta_models/dataframes/transaction_search.py index c290183f47..bc222f3df7 100644 --- a/usaspending_api/search/delta_models/dataframes/transaction_search.py +++ b/usaspending_api/search/delta_models/dataframes/transaction_search.py @@ -75,7 +75,7 @@ def date_cols(self) -> list[Column]: return [ sf.to_date(self.transaction_normalized.action_date).alias("action_date"), sf.add_months(sf.to_date(self.transaction_normalized.action_date), 3).alias("fiscal_action_date"), - sf.to_date(self.transaction_normalized.last_modified_date).alias("last_modified_date"), + self.transaction_normalized.last_modified_date, self.transaction_normalized.fiscal_year, self.awards.certified_date.alias("award_certified_date"), sf.year(sf.add_months(sf.to_date(self.awards.certified_date), 3)).alias("award_fiscal_year"), @@ -94,7 +94,7 @@ def date_cols(self) -> list[Column]: ), sf.coalesce( sf.to_date(self.transaction_fabs.created_at), - sf.to_date(self.transaction_fpds.initial_report_date), + self.transaction_fpds.initial_report_date, ).alias("initial_report_date"), ] diff --git a/usaspending_api/search/migrations/0059_alter_transactionsearch_initial_report_date_and_more.py b/usaspending_api/search/migrations/0059_alter_transactionsearch_initial_report_date_and_more.py new file mode 100644 index 0000000000..07fe699bd6 --- /dev/null +++ b/usaspending_api/search/migrations/0059_alter_transactionsearch_initial_report_date_and_more.py @@ 
-0,0 +1,166 @@ +# Generated by Django 4.2.23 on 2026-01-02 16:42 + +from django.db import migrations, models +from usaspending_api.awards.models.transaction_normalized import vw_transaction_normalized_sql +from usaspending_api.awards.models.transaction_fpds import vw_transaction_fpds_sql +from usaspending_api.awards.models.transaction_fabs import vw_transaction_fabs_sql + +class Migration(migrations.Migration): + + dependencies = [ + ("search", "0058_add_transaction_count_field"), + ] + + operations = [ + # Without dropping these tables, it caused the error cannot alter type of a column used by a view or rule + migrations.RunSQL( + sql="""DROP VIEW IF EXISTS + vw_transaction_fabs, + vw_transaction_normalized, + vw_transaction_fpds, + transaction_delta_view + """, + reverse_sql=f"""{vw_transaction_normalized_sql} + {vw_transaction_fpds_sql} + {vw_transaction_fabs_sql} + CREATE VIEW transaction_delta_view AS SELECT + "transaction_id", + "award_id", + "modification_number", + "detached_award_proc_unique", + "afa_generated_unique", + "generated_unique_award_id", + "piid", + "fain", + "uri", + CASE + WHEN "detached_award_proc_unique" IS NOT NULL THEN 'CONT_TX_' || "detached_award_proc_unique" + WHEN "afa_generated_unique" IS NOT NULL THEN 'ASST_TX_' || "afa_generated_unique" + ELSE NULL + END AS generated_unique_transaction_id, + CASE + WHEN "type" IN ('02', '03', '04', '05', '06', '10', '07', '08', '09', '11') AND "fain" IS NOT NULL THEN "fain" + WHEN "piid" IS NOT NULL THEN "piid" -- contracts. 
Did it this way to easily handle IDV contracts + ELSE "uri" + END AS display_award_id, + "action_date", + "fiscal_action_date", + "last_modified_date", + "fiscal_year", + "award_certified_date", + "award_fiscal_year", + "award_date_signed", + "update_date", + "award_update_date", + "etl_update_date", + "period_of_performance_start_date", + "period_of_performance_current_end_date", + "ordering_period_end_date", + "type_raw", + "type_description_raw", + "type", + "type_description", + "award_category", + "transaction_description", + "award_amount", + "generated_pragmatic_obligation", + "federal_action_obligation", + "original_loan_subsidy_cost", + "face_value_loan_guarantee", + "business_categories", + "naics_code", + "naics_description", + "product_or_service_code", + "product_or_service_description", + "type_of_contract_pricing", + "type_set_aside", + "extent_competed", + "cfda_number", + "cfda_title", + "pop_country_name", + "pop_country_code", + "pop_state_name", + "pop_state_code", + "pop_state_fips", + "pop_state_population", + "pop_county_code", + "pop_county_name", + "pop_county_population", + "pop_zip5", + "place_of_perform_zip_last4", + "pop_congressional_code", + "pop_congressional_population", + "pop_congressional_code_current", + "pop_city_name", + "pop_county_fips", + "recipient_location_country_code", + "recipient_location_country_name", + "recipient_location_state_name", + "recipient_location_state_code", + "recipient_location_state_fips", + "recipient_location_state_population", + "recipient_location_county_code", + "recipient_location_county_name", + "recipient_location_county_population", + "recipient_location_congressional_code", + "recipient_location_congressional_population", + "recipient_location_congressional_code_current", + "recipient_location_zip5", + "recipient_location_city_name", + "recipient_location_county_fips", + "action_type", + "legal_entity_address_line1", + "legal_entity_address_line2", + "legal_entity_address_line3", + 
"legal_entity_foreign_posta", + "legal_entity_foreign_provi", + "legal_entity_zip_last4", + "recipient_hash", + "recipient_name", + "recipient_levels", + "recipient_unique_id", + "parent_recipient_hash", + "parent_recipient_name", + "parent_recipient_unique_id", + "recipient_uei", + "parent_uei", + "awarding_agency_id", + "funding_agency_id", + "awarding_toptier_agency_id", + "funding_toptier_agency_id", + "awarding_agency_code", + "awarding_toptier_agency_name", + "funding_agency_code", + "funding_toptier_agency_name", + "awarding_sub_tier_agency_c", + "awarding_subtier_agency_name", + "funding_sub_tier_agency_co", + "funding_subtier_agency_name", + "awarding_office_code", + "awarding_office_name", + "funding_office_code", + "funding_office_name", + "awarding_toptier_agency_abbreviation", + "funding_toptier_agency_abbreviation", + "awarding_subtier_agency_abbreviation", + "funding_subtier_agency_abbreviation", + "tas_paths", + "tas_components", + CAST("federal_accounts" AS VARCHAR(65535)) AS federal_accounts, + "disaster_emergency_fund_codes", + CAST("program_activities" AS VARCHAR(65535)) AS program_activities + FROM "transaction_search" + WHERE "action_date" >= '2007-10-01'; + """, + ), + migrations.AlterField( + model_name="transactionsearch", + name="initial_report_date", + field=models.DateTimeField(null=True), + ), + migrations.AlterField( + model_name="transactionsearch", + name="last_modified_date", + field=models.DateTimeField(null=True), + ), + ] diff --git a/usaspending_api/search/models/transaction_search.py b/usaspending_api/search/models/transaction_search.py index 7b97920e76..33a110700d 100644 --- a/usaspending_api/search/models/transaction_search.py +++ b/usaspending_api/search/models/transaction_search.py @@ -28,7 +28,7 @@ class TransactionSearch(models.Model): # Dates action_date = models.DateField(null=True) fiscal_action_date = models.DateField(null=True) - last_modified_date = models.DateField(null=True) + last_modified_date = 
models.DateTimeField(null=True) fiscal_year = models.IntegerField(null=True) award_certified_date = models.DateField(null=True) award_fiscal_year = models.IntegerField(null=True) @@ -39,7 +39,7 @@ class TransactionSearch(models.Model): etl_update_date = models.DateTimeField(null=True) period_of_performance_start_date = models.DateField(null=True) period_of_performance_current_end_date = models.DateField(null=True) - initial_report_date = models.DateField(null=True) + initial_report_date = models.DateTimeField(null=True) # Agencies awarding_agency_code = models.TextField(null=True) diff --git a/usaspending_api/transactions/delta_models/transaction_search.py b/usaspending_api/transactions/delta_models/transaction_search.py index b4960aa142..52a878b62b 100644 --- a/usaspending_api/transactions/delta_models/transaction_search.py +++ b/usaspending_api/transactions/delta_models/transaction_search.py @@ -12,7 +12,7 @@ # Dates "action_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, "fiscal_action_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, - "last_modified_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, + "last_modified_date": {"delta": "TIMESTAMP", "postgres": "TIMESTAMP", "gold": False}, "fiscal_year": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, "award_certified_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, "award_fiscal_year": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, @@ -23,7 +23,7 @@ "etl_update_date": {"delta": "TIMESTAMP", "postgres": "TIMESTAMP", "gold": False}, "period_of_performance_start_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, "period_of_performance_current_end_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, - "initial_report_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, + "initial_report_date": {"delta": "TIMESTAMP", "postgres": "TIMESTAMP", "gold": False}, # Agencies "awarding_agency_code": {"delta": "STRING", "postgres": 
"TEXT", "gold": False}, "awarding_toptier_agency_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, From ce680e77349d944c4d166a9a3da1dfb9101710e1 Mon Sep 17 00:00:00 2001 From: Lorelei Trimberger Date: Tue, 6 Jan 2026 09:05:26 -0600 Subject: [PATCH 07/59] [DEV-14110] update transaction_delta_view in migration --- ...tionsearch_initial_report_date_and_more.py | 133 +----------------- 1 file changed, 5 insertions(+), 128 deletions(-) diff --git a/usaspending_api/search/migrations/0059_alter_transactionsearch_initial_report_date_and_more.py b/usaspending_api/search/migrations/0059_alter_transactionsearch_initial_report_date_and_more.py index 07fe699bd6..77044a1568 100644 --- a/usaspending_api/search/migrations/0059_alter_transactionsearch_initial_report_date_and_more.py +++ b/usaspending_api/search/migrations/0059_alter_transactionsearch_initial_report_date_and_more.py @@ -5,6 +5,10 @@ from usaspending_api.awards.models.transaction_fpds import vw_transaction_fpds_sql from usaspending_api.awards.models.transaction_fabs import vw_transaction_fabs_sql +transaction_delta_view_file = "usaspending_api/database_scripts/etl/transaction_delta_view.sql" +with open(transaction_delta_view_file, "r") as f: + transaction_delta_view = f.read() + class Migration(migrations.Migration): dependencies = [ @@ -23,134 +27,7 @@ class Migration(migrations.Migration): reverse_sql=f"""{vw_transaction_normalized_sql} {vw_transaction_fpds_sql} {vw_transaction_fabs_sql} - CREATE VIEW transaction_delta_view AS SELECT - "transaction_id", - "award_id", - "modification_number", - "detached_award_proc_unique", - "afa_generated_unique", - "generated_unique_award_id", - "piid", - "fain", - "uri", - CASE - WHEN "detached_award_proc_unique" IS NOT NULL THEN 'CONT_TX_' || "detached_award_proc_unique" - WHEN "afa_generated_unique" IS NOT NULL THEN 'ASST_TX_' || "afa_generated_unique" - ELSE NULL - END AS generated_unique_transaction_id, - CASE - WHEN "type" IN ('02', '03', '04', '05', '06', 
'10', '07', '08', '09', '11') AND "fain" IS NOT NULL THEN "fain" - WHEN "piid" IS NOT NULL THEN "piid" -- contracts. Did it this way to easily handle IDV contracts - ELSE "uri" - END AS display_award_id, - "action_date", - "fiscal_action_date", - "last_modified_date", - "fiscal_year", - "award_certified_date", - "award_fiscal_year", - "award_date_signed", - "update_date", - "award_update_date", - "etl_update_date", - "period_of_performance_start_date", - "period_of_performance_current_end_date", - "ordering_period_end_date", - "type_raw", - "type_description_raw", - "type", - "type_description", - "award_category", - "transaction_description", - "award_amount", - "generated_pragmatic_obligation", - "federal_action_obligation", - "original_loan_subsidy_cost", - "face_value_loan_guarantee", - "business_categories", - "naics_code", - "naics_description", - "product_or_service_code", - "product_or_service_description", - "type_of_contract_pricing", - "type_set_aside", - "extent_competed", - "cfda_number", - "cfda_title", - "pop_country_name", - "pop_country_code", - "pop_state_name", - "pop_state_code", - "pop_state_fips", - "pop_state_population", - "pop_county_code", - "pop_county_name", - "pop_county_population", - "pop_zip5", - "place_of_perform_zip_last4", - "pop_congressional_code", - "pop_congressional_population", - "pop_congressional_code_current", - "pop_city_name", - "pop_county_fips", - "recipient_location_country_code", - "recipient_location_country_name", - "recipient_location_state_name", - "recipient_location_state_code", - "recipient_location_state_fips", - "recipient_location_state_population", - "recipient_location_county_code", - "recipient_location_county_name", - "recipient_location_county_population", - "recipient_location_congressional_code", - "recipient_location_congressional_population", - "recipient_location_congressional_code_current", - "recipient_location_zip5", - "recipient_location_city_name", - "recipient_location_county_fips", - 
"action_type", - "legal_entity_address_line1", - "legal_entity_address_line2", - "legal_entity_address_line3", - "legal_entity_foreign_posta", - "legal_entity_foreign_provi", - "legal_entity_zip_last4", - "recipient_hash", - "recipient_name", - "recipient_levels", - "recipient_unique_id", - "parent_recipient_hash", - "parent_recipient_name", - "parent_recipient_unique_id", - "recipient_uei", - "parent_uei", - "awarding_agency_id", - "funding_agency_id", - "awarding_toptier_agency_id", - "funding_toptier_agency_id", - "awarding_agency_code", - "awarding_toptier_agency_name", - "funding_agency_code", - "funding_toptier_agency_name", - "awarding_sub_tier_agency_c", - "awarding_subtier_agency_name", - "funding_sub_tier_agency_co", - "funding_subtier_agency_name", - "awarding_office_code", - "awarding_office_name", - "funding_office_code", - "funding_office_name", - "awarding_toptier_agency_abbreviation", - "funding_toptier_agency_abbreviation", - "awarding_subtier_agency_abbreviation", - "funding_subtier_agency_abbreviation", - "tas_paths", - "tas_components", - CAST("federal_accounts" AS VARCHAR(65535)) AS federal_accounts, - "disaster_emergency_fund_codes", - CAST("program_activities" AS VARCHAR(65535)) AS program_activities - FROM "transaction_search" - WHERE "action_date" >= '2007-10-01'; + {transaction_delta_view} """, ), migrations.AlterField( From 826f1acd703483ec86ae6b9fdabfc0e9cc946acd Mon Sep 17 00:00:00 2001 From: Seth Stoudenmier Date: Tue, 6 Jan 2026 16:11:47 -0500 Subject: [PATCH 08/59] [DEV-14094] update sparkSubmit --- usaspending_api/common/spark/jobs.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/usaspending_api/common/spark/jobs.py b/usaspending_api/common/spark/jobs.py index 14e7b8c6e2..ab14c95459 100644 --- a/usaspending_api/common/spark/jobs.py +++ b/usaspending_api/common/spark/jobs.py @@ -13,6 +13,7 @@ from django.conf import settings from django.core.management import call_command from duckdb.experimental.spark.sql 
import SparkSession as DuckDBSparkSession +from usaspending_api.config import CONFIG from usaspending_api.common.spark.configs import LOCAL_EXTENDED_EXTRA_CONF, OPTIONAL_SPARK_HIVE_JAR, SPARK_SESSION_JARS @@ -165,8 +166,8 @@ def handle_start(self, job_name: str, command_name: str, command_options: list[s mode="BATCH", jobDriver={ "sparkSubmit": { - "entryPoint": command_name, - "entryPointArguments": command_options, + "entryPoint": f"s3://{CONFIG.SPARK_S3_BUCKET}/master/manage.py", + "entryPointArguments": [command_name, *command_options], } }, retryPolicy={"maxAttempts": 2}, From ba1b194a88837f77c24a00f3f28d56350ed71caa Mon Sep 17 00:00:00 2001 From: aguest-kc Date: Wed, 7 Jan 2026 08:31:24 -0600 Subject: [PATCH 09/59] [DEV-14145] flake8 fix --- usaspending_api/download/delta_downloads/account_balances.py | 1 - 1 file changed, 1 deletion(-) diff --git a/usaspending_api/download/delta_downloads/account_balances.py b/usaspending_api/download/delta_downloads/account_balances.py index 3762fd2cde..953f3908ab 100644 --- a/usaspending_api/download/delta_downloads/account_balances.py +++ b/usaspending_api/download/delta_downloads/account_balances.py @@ -50,7 +50,6 @@ def download_table(self) -> DataFrame | DuckDBSparkDataFrame: f"s3a://{CONFIG.SPARK_S3_BUCKET}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/account_balances_download" ) - def _build_dataframes(self) -> list[DataFrame | DuckDBSparkDataFrame]: return [ self.download_table.filter( From bef41a2b9d646265134fbbe1fea033db3711ba5e Mon Sep 17 00:00:00 2001 From: Lorelei Trimberger Date: Wed, 7 Jan 2026 13:17:45 -0600 Subject: [PATCH 10/59] [DEV-14110] split up migrations to avoid race condition --- ...tionsearch_initial_report_date_and_more.py | 29 +++---------- .../0060_alter_initial_report_date_andmore.py | 43 +++++++++++++++++++ 2 files changed, 48 insertions(+), 24 deletions(-) create mode 100644 usaspending_api/search/migrations/0060_alter_initial_report_date_andmore.py diff --git 
a/usaspending_api/search/migrations/0059_alter_transactionsearch_initial_report_date_and_more.py b/usaspending_api/search/migrations/0059_alter_transactionsearch_initial_report_date_and_more.py index 77044a1568..4edca94e74 100644 --- a/usaspending_api/search/migrations/0059_alter_transactionsearch_initial_report_date_and_more.py +++ b/usaspending_api/search/migrations/0059_alter_transactionsearch_initial_report_date_and_more.py @@ -1,16 +1,9 @@ # Generated by Django 4.2.23 on 2026-01-02 16:42 from django.db import migrations, models -from usaspending_api.awards.models.transaction_normalized import vw_transaction_normalized_sql -from usaspending_api.awards.models.transaction_fpds import vw_transaction_fpds_sql -from usaspending_api.awards.models.transaction_fabs import vw_transaction_fabs_sql - -transaction_delta_view_file = "usaspending_api/database_scripts/etl/transaction_delta_view.sql" -with open(transaction_delta_view_file, "r") as f: - transaction_delta_view = f.read() class Migration(migrations.Migration): - + atomic = False dependencies = [ ("search", "0058_add_transaction_count_field"), ] @@ -18,26 +11,14 @@ class Migration(migrations.Migration): operations = [ # Without dropping these tables, it caused the error cannot alter type of a column used by a view or rule migrations.RunSQL( - sql="""DROP VIEW IF EXISTS + sql=""" + DROP VIEW IF EXISTS vw_transaction_fabs, vw_transaction_normalized, vw_transaction_fpds, transaction_delta_view + CASCADE; """, - reverse_sql=f"""{vw_transaction_normalized_sql} - {vw_transaction_fpds_sql} - {vw_transaction_fabs_sql} - {transaction_delta_view} - """, - ), - migrations.AlterField( - model_name="transactionsearch", - name="initial_report_date", - field=models.DateTimeField(null=True), - ), - migrations.AlterField( - model_name="transactionsearch", - name="last_modified_date", - field=models.DateTimeField(null=True), + reverse_sql=migrations.RunSQL.noop, ), ] diff --git 
a/usaspending_api/search/migrations/0060_alter_initial_report_date_andmore.py b/usaspending_api/search/migrations/0060_alter_initial_report_date_andmore.py new file mode 100644 index 0000000000..15c64aadaf --- /dev/null +++ b/usaspending_api/search/migrations/0060_alter_initial_report_date_andmore.py @@ -0,0 +1,43 @@ +# Generated by Django 4.2.23 on 2026-01-02 16:42 + +from django.db import migrations, models +from usaspending_api.awards.models.transaction_normalized import vw_transaction_normalized_sql +from usaspending_api.awards.models.transaction_fpds import vw_transaction_fpds_sql +from usaspending_api.awards.models.transaction_fabs import vw_transaction_fabs_sql + +transaction_delta_view_file = "usaspending_api/database_scripts/etl/transaction_delta_view.sql" +with open(transaction_delta_view_file, "r") as f: + transaction_delta_view = f.read() + +class Migration(migrations.Migration): + atomic = False + dependencies = [ + ("search", "0059_alter_transactionsearch_initial_report_date_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="transactionsearch", + name="initial_report_date", + field=models.DateTimeField(null=True), + ), + migrations.AlterField( + model_name="transactionsearch", + name="last_modified_date", + field=models.DateTimeField(null=True), + ), + + migrations.RunSQL( + sql=f"""{vw_transaction_normalized_sql} + {vw_transaction_fpds_sql} + {vw_transaction_fabs_sql} + {transaction_delta_view} + """, + reverse_sql="""DROP VIEW IF EXISTS + vw_transaction_fabs, + vw_transaction_normalized, + vw_transaction_fpds, + transaction_delta_view + """, + ) + ] From d46fd085e1c86f6a6a49d1290075cb074dd37f4a Mon Sep 17 00:00:00 2001 From: Lorelei Trimberger Date: Wed, 7 Jan 2026 14:55:16 -0600 Subject: [PATCH 11/59] [DEV-14110] change DateType to TimestampType last_modified_date --- usaspending_api/transactions/delta_models/transaction_fpds.py | 2 +- .../transactions/delta_models/transaction_normalized.py | 2 +- 2 files changed, 2 
insertions(+), 2 deletions(-) diff --git a/usaspending_api/transactions/delta_models/transaction_fpds.py b/usaspending_api/transactions/delta_models/transaction_fpds.py index e8f016f29f..e624fc44fa 100644 --- a/usaspending_api/transactions/delta_models/transaction_fpds.py +++ b/usaspending_api/transactions/delta_models/transaction_fpds.py @@ -395,7 +395,7 @@ TransactionColumn("funding_amount", "NULL", "NUMERIC(23, 2)", "literal"), TransactionColumn("indirect_federal_sharing", "NULL", "NUMERIC(23, 2)", "literal"), TransactionColumn("is_fpds", "TRUE", "BOOLEAN", "literal"), - TransactionColumn("last_modified_date", "last_modified", "DATE", "cast"), + TransactionColumn("last_modified_date", "last_modified", "TIMESTAMP", "cast"), TransactionColumn("modification_number", "award_modification_amendme", "STRING"), TransactionColumn("non_federal_funding_amount", "NULL", "NUMERIC(23, 2)", "literal"), TransactionColumn("original_loan_subsidy_cost", "NULL", "NUMERIC(23, 2)", "literal"), diff --git a/usaspending_api/transactions/delta_models/transaction_normalized.py b/usaspending_api/transactions/delta_models/transaction_normalized.py index f9261f54f9..eb75def0aa 100644 --- a/usaspending_api/transactions/delta_models/transaction_normalized.py +++ b/usaspending_api/transactions/delta_models/transaction_normalized.py @@ -16,7 +16,7 @@ "id": "LONG NOT NULL", "indirect_federal_sharing": "NUMERIC(23, 2)", "is_fpds": "BOOLEAN NOT NULL", - "last_modified_date": "DATE", + "last_modified_date": "TIMESTAMP", "modification_number": "STRING", "non_federal_funding_amount": "NUMERIC(23, 2)", "original_loan_subsidy_cost": "NUMERIC(23, 2)", From 89909e488e89939d0da9f3b9bc389340fa4c0045 Mon Sep 17 00:00:00 2001 From: Lorelei Trimberger Date: Fri, 9 Jan 2026 11:22:17 -0600 Subject: [PATCH 12/59] [DEV-14110] update types --- usaspending_api/search/delta_models/award_search.py | 2 +- .../delta_models/dataframes/transaction_search.py | 10 ++++++---- usaspending_api/search/models/award_search.py | 
2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/usaspending_api/search/delta_models/award_search.py b/usaspending_api/search/delta_models/award_search.py index 756bfe839e..cbc505a19c 100644 --- a/usaspending_api/search/delta_models/award_search.py +++ b/usaspending_api/search/delta_models/award_search.py @@ -50,7 +50,7 @@ "subaward_count": {"delta": "INTEGER", "postgres": "INTEGER", "gold": True}, "action_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, "fiscal_year": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "last_modified_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, + "last_modified_date": {"delta": "TIMESTAMP", "postgres": "TIMESTAMP", "gold": False}, "period_of_performance_start_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, "period_of_performance_current_end_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, "date_signed": {"delta": "DATE", "postgres": "DATE", "gold": False}, diff --git a/usaspending_api/search/delta_models/dataframes/transaction_search.py b/usaspending_api/search/delta_models/dataframes/transaction_search.py index bc222f3df7..692c888718 100644 --- a/usaspending_api/search/delta_models/dataframes/transaction_search.py +++ b/usaspending_api/search/delta_models/dataframes/transaction_search.py @@ -92,10 +92,12 @@ def date_cols(self) -> list[Column]: sf.to_date(self.transaction_normalized.period_of_performance_current_end_date).alias( "period_of_performance_current_end_date" ), - sf.coalesce( - sf.to_date(self.transaction_fabs.created_at), - self.transaction_fpds.initial_report_date, - ).alias("initial_report_date"), + sf.to_timestamp( + sf.coalesce( + sf.to_date(self.transaction_fabs.created_at), + self.transaction_fpds.initial_report_date, + ).alias("initial_report_date") + ), ] @property diff --git a/usaspending_api/search/models/award_search.py b/usaspending_api/search/models/award_search.py index c915ee234c..8e863df081 100644 --- 
a/usaspending_api/search/models/award_search.py +++ b/usaspending_api/search/models/award_search.py @@ -45,7 +45,7 @@ class AwardSearch(models.Model): action_date = models.DateField(null=True) fiscal_year = models.IntegerField(null=True) - last_modified_date = models.DateField(blank=True, null=True) + last_modified_date = models.DateTimeField(blank=True, null=True) period_of_performance_start_date = models.DateField(null=True, db_index=True) period_of_performance_current_end_date = models.DateField(null=True, db_index=True) From 46ea86b4840fa589d5002f7e5fd6744a96040b51 Mon Sep 17 00:00:00 2001 From: Lorelei Trimberger Date: Fri, 9 Jan 2026 14:34:26 -0600 Subject: [PATCH 13/59] [DEV-14110] fix transaction_search df and update tests --- .../tests/integration/test_awards_v2.py | 26 +++++++++++-------- .../integration/test_load_to_from_delta.py | 2 +- .../dataframes/transaction_search.py | 4 +-- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/usaspending_api/awards/tests/integration/test_awards_v2.py b/usaspending_api/awards/tests/integration/test_awards_v2.py index a695be16c8..44228292bf 100644 --- a/usaspending_api/awards/tests/integration/test_awards_v2.py +++ b/usaspending_api/awards/tests/integration/test_awards_v2.py @@ -130,7 +130,7 @@ def awards_and_transactions(db): "recipient_location_state_name": "North Carolina", "legal_entity_zip_last4": "5312", "recipient_location_zip5": "12204", - "last_modified_date": "2000-01-02", + "last_modified_date": "2000-01-02 00:00:00+00", "officer_1_amount": 50000.00, "officer_1_name": "John Apple", "officer_2_amount": 4623.00, @@ -183,7 +183,7 @@ def awards_and_transactions(db): "recipient_location_state_name": "North Carolina", "legal_entity_zip_last4": "5312", "recipient_location_zip5": "12204", - "last_modified_date": "2000-01-02", + "last_modified_date": "2000-01-02 00:00:00+00", "non_federal_funding_amount": 0, "officer_1_amount": 50000.00, "officer_1_name": "John Apple", @@ -236,7 +236,7 @@ def 
awards_and_transactions(db): "recipient_location_state_name": "North Carolina", "legal_entity_zip_last4": "5312", "recipient_location_zip5": "12204", - "last_modified_date": "2000-01-02", + "last_modified_date": "2000-01-02 00:00:00+00", "non_federal_funding_amount": 0, "officer_1_amount": 50000.00, "officer_1_name": "John Apple", @@ -289,7 +289,7 @@ def awards_and_transactions(db): "recipient_location_state_name": "North Carolina", "legal_entity_zip_last4": "5312", "recipient_location_zip5": "12204", - "last_modified_date": "2000-01-02", + "last_modified_date": "2000-01-02 00:00:00+00", "non_federal_funding_amount": 0, "officer_1_amount": 50000.00, "officer_1_name": "John Apple", @@ -344,7 +344,7 @@ def awards_and_transactions(db): "recipient_location_state_name": None, "legal_entity_zip_last4": "5312", "recipient_location_zip5": "12204", - "last_modified_date": "2000-01-02", + "last_modified_date": "2000-01-02 00:00:00+00", "non_federal_funding_amount": 0, "officer_1_amount": 50000.00, "officer_1_name": "John Apple", @@ -405,7 +405,7 @@ def awards_and_transactions(db): "information_technolog_desc": "NOT IT PRODUCTS OR SERVICES", "interagency_contract_desc": "NOT APPLICABLE", "labor_standards_descrip": "NO", - "last_modified_date": "2001-02-03", + "last_modified_date": "2001-02-03 00:00:00+00", "legal_entity_address_line1": "123 main st", "legal_entity_address_line2": None, "legal_entity_address_line3": None, @@ -495,7 +495,7 @@ def awards_and_transactions(db): "information_technolog_desc": "NOT IT PRODUCTS OR SERVICES", "interagency_contract_desc": "NOT APPLICABLE", "labor_standards_descrip": "NO", - "last_modified_date": "2001-02-03", + "last_modified_date": "2001-02-03 00:00:00+00", "legal_entity_address_line1": "123 main st", "legal_entity_address_line2": None, "legal_entity_address_line3": None, @@ -585,7 +585,7 @@ def awards_and_transactions(db): "information_technolog_desc": "NOT IT PRODUCTS OR SERVICES", "interagency_contract_desc": "NOT APPLICABLE", 
"labor_standards_descrip": "NO", - "last_modified_date": "2001-02-03", + "last_modified_date": "2001-02-03 00:00:00+00", "legal_entity_address_line1": "123 main st", "legal_entity_address_line2": None, "legal_entity_address_line3": None, @@ -1489,7 +1489,11 @@ def test_outlay_calculations(client, awards_and_transactions): {"name": None, "amount": None}, ] }, - "period_of_performance": {"start_date": "2004-02-04", "end_date": "2005-02-04", "last_modified_date": "2000-01-02"}, + "period_of_performance": { + "start_date": "2004-02-04", + "end_date": "2005-02-04", + "last_modified_date": "2000-01-02 00:00:00+00", + }, "place_of_performance": { "address_line1": None, "address_line2": None, @@ -1586,8 +1590,8 @@ def test_outlay_calculations(client, awards_and_transactions): "period_of_performance": { "start_date": "2004-02-04", "end_date": "2005-02-04", - "last_modified_date": "2001-02-03", - "potential_end_date": "2003-04-05", + "last_modified_date": "2001-02-03 00:00:00+00", + "potential_end_date": "2003-04-05 00:00:00+00", }, "place_of_performance": { "address_line1": None, diff --git a/usaspending_api/etl/tests/integration/test_load_to_from_delta.py b/usaspending_api/etl/tests/integration/test_load_to_from_delta.py index ac383cc9e4..0c2779443c 100644 --- a/usaspending_api/etl/tests/integration/test_load_to_from_delta.py +++ b/usaspending_api/etl/tests/integration/test_load_to_from_delta.py @@ -364,7 +364,7 @@ def test_load_table_to_from_delta_for_recipient_lookup( award_id=new_award.award_id, is_fpds=False, type="07", - last_modified_date="2021-01-01", + last_modified_date=datetime.strptime("2021-01-01", "%Y-%m-%d"), cfda_number="12.456", recipient_uei="FABSUEI12345", recipient_unique_id="FABSDUNS12345", diff --git a/usaspending_api/search/delta_models/dataframes/transaction_search.py b/usaspending_api/search/delta_models/dataframes/transaction_search.py index 692c888718..d80c7a5943 100644 --- a/usaspending_api/search/delta_models/dataframes/transaction_search.py +++ 
b/usaspending_api/search/delta_models/dataframes/transaction_search.py @@ -96,8 +96,8 @@ def date_cols(self) -> list[Column]: sf.coalesce( sf.to_date(self.transaction_fabs.created_at), self.transaction_fpds.initial_report_date, - ).alias("initial_report_date") - ), + ) + ).alias("initial_report_date"), ] @property From d1ecdc601b5f6d5e0bff9c1f4cad50b1a2f69e4e Mon Sep 17 00:00:00 2001 From: Seth Stoudenmier Date: Tue, 13 Jan 2026 15:24:16 -0500 Subject: [PATCH 14/59] [DEV-14094] comment out retry policy --- usaspending_api/common/spark/jobs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/usaspending_api/common/spark/jobs.py b/usaspending_api/common/spark/jobs.py index ab14c95459..5e3f95e23e 100644 --- a/usaspending_api/common/spark/jobs.py +++ b/usaspending_api/common/spark/jobs.py @@ -170,7 +170,8 @@ def handle_start(self, job_name: str, command_name: str, command_options: list[s "entryPointArguments": [command_name, *command_options], } }, - retryPolicy={"maxAttempts": 2}, + # TODO: Requires updating to EMR 7 + # retryPolicy={"maxAttempts": 2}, ) return response From 37c7b0beb00ad402a22b42c47156c6bf2dceb5fb Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Fri, 23 Jan 2026 14:34:26 -0600 Subject: [PATCH 15/59] [DEV-14110] - updating fixtures --- .../tests/integration/test_awards_v2.py | 24 +++++++++++-------- .../test_populate_monthly_delta_files.py | 2 +- .../integration/test_load_to_from_delta.py | 2 +- .../etl/tests/integration/test_spark_app.py | 6 ++--- .../tests/integration/test_awards_idv_v2.py | 6 ++--- 5 files changed, 22 insertions(+), 18 deletions(-) diff --git a/usaspending_api/awards/tests/integration/test_awards_v2.py b/usaspending_api/awards/tests/integration/test_awards_v2.py index a695be16c8..ea00667700 100644 --- a/usaspending_api/awards/tests/integration/test_awards_v2.py +++ b/usaspending_api/awards/tests/integration/test_awards_v2.py @@ -130,7 +130,7 @@ def awards_and_transactions(db): "recipient_location_state_name": 
"North Carolina", "legal_entity_zip_last4": "5312", "recipient_location_zip5": "12204", - "last_modified_date": "2000-01-02", + "last_modified_date": "2000-01-02 00:00:00+00", "officer_1_amount": 50000.00, "officer_1_name": "John Apple", "officer_2_amount": 4623.00, @@ -183,7 +183,7 @@ def awards_and_transactions(db): "recipient_location_state_name": "North Carolina", "legal_entity_zip_last4": "5312", "recipient_location_zip5": "12204", - "last_modified_date": "2000-01-02", + "last_modified_date": "2000-01-02 00:00:00+00", "non_federal_funding_amount": 0, "officer_1_amount": 50000.00, "officer_1_name": "John Apple", @@ -236,7 +236,7 @@ def awards_and_transactions(db): "recipient_location_state_name": "North Carolina", "legal_entity_zip_last4": "5312", "recipient_location_zip5": "12204", - "last_modified_date": "2000-01-02", + "last_modified_date": "2000-01-02 00:00:00+00", "non_federal_funding_amount": 0, "officer_1_amount": 50000.00, "officer_1_name": "John Apple", @@ -289,7 +289,7 @@ def awards_and_transactions(db): "recipient_location_state_name": "North Carolina", "legal_entity_zip_last4": "5312", "recipient_location_zip5": "12204", - "last_modified_date": "2000-01-02", + "last_modified_date": "2000-01-02 00:00:00+00", "non_federal_funding_amount": 0, "officer_1_amount": 50000.00, "officer_1_name": "John Apple", @@ -344,7 +344,7 @@ def awards_and_transactions(db): "recipient_location_state_name": None, "legal_entity_zip_last4": "5312", "recipient_location_zip5": "12204", - "last_modified_date": "2000-01-02", + "last_modified_date": "2000-01-02 00:00:00+00", "non_federal_funding_amount": 0, "officer_1_amount": 50000.00, "officer_1_name": "John Apple", @@ -405,7 +405,7 @@ def awards_and_transactions(db): "information_technolog_desc": "NOT IT PRODUCTS OR SERVICES", "interagency_contract_desc": "NOT APPLICABLE", "labor_standards_descrip": "NO", - "last_modified_date": "2001-02-03", + "last_modified_date": "2001-02-03 00:00:00+00", "legal_entity_address_line1": "123 
main st", "legal_entity_address_line2": None, "legal_entity_address_line3": None, @@ -495,7 +495,7 @@ def awards_and_transactions(db): "information_technolog_desc": "NOT IT PRODUCTS OR SERVICES", "interagency_contract_desc": "NOT APPLICABLE", "labor_standards_descrip": "NO", - "last_modified_date": "2001-02-03", + "last_modified_date": "2001-02-03 00:00:00+00", "legal_entity_address_line1": "123 main st", "legal_entity_address_line2": None, "legal_entity_address_line3": None, @@ -585,7 +585,7 @@ def awards_and_transactions(db): "information_technolog_desc": "NOT IT PRODUCTS OR SERVICES", "interagency_contract_desc": "NOT APPLICABLE", "labor_standards_descrip": "NO", - "last_modified_date": "2001-02-03", + "last_modified_date": "2001-02-03 00:00:00+00", "legal_entity_address_line1": "123 main st", "legal_entity_address_line2": None, "legal_entity_address_line3": None, @@ -1489,7 +1489,11 @@ def test_outlay_calculations(client, awards_and_transactions): {"name": None, "amount": None}, ] }, - "period_of_performance": {"start_date": "2004-02-04", "end_date": "2005-02-04", "last_modified_date": "2000-01-02"}, + "period_of_performance": { + "start_date": "2004-02-04", + "end_date": "2005-02-04", + "last_modified_date": "2000-01-02 00:00:00+00", + }, "place_of_performance": { "address_line1": None, "address_line2": None, @@ -1586,7 +1590,7 @@ def test_outlay_calculations(client, awards_and_transactions): "period_of_performance": { "start_date": "2004-02-04", "end_date": "2005-02-04", - "last_modified_date": "2001-02-03", + "last_modified_date": "2001-02-03 00:00:00+00", "potential_end_date": "2003-04-05", }, "place_of_performance": { diff --git a/usaspending_api/download/tests/integration/test_populate_monthly_delta_files.py b/usaspending_api/download/tests/integration/test_populate_monthly_delta_files.py index c23c057c59..44a38586c4 100644 --- a/usaspending_api/download/tests/integration/test_populate_monthly_delta_files.py +++ 
b/usaspending_api/download/tests/integration/test_populate_monthly_delta_files.py @@ -399,7 +399,7 @@ def test_specific_agency(monthly_download_delta_data, monkeypatch): "", f"{HOST}/award/CONT_AWD_1_0_0/" if "localhost" in HOST else f"https://{HOST}/award/CONT_AWD_1_0_0/", "", - "2020-05-07", + "2020-05-07 00:00:00+00", ] call_command("populate_monthly_delta_files", "--agencies=1", "--debugging_skip_deleted", "--last_date=2020-12-31") file_list = listdir("csv_downloads") diff --git a/usaspending_api/etl/tests/integration/test_load_to_from_delta.py b/usaspending_api/etl/tests/integration/test_load_to_from_delta.py index ac383cc9e4..9c362ac5d9 100644 --- a/usaspending_api/etl/tests/integration/test_load_to_from_delta.py +++ b/usaspending_api/etl/tests/integration/test_load_to_from_delta.py @@ -364,7 +364,7 @@ def test_load_table_to_from_delta_for_recipient_lookup( award_id=new_award.award_id, is_fpds=False, type="07", - last_modified_date="2021-01-01", + last_modified_date="2021-01-01 00:00:00+00", cfda_number="12.456", recipient_uei="FABSUEI12345", recipient_unique_id="FABSDUNS12345", diff --git a/usaspending_api/etl/tests/integration/test_spark_app.py b/usaspending_api/etl/tests/integration/test_spark_app.py index 09387d99f1..2b5580515c 100644 --- a/usaspending_api/etl/tests/integration/test_spark_app.py +++ b/usaspending_api/etl/tests/integration/test_spark_app.py @@ -8,7 +8,7 @@ import random import sys import uuid -from datetime import date +from datetime import datetime from unittest.mock import MagicMock, call import boto3 @@ -138,7 +138,7 @@ def _transaction_and_award_test_data(db): award=awd1, modification_number="1", awarding_agency_id=agency1.id, - last_modified_date=date(2012, 3, 1), + last_modified_date=datetime(2012, 3, 1), business_funds_indicator="a", record_type=1, total_funding_amount=1000.00, @@ -153,7 +153,7 @@ def _transaction_and_award_test_data(db): award=awd2, modification_number="1", awarding_agency_id=agency1.id, - 
last_modified_date=date(2012, 4, 1), + last_modified_date=datetime(2012, 4, 1), is_fpds=True, piid="abc", base_and_all_options_value=1000, diff --git a/usaspending_api/idvs/tests/integration/test_awards_idv_v2.py b/usaspending_api/idvs/tests/integration/test_awards_idv_v2.py index d97ca5b2d6..4777f194d2 100644 --- a/usaspending_api/idvs/tests/integration/test_awards_idv_v2.py +++ b/usaspending_api/idvs/tests/integration/test_awards_idv_v2.py @@ -150,7 +150,7 @@ def awards_and_transactions(db): "is_fpds": True, "labor_standards": None, "labor_standards_descrip": "NO", - "last_modified_date": "2018-08-24", + "last_modified_date": "2018-08-24 00:00:00+00", "legal_entity_address_line1": "123 main st", "legal_entity_address_line2": None, "legal_entity_address_line3": None, @@ -270,7 +270,7 @@ def awards_and_transactions(db): "is_fpds": True, "labor_standards": None, "labor_standards_descrip": "NO", - "last_modified_date": "2018-08-24", + "last_modified_date": "2018-08-24 00:00:00+00", "legal_entity_address_line1": "123 main st", "legal_entity_address_line2": None, "legal_entity_address_line3": None, @@ -381,7 +381,7 @@ def test_award_endpoint_for_null_recipient_information(client, awards_and_transa "period_of_performance": { "start_date": "2004-02-04", "end_date": "2025-06-30", - "last_modified_date": "2018-08-24", + "last_modified_date": "2018-08-24 00:00:00+00", "potential_end_date": "2003-04-05", }, "awarding_agency": { From 34acab5a109cefeaef41c34e174da09801198b1b Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Fri, 23 Jan 2026 16:17:52 -0600 Subject: [PATCH 16/59] [DEV-14110] - updating fixtures and orm logic to return datetime --- .../awards/tests/integration/test_awards_v2.py | 18 +++++++++--------- usaspending_api/awards/v2/data_layer/orm.py | 3 +-- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/usaspending_api/awards/tests/integration/test_awards_v2.py b/usaspending_api/awards/tests/integration/test_awards_v2.py index ea00667700..1e476cd1b5 
100644 --- a/usaspending_api/awards/tests/integration/test_awards_v2.py +++ b/usaspending_api/awards/tests/integration/test_awards_v2.py @@ -130,7 +130,7 @@ def awards_and_transactions(db): "recipient_location_state_name": "North Carolina", "legal_entity_zip_last4": "5312", "recipient_location_zip5": "12204", - "last_modified_date": "2000-01-02 00:00:00+00", + "last_modified_date": "2000-01-02 00:00:00+0000", "officer_1_amount": 50000.00, "officer_1_name": "John Apple", "officer_2_amount": 4623.00, @@ -183,7 +183,7 @@ def awards_and_transactions(db): "recipient_location_state_name": "North Carolina", "legal_entity_zip_last4": "5312", "recipient_location_zip5": "12204", - "last_modified_date": "2000-01-02 00:00:00+00", + "last_modified_date": "2000-01-02 00:00:00+0000", "non_federal_funding_amount": 0, "officer_1_amount": 50000.00, "officer_1_name": "John Apple", @@ -236,7 +236,7 @@ def awards_and_transactions(db): "recipient_location_state_name": "North Carolina", "legal_entity_zip_last4": "5312", "recipient_location_zip5": "12204", - "last_modified_date": "2000-01-02 00:00:00+00", + "last_modified_date": "2000-01-02 00:00:00+0000", "non_federal_funding_amount": 0, "officer_1_amount": 50000.00, "officer_1_name": "John Apple", @@ -289,7 +289,7 @@ def awards_and_transactions(db): "recipient_location_state_name": "North Carolina", "legal_entity_zip_last4": "5312", "recipient_location_zip5": "12204", - "last_modified_date": "2000-01-02 00:00:00+00", + "last_modified_date": "2000-01-02 00:00:00+0000", "non_federal_funding_amount": 0, "officer_1_amount": 50000.00, "officer_1_name": "John Apple", @@ -344,7 +344,7 @@ def awards_and_transactions(db): "recipient_location_state_name": None, "legal_entity_zip_last4": "5312", "recipient_location_zip5": "12204", - "last_modified_date": "2000-01-02 00:00:00+00", + "last_modified_date": "2000-01-02 00:00:00+0000", "non_federal_funding_amount": 0, "officer_1_amount": 50000.00, "officer_1_name": "John Apple", @@ -405,7 +405,7 @@ def 
awards_and_transactions(db): "information_technolog_desc": "NOT IT PRODUCTS OR SERVICES", "interagency_contract_desc": "NOT APPLICABLE", "labor_standards_descrip": "NO", - "last_modified_date": "2001-02-03 00:00:00+00", + "last_modified_date": "2001-02-03 00:00:00+0000", "legal_entity_address_line1": "123 main st", "legal_entity_address_line2": None, "legal_entity_address_line3": None, @@ -495,7 +495,7 @@ def awards_and_transactions(db): "information_technolog_desc": "NOT IT PRODUCTS OR SERVICES", "interagency_contract_desc": "NOT APPLICABLE", "labor_standards_descrip": "NO", - "last_modified_date": "2001-02-03 00:00:00+00", + "last_modified_date": "2001-02-03 00:00:00+0000", "legal_entity_address_line1": "123 main st", "legal_entity_address_line2": None, "legal_entity_address_line3": None, @@ -585,7 +585,7 @@ def awards_and_transactions(db): "information_technolog_desc": "NOT IT PRODUCTS OR SERVICES", "interagency_contract_desc": "NOT APPLICABLE", "labor_standards_descrip": "NO", - "last_modified_date": "2001-02-03 00:00:00+00", + "last_modified_date": "2001-02-03 00:00:00+0000", "legal_entity_address_line1": "123 main st", "legal_entity_address_line2": None, "legal_entity_address_line3": None, @@ -1492,7 +1492,7 @@ def test_outlay_calculations(client, awards_and_transactions): "period_of_performance": { "start_date": "2004-02-04", "end_date": "2005-02-04", - "last_modified_date": "2000-01-02 00:00:00+00", + "last_modified_date": "2000-01-02 00:00:00+0000", }, "place_of_performance": { "address_line1": None, diff --git a/usaspending_api/awards/v2/data_layer/orm.py b/usaspending_api/awards/v2/data_layer/orm.py index 289ffc5662..4dc2cb8785 100644 --- a/usaspending_api/awards/v2/data_layer/orm.py +++ b/usaspending_api/awards/v2/data_layer/orm.py @@ -25,7 +25,6 @@ from usaspending_api.awards.v2.data_layer.orm_utils import delete_keys_from_dict, split_mapper_into_qs from usaspending_api.common.helpers.business_categories_helper import 
get_business_category_display_names from usaspending_api.common.helpers.data_constants import state_code_from_name, state_name_from_code -from usaspending_api.common.helpers.date_helper import get_date_from_datetime from usaspending_api.common.helpers.sql_helpers import execute_sql_to_ordered_dictionary from usaspending_api.common.recipient_lookups import obtain_recipient_uri from usaspending_api.references.models import ( @@ -70,7 +69,7 @@ def construct_assistance_response(requested_award_dict: dict) -> OrderedDict: [ ("start_date", award["_start_date"]), ("end_date", award["_end_date"]), - ("last_modified_date", get_date_from_datetime(transaction["_modified_at"])), + ("last_modified_date", transaction["_modified_at"].strftime("%Y-%m-%d %H:%M:%S%z")), ] ) response["recipient"] = create_recipient_object(transaction) From 31bf105bd342de5b28bd93b60933764e3487f8f8 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Mon, 26 Jan 2026 11:44:17 -0600 Subject: [PATCH 17/59] [DEV-14110] - updating fixtures in conftest --- usaspending_api/tests/conftest_spark.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/usaspending_api/tests/conftest_spark.py b/usaspending_api/tests/conftest_spark.py index 587c877b63..6308f54ba4 100644 --- a/usaspending_api/tests/conftest_spark.py +++ b/usaspending_api/tests/conftest_spark.py @@ -747,7 +747,7 @@ def _build_usas_data_for_spark(): total_obligation=0.00, total_subsidy_cost=0.00, total_obl_bin="<1M", - last_modified_date="2020-01-01", + last_modified_date="2020-01-01 00:00:00", update_date="2020-01-01", awarding_agency_id=32, funding_agency_id=32, @@ -840,7 +840,7 @@ def _build_usas_data_for_spark(): funding_subtier_agency_name_raw="TEST SUBTIER 1", awarding_toptier_agency_id=awarding_agency.id, funding_toptier_agency_id=funding_agency.id, - last_modified_date="2020-01-01", + last_modified_date="2020-01-01 00:00:00", federal_action_obligation=0, cfda_number="12.456", cfda_id=cfda.id, @@ -948,7 +948,7 @@ def 
_build_usas_data_for_spark(): funding_subtier_agency_abbreviation=funding_subtier_agency.abbreviation, awarding_toptier_agency_id=awarding_agency.id, funding_toptier_agency_id=funding_agency.id, - last_modified_date="2020-01-01", + last_modified_date="2020-01-01 00:00:00", federal_action_obligation=0, published_fabs_id=2, cfda_number="12.456", @@ -1057,7 +1057,7 @@ def _build_usas_data_for_spark(): funding_subtier_agency_name_raw="TEST SUBTIER 1", awarding_toptier_agency_id=awarding_agency.id, funding_toptier_agency_id=funding_agency.id, - last_modified_date="2020-01-01", + last_modified_date="2020-01-01 00:00:00", federal_action_obligation=0, cfda_number="12.456", cfda_id=cfda.id, @@ -1151,7 +1151,7 @@ def _build_usas_data_for_spark(): funding_toptier_agency_abbreviation=funding_toptier_agency.abbreviation, awarding_subtier_agency_abbreviation=awarding_subtier_agency.abbreviation, funding_subtier_agency_abbreviation=funding_subtier_agency.abbreviation, - last_modified_date="2020-01-01", + last_modified_date="2020-01-01 00:00:00", federal_action_obligation=0, naics_code="123456", product_or_service_code="12", @@ -1249,7 +1249,7 @@ def _build_usas_data_for_spark(): funding_toptier_agency_abbreviation=funding_toptier_agency.abbreviation, awarding_subtier_agency_abbreviation=awarding_subtier_agency.abbreviation, funding_subtier_agency_abbreviation=funding_subtier_agency.abbreviation, - last_modified_date="2020-01-01", + last_modified_date="2020-01-01 00:00:00", federal_action_obligation=0, naics_code="123456", product_or_service_code="12", @@ -1339,7 +1339,7 @@ def _build_usas_data_for_spark(): funding_subtier_agency_abbreviation=subtier.abbreviation, awarding_toptier_agency_id=agency.id, funding_toptier_agency_id=agency.id, - last_modified_date="2020-01-01", + last_modified_date="2020-01-01 00:00:00", award_update_date=cont_award2.update_date, generated_pragmatic_obligation=0.00, original_loan_subsidy_cost=0.00, From f901da3d2f82de9f749bc5a968688d9b0f80a4d2 Mon Sep 
17 00:00:00 2001 From: Seth Stoudenmier Date: Mon, 26 Jan 2026 14:05:36 -0500 Subject: [PATCH 18/59] [DEV-14094] Update to older syntax and fix TODOs --- .../delta_downloads/account_balances.py | 7 +--- .../delta_downloads/award_financial.py | 9 ++--- .../object_class_program_activity.py | 9 +---- .../test_account_download_factories.py | 35 +++---------------- .../integration/test_download_accounts.py | 21 +---------- .../integration/test_load_to_from_delta.py | 22 ------------ 6 files changed, 9 insertions(+), 94 deletions(-) diff --git a/usaspending_api/download/delta_downloads/account_balances.py b/usaspending_api/download/delta_downloads/account_balances.py index b8b0ceb61d..d2c1ecd531 100644 --- a/usaspending_api/download/delta_downloads/account_balances.py +++ b/usaspending_api/download/delta_downloads/account_balances.py @@ -1,5 +1,4 @@ from pyspark.sql import functions as sf, Column, DataFrame, SparkSession -from usaspending_api.config import CONFIG from usaspending_api.common.spark.utils import collect_concat from usaspending_api.download.delta_downloads.abstract_downloads.account_download import ( @@ -29,11 +28,7 @@ class AccountBalancesMixin: @property def download_table(self) -> DataFrame: - # TODO: This should be reverted back after Spark downloads are migrated to EMR - # return self.spark.table("rpt.account_balances_download") - return self.spark.read.format("delta").load( - f"s3a://{CONFIG.SPARK_S3_BUCKET}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/account_balances_download" - ) + return self.spark.table("rpt.account_balances_download") def _build_dataframes(self) -> list[DataFrame]: return [ diff --git a/usaspending_api/download/delta_downloads/award_financial.py b/usaspending_api/download/delta_downloads/award_financial.py index b0990b4b3b..1a59ab932d 100644 --- a/usaspending_api/download/delta_downloads/award_financial.py +++ b/usaspending_api/download/delta_downloads/award_financial.py @@ -2,7 +2,6 @@ from pyspark.sql import functions as sf, Column, 
DataFrame, SparkSession -from usaspending_api.config import CONFIG from usaspending_api.common.spark.utils import collect_concat, filter_submission_and_sum from usaspending_api.download.delta_downloads.abstract_downloads.account_download import ( @@ -31,11 +30,7 @@ class AwardFinancialMixin: @property def download_table(self) -> DataFrame: - # TODO: This should be reverted back after Spark downloads are migrated to EMR - # return self.spark.table("rpt.award_financial_download") - return self.spark.read.format("delta").load( - f"s3a://{CONFIG.SPARK_S3_BUCKET}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/award_financial_download" - ) + return self.spark.table("rpt.award_financial_download") @property def non_zero_filters(self) -> Column: @@ -49,7 +44,7 @@ def non_zero_filters(self) -> Column: @property def award_categories(self) -> dict[str, Column]: return { - "Assistance": (sf.isnotnull(sf.col("is_fpds")) & ~sf.col("is_fpds")), + "Assistance": (~sf.isnull(sf.col("is_fpds")) & ~sf.col("is_fpds")), "Contracts": sf.col("is_fpds"), "Unlinked": sf.isnull(sf.col("is_fpds")), } diff --git a/usaspending_api/download/delta_downloads/object_class_program_activity.py b/usaspending_api/download/delta_downloads/object_class_program_activity.py index 3cec5a6272..c5a6ced9cf 100644 --- a/usaspending_api/download/delta_downloads/object_class_program_activity.py +++ b/usaspending_api/download/delta_downloads/object_class_program_activity.py @@ -4,7 +4,6 @@ from pyspark.sql import Column, DataFrame, SparkSession from usaspending_api.common.spark.utils import collect_concat, filter_submission_and_sum -from usaspending_api.config import CONFIG from usaspending_api.download.delta_downloads.abstract_downloads.account_download import ( AbstractAccountDownload, AccountLevel, @@ -42,13 +41,7 @@ def __init__(self, *args, **kwargs): @property def download_table(self) -> DataFrame | DuckDBSparkDataFrame: - if isinstance(self.spark, DuckDBSparkSession): - return 
self.spark.table("rpt.object_class_program_activity_download") - else: - # TODO: This should be reverted back after Spark downloads are migrated to EMR - return self.spark.read.format("delta").load( - f"s3a://{CONFIG.SPARK_S3_BUCKET}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/object_class_program_activity_download" - ) + return self.spark.table("rpt.object_class_program_activity_download") def _build_dataframes(self) -> list[DataFrame | DuckDBSparkDataFrame]: return [ diff --git a/usaspending_api/download/tests/integration/test_account_download_factories.py b/usaspending_api/download/tests/integration/test_account_download_factories.py index b554a3b4ca..091a2b445a 100644 --- a/usaspending_api/download/tests/integration/test_account_download_factories.py +++ b/usaspending_api/download/tests/integration/test_account_download_factories.py @@ -6,7 +6,6 @@ import pytest from django.core.management import call_command from model_bakery import baker -from usaspending_api.config import CONFIG from usaspending_api.common.etl.spark import create_ref_temp_views from usaspending_api.download.delta_downloads.account_balances import AccountBalancesDownloadFactory @@ -23,18 +22,12 @@ @pytest.fixture(scope="function") -def award_financial_table(spark, s3_unittest_data_bucket, hive_unittest_metastore_db, monkeypatch): +def award_financial_table(spark, s3_unittest_data_bucket, hive_unittest_metastore_db): call_command( "create_delta_table", "--destination-table=award_financial_download", f"--spark-s3-bucket={s3_unittest_data_bucket}", ) - monkeypatch.setattr( - f"usaspending_api.download.delta_downloads.award_financial.AwardFinancialMixin.download_table", - spark.read.format("delta").load( - f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/award_financial_download" - ), - ) column_placeholders = {field.name: [None] * 5 for field in award_financial_schema} test_data_df = pd.DataFrame( data={ @@ -75,18 +68,12 @@ def award_financial_table(spark, s3_unittest_data_bucket, 
hive_unittest_metastor @pytest.fixture(scope="function") -def award_financial_table_award_category(spark, s3_unittest_data_bucket, hive_unittest_metastore_db, monkeypatch): +def award_financial_table_award_category(spark, s3_unittest_data_bucket, hive_unittest_metastore_db): call_command( "create_delta_table", "--destination-table=award_financial_download", f"--spark-s3-bucket={s3_unittest_data_bucket}", ) - monkeypatch.setattr( - f"usaspending_api.download.delta_downloads.award_financial.AwardFinancialMixin.download_table", - spark.read.format("delta").load( - f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/award_financial_download" - ), - ) column_placeholders = {field.name: [None] * 5 for field in award_financial_schema} test_data_df = pd.DataFrame( data={ @@ -127,18 +114,12 @@ def award_financial_table_award_category(spark, s3_unittest_data_bucket, hive_un @pytest.fixture(scope="function") -def account_balances_download_table(spark, s3_unittest_data_bucket, hive_unittest_metastore_db, monkeypatch): +def account_balances_download_table(spark, s3_unittest_data_bucket, hive_unittest_metastore_db): call_command( "create_delta_table", "--destination-table=account_balances_download", f"--spark-s3-bucket={s3_unittest_data_bucket}", ) - monkeypatch.setattr( - f"usaspending_api.download.delta_downloads.account_balances.AccountBalancesMixin.download_table", - spark.read.format("delta").load( - f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/account_balances_download" - ), - ) column_placeholders = {field.name: [None] * 5 for field in account_balances_schema} test_data_df = pd.DataFrame( data={ @@ -175,20 +156,12 @@ def account_balances_download_table(spark, s3_unittest_data_bucket, hive_unittes @pytest.fixture(scope="function") -def object_class_by_program_activity_download_table( - spark, s3_unittest_data_bucket, hive_unittest_metastore_db, monkeypatch -): +def object_class_by_program_activity_download_table(spark, 
s3_unittest_data_bucket, hive_unittest_metastore_db): call_command( "create_delta_table", "--destination-table=object_class_program_activity_download", f"--spark-s3-bucket={s3_unittest_data_bucket}", ) - monkeypatch.setattr( - f"usaspending_api.download.delta_downloads.object_class_program_activity.ObjectClassProgramActivityMixin.download_table", - spark.read.format("delta").load( - f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/object_class_program_activity_download" - ), - ) column_placeholders = {field.name: [None] * 5 for field in object_class_program_activity_schema} test_data_df = pd.DataFrame( data={ diff --git a/usaspending_api/download/tests/integration/test_download_accounts.py b/usaspending_api/download/tests/integration/test_download_accounts.py index 62123cb966..08560f6d95 100644 --- a/usaspending_api/download/tests/integration/test_download_accounts.py +++ b/usaspending_api/download/tests/integration/test_download_accounts.py @@ -10,7 +10,6 @@ from django.core.management import call_command from model_bakery import baker from rest_framework import status -from usaspending_api.config import CONFIG from usaspending_api.accounts.models import FederalAccount, TreasuryAppropriationAccount from usaspending_api.awards.models import FinancialAccountsByAwards @@ -23,42 +22,24 @@ @pytest.fixture -def create_download_delta_tables(spark, s3_unittest_data_bucket, hive_unittest_metastore_db, monkeypatch): +def create_download_delta_tables(spark, s3_unittest_data_bucket, hive_unittest_metastore_db): call_command( "create_delta_table", f"--spark-s3-bucket={s3_unittest_data_bucket}", f"--destination-table=award_financial_download", ) - monkeypatch.setattr( - f"usaspending_api.download.delta_downloads.award_financial.AwardFinancialMixin.download_table", - spark.read.format("delta").load( - f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/award_financial_download" - ), - ) call_command( "create_delta_table", 
f"--spark-s3-bucket={s3_unittest_data_bucket}", f"--destination-table=object_class_program_activity_download", ) - monkeypatch.setattr( - f"usaspending_api.download.delta_downloads.object_class_program_activity.ObjectClassProgramActivityMixin.download_table", - spark.read.format("delta").load( - f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/object_class_program_activity_download" - ), - ) call_command( "create_delta_table", f"--spark-s3-bucket={s3_unittest_data_bucket}", f"--destination-table=account_balances_download", ) - monkeypatch.setattr( - f"usaspending_api.download.delta_downloads.account_balances.AccountBalancesMixin.download_table", - spark.read.format("delta").load( - f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/account_balances_download" - ), - ) yield diff --git a/usaspending_api/etl/tests/integration/test_load_to_from_delta.py b/usaspending_api/etl/tests/integration/test_load_to_from_delta.py index ac383cc9e4..273c889214 100644 --- a/usaspending_api/etl/tests/integration/test_load_to_from_delta.py +++ b/usaspending_api/etl/tests/integration/test_load_to_from_delta.py @@ -19,7 +19,6 @@ from django.conf import settings from django.core.management import call_command from django.db import connection, connections, transaction, models -from usaspending_api.config import CONFIG from usaspending_api.common.helpers.sql_helpers import get_database_dsn_string from usaspending_api.etl.award_helpers import update_awards from usaspending_api.etl.broker_etl_helpers import dictfetchall @@ -1053,19 +1052,12 @@ def test_load_object_class_program_activity_class( s3_unittest_data_bucket, hive_unittest_metastore_db, populate_usas_data_and_recipients_from_broker, - monkeypatch, ): call_command( "create_delta_table", "--destination-table=object_class_program_activity_download", f"--spark-s3-bucket={s3_unittest_data_bucket}", ) - monkeypatch.setattr( - 
f"usaspending_api.download.delta_downloads.object_class_program_activity.ObjectClassProgramActivityMixin.download_table", - spark.read.format("delta").load( - f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/object_class_program_activity_download" - ), - ) verify_delta_table_loaded_to_delta( spark, @@ -1082,7 +1074,6 @@ def test_load_award_financial_download( s3_unittest_data_bucket, populate_usas_data_and_recipients_from_broker, hive_unittest_metastore_db, - monkeypatch, ): load_delta_table_from_postgres("published_fabs", s3_unittest_data_bucket) @@ -1124,12 +1115,6 @@ def test_load_award_financial_download( "--destination-table=award_financial_download", f"--spark-s3-bucket={s3_unittest_data_bucket}", ) - monkeypatch.setattr( - f"usaspending_api.download.delta_downloads.award_financial.AwardFinancialMixin.download_table", - spark.read.format("delta").load( - f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/award_financial_download" - ), - ) expected_data = [ { @@ -1237,7 +1222,6 @@ def test_load_account_balances_download( spark, s3_unittest_data_bucket, hive_unittest_metastore_db, - monkeypatch, populate_usas_data_and_recipients_from_broker, ): call_command( @@ -1245,12 +1229,6 @@ def test_load_account_balances_download( "--destination-table=account_balances_download", f"--spark-s3-bucket={s3_unittest_data_bucket}", ) - monkeypatch.setattr( - f"usaspending_api.download.delta_downloads.account_balances.AccountBalancesMixin.download_table", - spark.read.format("delta").load( - f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/account_balances_download" - ), - ) verify_delta_table_loaded_to_delta( spark, From 02414869577774db2b3ef6fced111898da89031d Mon Sep 17 00:00:00 2001 From: aguest-kc Date: Mon, 26 Jan 2026 13:32:35 -0600 Subject: [PATCH 19/59] [DEV-12858] Add and configure ruff linter --- .github/workflows/code-style-checks.yaml | 19 ++++++++--- pyproject.toml | 43 ++++++++++++++++++++++-- uv.lock | 30 
++++++++++++++++- 3 files changed, 85 insertions(+), 7 deletions(-) diff --git a/.github/workflows/code-style-checks.yaml b/.github/workflows/code-style-checks.yaml index 488458f2ec..5f5d1e731f 100644 --- a/.github/workflows/code-style-checks.yaml +++ b/.github/workflows/code-style-checks.yaml @@ -31,6 +31,14 @@ jobs: - name: Checkout Source Repository uses: actions/checkout@v4 + - name: Get Changed Files + id: changed-files + uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 + with: + files: | + **.py + separator: " " + - name: Set Combined ENV run: | echo "DATA_BROKER_DATABASE_URL=postgres://$BROKER_DB_USER:$BROKER_DB_PASSWORD@$BROKER_DB_HOST:$BROKER_DB_PORT/$BROKER_DB_NAME" >> $GITHUB_ENV @@ -52,11 +60,14 @@ jobs: - name: Init Python Environment uses: ./.github/actions/init-python-environment - - name: Run Flake8 - run: flake8 +# - name: Run Flake8 +# run: flake8 +# +# - name: Run Black +# run: black --check --diff . - - name: Run Black - run: black --check --diff . 
+ - name: Run ruff linter + run: ruff check ${{ steps.changed-files.outputs.all_changed_files }} - name: Run Check For Endpoint Documentation run: python manage.py check_for_endpoint_documentation diff --git a/pyproject.toml b/pyproject.toml index fcc33952fc..11ced84641 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -97,11 +97,11 @@ dependencies = [ [project.optional-dependencies] dev = [ - "black==24.10.0", +# "black==24.10.0", "click==8.1.7", "docker==7.0.0", "dredd-hooks==0.2.0", - "flake8==7.1.0", +# "flake8==7.1.0", "importlib-metadata==8.5.0", "mock==5.1.*", "model-bakery==1.17.*", @@ -112,6 +112,7 @@ dev = [ "pytest-django==4.8.*", "pytest-pretty==1.2.*", "pytest-xdist==3.5.*", + "ruff==0.14.14", ] spark = [ "delta-spark==3.2.*", @@ -167,3 +168,41 @@ exclude_lines = [ line-length = 120 target-version = ['py310'] exclude = '/(\.git|\.venv|venv|migrations)/' + +[tool.ruff.lint] +exclude = [ + '.git', + '.venv', + 'venv', + '**/migrations/**', + 'build', + 'usaspending_api.egg-info' +] + +select = [ + "PLR0913", # max arguments in function + "PLR0904", # max number of public methods + "PLR0911", # max number of return statements + "PLR0916", # max number of boolean expressions + "PLR0915", # max number of lines in a function + "PLR0912", # max number of logical branches in a function + "PLR1702", # max number of nested blocks + "C901", # cognitive complexity (functions) + "I001", # unsorted imports + "E", # pycodestyle errors + "F", # pyflakes + "W", # pycodestyle warnings +] +ignore = [ + "E203", # whitespace before punctuation +] + +pylint.max-args = 6 +pylint.max-public-methods = 20 +pylint.max-returns = 3 +pylint.max-bool-expr = 8 +pylint.max-statements = 45 +pylint.max-branches = 10 +pylint.max-nested-blocks = 5 +mccabe.max-complexity = 15 +pycodestyle.max-line-length = 120 \ No newline at end of file diff --git a/uv.lock b/uv.lock index a8ad9c7a03..b5c4cb1125 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 
requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.12'", @@ -2088,6 +2088,32 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/93/0c0f002031f18b53af7a6166103c02b9c0667be528944137cc954ec921b3/rsa-4.7.2-py3-none-any.whl", hash = "sha256:78f9a9bf4e7be0c5ded4583326e7461e3a3c5aae24073648b4bdfa797d78c9d2", size = 34505, upload-time = "2021-02-24T10:55:03.55Z" }, ] +[[package]] +name = "ruff" +version = "0.14.14" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2e/06/f71e3a86b2df0dfa2d2f72195941cd09b44f87711cb7fa5193732cb9a5fc/ruff-0.14.14.tar.gz", hash = "sha256:2d0f819c9a90205f3a867dbbd0be083bee9912e170fd7d9704cc8ae45824896b", size = 4515732, upload-time = "2026-01-22T22:30:17.527Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d2/89/20a12e97bc6b9f9f68343952da08a8099c57237aef953a56b82711d55edd/ruff-0.14.14-py3-none-linux_armv6l.whl", hash = "sha256:7cfe36b56e8489dee8fbc777c61959f60ec0f1f11817e8f2415f429552846aed", size = 10467650, upload-time = "2026-01-22T22:30:08.578Z" }, + { url = "https://files.pythonhosted.org/packages/a3/b1/c5de3fd2d5a831fcae21beda5e3589c0ba67eec8202e992388e4b17a6040/ruff-0.14.14-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:6006a0082336e7920b9573ef8a7f52eec837add1265cc74e04ea8a4368cd704c", size = 10883245, upload-time = "2026-01-22T22:30:04.155Z" }, + { url = "https://files.pythonhosted.org/packages/b8/7c/3c1db59a10e7490f8f6f8559d1db8636cbb13dccebf18686f4e3c9d7c772/ruff-0.14.14-py3-none-macosx_11_0_arm64.whl", hash = "sha256:026c1d25996818f0bf498636686199d9bd0d9d6341c9c2c3b62e2a0198b758de", size = 10231273, upload-time = "2026-01-22T22:30:34.642Z" }, + { url = "https://files.pythonhosted.org/packages/a1/6e/5e0e0d9674be0f8581d1f5e0f0a04761203affce3232c1a1189d0e3b4dad/ruff-0.14.14-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f666445819d31210b71e0a6d1c01e24447a20b85458eea25a25fe8142210ae0e", size 
= 10585753, upload-time = "2026-01-22T22:30:31.781Z" }, + { url = "https://files.pythonhosted.org/packages/23/09/754ab09f46ff1884d422dc26d59ba18b4e5d355be147721bb2518aa2a014/ruff-0.14.14-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3c0f18b922c6d2ff9a5e6c3ee16259adc513ca775bcf82c67ebab7cbd9da5bc8", size = 10286052, upload-time = "2026-01-22T22:30:24.827Z" }, + { url = "https://files.pythonhosted.org/packages/c8/cc/e71f88dd2a12afb5f50733851729d6b571a7c3a35bfdb16c3035132675a0/ruff-0.14.14-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1629e67489c2dea43e8658c3dba659edbfd87361624b4040d1df04c9740ae906", size = 11043637, upload-time = "2026-01-22T22:30:13.239Z" }, + { url = "https://files.pythonhosted.org/packages/67/b2/397245026352494497dac935d7f00f1468c03a23a0c5db6ad8fc49ca3fb2/ruff-0.14.14-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:27493a2131ea0f899057d49d303e4292b2cae2bb57253c1ed1f256fbcd1da480", size = 12194761, upload-time = "2026-01-22T22:30:22.542Z" }, + { url = "https://files.pythonhosted.org/packages/5b/06/06ef271459f778323112c51b7587ce85230785cd64e91772034ddb88f200/ruff-0.14.14-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:01ff589aab3f5b539e35db38425da31a57521efd1e4ad1ae08fc34dbe30bd7df", size = 12005701, upload-time = "2026-01-22T22:30:20.499Z" }, + { url = "https://files.pythonhosted.org/packages/41/d6/99364514541cf811ccc5ac44362f88df66373e9fec1b9d1c4cc830593fe7/ruff-0.14.14-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1cc12d74eef0f29f51775f5b755913eb523546b88e2d733e1d701fe65144e89b", size = 11282455, upload-time = "2026-01-22T22:29:59.679Z" }, + { url = "https://files.pythonhosted.org/packages/ca/71/37daa46f89475f8582b7762ecd2722492df26421714a33e72ccc9a84d7a5/ruff-0.14.14-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb8481604b7a9e75eff53772496201690ce2687067e038b3cc31aaf16aa0b974", size = 11215882, 
upload-time = "2026-01-22T22:29:57.032Z" }, + { url = "https://files.pythonhosted.org/packages/2c/10/a31f86169ec91c0705e618443ee74ede0bdd94da0a57b28e72db68b2dbac/ruff-0.14.14-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:14649acb1cf7b5d2d283ebd2f58d56b75836ed8c6f329664fa91cdea19e76e66", size = 11180549, upload-time = "2026-01-22T22:30:27.175Z" }, + { url = "https://files.pythonhosted.org/packages/fd/1e/c723f20536b5163adf79bdd10c5f093414293cdf567eed9bdb7b83940f3f/ruff-0.14.14-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:e8058d2145566510790eab4e2fad186002e288dec5e0d343a92fe7b0bc1b3e13", size = 10543416, upload-time = "2026-01-22T22:30:01.964Z" }, + { url = "https://files.pythonhosted.org/packages/3e/34/8a84cea7e42c2d94ba5bde1d7a4fae164d6318f13f933d92da6d7c2041ff/ruff-0.14.14-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:e651e977a79e4c758eb807f0481d673a67ffe53cfa92209781dfa3a996cf8412", size = 10285491, upload-time = "2026-01-22T22:30:29.51Z" }, + { url = "https://files.pythonhosted.org/packages/55/ef/b7c5ea0be82518906c978e365e56a77f8de7678c8bb6651ccfbdc178c29f/ruff-0.14.14-py3-none-musllinux_1_2_i686.whl", hash = "sha256:cc8b22da8d9d6fdd844a68ae937e2a0adf9b16514e9a97cc60355e2d4b219fc3", size = 10733525, upload-time = "2026-01-22T22:30:06.499Z" }, + { url = "https://files.pythonhosted.org/packages/6a/5b/aaf1dfbcc53a2811f6cc0a1759de24e4b03e02ba8762daabd9b6bd8c59e3/ruff-0.14.14-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:16bc890fb4cc9781bb05beb5ab4cd51be9e7cb376bf1dd3580512b24eb3fda2b", size = 11315626, upload-time = "2026-01-22T22:30:36.848Z" }, + { url = "https://files.pythonhosted.org/packages/2c/aa/9f89c719c467dfaf8ad799b9bae0df494513fb21d31a6059cb5870e57e74/ruff-0.14.14-py3-none-win32.whl", hash = "sha256:b530c191970b143375b6a68e6f743800b2b786bbcf03a7965b06c4bf04568167", size = 10502442, upload-time = "2026-01-22T22:30:38.93Z" }, + { url = 
"https://files.pythonhosted.org/packages/87/44/90fa543014c45560cae1fffc63ea059fb3575ee6e1cb654562197e5d16fb/ruff-0.14.14-py3-none-win_amd64.whl", hash = "sha256:3dde1435e6b6fe5b66506c1dff67a421d0b7f6488d466f651c07f4cab3bf20fd", size = 11630486, upload-time = "2026-01-22T22:30:10.852Z" }, + { url = "https://files.pythonhosted.org/packages/9e/6a/40fee331a52339926a92e17ae748827270b288a35ef4a15c9c8f2ec54715/ruff-0.14.14-py3-none-win_arm64.whl", hash = "sha256:56e6981a98b13a32236a72a8da421d7839221fa308b223b9283312312e5ac76c", size = 10920448, upload-time = "2026-01-22T22:30:15.417Z" }, +] + [[package]] name = "s3transfer" version = "0.10.4" @@ -2315,6 +2341,7 @@ dev = [ { name = "pytest-django" }, { name = "pytest-pretty" }, { name = "pytest-xdist" }, + { name = "ruff" }, ] server = [ { name = "django-redis" }, @@ -2420,6 +2447,7 @@ requires-dist = [ { name = "python-json-logger", specifier = "==2.0.7" }, { name = "requests", specifier = "==2.31.*" }, { name = "retrying", specifier = "==1.3.4" }, + { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.14.14" }, { name = "setuptools", marker = "extra == 'server'", specifier = ">=68.1.2" }, { name = "sqlparse", specifier = "==0.5.*" }, { name = "supervisor", marker = "extra == 'server'", specifier = "==4.1.0" }, From a1d393ea9c7347e03721fa88ffde28abd340bcbc Mon Sep 17 00:00:00 2001 From: aguest-kc Date: Mon, 26 Jan 2026 14:00:17 -0600 Subject: [PATCH 20/59] [DEV-12858] Add flake8-bugber rules --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 11ced84641..59bda03976 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -189,6 +189,7 @@ select = [ "PLR1702", # max number of nested blocks "C901", # cognitive complexity (functions) "I001", # unsorted imports + "B", # flake8 bugbear "E", # pycodestyle errors "F", # pyflakes "W", # pycodestyle warnings From bc5064c9afbfa755d5a74fc0275ffdd0ef02b785 Mon Sep 17 00:00:00 2001 From: aguest-kc Date: Mon, 26 Jan 2026 
14:18:27 -0600 Subject: [PATCH 21/59] [DEV-12858] Log changed files --- .github/workflows/code-style-checks.yaml | 8 ++++++++ pyproject.toml | 1 + 2 files changed, 9 insertions(+) diff --git a/.github/workflows/code-style-checks.yaml b/.github/workflows/code-style-checks.yaml index 5f5d1e731f..d9edcd4991 100644 --- a/.github/workflows/code-style-checks.yaml +++ b/.github/workflows/code-style-checks.yaml @@ -39,6 +39,14 @@ jobs: **.py separator: " " + - name: List all changed files + env: + ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }} + run: | + for file in ${ALL_CHANGED_FILES}; do + echo "$file was changed" + done + - name: Set Combined ENV run: | echo "DATA_BROKER_DATABASE_URL=postgres://$BROKER_DB_USER:$BROKER_DB_PASSWORD@$BROKER_DB_HOST:$BROKER_DB_PORT/$BROKER_DB_NAME" >> $GITHUB_ENV diff --git a/pyproject.toml b/pyproject.toml index 59bda03976..6ec0c4d606 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -170,6 +170,7 @@ target-version = ['py310'] exclude = '/(\.git|\.venv|venv|migrations)/' [tool.ruff.lint] +preview = true # enable new rules exclude = [ '.git', '.venv', From 83c74a9fcdcb5a8c0b76119f5a0475376afd5833 Mon Sep 17 00:00:00 2001 From: aguest-kc Date: Mon, 26 Jan 2026 14:31:20 -0600 Subject: [PATCH 22/59] [DEV-12858] Check if changed files is an empty string --- .github/workflows/code-style-checks.yaml | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/.github/workflows/code-style-checks.yaml b/.github/workflows/code-style-checks.yaml index d9edcd4991..47abf6be68 100644 --- a/.github/workflows/code-style-checks.yaml +++ b/.github/workflows/code-style-checks.yaml @@ -31,21 +31,17 @@ jobs: - name: Checkout Source Repository uses: actions/checkout@v4 - - name: Get Changed Files - id: changed-files + - name: Get Changed Python Files + id: changed-python-files uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 with: files: | **.py separator: " " - - name: List all 
changed files - env: - ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }} - run: | - for file in ${ALL_CHANGED_FILES}; do - echo "$file was changed" - done + - name: Check if changed files is an empty string + if: steps.changed-python-files.outputs.all_changed_files == '' + run: echo "No changed files detected" - name: Set Combined ENV run: | @@ -75,7 +71,10 @@ jobs: # run: black --check --diff . - name: Run ruff linter - run: ruff check ${{ steps.changed-files.outputs.all_changed_files }} + if: steps.changed-python-files.outputs.all_changed_files != '' + run: | + echo "Running: ruff check ${{ steps.changed-python-files.outputs.all_changed_files }}" + ruff check ${{ steps.changed-python-files.outputs.all_changed_files }} - name: Run Check For Endpoint Documentation run: python manage.py check_for_endpoint_documentation From c4da9249d523237cea53761de4ceeb4ad7c2a348 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Mon, 26 Jan 2026 16:56:22 -0600 Subject: [PATCH 23/59] [DEV-14110] - add missing migration and update column type --- usaspending_api/awards/delta_models/awards.py | 2 +- ...tionsearch_initial_report_date_and_more.py | 2 ++ .../0060_alter_initial_report_date_andmore.py | 21 +++++++++++++------ 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/usaspending_api/awards/delta_models/awards.py b/usaspending_api/awards/delta_models/awards.py index 51c105c089..a1296246aa 100644 --- a/usaspending_api/awards/delta_models/awards.py +++ b/usaspending_api/awards/delta_models/awards.py @@ -17,7 +17,7 @@ "generated_unique_award_id": "STRING NOT NULL", "id": "LONG NOT NULL", "is_fpds": "BOOLEAN NOT NULL", - "last_modified_date": "DATE", + "last_modified_date": "TIMESTAMP", "latest_transaction_id": "LONG", "non_federal_funding_amount": "NUMERIC(23,2)", "officer_1_amount": "NUMERIC(23,2)", diff --git a/usaspending_api/search/migrations/0059_alter_transactionsearch_initial_report_date_and_more.py 
b/usaspending_api/search/migrations/0059_alter_transactionsearch_initial_report_date_and_more.py index 4edca94e74..e9846e9b84 100644 --- a/usaspending_api/search/migrations/0059_alter_transactionsearch_initial_report_date_and_more.py +++ b/usaspending_api/search/migrations/0059_alter_transactionsearch_initial_report_date_and_more.py @@ -2,6 +2,7 @@ from django.db import migrations, models + class Migration(migrations.Migration): atomic = False dependencies = [ @@ -13,6 +14,7 @@ class Migration(migrations.Migration): migrations.RunSQL( sql=""" DROP VIEW IF EXISTS + vw_awards, vw_transaction_fabs, vw_transaction_normalized, vw_transaction_fpds, diff --git a/usaspending_api/search/migrations/0060_alter_initial_report_date_andmore.py b/usaspending_api/search/migrations/0060_alter_initial_report_date_andmore.py index 15c64aadaf..29b15cf51a 100644 --- a/usaspending_api/search/migrations/0060_alter_initial_report_date_andmore.py +++ b/usaspending_api/search/migrations/0060_alter_initial_report_date_andmore.py @@ -1,6 +1,7 @@ # Generated by Django 4.2.23 on 2026-01-02 16:42 from django.db import migrations, models +from usaspending_api.awards.models.award import vw_awards_sql from usaspending_api.awards.models.transaction_normalized import vw_transaction_normalized_sql from usaspending_api.awards.models.transaction_fpds import vw_transaction_fpds_sql from usaspending_api.awards.models.transaction_fabs import vw_transaction_fabs_sql @@ -9,6 +10,7 @@ with open(transaction_delta_view_file, "r") as f: transaction_delta_view = f.read() + class Migration(migrations.Migration): atomic = False dependencies = [ @@ -26,18 +28,25 @@ class Migration(migrations.Migration): name="last_modified_date", field=models.DateTimeField(null=True), ), - + migrations.AlterField( + model_name="awardsearch", + name="last_modified_date", + field=models.DateTimeField(null=True), + ), migrations.RunSQL( - sql=f"""{vw_transaction_normalized_sql} - {vw_transaction_fpds_sql} - {vw_transaction_fabs_sql} - 
{transaction_delta_view} + sql=f""" + {vw_awards_sql} + {vw_transaction_normalized_sql} + {vw_transaction_fpds_sql} + {vw_transaction_fabs_sql} + {transaction_delta_view} """, reverse_sql="""DROP VIEW IF EXISTS + vw_awards, vw_transaction_fabs, vw_transaction_normalized, vw_transaction_fpds, transaction_delta_view """, - ) + ), ] From 8972c09a85d37551a8ce6fac888485bf5bf51f16 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Mon, 26 Jan 2026 17:04:35 -0600 Subject: [PATCH 24/59] [DEV-14110] - update migration to be consistent with model --- .../search/migrations/0060_alter_initial_report_date_andmore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/usaspending_api/search/migrations/0060_alter_initial_report_date_andmore.py b/usaspending_api/search/migrations/0060_alter_initial_report_date_andmore.py index 29b15cf51a..c279029034 100644 --- a/usaspending_api/search/migrations/0060_alter_initial_report_date_andmore.py +++ b/usaspending_api/search/migrations/0060_alter_initial_report_date_andmore.py @@ -31,7 +31,7 @@ class Migration(migrations.Migration): migrations.AlterField( model_name="awardsearch", name="last_modified_date", - field=models.DateTimeField(null=True), + field=models.DateTimeField(blank=True, null=True), ), migrations.RunSQL( sql=f""" From f82a47fdc0d33a216abb8e8c2796c9863c9a9289 Mon Sep 17 00:00:00 2001 From: aguest-kc Date: Tue, 27 Jan 2026 07:56:14 -0600 Subject: [PATCH 25/59] [DEV-12858] Update workflow comments --- .github/workflows/code-style-checks.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/code-style-checks.yaml b/.github/workflows/code-style-checks.yaml index 47abf6be68..271479078f 100644 --- a/.github/workflows/code-style-checks.yaml +++ b/.github/workflows/code-style-checks.yaml @@ -39,7 +39,7 @@ jobs: **.py separator: " " - - name: Check if changed files is an empty string + - name: Check If `changed-python-files` Is An Empty String if: 
steps.changed-python-files.outputs.all_changed_files == '' run: echo "No changed files detected" @@ -70,7 +70,8 @@ jobs: # - name: Run Black # run: black --check --diff . - - name: Run ruff linter + # changed-python-files could be an empty string, which would cause `ruff check` to be run against the entire project. + - name: Run Ruff Linter if: steps.changed-python-files.outputs.all_changed_files != '' run: | echo "Running: ruff check ${{ steps.changed-python-files.outputs.all_changed_files }}" From 19d339c536ff0fad9df9ad74fdab0600e553185e Mon Sep 17 00:00:00 2001 From: aguest-kc Date: Tue, 27 Jan 2026 09:09:46 -0600 Subject: [PATCH 26/59] [DEV-12858] Ruff linter fix --- usaspending_api/views.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/usaspending_api/views.py b/usaspending_api/views.py index f8b97629d1..aeea1f6d3e 100644 --- a/usaspending_api/views.py +++ b/usaspending_api/views.py @@ -1,6 +1,7 @@ +import json + from django.http import HttpResponse from django.views import View -import json class StatusView(View): From 181d2c92e8c73795f3dddf62293e3bde26222fa4 Mon Sep 17 00:00:00 2001 From: aguest-kc Date: Tue, 27 Jan 2026 09:43:52 -0600 Subject: [PATCH 27/59] [DEV-12858] Remove black and flake8 --- pyproject.toml | 7 ---- uv.lock | 90 +------------------------------------------------- 2 files changed, 1 insertion(+), 96 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6ec0c4d606..90df4a9565 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -97,11 +97,9 @@ dependencies = [ [project.optional-dependencies] dev = [ -# "black==24.10.0", "click==8.1.7", "docker==7.0.0", "dredd-hooks==0.2.0", -# "flake8==7.1.0", "importlib-metadata==8.5.0", "mock==5.1.*", "model-bakery==1.17.*", @@ -164,11 +162,6 @@ exclude_lines = [ "pragma: no cover" ] -[tool.black] -line-length = 120 -target-version = ['py310'] -exclude = '/(\.git|\.venv|venv|migrations)/' - [tool.ruff.lint] preview = true # enable new rules exclude = [ diff --git a/uv.lock 
b/uv.lock index b5c4cb1125..e1b655e1f6 100644 --- a/uv.lock +++ b/uv.lock @@ -100,40 +100,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/09/71/0f5e89fcafc2aae704a421c899df4d56622a364731751ba93a1794f1879e/awscli-1.34.33-py3-none-any.whl", hash = "sha256:4ef6e2b0b72e7d33c0c5ce3ae499f26eb1e814e35deb036b708cdc46cb39ef27", size = 4520041, upload-time = "2024-10-03T19:18:14.963Z" }, ] -[[package]] -name = "black" -version = "24.10.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "click" }, - { name = "mypy-extensions" }, - { name = "packaging" }, - { name = "pathspec" }, - { name = "platformdirs" }, - { name = "tomli", marker = "python_full_version < '3.11'" }, - { name = "typing-extensions", marker = "python_full_version < '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/d8/0d/cc2fb42b8c50d80143221515dd7e4766995bd07c56c9a3ed30baf080b6dc/black-24.10.0.tar.gz", hash = "sha256:846ea64c97afe3bc677b761787993be4991810ecc7a4a937816dd6bddedc4875", size = 645813, upload-time = "2024-10-07T19:20:50.361Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a3/f3/465c0eb5cddf7dbbfe1fecd9b875d1dcf51b88923cd2c1d7e9ab95c6336b/black-24.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e6668650ea4b685440857138e5fe40cde4d652633b1bdffc62933d0db4ed9812", size = 1623211, upload-time = "2024-10-07T19:26:12.43Z" }, - { url = "https://files.pythonhosted.org/packages/df/57/b6d2da7d200773fdfcc224ffb87052cf283cec4d7102fab450b4a05996d8/black-24.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1c536fcf674217e87b8cc3657b81809d3c085d7bf3ef262ead700da345bfa6ea", size = 1457139, upload-time = "2024-10-07T19:25:06.453Z" }, - { url = "https://files.pythonhosted.org/packages/6e/c5/9023b7673904a5188f9be81f5e129fff69f51f5515655fbd1d5a4e80a47b/black-24.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:649fff99a20bd06c6f727d2a27f401331dc0cc861fb69cde910fe95b01b5928f", size = 1753774, upload-time = "2024-10-07T19:23:58.47Z" }, - { url = "https://files.pythonhosted.org/packages/e1/32/df7f18bd0e724e0d9748829765455d6643ec847b3f87e77456fc99d0edab/black-24.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:fe4d6476887de70546212c99ac9bd803d90b42fc4767f058a0baa895013fbb3e", size = 1414209, upload-time = "2024-10-07T19:24:42.54Z" }, - { url = "https://files.pythonhosted.org/packages/c2/cc/7496bb63a9b06a954d3d0ac9fe7a73f3bf1cd92d7a58877c27f4ad1e9d41/black-24.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5a2221696a8224e335c28816a9d331a6c2ae15a2ee34ec857dcf3e45dbfa99ad", size = 1607468, upload-time = "2024-10-07T19:26:14.966Z" }, - { url = "https://files.pythonhosted.org/packages/2b/e3/69a738fb5ba18b5422f50b4f143544c664d7da40f09c13969b2fd52900e0/black-24.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f9da3333530dbcecc1be13e69c250ed8dfa67f43c4005fb537bb426e19200d50", size = 1437270, upload-time = "2024-10-07T19:25:24.291Z" }, - { url = "https://files.pythonhosted.org/packages/c9/9b/2db8045b45844665c720dcfe292fdaf2e49825810c0103e1191515fc101a/black-24.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4007b1393d902b48b36958a216c20c4482f601569d19ed1df294a496eb366392", size = 1737061, upload-time = "2024-10-07T19:23:52.18Z" }, - { url = "https://files.pythonhosted.org/packages/a3/95/17d4a09a5be5f8c65aa4a361444d95edc45def0de887810f508d3f65db7a/black-24.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:394d4ddc64782e51153eadcaaca95144ac4c35e27ef9b0a42e121ae7e57a9175", size = 1423293, upload-time = "2024-10-07T19:24:41.7Z" }, - { url = "https://files.pythonhosted.org/packages/90/04/bf74c71f592bcd761610bbf67e23e6a3cff824780761f536512437f1e655/black-24.10.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b5e39e0fae001df40f95bd8cc36b9165c5e2ea88900167bddf258bacef9bbdc3", size = 1644256, upload-time = 
"2024-10-07T19:27:53.355Z" }, - { url = "https://files.pythonhosted.org/packages/4c/ea/a77bab4cf1887f4b2e0bce5516ea0b3ff7d04ba96af21d65024629afedb6/black-24.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d37d422772111794b26757c5b55a3eade028aa3fde43121ab7b673d050949d65", size = 1448534, upload-time = "2024-10-07T19:26:44.953Z" }, - { url = "https://files.pythonhosted.org/packages/4e/3e/443ef8bc1fbda78e61f79157f303893f3fddf19ca3c8989b163eb3469a12/black-24.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:14b3502784f09ce2443830e3133dacf2c0110d45191ed470ecb04d0f5f6fcb0f", size = 1761892, upload-time = "2024-10-07T19:24:10.264Z" }, - { url = "https://files.pythonhosted.org/packages/52/93/eac95ff229049a6901bc84fec6908a5124b8a0b7c26ea766b3b8a5debd22/black-24.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:30d2c30dc5139211dda799758559d1b049f7f14c580c409d6ad925b74a4208a8", size = 1434796, upload-time = "2024-10-07T19:25:06.239Z" }, - { url = "https://files.pythonhosted.org/packages/d0/a0/a993f58d4ecfba035e61fca4e9f64a2ecae838fc9f33ab798c62173ed75c/black-24.10.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1cbacacb19e922a1d75ef2b6ccaefcd6e93a2c05ede32f06a21386a04cedb981", size = 1643986, upload-time = "2024-10-07T19:28:50.684Z" }, - { url = "https://files.pythonhosted.org/packages/37/d5/602d0ef5dfcace3fb4f79c436762f130abd9ee8d950fa2abdbf8bbc555e0/black-24.10.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1f93102e0c5bb3907451063e08b9876dbeac810e7da5a8bfb7aeb5a9ef89066b", size = 1448085, upload-time = "2024-10-07T19:28:12.093Z" }, - { url = "https://files.pythonhosted.org/packages/47/6d/a3a239e938960df1a662b93d6230d4f3e9b4a22982d060fc38c42f45a56b/black-24.10.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ddacb691cdcdf77b96f549cf9591701d8db36b2f19519373d60d31746068dbf2", size = 1760928, upload-time = "2024-10-07T19:24:15.233Z" }, - { url = 
"https://files.pythonhosted.org/packages/dd/cf/af018e13b0eddfb434df4d9cd1b2b7892bab119f7a20123e93f6910982e8/black-24.10.0-cp313-cp313-win_amd64.whl", hash = "sha256:680359d932801c76d2e9c9068d05c6b107f2584b2a5b88831c83962eb9984c1b", size = 1436875, upload-time = "2024-10-07T19:24:42.762Z" }, - { url = "https://files.pythonhosted.org/packages/8d/a7/4b27c50537ebca8bec139b872861f9d2bf501c5ec51fcf897cb924d9e264/black-24.10.0-py3-none-any.whl", hash = "sha256:3bb2b7a1f7b685f85b11fed1ef10f8a9148bceb49853e47a294a3dd963c1dd7d", size = 206898, upload-time = "2024-10-07T19:20:48.317Z" }, -] - [[package]] name = "boto3" version = "1.35.33" @@ -785,20 +751,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/41/ea/6e5568ef338ba918be8c8fccc0a717d824c13187fe5cb9e8ad8530d113d1/fiscalyear-0.4.0-py3-none-any.whl", hash = "sha256:8adb8022a76cc52974d059d176ec3f33b2d7a6c1f72ac356702bc70e1e5e4d92", size = 8417, upload-time = "2022-02-17T03:18:26.523Z" }, ] -[[package]] -name = "flake8" -version = "7.1.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "mccabe" }, - { name = "pycodestyle" }, - { name = "pyflakes" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/4e/34/64f8a43736d9862ced7dd0ea5c3ed99815b8ff4b826a4f3bfd3a1b0639b1/flake8-7.1.0.tar.gz", hash = "sha256:48a07b626b55236e0fb4784ee69a465fbf59d79eec1f5b4785c3d3bc57d17aa5", size = 48240, upload-time = "2024-06-15T21:37:07.633Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/dc/43/d5147aadaa52558e94e024811f2f9543b4bd7203b3a9659eeb5dff9c61b3/flake8-7.1.0-py2.py3-none-any.whl", hash = "sha256:2e416edcc62471a64cea09353f4e7bdba32aeb079b6e360554c659a122b1bc6a", size = 57569, upload-time = "2024-06-15T21:37:05.342Z" }, -] - [[package]] name = "google-auth" version = "2.40.3" @@ -1024,15 +976,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/38/04/37055b7013dfaaf66e3a9a51e46857cc9be151476a891b995fa70da7e139/marshmallow-3.21.1-py3-none-any.whl", hash 
= "sha256:f085493f79efb0644f270a9bf2892843142d80d7174bbbd2f3713f2a589dc633", size = 49362, upload-time = "2024-03-04T20:21:15.753Z" }, ] -[[package]] -name = "mccabe" -version = "0.7.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e7/ff/0ffefdcac38932a54d2b5eed4e0ba8a408f215002cd178ad1df0f2806ff8/mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325", size = 9658, upload-time = "2022-01-24T01:14:51.113Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/27/1a/1f68f9ba0c207934b35b86a8ca3aad8395a3d6dd7921c0686e23853ff5a9/mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e", size = 7350, upload-time = "2022-01-24T01:14:49.62Z" }, -] - [[package]] name = "mdurl" version = "0.1.2" @@ -1640,15 +1583,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ab/5f/b38085618b950b79d2d9164a711c52b10aefc0ae6833b96f626b7021b2ed/pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a", size = 13098436, upload-time = "2024-09-20T13:09:48.112Z" }, ] -[[package]] -name = "pathspec" -version = "0.12.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ca/bc/f35b8446f4531a7cb215605d100cd88b7ac6f44ab3fc94870c120ab3adbf/pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712", size = 51043, upload-time = "2023-12-10T22:30:45Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191, upload-time = "2023-12-10T22:30:43.14Z" }, -] - [[package]] name = "pip" version = "23.2.1" @@ -1775,15 +1709,6 @@ wheels = [ 
{ url = "https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl", hash = "sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a", size = 181259, upload-time = "2025-03-28T02:41:19.028Z" }, ] -[[package]] -name = "pycodestyle" -version = "2.12.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/43/aa/210b2c9aedd8c1cbeea31a50e42050ad56187754b34eb214c46709445801/pycodestyle-2.12.1.tar.gz", hash = "sha256:6838eae08bbce4f6accd5d5572075c63626a15ee3e6f842df996bf62f6d73521", size = 39232, upload-time = "2024-08-04T20:26:54.576Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3a/d8/a211b3f85e99a0daa2ddec96c949cac6824bd305b040571b82a03dd62636/pycodestyle-2.12.1-py2.py3-none-any.whl", hash = "sha256:46f0fb92069a7c28ab7bb558f05bfc0110dac69a0cd23c61ea0040283a9d78b3", size = 31284, upload-time = "2024-08-04T20:26:53.173Z" }, -] - [[package]] name = "pycparser" version = "2.22" @@ -1817,15 +1742,6 @@ dotenv = [ { name = "python-dotenv" }, ] -[[package]] -name = "pyflakes" -version = "3.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/57/f9/669d8c9c86613c9d568757c7f5824bd3197d7b1c6c27553bc5618a27cce2/pyflakes-3.2.0.tar.gz", hash = "sha256:1c61603ff154621fb2a9172037d84dca3500def8c8b630657d1701f026f8af3f", size = 63788, upload-time = "2024-01-05T00:28:47.703Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d4/d7/f1b7db88d8e4417c5d47adad627a93547f44bdc9028372dbd2313f34a855/pyflakes-3.2.0-py2.py3-none-any.whl", hash = "sha256:84b5be138a2dfbb40689ca07e2152deb896a65c3a3e24c251c5c62489568074a", size = 62725, upload-time = "2024-01-05T00:28:45.903Z" }, -] - [[package]] name = "pygments" version = "2.19.1" @@ -2326,11 +2242,9 @@ awscli = [ { name = "awscli" }, ] dev = [ - { name = "black" }, { name = "click" }, { name = 
"docker" }, { name = "dredd-hooks" }, - { name = "flake8" }, { name = "importlib-metadata" }, { name = "mock" }, { name = "model-bakery" }, @@ -2362,7 +2276,6 @@ requires-dist = [ { name = "asyncpg", specifier = "==0.29.*" }, { name = "attrs", specifier = "==23.2.*" }, { name = "awscli", marker = "extra == 'awscli'", specifier = "==1.34.*" }, - { name = "black", marker = "extra == 'dev'", specifier = "==24.10.0" }, { name = "boto3", specifier = ">=1.34,<1.36" }, { name = "certifi", specifier = "==2024.7.4" }, { name = "click", marker = "extra == 'dev'", specifier = "==8.1.7" }, @@ -2389,7 +2302,6 @@ requires-dist = [ { name = "et-xmlfile", specifier = "==1.1.0" }, { name = "filelock", specifier = "==3.13.1" }, { name = "fiscalyear", specifier = "==0.4.0" }, - { name = "flake8", marker = "extra == 'dev'", specifier = "==7.1.0" }, { name = "importlib-metadata", marker = "extra == 'dev'", specifier = "==8.5.0" }, { name = "markdown", specifier = "==3.5.*" }, { name = "marshmallow", specifier = "==3.21.1" }, @@ -2447,7 +2359,7 @@ requires-dist = [ { name = "python-json-logger", specifier = "==2.0.7" }, { name = "requests", specifier = "==2.31.*" }, { name = "retrying", specifier = "==1.3.4" }, - { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.14.14" }, + { name = "ruff", marker = "extra == 'dev'", specifier = "==0.14.14" }, { name = "setuptools", marker = "extra == 'server'", specifier = ">=68.1.2" }, { name = "sqlparse", specifier = "==0.5.*" }, { name = "supervisor", marker = "extra == 'server'", specifier = "==4.1.0" }, From 6d34b836dd84a162c67a8ec560bc6f42b862e3e4 Mon Sep 17 00:00:00 2001 From: aguest-kc Date: Tue, 27 Jan 2026 09:46:20 -0600 Subject: [PATCH 28/59] [DEV-12858] Add ruff to pre-commit and clean up workflow --- .github/workflows/code-style-checks.yaml | 10 +--------- .pre-commit-config.yaml | 17 +++++------------ 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/.github/workflows/code-style-checks.yaml 
b/.github/workflows/code-style-checks.yaml index 271479078f..41963673e1 100644 --- a/.github/workflows/code-style-checks.yaml +++ b/.github/workflows/code-style-checks.yaml @@ -64,18 +64,10 @@ jobs: - name: Init Python Environment uses: ./.github/actions/init-python-environment -# - name: Run Flake8 -# run: flake8 -# -# - name: Run Black -# run: black --check --diff . - # changed-python-files could be an empty string, which would cause `ruff check` to be run against the entire project. - name: Run Ruff Linter if: steps.changed-python-files.outputs.all_changed_files != '' - run: | - echo "Running: ruff check ${{ steps.changed-python-files.outputs.all_changed_files }}" - ruff check ${{ steps.changed-python-files.outputs.all_changed_files }} + run: ruff check ${{ steps.changed-python-files.outputs.all_changed_files }} - name: Run Check For Endpoint Documentation run: python manage.py check_for_endpoint_documentation diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7ad8d1eb64..850453ca2c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,16 +3,9 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.5.0 hooks: - - id: debug-statements -- repo: https://github.com/pycqa/flake8.git - rev: 7.1.0 + - id: debug-statements +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.14.14 hooks: - - id: flake8 - language_version: python3.10.12 -- repo: https://github.com/psf/black - rev: 24.10.0 - hooks: - - id: black - language_version: python3.10.12 - additional_dependencies: - - "click==8.0.4" + - id: ruff-check + types_or: [ python, pyi ] # avoid linting other python file types like Jupyter notebooks From fd3ac6f7d9d5202ea8005f094194512bab184d55 Mon Sep 17 00:00:00 2001 From: aguest-kc Date: Tue, 27 Jan 2026 13:05:02 -0600 Subject: [PATCH 29/59] [DEV-12858] Add type annotation linting rules --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 
90df4a9565..566188c4d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -187,6 +187,9 @@ select = [ "E", # pycodestyle errors "F", # pyflakes "W", # pycodestyle warnings + "ANN001", # missing type annotation for function argument + "ANN201", # missing return type annotation for public function or method + "ANN202", # missing return type annotation for private function or method ] ignore = [ "E203", # whitespace before punctuation From ec830fcd066a2df861b2fa4f5ca0c10694cf0414 Mon Sep 17 00:00:00 2001 From: aguest-kc Date: Tue, 27 Jan 2026 13:05:40 -0600 Subject: [PATCH 30/59] [DEV-12858] Update files as test for ruff linter --- usaspending_api/accounts/helpers.py | 3 +-- usaspending_api/views.py | 7 +++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/usaspending_api/accounts/helpers.py b/usaspending_api/accounts/helpers.py index a978574d32..0751351120 100644 --- a/usaspending_api/accounts/helpers.py +++ b/usaspending_api/accounts/helpers.py @@ -1,6 +1,5 @@ import datetime - TAS_COMPONENT_TO_FIELD_MAPPING = { "ata": "allocation_transfer_agency_id", "aid": "agency_id", @@ -12,7 +11,7 @@ } -def start_and_end_dates_from_fyq(fiscal_year, fiscal_quarter): +def start_and_end_dates_from_fyq(fiscal_year: int, fiscal_quarter: int) -> tuple[datetime.date, datetime.date]: if fiscal_quarter == 1: start_date = datetime.date(fiscal_year - 1, 10, 1) end_date = datetime.date(fiscal_year - 1, 12, 31) diff --git a/usaspending_api/views.py b/usaspending_api/views.py index aeea1f6d3e..f640df5b10 100644 --- a/usaspending_api/views.py +++ b/usaspending_api/views.py @@ -1,10 +1,9 @@ import json -from django.http import HttpResponse +from django.http import HttpRequest, HttpResponse from django.views import View class StatusView(View): - def get(self, request, format=None): - response_object = {"status": "running"} - return HttpResponse(json.dumps(response_object)) + def get(self, request: HttpRequest) -> HttpResponse: + return HttpResponse(json.dumps({"status": 
"running"})) From 07cfecc7d7cb31d874e7a41fda9cdbe8780bc6fd Mon Sep 17 00:00:00 2001 From: aguest-kc Date: Tue, 27 Jan 2026 14:04:16 -0600 Subject: [PATCH 31/59] [DEV-14145] Fix last_modified_date column type --- .../delta_downloads/account_balances.py | 28 ++++--------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/usaspending_api/download/delta_downloads/account_balances.py b/usaspending_api/download/delta_downloads/account_balances.py index 953f3908ab..024b9ba14c 100644 --- a/usaspending_api/download/delta_downloads/account_balances.py +++ b/usaspending_api/download/delta_downloads/account_balances.py @@ -130,17 +130,7 @@ def agg_cols(self) -> list[Column | DuckDBSparkColumn]: self.sf.sum(self.sf.col("status_of_budgetary_resources_total")).alias( "status_of_budgetary_resources_total" ), - ( - ( - self.sf.max(self.sf.call_function("strptime", "last_modified_date", "yyyy-MM-dd")).alias( - "max_last_modified_date" - ) - if isinstance(self.spark, DuckDBSparkSession) - else self.sf.max(self.sf.date_format("last_modified_date", "yyyy-MM-dd")).alias( - "last_modified_date" - ) - ), - ), + self.sf.max(self.sf.col("last_modified_date")).alias("max_last_modified_date"), ] @property @@ -167,7 +157,7 @@ def select_cols(self) -> list[Column]: self.sf.col("unobligated_balance"), self.sf.col("gross_outlay_amount"), self.sf.col("status_of_budgetary_resources_total"), - self.sf.col("last_modified_date"), + self.sf.col("max_last_modified_date").alias("last_modified_date"), ] @@ -234,17 +224,9 @@ def group_by_cols(self) -> list[Column | DuckDBSparkColumn]: @property def agg_cols(self) -> list[Column | DuckDBSparkColumn]: - if isinstance(self.spark, DuckDBSparkSession): - # DuckDB's Spark implementation doesn't include the `date_format()` function so we have to use Python's `strptime` - return [ - self.sf.max(self.sf.call_function("strptime", "last_modified_date", "yyyy-MM-dd")).alias( - "max_last_modified_date" - ) - ] - else: - return [ - 
self.sf.max(self.sf.date_format("last_modified_date", "yyyy-MM-dd")).alias("max_last_modified_date"), - ] + return [ + self.sf.max(self.sf.col("last_modified_date")).alias("max_last_modified_date"), + ] @property def select_cols(self) -> list[Column | DuckDBSparkColumn]: From 51302123e228c3d6f051ea08c2ea48fded4da722 Mon Sep 17 00:00:00 2001 From: Andrew Guest <110476931+aguest-kc@users.noreply.github.com> Date: Wed, 28 Jan 2026 11:57:04 -0600 Subject: [PATCH 32/59] [DEV-14145] Fix merge conflict --- usaspending_api/download/delta_downloads/account_balances.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/usaspending_api/download/delta_downloads/account_balances.py b/usaspending_api/download/delta_downloads/account_balances.py index 3da13b405f..78dc3797f8 100644 --- a/usaspending_api/download/delta_downloads/account_balances.py +++ b/usaspending_api/download/delta_downloads/account_balances.py @@ -1,10 +1,9 @@ from duckdb.experimental.spark.sql import SparkSession as DuckDBSparkSession from duckdb.experimental.spark.sql.column import Column as DuckDBSparkColumn from duckdb.experimental.spark.sql.dataframe import DataFrame as DuckDBSparkDataFrame -from pyspark.sql import functions as sf, Column, DataFrame, SparkSession +from pyspark.sql import Column, DataFrame, SparkSession from usaspending_api.common.spark.utils import collect_concat -from usaspending_api.config import CONFIG from usaspending_api.download.delta_downloads.abstract_downloads.account_download import ( AbstractAccountDownload, AccountLevel, @@ -274,3 +273,4 @@ def create_federal_account_download(self) -> FederalAccountDownload: def create_treasury_account_download(self) -> TreasuryAccountDownload: return TreasuryAccountDownload(self.spark, self.filters, self.dynamic_filters) + From 0829bbdfe96f5d25a051613d4e0d1d5704a4568e Mon Sep 17 00:00:00 2001 From: Andrew Guest <110476931+aguest-kc@users.noreply.github.com> Date: Wed, 28 Jan 2026 12:08:39 -0600 Subject: [PATCH 33/59] 
[DEV-14145] Remove blank line --- usaspending_api/download/delta_downloads/account_balances.py | 1 - 1 file changed, 1 deletion(-) diff --git a/usaspending_api/download/delta_downloads/account_balances.py b/usaspending_api/download/delta_downloads/account_balances.py index 78dc3797f8..cb0b0867a3 100644 --- a/usaspending_api/download/delta_downloads/account_balances.py +++ b/usaspending_api/download/delta_downloads/account_balances.py @@ -273,4 +273,3 @@ def create_federal_account_download(self) -> FederalAccountDownload: def create_treasury_account_download(self) -> TreasuryAccountDownload: return TreasuryAccountDownload(self.spark, self.filters, self.dynamic_filters) - From 16aeed3c428969a8d9025dfc53c2b8d4df070464 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Wed, 28 Jan 2026 14:33:29 -0600 Subject: [PATCH 34/59] [DEV-14110] - update to consistently return dates via the API --- .../awards/tests/integration/test_awards_v2.py | 8 ++------ usaspending_api/awards/v2/data_layer/orm.py | 7 ++++--- usaspending_api/common/helpers/date_helper.py | 6 ++++-- .../idvs/tests/integration/test_awards_idv_v2.py | 2 +- .../search/delta_models/dataframes/transaction_search.py | 9 +++------ 5 files changed, 14 insertions(+), 18 deletions(-) diff --git a/usaspending_api/awards/tests/integration/test_awards_v2.py b/usaspending_api/awards/tests/integration/test_awards_v2.py index 1e476cd1b5..001c08553f 100644 --- a/usaspending_api/awards/tests/integration/test_awards_v2.py +++ b/usaspending_api/awards/tests/integration/test_awards_v2.py @@ -1489,11 +1489,7 @@ def test_outlay_calculations(client, awards_and_transactions): {"name": None, "amount": None}, ] }, - "period_of_performance": { - "start_date": "2004-02-04", - "end_date": "2005-02-04", - "last_modified_date": "2000-01-02 00:00:00+0000", - }, + "period_of_performance": {"start_date": "2004-02-04", "end_date": "2005-02-04", "last_modified_date": "2000-01-02"}, "place_of_performance": { "address_line1": None, "address_line2": 
None, @@ -1590,7 +1586,7 @@ def test_outlay_calculations(client, awards_and_transactions): "period_of_performance": { "start_date": "2004-02-04", "end_date": "2005-02-04", - "last_modified_date": "2001-02-03 00:00:00+00", + "last_modified_date": "2001-02-03", "potential_end_date": "2003-04-05", }, "place_of_performance": { diff --git a/usaspending_api/awards/v2/data_layer/orm.py b/usaspending_api/awards/v2/data_layer/orm.py index 4dc2cb8785..7db0518d59 100644 --- a/usaspending_api/awards/v2/data_layer/orm.py +++ b/usaspending_api/awards/v2/data_layer/orm.py @@ -25,6 +25,7 @@ from usaspending_api.awards.v2.data_layer.orm_utils import delete_keys_from_dict, split_mapper_into_qs from usaspending_api.common.helpers.business_categories_helper import get_business_category_display_names from usaspending_api.common.helpers.data_constants import state_code_from_name, state_name_from_code +from usaspending_api.common.helpers.date_helper import get_date_from_datetime from usaspending_api.common.helpers.sql_helpers import execute_sql_to_ordered_dictionary from usaspending_api.common.recipient_lookups import obtain_recipient_uri from usaspending_api.references.models import ( @@ -69,7 +70,7 @@ def construct_assistance_response(requested_award_dict: dict) -> OrderedDict: [ ("start_date", award["_start_date"]), ("end_date", award["_end_date"]), - ("last_modified_date", transaction["_modified_at"].strftime("%Y-%m-%d %H:%M:%S%z")), + ("last_modified_date", get_date_from_datetime(transaction["_modified_at"])), ] ) response["recipient"] = create_recipient_object(transaction) @@ -112,7 +113,7 @@ def construct_contract_response(requested_award_dict: dict) -> OrderedDict: [ ("start_date", award["_start_date"]), ("end_date", award["_end_date"]), - ("last_modified_date", transaction["_last_modified"]), + ("last_modified_date", get_date_from_datetime(transaction["_last_modified"])), ("potential_end_date", transaction["_period_of_perf_potential_e"]), ] ) @@ -164,7 +165,7 @@ def 
construct_idv_response(requested_award_dict: dict) -> OrderedDict: [ ("start_date", award["_start_date"]), ("end_date", transaction["_end_date"]), - ("last_modified_date", transaction["_last_modified_date"]), + ("last_modified_date", get_date_from_datetime(transaction["_last_modified_date"])), ("potential_end_date", transaction["_period_of_perf_potential_e"]), ] ) diff --git a/usaspending_api/common/helpers/date_helper.py b/usaspending_api/common/helpers/date_helper.py index 141037f9d1..659f7f15c8 100644 --- a/usaspending_api/common/helpers/date_helper.py +++ b/usaspending_api/common/helpers/date_helper.py @@ -1,7 +1,7 @@ import operator from argparse import ArgumentTypeError -from datetime import datetime, timezone +from datetime import date, datetime, timezone from dateutil import parser from typing import Callable @@ -68,13 +68,15 @@ def _datetime_command_line_argument_type(input_string): return _datetime_command_line_argument_type -def get_date_from_datetime(date_time, **kwargs): +def get_date_from_datetime(date_time: datetime | str, **kwargs) -> date: """ Pass a keyword argument called "default" if you wish to have a specific value returned when the date cannot be extracted from date_time, otherwise date_time will be returned. 
""" try: + if isinstance(date_time, str): + date_time = parser.parse(date_time) return date_time.date() except Exception: return kwargs.get("default", date_time) diff --git a/usaspending_api/idvs/tests/integration/test_awards_idv_v2.py b/usaspending_api/idvs/tests/integration/test_awards_idv_v2.py index 4777f194d2..4473f2ea02 100644 --- a/usaspending_api/idvs/tests/integration/test_awards_idv_v2.py +++ b/usaspending_api/idvs/tests/integration/test_awards_idv_v2.py @@ -381,7 +381,7 @@ def test_award_endpoint_for_null_recipient_information(client, awards_and_transa "period_of_performance": { "start_date": "2004-02-04", "end_date": "2025-06-30", - "last_modified_date": "2018-08-24 00:00:00+00", + "last_modified_date": "2018-08-24", "potential_end_date": "2003-04-05", }, "awarding_agency": { diff --git a/usaspending_api/search/delta_models/dataframes/transaction_search.py b/usaspending_api/search/delta_models/dataframes/transaction_search.py index d80c7a5943..5e653ae016 100644 --- a/usaspending_api/search/delta_models/dataframes/transaction_search.py +++ b/usaspending_api/search/delta_models/dataframes/transaction_search.py @@ -92,12 +92,9 @@ def date_cols(self) -> list[Column]: sf.to_date(self.transaction_normalized.period_of_performance_current_end_date).alias( "period_of_performance_current_end_date" ), - sf.to_timestamp( - sf.coalesce( - sf.to_date(self.transaction_fabs.created_at), - self.transaction_fpds.initial_report_date, - ) - ).alias("initial_report_date"), + sf.coalesce(self.transaction_fabs.created_at, self.transaction_fpds.initial_report_date).alias( + "initial_report_date" + ), ] @property From c3a1a52e24a6f9b4fecd4d5f0907770f62f1100b Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Wed, 28 Jan 2026 15:30:48 -0600 Subject: [PATCH 35/59] [DEV-14110] - update transaction search dataframe --- .../search/delta_models/dataframes/transaction_search.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/usaspending_api/search/delta_models/dataframes/transaction_search.py b/usaspending_api/search/delta_models/dataframes/transaction_search.py index 5e653ae016..82327cc397 100644 --- a/usaspending_api/search/delta_models/dataframes/transaction_search.py +++ b/usaspending_api/search/delta_models/dataframes/transaction_search.py @@ -92,9 +92,9 @@ def date_cols(self) -> list[Column]: sf.to_date(self.transaction_normalized.period_of_performance_current_end_date).alias( "period_of_performance_current_end_date" ), - sf.coalesce(self.transaction_fabs.created_at, self.transaction_fpds.initial_report_date).alias( - "initial_report_date" - ), + sf.coalesce( + self.transaction_fabs.created_at, sf.to_timestamp(self.transaction_fpds.initial_report_date) + ).alias("initial_report_date"), ] @property From 4c9d4d52a3f0674936f30052c21597027da79dc0 Mon Sep 17 00:00:00 2001 From: Seth Stoudenmier Date: Thu, 29 Jan 2026 14:56:18 -0500 Subject: [PATCH 36/59] Fix formatting errors for the staging deploy --- pyproject.toml | 18 +- usaspending_api/accounts/helpers.py | 4 +- ...ransactionnormalized_last_modified_date.py | 4 +- .../awards/models/transaction_normalized.py | 68 +- .../tests/integration/test_awards_v2.py | 271 +++++-- usaspending_api/awards/v2/data_layer/orm.py | 294 +++++-- usaspending_api/common/helpers/date_helper.py | 45 +- .../delta_downloads/account_balances.py | 74 +- .../object_class_program_activity.py | 131 +++- .../test_populate_monthly_delta_files.py | 73 +- .../integration/test_load_to_from_delta.py | 461 ++++++++--- .../etl/tests/integration/test_spark_app.py | 123 ++- .../tests/integration/test_awards_idv_v2.py | 138 +++- .../search/delta_models/award_search.py | 446 +++++++++-- .../dataframes/transaction_search.py | 401 ++++++---- ...tionsearch_initial_report_date_and_more.py | 2 +- .../0060_alter_initial_report_date_andmore.py | 11 +- usaspending_api/search/models/award_search.py | 123 ++- .../search/models/transaction_search.py | 142 +++- 
usaspending_api/tests/conftest_spark.py | 253 ++++-- .../delta_models/transaction_fpds.py | 696 ++++++++++++----- .../delta_models/transaction_normalized.py | 2 +- .../delta_models/transaction_search.py | 724 +++++++++++++++--- 23 files changed, 3472 insertions(+), 1032 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 566188c4d2..46195c3d32 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -203,4 +203,20 @@ pylint.max-statements = 45 pylint.max-branches = 10 pylint.max-nested-blocks = 5 mccabe.max-complexity = 15 -pycodestyle.max-line-length = 120 \ No newline at end of file +pycodestyle.max-line-length = 120 + +[tool.ruff.lint.per-file-ignores] +"**/tests/**/test*.py" = [ + "ANN001", # missing-type-function-argument; avoid conflict with fixtures + "ANN201", # missing-return-type-undocumented-public-function; avoid conflict with test case return values + "ANN202", # missing-return-type-undocumented-private-function; avoid conflict with test case return values + "PLR0913", # too-many-arguments; avoid conflict with too many fixtures + "PLR0915", # too-many-statements; avoid conflict with long fixtures +] +"**/conftest*.py" = [ + "ANN001", # missing-type-function-argument; avoid conflict with fixtures + "ANN201", # missing-return-type-undocumented-public-function; avoid conflict with test case return values + "ANN202", # missing-return-type-undocumented-private-function; avoid conflict with test case return values + "PLR0913", # too-many-arguments; avoid conflict with too many fixtures + "PLR0915", # too-many-statements; avoid conflict with long fixtures +] diff --git a/usaspending_api/accounts/helpers.py b/usaspending_api/accounts/helpers.py index 0751351120..c4c1ab3532 100644 --- a/usaspending_api/accounts/helpers.py +++ b/usaspending_api/accounts/helpers.py @@ -11,7 +11,9 @@ } -def start_and_end_dates_from_fyq(fiscal_year: int, fiscal_quarter: int) -> tuple[datetime.date, datetime.date]: +def start_and_end_dates_from_fyq( + fiscal_year: int, 
fiscal_quarter: int +) -> tuple[datetime.date, datetime.date]: if fiscal_quarter == 1: start_date = datetime.date(fiscal_year - 1, 10, 1) end_date = datetime.date(fiscal_year - 1, 12, 31) diff --git a/usaspending_api/awards/migrations/0115_alter_transactionnormalized_last_modified_date.py b/usaspending_api/awards/migrations/0115_alter_transactionnormalized_last_modified_date.py index 63cac462c4..dc4df9a4ce 100644 --- a/usaspending_api/awards/migrations/0115_alter_transactionnormalized_last_modified_date.py +++ b/usaspending_api/awards/migrations/0115_alter_transactionnormalized_last_modified_date.py @@ -1,7 +1,7 @@ from django.db import migrations, models -class Migration(migrations.Migration): +class Migration(migrations.Migration): dependencies = [ ("awards", "0114_alter_ctodlinkageupdates_award_id"), ] @@ -12,4 +12,4 @@ class Migration(migrations.Migration): name="last_modified_date", field=models.DateTimeField(null=True), ), - ] \ No newline at end of file + ] diff --git a/usaspending_api/awards/models/transaction_normalized.py b/usaspending_api/awards/models/transaction_normalized.py index b98a516667..cc0324a5f0 100644 --- a/usaspending_api/awards/models/transaction_normalized.py +++ b/usaspending_api/awards/models/transaction_normalized.py @@ -1,14 +1,15 @@ import os -from django.db import models - from django.contrib.postgres.fields import ArrayField +from django.db import models class TransactionNormalized(models.Model): id = models.BigAutoField(primary_key=True) award = models.ForeignKey( - "search.AwardSearch", on_delete=models.DO_NOTHING, help_text="The award which this transaction is contained in" + "search.AwardSearch", + on_delete=models.DO_NOTHING, + help_text="The award which this transaction is contained in", ) usaspending_unique_transaction_id = models.TextField( blank=True, @@ -29,7 +30,9 @@ class TransactionNormalized(models.Model): help_text="The plain text description of the transaction type", ) period_of_performance_start_date = 
models.DateField( - verbose_name="Period of Performance Start Date", null=True, help_text="The period of performance start date" + verbose_name="Period of Performance Start Date", + null=True, + help_text="The period of performance start date", ) period_of_performance_current_end_date = models.DateField( verbose_name="Period of Performance Current End Date", @@ -37,9 +40,15 @@ class TransactionNormalized(models.Model): help_text="The current end date of the period of performance", ) action_date = models.DateField( - verbose_name="Transaction Date", help_text="The date this transaction was actioned", db_index=True + verbose_name="Transaction Date", + help_text="The date this transaction was actioned", + db_index=True, + ) + action_type = models.TextField( + blank=True, + null=True, + help_text="The type of transaction. For example, A, B, C, D", ) - action_type = models.TextField(blank=True, null=True, help_text="The type of transaction. For example, A, B, C, D") action_type_description = models.TextField(blank=True, null=True) federal_action_obligation = models.DecimalField( max_digits=23, @@ -90,22 +99,36 @@ class TransactionNormalized(models.Model): null=True, help_text="The agency which is funding this transaction", ) - description = models.TextField(null=True, help_text="The description of this transaction") + description = models.TextField( + null=True, help_text="The description of this transaction" + ) last_modified_date = models.DateTimeField( blank=True, null=True, help_text="The date this transaction was last modified" ) - certified_date = models.DateField(blank=True, null=True, help_text="The date this transaction was certified") + certified_date = models.DateField( + blank=True, null=True, help_text="The date this transaction was certified" + ) create_date = models.DateTimeField( - auto_now_add=True, blank=True, null=True, help_text="The date this transaction was created in the API" + auto_now_add=True, + blank=True, + null=True, + help_text="The date 
this transaction was created in the API", ) update_date = models.DateTimeField( - auto_now=True, null=True, help_text="The last time this transaction was updated in the API", db_index=True + auto_now=True, + null=True, + help_text="The last time this transaction was updated in the API", + db_index=True, + ) + fiscal_year = models.IntegerField( + blank=True, null=True, help_text="Fiscal Year calculated based on Action Date" ) - fiscal_year = models.IntegerField(blank=True, null=True, help_text="Fiscal Year calculated based on Action Date") transaction_unique_id = models.TextField( blank=False, null=False, default="NONE", verbose_name="Transaction Unique ID" ) - is_fpds = models.BooleanField(blank=False, null=False, default=False, verbose_name="Is FPDS") + is_fpds = models.BooleanField( + blank=False, null=False, default=False, verbose_name="Is FPDS" + ) funding_amount = models.DecimalField( max_digits=23, decimal_places=2, @@ -114,7 +137,11 @@ class TransactionNormalized(models.Model): help_text="Assistance data variable. non_federal_funding_amount + federal_action_obligation", ) non_federal_funding_amount = models.DecimalField( - max_digits=23, decimal_places=2, blank=True, null=True, help_text="Assistance Data variable." + max_digits=23, + decimal_places=2, + blank=True, + null=True, + help_text="Assistance Data variable.", ) unique_award_key = models.TextField(null=True, db_index=True) # From broker. 
business_categories = ArrayField(models.TextField(), default=list) @@ -149,9 +176,18 @@ class Meta: vw_transaction_normalized_sql = f""" CREATE OR REPLACE VIEW rpt.vw_transaction_normalized AS SELECT - {(','+os.linesep+' '*12).join([ - (v+(f'::{NORM_CASTED_COL_MAP[k]}' if k in NORM_CASTED_COL_MAP else '')).ljust(62)+' AS '+k.ljust(48) - for k, v in NORM_TO_TRANSACTION_SEARCH_COL_MAP.items()])} + { + ("," + os.linesep + " " * 12).join( + [ + ( + v + (f"::{NORM_CASTED_COL_MAP[k]}" if k in NORM_CASTED_COL_MAP else "") + ).ljust(62) + + " AS " + + k.ljust(48) + for k, v in NORM_TO_TRANSACTION_SEARCH_COL_MAP.items() + ] + ) +} FROM rpt.transaction_search; """ diff --git a/usaspending_api/awards/tests/integration/test_awards_v2.py b/usaspending_api/awards/tests/integration/test_awards_v2.py index 001c08553f..1e7a7b582c 100644 --- a/usaspending_api/awards/tests/integration/test_awards_v2.py +++ b/usaspending_api/awards/tests/integration/test_awards_v2.py @@ -2,6 +2,7 @@ import json import pytest +from django.test import Client from model_bakery import baker from rest_framework import status @@ -14,15 +15,29 @@ def awards_and_transactions(db): baker.make("recipient.DUNS", **duns) # Recipient Lookup - parent_recipient_lookup = {"duns": "123", "uei": "ABC", "recipient_hash": "cfd3f3f5-2162-7679-9f6b-429cecaa3e1e"} - recipient_lookup = {"duns": "456", "uei": "DEF", "recipient_hash": "66545a8d-bf37-3eda-cce5-29c6170c9aab"} + parent_recipient_lookup = { + "duns": "123", + "uei": "ABC", + "recipient_hash": "cfd3f3f5-2162-7679-9f6b-429cecaa3e1e", + } + recipient_lookup = { + "duns": "456", + "uei": "DEF", + "recipient_hash": "66545a8d-bf37-3eda-cce5-29c6170c9aab", + } baker.make("recipient.RecipientLookup", **parent_recipient_lookup) baker.make("recipient.RecipientLookup", **recipient_lookup) # Recipient Profile - parent_recipient_profile = {"recipient_hash": "cfd3f3f5-2162-7679-9f6b-429cecaa3e1e", "recipient_level": "P"} - recipient_profile = {"recipient_hash": 
"66545a8d-bf37-3eda-cce5-29c6170c9aab", "recipient_level": "C"} + parent_recipient_profile = { + "recipient_hash": "cfd3f3f5-2162-7679-9f6b-429cecaa3e1e", + "recipient_level": "P", + } + recipient_profile = { + "recipient_hash": "66545a8d-bf37-3eda-cce5-29c6170c9aab", + "recipient_level": "C", + } baker.make("recipient.RecipientProfile", **parent_recipient_profile) baker.make("recipient.RecipientProfile", **recipient_profile) @@ -59,30 +74,66 @@ def awards_and_transactions(db): baker.make("references.PSC", code="1005", description="More specific whatever") baker.make("references.PSC", code="A", description="R&D") baker.make("references.PSC", code="A1", description="R&D - Steak Sauce") - baker.make("references.PSC", code="A13", description="R&D - Brand specific steak condiments") - baker.make("references.PSC", code="A136", description="R&D - Very specific steak research") + baker.make( + "references.PSC", + code="A13", + description="R&D - Brand specific steak condiments", + ) + baker.make( + "references.PSC", code="A136", description="R&D - Very specific steak research" + ) baker.make("references.PSC", code="M", description="Something") baker.make("references.PSC", code="M1", description="Something More Specific") baker.make("references.PSC", code="M123", description="Something Most Specific") # NAICS baker.make("references.NAICS", code="11", description="Agriculture") - baker.make("references.NAICS", code="1111", description="Soybean & Oilseed Agriculture") + baker.make( + "references.NAICS", code="1111", description="Soybean & Oilseed Agriculture" + ) baker.make("references.NAICS", code="111120", description="Soybean Harvesting") # Toptier Agency - toptier_agency_1 = {"pk": 1, "abbreviation": "TA1", "name": "TOPTIER AGENCY 1", "toptier_code": "ABC"} - toptier_agency_2 = {"pk": 2, "abbreviation": "TA2", "name": "TOPTIER AGENCY 2", "toptier_code": "002"} + toptier_agency_1 = { + "pk": 1, + "abbreviation": "TA1", + "name": "TOPTIER AGENCY 1", + "toptier_code": 
"ABC", + } + toptier_agency_2 = { + "pk": 2, + "abbreviation": "TA2", + "name": "TOPTIER AGENCY 2", + "toptier_code": "002", + } - ta1 = baker.make("references.ToptierAgency", **toptier_agency_1, _fill_optional=True) - ta2 = baker.make("references.ToptierAgency", **toptier_agency_2, _fill_optional=True) + ta1 = baker.make( + "references.ToptierAgency", **toptier_agency_1, _fill_optional=True + ) + ta2 = baker.make( + "references.ToptierAgency", **toptier_agency_2, _fill_optional=True + ) # Subtier Agency - subtier_agency_1 = {"pk": 1, "abbreviation": "SA1", "name": "SUBTIER AGENCY 1", "subtier_code": "DEF"} - subtier_agency_2 = {"pk": 2, "abbreviation": "SA2", "name": "SUBTIER AGENCY 2", "subtier_code": "1000"} + subtier_agency_1 = { + "pk": 1, + "abbreviation": "SA1", + "name": "SUBTIER AGENCY 1", + "subtier_code": "DEF", + } + subtier_agency_2 = { + "pk": 2, + "abbreviation": "SA2", + "name": "SUBTIER AGENCY 2", + "subtier_code": "1000", + } - sa1 = baker.make("references.SubtierAgency", **subtier_agency_1, _fill_optional=True) - sa2 = baker.make("references.SubtierAgency", **subtier_agency_2, _fill_optional=True) + sa1 = baker.make( + "references.SubtierAgency", **subtier_agency_1, _fill_optional=True + ) + sa2 = baker.make( + "references.SubtierAgency", **subtier_agency_2, _fill_optional=True + ) # Agency agency = { @@ -971,15 +1022,27 @@ def awards_and_transactions(db): "rollup_total_obligation": 4500, "parent_award_id": None, } - parent_award_2 = {"award_id": 8, "generated_unique_award_id": "CONT_IDV_AWARD8_1000", "parent_award_id": 9} - parent_award_3 = {"award_id": 9, "generated_unique_award_id": "CONT_IDV_AWARD9_1000", "parent_award_id": None} + parent_award_2 = { + "award_id": 8, + "generated_unique_award_id": "CONT_IDV_AWARD8_1000", + "parent_award_id": 9, + } + parent_award_3 = { + "award_id": 9, + "generated_unique_award_id": "CONT_IDV_AWARD9_1000", + "parent_award_id": None, + } baker.make("awards.ParentAward", **parent_award_1) 
baker.make("awards.ParentAward", **parent_award_2) baker.make("awards.ParentAward", **parent_award_3) - dsws1 = baker.make("submissions.DABSSubmissionWindowSchedule", submission_reveal_date="2020-01-01") - baker.make("submissions.SubmissionAttributes", toptier_code="ABC", submission_window=dsws1) + dsws1 = baker.make( + "submissions.DABSSubmissionWindowSchedule", submission_reveal_date="2020-01-01" + ) + baker.make( + "submissions.SubmissionAttributes", toptier_code="ABC", submission_window=dsws1 + ) @pytest.fixture @@ -988,14 +1051,14 @@ def update_awards(db): baker.make("search.AwardSearch", award_id=12) -def test_award_last_updated_endpoint(client, update_awards): +def test_award_last_updated_endpoint(client: Client, update_awards): """Test the awards endpoint.""" resp = client.get("/api/v2/awards/last_updated/") assert resp.status_code == status.HTTP_200_OK assert resp.data["last_updated"] == datetime.datetime.now().strftime("%m/%d/%Y") -def test_award_endpoint_generated_id(client, awards_and_transactions): +def test_award_endpoint_generated_id(client: Client, awards_and_transactions): resp = client.get("/api/v2/awards/ASST_AGG_1830212.0481163_3620/") assert resp.status_code == status.HTTP_200_OK assert json.loads(resp.content.decode("utf-8")) == expected_response_asst @@ -1017,15 +1080,24 @@ def test_award_endpoint_generated_id(client, awards_and_transactions): assert json.loads(resp.content.decode("utf-8")) == expected_response_asst -def test_award_endpoint_parent_award(client, awards_and_transactions): - dsws1 = baker.make("submissions.DABSSubmissionWindowSchedule", submission_reveal_date="2020-01-01") - baker.make("submissions.SubmissionAttributes", toptier_code="ABC", submission_window=dsws1) - baker.make("submissions.SubmissionAttributes", toptier_code="002", submission_window=dsws1) +def test_award_endpoint_parent_award(client: Client, awards_and_transactions): + dsws1 = baker.make( + "submissions.DABSSubmissionWindowSchedule", 
submission_reveal_date="2020-01-01" + ) + baker.make( + "submissions.SubmissionAttributes", toptier_code="ABC", submission_window=dsws1 + ) + baker.make( + "submissions.SubmissionAttributes", toptier_code="002", submission_window=dsws1 + ) # Test contract award with parent resp = client.get("/api/v2/awards/7/") assert resp.status_code == status.HTTP_200_OK - assert json.loads(resp.content.decode("utf-8"))["parent_award"] == expected_contract_award_parent() + assert ( + json.loads(resp.content.decode("utf-8"))["parent_award"] + == expected_contract_award_parent() + ) # Test contract award without parent resp = client.get("/api/v2/awards/10/") @@ -1035,7 +1107,10 @@ def test_award_endpoint_parent_award(client, awards_and_transactions): # Test idv award with parent resp = client.get("/api/v2/awards/8/") assert resp.status_code == status.HTTP_200_OK - assert json.loads(resp.content.decode("utf-8"))["parent_award"] == expected_idv_award_parent() + assert ( + json.loads(resp.content.decode("utf-8"))["parent_award"] + == expected_idv_award_parent() + ) # Test idv award without parent resp = client.get("/api/v2/awards/9/") @@ -1043,13 +1118,15 @@ def test_award_endpoint_parent_award(client, awards_and_transactions): assert json.loads(resp.content.decode("utf-8"))["parent_award"] is None -def test_award_endpoint_parent_award_no_submissions(client, awards_and_transactions): +def test_award_endpoint_parent_award_no_submissions( + client: Client, awards_and_transactions +): # Test contract award with parent resp = client.get("/api/v2/awards/7/") assert resp.status_code == status.HTTP_200_OK - assert json.loads(resp.content.decode("utf-8"))["parent_award"] == expected_contract_award_parent( - include_slug=False - ) + assert json.loads(resp.content.decode("utf-8"))[ + "parent_award" + ] == expected_contract_award_parent(include_slug=False) # Test contract award without parent resp = client.get("/api/v2/awards/10/") @@ -1059,7 +1136,9 @@ def 
test_award_endpoint_parent_award_no_submissions(client, awards_and_transacti # Test idv award with parent resp = client.get("/api/v2/awards/8/") assert resp.status_code == status.HTTP_200_OK - assert json.loads(resp.content.decode("utf-8"))["parent_award"] == expected_idv_award_parent(include_slug=False) + assert json.loads(resp.content.decode("utf-8"))[ + "parent_award" + ] == expected_idv_award_parent(include_slug=False) # Test idv award without parent resp = client.get("/api/v2/awards/9/") @@ -1067,7 +1146,7 @@ def test_award_endpoint_parent_award_no_submissions(client, awards_and_transacti assert json.loads(resp.content.decode("utf-8"))["parent_award"] is None -def test_award_multiple_cfdas(client, awards_and_transactions): +def test_award_multiple_cfdas(client: Client, awards_and_transactions): resp = client.get("/api/v2/awards/3/") assert resp.status_code == status.HTTP_200_OK assert json.loads(resp.content.decode("utf-8"))["cfda_info"] == [ @@ -1104,14 +1183,20 @@ def test_award_multiple_cfdas(client, awards_and_transactions): ] -def test_award_psc_hierarchy_types(client, awards_and_transactions): +def test_award_psc_hierarchy_types(client: Client, awards_and_transactions): resp = client.get("/api/v2/awards/5/") assert resp.status_code == status.HTTP_200_OK assert json.loads(resp.content.decode("utf-8"))["psc_hierarchy"] == { "toptier_code": {"description": "R&D", "code": "A"}, "midtier_code": {"description": "R&D - Steak Sauce", "code": "A1"}, - "subtier_code": {"description": "R&D - Brand specific steak condiments", "code": "A13"}, - "base_code": {"description": "R&D - Very specific steak research", "code": "A136"}, + "subtier_code": { + "description": "R&D - Brand specific steak condiments", + "code": "A13", + }, + "base_code": { + "description": "R&D - Very specific steak research", + "code": "A136", + }, } resp = client.get("/api/v2/awards/6/") @@ -1124,7 +1209,7 @@ def test_award_psc_hierarchy_types(client, awards_and_transactions): } -def 
test_foreign_city(client, awards_and_transactions): +def test_foreign_city(client: Client, awards_and_transactions): resp = client.get("/api/v2/awards/13/") assert resp.status_code == status.HTTP_200_OK assert json.loads(resp.content.decode("utf-8"))["recipient"]["location"] == { @@ -1146,25 +1231,34 @@ def test_foreign_city(client, awards_and_transactions): } -def test_special_characters(client, awards_and_transactions): +def test_special_characters(client: Client, awards_and_transactions): resp = client.get("/api/v2/awards/ASST_NON_:~$@*\"()%23/,^&+=`!'%/_. -_9700/") assert resp.status_code == status.HTTP_200_OK - resp = client.get("/api/v2/awards/count/transaction/ASST_NON_:~$@*\"()%23/,^&+=`!'%/_. -_9700/") + resp = client.get( + "/api/v2/awards/count/transaction/ASST_NON_:~$@*\"()%23/,^&+=`!'%/_. -_9700/" + ) assert resp.status_code == status.HTTP_200_OK - resp = client.get("/api/v2/awards/count/subaward/ASST_NON_:~$@*\"()%23/,^&+=`!'%/_. -_9700/") + resp = client.get( + "/api/v2/awards/count/subaward/ASST_NON_:~$@*\"()%23/,^&+=`!'%/_. -_9700/" + ) assert resp.status_code == status.HTTP_200_OK - resp = client.get("/api/v2/awards/count/federal_account/ASST_NON_:~$@*\"()%23/,^&+=`!'%/_. -_9700/") + resp = client.get( + "/api/v2/awards/count/federal_account/ASST_NON_:~$@*\"()%23/,^&+=`!'%/_. 
-_9700/" + ) assert resp.status_code == status.HTTP_200_OK -def test_zip4_switch(client, awards_and_transactions): +def test_zip4_switch(client: Client, awards_and_transactions): resp = client.get("/api/v2/awards/10/") assert resp.status_code == status.HTTP_200_OK - assert json.loads(resp.content.decode("utf-8"))["recipient"]["location"]["zip4"] == "0000" + assert ( + json.loads(resp.content.decode("utf-8"))["recipient"]["location"]["zip4"] + == "0000" + ) -def test_file_c_data(client, awards_and_transactions): +def test_file_c_data(client: Client, awards_and_transactions): defc = baker.make("references.DisasterEmergencyFundCode", code="L") baker.make( "submissions.DABSSubmissionWindowSchedule", @@ -1211,8 +1305,12 @@ def test_file_c_data(client, awards_and_transactions): # fiscal period is not 12 & is not after 2020-04-01, so we expect no data to come back resp = client.get("/api/v2/awards/1/") assert resp.status_code == status.HTTP_200_OK - assert json.loads(resp.content.decode("utf-8"))["account_obligations_by_defc"] == [{"code": "L", "amount": 100.0}] - assert json.loads(resp.content.decode("utf-8"))["account_outlays_by_defc"] == [{"code": "L", "amount": 0.0}] + assert json.loads(resp.content.decode("utf-8"))["account_obligations_by_defc"] == [ + {"code": "L", "amount": 100.0} + ] + assert json.loads(resp.content.decode("utf-8"))["account_outlays_by_defc"] == [ + {"code": "L", "amount": 0.0} + ] assert json.loads(resp.content.decode("utf-8"))["total_account_obligation"] == 100.0 assert json.loads(resp.content.decode("utf-8"))["total_account_outlay"] == 0.0 baker.make( @@ -1236,8 +1334,12 @@ def test_file_c_data(client, awards_and_transactions): resp = client.get("/api/v2/awards/1/") # now we have the period 12 data, so we expect outlays here assert resp.status_code == status.HTTP_200_OK - assert json.loads(resp.content.decode("utf-8"))["account_obligations_by_defc"] == [{"code": "L", "amount": 200.0}] - assert 
json.loads(resp.content.decode("utf-8"))["account_outlays_by_defc"] == [{"code": "L", "amount": 100.0}] + assert json.loads(resp.content.decode("utf-8"))["account_obligations_by_defc"] == [ + {"code": "L", "amount": 200.0} + ] + assert json.loads(resp.content.decode("utf-8"))["account_outlays_by_defc"] == [ + {"code": "L", "amount": 100.0} + ] assert json.loads(resp.content.decode("utf-8"))["total_account_obligation"] == 200.0 assert json.loads(resp.content.decode("utf-8"))["total_account_outlay"] == 100.0 baker.make( @@ -1261,8 +1363,12 @@ def test_file_c_data(client, awards_and_transactions): # again, period is not 12, no data reported resp = client.get("/api/v2/awards/1/") assert resp.status_code == status.HTTP_200_OK - assert json.loads(resp.content.decode("utf-8"))["account_obligations_by_defc"] == [{"code": "L", "amount": 210.0}] - assert json.loads(resp.content.decode("utf-8"))["account_outlays_by_defc"] == [{"code": "L", "amount": 100.0}] + assert json.loads(resp.content.decode("utf-8"))["account_obligations_by_defc"] == [ + {"code": "L", "amount": 210.0} + ] + assert json.loads(resp.content.decode("utf-8"))["account_outlays_by_defc"] == [ + {"code": "L", "amount": 100.0} + ] assert json.loads(resp.content.decode("utf-8"))["total_account_obligation"] == 210.0 assert json.loads(resp.content.decode("utf-8"))["total_account_outlay"] == 100.0 baker.make( @@ -1286,8 +1392,12 @@ def test_file_c_data(client, awards_and_transactions): # expect outlays here resp = client.get("/api/v2/awards/1/") assert resp.status_code == status.HTTP_200_OK - assert json.loads(resp.content.decode("utf-8"))["account_obligations_by_defc"] == [{"code": "L", "amount": 220.0}] - assert json.loads(resp.content.decode("utf-8"))["account_outlays_by_defc"] == [{"code": "L", "amount": 110.0}] + assert json.loads(resp.content.decode("utf-8"))["account_obligations_by_defc"] == [ + {"code": "L", "amount": 220.0} + ] + assert json.loads(resp.content.decode("utf-8"))["account_outlays_by_defc"] == 
[ + {"code": "L", "amount": 110.0} + ] assert json.loads(resp.content.decode("utf-8"))["total_account_obligation"] == 220.0 assert json.loads(resp.content.decode("utf-8"))["total_account_outlay"] == 110.0 baker.make( @@ -1310,13 +1420,17 @@ def test_file_c_data(client, awards_and_transactions): # period is 12 but amounts are 0, so we expect no change resp = client.get("/api/v2/awards/1/") assert resp.status_code == status.HTTP_200_OK - assert json.loads(resp.content.decode("utf-8"))["account_obligations_by_defc"] == [{"code": "L", "amount": 220.0}] - assert json.loads(resp.content.decode("utf-8"))["account_outlays_by_defc"] == [{"code": "L", "amount": 110.0}] + assert json.loads(resp.content.decode("utf-8"))["account_obligations_by_defc"] == [ + {"code": "L", "amount": 220.0} + ] + assert json.loads(resp.content.decode("utf-8"))["account_outlays_by_defc"] == [ + {"code": "L", "amount": 110.0} + ] assert json.loads(resp.content.decode("utf-8"))["total_account_obligation"] == 220.0 assert json.loads(resp.content.decode("utf-8"))["total_account_outlay"] == 110.0 -def test_outlay_calculations(client, awards_and_transactions): +def test_outlay_calculations(client: Client, awards_and_transactions: None): defc = baker.make("references.DisasterEmergencyFundCode", code="L") baker.make( "submissions.DABSSubmissionWindowSchedule", @@ -1374,8 +1488,12 @@ def test_outlay_calculations(client, awards_and_transactions): resp = client.get("/api/v2/awards/1/") assert resp.status_code == status.HTTP_200_OK - assert json.loads(resp.content.decode("utf-8"))["account_obligations_by_defc"] == [{"code": "L", "amount": 10.0}] - assert json.loads(resp.content.decode("utf-8"))["account_outlays_by_defc"] == [{"code": "L", "amount": 7.0}] + assert json.loads(resp.content.decode("utf-8"))["account_obligations_by_defc"] == [ + {"code": "L", "amount": 10.0} + ] + assert json.loads(resp.content.decode("utf-8"))["account_outlays_by_defc"] == [ + {"code": "L", "amount": 7.0} + ] assert 
json.loads(resp.content.decode("utf-8"))["total_account_obligation"] == 10.0 assert json.loads(resp.content.decode("utf-8"))["total_account_outlay"] == 7.0 assert json.loads(resp.content.decode("utf-8"))["total_outlay"] == 7.0 @@ -1435,7 +1553,11 @@ def test_outlay_calculations(client, awards_and_transactions): "code": "ABC", "slug": "toptier-agency-1", }, - "subtier_agency": {"name": "SUBTIER AGENCY 1", "abbreviation": "SA1", "code": "DEF"}, + "subtier_agency": { + "name": "SUBTIER AGENCY 1", + "abbreviation": "SA1", + "code": "DEF", + }, "office_agency_name": "awarding_office", }, "funding_agency": { @@ -1447,7 +1569,11 @@ def test_outlay_calculations(client, awards_and_transactions): "code": "ABC", "slug": "toptier-agency-1", }, - "subtier_agency": {"name": "SUBTIER AGENCY 1", "abbreviation": "SA1", "code": "DEF"}, + "subtier_agency": { + "name": "SUBTIER AGENCY 1", + "abbreviation": "SA1", + "code": "DEF", + }, "office_agency_name": "funding_office", }, "recipient": { @@ -1489,7 +1615,11 @@ def test_outlay_calculations(client, awards_and_transactions): {"name": None, "amount": None}, ] }, - "period_of_performance": {"start_date": "2004-02-04", "end_date": "2005-02-04", "last_modified_date": "2000-01-02"}, + "period_of_performance": { + "start_date": "2004-02-04", + "end_date": "2005-02-04", + "last_modified_date": "2000-01-02", + }, "place_of_performance": { "address_line1": None, "address_line2": None, @@ -1537,7 +1667,11 @@ def test_outlay_calculations(client, awards_and_transactions): "code": "ABC", "slug": "toptier-agency-1", }, - "subtier_agency": {"name": "SUBTIER AGENCY 1", "abbreviation": "SA1", "code": "DEF"}, + "subtier_agency": { + "name": "SUBTIER AGENCY 1", + "abbreviation": "SA1", + "code": "DEF", + }, "office_agency_name": "awarding_office", }, "funding_agency": { @@ -1549,7 +1683,11 @@ def test_outlay_calculations(client, awards_and_transactions): "code": "ABC", "slug": "toptier-agency-1", }, - "subtier_agency": {"name": "SUBTIER AGENCY 1", 
"abbreviation": "SA1", "code": "DEF"}, + "subtier_agency": { + "name": "SUBTIER AGENCY 1", + "abbreviation": "SA1", + "code": "DEF", + }, "office_agency_name": "funding_office", }, "recipient": { @@ -1691,7 +1829,10 @@ def test_outlay_calculations(client, awards_and_transactions): "date_signed": "2004-03-02", "naics_hierarchy": { "toptier_code": {"description": "Agriculture", "code": "11"}, - "midtier_code": {"description": "Soybean & Oilseed Agriculture", "code": "1111"}, + "midtier_code": { + "description": "Soybean & Oilseed Agriculture", + "code": "1111", + }, "base_code": {"description": "Soybean Harvesting", "code": "111120"}, }, "psc_hierarchy": { @@ -1721,7 +1862,7 @@ def test_outlay_calculations(client, awards_and_transactions): } -def expected_contract_award_parent(include_slug=True): +def expected_contract_award_parent(include_slug: bool = True) -> dict[str, any]: return { "agency_id": 2, "agency_name": "TOPTIER AGENCY 2", @@ -1737,7 +1878,7 @@ def expected_contract_award_parent(include_slug=True): } -def expected_idv_award_parent(include_slug=True): +def expected_idv_award_parent(include_slug: bool = True) -> dict[str, any]: return { "agency_id": 2, "agency_name": "TOPTIER AGENCY 2", diff --git a/usaspending_api/awards/v2/data_layer/orm.py b/usaspending_api/awards/v2/data_layer/orm.py index 7db0518d59..617324b652 100644 --- a/usaspending_api/awards/v2/data_layer/orm.py +++ b/usaspending_api/awards/v2/data_layer/orm.py @@ -1,12 +1,11 @@ import copy import logging - from collections import OrderedDict from decimal import Decimal -from django.db.models import Sum, F, Subquery -from django.utils.text import slugify from typing import Optional +from django.db.models import F, Subquery, Sum +from django.utils.text import slugify from usaspending_api.awards.models import ( Award, @@ -22,22 +21,30 @@ FPDS_AWARD_FIELDS, FPDS_CONTRACT_FIELDS, ) -from usaspending_api.awards.v2.data_layer.orm_utils import delete_keys_from_dict, split_mapper_into_qs -from 
usaspending_api.common.helpers.business_categories_helper import get_business_category_display_names -from usaspending_api.common.helpers.data_constants import state_code_from_name, state_name_from_code +from usaspending_api.awards.v2.data_layer.orm_utils import ( + delete_keys_from_dict, + split_mapper_into_qs, +) +from usaspending_api.awards.v2.data_layer.sql import defc_sql +from usaspending_api.common.helpers.business_categories_helper import ( + get_business_category_display_names, +) +from usaspending_api.common.helpers.data_constants import ( + state_code_from_name, + state_name_from_code, +) from usaspending_api.common.helpers.date_helper import get_date_from_datetime from usaspending_api.common.helpers.sql_helpers import execute_sql_to_ordered_dictionary from usaspending_api.common.recipient_lookups import obtain_recipient_uri from usaspending_api.references.models import ( + NAICS, + PSC, Agency, Cfda, DisasterEmergencyFundCode, - NAICS, - PSC, SubtierAgency, ToptierAgencyPublishedDABSView, ) -from usaspending_api.awards.v2.data_layer.sql import defc_sql from usaspending_api.search.models import AwardSearch logger = logging.getLogger("console") @@ -59,13 +66,19 @@ def construct_assistance_response(requested_award_dict: dict) -> OrderedDict: response["record_type"] = transaction["record_type"] response["cfda_info"] = fetch_all_cfda_details(award) - response["transaction_obligated_amount"] = fetch_transaction_obligated_amount_by_internal_award_id(award["id"]) + response["transaction_obligated_amount"] = ( + fetch_transaction_obligated_amount_by_internal_award_id(award["id"]) + ) response["funding_agency"] = fetch_agency_details(response["_funding_agency_id"]) if response["funding_agency"]: - response["funding_agency"]["office_agency_name"] = transaction["_funding_office_name"] + response["funding_agency"]["office_agency_name"] = transaction[ + "_funding_office_name" + ] response["awarding_agency"] = fetch_agency_details(response["_awarding_agency_id"]) if 
response["awarding_agency"]: - response["awarding_agency"]["office_agency_name"] = transaction["_awarding_office_name"] + response["awarding_agency"]["office_agency_name"] = transaction[ + "_awarding_office_name" + ] response["period_of_performance"] = OrderedDict( [ ("start_date", award["_start_date"]), @@ -105,15 +118,22 @@ def construct_contract_response(requested_award_dict: dict) -> OrderedDict: response["latest_transaction_contract_data"] = transaction response["funding_agency"] = fetch_agency_details(response["_funding_agency_id"]) if response["funding_agency"]: - response["funding_agency"]["office_agency_name"] = transaction["_funding_office_name"] + response["funding_agency"]["office_agency_name"] = transaction[ + "_funding_office_name" + ] response["awarding_agency"] = fetch_agency_details(response["_awarding_agency_id"]) if response["awarding_agency"]: - response["awarding_agency"]["office_agency_name"] = transaction["_awarding_office_name"] + response["awarding_agency"]["office_agency_name"] = transaction[ + "_awarding_office_name" + ] response["period_of_performance"] = OrderedDict( [ ("start_date", award["_start_date"]), ("end_date", award["_end_date"]), - ("last_modified_date", get_date_from_datetime(transaction["_last_modified"])), + ( + "last_modified_date", + get_date_from_datetime(transaction["_last_modified"]), + ), ("potential_end_date", transaction["_period_of_perf_potential_e"]), ] ) @@ -121,7 +141,9 @@ def construct_contract_response(requested_award_dict: dict) -> OrderedDict: response["executive_details"] = create_officers_object(award) response["place_of_performance"] = create_place_of_performance_object(transaction) if transaction["product_or_service_code"]: - response["psc_hierarchy"] = fetch_psc_hierarchy(transaction["product_or_service_code"]) + response["psc_hierarchy"] = fetch_psc_hierarchy( + transaction["product_or_service_code"] + ) if transaction["naics"]: response["naics_hierarchy"] = fetch_naics_hierarchy(transaction["naics"]) 
response["total_outlay"] = fetch_total_outlays(award["id"]) @@ -153,19 +175,28 @@ def construct_idv_response(requested_award_dict: dict) -> OrderedDict: transaction = fetch_fpds_details_by_pk(award["_trx"], mapper) - response["parent_award"] = fetch_idv_parent_award_details(award["generated_unique_award_id"]) + response["parent_award"] = fetch_idv_parent_award_details( + award["generated_unique_award_id"] + ) response["latest_transaction_contract_data"] = transaction response["funding_agency"] = fetch_agency_details(response["_funding_agency_id"]) if response["funding_agency"]: - response["funding_agency"]["office_agency_name"] = transaction["_funding_office_name"] + response["funding_agency"]["office_agency_name"] = transaction[ + "_funding_office_name" + ] response["awarding_agency"] = fetch_agency_details(response["_awarding_agency_id"]) if response["awarding_agency"]: - response["awarding_agency"]["office_agency_name"] = transaction["_awarding_office_name"] + response["awarding_agency"]["office_agency_name"] = transaction[ + "_awarding_office_name" + ] response["period_of_performance"] = OrderedDict( [ ("start_date", award["_start_date"]), ("end_date", transaction["_end_date"]), - ("last_modified_date", get_date_from_datetime(transaction["_last_modified_date"])), + ( + "last_modified_date", + get_date_from_datetime(transaction["_last_modified_date"]), + ), ("potential_end_date", transaction["_period_of_perf_potential_e"]), ] ) @@ -173,7 +204,9 @@ def construct_idv_response(requested_award_dict: dict) -> OrderedDict: response["executive_details"] = create_officers_object(award) response["place_of_performance"] = create_place_of_performance_object(transaction) if transaction["product_or_service_code"]: - response["psc_hierarchy"] = fetch_psc_hierarchy(transaction["product_or_service_code"]) + response["psc_hierarchy"] = fetch_psc_hierarchy( + transaction["product_or_service_code"] + ) if transaction["naics"]: response["naics_hierarchy"] = 
fetch_naics_hierarchy(transaction["naics"]) response["total_outlay"] = fetch_total_outlays(award["id"]) @@ -213,27 +246,46 @@ def create_recipient_object(db_row_dict: dict) -> OrderedDict: ( "business_categories", get_business_category_display_names( - fetch_business_categories_by_transaction_id(db_row_dict["_transaction_id"]) + fetch_business_categories_by_transaction_id( + db_row_dict["_transaction_id"] + ) ), ), ( "location", OrderedDict( [ - ("location_country_code", db_row_dict["_rl_location_country_code"]), + ( + "location_country_code", + db_row_dict["_rl_location_country_code"], + ), ("country_name", db_row_dict["_rl_country_name"]), ("state_code", db_row_dict["_rl_state_code"]), ("state_name", db_row_dict["_rl_state_name"]), - ("city_name", db_row_dict["_rl_city_name"] or db_row_dict.get("_rl_foreign_city")), + ( + "city_name", + db_row_dict["_rl_city_name"] + or db_row_dict.get("_rl_foreign_city"), + ), ("county_code", db_row_dict["_rl_county_code"]), ("county_name", db_row_dict["_rl_county_name"]), ("address_line1", db_row_dict["_rl_address_line1"]), ("address_line2", db_row_dict["_rl_address_line2"]), ("address_line3", db_row_dict["_rl_address_line3"]), - ("congressional_code", db_row_dict["_rl_congressional_code_current"]), - ("zip4", db_row_dict.get("_rl_zip_last_4") or db_row_dict.get("_rl_zip4")), + ( + "congressional_code", + db_row_dict["_rl_congressional_code_current"], + ), + ( + "zip4", + db_row_dict.get("_rl_zip_last_4") + or db_row_dict.get("_rl_zip4"), + ), ("zip5", db_row_dict["_rl_zip5"]), - ("foreign_postal_code", db_row_dict.get("_rl_foreign_postal_code")), + ( + "foreign_postal_code", + db_row_dict.get("_rl_foreign_postal_code"), + ), ("foreign_province", db_row_dict.get("_rl_foreign_province")), ] ), @@ -296,12 +348,19 @@ def fetch_award_details(filter_q: dict, mapper_fields: OrderedDict) -> dict: return Award.objects.filter(**filter_q).values(*vals).annotate(**ann).first() -def fetch_contract_parent_award_details(parent_piid: str, 
parent_fpds_agency: str) -> Optional[OrderedDict]: - parent_guai = "CONT_IDV_{}_{}".format(parent_piid or "NONE", parent_fpds_agency or "NONE") +def fetch_contract_parent_award_details( + parent_piid: str, parent_fpds_agency: str +) -> Optional[OrderedDict]: + parent_guai = "CONT_IDV_{}_{}".format( + parent_piid or "NONE", parent_fpds_agency or "NONE" + ) parent_award_ids = ( ParentAward.objects.filter(generated_unique_award_id=parent_guai) - .annotate(parent_award_award_id=F("award_id"), parent_award_guai=F("generated_unique_award_id")) + .annotate( + parent_award_award_id=F("award_id"), + parent_award_guai=F("generated_unique_award_id"), + ) .values("parent_award_award_id", "parent_award_guai") .first() ) @@ -311,7 +370,9 @@ def fetch_contract_parent_award_details(parent_piid: str, parent_fpds_agency: st def fetch_idv_parent_award_details(guai: str) -> Optional[OrderedDict]: parent_award_ids = ( - ParentAward.objects.filter(generated_unique_award_id=guai, parent_award__isnull=False) + ParentAward.objects.filter( + generated_unique_award_id=guai, parent_award__isnull=False + ) .annotate( parent_award_award_id=F("parent_award__award_id"), parent_award_guai=F("parent_award__generated_unique_award_id"), @@ -348,7 +409,9 @@ def _fetch_parent_award_details(parent_award_ids: dict) -> Optional[OrderedDict] return None parent_sub_agency = ( - SubtierAgency.objects.filter(subtier_code=parent_award["latest_transaction__contract_data__agency_id"]) + SubtierAgency.objects.filter( + subtier_code=parent_award["latest_transaction__contract_data__agency_id"] + ) .values("name", "subtier_agency_id") .first() ) @@ -358,7 +421,8 @@ def _fetch_parent_award_details(parent_award_ids: dict) -> Optional[OrderedDict] toptier_flag=True, toptier_agency_id=Subquery( Agency.objects.filter( - subtier_agency_id__isnull=False, subtier_agency_id=parent_sub_agency["subtier_agency_id"] + subtier_agency_id__isnull=False, + subtier_agency_id=parent_sub_agency["subtier_agency_id"], 
).values("toptier_agency_id") ), ) @@ -379,17 +443,33 @@ def _fetch_parent_award_details(parent_award_ids: dict) -> Optional[OrderedDict] ("agency_id", parent_agency["id"] if parent_agency else None), ("agency_name", agency_name), ("agency_slug", slugify(agency_name) if has_agency_page else None), - ("sub_agency_id", parent_award["latest_transaction__contract_data__agency_id"]), - ("sub_agency_name", parent_sub_agency["name"] if parent_sub_agency else None), + ( + "sub_agency_id", + parent_award["latest_transaction__contract_data__agency_id"], + ), + ( + "sub_agency_name", + parent_sub_agency["name"] if parent_sub_agency else None, + ), ("award_id", parent_award_award_id), ("generated_unique_award_id", parent_award_guai), - ("idv_type_description", parent_award["latest_transaction__contract_data__idv_type_description"]), + ( + "idv_type_description", + parent_award["latest_transaction__contract_data__idv_type_description"], + ), ( "multiple_or_single_aw_desc", - parent_award["latest_transaction__contract_data__multiple_or_single_aw_desc"], + parent_award[ + "latest_transaction__contract_data__multiple_or_single_aw_desc" + ], ), ("piid", parent_award["latest_transaction__contract_data__piid"]), - ("type_of_idc_description", parent_award["latest_transaction__contract_data__type_of_idc_description"]), + ( + "type_of_idc_description", + parent_award[ + "latest_transaction__contract_data__type_of_idc_description" + ], + ), ] ) @@ -398,19 +478,33 @@ def _fetch_parent_award_details(parent_award_ids: dict) -> Optional[OrderedDict] def fetch_fabs_details_by_pk(primary_key: int, mapper: OrderedDict) -> dict: vals, ann = split_mapper_into_qs(mapper) - return TransactionFABS.objects.filter(pk=primary_key).values(*vals).annotate(**ann).first() + return ( + TransactionFABS.objects.filter(pk=primary_key) + .values(*vals) + .annotate(**ann) + .first() + ) def fetch_fpds_details_by_pk(primary_key: int, mapper: OrderedDict) -> dict: vals, ann = split_mapper_into_qs(mapper) - return 
TransactionFPDS.objects.filter(pk=primary_key).values(*vals).annotate(**ann).first() + return ( + TransactionFPDS.objects.filter(pk=primary_key) + .values(*vals) + .annotate(**ann) + .first() + ) -def fetch_latest_ec_details(award_id: int, mapper: OrderedDict, transaction_type: str) -> dict: +def fetch_latest_ec_details( + award_id: int, mapper: OrderedDict, transaction_type: str +) -> dict: vals, ann = split_mapper_into_qs(mapper) model = TransactionFPDS if transaction_type == "fpds" else TransactionFABS retval = ( - model.objects.filter(transaction__award_id=award_id, officer_1_name__isnull=False) + model.objects.filter( + transaction__award_id=award_id, officer_1_name__isnull=False + ) .values(*vals) .annotate(**ann) .order_by("-action_date") @@ -418,8 +512,10 @@ def fetch_latest_ec_details(award_id: int, mapper: OrderedDict, transaction_type return retval.first() -def agency_has_file_c_submission(toptier_agency_id): - return ToptierAgencyPublishedDABSView.objects.filter(toptier_agency_id=toptier_agency_id).exists() +def agency_has_file_c_submission(toptier_agency_id: int) -> bool: + return ToptierAgencyPublishedDABSView.objects.filter( + toptier_agency_id=toptier_agency_id + ).exists() def fetch_agency_details(agency_id: int) -> Optional[dict]: @@ -444,7 +540,9 @@ def fetch_agency_details(agency_id: int) -> Optional[dict]: "name": agency["toptier_agency__name"], "code": agency["toptier_agency__toptier_code"], "abbreviation": agency["toptier_agency__abbreviation"], - "slug": slugify(agency["toptier_agency__name"]) if has_agency_page else None, + "slug": slugify(agency["toptier_agency__name"]) + if has_agency_page + else None, }, "subtier_agency": { "name": agency["subtier_agency__name"], @@ -456,7 +554,11 @@ def fetch_agency_details(agency_id: int) -> Optional[dict]: def fetch_business_categories_by_transaction_id(transaction_id: int) -> list: - tn = TransactionNormalized.objects.filter(pk=transaction_id).values("business_categories").first() + tn = ( + 
TransactionNormalized.objects.filter(pk=transaction_id) + .values("business_categories") + .first() + ) if tn: return tn["business_categories"] @@ -473,8 +575,15 @@ def normalize_cfda_number_format(fabs_transaction: dict) -> str: def fetch_all_cfda_details(award: dict) -> list: - fabs_values = ["cfda_number", "federal_action_obligation", "non_federal_funding_amount", "total_funding_amount"] - queryset = TransactionFABS.objects.filter(transaction__award_id=award["id"]).values(*fabs_values) + fabs_values = [ + "cfda_number", + "federal_action_obligation", + "non_federal_funding_amount", + "total_funding_amount", + ] + queryset = TransactionFABS.objects.filter(transaction__award_id=award["id"]).values( + *fabs_values + ) cfda_dicts = {} for transaction in queryset: clean_cfda_number_str = normalize_cfda_number_format(transaction) @@ -482,11 +591,17 @@ def fetch_all_cfda_details(award: dict) -> list: cfda_dicts.update( { clean_cfda_number_str: { - "federal_action_obligation": cfda_dicts[clean_cfda_number_str]["federal_action_obligation"] + "federal_action_obligation": cfda_dicts[clean_cfda_number_str][ + "federal_action_obligation" + ] + Decimal(transaction["federal_action_obligation"] or 0), - "non_federal_funding_amount": cfda_dicts[clean_cfda_number_str]["non_federal_funding_amount"] + "non_federal_funding_amount": cfda_dicts[clean_cfda_number_str][ + "non_federal_funding_amount" + ] + Decimal(transaction["non_federal_funding_amount"] or 0), - "total_funding_amount": cfda_dicts[clean_cfda_number_str]["total_funding_amount"] + "total_funding_amount": cfda_dicts[clean_cfda_number_str][ + "total_funding_amount" + ] + Decimal(transaction["total_funding_amount"] or 0), } } @@ -495,9 +610,15 @@ def fetch_all_cfda_details(award: dict) -> list: cfda_dicts.update( { clean_cfda_number_str: { - "federal_action_obligation": Decimal(transaction["federal_action_obligation"] or 0), - "non_federal_funding_amount": Decimal(transaction["non_federal_funding_amount"] or 0), - 
"total_funding_amount": Decimal(transaction["total_funding_amount"] or 0), + "federal_action_obligation": Decimal( + transaction["federal_action_obligation"] or 0 + ), + "non_federal_funding_amount": Decimal( + transaction["non_federal_funding_amount"] or 0 + ), + "total_funding_amount": Decimal( + transaction["total_funding_amount"] or 0 + ), } } ) @@ -519,10 +640,19 @@ def fetch_all_cfda_details(award: dict) -> list: ("cfda_popular_name", details.get("popular_name")), ("cfda_title", details.get("program_title")), ("cfda_website", details.get("website_address")), - ("federal_action_obligation_amount", cfda_dicts[cfda_number]["federal_action_obligation"]), - ("non_federal_funding_amount", cfda_dicts[cfda_number]["non_federal_funding_amount"]), + ( + "federal_action_obligation_amount", + cfda_dicts[cfda_number]["federal_action_obligation"], + ), + ( + "non_federal_funding_amount", + cfda_dicts[cfda_number]["non_federal_funding_amount"], + ), ("sam_website", details.get("url")), - ("total_funding_amount", cfda_dicts[cfda_number]["total_funding_amount"]), + ( + "total_funding_amount", + cfda_dicts[cfda_number]["total_funding_amount"], + ), ] ) ) @@ -547,10 +677,12 @@ def fetch_cfda_details_using_cfda_number(cfda: str) -> dict: return cfda_details or {} -def fetch_transaction_obligated_amount_by_internal_award_id(internal_award_id: int) -> Optional[Decimal]: - _sum = FinancialAccountsByAwards.objects.filter(award_id=internal_award_id).aggregate( - Sum("transaction_obligated_amount") - ) +def fetch_transaction_obligated_amount_by_internal_award_id( + internal_award_id: int, +) -> Optional[Decimal]: + _sum = FinancialAccountsByAwards.objects.filter( + award_id=internal_award_id + ).aggregate(Sum("transaction_obligated_amount")) if _sum: return _sum["transaction_obligated_amount__sum"] @@ -558,12 +690,19 @@ def fetch_transaction_obligated_amount_by_internal_award_id(internal_award_id: i def fetch_psc_hierarchy(psc_code: str) -> dict: - codes = [psc_code, psc_code[:2], 
psc_code[:1], psc_code[:3] if psc_code[0] == "A" else None] + codes = [ + psc_code, + psc_code[:2], + psc_code[:1], + psc_code[:3] if psc_code[0] == "A" else None, + ] toptier_code = {} midtier_code = {} subtier_code = {} # only used for R&D codes which start with "A" base_code = {} - if psc_code[0].isalpha(): # we only want to look for the toptier code for services, which start with letters + if psc_code[ + 0 + ].isalpha(): # we only want to look for the toptier code for services, which start with letters try: psc_top = PSC.objects.get(code=codes[2]) toptier_code = {"code": psc_top.code, "description": psc_top.description} @@ -579,7 +718,9 @@ def fetch_psc_hierarchy(psc_code: str) -> dict: base_code = {"code": psc.code, "description": psc.description} except PSC.DoesNotExist: pass - if codes[3] is not None: # don't bother looking for 3 digit codes unless they start with "A" + if ( + codes[3] is not None + ): # don't bother looking for 3 digit codes unless they start with "A" try: psc_rd = PSC.objects.get(code=codes[3]) subtier_code = {"code": psc_rd.code, "description": psc_rd.description} @@ -615,13 +756,19 @@ def fetch_naics_hierarchy(naics: str) -> dict: base_code = {"code": base.code, "description": base.description} except NAICS.DoesNotExist: pass - results = {"toptier_code": toptier_code, "midtier_code": midtier_code, "base_code": base_code} + results = { + "toptier_code": toptier_code, + "midtier_code": midtier_code, + "base_code": base_code, + } return results def fetch_account_details_award(award_id: int) -> dict: award_id_sql = "faba.award_id = {award_id}".format(award_id=award_id) - results = execute_sql_to_ordered_dictionary(defc_sql.format(award_id_sql=award_id_sql)) + results = execute_sql_to_ordered_dictionary( + defc_sql.format(award_id_sql=award_id_sql) + ) outlay_by_code = [] obligation_by_code = [] total_outlay = 0 @@ -631,8 +778,15 @@ def fetch_account_details_award(award_id: int) -> dict: if row["disaster_emergency_fund_code"] in defcs: 
total_outlay += row["total_outlay"] total_obligations += row["obligated_amount"] - outlay_by_code.append({"code": row["disaster_emergency_fund_code"], "amount": row["total_outlay"]}) - obligation_by_code.append({"code": row["disaster_emergency_fund_code"], "amount": row["obligated_amount"]}) + outlay_by_code.append( + {"code": row["disaster_emergency_fund_code"], "amount": row["total_outlay"]} + ) + obligation_by_code.append( + { + "code": row["disaster_emergency_fund_code"], + "amount": row["obligated_amount"], + } + ) results = { "total_account_outlay": total_outlay, "total_account_obligation": total_obligations, @@ -681,7 +835,9 @@ def fetch_total_outlays(award_id: int) -> dict: faba.gross_outlay_amount_by_award_cpe != 0 ); """ - results = execute_sql_to_ordered_dictionary(sql.format(award_id_sql=f"faba.award_id = {award_id}")) + results = execute_sql_to_ordered_dictionary( + sql.format(award_id_sql=f"faba.award_id = {award_id}") + ) if len(results) > 0: return results[0]["total_outlay"] return None diff --git a/usaspending_api/common/helpers/date_helper.py b/usaspending_api/common/helpers/date_helper.py index 659f7f15c8..5bf8a92c3f 100644 --- a/usaspending_api/common/helpers/date_helper.py +++ b/usaspending_api/common/helpers/date_helper.py @@ -1,37 +1,36 @@ import operator - from argparse import ArgumentTypeError from datetime import date, datetime, timezone -from dateutil import parser from typing import Callable +from dateutil import parser -def now(): + +def now() -> datetime: """Now now() is a standardized function to obtain "now" when you need it now.""" return datetime.now(timezone.utc) -def cast_datetime_to_naive(datetime): +def cast_datetime_to_naive(datetime_to_cast: datetime) -> datetime: """ Removes timezone information, but converts non-UTC datetimes to UTC - beforehand so that the returned datetime will be naive but will also be UTC. 
- """ - if datetime.tzinfo is not None: - datetime = datetime.astimezone(timezone.utc) - return datetime.replace(tzinfo=None) + beforehand so that the returned datetime will be naive but will also be UTC.""" + if datetime_to_cast.tzinfo is not None: + datetime_to_cast = datetime_to_cast.astimezone(timezone.utc) + return datetime_to_cast.replace(tzinfo=None) -def cast_datetime_to_utc(datetime): +def cast_datetime_to_utc(datetime_to_cast: datetime) -> datetime: """ If datetime has no tzinfo, assume it is UTC, otherwise convert the datetime to UTC. """ - if datetime.tzinfo is None: - return datetime.replace(tzinfo=timezone.utc) - return datetime.astimezone(timezone.utc) + if datetime_to_cast.tzinfo is None: + return datetime_to_cast.replace(tzinfo=timezone.utc) + return datetime_to_cast.astimezone(timezone.utc) -def datetime_command_line_argument_type(naive): +def datetime_command_line_argument_type(naive: bool) -> datetime: """ This function is designed to be used as a date/time type for argparse command line parameters. argparse parameter types need to be passed @@ -45,7 +44,7 @@ def datetime_command_line_argument_type(naive): is timezone aware. If it is timezone naive, it is assumed to be UTC. """ - def _datetime_command_line_argument_type(input_string): + def _datetime_command_line_argument_type(input_string: str) -> datetime: """ A very flexible date/time parser to be used as a command line argument parser. See wrapper for timezone handling instructions. 
@@ -62,8 +61,10 @@ def _datetime_command_line_argument_type(input_string): else: return cast_datetime_to_utc(parsed) - except (OverflowError, TypeError, ValueError): - raise ArgumentTypeError("Unable to convert provided value to date/time") + except (OverflowError, TypeError, ValueError) as exc: + raise ArgumentTypeError( + "Unable to convert provided value to date/time" + ) from exc return _datetime_command_line_argument_type @@ -82,7 +83,7 @@ def get_date_from_datetime(date_time: datetime | str, **kwargs) -> date: return kwargs.get("default", date_time) -def fy(raw_date): +def fy(raw_date: str | date | None) -> int | None: """Federal fiscal year corresponding to date""" if raw_date is None: @@ -95,8 +96,8 @@ def fy(raw_date): result = raw_date.year if raw_date.month > 9: result += 1 - except AttributeError: - raise TypeError("{} needs year and month attributes".format(raw_date)) + except AttributeError as exc: + raise TypeError(f"{raw_date} needs year and month attributes") from exc return result @@ -111,7 +112,9 @@ def datetime_is_lt(first_datetime: datetime, second_datetime: datetime) -> bool: return _compare_datetimes(first_datetime, second_datetime, operator.lt) -def _compare_datetimes(first_datetime: datetime, second_datetime: datetime, op_func: Callable) -> bool: +def _compare_datetimes( + first_datetime: datetime, second_datetime: datetime, op_func: Callable +) -> bool: """Comparison of datetimes using provided function. 
If TZ-unaware, assumes UTC""" dt_1 = cast_datetime_to_utc(first_datetime) dt_2 = cast_datetime_to_utc(second_datetime) diff --git a/usaspending_api/download/delta_downloads/account_balances.py b/usaspending_api/download/delta_downloads/account_balances.py index cb0b0867a3..21eb9378c0 100644 --- a/usaspending_api/download/delta_downloads/account_balances.py +++ b/usaspending_api/download/delta_downloads/account_balances.py @@ -13,7 +13,9 @@ AbstractAccountDownloadFactory, AccountDownloadConditionName, ) -from usaspending_api.download.delta_downloads.filters.account_filters import AccountDownloadFilters +from usaspending_api.download.delta_downloads.filters.account_filters import ( + AccountDownloadFilters, +) from usaspending_api.submissions.helpers import get_submission_ids_for_periods @@ -72,7 +74,12 @@ def submission_type(self) -> SubmissionType: @property def group_by_cols(self) -> list[str]: - return ["federal_account_symbol", "owning_agency_name", "federal_account_name", "submission_period"] + return [ + "federal_account_symbol", + "owning_agency_name", + "federal_account_name", + "submission_period", + ] @property def agg_cols(self) -> list[Column | DuckDBSparkColumn]: @@ -81,49 +88,70 @@ def agg_cols(self) -> list[Column | DuckDBSparkColumn]: collect_concat("agency_identifier_name", spark=self.spark), collect_concat("budget_function", spark=self.spark), collect_concat("budget_subfunction", spark=self.spark), - self.sf.sum(self.sf.col("budget_authority_unobligated_balance_brought_forward")).alias( - "budget_authority_unobligated_balance_brought_forward" - ), - self.sf.sum(self.sf.col("adjustments_to_unobligated_balance_brought_forward_cpe")).alias( - "adjustments_to_unobligated_balance_brought_forward_cpe" - ), + self.sf.sum( + self.sf.col("budget_authority_unobligated_balance_brought_forward") + ).alias("budget_authority_unobligated_balance_brought_forward"), + self.sf.sum( + self.sf.col("adjustments_to_unobligated_balance_brought_forward_cpe") + 
).alias("adjustments_to_unobligated_balance_brought_forward_cpe"), self.sf.sum(self.sf.col("budget_authority_appropriated_amount")).alias( "budget_authority_appropriated_amount" ), - self.sf.sum(self.sf.col("borrowing_authority_amount")).alias("borrowing_authority_amount"), - self.sf.sum(self.sf.col("contract_authority_amount")).alias("contract_authority_amount"), - self.sf.sum(self.sf.col("spending_authority_from_offsetting_collections_amount")).alias( - "spending_authority_from_offsetting_collections_amount" + self.sf.sum(self.sf.col("borrowing_authority_amount")).alias( + "borrowing_authority_amount" ), + self.sf.sum(self.sf.col("contract_authority_amount")).alias( + "contract_authority_amount" + ), + self.sf.sum( + self.sf.col("spending_authority_from_offsetting_collections_amount") + ).alias("spending_authority_from_offsetting_collections_amount"), self.sf.sum(self.sf.col("total_other_budgetary_resources_amount")).alias( "total_other_budgetary_resources_amount" ), - self.sf.sum(self.sf.col("total_budgetary_resources")).alias("total_budgetary_resources"), - self.sf.sum(self.sf.col("obligations_incurred")).alias("obligations_incurred"), - self.sf.sum(self.sf.col("deobligations_or_recoveries_or_refunds_from_prior_year")).alias( - "deobligations_or_recoveries_or_refunds_from_prior_year" + self.sf.sum(self.sf.col("total_budgetary_resources")).alias( + "total_budgetary_resources" + ), + self.sf.sum(self.sf.col("obligations_incurred")).alias( + "obligations_incurred" + ), + self.sf.sum( + self.sf.col("deobligations_or_recoveries_or_refunds_from_prior_year") + ).alias("deobligations_or_recoveries_or_refunds_from_prior_year"), + self.sf.sum(self.sf.col("unobligated_balance")).alias( + "unobligated_balance" ), - self.sf.sum(self.sf.col("unobligated_balance")).alias("unobligated_balance"), self.sf.sum( self.sf.when( ( ( self.sf.col("quarter_format_flag") - & (self.sf.col("reporting_fiscal_quarter") == self.filters.reporting_fiscal_quarter) + & ( + 
self.sf.col("reporting_fiscal_quarter") + == self.filters.reporting_fiscal_quarter + ) ) | ( ~self.sf.col("quarter_format_flag") - & (self.sf.col("reporting_fiscal_period") == self.filters.reporting_fiscal_period) + & ( + self.sf.col("reporting_fiscal_period") + == self.filters.reporting_fiscal_period + ) ) ) - & (self.sf.col("reporting_fiscal_year") == self.filters.reporting_fiscal_year), + & ( + self.sf.col("reporting_fiscal_year") + == self.filters.reporting_fiscal_year + ), self.sf.col("gross_outlay_amount"), ).otherwise(0) ).alias("gross_outlay_amount"), self.sf.sum(self.sf.col("status_of_budgetary_resources_total")).alias( "status_of_budgetary_resources_total" ), - self.sf.max(self.sf.col("last_modified_date")).alias("max_last_modified_date"), + self.sf.max(self.sf.col("last_modified_date")).alias( + "max_last_modified_date" + ), ] @property @@ -218,7 +246,9 @@ def group_by_cols(self) -> list[Column | DuckDBSparkColumn]: @property def agg_cols(self) -> list[Column | DuckDBSparkColumn]: return [ - self.sf.max(self.sf.col("last_modified_date")).alias("max_last_modified_date"), + self.sf.max(self.sf.col("last_modified_date")).alias( + "max_last_modified_date" + ), ] @property diff --git a/usaspending_api/download/delta_downloads/object_class_program_activity.py b/usaspending_api/download/delta_downloads/object_class_program_activity.py index d70fce6b04..82fd05018e 100644 --- a/usaspending_api/download/delta_downloads/object_class_program_activity.py +++ b/usaspending_api/download/delta_downloads/object_class_program_activity.py @@ -12,7 +12,9 @@ from usaspending_api.download.delta_downloads.abstract_factories.account_download_factory import ( AbstractAccountDownloadFactory, ) -from usaspending_api.download.delta_downloads.filters.account_filters import AccountDownloadFilters +from usaspending_api.download.delta_downloads.filters.account_filters import ( + AccountDownloadFilters, +) from usaspending_api.download.v2.download_column_historical_lookups import 
query_paths from usaspending_api.submissions.helpers import get_submission_ids_for_periods @@ -57,7 +59,12 @@ def _build_dataframes(self) -> list[DataFrame | DuckDBSparkDataFrame]: .filter(self.dynamic_filters) .groupby(self.group_by_cols) .agg(*[agg_func(col) for col, agg_func in self.agg_cols.items()]) - .drop(*[self.sf.col(f"object_class_program_activity_download.{col}") for col in self.agg_cols]) + .drop( + *[ + self.sf.col(f"object_class_program_activity_download.{col}") + for col in self.agg_cols + ] + ) .select(*self.select_cols) # Sorting by a value that is repeated often will help improve compression during the zipping step .sort(self.sort_by_cols), @@ -97,51 +104,109 @@ def agg_cols(self) -> dict[str, callable]: "budget_function": lambda col: collect_concat(col, spark=self.spark), "budget_subfunction": lambda col: collect_concat(col, spark=self.spark), "obligations_incurred": lambda col: self.sf.sum(col).alias(col), - "obligations_undelivered_orders_unpaid_total": lambda col: self.sf.sum(col).alias(col), - "obligations_undelivered_orders_unpaid_total_FYB": lambda col: self.sf.sum(col).alias(col), - "USSGL480100_undelivered_orders_obligations_unpaid": lambda col: self.sf.sum(col).alias(col), - "USSGL480100_undelivered_orders_obligations_unpaid_FYB": lambda col: self.sf.sum(col).alias(col), - "USSGL488100_upward_adj_prior_year_undeliv_orders_oblig_unpaid": lambda col: self.sf.sum(col).alias(col), - "obligations_delivered_orders_unpaid_total": lambda col: self.sf.sum(col).alias(col), - "obligations_delivered_orders_unpaid_total_FYB": lambda col: self.sf.sum(col).alias(col), - "USSGL490100_delivered_orders_obligations_unpaid": lambda col: self.sf.sum(col).alias(col), - "USSGL490100_delivered_orders_obligations_unpaid_FYB": lambda col: self.sf.sum(col).alias(col), - "USSGL498100_upward_adj_of_prior_year_deliv_orders_oblig_unpaid": lambda col: self.sf.sum(col).alias(col), + "obligations_undelivered_orders_unpaid_total": lambda col: self.sf.sum( + col + 
).alias(col), + "obligations_undelivered_orders_unpaid_total_FYB": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL480100_undelivered_orders_obligations_unpaid": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL480100_undelivered_orders_obligations_unpaid_FYB": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL488100_upward_adj_prior_year_undeliv_orders_oblig_unpaid": lambda col: self.sf.sum( + col + ).alias(col), + "obligations_delivered_orders_unpaid_total": lambda col: self.sf.sum( + col + ).alias(col), + "obligations_delivered_orders_unpaid_total_FYB": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL490100_delivered_orders_obligations_unpaid": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL490100_delivered_orders_obligations_unpaid_FYB": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL498100_upward_adj_of_prior_year_deliv_orders_oblig_unpaid": lambda col: self.sf.sum( + col + ).alias(col), "gross_outlay_amount_FYB_to_period_end": lambda col: filter_submission_and_sum( col, self.filters, spark=self.spark ), "gross_outlay_amount_FYB": lambda col: self.sf.sum(col).alias(col), - "gross_outlays_undelivered_orders_prepaid_total": lambda col: self.sf.sum(col).alias(col), - "gross_outlays_undelivered_orders_prepaid_total_FYB": lambda col: self.sf.sum(col).alias(col), - "USSGL480200_undelivered_orders_obligations_prepaid_advanced": lambda col: self.sf.sum(col).alias(col), - "USSGL480200_undelivered_orders_obligations_prepaid_advanced_FYB": lambda col: self.sf.sum(col).alias(col), - "USSGL488200_upward_adj_prior_year_undeliv_orders_oblig_prepaid": lambda col: self.sf.sum(col).alias(col), - "gross_outlays_delivered_orders_paid_total": lambda col: self.sf.sum(col).alias(col), - "gross_outlays_delivered_orders_paid_total_FYB": lambda col: self.sf.sum(col).alias(col), - "USSGL490200_delivered_orders_obligations_paid": lambda col: self.sf.sum(col).alias(col), - "USSGL490800_authority_outlayed_not_yet_disbursed": lambda col: 
self.sf.sum(col).alias(col), - "USSGL490800_authority_outlayed_not_yet_disbursed_FYB": lambda col: self.sf.sum(col).alias(col), - "USSGL498200_upward_adj_of_prior_year_deliv_orders_oblig_paid": lambda col: self.sf.sum(col).alias(col), - "deobligations_or_recoveries_or_refunds_from_prior_year": lambda col: self.sf.sum(col).alias(col), - "USSGL487100_downward_adj_prior_year_unpaid_undeliv_orders_oblig": lambda col: self.sf.sum(col).alias(col), - "USSGL497100_downward_adj_prior_year_unpaid_deliv_orders_oblig": lambda col: self.sf.sum(col).alias(col), + "gross_outlays_undelivered_orders_prepaid_total": lambda col: self.sf.sum( + col + ).alias(col), + "gross_outlays_undelivered_orders_prepaid_total_FYB": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL480200_undelivered_orders_obligations_prepaid_advanced": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL480200_undelivered_orders_obligations_prepaid_advanced_FYB": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL488200_upward_adj_prior_year_undeliv_orders_oblig_prepaid": lambda col: self.sf.sum( + col + ).alias(col), + "gross_outlays_delivered_orders_paid_total": lambda col: self.sf.sum( + col + ).alias(col), + "gross_outlays_delivered_orders_paid_total_FYB": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL490200_delivered_orders_obligations_paid": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL490800_authority_outlayed_not_yet_disbursed": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL490800_authority_outlayed_not_yet_disbursed_FYB": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL498200_upward_adj_of_prior_year_deliv_orders_oblig_paid": lambda col: self.sf.sum( + col + ).alias(col), + "deobligations_or_recoveries_or_refunds_from_prior_year": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL487100_downward_adj_prior_year_unpaid_undeliv_orders_oblig": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL497100_downward_adj_prior_year_unpaid_deliv_orders_oblig": 
lambda col: self.sf.sum( + col + ).alias(col), "USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig": lambda col: filter_submission_and_sum( col, self.filters, spark=self.spark ), "USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig": lambda col: filter_submission_and_sum( col, self.filters, spark=self.spark ), - "USSGL483100_undelivered_orders_obligations_transferred_unpaid": lambda col: self.sf.sum(col).alias(col), - "USSGL493100_delivered_orders_obligations_transferred_unpaid": lambda col: self.sf.sum(col).alias(col), - "USSGL483200_undeliv_orders_oblig_transferred_prepaid_advanced": lambda col: self.sf.sum(col).alias(col), - "last_modified_date": lambda col: self.sf.max(col).alias("max_last_modified_date"), + "USSGL483100_undelivered_orders_obligations_transferred_unpaid": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL493100_delivered_orders_obligations_transferred_unpaid": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL483200_undeliv_orders_oblig_transferred_prepaid_advanced": lambda col: self.sf.sum( + col + ).alias(col), + "last_modified_date": lambda col: self.sf.max(col).alias( + "max_last_modified_date" + ), } @property def select_cols(self) -> list[Column | DuckDBSparkColumn]: return [ self.sf.col(col) - for col in query_paths["object_class_program_activity"]["federal_account"].keys() + for col in query_paths["object_class_program_activity"][ + "federal_account" + ].keys() if not col.startswith("last_modified_date") ] + [self.sf.col("max_last_modified_date").alias("last_modified_date")] @@ -260,7 +325,9 @@ def sort_by_cols(self) -> list[str]: def select_cols(self) -> list[Column | DuckDBSparkColumn]: return [ self.sf.col(col) - for col in query_paths["object_class_program_activity"]["treasury_account"].keys() + for col in query_paths["object_class_program_activity"][ + "treasury_account" + ].keys() if not col.startswith("last_modified_date") ] + [self.sf.col("max_last_modified_date").alias("last_modified_date")] 
diff --git a/usaspending_api/download/tests/integration/test_populate_monthly_delta_files.py b/usaspending_api/download/tests/integration/test_populate_monthly_delta_files.py index 44a38586c4..00669b306c 100644 --- a/usaspending_api/download/tests/integration/test_populate_monthly_delta_files.py +++ b/usaspending_api/download/tests/integration/test_populate_monthly_delta_files.py @@ -1,18 +1,17 @@ -import zipfile import datetime -import pytest import os +import zipfile +from csv import reader +from os import listdir +import pytest from django.core.management import call_command -from os import listdir from model_bakery import baker -from csv import reader -from usaspending_api.settings import HOST from usaspending_api.awards.models import TransactionDelta from usaspending_api.common.helpers.sql_helpers import get_database_dsn_string from usaspending_api.download.v2.download_column_historical_lookups import query_paths - +from usaspending_api.settings import HOST # Make sure UTC or test will fail later in the day TODAY = datetime.datetime.strftime(datetime.datetime.utcnow(), "%Y%m%d") @@ -22,11 +21,19 @@ @pytest.mark.django_db(transaction=True) def monthly_download_delta_data(db, monkeypatch): baker.make( - "references.ToptierAgency", toptier_agency_id=1, toptier_code="001", name="Test_Agency", _fill_optional=True + "references.ToptierAgency", + toptier_agency_id=1, + toptier_code="001", + name="Test_Agency", + _fill_optional=True, ) baker.make("references.Agency", pk=1, toptier_agency_id=1, _fill_optional=True) baker.make( - "references.ToptierAgency", toptier_agency_id=2, toptier_code="002", name="Test_Agency 2", _fill_optional=True + "references.ToptierAgency", + toptier_agency_id=2, + toptier_code="002", + name="Test_Agency 2", + _fill_optional=True, ) baker.make("references.Agency", pk=2, toptier_agency_id=2, _fill_optional=True) i = 1 @@ -92,10 +99,16 @@ def monthly_download_delta_data(db, monkeypatch): @pytest.mark.django_db(transaction=True) def 
test_all_agencies(monthly_download_delta_data, monkeypatch): - call_command("populate_monthly_delta_files", "--debugging_skip_deleted", "--last_date=2020-12-31") + call_command( + "populate_monthly_delta_files", + "--debugging_skip_deleted", + "--last_date=2020-12-31", + ) file_list = listdir("csv_downloads") assert f"FY(All)_All_Contracts_Delta_{TODAY}.zip" in file_list - os.remove(os.path.normpath(f"csv_downloads/FY(All)_All_Contracts_Delta_{TODAY}.zip")) + os.remove( + os.path.normpath(f"csv_downloads/FY(All)_All_Contracts_Delta_{TODAY}.zip") + ) @pytest.mark.django_db(transaction=True) @@ -397,23 +410,37 @@ def test_specific_agency(monthly_download_delta_data, monkeypatch): "", "", "", - f"{HOST}/award/CONT_AWD_1_0_0/" if "localhost" in HOST else f"https://{HOST}/award/CONT_AWD_1_0_0/", + f"{HOST}/award/CONT_AWD_1_0_0/" + if "localhost" in HOST + else f"https://{HOST}/award/CONT_AWD_1_0_0/", "", "2020-05-07 00:00:00+00", ] - call_command("populate_monthly_delta_files", "--agencies=1", "--debugging_skip_deleted", "--last_date=2020-12-31") + call_command( + "populate_monthly_delta_files", + "--agencies=1", + "--debugging_skip_deleted", + "--last_date=2020-12-31", + ) file_list = listdir("csv_downloads") assert f"FY(All)_001_Contracts_Delta_{TODAY}.zip" in file_list - with zipfile.ZipFile(os.path.normpath(f"csv_downloads/FY(All)_001_Contracts_Delta_{TODAY}.zip"), "r") as zip_ref: + with zipfile.ZipFile( + os.path.normpath(f"csv_downloads/FY(All)_001_Contracts_Delta_{TODAY}.zip"), "r" + ) as zip_ref: zip_ref.extractall("csv_downloads") assert f"FY(All)_001_Contracts_Delta_{TODAY}_1.csv" in listdir("csv_downloads") - with open(os.path.normpath(f"csv_downloads/FY(All)_001_Contracts_Delta_{TODAY}_1.csv"), "r") as contract_file: + with open( + os.path.normpath(f"csv_downloads/FY(All)_001_Contracts_Delta_{TODAY}_1.csv"), + "r", + ) as contract_file: csv_reader = reader(contract_file) row_count = 0 for row in csv_reader: if row_count == 0: # 63 is the character limit for 
column names - expected_row = [s[:63] for s in query_paths["transaction_search"]["d1"].keys()] + expected_row = [ + s[:63] for s in query_paths["transaction_search"]["d1"].keys() + ] # These cols are prepended during file processing expected_row = ["correction_delete_ind", "agency_id"] + expected_row assert row == expected_row @@ -421,8 +448,12 @@ def test_specific_agency(monthly_download_delta_data, monkeypatch): assert row == contract_data row_count += 1 assert row_count == 2 - os.remove(os.path.normpath(f"csv_downloads/FY(All)_001_Contracts_Delta_{TODAY}.zip")) - os.remove(os.path.normpath(f"csv_downloads/FY(All)_001_Contracts_Delta_{TODAY}_1.csv")) + os.remove( + os.path.normpath(f"csv_downloads/FY(All)_001_Contracts_Delta_{TODAY}.zip") + ) + os.remove( + os.path.normpath(f"csv_downloads/FY(All)_001_Contracts_Delta_{TODAY}_1.csv") + ) @pytest.mark.django_db(transaction=True) @@ -474,7 +505,9 @@ def test_award_types(client, monthly_download_delta_data, monkeypatch): awarding_toptier_agency_name="Test_Agency", awarding_subtier_agency_name="Test_Agency", ) - baker.make("awards.TransactionDelta", transaction_id=2, created_at=datetime.datetime.now()) + baker.make( + "awards.TransactionDelta", transaction_id=2, created_at=datetime.datetime.now() + ) call_command( "populate_monthly_delta_files", "--agencies=1", @@ -484,4 +517,6 @@ def test_award_types(client, monthly_download_delta_data, monkeypatch): ) file_list = listdir("csv_downloads") assert f"FY(All)_001_Assistance_Delta_{TODAY}.zip" in file_list - os.remove(os.path.normpath(f"csv_downloads/FY(All)_001_Assistance_Delta_{TODAY}.zip")) + os.remove( + os.path.normpath(f"csv_downloads/FY(All)_001_Assistance_Delta_{TODAY}.zip") + ) diff --git a/usaspending_api/etl/tests/integration/test_load_to_from_delta.py b/usaspending_api/etl/tests/integration/test_load_to_from_delta.py index 5321ab3fe0..dcd0b38ab8 100644 --- a/usaspending_api/etl/tests/integration/test_load_to_from_delta.py +++ 
b/usaspending_api/etl/tests/integration/test_load_to_from_delta.py @@ -4,7 +4,7 @@ """ import json - +from copy import deepcopy from datetime import date, datetime, timedelta, timezone from typing import Any, Dict, List, Optional, Union @@ -12,23 +12,25 @@ import psycopg2 import pytest import pytz - +from django.conf import settings +from django.core.management import call_command +from django.db import connection, connections, models, transaction from model_bakery import baker from pyspark.sql import SparkSession -from django.conf import settings -from django.core.management import call_command -from django.db import connection, connections, transaction, models from usaspending_api.common.helpers.sql_helpers import get_database_dsn_string from usaspending_api.etl.award_helpers import update_awards from usaspending_api.etl.broker_etl_helpers import dictfetchall from usaspending_api.etl.management.commands.create_delta_table import ( TABLE_SPEC, ) -from usaspending_api.etl.tests.data.delta_model_for_test import TestModel, TEST_TABLE_POSTGRES, TEST_TABLE_SPEC +from usaspending_api.etl.tests.data.delta_model_for_test import ( + TEST_TABLE_POSTGRES, + TEST_TABLE_SPEC, + TestModel, +) from usaspending_api.recipient.models import RecipientLookup from usaspending_api.tests.conftest_spark import create_and_load_all_delta_tables -from copy import deepcopy _NEW_ASSIST = { "published_fabs_id": 6, @@ -54,12 +56,18 @@ def _handle_string_cast(val: str) -> Union[str, dict, list]: """ if isinstance(val, list): try: - casted = [json.loads(element) if isinstance(element, str) else element for element in val] + casted = [ + json.loads(element) if isinstance(element, str) else element + for element in val + ] except (TypeError, json.decoder.JSONDecodeError): casted = [str(element) for element in val] elif isinstance(val, dict): try: - casted = {k: json.loads(element) if isinstance(element, str) else element for k, element in val.items()} + casted = { + k: json.loads(element) if 
isinstance(element, str) else element + for k, element in val.items() + } except (TypeError, json.decoder.JSONDecodeError): casted = {k: str(element) for k, element in val.items()} else: @@ -96,10 +104,17 @@ def equal_datasets( # Parsing custom_schema to specify schema_changes = {} - schema_type_converters = {"INT": int, "STRING": _handle_string_cast, "ARRAY": _handle_string_cast} + schema_type_converters = { + "INT": int, + "STRING": _handle_string_cast, + "ARRAY": _handle_string_cast, + } if custom_schema: for schema_change in custom_schema.split(", "): - col, new_col_type = schema_change.split()[0].strip(), schema_change.split()[1].strip() + col, new_col_type = ( + schema_change.split()[0].strip(), + schema_change.split()[1].strip(), + ) schema_changes[col] = new_col_type # Iterating through the values and finding any differences @@ -139,7 +154,9 @@ def equal_datasets( if isinstance(psql_val, list): psql_val = sorted_deep(psql_val) if isinstance(spark_val, str): - spark_val = [json.loads(idx.replace("'", '"')) for idx in [spark_val]][0] + spark_val = [ + json.loads(idx.replace("'", '"')) for idx in [spark_val] + ][0] spark_val = sorted_deep(spark_val) if psql_val != spark_val: @@ -173,7 +190,8 @@ def load_delta_table_from_postgres( call_command(load_command, *cmd_args) -def verify_delta_table_loaded_to_delta( +# TODO: Refactor and remove the "noqa" for PLR0912 +def verify_delta_table_loaded_to_delta( # noqa: PLR0912 spark: SparkSession, delta_table_name: str, s3_bucket: str, @@ -198,7 +216,9 @@ def verify_delta_table_loaded_to_delta( call_command("create_delta_table", f"--spark-s3-bucket={s3_bucket}", *cmd_args) call_command(load_command, *cmd_args) else: - load_delta_table_from_postgres(delta_table_name, s3_bucket, alt_db, alt_name, load_command) + load_delta_table_from_postgres( + delta_table_name, s3_bucket, alt_db, alt_name, load_command + ) if alt_name: expected_table_name = alt_name @@ -243,7 +263,12 @@ def verify_delta_table_loaded_to_delta( 
received_query = f"{received_query} ORDER BY {partition_col}" received_data = [row.asDict() for row in spark.sql(received_query).collect()] - assert equal_datasets(dummy_data, received_data, TABLE_SPEC[delta_table_name]["custom_schema"], ignore_fields) + assert equal_datasets( + dummy_data, + received_data, + TABLE_SPEC[delta_table_name]["custom_schema"], + ignore_fields, + ) def verify_delta_table_loaded_from_delta( @@ -267,7 +292,7 @@ def verify_delta_table_loaded_from_delta( cmd_args += [f"--alt-delta-name={alt_name}"] expected_table_name = alt_name if jdbc_inserts: - cmd_args += [f"--jdbc-inserts"] + cmd_args += ["--jdbc-inserts"] else: if not spark_s3_bucket: raise RuntimeError( @@ -279,7 +304,10 @@ def verify_delta_table_loaded_from_delta( call_command(load_command, *cmd_args) # get the postgres data to compare - source_table = TABLE_SPEC[delta_table_name]["source_table"] or TABLE_SPEC[delta_table_name]["swap_table"] + source_table = ( + TABLE_SPEC[delta_table_name]["source_table"] + or TABLE_SPEC[delta_table_name]["swap_table"] + ) temp_schema = "temp" if source_table: tmp_table_name = f"{temp_schema}.{source_table}_temp" @@ -301,13 +329,21 @@ def verify_delta_table_loaded_from_delta( delta_data = [row.asDict() for row in spark.sql(delta_query).collect()] assert equal_datasets( - postgres_data, delta_data, TABLE_SPEC[delta_table_name]["custom_schema"], ignore_fields=ignore_fields + postgres_data, + delta_data, + TABLE_SPEC[delta_table_name]["custom_schema"], + ignore_fields=ignore_fields, ) -@pytest.mark.django_db(databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True) +@pytest.mark.django_db( + databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True +) def test_load_table_to_from_delta_for_recipient_lookup( - spark, s3_unittest_data_bucket, populate_usas_data_and_recipients_from_broker, hive_unittest_metastore_db + spark, + s3_unittest_data_bucket, + populate_usas_data_and_recipients_from_broker, + 
hive_unittest_metastore_db, ): # Since changes to the source tables will go to the Postgres table first, use model baker to add new rows to # Postgres table, and then push the updated table to Delta. @@ -315,20 +351,35 @@ def test_load_table_to_from_delta_for_recipient_lookup( insert_datetime = last_load_datetime + timedelta(minutes=-15) assist = deepcopy(_NEW_ASSIST) assist.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceAssistanceTransaction", **assist) load_delta_table_from_postgres("published_fabs", s3_unittest_data_bucket) procure = deepcopy(_NEW_PROCURE) procure.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceProcurementTransaction", **procure) - load_delta_table_from_postgres("detached_award_procurement", s3_unittest_data_bucket) + load_delta_table_from_postgres( + "detached_award_procurement", s3_unittest_data_bucket + ) ignore_fields = ["id", "update_date"] - tables_to_load = ["sam_recipient", "transaction_fabs", "transaction_fpds", "transaction_normalized"] + tables_to_load = [ + "sam_recipient", + "transaction_fabs", + "transaction_fpds", + "transaction_normalized", + ] create_and_load_all_delta_tables(spark, s3_unittest_data_bucket, tables_to_load) # Test initial load of Recipient Lookup @@ -396,7 +447,12 @@ def test_load_table_to_from_delta_for_recipient_lookup( # Verify that the update alternate name exists expected_result = ["FABS RECIPIENT 12345"] - assert sorted(RecipientLookup.objects.filter(uei="FABSUEI12345").first().alternate_names) == expected_result + assert ( + sorted( + 
RecipientLookup.objects.filter(uei="FABSUEI12345").first().alternate_names + ) + == expected_result + ) tables_to_load = ["transaction_fabs", "transaction_normalized"] create_and_load_all_delta_tables(spark, s3_unittest_data_bucket, tables_to_load) @@ -408,7 +464,10 @@ def test_load_table_to_from_delta_for_recipient_lookup( ignore_fields=ignore_fields, ) verify_delta_table_loaded_from_delta( - spark, "recipient_lookup", spark_s3_bucket=s3_unittest_data_bucket, ignore_fields=ignore_fields + spark, + "recipient_lookup", + spark_s3_bucket=s3_unittest_data_bucket, + ignore_fields=ignore_fields, ) verify_delta_table_loaded_from_delta( spark, "recipient_lookup", jdbc_inserts=True, ignore_fields=ignore_fields @@ -416,7 +475,9 @@ def test_load_table_to_from_delta_for_recipient_lookup( @pytest.mark.django_db(transaction=True) -def test_load_table_to_delta_for_published_fabs(spark, s3_unittest_data_bucket, hive_unittest_metastore_db): +def test_load_table_to_delta_for_published_fabs( + spark, s3_unittest_data_bucket, hive_unittest_metastore_db +): baker.make( "transactions.SourceAssistanceTransaction", published_fabs_id=7, @@ -435,9 +496,14 @@ def test_load_table_to_delta_for_published_fabs(spark, s3_unittest_data_bucket, verify_delta_table_loaded_to_delta(spark, "published_fabs", s3_unittest_data_bucket) -@pytest.mark.django_db(databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True) +@pytest.mark.django_db( + databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True +) def test_load_table_to_from_delta_for_recipient_profile( - spark, s3_unittest_data_bucket, populate_usas_data_and_recipients_from_broker, hive_unittest_metastore_db + spark, + s3_unittest_data_bucket, + populate_usas_data_and_recipients_from_broker, + hive_unittest_metastore_db, ): # Since changes to the source tables will go to the Postgres table first, use model baker to add new rows to # Postgres table, and then push the updated table to Delta. 
@@ -445,17 +511,27 @@ def test_load_table_to_from_delta_for_recipient_profile( insert_datetime = last_load_datetime + timedelta(minutes=-15) assist = deepcopy(_NEW_ASSIST) assist.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceAssistanceTransaction", **assist) load_delta_table_from_postgres("published_fabs", s3_unittest_data_bucket) procure = deepcopy(_NEW_PROCURE) procure.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceProcurementTransaction", **procure) - load_delta_table_from_postgres("detached_award_procurement", s3_unittest_data_bucket) + load_delta_table_from_postgres( + "detached_award_procurement", s3_unittest_data_bucket + ) tables_to_load = [ "awards", @@ -468,13 +544,21 @@ def test_load_table_to_from_delta_for_recipient_profile( ] create_and_load_all_delta_tables(spark, s3_unittest_data_bucket, tables_to_load) verify_delta_table_loaded_to_delta( - spark, "recipient_profile", s3_unittest_data_bucket, load_command="load_query_to_delta", ignore_fields=["id"] + spark, + "recipient_profile", + s3_unittest_data_bucket, + load_command="load_query_to_delta", + ignore_fields=["id"], + ) + verify_delta_table_loaded_from_delta( + spark, "recipient_profile", jdbc_inserts=True, ignore_fields=["id"] ) - verify_delta_table_loaded_from_delta(spark, "recipient_profile", jdbc_inserts=True, ignore_fields=["id"]) @pytest.mark.django_db(transaction=True) -def test_load_table_to_delta_timezone_aware(spark, monkeypatch, s3_unittest_data_bucket, hive_unittest_metastore_db): +def test_load_table_to_delta_timezone_aware( + spark, monkeypatch, 
s3_unittest_data_bucket, hive_unittest_metastore_db +): """Test that timestamps are not inadvertently shifted due to loss of timezone during reads and writes. The big takeaways from this are: @@ -512,7 +596,10 @@ def test_load_table_to_delta_timezone_aware(spark, monkeypatch, s3_unittest_data with new_psycopg2_conn.cursor() as cursor: cursor.execute(TEST_TABLE_POSTGRES) TABLE_SPEC.update(TEST_TABLE_SPEC) - monkeypatch.setattr("usaspending_api.etl.management.commands.load_table_to_delta.TABLE_SPEC", TABLE_SPEC) + monkeypatch.setattr( + "usaspending_api.etl.management.commands.load_table_to_delta.TABLE_SPEC", + TABLE_SPEC, + ) # Prepare a model object without saving it, but do save the related fields # - https://model-bakery.readthedocs.io/en/latest/basic_usage.html#non-persistent-objects @@ -527,14 +614,22 @@ def test_load_table_to_delta_timezone_aware(spark, monkeypatch, s3_unittest_data populated_columns = ("id", "test_timestamp") def _get_sql_insert_from_model(model, populated_columns): - values = [value for value in model._meta.local_fields if value.column in populated_columns] + values = [ + value + for value in model._meta.local_fields + if value.column in populated_columns + ] q = models.sql.InsertQuery(model) q.insert_values(values, [model]) compiler = q.get_compiler("default") - setattr(compiler, "return_id", False) + compiler.return_id = False stmts = compiler.as_sql() stmt = [ - stmt % tuple(f"'{param}'" if type(param) in [str, date, datetime] else param for param in params) + stmt + % tuple( + f"'{param}'" if type(param) in [str, date, datetime] else param + for param in params + ) for stmt, params in stmts ] return stmt[0] @@ -543,7 +638,9 @@ def _get_sql_insert_from_model(model, populated_columns): with psycopg2.connect(get_database_dsn_string()) as new_psycopg2_conn: with new_psycopg2_conn.cursor() as cursor: cursor.execute("set session time zone 'HST'") - fabs_insert_sql = _get_sql_insert_from_model(model_with_tz, populated_columns) + fabs_insert_sql 
= _get_sql_insert_from_model( + model_with_tz, populated_columns + ) cursor.execute(fabs_insert_sql) assert cursor.rowcount == 1 new_psycopg2_conn.commit() @@ -566,14 +663,24 @@ def _get_sql_insert_from_model(model, populated_columns): # or with raw SQL), it will apply those time zone settings assert model_datetime.tzname() != "HST" assert model_datetime.tzname() == "UTC" - assert model_datetime.hour == 21 # shifted +10 to counteract the UTC offset by django upon saving it - assert model_datetime.utctimetuple().tm_hour == 21 # already shifted to UTC, so this just matches .hour (== 21) - assert dt_naive.utctimetuple().tm_hour == dt_naive.hour # naive, so stays the same - assert dt_with_utc.utctimetuple().tm_hour == dt_with_utc.hour # already UTC, so stays the same + assert ( + model_datetime.hour == 21 + ) # shifted +10 to counteract the UTC offset by django upon saving it + assert ( + model_datetime.utctimetuple().tm_hour == 21 + ) # already shifted to UTC, so this just matches .hour (== 21) + assert ( + dt_naive.utctimetuple().tm_hour == dt_naive.hour + ) # naive, so stays the same + assert ( + dt_with_utc.utctimetuple().tm_hour == dt_with_utc.hour + ) # already UTC, so stays the same # Confirm also that this is the case in the DB (i.e. 
it was at write-time that UTC was set, not read-time with connection.cursor() as cursor: - cursor.execute("select test_table.test_timestamp from test_table where id = 3") + cursor.execute( + "select test_table.test_timestamp from test_table where id = 3" + ) dt_from_db = [row[0] for row in cursor.fetchall()][0] # type: datetime assert dt_from_db.tzinfo is not None assert dt_from_db.tzname() == "UTC" @@ -589,7 +696,9 @@ def _get_sql_insert_from_model(model, populated_columns): with psycopg2.connect(get_database_dsn_string()) as new_psycopg2_conn: with new_psycopg2_conn.cursor() as cursor: cursor.execute("set session time zone 'HST'") - cursor.execute("select test_table.test_timestamp from test_table where id = 3") + cursor.execute( + "select test_table.test_timestamp from test_table where id = 3" + ) dt_from_db = [row[0] for row in cursor.fetchall()][0] # type: datetime assert dt_from_db.tzinfo is not None # Can't use traditional time zone names with tzname() since pyscopg2 uses its own time zone infos. 
@@ -613,7 +722,9 @@ def _get_sql_insert_from_model(model, populated_columns): @pytest.mark.django_db(transaction=True) -def test_load_table_to_delta_for_detached_award_procurement(spark, s3_unittest_data_bucket, hive_unittest_metastore_db): +def test_load_table_to_delta_for_detached_award_procurement( + spark, s3_unittest_data_bucket, hive_unittest_metastore_db +): baker.make( "transactions.SourceProcurementTransaction", detached_award_procurement_id="4", @@ -631,13 +742,20 @@ def test_load_table_to_delta_for_detached_award_procurement(spark, s3_unittest_d _fill_optional=True, ) - verify_delta_table_loaded_to_delta(spark, "detached_award_procurement", s3_unittest_data_bucket) + verify_delta_table_loaded_to_delta( + spark, "detached_award_procurement", s3_unittest_data_bucket + ) @pytest.mark.django_db(transaction=True) -@pytest.mark.skip(reason="Due to the nature of the views with all the transformations, this will be out of date") +@pytest.mark.skip( + reason="Due to the nature of the views with all the transformations, this will be out of date" +) def test_load_table_to_from_delta_for_recipient_profile_testing( - spark, s3_unittest_data_bucket, populate_usas_data_and_recipients_from_broker, hive_unittest_metastore_db + spark, + s3_unittest_data_bucket, + populate_usas_data_and_recipients_from_broker, + hive_unittest_metastore_db, ): tables_to_load = [ "recipient_lookup", @@ -648,13 +766,21 @@ def test_load_table_to_from_delta_for_recipient_profile_testing( ] create_and_load_all_delta_tables(spark, s3_unittest_data_bucket, tables_to_load) verify_delta_table_loaded_to_delta( - spark, "recipient_profile_testing", s3_unittest_data_bucket, load_command="load_table_to_delta" + spark, + "recipient_profile_testing", + s3_unittest_data_bucket, + load_command="load_table_to_delta", ) -@pytest.mark.django_db(databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True) +@pytest.mark.django_db( + databases=[settings.BROKER_DB_ALIAS, 
settings.DEFAULT_DB_ALIAS], transaction=True +) def test_load_table_to_from_delta_for_transaction_search( - spark, s3_unittest_data_bucket, populate_usas_data_and_recipients_from_broker, hive_unittest_metastore_db + spark, + s3_unittest_data_bucket, + populate_usas_data_and_recipients_from_broker, + hive_unittest_metastore_db, ): # Since changes to the source tables will go to the Postgres table first, use model baker to add new rows to # Postgres table, and then push the updated table to Delta. @@ -662,17 +788,27 @@ def test_load_table_to_from_delta_for_transaction_search( insert_datetime = last_load_datetime + timedelta(minutes=-15) assist = deepcopy(_NEW_ASSIST) assist.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceAssistanceTransaction", **assist) load_delta_table_from_postgres("published_fabs", s3_unittest_data_bucket) procure = deepcopy(_NEW_PROCURE) procure.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceProcurementTransaction", **procure) - load_delta_table_from_postgres("detached_award_procurement", s3_unittest_data_bucket) + load_delta_table_from_postgres( + "detached_award_procurement", s3_unittest_data_bucket + ) tables_to_load = [ "awards", @@ -706,7 +842,10 @@ def test_load_table_to_from_delta_for_transaction_search( ) @pytest.mark.django_db(transaction=True) def test_load_table_to_from_delta_for_transaction_search_testing( - spark, s3_unittest_data_bucket, populate_usas_data_and_recipients_from_broker, hive_unittest_metastore_db + spark, + s3_unittest_data_bucket, + populate_usas_data_and_recipients_from_broker, + 
hive_unittest_metastore_db, ): # TODO: Commenting these out while we have `transaction_search_gold` vs `transaction_search` in the TABLE_SPEC # as by design the data in delta will be different from the data in postgres @@ -722,8 +861,12 @@ def test_load_table_to_from_delta_for_transaction_search_testing( def test_load_table_to_delta_for_transaction_normalized_alt_db_and_name( spark, s3_unittest_data_bucket, hive_unittest_metastore_db ): - baker.make("search.TransactionSearch", transaction_id="1", award_id=1, _fill_optional=True) - baker.make("search.TransactionSearch", transaction_id="2", award_id=2, _fill_optional=True) + baker.make( + "search.TransactionSearch", transaction_id="1", award_id=1, _fill_optional=True + ) + baker.make( + "search.TransactionSearch", transaction_id="2", award_id=2, _fill_optional=True + ) verify_delta_table_loaded_to_delta( spark, "transaction_normalized", @@ -734,9 +877,14 @@ def test_load_table_to_delta_for_transaction_normalized_alt_db_and_name( @pytest.mark.django_db(transaction=True) -@pytest.mark.skip(reason="Due to the nature of the views with all the transformations, this will be out of date") +@pytest.mark.skip( + reason="Due to the nature of the views with all the transformations, this will be out of date" +) def test_load_table_to_from_delta_for_transaction_search_alt_db_and_name( - spark, s3_unittest_data_bucket, populate_usas_data_and_recipients_from_broker, hive_unittest_metastore_db + spark, + s3_unittest_data_bucket, + populate_usas_data_and_recipients_from_broker, + hive_unittest_metastore_db, ): tables_to_load = [ "awards", @@ -770,9 +918,14 @@ def test_load_table_to_from_delta_for_transaction_search_alt_db_and_name( # ) -@pytest.mark.django_db(databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True) +@pytest.mark.django_db( + databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True +) def test_load_table_to_from_delta_for_award_search( - spark, 
s3_unittest_data_bucket, populate_usas_data_and_recipients_from_broker, hive_unittest_metastore_db + spark, + s3_unittest_data_bucket, + populate_usas_data_and_recipients_from_broker, + hive_unittest_metastore_db, ): # Since changes to the source tables will go to the Postgres table first, use model baker to add new rows to # Postgres table, and then push the updated table to Delta. @@ -780,17 +933,27 @@ def test_load_table_to_from_delta_for_award_search( insert_datetime = last_load_datetime + timedelta(minutes=-15) assist = deepcopy(_NEW_ASSIST) assist.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceAssistanceTransaction", **assist) load_delta_table_from_postgres("published_fabs", s3_unittest_data_bucket) procure = deepcopy(_NEW_PROCURE) procure.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceProcurementTransaction", **procure) - load_delta_table_from_postgres("detached_award_procurement", s3_unittest_data_bucket) + load_delta_table_from_postgres( + "detached_award_procurement", s3_unittest_data_bucket + ) tables_to_load = [ "awards", @@ -806,32 +969,54 @@ def test_load_table_to_from_delta_for_award_search( ] create_and_load_all_delta_tables(spark, s3_unittest_data_bucket, tables_to_load) verify_delta_table_loaded_to_delta( - spark, "award_search", s3_unittest_data_bucket, load_command="load_query_to_delta" + spark, + "award_search", + s3_unittest_data_bucket, + load_command="load_query_to_delta", + ) + verify_delta_table_loaded_from_delta( + spark, "award_search", spark_s3_bucket=s3_unittest_data_bucket ) - 
verify_delta_table_loaded_from_delta(spark, "award_search", spark_s3_bucket=s3_unittest_data_bucket) - verify_delta_table_loaded_from_delta(spark, "award_search", jdbc_inserts=True) # test alt write strategy + verify_delta_table_loaded_from_delta( + spark, "award_search", jdbc_inserts=True + ) # test alt write strategy -@pytest.mark.django_db(databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True) +@pytest.mark.django_db( + databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True +) def test_incremental_load_table_to_delta_for_award_search( - spark, s3_unittest_data_bucket, populate_usas_data_and_recipients_from_broker, hive_unittest_metastore_db + spark, + s3_unittest_data_bucket, + populate_usas_data_and_recipients_from_broker, + hive_unittest_metastore_db, ): # Load in data that award_search depends on last_load_datetime = datetime.now(timezone.utc) insert_datetime = last_load_datetime + timedelta(minutes=-15) assist = deepcopy(_NEW_ASSIST) assist.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceAssistanceTransaction", **assist) load_delta_table_from_postgres("published_fabs", s3_unittest_data_bucket) procure = deepcopy(_NEW_PROCURE) procure.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceProcurementTransaction", **procure) - load_delta_table_from_postgres("detached_award_procurement", s3_unittest_data_bucket) + load_delta_table_from_postgres( + "detached_award_procurement", s3_unittest_data_bucket + ) tables_to_load = [ "awards", @@ -851,14 +1036,14 @@ def 
test_incremental_load_table_to_delta_for_award_search( call_command( "create_delta_table", f"--spark-s3-bucket={s3_unittest_data_bucket}", - f"--destination-table=award_search", + "--destination-table=award_search", "--alt-db=int", ) # load in award_search data call_command( "load_query_to_delta", - f"--destination-table=award_search", + "--destination-table=award_search", "--incremental", "--alt-db=int", ) @@ -869,7 +1054,7 @@ def test_incremental_load_table_to_delta_for_award_search( # Reload the data call_command( "load_query_to_delta", - f"--destination-table=award_search", + "--destination-table=award_search", "--incremental", "--alt-db=int", ) @@ -885,33 +1070,55 @@ def test_incremental_load_table_to_delta_for_award_search( expected = pd.DataFrame( { "award_id": [4, 4, 1, 3, 2, 4], - "_change_type": ["delete", "insert", "insert", "insert", "insert", "insert"], + "_change_type": [ + "delete", + "insert", + "insert", + "insert", + "insert", + "insert", + ], "_commit_version": [2, 3, 1, 1, 1, 1], } ) pd.testing.assert_frame_equal(result, expected) -@pytest.mark.django_db(databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True) +@pytest.mark.django_db( + databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True +) def test_incremental_load_table_to_delta_for_transaction_search( - spark, s3_unittest_data_bucket, populate_usas_data_and_recipients_from_broker, hive_unittest_metastore_db + spark, + s3_unittest_data_bucket, + populate_usas_data_and_recipients_from_broker, + hive_unittest_metastore_db, ): # Load in data that transaction_search depends on last_load_datetime = datetime.now(timezone.utc) insert_datetime = last_load_datetime + timedelta(minutes=-15) assist = deepcopy(_NEW_ASSIST) assist.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": 
insert_datetime, + } ) baker.make("transactions.SourceAssistanceTransaction", **assist) load_delta_table_from_postgres("published_fabs", s3_unittest_data_bucket) procure = deepcopy(_NEW_PROCURE) procure.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceProcurementTransaction", **procure) - load_delta_table_from_postgres("detached_award_procurement", s3_unittest_data_bucket) + load_delta_table_from_postgres( + "detached_award_procurement", s3_unittest_data_bucket + ) tables_to_load = [ "awards", @@ -931,14 +1138,14 @@ def test_incremental_load_table_to_delta_for_transaction_search( call_command( "create_delta_table", f"--spark-s3-bucket={s3_unittest_data_bucket}", - f"--destination-table=transaction_search", + "--destination-table=transaction_search", "--alt-db=int", ) # load in award_search data call_command( "load_query_to_delta", - f"--destination-table=transaction_search", + "--destination-table=transaction_search", "--incremental", "--alt-db=int", ) @@ -949,7 +1156,7 @@ def test_incremental_load_table_to_delta_for_transaction_search( # Reload the data call_command( "load_query_to_delta", - f"--destination-table=transaction_search", + "--destination-table=transaction_search", "--incremental", "--alt-db=int", ) @@ -965,14 +1172,25 @@ def test_incremental_load_table_to_delta_for_transaction_search( expected = pd.DataFrame( { "transaction_id": [4, 4, 1, 2, 434, 3, 4, 5], - "_change_type": ["delete", "insert", "insert", "insert", "insert", "insert", "insert", "insert"], + "_change_type": [ + "delete", + "insert", + "insert", + "insert", + "insert", + "insert", + "insert", + "insert", + ], "_commit_version": [2, 3, 1, 1, 1, 1, 1, 1], } ) pd.testing.assert_frame_equal(result, expected) -@pytest.mark.django_db(databases=[settings.BROKER_DB_ALIAS, 
settings.DEFAULT_DB_ALIAS], transaction=True) +@pytest.mark.django_db( + databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True +) def test_load_table_to_delta_for_sam_recipient( spark, s3_unittest_data_bucket, populate_broker_data, hive_unittest_metastore_db ): @@ -1000,7 +1218,11 @@ def test_load_table_to_delta_for_sam_recipient( } ] verify_delta_table_loaded_to_delta( - spark, "sam_recipient", s3_unittest_data_bucket, load_command="load_query_to_delta", dummy_data=expected_data + spark, + "sam_recipient", + s3_unittest_data_bucket, + load_command="load_query_to_delta", + dummy_data=expected_data, ) @@ -1008,15 +1230,21 @@ def test_load_table_to_delta_for_sam_recipient( settings.BROKER_DB_ALIAS not in settings.DATABASES, reason="'data_broker' database not configured in django settings.DATABASES.", ) -@pytest.mark.django_db(databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True) +@pytest.mark.django_db( + databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True +) def test_load_table_to_delta_for_summary_state_view( - spark, s3_unittest_data_bucket, populate_usas_data_and_recipients_from_broker, hive_unittest_metastore_db + spark, + s3_unittest_data_bucket, + populate_usas_data_and_recipients_from_broker, + hive_unittest_metastore_db, ): - # We need the award_search table to create the summary_state_view in delta # And in order to create the award_search table, we need the following load_delta_table_from_postgres("published_fabs", s3_unittest_data_bucket) - load_delta_table_from_postgres("detached_award_procurement", s3_unittest_data_bucket) + load_delta_table_from_postgres( + "detached_award_procurement", s3_unittest_data_bucket + ) tables_to_load = [ "awards", @@ -1032,21 +1260,36 @@ def test_load_table_to_delta_for_summary_state_view( ] create_and_load_all_delta_tables(spark, s3_unittest_data_bucket, tables_to_load) verify_delta_table_loaded_to_delta( - spark, "award_search", 
s3_unittest_data_bucket, load_command="load_query_to_delta" + spark, + "award_search", + s3_unittest_data_bucket, + load_command="load_query_to_delta", ) - # We now want to load the award_search table that we created above along with other tables needed to create award_search - # Then create the summay_state_view table and populate it using the load_query_to_delta command - tables_to_load = ["transaction_fabs", "transaction_fpds", "transaction_normalized", "award_search"] + # We now want to load the award_search table that we created above along with other tables needed to create + # award_search. Then create the summary_state_view table and populate it using the load_query_to_delta command. + tables_to_load = [ + "transaction_fabs", + "transaction_fpds", + "transaction_normalized", + "award_search", + ] create_and_load_all_delta_tables(spark, s3_unittest_data_bucket, tables_to_load) verify_delta_table_loaded_to_delta( - spark, "summary_state_view", s3_unittest_data_bucket, load_command="load_query_to_delta" + spark, + "summary_state_view", + s3_unittest_data_bucket, + load_command="load_query_to_delta", ) # Lastly, check using verify_delta_table_loaded_from_delta function which will run the load_table_from_delta command - verify_delta_table_loaded_from_delta(spark, "summary_state_view", spark_s3_bucket=s3_unittest_data_bucket) + verify_delta_table_loaded_from_delta( + spark, "summary_state_view", spark_s3_bucket=s3_unittest_data_bucket + ) -@pytest.mark.django_db(databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True) +@pytest.mark.django_db( + databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True +) def test_load_object_class_program_activity_class( spark, s3_unittest_data_bucket, @@ -1068,16 +1311,19 @@ def test_load_object_class_program_activity_class( ) -@pytest.mark.django_db(databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True) +@pytest.mark.django_db( + 
databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True +) def test_load_award_financial_download( spark, s3_unittest_data_bucket, populate_usas_data_and_recipients_from_broker, hive_unittest_metastore_db, ): - load_delta_table_from_postgres("published_fabs", s3_unittest_data_bucket) - load_delta_table_from_postgres("detached_award_procurement", s3_unittest_data_bucket) + load_delta_table_from_postgres( + "detached_award_procurement", s3_unittest_data_bucket + ) tables_to_load = [ "awards", @@ -1094,7 +1340,10 @@ def test_load_award_financial_download( create_and_load_all_delta_tables(spark, s3_unittest_data_bucket, tables_to_load) verify_delta_table_loaded_to_delta( - spark, "award_search", s3_unittest_data_bucket, load_command="load_query_to_delta" + spark, + "award_search", + s3_unittest_data_bucket, + load_command="load_query_to_delta", ) verify_delta_table_loaded_to_delta( spark, @@ -1217,7 +1466,9 @@ def test_load_award_financial_download( ) -@pytest.mark.django_db(databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True) +@pytest.mark.django_db( + databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True +) def test_load_account_balances_download( spark, s3_unittest_data_bucket, diff --git a/usaspending_api/etl/tests/integration/test_spark_app.py b/usaspending_api/etl/tests/integration/test_spark_app.py index 2b5580515c..6e137f9475 100644 --- a/usaspending_api/etl/tests/integration/test_spark_app.py +++ b/usaspending_api/etl/tests/integration/test_spark_app.py @@ -15,15 +15,20 @@ from django.conf import settings from model_bakery import baker from pyspark.context import SparkContext -from pyspark.sql import SparkSession, Row +from pyspark.sql import Row, SparkSession from pytest import fixture, mark + from usaspending_api.awards.models import TransactionFABS, TransactionFPDS +from usaspending_api.common.etl.spark import ( + _BROKER_REF_TABLES, + _USAS_RDS_REF_TABLES, + 
create_ref_temp_views, +) from usaspending_api.common.helpers.spark_helpers import ( - get_jdbc_url_from_pg_uri, - get_jdbc_connection_properties, get_broker_jdbc_url, + get_jdbc_connection_properties, + get_jdbc_url_from_pg_uri, ) -from usaspending_api.common.etl.spark import _USAS_RDS_REF_TABLES, _BROKER_REF_TABLES, create_ref_temp_views from usaspending_api.common.helpers.sql_helpers import get_database_dsn_string from usaspending_api.config import CONFIG @@ -37,10 +42,17 @@ def test_jvm_sparksession(spark: SparkSession): sc = SparkContext._active_spark_context assert sc._jvm assert sc._jvm.SparkSession - assert not sc._jvm.SparkSession.getDefaultSession().get().sparkContext().isStopped() + assert ( + not sc._jvm.SparkSession.getDefaultSession() + .get() + .sparkContext() + .isStopped() + ) -def test_hive_metastore_db(spark: SparkSession, s3_unittest_data_bucket, hive_unittest_metastore_db): +def test_hive_metastore_db( + spark: SparkSession, s3_unittest_data_bucket, hive_unittest_metastore_db +): """Ensure that schemas and tables created are tracked in the hive metastore_db""" test_schema = "my_delta_test_schema" test_table = "my_delta_test_table" @@ -65,7 +77,9 @@ def test_hive_metastore_db(spark: SparkSession, s3_unittest_data_bucket, hive_un assert tables_in_test_schema[0]["tableName"] == test_table -def test_tmp_hive_metastore_db_empty_on_test_start(spark: SparkSession, hive_unittest_metastore_db): +def test_tmp_hive_metastore_db_empty_on_test_start( + spark: SparkSession, hive_unittest_metastore_db +): """Test that when using the spark test fixture, the metastore_db is configured to live in a tmp directory, so that schemas and tables created while under-test only live or are known for the duration of a SINGLE test, not a test SESSION. 
And test that the metastore used for unit tests is empty on each test run (except for the @@ -102,18 +116,55 @@ def test_spark_app_run_local_master(spark: SparkSession): def test_spark_write_csv_app_run(spark: SparkSession, s3_unittest_data_bucket): """More involved integration test that requires MinIO to be up as an s3 alternative.""" data = [ - {"first_col": "row 1", "id": str(uuid.uuid4()), "color": "blue", "numeric_val": random.randint(-100, 100)}, - {"first_col": "row 2", "id": str(uuid.uuid4()), "color": "green", "numeric_val": random.randint(-100, 100)}, - {"first_col": "row 3", "id": str(uuid.uuid4()), "color": "pink", "numeric_val": random.randint(-100, 100)}, - {"first_col": "row 4", "id": str(uuid.uuid4()), "color": "yellow", "numeric_val": random.randint(-100, 100)}, - {"first_col": "row 5", "id": str(uuid.uuid4()), "color": "red", "numeric_val": random.randint(-100, 100)}, - {"first_col": "row 6", "id": str(uuid.uuid4()), "color": "orange", "numeric_val": random.randint(-100, 100)}, - {"first_col": "row 7", "id": str(uuid.uuid4()), "color": "magenta", "numeric_val": random.randint(-100, 100)}, + { + "first_col": "row 1", + "id": str(uuid.uuid4()), + "color": "blue", + "numeric_val": random.randint(-100, 100), + }, + { + "first_col": "row 2", + "id": str(uuid.uuid4()), + "color": "green", + "numeric_val": random.randint(-100, 100), + }, + { + "first_col": "row 3", + "id": str(uuid.uuid4()), + "color": "pink", + "numeric_val": random.randint(-100, 100), + }, + { + "first_col": "row 4", + "id": str(uuid.uuid4()), + "color": "yellow", + "numeric_val": random.randint(-100, 100), + }, + { + "first_col": "row 5", + "id": str(uuid.uuid4()), + "color": "red", + "numeric_val": random.randint(-100, 100), + }, + { + "first_col": "row 6", + "id": str(uuid.uuid4()), + "color": "orange", + "numeric_val": random.randint(-100, 100), + }, + { + "first_col": "row 7", + "id": str(uuid.uuid4()), + "color": "magenta", + "numeric_val": random.randint(-100, 100), + }, ] df = 
spark.createDataFrame([Row(**data_row) for data_row in data]) # NOTE! NOTE! NOTE! MinIO locally does not support a TRAILING SLASH after object (folder) name - df.write.option("header", True).csv(f"s3a://{s3_unittest_data_bucket}" f"/{CONFIG.DELTA_LAKE_S3_PATH}/write_to_s3") + df.write.option("header", True).csv( + f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/write_to_s3" + ) # Verify there are *.csv part files in the chosen bucket s3_client = boto3.client( @@ -161,7 +212,9 @@ def _transaction_and_award_test_data(db): assert TransactionFPDS.objects.all().count() == 1 -@mark.django_db(transaction=True) # must commit Django data for Spark to be able to read it +@mark.django_db( + transaction=True +) # must commit Django data for Spark to be able to read it def test_spark_write_to_s3_delta_from_db( _transaction_and_award_test_data, spark: SparkSession, @@ -174,18 +227,24 @@ def test_spark_write_to_s3_delta_from_db( pg_uri = get_database_dsn_string() jdbc_url = get_jdbc_url_from_pg_uri(pg_uri) if not jdbc_url.startswith("jdbc:postgresql://"): - raise ValueError("JDBC URL given is not in postgres JDBC URL format (e.g. jdbc:postgresql://...") + raise ValueError( + "JDBC URL given is not in postgres JDBC URL format (e.g. jdbc:postgresql://..." + ) schema_name = delta_lake_unittest_schema # ==== transaction_normalized ==== table_name = "vw_transaction_normalized" logger.info(f"Reading db records for {table_name} from connection: {jdbc_url}") - df = spark.read.jdbc(url=jdbc_url, table=table_name, properties=get_jdbc_connection_properties()) + df = spark.read.jdbc( + url=jdbc_url, table=table_name, properties=get_jdbc_connection_properties() + ) # NOTE! NOTE! NOTE! 
MinIO locally does not support a TRAILING SLASH after object (folder) name path = f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/{table_name}" - logger.info(f"Loading {df.count()} rows from DB to Delta table named {schema_name}.{table_name} at path {path}") + logger.info( + f"Loading {df.count()} rows from DB to Delta table named {schema_name}.{table_name} at path {path}" + ) # Create table in the metastore using DataFrame's schema and write data to the table df.write.saveAsTable( @@ -198,11 +257,15 @@ def test_spark_write_to_s3_delta_from_db( # ==== transaction_fabs ==== table_name = "vw_transaction_fabs" logger.info(f"Reading db records for {table_name} from connection: {jdbc_url}") - df = spark.read.jdbc(url=jdbc_url, table=table_name, properties=get_jdbc_connection_properties()) + df = spark.read.jdbc( + url=jdbc_url, table=table_name, properties=get_jdbc_connection_properties() + ) # NOTE! NOTE! NOTE! MinIO locally does not support a TRAILING SLASH after object (folder) name path = f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/{table_name}" - logger.info(f"Loading {df.count()} rows from DB to Delta table named {schema_name}.{table_name} at path {path}") + logger.info( + f"Loading {df.count()} rows from DB to Delta table named {schema_name}.{table_name} at path {path}" + ) # Create table in the metastore using DataFrame's schema and write data to the table df.write.saveAsTable( @@ -215,11 +278,15 @@ def test_spark_write_to_s3_delta_from_db( # ==== transaction_fpds ==== table_name = "vw_transaction_fpds" logger.info(f"Reading db records for {table_name} from connection: {jdbc_url}") - df = spark.read.jdbc(url=jdbc_url, table=table_name, properties=get_jdbc_connection_properties()) + df = spark.read.jdbc( + url=jdbc_url, table=table_name, properties=get_jdbc_connection_properties() + ) # NOTE! NOTE! NOTE! 
MinIO locally does not support a TRAILING SLASH after object (folder) name path = f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/{table_name}" - logger.info(f"Loading {df.count()} rows from DB to Delta table named {schema_name}.{table_name} at path {path}") + logger.info( + f"Loading {df.count()} rows from DB to Delta table named {schema_name}.{table_name} at path {path}" + ) # Create table in the metastore using DataFrame's schema and write data to the table df.write.saveAsTable( @@ -238,7 +305,7 @@ def test_spark_write_to_s3_delta_from_db( # Now assert that we're still by-default using the unittest schema, by way of using that pytest fixture. # i.e. don't tell it what schema to look at - tables = spark.sql(f"show tables").collect() + tables = spark.sql("show tables").collect() assert len(tables) == 3 table_names = [t.tableName for t in tables] assert "vw_transaction_normalized" in table_names @@ -246,7 +313,9 @@ def test_spark_write_to_s3_delta_from_db( assert "vw_transaction_fpds" in table_names # Assert rows are present - assert spark.sql("select count(*) from vw_transaction_normalized").collect()[0][0] == 2 + assert ( + spark.sql("select count(*) from vw_transaction_normalized").collect()[0][0] == 2 + ) assert spark.sql("select count(*) from vw_transaction_fabs").collect()[0][0] == 1 assert spark.sql("select count(*) from vw_transaction_fpds").collect()[0][0] == 1 @@ -268,7 +337,9 @@ def test_create_ref_temp_views(spark: SparkSession): # verify the data in the temp view matches the dummy data for rds_ref_table in _USAS_RDS_REF_TABLES: - spark_count = spark.sql(f"select count(*) from global_temp.{rds_ref_table._meta.db_table}").collect()[0][0] + spark_count = spark.sql( + f"select count(*) from global_temp.{rds_ref_table._meta.db_table}" + ).collect()[0][0] assert rds_ref_table.objects.count() == spark_count # Setup for testing the Broker table(s) diff --git a/usaspending_api/idvs/tests/integration/test_awards_idv_v2.py 
b/usaspending_api/idvs/tests/integration/test_awards_idv_v2.py index 4473f2ea02..a014d44515 100644 --- a/usaspending_api/idvs/tests/integration/test_awards_idv_v2.py +++ b/usaspending_api/idvs/tests/integration/test_awards_idv_v2.py @@ -1,24 +1,45 @@ import json -import pytest +import pytest from model_bakery import baker from rest_framework import status -from usaspending_api.references.models import ToptierAgency, SubtierAgency +from usaspending_api.references.models import SubtierAgency, ToptierAgency @pytest.fixture def awards_and_transactions(db): - subag = {"pk": 1, "name": "agency name", "abbreviation": "some other stuff"} - baker.make("references.SubtierAgency", subtier_code="def", **subag, _fill_optional=True) - baker.make("references.ToptierAgency", toptier_code="abc", **subag, _fill_optional=True) + baker.make( + "references.SubtierAgency", subtier_code="def", **subag, _fill_optional=True + ) + baker.make( + "references.ToptierAgency", toptier_code="abc", **subag, _fill_optional=True + ) - duns = {"awardee_or_recipient_uniqu": "123", "uei": "ABC", "legal_business_name": "Sams Club"} - parent_recipient_lookup = {"duns": "123", "uei": "ABC", "recipient_hash": "cfd3f3f5-2162-7679-9f6b-429cecaa3e1e"} - recipient_lookup = {"duns": "456", "uei": "DEF", "recipient_hash": "66545a8d-bf37-3eda-cce5-29c6170c9aab"} - parent_recipient_profile = {"recipient_hash": "cfd3f3f5-2162-7679-9f6b-429cecaa3e1e", "recipient_level": "P"} - recipient_profile = {"recipient_hash": "66545a8d-bf37-3eda-cce5-29c6170c9aab", "recipient_level": "C"} + duns = { + "awardee_or_recipient_uniqu": "123", + "uei": "ABC", + "legal_business_name": "Sams Club", + } + parent_recipient_lookup = { + "duns": "123", + "uei": "ABC", + "recipient_hash": "cfd3f3f5-2162-7679-9f6b-429cecaa3e1e", + } + recipient_lookup = { + "duns": "456", + "uei": "DEF", + "recipient_hash": "66545a8d-bf37-3eda-cce5-29c6170c9aab", + } + parent_recipient_profile = { + "recipient_hash": "cfd3f3f5-2162-7679-9f6b-429cecaa3e1e", 
+ "recipient_level": "P", + } + recipient_profile = { + "recipient_hash": "66545a8d-bf37-3eda-cce5-29c6170c9aab", + "recipient_level": "C", + } baker.make("references.Cfda", program_number=1234) baker.make("recipient.DUNS", **duns) baker.make("recipient.RecipientLookup", **parent_recipient_lookup) @@ -26,14 +47,32 @@ def awards_and_transactions(db): baker.make("recipient.RecipientProfile", **parent_recipient_profile) baker.make("recipient.RecipientProfile", **recipient_profile) - ag = {"pk": 1, "toptier_agency": ToptierAgency.objects.get(pk=1), "subtier_agency": SubtierAgency.objects.get(pk=1)} + ag = { + "pk": 1, + "toptier_agency": ToptierAgency.objects.get(pk=1), + "subtier_agency": SubtierAgency.objects.get(pk=1), + } baker.make("references.Agency", **ag, _fill_optional=True) - baker.make("references.PSC", code="4730", description="HOSE, PIPE, TUBE, LUBRICATION, AND RAILING FITTINGS") - baker.make("references.PSC", code="47", description="PIPE, TUBING, HOSE, AND FITTINGS") + baker.make( + "references.PSC", + code="4730", + description="HOSE, PIPE, TUBE, LUBRICATION, AND RAILING FITTINGS", + ) + baker.make( + "references.PSC", code="47", description="PIPE, TUBING, HOSE, AND FITTINGS" + ) - baker.make("references.NAICS", code="333911", description="PUMP AND PUMPING EQUIPMENT MANUFACTURING") - baker.make("references.NAICS", code="3339", description="Other General Purpose Machinery Manufacturing") + baker.make( + "references.NAICS", + code="333911", + description="PUMP AND PUMPING EQUIPMENT MANUFACTURING", + ) + baker.make( + "references.NAICS", + code="3339", + description="Other General Purpose Machinery Manufacturing", + ) baker.make("references.NAICS", code="33", description="Manufacturing") award_1_model = { @@ -102,7 +141,13 @@ def awards_and_transactions(db): baker.make("search.AwardSearch", **award_2_model) baker.make("search.AwardSearch", **award_3_model) - asst_data = {"is_fpds": False, "transaction_id": 1, "award_id": 1, "cfda_number": 1234, 
"cfda_title": "farms"} + asst_data = { + "is_fpds": False, + "transaction_id": 1, + "award_id": 1, + "cfda_number": 1234, + "cfda_title": "farms", + } baker.make("search.TransactionSearch", **asst_data) latest_transaction_contract_data = { @@ -340,7 +385,10 @@ def awards_and_transactions(db): "funding_office_name": "funding_office", } baker.make("search.TransactionSearch", **latest_transaction_contract_data) - baker.make("search.TransactionSearch", **latest_transaction_contract_data_without_recipient_name_or_id) + baker.make( + "search.TransactionSearch", + **latest_transaction_contract_data_without_recipient_name_or_id, + ) @pytest.mark.django_db @@ -353,7 +401,10 @@ def test_no_data_idv_award_endpoint(client): @pytest.mark.django_db def test_award_endpoint_different_ids(client, awards_and_transactions): - resp = client.get("/api/v2/awards/CONT_AWD_03VD_9700_SPM30012D3486_9700/", content_type="application/json") + resp = client.get( + "/api/v2/awards/CONT_AWD_03VD_9700_SPM30012D3486_9700/", + content_type="application/json", + ) assert resp.status_code == status.HTTP_200_OK assert json.loads(resp.content.decode("utf-8")) == expected_response_idv @@ -366,7 +417,10 @@ def test_award_endpoint_different_ids(client, awards_and_transactions): def test_award_endpoint_for_null_recipient_information(client, awards_and_transactions): resp = client.get("/api/v2/awards/3/", content_type="application/json") assert resp.status_code == status.HTTP_200_OK - assert json.loads(resp.content.decode("utf-8")).get("recipient") == recipient_without_id_and_name + assert ( + json.loads(resp.content.decode("utf-8")).get("recipient") + == recipient_without_id_and_name + ) expected_response_idv = { @@ -387,15 +441,33 @@ def test_award_endpoint_for_null_recipient_information(client, awards_and_transa "awarding_agency": { "id": 1, "has_agency_page": False, - "toptier_agency": {"name": "agency name", "abbreviation": "some other stuff", "code": "abc", "slug": None}, - "subtier_agency": {"name": 
"agency name", "abbreviation": "some other stuff", "code": "def"}, + "toptier_agency": { + "name": "agency name", + "abbreviation": "some other stuff", + "code": "abc", + "slug": None, + }, + "subtier_agency": { + "name": "agency name", + "abbreviation": "some other stuff", + "code": "def", + }, "office_agency_name": "awarding_office", }, "funding_agency": { "id": 1, "has_agency_page": False, - "toptier_agency": {"name": "agency name", "abbreviation": "some other stuff", "code": "abc", "slug": None}, - "subtier_agency": {"name": "agency name", "abbreviation": "some other stuff", "code": "def"}, + "toptier_agency": { + "name": "agency name", + "abbreviation": "some other stuff", + "code": "abc", + "slug": None, + }, + "subtier_agency": { + "name": "agency name", + "abbreviation": "some other stuff", + "code": "def", + }, "office_agency_name": "funding_office", }, "recipient": { @@ -531,14 +603,26 @@ def test_award_endpoint_for_null_recipient_information(client, awards_and_transa "date_signed": "2004-03-02", "naics_hierarchy": { "toptier_code": {"description": "Manufacturing", "code": "33"}, - "midtier_code": {"description": "Other General Purpose Machinery Manufacturing", "code": "3339"}, - "base_code": {"description": "PUMP AND PUMPING EQUIPMENT MANUFACTURING", "code": "333911"}, + "midtier_code": { + "description": "Other General Purpose Machinery Manufacturing", + "code": "3339", + }, + "base_code": { + "description": "PUMP AND PUMPING EQUIPMENT MANUFACTURING", + "code": "333911", + }, }, "psc_hierarchy": { "toptier_code": {}, - "midtier_code": {"description": "PIPE, TUBING, HOSE, AND FITTINGS", "code": "47"}, + "midtier_code": { + "description": "PIPE, TUBING, HOSE, AND FITTINGS", + "code": "47", + }, "subtier_code": {}, - "base_code": {"description": "HOSE, PIPE, TUBE, LUBRICATION, AND RAILING FITTINGS", "code": "4730"}, + "base_code": { + "description": "HOSE, PIPE, TUBE, LUBRICATION, AND RAILING FITTINGS", + "code": "4730", + }, }, 
"account_obligations_by_defc": [], "account_outlays_by_defc": [], diff --git a/usaspending_api/search/delta_models/award_search.py b/usaspending_api/search/delta_models/award_search.py index cbc505a19c..8c417ffbed 100644 --- a/usaspending_api/search/delta_models/award_search.py +++ b/usaspending_api/search/delta_models/award_search.py @@ -1,14 +1,30 @@ from usaspending_api.awards.v2.lookups.lookups import award_type_mapping AWARD_SEARCH_COLUMNS = { - "treasury_account_identifiers": {"delta": "ARRAY", "postgres": "INTEGER[]", "gold": False}, - "award_id": {"delta": "LONG NOT NULL", "postgres": "BIGINT NOT NULL", "gold": False}, + "treasury_account_identifiers": { + "delta": "ARRAY", + "postgres": "INTEGER[]", + "gold": False, + }, + "award_id": { + "delta": "LONG NOT NULL", + "postgres": "BIGINT NOT NULL", + "gold": False, + }, "data_source": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "transaction_unique_id": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "latest_transaction_id": {"delta": "LONG", "postgres": "BIGINT", "gold": True}, "earliest_transaction_id": {"delta": "LONG", "postgres": "BIGINT", "gold": True}, - "latest_transaction_search_id": {"delta": "LONG", "postgres": "BIGINT", "gold": True}, - "earliest_transaction_search_id": {"delta": "LONG", "postgres": "BIGINT", "gold": True}, + "latest_transaction_search_id": { + "delta": "LONG", + "postgres": "BIGINT", + "gold": True, + }, + "earliest_transaction_search_id": { + "delta": "LONG", + "postgres": "BIGINT", + "gold": True, + }, "category": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "type_raw": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "type_description_raw": {"delta": "STRING", "postgres": "TEXT", "gold": False}, @@ -16,7 +32,11 @@ "type_description": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "is_fpds": {"delta": "boolean", "postgres": "boolean", "gold": True}, "generated_unique_award_id": {"delta": "STRING", "postgres": "TEXT", "gold": False}, 
- "generated_unique_award_id_legacy": {"delta": "STRING", "postgres": "TEXT", "gold": False}, + "generated_unique_award_id_legacy": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, "display_award_id": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "update_date": {"delta": "TIMESTAMP", "postgres": "TIMESTAMP", "gold": False}, "certified_date": {"delta": "DATE", "postgres": "DATE", "gold": True}, @@ -25,81 +45,297 @@ "fain": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "uri": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "parent_award_piid": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "award_amount": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": False}, - "total_obligation": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": False}, + "award_amount": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": False, + }, + "total_obligation": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": False, + }, "description": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "total_obl_bin": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "total_subsidy_cost": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": False}, - "total_loan_value": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": False}, - "total_funding_amount": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23,2)", "gold": True}, - "total_indirect_federal_sharing": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": True}, - "base_and_all_options_value": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": True}, - "base_exercised_options_val": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": True}, - "non_federal_funding_amount": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": True}, + "total_subsidy_cost": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + 
"gold": False, + }, + "total_loan_value": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": False, + }, + "total_funding_amount": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23,2)", + "gold": True, + }, + "total_indirect_federal_sharing": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": True, + }, + "base_and_all_options_value": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": True, + }, + "base_exercised_options_val": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": True, + }, + "non_federal_funding_amount": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": True, + }, "recipient_hash": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "recipient_levels": {"delta": "ARRAY", "postgres": "TEXT[]", "gold": False}, "recipient_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "raw_recipient_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "recipient_unique_id": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "parent_recipient_unique_id": {"delta": "STRING", "postgres": "TEXT", "gold": False}, + "parent_recipient_unique_id": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, "recipient_uei": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "parent_uei": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "parent_recipient_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "business_categories": {"delta": "ARRAY", "postgres": "TEXT[]", "gold": False}, - "total_subaward_amount": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": True}, + "business_categories": { + "delta": "ARRAY", + "postgres": "TEXT[]", + "gold": False, + }, + "total_subaward_amount": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": True, + }, "subaward_count": {"delta": "INTEGER", "postgres": "INTEGER", "gold": True}, "action_date": {"delta": "DATE", 
"postgres": "DATE", "gold": False}, "fiscal_year": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "last_modified_date": {"delta": "TIMESTAMP", "postgres": "TIMESTAMP", "gold": False}, - "period_of_performance_start_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, - "period_of_performance_current_end_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, + "last_modified_date": { + "delta": "TIMESTAMP", + "postgres": "TIMESTAMP", + "gold": False, + }, + "period_of_performance_start_date": { + "delta": "DATE", + "postgres": "DATE", + "gold": False, + }, + "period_of_performance_current_end_date": { + "delta": "DATE", + "postgres": "DATE", + "gold": False, + }, "date_signed": {"delta": "DATE", "postgres": "DATE", "gold": False}, "ordering_period_end_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, - "original_loan_subsidy_cost": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": False}, - "face_value_loan_guarantee": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": False}, + "original_loan_subsidy_cost": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": False, + }, + "face_value_loan_guarantee": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": False, + }, "awarding_agency_id": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, "funding_agency_id": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "awarding_toptier_agency_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "funding_toptier_agency_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "awarding_subtier_agency_name": {"delta": " STRING", "postgres": " STRING", "gold": False}, - "funding_subtier_agency_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "awarding_toptier_agency_name_raw": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "funding_toptier_agency_name_raw": {"delta": "STRING", "postgres": "TEXT", "gold": 
False}, - "awarding_subtier_agency_name_raw": {"delta": " STRING", "postgres": " STRING", "gold": False}, - "funding_subtier_agency_name_raw": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "awarding_toptier_agency_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "funding_toptier_agency_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "awarding_subtier_agency_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "funding_subtier_agency_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "awarding_toptier_agency_code_raw": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "funding_toptier_agency_code_raw": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "awarding_subtier_agency_code_raw": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "funding_subtier_agency_code_raw": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "funding_toptier_agency_id": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "funding_subtier_agency_id": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, + "awarding_toptier_agency_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "funding_toptier_agency_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "awarding_subtier_agency_name": { + "delta": " STRING", + "postgres": " STRING", + "gold": False, + }, + "funding_subtier_agency_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "awarding_toptier_agency_name_raw": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "funding_toptier_agency_name_raw": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "awarding_subtier_agency_name_raw": { + "delta": " STRING", + "postgres": " STRING", + "gold": False, + }, + "funding_subtier_agency_name_raw": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "awarding_toptier_agency_code": { + "delta": "STRING", + 
"postgres": "TEXT", + "gold": False, + }, + "funding_toptier_agency_code": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "awarding_subtier_agency_code": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "funding_subtier_agency_code": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "awarding_toptier_agency_code_raw": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "funding_toptier_agency_code_raw": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "awarding_subtier_agency_code_raw": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "funding_subtier_agency_code_raw": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "funding_toptier_agency_id": { + "delta": "INTEGER", + "postgres": "INTEGER", + "gold": False, + }, + "funding_subtier_agency_id": { + "delta": "INTEGER", + "postgres": "INTEGER", + "gold": False, + }, "fpds_agency_id": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "fpds_parent_agency_id": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "recipient_location_country_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_country_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_state_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_county_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_county_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_congressional_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_congressional_code_current": {"delta": "STRING", "postgres": "TEXT", "gold": True}, + "recipient_location_country_code": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_country_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + 
"recipient_location_state_code": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_county_code": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_county_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_congressional_code": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_congressional_code_current": { + "delta": "STRING", + "postgres": "TEXT", + "gold": True, + }, "recipient_location_zip5": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_city_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_state_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_state_fips": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_state_population": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "recipient_location_county_population": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "recipient_location_congressional_population": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "recipient_location_county_fips": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_address_line1": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_address_line2": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_address_line3": {"delta": "STRING", "postgres": "TEXT", "gold": False}, + "recipient_location_city_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_state_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_state_fips": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_state_population": { + "delta": "INTEGER", + "postgres": "INTEGER", + "gold": False, + }, + 
"recipient_location_county_population": { + "delta": "INTEGER", + "postgres": "INTEGER", + "gold": False, + }, + "recipient_location_congressional_population": { + "delta": "INTEGER", + "postgres": "INTEGER", + "gold": False, + }, + "recipient_location_county_fips": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_address_line1": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_address_line2": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_address_line3": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, "recipient_location_zip4": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_foreign_postal_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_foreign_province": {"delta": "STRING", "postgres": "TEXT", "gold": False}, + "recipient_location_foreign_postal_code": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_foreign_province": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, "pop_country_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "pop_country_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "pop_state_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, @@ -108,13 +344,21 @@ "pop_city_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "pop_zip5": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "pop_congressional_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "pop_congressional_code_current": {"delta": "STRING", "postgres": "TEXT", "gold": True}, + "pop_congressional_code_current": { + "delta": "STRING", + "postgres": "TEXT", + "gold": True, + }, "pop_city_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "pop_state_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "pop_state_fips": {"delta": "STRING", 
"postgres": "TEXT", "gold": False}, "pop_state_population": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, "pop_county_population": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "pop_congressional_population": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, + "pop_congressional_population": { + "delta": "INTEGER", + "postgres": "INTEGER", + "gold": False, + }, "pop_county_fips": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "pop_zip4": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "cfda_program_title": {"delta": "STRING", "postgres": "TEXT", "gold": False}, @@ -125,34 +369,82 @@ "extent_competed": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "type_set_aside": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "product_or_service_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "product_or_service_description": {"delta": "STRING", "postgres": "TEXT", "gold": False}, + "product_or_service_description": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, "naics_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "naics_description": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "tas_paths": {"delta": "ARRAY", "postgres": "TEXT[]", "gold": False}, "tas_components": {"delta": "ARRAY", "postgres": "TEXT[]", "gold": False}, "federal_accounts": {"delta": "STRING", "postgres": "JSONB", "gold": False}, - "disaster_emergency_fund_codes": {"delta": "ARRAY", "postgres": "TEXT[]", "gold": False}, + "disaster_emergency_fund_codes": { + "delta": "ARRAY", + "postgres": "TEXT[]", + "gold": False, + }, "spending_by_defc": {"delta": "STRING", "postgres": "JSONB", "gold": False}, - "total_covid_outlay": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": False}, - "total_covid_obligation": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": False}, + "total_covid_outlay": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 
2)", + "gold": False, + }, + "total_covid_obligation": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": False, + }, "officer_1_amount": { "delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": True, }, "officer_1_name": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "officer_2_amount": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": True}, + "officer_2_amount": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": True, + }, "officer_2_name": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "officer_3_amount": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": True}, + "officer_3_amount": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": True, + }, "officer_3_name": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "officer_4_amount": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": True}, + "officer_4_amount": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": True, + }, "officer_4_name": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "officer_5_amount": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": True}, + "officer_5_amount": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": True, + }, "officer_5_name": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "total_iija_outlay": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": True}, - "total_iija_obligation": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": True}, - "total_outlays": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": False}, - "generated_pragmatic_obligation": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": False}, + "total_iija_outlay": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": True, + }, + "total_iija_obligation": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", 
+ "gold": True, + }, + "total_outlays": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": False, + }, + "generated_pragmatic_obligation": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": False, + }, "program_activities": {"delta": "STRING", "postgres": "JSONB", "gold": False}, "transaction_count": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, } @@ -163,8 +455,12 @@ **{k: v["delta"] for k, v in AWARD_SEARCH_COLUMNS.items()}, **DELTA_ONLY_COLUMNS, } -AWARD_SEARCH_POSTGRES_COLUMNS = {k: v["postgres"] for k, v in AWARD_SEARCH_COLUMNS.items() if not v["gold"]} -AWARD_SEARCH_POSTGRES_GOLD_COLUMNS = {k: v["gold"] for k, v in AWARD_SEARCH_COLUMNS.items()} +AWARD_SEARCH_POSTGRES_COLUMNS = { + k: v["postgres"] for k, v in AWARD_SEARCH_COLUMNS.items() if not v["gold"] +} +AWARD_SEARCH_POSTGRES_GOLD_COLUMNS = { + k: v["gold"] for k, v in AWARD_SEARCH_COLUMNS.items() +} ALL_AWARD_TYPES = list(award_type_mapping.keys()) diff --git a/usaspending_api/search/delta_models/dataframes/transaction_search.py b/usaspending_api/search/delta_models/dataframes/transaction_search.py index 82327cc397..b59e60a305 100644 --- a/usaspending_api/search/delta_models/dataframes/transaction_search.py +++ b/usaspending_api/search/delta_models/dataframes/transaction_search.py @@ -1,5 +1,6 @@ from delta.tables import DeltaTable -from pyspark.sql import DataFrame, SparkSession, functions as sf, Column +from pyspark.sql import Column, DataFrame, SparkSession +from pyspark.sql import functions as sf from pyspark.sql.types import ( DecimalType, StringType, @@ -10,20 +11,23 @@ from usaspending_api.recipient.v2.lookups import SPECIAL_CASES from usaspending_api.search.delta_models.dataframes.abstract_search import ( AbstractSearch, - hash_col, extract_numbers_as_string, + hash_col, ) ALL_AWARD_TYPES = list(award_type_mapping.keys()) class TransactionSearch(AbstractSearch): - @property def recipient_hash_and_levels(self) -> DataFrame: return ( 
self.recipient_profile.groupBy("recipient_hash", "uei") - .agg(sf.sort_array(sf.collect_set("recipient_level")).alias("recipient_levels")) + .agg( + sf.sort_array(sf.collect_set("recipient_level")).alias( + "recipient_levels" + ) + ) .select( sf.col("recipient_hash").alias("recipient_level_hash"), sf.col("recipient_levels"), @@ -35,23 +39,31 @@ def fed_and_tres_acct(self) -> DataFrame: return ( self.faba.join( self.treasury_appropriation_account, - self.treasury_appropriation_account.treasury_account_identifier == self.faba.treasury_account_id, + self.treasury_appropriation_account.treasury_account_identifier + == self.faba.treasury_account_id, "inner", ) .join( self.federal_account, - self.federal_account.id == self.treasury_appropriation_account.federal_account_id, + self.federal_account.id + == self.treasury_appropriation_account.federal_account_id, "inner", ) .join( self.awarding_toptier_agency, - self.federal_account.parent_toptier_agency_id == self.awarding_toptier_agency.toptier_agency_id, + self.federal_account.parent_toptier_agency_id + == self.awarding_toptier_agency.toptier_agency_id, "inner", ) - .join(self.ref_program_activity, self.faba.program_activity_id == self.ref_program_activity.id, "left") + .join( + self.ref_program_activity, + self.faba.program_activity_id == self.ref_program_activity.id, + "left", + ) .join( self.program_activity_park, - self.faba.program_activity_reporting_key == self.program_activity_park.code, + self.faba.program_activity_reporting_key + == self.program_activity_park.code, "left", ) .filter(self.faba["award_id"].isNotNull()) @@ -74,59 +86,73 @@ def key_cols(self) -> list[Column]: def date_cols(self) -> list[Column]: return [ sf.to_date(self.transaction_normalized.action_date).alias("action_date"), - sf.add_months(sf.to_date(self.transaction_normalized.action_date), 3).alias("fiscal_action_date"), + sf.add_months(sf.to_date(self.transaction_normalized.action_date), 3).alias( + "fiscal_action_date" + ), 
self.transaction_normalized.last_modified_date, self.transaction_normalized.fiscal_year, self.awards.certified_date.alias("award_certified_date"), - sf.year(sf.add_months(sf.to_date(self.awards.certified_date), 3)).alias("award_fiscal_year"), + sf.year(sf.add_months(sf.to_date(self.awards.certified_date), 3)).alias( + "award_fiscal_year" + ), self.transaction_normalized.create_date.cast(TimestampType()), self.transaction_normalized.update_date.cast(TimestampType()), self.awards.update_date.cast(TimestampType()).alias("award_update_date"), sf.to_date(self.awards.date_signed).alias("award_date_signed"), - sf.greatest(sf.to_timestamp(self.transaction_normalized.update_date), self.awards.update_date).alias( - "etl_update_date" - ), - sf.to_date(self.transaction_normalized.period_of_performance_start_date).alias( - "period_of_performance_start_date" - ), - sf.to_date(self.transaction_normalized.period_of_performance_current_end_date).alias( - "period_of_performance_current_end_date" - ), + sf.greatest( + sf.to_timestamp(self.transaction_normalized.update_date), + self.awards.update_date, + ).alias("etl_update_date"), + sf.to_date( + self.transaction_normalized.period_of_performance_start_date + ).alias("period_of_performance_start_date"), + sf.to_date( + self.transaction_normalized.period_of_performance_current_end_date + ).alias("period_of_performance_current_end_date"), sf.coalesce( - self.transaction_fabs.created_at, sf.to_timestamp(self.transaction_fpds.initial_report_date) + self.transaction_fabs.created_at, + sf.to_timestamp(self.transaction_fpds.initial_report_date), ).alias("initial_report_date"), ] @property def agency_cols(self) -> list[Column]: return [ - sf.coalesce(self.transaction_fabs.awarding_agency_code, self.transaction_fpds.awarding_agency_code).alias( - "awarding_agency_code" - ), + sf.coalesce( + self.transaction_fabs.awarding_agency_code, + self.transaction_fpds.awarding_agency_code, + ).alias("awarding_agency_code"), 
self.awarding_toptier_agency.awarding_toptier_agency_name, - sf.coalesce(self.transaction_fabs.awarding_agency_name, self.transaction_fpds.awarding_agency_name).alias( - "awarding_toptier_agency_name_raw" - ), - sf.coalesce(self.transaction_fabs.funding_agency_code, self.transaction_fpds.funding_agency_code).alias( - "funding_agency_code" - ), + sf.coalesce( + self.transaction_fabs.awarding_agency_name, + self.transaction_fpds.awarding_agency_name, + ).alias("awarding_toptier_agency_name_raw"), + sf.coalesce( + self.transaction_fabs.funding_agency_code, + self.transaction_fpds.funding_agency_code, + ).alias("funding_agency_code"), self.funding_toptier_agency.funding_toptier_agency_name, - sf.coalesce(self.transaction_fabs.funding_agency_name, self.transaction_fpds.funding_agency_name).alias( - "funding_toptier_agency_name_raw" - ), sf.coalesce( - self.transaction_fabs.awarding_sub_tier_agency_c, self.transaction_fpds.awarding_sub_tier_agency_c + self.transaction_fabs.funding_agency_name, + self.transaction_fpds.funding_agency_name, + ).alias("funding_toptier_agency_name_raw"), + sf.coalesce( + self.transaction_fabs.awarding_sub_tier_agency_c, + self.transaction_fpds.awarding_sub_tier_agency_c, ).alias("awarding_sub_tier_agency_c"), self.awarding_subtier_agency.awarding_subtier_agency_name, sf.coalesce( - self.transaction_fabs.awarding_sub_tier_agency_n, self.transaction_fpds.awarding_sub_tier_agency_n + self.transaction_fabs.awarding_sub_tier_agency_n, + self.transaction_fpds.awarding_sub_tier_agency_n, ).alias("awarding_subtier_agency_name_raw"), sf.coalesce( - self.transaction_fabs.funding_sub_tier_agency_co, self.transaction_fpds.funding_sub_tier_agency_co + self.transaction_fabs.funding_sub_tier_agency_co, + self.transaction_fpds.funding_sub_tier_agency_co, ).alias("funding_sub_tier_agency_co"), self.funding_subtier_agency.funding_subtier_agency_name, sf.coalesce( - self.transaction_fabs.funding_sub_tier_agency_na, 
self.transaction_fpds.funding_sub_tier_agency_na + self.transaction_fabs.funding_sub_tier_agency_na, + self.transaction_fpds.funding_sub_tier_agency_na, ).alias("funding_subtier_agency_name_raw"), self.awarding_agency_id.awarding_toptier_agency_id, self.funding_agency_id.funding_toptier_agency_id, @@ -136,17 +162,19 @@ def agency_cols(self) -> list[Column]: self.funding_toptier_agency.funding_toptier_agency_abbreviation, self.awarding_subtier_agency.awarding_subtier_agency_abbreviation, self.funding_subtier_agency.funding_subtier_agency_abbreviation, - sf.coalesce(self.transaction_fabs.awarding_office_code, self.transaction_fpds.awarding_office_code).alias( - "awarding_office_code" - ), + sf.coalesce( + self.transaction_fabs.awarding_office_code, + self.transaction_fpds.awarding_office_code, + ).alias("awarding_office_code"), sf.coalesce( self.awarding_office.awarding_office_name, self.transaction_fabs.awarding_office_name, self.transaction_fpds.awarding_office_name, ).alias("awarding_office_name"), - sf.coalesce(self.transaction_fabs.funding_office_code, self.transaction_fpds.funding_office_code).alias( - "funding_office_code" - ), + sf.coalesce( + self.transaction_fabs.funding_office_code, + self.transaction_fpds.funding_office_code, + ).alias("funding_office_code"), sf.coalesce( self.funding_office.funding_office_name, self.transaction_fabs.funding_office_name, @@ -186,7 +214,8 @@ def amounts_cols(self) -> list[Column]: return [ sf.coalesce( sf.when( - self.transaction_normalized["type"].isin(["07", "08"]), self.awards.total_subsidy_cost + self.transaction_normalized["type"].isin(["07", "08"]), + self.awards.total_subsidy_cost, ).otherwise(self.awards.total_obligation), sf.lit(0), ) @@ -201,16 +230,24 @@ def amounts_cols(self) -> list[Column]: ) .cast(DecimalType(23, 2)) .alias("generated_pragmatic_obligation"), - sf.coalesce(self.transaction_normalized.federal_action_obligation, sf.lit(0)) + sf.coalesce( + self.transaction_normalized.federal_action_obligation, 
sf.lit(0) + ) .cast(DecimalType(23, 2)) .alias("federal_action_obligation"), - sf.coalesce(self.transaction_normalized.original_loan_subsidy_cost, sf.lit(0)) + sf.coalesce( + self.transaction_normalized.original_loan_subsidy_cost, sf.lit(0) + ) .cast(DecimalType(23, 2)) .alias("original_loan_subsidy_cost"), - sf.coalesce(self.transaction_normalized.face_value_loan_guarantee, sf.lit(0)) + sf.coalesce( + self.transaction_normalized.face_value_loan_guarantee, sf.lit(0) + ) .cast(DecimalType(23, 2)) .alias("face_value_loan_guarantee"), - self.transaction_normalized.indirect_federal_sharing.cast(DecimalType(23, 2)), + self.transaction_normalized.indirect_federal_sharing.cast( + DecimalType(23, 2) + ), self.transaction_normalized.funding_amount, sf.coalesce(self.transaction_fabs.total_funding_amount, sf.lit("0")) .cast(DecimalType(23, 2)) @@ -223,11 +260,15 @@ def generated_parent_recipient_hash(self) -> Column: return hash_col( sf.when( sf.coalesce( - self.transaction_fpds.ultimate_parent_uei, self.transaction_fabs.ultimate_parent_uei + self.transaction_fpds.ultimate_parent_uei, + self.transaction_fabs.ultimate_parent_uei, ).isNotNull(), sf.concat( sf.lit("uei-"), - sf.coalesce(self.transaction_fpds.ultimate_parent_uei, self.transaction_fabs.ultimate_parent_uei), + sf.coalesce( + self.transaction_fpds.ultimate_parent_uei, + self.transaction_fabs.ultimate_parent_uei, + ), ), ) .when( @@ -263,11 +304,15 @@ def recipient_cols(self) -> list[Column]: hash_col( sf.when( sf.coalesce( - self.transaction_fpds.awardee_or_recipient_uei, self.transaction_fabs.uei + self.transaction_fpds.awardee_or_recipient_uei, + self.transaction_fabs.uei, ).isNotNull(), sf.concat( sf.lit("uei-"), - sf.coalesce(self.transaction_fpds.awardee_or_recipient_uei, self.transaction_fabs.uei), + sf.coalesce( + self.transaction_fpds.awardee_or_recipient_uei, + self.transaction_fabs.uei, + ), ), ) .when( @@ -296,11 +341,13 @@ def recipient_cols(self) -> list[Column]: ), ).alias("recipient_hash"), 
sf.col("recipient_levels"), - sf.coalesce(self.transaction_fpds.awardee_or_recipient_uei, self.transaction_fabs.uei).alias( - "recipient_uei" - ), sf.coalesce( - self.transaction_fpds.awardee_or_recipient_legal, self.transaction_fabs.awardee_or_recipient_legal + self.transaction_fpds.awardee_or_recipient_uei, + self.transaction_fabs.uei, + ).alias("recipient_uei"), + sf.coalesce( + self.transaction_fpds.awardee_or_recipient_legal, + self.transaction_fabs.awardee_or_recipient_legal, ).alias("recipient_name_raw"), sf.upper( sf.coalesce( @@ -310,18 +357,24 @@ def recipient_cols(self) -> list[Column]: ) ).alias("recipient_name"), sf.coalesce( - self.transaction_fpds.awardee_or_recipient_uniqu, self.transaction_fabs.awardee_or_recipient_uniqu + self.transaction_fpds.awardee_or_recipient_uniqu, + self.transaction_fabs.awardee_or_recipient_uniqu, ).alias("recipient_unique_id"), self.parent_recipient.parent_recipient_hash, - sf.coalesce(self.transaction_fpds.ultimate_parent_uei, self.transaction_fabs.ultimate_parent_uei).alias( - "parent_uei" - ), sf.coalesce( - self.transaction_fpds.ultimate_parent_legal_enti, self.transaction_fabs.ultimate_parent_legal_enti + self.transaction_fpds.ultimate_parent_uei, + self.transaction_fabs.ultimate_parent_uei, + ).alias("parent_uei"), + sf.coalesce( + self.transaction_fpds.ultimate_parent_legal_enti, + self.transaction_fabs.ultimate_parent_legal_enti, ).alias("parent_recipient_name_raw"), - sf.upper(self.parent_recipient.parent_recipient_name).alias("parent_recipient_name"), + sf.upper(self.parent_recipient.parent_recipient_name).alias( + "parent_recipient_name" + ), sf.coalesce( - self.transaction_fpds.ultimate_parent_unique_ide, self.transaction_fabs.ultimate_parent_unique_ide + self.transaction_fpds.ultimate_parent_unique_ide, + self.transaction_fabs.ultimate_parent_unique_ide, ).alias("parent_recipient_unique_id"), ] @@ -329,57 +382,72 @@ def recipient_cols(self) -> list[Column]: def recipient_location_cols(self) -> list[Column]: 
return [ sf.coalesce( - self.transaction_fpds.legal_entity_country_code, self.transaction_fabs.legal_entity_country_code + self.transaction_fpds.legal_entity_country_code, + self.transaction_fabs.legal_entity_country_code, ).alias("recipient_location_country_code"), sf.coalesce( - self.transaction_fpds.legal_entity_country_name, self.transaction_fabs.legal_entity_country_name + self.transaction_fpds.legal_entity_country_name, + self.transaction_fabs.legal_entity_country_name, ).alias("recipient_location_country_name"), sf.coalesce( - self.transaction_fpds.legal_entity_state_code, self.transaction_fabs.legal_entity_state_code + self.transaction_fpds.legal_entity_state_code, + self.transaction_fabs.legal_entity_state_code, ).alias("recipient_location_state_code"), sf.coalesce( - self.transaction_fpds.legal_entity_state_descrip, self.transaction_fabs.legal_entity_state_name + self.transaction_fpds.legal_entity_state_descrip, + self.transaction_fabs.legal_entity_state_name, ).alias("recipient_location_state_name"), sf.col("recipient_location_state_fips"), self.rl_state_population.recipient_location_state_population, extract_numbers_as_string( sf.coalesce( - self.transaction_fpds.legal_entity_county_code, self.transaction_fabs.legal_entity_county_code + self.transaction_fpds.legal_entity_county_code, + self.transaction_fabs.legal_entity_county_code, ), 3, ).alias("recipient_location_county_code"), sf.coalesce( - self.transaction_fpds.legal_entity_county_name, self.transaction_fabs.legal_entity_county_name + self.transaction_fpds.legal_entity_county_name, + self.transaction_fabs.legal_entity_county_name, ).alias("recipient_location_county_name"), self.rl_county_population.recipient_location_county_population, extract_numbers_as_string( sf.coalesce( - self.transaction_fpds.legal_entity_congressional, self.transaction_fabs.legal_entity_congressional + self.transaction_fpds.legal_entity_congressional, + self.transaction_fabs.legal_entity_congressional, ) 
).alias("recipient_location_congressional_code"), self.rl_district_population.recipient_location_congressional_population, self.current_cd.recipient_location_congressional_code_current.alias( "recipient_location_congressional_code_current" ), - sf.coalesce(self.transaction_fpds.legal_entity_zip5, self.transaction_fabs.legal_entity_zip5).alias( - "recipient_location_zip5" - ), + sf.coalesce( + self.transaction_fpds.legal_entity_zip5, + self.transaction_fabs.legal_entity_zip5, + ).alias("recipient_location_zip5"), self.transaction_fpds.legal_entity_zip4, sf.coalesce( - self.transaction_fpds.legal_entity_zip_last4, self.transaction_fabs.legal_entity_zip_last4 + self.transaction_fpds.legal_entity_zip_last4, + self.transaction_fabs.legal_entity_zip_last4, ).alias("legal_entity_zip_last4"), self.transaction_fabs.legal_entity_city_code, sf.rtrim( - sf.coalesce(self.transaction_fpds.legal_entity_city_name, self.transaction_fabs.legal_entity_city_name) + sf.coalesce( + self.transaction_fpds.legal_entity_city_name, + self.transaction_fabs.legal_entity_city_name, + ) ).alias("recipient_location_city_name"), sf.coalesce( - self.transaction_fpds.legal_entity_address_line1, self.transaction_fabs.legal_entity_address_line1 + self.transaction_fpds.legal_entity_address_line1, + self.transaction_fabs.legal_entity_address_line1, ).alias("legal_entity_address_line1"), sf.coalesce( - self.transaction_fpds.legal_entity_address_line2, self.transaction_fabs.legal_entity_address_line2 + self.transaction_fpds.legal_entity_address_line2, + self.transaction_fabs.legal_entity_address_line2, ).alias("legal_entity_address_line2"), sf.coalesce( - self.transaction_fpds.legal_entity_address_line3, self.transaction_fabs.legal_entity_address_line3 + self.transaction_fpds.legal_entity_address_line3, + self.transaction_fabs.legal_entity_address_line3, ).alias("legal_entity_address_line3"), self.transaction_fabs.legal_entity_foreign_city, self.transaction_fabs.legal_entity_foreign_descr, @@ -388,7 
+456,8 @@ def recipient_location_cols(self) -> list[Column]: sf.concat( sf.col("recipient_location_state_fips"), sf.coalesce( - self.transaction_fpds.legal_entity_county_code, self.transaction_fabs.legal_entity_county_code + self.transaction_fpds.legal_entity_county_code, + self.transaction_fabs.legal_entity_county_code, ), ).alias("recipient_location_county_fips"), ] @@ -399,55 +468,67 @@ def place_of_performance_cols(self) -> list[Column]: self.transaction_fabs.place_of_performance_code, self.transaction_fabs.place_of_performance_scope, sf.coalesce( - self.transaction_fpds.place_of_perform_country_c, self.transaction_fabs.place_of_perform_country_c + self.transaction_fpds.place_of_perform_country_c, + self.transaction_fabs.place_of_perform_country_c, ).alias("pop_country_code"), sf.coalesce( - self.transaction_fpds.place_of_perf_country_desc, self.transaction_fabs.place_of_perform_country_n + self.transaction_fpds.place_of_perf_country_desc, + self.transaction_fabs.place_of_perform_country_n, ).alias("pop_country_name"), sf.coalesce( - self.transaction_fpds.place_of_performance_state, self.transaction_fabs.place_of_perfor_state_code + self.transaction_fpds.place_of_performance_state, + self.transaction_fabs.place_of_perfor_state_code, ).alias("pop_state_code"), sf.coalesce( - self.transaction_fpds.place_of_perfor_state_desc, self.transaction_fabs.place_of_perform_state_nam + self.transaction_fpds.place_of_perfor_state_desc, + self.transaction_fabs.place_of_perform_state_nam, ).alias("pop_state_name"), sf.col("pop_state_fips"), self.pop_state_population.pop_state_population, extract_numbers_as_string( sf.coalesce( - self.transaction_fpds.place_of_perform_county_co, self.transaction_fabs.place_of_perform_county_co + self.transaction_fpds.place_of_perform_county_co, + self.transaction_fabs.place_of_perform_county_co, ), 3, ).alias("pop_county_code"), sf.coalesce( - self.transaction_fpds.place_of_perform_county_na, self.transaction_fabs.place_of_perform_county_na + 
self.transaction_fpds.place_of_perform_county_na, + self.transaction_fabs.place_of_perform_county_na, ).alias("pop_county_name"), self.pop_county_population.pop_county_population, extract_numbers_as_string( sf.coalesce( - self.transaction_fpds.place_of_performance_congr, self.transaction_fabs.place_of_performance_congr + self.transaction_fpds.place_of_performance_congr, + self.transaction_fabs.place_of_performance_congr, ) ).alias("pop_congressional_code"), self.pop_district_population.pop_congressional_population, self.current_cd.pop_congressional_code_current, sf.coalesce( - self.transaction_fpds.place_of_performance_zip5, self.transaction_fabs.place_of_performance_zip5 + self.transaction_fpds.place_of_performance_zip5, + self.transaction_fabs.place_of_performance_zip5, ).alias("pop_zip5"), sf.coalesce( - self.transaction_fpds.place_of_performance_zip4a, self.transaction_fabs.place_of_performance_zip4a + self.transaction_fpds.place_of_performance_zip4a, + self.transaction_fabs.place_of_performance_zip4a, ).alias("place_of_performance_zip4a"), sf.coalesce( - self.transaction_fpds.place_of_perform_zip_last4, self.transaction_fabs.place_of_perform_zip_last4 + self.transaction_fpds.place_of_perform_zip_last4, + self.transaction_fabs.place_of_perform_zip_last4, ).alias("place_of_perform_zip_last4"), sf.rtrim( sf.coalesce( - self.transaction_fpds.place_of_perform_city_name, self.transaction_fabs.place_of_performance_city + self.transaction_fpds.place_of_perform_city_name, + self.transaction_fabs.place_of_performance_city, ) ).alias("pop_city_name"), self.transaction_fabs.place_of_performance_forei, sf.concat( sf.col("pop_state_fips"), sf.coalesce( - self.transaction_fpds.place_of_perform_county_co, self.transaction_fabs.place_of_perform_county_co + self.transaction_fpds.place_of_perform_county_co, + self.transaction_fabs.place_of_perform_county_co, ), ).alias("pop_county_fips"), ] @@ -465,36 +546,46 @@ def accounts_cols(self) -> list[Column]: @property def 
officer_amounts_cols(self) -> list[Column]: return [ - sf.coalesce(self.transaction_fabs.officer_1_name, self.transaction_fpds.officer_1_name).alias( - "officer_1_name" - ), - sf.coalesce(self.transaction_fabs.officer_1_amount, self.transaction_fpds.officer_1_amount).alias( - "officer_1_amount" - ), - sf.coalesce(self.transaction_fabs.officer_2_name, self.transaction_fpds.officer_2_name).alias( - "officer_2_name" - ), - sf.coalesce(self.transaction_fabs.officer_2_amount, self.transaction_fpds.officer_2_amount).alias( - "officer_2_amount" - ), - sf.coalesce(self.transaction_fabs.officer_3_name, self.transaction_fpds.officer_3_name).alias( - "officer_3_name" - ), - sf.coalesce(self.transaction_fabs.officer_3_amount, self.transaction_fpds.officer_3_amount).alias( - "officer_3_amount" - ), - sf.coalesce(self.transaction_fabs.officer_4_name, self.transaction_fpds.officer_4_name).alias( - "officer_4_name" - ), - sf.coalesce(self.transaction_fabs.officer_4_amount, self.transaction_fpds.officer_4_amount).alias( - "officer_4_amount" - ), - sf.coalesce(self.transaction_fabs.officer_5_name, self.transaction_fpds.officer_5_name).alias( - "officer_5_name" - ), - sf.coalesce(self.transaction_fabs.officer_5_amount, self.transaction_fpds.officer_5_amount).alias( - "officer_5_amount" - ), + sf.coalesce( + self.transaction_fabs.officer_1_name, + self.transaction_fpds.officer_1_name, + ).alias("officer_1_name"), + sf.coalesce( + self.transaction_fabs.officer_1_amount, + self.transaction_fpds.officer_1_amount, + ).alias("officer_1_amount"), + sf.coalesce( + self.transaction_fabs.officer_2_name, + self.transaction_fpds.officer_2_name, + ).alias("officer_2_name"), + sf.coalesce( + self.transaction_fabs.officer_2_amount, + self.transaction_fpds.officer_2_amount, + ).alias("officer_2_amount"), + sf.coalesce( + self.transaction_fabs.officer_3_name, + self.transaction_fpds.officer_3_name, + ).alias("officer_3_name"), + sf.coalesce( + self.transaction_fabs.officer_3_amount, + 
self.transaction_fpds.officer_3_amount, + ).alias("officer_3_amount"), + sf.coalesce( + self.transaction_fabs.officer_4_name, + self.transaction_fpds.officer_4_name, + ).alias("officer_4_name"), + sf.coalesce( + self.transaction_fabs.officer_4_amount, + self.transaction_fpds.officer_4_amount, + ).alias("officer_4_amount"), + sf.coalesce( + self.transaction_fabs.officer_5_name, + self.transaction_fpds.officer_5_name, + ).alias("officer_5_name"), + sf.coalesce( + self.transaction_fabs.officer_5_amount, + self.transaction_fpds.officer_5_amount, + ).alias("officer_5_amount"), ] @property @@ -675,7 +766,9 @@ def fpds_cols(self) -> list[Column]: self.transaction_fpds.price_evaluation_adjustmen, self.transaction_fpds.private_university_or_coll, self.transaction_fpds.product_or_service_code, - self.transaction_fpds.product_or_service_co_desc.alias("product_or_service_description"), + self.transaction_fpds.product_or_service_co_desc.alias( + "product_or_service_description" + ), self.transaction_fpds.program_acronym, self.transaction_fpds.program_system_or_equ_desc, self.transaction_fpds.program_system_or_equipmen, @@ -764,7 +857,8 @@ def dataframe(self) -> DataFrame: ) .join( self.references_cfda, - self.transaction_fabs.cfda_number == self.references_cfda.program_number, + self.transaction_fabs.cfda_number + == self.references_cfda.program_number, "leftouter", ) .join( @@ -772,26 +866,36 @@ def dataframe(self) -> DataFrame: self.recipient_lookup.recipient_hash == self.generated_recipient_hash, "leftouter", ) - .join(self.awards, self.transaction_normalized.award_id == self.awards.id, "leftouter") + .join( + self.awards, + self.transaction_normalized.award_id == self.awards.id, + "leftouter", + ) .join( self.awarding_agency, - self.transaction_normalized.awarding_agency_id == self.awarding_agency.id, + self.transaction_normalized.awarding_agency_id + == self.awarding_agency.id, "leftouter", ) .join( self.awarding_toptier_agency, - self.awarding_agency.toptier_agency_id == 
self.awarding_toptier_agency.toptier_agency_id, + self.awarding_agency.toptier_agency_id + == self.awarding_toptier_agency.toptier_agency_id, "leftouter", ) .join( self.awarding_subtier_agency, - self.awarding_agency.subtier_agency_id == self.awarding_subtier_agency.subtier_agency_id, + self.awarding_agency.subtier_agency_id + == self.awarding_subtier_agency.subtier_agency_id, "leftouter", ) .join( self.awarding_agency_id, ( - (self.awarding_agency_id.toptier_agency_id == self.awarding_toptier_agency.toptier_agency_id) + ( + self.awarding_agency_id.toptier_agency_id + == self.awarding_toptier_agency.toptier_agency_id + ) & self.awarding_agency_id.toptier_flag ), "leftouter", @@ -803,23 +907,29 @@ def dataframe(self) -> DataFrame: ) .join( self.funding_toptier_agency, - self.funding_agency.funding_toptier_agency_id == self.funding_toptier_agency.toptier_agency_id, + self.funding_agency.funding_toptier_agency_id + == self.funding_toptier_agency.toptier_agency_id, "leftouter", ) .join( self.funding_subtier_agency, - self.funding_agency.funding_subtier_agency_id == self.funding_subtier_agency.subtier_agency_id, + self.funding_agency.funding_subtier_agency_id + == self.funding_subtier_agency.subtier_agency_id, "leftouter", ) .join( self.funding_agency_id, - (self.funding_agency_id.toptier_agency_id == self.funding_toptier_agency.funding_toptier_agency_id) + ( + self.funding_agency_id.toptier_agency_id + == self.funding_toptier_agency.funding_toptier_agency_id + ) & (self.funding_agency_id.row_num == 1), "leftouter", ) .join( self.parent_recipient, - self.parent_recipient.parent_recipient_hash == self.generated_parent_recipient_hash, + self.parent_recipient.parent_recipient_hash + == self.generated_parent_recipient_hash, "leftouter", ) .join( @@ -832,18 +942,26 @@ def dataframe(self) -> DataFrame: df_with_location = self.join_location_data(df) return ( df_with_location.join( - self.current_cd, self.transaction_normalized.id == self.current_cd.transaction_id, "leftouter" 
+ self.current_cd, + self.transaction_normalized.id == self.current_cd.transaction_id, + "leftouter", ) .join( self.awarding_office, self.awarding_office.office_code - == sf.coalesce(self.transaction_fabs.awarding_office_code, self.transaction_fpds.awarding_office_code), + == sf.coalesce( + self.transaction_fabs.awarding_office_code, + self.transaction_fpds.awarding_office_code, + ), "leftouter", ) .join( self.funding_office, self.funding_office.office_code - == sf.coalesce(self.transaction_fabs.funding_office_code, self.transaction_fpds.funding_office_code), + == sf.coalesce( + self.transaction_fabs.funding_office_code, + self.transaction_fpds.funding_office_code, + ), "leftouter", ) .join( @@ -869,7 +987,9 @@ def dataframe(self) -> DataFrame: ) -def load_transaction_search(spark: SparkSession, destination_database: str, destination_table_name: str) -> None: +def load_transaction_search( + spark: SparkSession, destination_database: str, destination_table_name: str +) -> None: df = TransactionSearch(spark).dataframe df.write.saveAsTable( f"{destination_database}.{destination_table_name}", @@ -881,10 +1001,15 @@ def load_transaction_search(spark: SparkSession, destination_database: str, dest def load_transaction_search_incremental( spark: SparkSession, destination_database: str, destination_table_name: str ) -> None: - target = DeltaTable.forName(spark, f"{destination_database}.{destination_table_name}").alias("t") + target = DeltaTable.forName( + spark, f"{destination_database}.{destination_table_name}" + ).alias("t") source = TransactionSearch(spark).dataframe.alias("s") ( - target.merge(source, "s.transaction_id = t.transaction_id and s.merge_hash_key = t.merge_hash_key") + target.merge( + source, + "s.transaction_id = t.transaction_id and s.merge_hash_key = t.merge_hash_key", + ) .whenNotMatchedInsertAll() .whenNotMatchedBySourceDelete() .execute() diff --git a/usaspending_api/search/migrations/0059_alter_transactionsearch_initial_report_date_and_more.py 
b/usaspending_api/search/migrations/0059_alter_transactionsearch_initial_report_date_and_more.py index e9846e9b84..c023c9fca9 100644 --- a/usaspending_api/search/migrations/0059_alter_transactionsearch_initial_report_date_and_more.py +++ b/usaspending_api/search/migrations/0059_alter_transactionsearch_initial_report_date_and_more.py @@ -1,6 +1,6 @@ # Generated by Django 4.2.23 on 2026-01-02 16:42 -from django.db import migrations, models +from django.db import migrations class Migration(migrations.Migration): diff --git a/usaspending_api/search/migrations/0060_alter_initial_report_date_andmore.py b/usaspending_api/search/migrations/0060_alter_initial_report_date_andmore.py index c279029034..4badf0d9ff 100644 --- a/usaspending_api/search/migrations/0060_alter_initial_report_date_andmore.py +++ b/usaspending_api/search/migrations/0060_alter_initial_report_date_andmore.py @@ -1,12 +1,17 @@ # Generated by Django 4.2.23 on 2026-01-02 16:42 from django.db import migrations, models + from usaspending_api.awards.models.award import vw_awards_sql -from usaspending_api.awards.models.transaction_normalized import vw_transaction_normalized_sql -from usaspending_api.awards.models.transaction_fpds import vw_transaction_fpds_sql from usaspending_api.awards.models.transaction_fabs import vw_transaction_fabs_sql +from usaspending_api.awards.models.transaction_fpds import vw_transaction_fpds_sql +from usaspending_api.awards.models.transaction_normalized import ( + vw_transaction_normalized_sql, +) -transaction_delta_view_file = "usaspending_api/database_scripts/etl/transaction_delta_view.sql" +transaction_delta_view_file = ( + "usaspending_api/database_scripts/etl/transaction_delta_view.sql" +) with open(transaction_delta_view_file, "r") as f: transaction_delta_view = f.read() diff --git a/usaspending_api/search/models/award_search.py b/usaspending_api/search/models/award_search.py index 8e863df081..5a6d8ddbc8 100644 --- a/usaspending_api/search/models/award_search.py +++ 
b/usaspending_api/search/models/award_search.py @@ -9,8 +9,12 @@ class AwardSearch(models.Model): - treasury_account_identifiers = ArrayField(models.IntegerField(), default=list, null=True) - award = models.OneToOneField(Award, on_delete=models.DO_NOTHING, primary_key=True, related_name="%(class)s") + treasury_account_identifiers = ArrayField( + models.IntegerField(), default=list, null=True + ) + award = models.OneToOneField( + Award, on_delete=models.DO_NOTHING, primary_key=True, related_name="%(class)s" + ) category = models.TextField(null=True, db_index=True) type_raw = models.TextField(null=True, db_index=True) type_description_raw = models.TextField(null=True) @@ -18,19 +22,31 @@ class AwardSearch(models.Model): type_description = models.TextField(null=True) generated_unique_award_id = models.TextField(null=False, unique=True) generated_unique_award_id_legacy = models.TextField( - null=True, unique=True, help_text="Legacy generated unique award ID built using subtier awarding agency code" + null=True, + unique=True, + help_text="Legacy generated unique award ID built using subtier awarding agency code", ) display_award_id = models.TextField(null=True) update_date = models.DateTimeField(auto_now=True, null=True) piid = models.TextField(null=True, db_index=True) fain = models.TextField(null=True, db_index=True) uri = models.TextField(null=True, db_index=True) - award_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) - total_obligation = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True, db_index=True) - total_outlays = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True, db_index=True) + award_amount = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) + total_obligation = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True, db_index=True + ) + total_outlays = models.DecimalField( + max_digits=23, decimal_places=2, 
blank=True, null=True, db_index=True + ) description = models.TextField(null=True) - total_subsidy_cost = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) - total_loan_value = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) + total_subsidy_cost = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) + total_loan_value = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) total_obl_bin = models.TextField(null=True) recipient_hash = models.UUIDField(null=True) @@ -52,8 +68,12 @@ class AwardSearch(models.Model): date_signed = models.DateField(null=True) ordering_period_end_date = models.DateField(null=True) - original_loan_subsidy_cost = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) - face_value_loan_guarantee = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) + original_loan_subsidy_cost = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) + face_value_loan_guarantee = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) awarding_agency_id = models.IntegerField(null=True, db_index=True) funding_agency_id = models.IntegerField(null=True, db_index=True) @@ -137,34 +157,64 @@ class AwardSearch(models.Model): tas_paths = ArrayField(models.TextField(), default=list, null=True) tas_components = ArrayField(models.TextField(), default=list, null=True) - disaster_emergency_fund_codes = ArrayField(models.TextField(), default=list, null=True) + disaster_emergency_fund_codes = ArrayField( + models.TextField(), default=list, null=True + ) spending_by_defc = models.JSONField(null=True) - total_covid_outlay = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) - total_covid_obligation = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) - total_iija_outlay = models.DecimalField(max_digits=23, 
decimal_places=2, blank=True, null=True) - total_iija_obligation = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) - officer_1_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) + total_covid_outlay = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) + total_covid_obligation = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) + total_iija_outlay = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) + total_iija_obligation = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) + officer_1_amount = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) officer_1_name = models.TextField(null=True) - officer_2_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) + officer_2_amount = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) officer_2_name = models.TextField(null=True) - officer_3_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) + officer_3_amount = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) officer_3_name = models.TextField(null=True) - officer_4_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) + officer_4_amount = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) officer_4_name = models.TextField(null=True) - officer_5_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) + officer_5_amount = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) officer_5_name = models.TextField(null=True) is_fpds = models.BooleanField(default=False) fpds_agency_id = models.TextField(null=True) fpds_parent_agency_id = models.TextField(null=True) - base_and_all_options_value = 
models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) - non_federal_funding_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) - total_subaward_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) + base_and_all_options_value = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) + non_federal_funding_amount = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) + total_subaward_amount = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) subaward_count = models.IntegerField(null=True) - base_exercised_options_val = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) + base_exercised_options_val = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) parent_award_piid = models.TextField(null=True, db_index=True) certified_date = models.DateField(blank=True, null=True) create_date = models.DateTimeField(null=True, auto_now_add=True) - total_funding_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) + total_funding_amount = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) latest_transaction = models.ForeignKey( "awards.TransactionNormalized", on_delete=models.DO_NOTHING, @@ -199,11 +249,15 @@ class AwardSearch(models.Model): "award", db_constraint=False, ) - total_indirect_federal_sharing = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) + total_indirect_federal_sharing = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) transaction_unique_id = models.TextField(null=True) raw_recipient_name = models.TextField(null=True) data_source = models.TextField(null=True) - generated_pragmatic_obligation = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) + generated_pragmatic_obligation = 
models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) program_activities = models.JSONField(null=True) transaction_count = models.IntegerField(null=True) @@ -219,12 +273,15 @@ class Meta: ] indexes = [ models.Index( - fields=["recipient_hash"], name="as_idx_recipient_hash", condition=Q(action_date__gte="2007-10-01") + fields=["recipient_hash"], + name="as_idx_recipient_hash", + condition=Q(action_date__gte="2007-10-01"), ), models.Index( fields=["recipient_unique_id"], name="as_idx_recipient_unique_id", - condition=Q(recipient_unique_id__isnull=False) & Q(action_date__gte="2007-10-01"), + condition=Q(recipient_unique_id__isnull=False) + & Q(action_date__gte="2007-10-01"), ), models.Index( F("action_date").desc(nulls_last=True), @@ -258,8 +315,12 @@ class Meta: condition=Q(action_date__lt="2007-10-01"), ), models.Index(Upper("piid"), name="as_idx_piid_upper"), - models.Index(Upper("parent_award_piid"), name="as_idx_parent_award_piid_upper"), + models.Index( + Upper("parent_award_piid"), name="as_idx_parent_award_piid_upper" + ), models.Index(Upper("fain"), name="as_idx_fain_upper"), models.Index(Upper("uri"), name="as_idx_uri_upper"), - models.Index(F("update_date").desc(nulls_last=True), name="as_idx_update_date_desc"), + models.Index( + F("update_date").desc(nulls_last=True), name="as_idx_update_date_desc" + ), ] diff --git a/usaspending_api/search/models/transaction_search.py b/usaspending_api/search/models/transaction_search.py index 33a110700d..7b9c230e15 100644 --- a/usaspending_api/search/models/transaction_search.py +++ b/usaspending_api/search/models/transaction_search.py @@ -18,8 +18,12 @@ class TransactionSearch(models.Model): # Also, this table has been physically partitioned by partition key: is_fpds. We can no longer have a UNIQUE key # or UNIQUE INDEX on transaction_id (the primary_key) anymore, it must include the partition key. 
So setting # primary_key=False and adding a UniqueConstraint (is_fpds, transaction) - transaction = models.OneToOneField("awards.TransactionNormalized", on_delete=models.DO_NOTHING, primary_key=True) - award = models.ForeignKey("search.AwardSearch", on_delete=models.DO_NOTHING, null=True) + transaction = models.OneToOneField( + "awards.TransactionNormalized", on_delete=models.DO_NOTHING, primary_key=True + ) + award = models.ForeignKey( + "search.AwardSearch", on_delete=models.DO_NOTHING, null=True + ) transaction_unique_id = models.TextField(blank=False, null=False, default="NONE") usaspending_unique_transaction_id = models.TextField(null=True) modification_number = models.TextField(null=True) @@ -80,15 +84,31 @@ class TransactionSearch(models.Model): business_categories = ArrayField(models.TextField(), null=True) # Amounts - award_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) - generated_pragmatic_obligation = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) - federal_action_obligation = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) - original_loan_subsidy_cost = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) - face_value_loan_guarantee = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) + award_amount = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) + generated_pragmatic_obligation = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) + federal_action_obligation = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) + original_loan_subsidy_cost = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) + face_value_loan_guarantee = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) indirect_federal_sharing = NumericField(blank=True, null=True) - funding_amount = 
models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) - total_funding_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) - non_federal_funding_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) + funding_amount = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) + total_funding_amount = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) + non_federal_funding_amount = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) # Recipient recipient_hash = models.UUIDField(null=True) @@ -161,15 +181,25 @@ class TransactionSearch(models.Model): # Officer Amounts officer_1_name = models.TextField(null=True) - officer_1_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) + officer_1_amount = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) officer_2_name = models.TextField(null=True) - officer_2_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) + officer_2_amount = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) officer_3_name = models.TextField(null=True) - officer_3_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) + officer_3_amount = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) officer_4_name = models.TextField(null=True) - officer_4_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) + officer_4_amount = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) officer_5_name = models.TextField(null=True) - officer_5_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) + officer_5_amount = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) # Exclusively FABS 
published_fabs_id = models.IntegerField(blank=True, null=True) @@ -417,7 +447,11 @@ class TransactionSearch(models.Model): class Meta: db_table = "transaction_search" - constraints = [models.UniqueConstraint(fields=["is_fpds", "transaction"], name="ts_idx_is_fpds_transaction_id")] + constraints = [ + models.UniqueConstraint( + fields=["is_fpds", "transaction"], name="ts_idx_is_fpds_transaction_id" + ) + ] indexes = [ models.Index(fields=["transaction"], name="ts_idx_transaction_id"), models.Index(fields=["generated_unique_award_id"], name="ts_idx_award_key"), @@ -431,26 +465,50 @@ class Meta: name="ts_idx_fpds_key_pre2008", condition=Q(action_date__lt="2007-10-01"), ), - models.Index(fields=["piid"], name="ts_idx_piid_pre2008", condition=Q(action_date__lt="2007-10-01")), + models.Index( + fields=["piid"], + name="ts_idx_piid_pre2008", + condition=Q(action_date__lt="2007-10-01"), + ), models.Index( fields=["parent_award_id"], name="ts_idx_parent_award_id_pre2008", condition=Q(action_date__lt="2007-10-01"), ), - models.Index(fields=["fain"], name="ts_idx_fain_pre2008", condition=Q(action_date__lt="2007-10-01")), - models.Index(fields=["uri"], name="ts_idx_uri_pre2008", condition=Q(action_date__lt="2007-10-01")), + models.Index( + fields=["fain"], + name="ts_idx_fain_pre2008", + condition=Q(action_date__lt="2007-10-01"), + ), + models.Index( + fields=["uri"], + name="ts_idx_uri_pre2008", + condition=Q(action_date__lt="2007-10-01"), + ), models.Index(fields=["is_fpds"], name="ts_idx_is_fpds"), models.Index( - fields=["-action_date"], name="ts_idx_action_date", condition=Q(action_date__gte="2007-10-01") + fields=["-action_date"], + name="ts_idx_action_date", + condition=Q(action_date__gte="2007-10-01"), ), - models.Index(fields=["-last_modified_date"], name="ts_idx_last_modified_date"), models.Index( - fields=["-fiscal_year"], name="ts_idx_fiscal_year", condition=Q(action_date__gte="2007-10-01") + fields=["-last_modified_date"], name="ts_idx_last_modified_date" ), 
models.Index( - fields=["type"], name="ts_idx_type", condition=Q(type__isnull=False) & Q(action_date__gte="2007-10-01") + fields=["-fiscal_year"], + name="ts_idx_fiscal_year", + condition=Q(action_date__gte="2007-10-01"), + ), + models.Index( + fields=["type"], + name="ts_idx_type", + condition=Q(type__isnull=False) & Q(action_date__gte="2007-10-01"), + ), + models.Index( + fields=["award"], + name="ts_idx_award_id", + condition=Q(action_date__gte="2007-10-01"), ), - models.Index(fields=["award"], name="ts_idx_award_id", condition=Q(action_date__gte="2007-10-01")), models.Index( fields=["pop_zip5"], name="ts_idx_pop_zip5", @@ -459,12 +517,14 @@ class Meta: models.Index( fields=["recipient_unique_id"], name="ts_idx_recipient_unique_id", - condition=Q(recipient_unique_id__isnull=False) & Q(action_date__gte="2007-10-01"), + condition=Q(recipient_unique_id__isnull=False) + & Q(action_date__gte="2007-10-01"), ), models.Index( fields=["parent_recipient_unique_id"], name="ts_idx_parent_recipient_unique", - condition=Q(parent_recipient_unique_id__isnull=False) & Q(action_date__gte="2007-10-01"), + condition=Q(parent_recipient_unique_id__isnull=False) + & Q(action_date__gte="2007-10-01"), ), models.Index( fields=["pop_state_code", "action_date"], @@ -474,10 +534,14 @@ class Meta: & Q(action_date__gte="2007-10-01"), ), models.Index( - fields=["recipient_hash"], name="ts_idx_recipient_hash", condition=Q(action_date__gte="2007-10-01") + fields=["recipient_hash"], + name="ts_idx_recipient_hash", + condition=Q(action_date__gte="2007-10-01"), ), models.Index( - fields=["action_date"], name="ts_idx_action_date_pre2008", condition=Q(action_date__lt="2007-10-01") + fields=["action_date"], + name="ts_idx_action_date_pre2008", + condition=Q(action_date__lt="2007-10-01"), ), models.Index(fields=["etl_update_date"], name="ts_idx_etl_update_date"), models.Index( @@ -485,12 +549,20 @@ class Meta: name="ts_idx_tocp_pre2008", condition=Q(action_date__lt="2007-10-01"), ), - 
models.Index(fields=["naics_code"], name="ts_idx_naics_pre2008", condition=Q(action_date__lt="2007-10-01")), models.Index( - fields=["extent_competed"], name="ts_idx_ext_com_pre2008", condition=Q(action_date__lt="2007-10-01") + fields=["naics_code"], + name="ts_idx_naics_pre2008", + condition=Q(action_date__lt="2007-10-01"), + ), + models.Index( + fields=["extent_competed"], + name="ts_idx_ext_com_pre2008", + condition=Q(action_date__lt="2007-10-01"), ), models.Index( - fields=["product_or_service_code"], name="ts_idx_psc_pre2008", condition=Q(action_date__lt="2007-10-01") + fields=["product_or_service_code"], + name="ts_idx_psc_pre2008", + condition=Q(action_date__lt="2007-10-01"), ), models.Index( fields=["type_set_aside"], @@ -498,8 +570,12 @@ class Meta: condition=Q(action_date__lt="2007-10-01"), ), models.Index( - fields=["cfda_number"], name="ts_idx_cfda_aside_pre2008", condition=Q(action_date__lt="2007-10-01") + fields=["cfda_number"], + name="ts_idx_cfda_aside_pre2008", + condition=Q(action_date__lt="2007-10-01"), + ), + models.Index( + fields=["awarding_agency_id"], name="ts_idx_awarding_agency_id" ), - models.Index(fields=["awarding_agency_id"], name="ts_idx_awarding_agency_id"), models.Index(fields=["funding_agency_id"], name="ts_idx_funding_agency_id"), ] diff --git a/usaspending_api/tests/conftest_spark.py b/usaspending_api/tests/conftest_spark.py index 6308f54ba4..60e2a87a9e 100644 --- a/usaspending_api/tests/conftest_spark.py +++ b/usaspending_api/tests/conftest_spark.py @@ -11,6 +11,7 @@ from django.db import connections from model_bakery import baker from psycopg2.extensions import AsIs + from usaspending_api import settings from usaspending_api.common.etl.spark import create_ref_temp_views from usaspending_api.common.helpers.spark_helpers import ( @@ -21,7 +22,10 @@ from usaspending_api.common.spark.configs import LOCAL_BASIC_EXTRA_CONF from usaspending_api.config import CONFIG from usaspending_api.etl.award_helpers import update_awards -from 
usaspending_api.etl.management.commands.create_delta_table import LOAD_QUERY_TABLE_SPEC, LOAD_TABLE_TABLE_SPEC +from usaspending_api.etl.management.commands.create_delta_table import ( + LOAD_QUERY_TABLE_SPEC, + LOAD_TABLE_TABLE_SPEC, +) if TYPE_CHECKING: from pyspark.sql import SparkSession @@ -59,7 +63,7 @@ def s3_unittest_data_bucket_setup_and_teardown(worker_id: str) -> str: unittest_data_bucket = "unittest-data-{}".format(worker_prefix + str(uuid.uuid4())) logging.warning( - f"Attempting to create unit test data bucket {unittest_data_bucket } " + f"Attempting to create unit test data bucket {unittest_data_bucket} " f"at: http://{CONFIG.AWS_S3_ENDPOINT} using CONFIG.AWS_ACCESS_KEY and CONFIG.AWS_SECRET_KEY" ) s3_client = boto3.client( @@ -140,11 +144,13 @@ def spark(tmp_path_factory) -> Generator["SparkSession", None, None]: # So as not to have interfering schemas and tables in the metastore_db from individual test run to run, # another test-scoped fixture should be created, pulling this in, and blowing away all schemas and tables as part # of each run - spark_sql_warehouse_dir = str(tmp_path_factory.mktemp(basename="spark-warehouse", numbered=False)) + spark_sql_warehouse_dir = str( + tmp_path_factory.mktemp(basename="spark-warehouse", numbered=False) + ) extra_conf = { **LOCAL_BASIC_EXTRA_CONF, "spark.sql.warehouse.dir": spark_sql_warehouse_dir, - "spark.hadoop.javax.jdo.option.ConnectionURL": f"jdbc:derby:;databaseName={spark_sql_warehouse_dir}/metastore_db;create=true", + "spark.hadoop.javax.jdo.option.ConnectionURL": f"jdbc:derby:;databaseName={spark_sql_warehouse_dir}/metastore_db;create=true", # noqa: E501 } spark = configure_spark_session( app_name="Unit Test Session", @@ -224,16 +230,36 @@ def populate_broker_data(broker_server_dblink_setup): USAspending test DB and broker test DB """ broker_data = { - "sam_recipient": json.loads(Path("usaspending_api/recipient/tests/data/broker_sam_recipient.json").read_text()), - "subaward": 
json.loads(Path("usaspending_api/awards/tests/data/subaward.json").read_text()), + "sam_recipient": json.loads( + Path( + "usaspending_api/recipient/tests/data/broker_sam_recipient.json" + ).read_text() + ), + "subaward": json.loads( + Path("usaspending_api/awards/tests/data/subaward.json").read_text() + ), "cd_state_grouped": json.loads( - Path("usaspending_api/transactions/tests/data/cd_state_grouped.json").read_text() + Path( + "usaspending_api/transactions/tests/data/cd_state_grouped.json" + ).read_text() + ), + "zips": json.loads( + Path("usaspending_api/transactions/tests/data/zips.json").read_text() + ), + "cd_zips_grouped": json.loads( + Path( + "usaspending_api/transactions/tests/data/cd_zips_grouped.json" + ).read_text() + ), + "cd_city_grouped": json.loads( + Path( + "usaspending_api/transactions/tests/data/cd_city_grouped.json" + ).read_text() ), - "zips": json.loads(Path("usaspending_api/transactions/tests/data/zips.json").read_text()), - "cd_zips_grouped": json.loads(Path("usaspending_api/transactions/tests/data/cd_zips_grouped.json").read_text()), - "cd_city_grouped": json.loads(Path("usaspending_api/transactions/tests/data/cd_city_grouped.json").read_text()), "cd_county_grouped": json.loads( - Path("usaspending_api/transactions/tests/data/cd_county_grouped.json").read_text() + Path( + "usaspending_api/transactions/tests/data/cd_county_grouped.json" + ).read_text() ), } insert_statement = "INSERT INTO %(table_name)s (%(columns)s) VALUES %(values)s" @@ -244,7 +270,11 @@ def populate_broker_data(broker_server_dblink_setup): values = [str(tuple(r.values())).replace("None", "null") for r in rows] sql_string = cursor.mogrify( insert_statement, - {"table_name": AsIs(table_name), "columns": AsIs(",".join(columns)), "values": AsIs(",".join(values))}, + { + "table_name": AsIs(table_name), + "columns": AsIs(",".join(columns)), + "values": AsIs(",".join(values)), + }, ) cursor.execute(sql_string) yield @@ -330,10 +360,16 @@ def _build_usas_data_for_spark(): # 
Create agency data funding_toptier_agency = baker.make( - "references.ToptierAgency", name="TEST AGENCY 1", abbreviation="TA1", _fill_optional=True + "references.ToptierAgency", + name="TEST AGENCY 1", + abbreviation="TA1", + _fill_optional=True, ) funding_subtier_agency = baker.make( - "references.SubtierAgency", name="TEST SUBTIER 1", abbreviation="SA1", _fill_optional=True + "references.SubtierAgency", + name="TEST SUBTIER 1", + abbreviation="SA1", + _fill_optional=True, ) funding_agency = baker.make( "references.Agency", @@ -343,8 +379,18 @@ def _build_usas_data_for_spark(): _fill_optional=True, ) - toptier = baker.make("references.ToptierAgency", name="toptier", abbreviation="tt", _fill_optional=True) - subtier = baker.make("references.SubtierAgency", name="subtier", abbreviation="st", _fill_optional=True) + toptier = baker.make( + "references.ToptierAgency", + name="toptier", + abbreviation="tt", + _fill_optional=True, + ) + subtier = baker.make( + "references.SubtierAgency", + name="subtier", + abbreviation="st", + _fill_optional=True, + ) agency = baker.make( "references.Agency", toptier_agency=toptier, @@ -355,10 +401,17 @@ def _build_usas_data_for_spark(): ) awarding_toptier_agency = baker.make( - "references.ToptierAgency", name="TEST AGENCY 2", abbreviation="TA2", _fill_optional=True + "references.ToptierAgency", + name="TEST AGENCY 2", + abbreviation="TA2", + _fill_optional=True, ) awarding_subtier_agency = baker.make( - "references.SubtierAgency", name="TEST SUBTIER 2", abbreviation="SA2", subtier_code="789", _fill_optional=True + "references.SubtierAgency", + name="TEST SUBTIER 2", + abbreviation="SA2", + subtier_code="789", + _fill_optional=True, ) awarding_agency = baker.make( "references.Agency", @@ -379,14 +432,57 @@ def _build_usas_data_for_spark(): county_name="County Name", _fill_optional=True, ) - baker.make("references.RefCountryCode", country_code="USA", country_name="UNITED STATES", _fill_optional=True) - baker.make("recipient.StateData", 
code="VA", name="Virginia", fips="51", _fill_optional=True) - baker.make("references.PopCounty", state_code="51", county_number="000", latest_population=1, _fill_optional=True) - baker.make("references.PopCounty", state_code="51", county_number="001", latest_population=1, _fill_optional=True) - baker.make("references.PopCongressionalDistrict", state_code="51", latest_population=1, congressional_district="01") - defc_l = baker.make("references.DisasterEmergencyFundCode", code="L", group_name="covid_19", _fill_optional=True) - defc_m = baker.make("references.DisasterEmergencyFundCode", code="M", group_name="covid_19", _fill_optional=True) - defc_q = baker.make("references.DisasterEmergencyFundCode", code="Q", group_name=None, _fill_optional=True) + baker.make( + "references.RefCountryCode", + country_code="USA", + country_name="UNITED STATES", + _fill_optional=True, + ) + baker.make( + "recipient.StateData", + code="VA", + name="Virginia", + fips="51", + _fill_optional=True, + ) + baker.make( + "references.PopCounty", + state_code="51", + county_number="000", + latest_population=1, + _fill_optional=True, + ) + baker.make( + "references.PopCounty", + state_code="51", + county_number="001", + latest_population=1, + _fill_optional=True, + ) + baker.make( + "references.PopCongressionalDistrict", + state_code="51", + latest_population=1, + congressional_district="01", + ) + defc_l = baker.make( + "references.DisasterEmergencyFundCode", + code="L", + group_name="covid_19", + _fill_optional=True, + ) + defc_m = baker.make( + "references.DisasterEmergencyFundCode", + code="M", + group_name="covid_19", + _fill_optional=True, + ) + defc_q = baker.make( + "references.DisasterEmergencyFundCode", + code="Q", + group_name=None, + _fill_optional=True, + ) rpa_1 = baker.make( "references.RefProgramActivity", id=1, @@ -408,7 +504,9 @@ def _build_usas_data_for_spark(): # Create account data federal_account = baker.make( - "accounts.FederalAccount", 
parent_toptier_agency=funding_toptier_agency, _fill_optional=True + "accounts.FederalAccount", + parent_toptier_agency=funding_toptier_agency, + _fill_optional=True, ) tas = baker.make( "accounts.TreasuryAppropriationAccount", @@ -502,10 +600,16 @@ def _build_usas_data_for_spark(): recipient_location_congressional_population=1, pop_congressional_population=1, tas_paths=[ - f"agency={funding_toptier_agency.toptier_code}faaid={federal_account.agency_identifier}famain={federal_account.main_account_code}aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" + f"agency={funding_toptier_agency.toptier_code}faaid={federal_account.agency_identifier}" + f"famain={federal_account.main_account_code}aid={tas.agency_id}main={tas.main_account_code}" + f"ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}" + f"bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}" + f"a={tas.availability_type_code}" ], tas_components=[ - f"aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" + f"aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}" + f"sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}" + f"epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" ], disaster_emergency_fund_codes=["L", "M"], total_covid_outlay=2.0, @@ -702,10 +806,16 @@ def _build_usas_data_for_spark(): recipient_location_state_population=1, pop_state_population=1, tas_paths=[ - 
f"agency={funding_toptier_agency.toptier_code}faaid={federal_account.agency_identifier}famain={federal_account.main_account_code}aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" + f"agency={funding_toptier_agency.toptier_code}faaid={federal_account.agency_identifier}" + f"famain={federal_account.main_account_code}aid={tas.agency_id}main={tas.main_account_code}" + f"ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}" + f"bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}" + f"a={tas.availability_type_code}" ], tas_components=[ - f"aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" + f"aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}" + f"sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}" + f"epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" ], disaster_emergency_fund_codes=["Q"], spending_by_defc=[{"defc": "Q", "outlay": 1.00, "obligation": 1.00}], @@ -719,7 +829,9 @@ def _build_usas_data_for_spark(): recipient_location_county_fips=None, pop_county_fips=None, generated_pragmatic_obligation=0.00, - program_activities=[{"name": "TRAINING AND RECRUITING", "code": "0003", "type": "PAC/PAN"}], + program_activities=[ + {"name": "TRAINING AND RECRUITING", "code": "0003", "type": "PAC/PAN"} + ], federal_accounts=[ { "id": federal_account.id, @@ -889,10 +1001,16 @@ def _build_usas_data_for_spark(): non_federal_funding_amount=0.00, treasury_account_identifiers=[tas.treasury_account_identifier], tas_paths=[ - 
f"agency={funding_toptier_agency.toptier_code}faaid={federal_account.agency_identifier}famain={federal_account.main_account_code}aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" + f"agency={funding_toptier_agency.toptier_code}faaid={federal_account.agency_identifier}" + f"famain={federal_account.main_account_code}aid={tas.agency_id}main={tas.main_account_code}" + f"ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}" + f"bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}" + f"a={tas.availability_type_code}" ], tas_components=[ - f"aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" + f"aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}" + f"sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}" + f"epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" ], federal_accounts=[ { @@ -998,10 +1116,16 @@ def _build_usas_data_for_spark(): non_federal_funding_amount=0.00, treasury_account_identifiers=[tas.treasury_account_identifier], tas_paths=[ - f"agency={funding_toptier_agency.toptier_code}faaid={federal_account.agency_identifier}famain={federal_account.main_account_code}aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" + f"agency={funding_toptier_agency.toptier_code}faaid={federal_account.agency_identifier}" + 
f"famain={federal_account.main_account_code}aid={tas.agency_id}main={tas.main_account_code}" + f"ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}" + f"bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}" + f"a={tas.availability_type_code}" ], tas_components=[ - f"aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" + f"aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}" + f"sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}" + f"epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" ], federal_accounts=[ { @@ -1191,10 +1315,16 @@ def _build_usas_data_for_spark(): total_funding_amount=0.00, treasury_account_identifiers=[tas.treasury_account_identifier], tas_paths=[ - f"agency={funding_toptier_agency.toptier_code}faaid={federal_account.agency_identifier}famain={federal_account.main_account_code}aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" + f"agency={funding_toptier_agency.toptier_code}faaid={federal_account.agency_identifier}" + f"famain={federal_account.main_account_code}aid={tas.agency_id}main={tas.main_account_code}" + f"ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}" + f"bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}" + f"a={tas.availability_type_code}" ], tas_components=[ - f"aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or 
''}epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" + f"aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}" + f"sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}" + f"epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" ], federal_accounts=[ { @@ -1206,7 +1336,9 @@ def _build_usas_data_for_spark(): disaster_emergency_fund_codes=["Q"], recipient_location_county_fips=None, pop_county_fips=None, - program_activities=[{"code": "0003", "name": "TRAINING AND RECRUITING", "type": "PAC/PAN"}], + program_activities=[ + {"code": "0003", "name": "TRAINING AND RECRUITING", "type": "PAC/PAN"} + ], ) pap1 = baker.make("references.ProgramActivityPark", code="1000", name="PAP name") @@ -1289,10 +1421,16 @@ def _build_usas_data_for_spark(): total_funding_amount=0.00, treasury_account_identifiers=[tas.treasury_account_identifier], tas_paths=[ - f"agency={funding_toptier_agency.toptier_code}faaid={federal_account.agency_identifier}famain={federal_account.main_account_code}aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" + f"agency={funding_toptier_agency.toptier_code}faaid={federal_account.agency_identifier}" + f"famain={federal_account.main_account_code}aid={tas.agency_id}main={tas.main_account_code}" + f"ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}" + f"bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}" + f"a={tas.availability_type_code}" ], tas_components=[ - f"aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or 
''}a={tas.availability_type_code}" + f"aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}" + f"sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}" + f"epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" ], federal_accounts=[ { @@ -1304,7 +1442,9 @@ def _build_usas_data_for_spark(): disaster_emergency_fund_codes=["Q"], recipient_location_county_fips=None, pop_county_fips=None, - program_activities=[{"code": "0003", "name": "TRAINING AND RECRUITING", "type": "PAC/PAN"}], + program_activities=[ + {"code": "0003", "name": "TRAINING AND RECRUITING", "type": "PAC/PAN"} + ], ) baker.make( "search.TransactionSearch", @@ -1415,7 +1555,9 @@ def _build_usas_data_for_spark(): _fill_optional=True, ) - dabs = baker.make("submissions.DABSSubmissionWindowSchedule", submission_reveal_date="2020-05-01") + dabs = baker.make( + "submissions.DABSSubmissionWindowSchedule", submission_reveal_date="2020-05-01" + ) sa = baker.make( "submissions.SubmissionAttributes", reporting_period_start="2020-04-02", @@ -1487,21 +1629,28 @@ def populate_usas_data(db): @pytest.fixture -def populate_usas_data_and_recipients_from_broker(db, populate_usas_data, populate_broker_data): +def populate_usas_data_and_recipients_from_broker( + db, populate_usas_data, populate_broker_data +): with connections[settings.DEFAULT_DB_ALIAS].cursor() as cursor: - restock_duns_sql = open("usaspending_api/broker/management/sql/restock_duns.sql", "r").read() + restock_duns_sql = open( + "usaspending_api/broker/management/sql/restock_duns.sql", "r" + ).read() restock_duns_sql = restock_duns_sql.replace("VACUUM ANALYZE int.duns;", "") cursor.execute(restock_duns_sql) call_command("update_recipient_lookup") with connections[settings.DEFAULT_DB_ALIAS].cursor() as cursor: restock_recipient_profile_sql = open( - "usaspending_api/recipient/management/sql/restock_recipient_profile.sql", "r" + 
"usaspending_api/recipient/management/sql/restock_recipient_profile.sql", + "r", ).read() cursor.execute(restock_recipient_profile_sql) yield -def create_all_delta_tables(spark: "SparkSession", s3_bucket: str, tables_to_load: list): +def create_all_delta_tables( + spark: "SparkSession", s3_bucket: str, tables_to_load: list +): load_query_tables = [val for val in tables_to_load if val in LOAD_QUERY_TABLE_SPEC] load_table_tables = [val for val in tables_to_load if val in LOAD_TABLE_TABLE_SPEC] for dest_table in load_table_tables + load_query_tables: @@ -1519,10 +1668,16 @@ def create_all_delta_tables(spark: "SparkSession", s3_bucket: str, tables_to_loa f"--spark-s3-bucket={s3_bucket}", ) else: - call_command("create_delta_table", f"--destination-table={dest_table}", f"--spark-s3-bucket={s3_bucket}") + call_command( + "create_delta_table", + f"--destination-table={dest_table}", + f"--spark-s3-bucket={s3_bucket}", + ) -def create_and_load_all_delta_tables(spark: "SparkSession", s3_bucket: str, tables_to_load: list): +def create_and_load_all_delta_tables( + spark: "SparkSession", s3_bucket: str, tables_to_load: list +): create_all_delta_tables(spark, s3_bucket, tables_to_load) load_query_tables = [val for val in tables_to_load if val in LOAD_QUERY_TABLE_SPEC] diff --git a/usaspending_api/transactions/delta_models/transaction_fpds.py b/usaspending_api/transactions/delta_models/transaction_fpds.py index e624fc44fa..748132223d 100644 --- a/usaspending_api/transactions/delta_models/transaction_fpds.py +++ b/usaspending_api/transactions/delta_models/transaction_fpds.py @@ -2,145 +2,316 @@ TRANSACTION_FPDS_COLUMN_INFO = [ TransactionColumn("a_76_fair_act_action", "a_76_fair_act_action", "STRING"), - TransactionColumn("a_76_fair_act_action_desc", "a_76_fair_act_action_desc", "STRING"), - TransactionColumn("action_date", "action_date", "STRING", "string_datetime_remove_timestamp"), + TransactionColumn( + "a_76_fair_act_action_desc", "a_76_fair_act_action_desc", "STRING" + ), + 
TransactionColumn( + "action_date", "action_date", "STRING", "string_datetime_remove_timestamp" + ), TransactionColumn("action_type", "action_type", "STRING"), TransactionColumn("action_type_description", "action_type_description", "STRING"), TransactionColumn("agency_id", "agency_id", "STRING"), TransactionColumn("airport_authority", "airport_authority", "BOOLEAN"), - TransactionColumn("alaskan_native_owned_corpo", "alaskan_native_owned_corpo", "BOOLEAN"), - TransactionColumn("alaskan_native_servicing_i", "alaskan_native_servicing_i", "BOOLEAN"), - TransactionColumn("american_indian_owned_busi", "american_indian_owned_busi", "BOOLEAN"), + TransactionColumn( + "alaskan_native_owned_corpo", "alaskan_native_owned_corpo", "BOOLEAN" + ), + TransactionColumn( + "alaskan_native_servicing_i", "alaskan_native_servicing_i", "BOOLEAN" + ), + TransactionColumn( + "american_indian_owned_busi", "american_indian_owned_busi", "BOOLEAN" + ), TransactionColumn("annual_revenue", "annual_revenue", "STRING"), - TransactionColumn("asian_pacific_american_own", "asian_pacific_american_own", "BOOLEAN"), + TransactionColumn( + "asian_pacific_american_own", "asian_pacific_american_own", "BOOLEAN" + ), TransactionColumn("award_description", "award_description", "STRING"), - TransactionColumn("award_modification_amendme", "award_modification_amendme", "STRING"), + TransactionColumn( + "award_modification_amendme", "award_modification_amendme", "STRING" + ), TransactionColumn("award_or_idv_flag", "award_or_idv_flag", "STRING"), - TransactionColumn("awardee_or_recipient_legal", "awardee_or_recipient_legal", "STRING"), + TransactionColumn( + "awardee_or_recipient_legal", "awardee_or_recipient_legal", "STRING" + ), TransactionColumn("awardee_or_recipient_uei", "awardee_or_recipient_uei", "STRING"), - TransactionColumn("awardee_or_recipient_uniqu", "awardee_or_recipient_uniqu", "STRING"), + TransactionColumn( + "awardee_or_recipient_uniqu", "awardee_or_recipient_uniqu", "STRING" + ), 
TransactionColumn("awarding_agency_code", "awarding_agency_code", "STRING"), TransactionColumn("awarding_agency_name", "awarding_agency_name", "STRING"), TransactionColumn("awarding_office_code", "awarding_office_code", "STRING"), TransactionColumn("awarding_office_name", "awarding_office_name", "STRING"), - TransactionColumn("awarding_sub_tier_agency_c", "awarding_sub_tier_agency_c", "STRING"), - TransactionColumn("awarding_sub_tier_agency_n", "awarding_sub_tier_agency_n", "STRING"), - TransactionColumn("base_and_all_options_value", "base_and_all_options_value", "STRING"), - TransactionColumn("base_exercised_options_val", "base_exercised_options_val", "STRING"), - TransactionColumn("black_american_owned_busin", "black_american_owned_busin", "BOOLEAN"), - TransactionColumn("c1862_land_grant_college", "c1862_land_grant_college", "BOOLEAN"), - TransactionColumn("c1890_land_grant_college", "c1890_land_grant_college", "BOOLEAN"), - TransactionColumn("c1994_land_grant_college", "c1994_land_grant_college", "BOOLEAN"), + TransactionColumn( + "awarding_sub_tier_agency_c", "awarding_sub_tier_agency_c", "STRING" + ), + TransactionColumn( + "awarding_sub_tier_agency_n", "awarding_sub_tier_agency_n", "STRING" + ), + TransactionColumn( + "base_and_all_options_value", "base_and_all_options_value", "STRING" + ), + TransactionColumn( + "base_exercised_options_val", "base_exercised_options_val", "STRING" + ), + TransactionColumn( + "black_american_owned_busin", "black_american_owned_busin", "BOOLEAN" + ), + TransactionColumn( + "c1862_land_grant_college", "c1862_land_grant_college", "BOOLEAN" + ), + TransactionColumn( + "c1890_land_grant_college", "c1890_land_grant_college", "BOOLEAN" + ), + TransactionColumn( + "c1994_land_grant_college", "c1994_land_grant_college", "BOOLEAN" + ), TransactionColumn("c8a_program_participant", "c8a_program_participant", "BOOLEAN"), TransactionColumn("cage_code", "cage_code", "STRING"), TransactionColumn("city_local_government", 
"city_local_government", "BOOLEAN"), - TransactionColumn("clinger_cohen_act_pla_desc", "clinger_cohen_act_pla_desc", "STRING"), - TransactionColumn("clinger_cohen_act_planning", "clinger_cohen_act_planning", "STRING"), - TransactionColumn("commercial_item_acqui_desc", "commercial_item_acqui_desc", "STRING"), - TransactionColumn("commercial_item_acquisitio", "commercial_item_acquisitio", "STRING"), - TransactionColumn("commercial_item_test_desc", "commercial_item_test_desc", "STRING"), - TransactionColumn("commercial_item_test_progr", "commercial_item_test_progr", "STRING"), - TransactionColumn("community_developed_corpor", "community_developed_corpor", "BOOLEAN"), - TransactionColumn("community_development_corp", "community_development_corp", "BOOLEAN"), + TransactionColumn( + "clinger_cohen_act_pla_desc", "clinger_cohen_act_pla_desc", "STRING" + ), + TransactionColumn( + "clinger_cohen_act_planning", "clinger_cohen_act_planning", "STRING" + ), + TransactionColumn( + "commercial_item_acqui_desc", "commercial_item_acqui_desc", "STRING" + ), + TransactionColumn( + "commercial_item_acquisitio", "commercial_item_acquisitio", "STRING" + ), + TransactionColumn( + "commercial_item_test_desc", "commercial_item_test_desc", "STRING" + ), + TransactionColumn( + "commercial_item_test_progr", "commercial_item_test_progr", "STRING" + ), + TransactionColumn( + "community_developed_corpor", "community_developed_corpor", "BOOLEAN" + ), + TransactionColumn( + "community_development_corp", "community_development_corp", "BOOLEAN" + ), TransactionColumn("consolidated_contract", "consolidated_contract", "STRING"), - TransactionColumn("consolidated_contract_desc", "consolidated_contract_desc", "STRING"), - TransactionColumn("construction_wage_rat_desc", "construction_wage_rat_desc", "STRING"), - TransactionColumn("construction_wage_rate_req", "construction_wage_rate_req", "STRING"), - TransactionColumn("contingency_humanitar_desc", "contingency_humanitar_desc", "STRING"), - 
TransactionColumn("contingency_humanitarian_o", "contingency_humanitarian_o", "STRING"), + TransactionColumn( + "consolidated_contract_desc", "consolidated_contract_desc", "STRING" + ), + TransactionColumn( + "construction_wage_rat_desc", "construction_wage_rat_desc", "STRING" + ), + TransactionColumn( + "construction_wage_rate_req", "construction_wage_rate_req", "STRING" + ), + TransactionColumn( + "contingency_humanitar_desc", "contingency_humanitar_desc", "STRING" + ), + TransactionColumn( + "contingency_humanitarian_o", "contingency_humanitarian_o", "STRING" + ), TransactionColumn("contract_award_type", "contract_award_type", "STRING"), TransactionColumn("contract_award_type_desc", "contract_award_type_desc", "STRING"), TransactionColumn("contract_bundling", "contract_bundling", "STRING"), - TransactionColumn("contract_bundling_descrip", "contract_bundling_descrip", "STRING"), + TransactionColumn( + "contract_bundling_descrip", "contract_bundling_descrip", "STRING" + ), TransactionColumn("contract_financing", "contract_financing", "STRING"), - TransactionColumn("contract_financing_descrip", "contract_financing_descrip", "STRING"), - TransactionColumn("contracting_officers_desc", "contracting_officers_desc", "STRING"), - TransactionColumn("contracting_officers_deter", "contracting_officers_deter", "STRING"), + TransactionColumn( + "contract_financing_descrip", "contract_financing_descrip", "STRING" + ), + TransactionColumn( + "contracting_officers_desc", "contracting_officers_desc", "STRING" + ), + TransactionColumn( + "contracting_officers_deter", "contracting_officers_deter", "STRING" + ), TransactionColumn("contracts", "contracts", "BOOLEAN"), - TransactionColumn("corporate_entity_not_tax_e", "corporate_entity_not_tax_e", "BOOLEAN"), - TransactionColumn("corporate_entity_tax_exemp", "corporate_entity_tax_exemp", "BOOLEAN"), - TransactionColumn("cost_accounting_stand_desc", "cost_accounting_stand_desc", "STRING"), - 
TransactionColumn("cost_accounting_standards", "cost_accounting_standards", "STRING"), + TransactionColumn( + "corporate_entity_not_tax_e", "corporate_entity_not_tax_e", "BOOLEAN" + ), + TransactionColumn( + "corporate_entity_tax_exemp", "corporate_entity_tax_exemp", "BOOLEAN" + ), + TransactionColumn( + "cost_accounting_stand_desc", "cost_accounting_stand_desc", "STRING" + ), + TransactionColumn( + "cost_accounting_standards", "cost_accounting_standards", "STRING" + ), TransactionColumn("cost_or_pricing_data", "cost_or_pricing_data", "STRING"), - TransactionColumn("cost_or_pricing_data_desc", "cost_or_pricing_data_desc", "STRING"), + TransactionColumn( + "cost_or_pricing_data_desc", "cost_or_pricing_data_desc", "STRING" + ), TransactionColumn("council_of_governments", "council_of_governments", "BOOLEAN"), - TransactionColumn("country_of_product_or_desc", "country_of_product_or_desc", "STRING"), - TransactionColumn("country_of_product_or_serv", "country_of_product_or_serv", "STRING"), + TransactionColumn( + "country_of_product_or_desc", "country_of_product_or_desc", "STRING" + ), + TransactionColumn( + "country_of_product_or_serv", "country_of_product_or_serv", "STRING" + ), TransactionColumn("county_local_government", "county_local_government", "BOOLEAN"), TransactionColumn("created_at", "created_at", "TIMESTAMP"), - TransactionColumn("current_total_value_award", "current_total_value_award", "STRING"), - TransactionColumn("detached_award_proc_unique", "detached_award_proc_unique", "STRING"), - TransactionColumn("detached_award_procurement_id", "detached_award_procurement_id", "INTEGER"), + TransactionColumn( + "current_total_value_award", "current_total_value_award", "STRING" + ), + TransactionColumn( + "detached_award_proc_unique", "detached_award_proc_unique", "STRING" + ), + TransactionColumn( + "detached_award_procurement_id", "detached_award_procurement_id", "INTEGER" + ), TransactionColumn("division_name", "division_name", "STRING"), - 
TransactionColumn("division_number_or_office", "division_number_or_office", "STRING"), - TransactionColumn("dod_claimant_prog_cod_desc", "dod_claimant_prog_cod_desc", "STRING"), - TransactionColumn("dod_claimant_program_code", "dod_claimant_program_code", "STRING"), - TransactionColumn("domestic_or_foreign_e_desc", "domestic_or_foreign_e_desc", "STRING"), - TransactionColumn("domestic_or_foreign_entity", "domestic_or_foreign_entity", "STRING"), + TransactionColumn( + "division_number_or_office", "division_number_or_office", "STRING" + ), + TransactionColumn( + "dod_claimant_prog_cod_desc", "dod_claimant_prog_cod_desc", "STRING" + ), + TransactionColumn( + "dod_claimant_program_code", "dod_claimant_program_code", "STRING" + ), + TransactionColumn( + "domestic_or_foreign_e_desc", "domestic_or_foreign_e_desc", "STRING" + ), + TransactionColumn( + "domestic_or_foreign_entity", "domestic_or_foreign_entity", "STRING" + ), TransactionColumn("domestic_shelter", "domestic_shelter", "BOOLEAN"), - TransactionColumn("dot_certified_disadvantage", "dot_certified_disadvantage", "BOOLEAN"), - TransactionColumn("economically_disadvantaged", "economically_disadvantaged", "BOOLEAN"), + TransactionColumn( + "dot_certified_disadvantage", "dot_certified_disadvantage", "BOOLEAN" + ), + TransactionColumn( + "economically_disadvantaged", "economically_disadvantaged", "BOOLEAN" + ), TransactionColumn("educational_institution", "educational_institution", "BOOLEAN"), TransactionColumn("emerging_small_business", "emerging_small_business", "BOOLEAN"), TransactionColumn("entity_data_source", "entity_data_source", "STRING"), - TransactionColumn("epa_designated_produc_desc", "epa_designated_produc_desc", "STRING"), + TransactionColumn( + "epa_designated_produc_desc", "epa_designated_produc_desc", "STRING" + ), TransactionColumn("epa_designated_product", "epa_designated_product", "STRING"), TransactionColumn("evaluated_preference", "evaluated_preference", "STRING"), - 
TransactionColumn("evaluated_preference_desc", "evaluated_preference_desc", "STRING"), - TransactionColumn("extent_compete_description", "extent_compete_description", "STRING"), + TransactionColumn( + "evaluated_preference_desc", "evaluated_preference_desc", "STRING" + ), + TransactionColumn( + "extent_compete_description", "extent_compete_description", "STRING" + ), TransactionColumn("extent_competed", "extent_competed", "STRING"), - TransactionColumn("fair_opportunity_limi_desc", "fair_opportunity_limi_desc", "STRING"), - TransactionColumn("fair_opportunity_limited_s", "fair_opportunity_limited_s", "STRING"), + TransactionColumn( + "fair_opportunity_limi_desc", "fair_opportunity_limi_desc", "STRING" + ), + TransactionColumn( + "fair_opportunity_limited_s", "fair_opportunity_limited_s", "STRING" + ), TransactionColumn("fed_biz_opps", "fed_biz_opps", "STRING"), TransactionColumn("fed_biz_opps_description", "fed_biz_opps_description", "STRING"), - TransactionColumn("federal_action_obligation", "federal_action_obligation", "NUMERIC(23,2)"), + TransactionColumn( + "federal_action_obligation", "federal_action_obligation", "NUMERIC(23,2)" + ), TransactionColumn("federal_agency", "federal_agency", "BOOLEAN"), - TransactionColumn("federally_funded_research", "federally_funded_research", "BOOLEAN"), + TransactionColumn( + "federally_funded_research", "federally_funded_research", "BOOLEAN" + ), TransactionColumn("for_profit_organization", "for_profit_organization", "BOOLEAN"), TransactionColumn("foreign_funding", "foreign_funding", "STRING"), TransactionColumn("foreign_funding_desc", "foreign_funding_desc", "STRING"), TransactionColumn("foreign_government", "foreign_government", "BOOLEAN"), - TransactionColumn("foreign_owned_and_located", "foreign_owned_and_located", "BOOLEAN"), + TransactionColumn( + "foreign_owned_and_located", "foreign_owned_and_located", "BOOLEAN" + ), TransactionColumn("foundation", "foundation", "BOOLEAN"), TransactionColumn("funding_agency_code", 
"funding_agency_code", "STRING"), TransactionColumn("funding_agency_name", "funding_agency_name", "STRING"), TransactionColumn("funding_office_code", "funding_office_code", "STRING"), TransactionColumn("funding_office_name", "funding_office_name", "STRING"), - TransactionColumn("funding_sub_tier_agency_co", "funding_sub_tier_agency_co", "STRING"), - TransactionColumn("funding_sub_tier_agency_na", "funding_sub_tier_agency_na", "STRING"), - TransactionColumn("government_furnished_desc", "government_furnished_desc", "STRING"), - TransactionColumn("government_furnished_prope", "government_furnished_prope", "STRING"), + TransactionColumn( + "funding_sub_tier_agency_co", "funding_sub_tier_agency_co", "STRING" + ), + TransactionColumn( + "funding_sub_tier_agency_na", "funding_sub_tier_agency_na", "STRING" + ), + TransactionColumn( + "government_furnished_desc", "government_furnished_desc", "STRING" + ), + TransactionColumn( + "government_furnished_prope", "government_furnished_prope", "STRING" + ), TransactionColumn("grants", "grants", "BOOLEAN"), - TransactionColumn("hispanic_american_owned_bu", "hispanic_american_owned_bu", "BOOLEAN"), - TransactionColumn("hispanic_servicing_institu", "hispanic_servicing_institu", "BOOLEAN"), - TransactionColumn("historically_black_college", "historically_black_college", "BOOLEAN"), - TransactionColumn("historically_underutilized", "historically_underutilized", "BOOLEAN"), + TransactionColumn( + "hispanic_american_owned_bu", "hispanic_american_owned_bu", "BOOLEAN" + ), + TransactionColumn( + "hispanic_servicing_institu", "hispanic_servicing_institu", "BOOLEAN" + ), + TransactionColumn( + "historically_black_college", "historically_black_college", "BOOLEAN" + ), + TransactionColumn( + "historically_underutilized", "historically_underutilized", "BOOLEAN" + ), TransactionColumn("hospital_flag", "hospital_flag", "BOOLEAN"), - TransactionColumn("housing_authorities_public", "housing_authorities_public", "BOOLEAN"), + TransactionColumn( + 
"housing_authorities_public", "housing_authorities_public", "BOOLEAN" + ), TransactionColumn("idv_type", "idv_type", "STRING"), TransactionColumn("idv_type_description", "idv_type_description", "STRING"), - TransactionColumn("indian_tribe_federally_rec", "indian_tribe_federally_rec", "BOOLEAN"), - TransactionColumn("information_technolog_desc", "information_technolog_desc", "STRING"), - TransactionColumn("information_technology_com", "information_technology_com", "STRING"), - TransactionColumn("inherently_government_desc", "inherently_government_desc", "STRING"), - TransactionColumn("inherently_government_func", "inherently_government_func", "STRING"), - TransactionColumn("initial_report_date", "initial_report_date", "STRING", "string_datetime_remove_timestamp"), - TransactionColumn("inter_municipal_local_gove", "inter_municipal_local_gove", "BOOLEAN"), - TransactionColumn("interagency_contract_desc", "interagency_contract_desc", "STRING"), - TransactionColumn("interagency_contracting_au", "interagency_contracting_au", "STRING"), - TransactionColumn("international_organization", "international_organization", "BOOLEAN"), + TransactionColumn( + "indian_tribe_federally_rec", "indian_tribe_federally_rec", "BOOLEAN" + ), + TransactionColumn( + "information_technolog_desc", "information_technolog_desc", "STRING" + ), + TransactionColumn( + "information_technology_com", "information_technology_com", "STRING" + ), + TransactionColumn( + "inherently_government_desc", "inherently_government_desc", "STRING" + ), + TransactionColumn( + "inherently_government_func", "inherently_government_func", "STRING" + ), + TransactionColumn( + "initial_report_date", + "initial_report_date", + "STRING", + "string_datetime_remove_timestamp", + ), + TransactionColumn( + "inter_municipal_local_gove", "inter_municipal_local_gove", "BOOLEAN" + ), + TransactionColumn( + "interagency_contract_desc", "interagency_contract_desc", "STRING" + ), + TransactionColumn( + "interagency_contracting_au", 
"interagency_contracting_au", "STRING" + ), + TransactionColumn( + "international_organization", "international_organization", "BOOLEAN" + ), TransactionColumn("interstate_entity", "interstate_entity", "BOOLEAN"), - TransactionColumn("joint_venture_economically", "joint_venture_economically", "BOOLEAN"), - TransactionColumn("joint_venture_women_owned", "joint_venture_women_owned", "BOOLEAN"), + TransactionColumn( + "joint_venture_economically", "joint_venture_economically", "BOOLEAN" + ), + TransactionColumn( + "joint_venture_women_owned", "joint_venture_women_owned", "BOOLEAN" + ), TransactionColumn("labor_standards", "labor_standards", "STRING"), TransactionColumn("labor_standards_descrip", "labor_standards_descrip", "STRING"), TransactionColumn("labor_surplus_area_firm", "labor_surplus_area_firm", "BOOLEAN"), TransactionColumn("last_modified", "last_modified", "STRING"), - TransactionColumn("legal_entity_address_line1", "legal_entity_address_line1", "STRING"), - TransactionColumn("legal_entity_address_line2", "legal_entity_address_line2", "STRING"), - TransactionColumn("legal_entity_address_line3", "legal_entity_address_line3", "STRING"), + TransactionColumn( + "legal_entity_address_line1", "legal_entity_address_line1", "STRING" + ), + TransactionColumn( + "legal_entity_address_line2", "legal_entity_address_line2", "STRING" + ), + TransactionColumn( + "legal_entity_address_line3", "legal_entity_address_line3", "STRING" + ), TransactionColumn("legal_entity_city_name", "legal_entity_city_name", "STRING"), - TransactionColumn("legal_entity_congressional", "legal_entity_congressional", "STRING"), + TransactionColumn( + "legal_entity_congressional", "legal_entity_congressional", "STRING" + ), TransactionColumn( "legal_entity_country_code", "legal_entity_country_code", @@ -163,68 +334,135 @@ TransactionColumn("legal_entity_county_code", "legal_entity_county_code", "STRING"), TransactionColumn("legal_entity_county_name", "legal_entity_county_name", "STRING"), 
TransactionColumn("legal_entity_state_code", "legal_entity_state_code", "STRING"), - TransactionColumn("legal_entity_state_descrip", "legal_entity_state_descrip", "STRING"), + TransactionColumn( + "legal_entity_state_descrip", "legal_entity_state_descrip", "STRING" + ), TransactionColumn("legal_entity_zip4", "legal_entity_zip4", "STRING"), TransactionColumn("legal_entity_zip5", "legal_entity_zip5", "STRING"), TransactionColumn("legal_entity_zip_last4", "legal_entity_zip_last4", "STRING"), - TransactionColumn("limited_liability_corporat", "limited_liability_corporat", "BOOLEAN"), + TransactionColumn( + "limited_liability_corporat", "limited_liability_corporat", "BOOLEAN" + ), TransactionColumn("local_area_set_aside", "local_area_set_aside", "STRING"), - TransactionColumn("local_area_set_aside_desc", "local_area_set_aside_desc", "STRING"), + TransactionColumn( + "local_area_set_aside_desc", "local_area_set_aside_desc", "STRING" + ), TransactionColumn("local_government_owned", "local_government_owned", "BOOLEAN"), TransactionColumn("major_program", "major_program", "STRING"), TransactionColumn("manufacturer_of_goods", "manufacturer_of_goods", "BOOLEAN"), - TransactionColumn("materials_supplies_article", "materials_supplies_article", "STRING"), - TransactionColumn("materials_supplies_descrip", "materials_supplies_descrip", "STRING"), + TransactionColumn( + "materials_supplies_article", "materials_supplies_article", "STRING" + ), + TransactionColumn( + "materials_supplies_descrip", "materials_supplies_descrip", "STRING" + ), TransactionColumn("minority_institution", "minority_institution", "BOOLEAN"), TransactionColumn("minority_owned_business", "minority_owned_business", "BOOLEAN"), TransactionColumn("multi_year_contract", "multi_year_contract", "STRING"), TransactionColumn("multi_year_contract_desc", "multi_year_contract_desc", "STRING"), - TransactionColumn("multiple_or_single_aw_desc", "multiple_or_single_aw_desc", "STRING"), - 
TransactionColumn("multiple_or_single_award_i", "multiple_or_single_award_i", "STRING"), - TransactionColumn("municipality_local_governm", "municipality_local_governm", "BOOLEAN"), + TransactionColumn( + "multiple_or_single_aw_desc", "multiple_or_single_aw_desc", "STRING" + ), + TransactionColumn( + "multiple_or_single_award_i", "multiple_or_single_award_i", "STRING" + ), + TransactionColumn( + "municipality_local_governm", "municipality_local_governm", "BOOLEAN" + ), TransactionColumn("naics", "naics", "STRING"), TransactionColumn("naics_description", "naics_description", "STRING"), TransactionColumn("national_interest_action", "national_interest_action", "STRING"), TransactionColumn("national_interest_desc", "national_interest_desc", "STRING"), - TransactionColumn("native_american_owned_busi", "native_american_owned_busi", "BOOLEAN"), - TransactionColumn("native_hawaiian_owned_busi", "native_hawaiian_owned_busi", "BOOLEAN"), - TransactionColumn("native_hawaiian_servicing", "native_hawaiian_servicing", "BOOLEAN"), + TransactionColumn( + "native_american_owned_busi", "native_american_owned_busi", "BOOLEAN" + ), + TransactionColumn( + "native_hawaiian_owned_busi", "native_hawaiian_owned_busi", "BOOLEAN" + ), + TransactionColumn( + "native_hawaiian_servicing", "native_hawaiian_servicing", "BOOLEAN" + ), TransactionColumn("nonprofit_organization", "nonprofit_organization", "BOOLEAN"), TransactionColumn("number_of_actions", "number_of_actions", "STRING"), TransactionColumn("number_of_employees", "number_of_employees", "STRING"), - TransactionColumn("number_of_offers_received", "number_of_offers_received", "STRING"), - TransactionColumn("officer_1_amount", "high_comp_officer1_amount", "NUMERIC(23,2)", "cast"), + TransactionColumn( + "number_of_offers_received", "number_of_offers_received", "STRING" + ), + TransactionColumn( + "officer_1_amount", "high_comp_officer1_amount", "NUMERIC(23,2)", "cast" + ), TransactionColumn("officer_1_name", "high_comp_officer1_full_na", 
"STRING"), - TransactionColumn("officer_2_amount", "high_comp_officer2_amount", "NUMERIC(23,2)", "cast"), + TransactionColumn( + "officer_2_amount", "high_comp_officer2_amount", "NUMERIC(23,2)", "cast" + ), TransactionColumn("officer_2_name", "high_comp_officer2_full_na", "STRING"), - TransactionColumn("officer_3_amount", "high_comp_officer3_amount", "NUMERIC(23,2)", "cast"), + TransactionColumn( + "officer_3_amount", "high_comp_officer3_amount", "NUMERIC(23,2)", "cast" + ), TransactionColumn("officer_3_name", "high_comp_officer3_full_na", "STRING"), - TransactionColumn("officer_4_amount", "high_comp_officer4_amount", "NUMERIC(23,2)", "cast"), + TransactionColumn( + "officer_4_amount", "high_comp_officer4_amount", "NUMERIC(23,2)", "cast" + ), TransactionColumn("officer_4_name", "high_comp_officer4_full_na", "STRING"), - TransactionColumn("officer_5_amount", "high_comp_officer5_amount", "NUMERIC(23,2)", "cast"), + TransactionColumn( + "officer_5_amount", "high_comp_officer5_amount", "NUMERIC(23,2)", "cast" + ), TransactionColumn("officer_5_name", "high_comp_officer5_full_na", "STRING"), TransactionColumn( - "ordering_period_end_date", "ordering_period_end_date", "STRING", "string_datetime_remove_timestamp" + "ordering_period_end_date", + "ordering_period_end_date", + "STRING", + "string_datetime_remove_timestamp", ), TransactionColumn("organizational_type", "organizational_type", "STRING"), - TransactionColumn("other_minority_owned_busin", "other_minority_owned_busin", "BOOLEAN"), - TransactionColumn("other_not_for_profit_organ", "other_not_for_profit_organ", "BOOLEAN"), - TransactionColumn("other_statutory_authority", "other_statutory_authority", "STRING"), - TransactionColumn("other_than_full_and_o_desc", "other_than_full_and_o_desc", "STRING"), - TransactionColumn("other_than_full_and_open_c", "other_than_full_and_open_c", "STRING"), + TransactionColumn( + "other_minority_owned_busin", "other_minority_owned_busin", "BOOLEAN" + ), + TransactionColumn( + 
"other_not_for_profit_organ", "other_not_for_profit_organ", "BOOLEAN" + ), + TransactionColumn( + "other_statutory_authority", "other_statutory_authority", "STRING" + ), + TransactionColumn( + "other_than_full_and_o_desc", "other_than_full_and_o_desc", "STRING" + ), + TransactionColumn( + "other_than_full_and_open_c", "other_than_full_and_open_c", "STRING" + ), TransactionColumn("parent_award_id", "parent_award_id", "STRING"), - TransactionColumn("partnership_or_limited_lia", "partnership_or_limited_lia", "BOOLEAN"), - TransactionColumn("performance_based_se_desc", "performance_based_se_desc", "STRING"), - TransactionColumn("performance_based_service", "performance_based_service", "STRING"), - TransactionColumn("period_of_perf_potential_e", "period_of_perf_potential_e", "STRING"), - TransactionColumn("period_of_performance_curr", "period_of_performance_curr", "STRING"), - TransactionColumn("period_of_performance_star", "period_of_performance_star", "STRING"), + TransactionColumn( + "partnership_or_limited_lia", "partnership_or_limited_lia", "BOOLEAN" + ), + TransactionColumn( + "performance_based_se_desc", "performance_based_se_desc", "STRING" + ), + TransactionColumn( + "performance_based_service", "performance_based_service", "STRING" + ), + TransactionColumn( + "period_of_perf_potential_e", "period_of_perf_potential_e", "STRING" + ), + TransactionColumn( + "period_of_performance_curr", "period_of_performance_curr", "STRING" + ), + TransactionColumn( + "period_of_performance_star", "period_of_performance_star", "STRING" + ), TransactionColumn("piid", "piid", "STRING"), TransactionColumn("place_of_manufacture", "place_of_manufacture", "STRING"), - TransactionColumn("place_of_manufacture_desc", "place_of_manufacture_desc", "STRING"), - TransactionColumn("place_of_perf_country_desc", "place_of_perf_country_desc", "STRING"), - TransactionColumn("place_of_perfor_state_desc", "place_of_perfor_state_desc", "STRING"), - TransactionColumn("place_of_perform_city_name", 
"place_of_perform_city_name", "STRING"), + TransactionColumn( + "place_of_manufacture_desc", "place_of_manufacture_desc", "STRING" + ), + TransactionColumn( + "place_of_perf_country_desc", "place_of_perf_country_desc", "STRING" + ), + TransactionColumn( + "place_of_perfor_state_desc", "place_of_perfor_state_desc", "STRING" + ), + TransactionColumn( + "place_of_perform_city_name", "place_of_perform_city_name", "STRING" + ), TransactionColumn( "place_of_perform_country_c", "place_of_perform_country_c", @@ -244,85 +482,173 @@ ELSE {input} \ END", ), - TransactionColumn("place_of_perform_county_co", "place_of_perform_county_co", "STRING"), - TransactionColumn("place_of_perform_county_na", "place_of_perform_county_na", "STRING"), - TransactionColumn("place_of_perform_state_nam", "place_of_perform_state_nam", "STRING"), - TransactionColumn("place_of_perform_zip_last4", "place_of_perform_zip_last4", "STRING"), - TransactionColumn("place_of_performance_congr", "place_of_performance_congr", "STRING"), - TransactionColumn("place_of_performance_locat", "place_of_performance_locat", "STRING"), - TransactionColumn("place_of_performance_state", "place_of_performance_state", "STRING"), - TransactionColumn("place_of_performance_zip4a", "place_of_performance_zip4a", "STRING"), - TransactionColumn("place_of_performance_zip5", "place_of_performance_zip5", "STRING"), + TransactionColumn( + "place_of_perform_county_co", "place_of_perform_county_co", "STRING" + ), + TransactionColumn( + "place_of_perform_county_na", "place_of_perform_county_na", "STRING" + ), + TransactionColumn( + "place_of_perform_state_nam", "place_of_perform_state_nam", "STRING" + ), + TransactionColumn( + "place_of_perform_zip_last4", "place_of_perform_zip_last4", "STRING" + ), + TransactionColumn( + "place_of_performance_congr", "place_of_performance_congr", "STRING" + ), + TransactionColumn( + "place_of_performance_locat", "place_of_performance_locat", "STRING" + ), + TransactionColumn( + 
"place_of_performance_state", "place_of_performance_state", "STRING" + ), + TransactionColumn( + "place_of_performance_zip4a", "place_of_performance_zip4a", "STRING" + ), + TransactionColumn( + "place_of_performance_zip5", "place_of_performance_zip5", "STRING" + ), TransactionColumn("planning_commission", "planning_commission", "BOOLEAN"), TransactionColumn("port_authority", "port_authority", "BOOLEAN"), - TransactionColumn("potential_total_value_awar", "potential_total_value_awar", "STRING"), - TransactionColumn("price_evaluation_adjustmen", "price_evaluation_adjustmen", "STRING"), - TransactionColumn("private_university_or_coll", "private_university_or_coll", "BOOLEAN"), - TransactionColumn("product_or_service_co_desc", "product_or_service_co_desc", "STRING"), + TransactionColumn( + "potential_total_value_awar", "potential_total_value_awar", "STRING" + ), + TransactionColumn( + "price_evaluation_adjustmen", "price_evaluation_adjustmen", "STRING" + ), + TransactionColumn( + "private_university_or_coll", "private_university_or_coll", "BOOLEAN" + ), + TransactionColumn( + "product_or_service_co_desc", "product_or_service_co_desc", "STRING" + ), TransactionColumn("product_or_service_code", "product_or_service_code", "STRING"), TransactionColumn("program_acronym", "program_acronym", "STRING"), - TransactionColumn("program_system_or_equ_desc", "program_system_or_equ_desc", "STRING"), - TransactionColumn("program_system_or_equipmen", "program_system_or_equipmen", "STRING"), + TransactionColumn( + "program_system_or_equ_desc", "program_system_or_equ_desc", "STRING" + ), + TransactionColumn( + "program_system_or_equipmen", "program_system_or_equipmen", "STRING" + ), TransactionColumn("pulled_from", "pulled_from", "STRING"), - TransactionColumn("purchase_card_as_paym_desc", "purchase_card_as_paym_desc", "STRING"), - TransactionColumn("purchase_card_as_payment_m", "purchase_card_as_payment_m", "STRING"), - TransactionColumn("receives_contracts_and_gra", 
"receives_contracts_and_gra", "BOOLEAN"), - TransactionColumn("recovered_materials_s_desc", "recovered_materials_s_desc", "STRING"), - TransactionColumn("recovered_materials_sustai", "recovered_materials_sustai", "STRING"), - TransactionColumn("referenced_idv_agency_desc", "referenced_idv_agency_desc", "STRING"), - TransactionColumn("referenced_idv_agency_iden", "referenced_idv_agency_iden", "STRING"), - TransactionColumn("referenced_idv_agency_name", "referenced_idv_agency_name", "STRING"), - TransactionColumn("referenced_idv_modificatio", "referenced_idv_modificatio", "STRING"), + TransactionColumn( + "purchase_card_as_paym_desc", "purchase_card_as_paym_desc", "STRING" + ), + TransactionColumn( + "purchase_card_as_payment_m", "purchase_card_as_payment_m", "STRING" + ), + TransactionColumn( + "receives_contracts_and_gra", "receives_contracts_and_gra", "BOOLEAN" + ), + TransactionColumn( + "recovered_materials_s_desc", "recovered_materials_s_desc", "STRING" + ), + TransactionColumn( + "recovered_materials_sustai", "recovered_materials_sustai", "STRING" + ), + TransactionColumn( + "referenced_idv_agency_desc", "referenced_idv_agency_desc", "STRING" + ), + TransactionColumn( + "referenced_idv_agency_iden", "referenced_idv_agency_iden", "STRING" + ), + TransactionColumn( + "referenced_idv_agency_name", "referenced_idv_agency_name", "STRING" + ), + TransactionColumn( + "referenced_idv_modificatio", "referenced_idv_modificatio", "STRING" + ), TransactionColumn("referenced_idv_type", "referenced_idv_type", "STRING"), TransactionColumn("referenced_idv_type_desc", "referenced_idv_type_desc", "STRING"), - TransactionColumn("referenced_mult_or_si_desc", "referenced_mult_or_si_desc", "STRING"), - TransactionColumn("referenced_mult_or_single", "referenced_mult_or_single", "STRING"), + TransactionColumn( + "referenced_mult_or_si_desc", "referenced_mult_or_si_desc", "STRING" + ), + TransactionColumn( + "referenced_mult_or_single", "referenced_mult_or_single", "STRING" + ), # The 
referenced_multi_or_single field does not appear in the django model and may have been created inadvertently # in the Delta model previously. Since it is always NULL, it is a candidate for elimination. TransactionColumn("referenced_multi_or_single", "NULL", "STRING", "literal"), TransactionColumn("research", "research", "STRING"), TransactionColumn("research_description", "research_description", "STRING"), TransactionColumn("sam_exception", "sam_exception", "STRING"), - TransactionColumn("sam_exception_description", "sam_exception_description", "STRING"), - TransactionColumn("sba_certified_8_a_joint_ve", "sba_certified_8_a_joint_ve", "BOOLEAN"), - TransactionColumn("school_district_local_gove", "school_district_local_gove", "BOOLEAN"), + TransactionColumn( + "sam_exception_description", "sam_exception_description", "STRING" + ), + TransactionColumn( + "sba_certified_8_a_joint_ve", "sba_certified_8_a_joint_ve", "BOOLEAN" + ), + TransactionColumn( + "school_district_local_gove", "school_district_local_gove", "BOOLEAN" + ), TransactionColumn("school_of_forestry", "school_of_forestry", "BOOLEAN"), TransactionColumn("sea_transportation", "sea_transportation", "STRING"), TransactionColumn("sea_transportation_desc", "sea_transportation_desc", "STRING"), - TransactionColumn("self_certified_small_disad", "self_certified_small_disad", "BOOLEAN"), - TransactionColumn("service_disabled_veteran_o", "service_disabled_veteran_o", "BOOLEAN"), - TransactionColumn("small_agricultural_coopera", "small_agricultural_coopera", "BOOLEAN"), - TransactionColumn("small_business_competitive", "small_business_competitive", "BOOLEAN"), - TransactionColumn("small_disadvantaged_busine", "small_disadvantaged_busine", "BOOLEAN"), + TransactionColumn( + "self_certified_small_disad", "self_certified_small_disad", "BOOLEAN" + ), + TransactionColumn( + "service_disabled_veteran_o", "service_disabled_veteran_o", "BOOLEAN" + ), + TransactionColumn( + "small_agricultural_coopera", 
"small_agricultural_coopera", "BOOLEAN" + ), + TransactionColumn( + "small_business_competitive", "small_business_competitive", "BOOLEAN" + ), + TransactionColumn( + "small_disadvantaged_busine", "small_disadvantaged_busine", "BOOLEAN" + ), TransactionColumn("sole_proprietorship", "sole_proprietorship", "BOOLEAN"), TransactionColumn("solicitation_date", "solicitation_date", "DATE", "cast"), TransactionColumn("solicitation_identifier", "solicitation_identifier", "STRING"), - TransactionColumn("solicitation_procedur_desc", "solicitation_procedur_desc", "STRING"), + TransactionColumn( + "solicitation_procedur_desc", "solicitation_procedur_desc", "STRING" + ), TransactionColumn("solicitation_procedures", "solicitation_procedures", "STRING"), - TransactionColumn("state_controlled_instituti", "state_controlled_instituti", "BOOLEAN"), - TransactionColumn("subchapter_s_corporation", "subchapter_s_corporation", "BOOLEAN"), - TransactionColumn("subcontinent_asian_asian_i", "subcontinent_asian_asian_i", "BOOLEAN"), + TransactionColumn( + "state_controlled_instituti", "state_controlled_instituti", "BOOLEAN" + ), + TransactionColumn( + "subchapter_s_corporation", "subchapter_s_corporation", "BOOLEAN" + ), + TransactionColumn( + "subcontinent_asian_asian_i", "subcontinent_asian_asian_i", "BOOLEAN" + ), TransactionColumn("subcontracting_plan", "subcontracting_plan", "STRING"), TransactionColumn("subcontracting_plan_desc", "subcontracting_plan_desc", "STRING"), TransactionColumn("the_ability_one_program", "the_ability_one_program", "BOOLEAN"), TransactionColumn("total_obligated_amount", "total_obligated_amount", "STRING"), - TransactionColumn("township_local_government", "township_local_government", "BOOLEAN"), + TransactionColumn( + "township_local_government", "township_local_government", "BOOLEAN" + ), TransactionColumn("transaction_id", None, "LONG NOT NULL"), TransactionColumn("transaction_number", "transaction_number", "STRING"), TransactionColumn("transit_authority", 
"transit_authority", "BOOLEAN"), TransactionColumn("tribal_college", "tribal_college", "BOOLEAN"), TransactionColumn("tribally_owned_business", "tribally_owned_business", "BOOLEAN"), - TransactionColumn("type_of_contract_pric_desc", "type_of_contract_pric_desc", "STRING"), + TransactionColumn( + "type_of_contract_pric_desc", "type_of_contract_pric_desc", "STRING" + ), TransactionColumn("type_of_contract_pricing", "type_of_contract_pricing", "STRING"), TransactionColumn("type_of_idc", "type_of_idc", "STRING"), TransactionColumn("type_of_idc_description", "type_of_idc_description", "STRING"), TransactionColumn("type_set_aside", "type_set_aside", "STRING"), - TransactionColumn("type_set_aside_description", "type_set_aside_description", "STRING"), - TransactionColumn("ultimate_parent_legal_enti", "ultimate_parent_legal_enti", "STRING"), + TransactionColumn( + "type_set_aside_description", "type_set_aside_description", "STRING" + ), + TransactionColumn( + "ultimate_parent_legal_enti", "ultimate_parent_legal_enti", "STRING" + ), TransactionColumn("ultimate_parent_uei", "ultimate_parent_uei", "STRING"), - TransactionColumn("ultimate_parent_unique_ide", "ultimate_parent_unique_ide", "STRING"), + TransactionColumn( + "ultimate_parent_unique_ide", "ultimate_parent_unique_ide", "STRING" + ), TransactionColumn("undefinitized_action", "undefinitized_action", "STRING"), - TransactionColumn("undefinitized_action_desc", "undefinitized_action_desc", "STRING"), + TransactionColumn( + "undefinitized_action_desc", "undefinitized_action_desc", "STRING" + ), TransactionColumn("unique_award_key", "unique_award_key", "STRING"), TransactionColumn("updated_at", "updated_at", "TIMESTAMP"), TransactionColumn("us_federal_government", "us_federal_government", "BOOLEAN"), @@ -331,19 +657,27 @@ TransactionColumn("us_state_government", "us_state_government", "BOOLEAN"), TransactionColumn("us_tribal_government", "us_tribal_government", "BOOLEAN"), TransactionColumn("vendor_alternate_name", 
"vendor_alternate_name", "STRING"), - TransactionColumn("vendor_alternate_site_code", "vendor_alternate_site_code", "STRING"), - TransactionColumn("vendor_doing_as_business_n", "vendor_doing_as_business_n", "STRING"), + TransactionColumn( + "vendor_alternate_site_code", "vendor_alternate_site_code", "STRING" + ), + TransactionColumn( + "vendor_doing_as_business_n", "vendor_doing_as_business_n", "STRING" + ), TransactionColumn("vendor_enabled", "vendor_enabled", "STRING"), TransactionColumn("vendor_fax_number", "vendor_fax_number", "STRING"), TransactionColumn("vendor_legal_org_name", "vendor_legal_org_name", "STRING"), - TransactionColumn("vendor_location_disabled_f", "vendor_location_disabled_f", "STRING"), + TransactionColumn( + "vendor_location_disabled_f", "vendor_location_disabled_f", "STRING" + ), TransactionColumn("vendor_phone_number", "vendor_phone_number", "STRING"), TransactionColumn("vendor_site_code", "vendor_site_code", "STRING"), TransactionColumn("veteran_owned_business", "veteran_owned_business", "BOOLEAN"), TransactionColumn("veterinary_college", "veterinary_college", "BOOLEAN"), TransactionColumn("veterinary_hospital", "veterinary_hospital", "BOOLEAN"), TransactionColumn("woman_owned_business", "woman_owned_business", "BOOLEAN"), - TransactionColumn("women_owned_small_business", "women_owned_small_business", "BOOLEAN"), + TransactionColumn( + "women_owned_small_business", "women_owned_small_business", "BOOLEAN" + ), ] TRANSACTION_FPDS_COLUMNS = [col.dest_name for col in TRANSACTION_FPDS_COLUMN_INFO] @@ -372,12 +706,14 @@ ] TRANSACTION_FPDS_VIEW_COLUMNS = [ - col.dest_name for col in TRANSACTION_FPDS_COLUMN_INFO if col.dest_name not in delta_columns_not_in_view + col.dest_name + for col in TRANSACTION_FPDS_COLUMN_INFO + if col.dest_name not in delta_columns_not_in_view ] transaction_fpds_sql_string = rf""" CREATE OR REPLACE TABLE {{DESTINATION_TABLE}} ( - {", ".join([f'{col.dest_name} {col.delta_type}' for col in TRANSACTION_FPDS_COLUMN_INFO])} + 
{", ".join([f"{col.dest_name} {col.delta_type}" for col in TRANSACTION_FPDS_COLUMN_INFO])} ) USING DELTA LOCATION 's3a://{{SPARK_S3_BUCKET}}/{{DELTA_LAKE_S3_PATH}}/{{DESTINATION_DATABASE}}/{{DESTINATION_TABLE}}' @@ -385,27 +721,41 @@ # Mapping from raw.detached_award_procurement to int.transaction_normalized columns, where a simple mapping exists DAP_TO_NORMALIZED_COLUMN_INFO = [ - TransactionColumn("action_date", "action_date", "DATE", "parse_string_datetime_to_date"), + TransactionColumn( + "action_date", "action_date", "DATE", "parse_string_datetime_to_date" + ), TransactionColumn("action_type", "action_type", "STRING"), TransactionColumn("action_type_description", "action_type_description", "STRING"), TransactionColumn("certified_date", "NULL", "DATE", "literal"), TransactionColumn("description", "award_description", "STRING"), TransactionColumn("face_value_loan_guarantee", "NULL", "NUMERIC(23, 2)", "literal"), - TransactionColumn("federal_action_obligation", "federal_action_obligation", "NUMERIC(23,2)"), + TransactionColumn( + "federal_action_obligation", "federal_action_obligation", "NUMERIC(23,2)" + ), TransactionColumn("funding_amount", "NULL", "NUMERIC(23, 2)", "literal"), TransactionColumn("indirect_federal_sharing", "NULL", "NUMERIC(23, 2)", "literal"), TransactionColumn("is_fpds", "TRUE", "BOOLEAN", "literal"), TransactionColumn("last_modified_date", "last_modified", "TIMESTAMP", "cast"), TransactionColumn("modification_number", "award_modification_amendme", "STRING"), - TransactionColumn("non_federal_funding_amount", "NULL", "NUMERIC(23, 2)", "literal"), - TransactionColumn("original_loan_subsidy_cost", "NULL", "NUMERIC(23, 2)", "literal"), + TransactionColumn( + "non_federal_funding_amount", "NULL", "NUMERIC(23, 2)", "literal" + ), + TransactionColumn( + "original_loan_subsidy_cost", "NULL", "NUMERIC(23, 2)", "literal" + ), # All period_of_performance_* fields seen as: YYYY-MM-DD 00:00:00, so cast works # BUT it's still just a string and could morph, 
so defensively smart-date-parsing the string TransactionColumn( - "period_of_performance_current_end_date", "period_of_performance_curr", "DATE", "parse_string_datetime_to_date" + "period_of_performance_current_end_date", + "period_of_performance_curr", + "DATE", + "parse_string_datetime_to_date", ), TransactionColumn( - "period_of_performance_start_date", "period_of_performance_star", "DATE", "parse_string_datetime_to_date" + "period_of_performance_start_date", + "period_of_performance_star", + "DATE", + "parse_string_datetime_to_date", ), TransactionColumn("transaction_unique_id", "detached_award_proc_unique", "STRING"), TransactionColumn("unique_award_key", "unique_award_key", "STRING"), diff --git a/usaspending_api/transactions/delta_models/transaction_normalized.py b/usaspending_api/transactions/delta_models/transaction_normalized.py index eb75def0aa..3279c6ffa6 100644 --- a/usaspending_api/transactions/delta_models/transaction_normalized.py +++ b/usaspending_api/transactions/delta_models/transaction_normalized.py @@ -32,7 +32,7 @@ transaction_normalized_sql_string = rf""" CREATE OR REPLACE TABLE {{DESTINATION_TABLE}} ( - {", ".join([f'{key} {val}' for key, val in TRANSACTION_NORMALIZED_COLUMNS.items()])} + {", ".join([f"{key} {val}" for key, val in TRANSACTION_NORMALIZED_COLUMNS.items()])} ) USING DELTA LOCATION 's3a://{{SPARK_S3_BUCKET}}/{{DELTA_LAKE_S3_PATH}}/{{DESTINATION_DATABASE}}/{{DESTINATION_TABLE}}' diff --git a/usaspending_api/transactions/delta_models/transaction_search.py b/usaspending_api/transactions/delta_models/transaction_search.py index 52a878b62b..9050257695 100644 --- a/usaspending_api/transactions/delta_models/transaction_search.py +++ b/usaspending_api/transactions/delta_models/transaction_search.py @@ -2,17 +2,37 @@ TRANSACTION_SEARCH_COLUMNS = { # Keys - "transaction_id": {"delta": "LONG NOT NULL", "postgres": "BIGINT NOT NULL", "gold": False}, - "award_id": {"delta": "LONG NOT NULL", "postgres": "BIGINT NOT NULL", "gold": False}, + 
"transaction_id": { + "delta": "LONG NOT NULL", + "postgres": "BIGINT NOT NULL", + "gold": False, + }, + "award_id": { + "delta": "LONG NOT NULL", + "postgres": "BIGINT NOT NULL", + "gold": False, + }, # while transaction_unique_id is gold, it can't be NULL - "transaction_unique_id": {"delta": "STRING NOT NULL", "postgres": "TEXT NOT NULL", "gold": False}, - "usaspending_unique_transaction_id": {"delta": "STRING", "postgres": "TEXT", "gold": True}, + "transaction_unique_id": { + "delta": "STRING NOT NULL", + "postgres": "TEXT NOT NULL", + "gold": False, + }, + "usaspending_unique_transaction_id": { + "delta": "STRING", + "postgres": "TEXT", + "gold": True, + }, "modification_number": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "generated_unique_award_id": {"delta": "STRING", "postgres": "TEXT", "gold": False}, # Dates "action_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, "fiscal_action_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, - "last_modified_date": {"delta": "TIMESTAMP", "postgres": "TIMESTAMP", "gold": False}, + "last_modified_date": { + "delta": "TIMESTAMP", + "postgres": "TIMESTAMP", + "gold": False, + }, "fiscal_year": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, "award_certified_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, "award_fiscal_year": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, @@ -21,37 +41,117 @@ "award_update_date": {"delta": "TIMESTAMP", "postgres": "TIMESTAMP", "gold": False}, "award_date_signed": {"delta": "DATE", "postgres": "DATE", "gold": False}, "etl_update_date": {"delta": "TIMESTAMP", "postgres": "TIMESTAMP", "gold": False}, - "period_of_performance_start_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, - "period_of_performance_current_end_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, - "initial_report_date": {"delta": "TIMESTAMP", "postgres": "TIMESTAMP", "gold": False}, + "period_of_performance_start_date": { + 
"delta": "DATE", + "postgres": "DATE", + "gold": False, + }, + "period_of_performance_current_end_date": { + "delta": "DATE", + "postgres": "DATE", + "gold": False, + }, + "initial_report_date": { + "delta": "TIMESTAMP", + "postgres": "TIMESTAMP", + "gold": False, + }, # Agencies "awarding_agency_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "awarding_toptier_agency_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "awarding_toptier_agency_name_raw": {"delta": "STRING", "postgres": "TEXT", "gold": True}, + "awarding_toptier_agency_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "awarding_toptier_agency_name_raw": { + "delta": "STRING", + "postgres": "TEXT", + "gold": True, + }, "funding_agency_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "funding_toptier_agency_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "funding_toptier_agency_name_raw": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "awarding_sub_tier_agency_c": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "awarding_subtier_agency_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "awarding_subtier_agency_name_raw": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "funding_sub_tier_agency_co": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "funding_subtier_agency_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "funding_subtier_agency_name_raw": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "awarding_toptier_agency_id": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "funding_toptier_agency_id": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, + "funding_toptier_agency_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "funding_toptier_agency_name_raw": { + "delta": "STRING", + "postgres": "TEXT", + "gold": True, + }, + "awarding_sub_tier_agency_c": { + "delta": "STRING", + 
"postgres": "TEXT", + "gold": False, + }, + "awarding_subtier_agency_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "awarding_subtier_agency_name_raw": { + "delta": "STRING", + "postgres": "TEXT", + "gold": True, + }, + "funding_sub_tier_agency_co": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "funding_subtier_agency_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "funding_subtier_agency_name_raw": { + "delta": "STRING", + "postgres": "TEXT", + "gold": True, + }, + "awarding_toptier_agency_id": { + "delta": "INTEGER", + "postgres": "INTEGER", + "gold": False, + }, + "funding_toptier_agency_id": { + "delta": "INTEGER", + "postgres": "INTEGER", + "gold": False, + }, "awarding_agency_id": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, "funding_agency_id": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "awarding_toptier_agency_abbreviation": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "funding_toptier_agency_abbreviation": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "awarding_subtier_agency_abbreviation": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "funding_subtier_agency_abbreviation": {"delta": "STRING", "postgres": "TEXT", "gold": False}, + "awarding_toptier_agency_abbreviation": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "funding_toptier_agency_abbreviation": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "awarding_subtier_agency_abbreviation": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "funding_subtier_agency_abbreviation": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, "awarding_office_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "awarding_office_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "funding_office_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, 
"funding_office_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, # Typing # while is_fpds is gold, it also can't be NULL - "is_fpds": {"delta": "BOOLEAN NOT NULL", "postgres": "BOOLEAN NOT NULL", "gold": False}, + "is_fpds": { + "delta": "BOOLEAN NOT NULL", + "postgres": "BOOLEAN NOT NULL", + "gold": False, + }, "type_raw": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "type_description_raw": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "type": {"delta": "STRING", "postgres": "TEXT", "gold": False}, @@ -60,17 +160,57 @@ "action_type_description": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "award_category": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "transaction_description": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "business_categories": {"delta": "ARRAY", "postgres": "TEXT[]", "gold": False}, + "business_categories": { + "delta": "ARRAY", + "postgres": "TEXT[]", + "gold": False, + }, # Amounts - "award_amount": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": False}, - "generated_pragmatic_obligation": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": False}, - "federal_action_obligation": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": False}, - "original_loan_subsidy_cost": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": False}, - "face_value_loan_guarantee": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": False}, - "indirect_federal_sharing": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": True}, - "funding_amount": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": True}, - "total_funding_amount": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": True}, - "non_federal_funding_amount": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": True}, + "award_amount": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": False, + }, + 
"generated_pragmatic_obligation": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": False, + }, + "federal_action_obligation": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": False, + }, + "original_loan_subsidy_cost": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": False, + }, + "face_value_loan_guarantee": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": False, + }, + "indirect_federal_sharing": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": True, + }, + "funding_amount": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": True, + }, + "total_funding_amount": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": True, + }, + "non_federal_funding_amount": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": True, + }, # Recipient "recipient_hash": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "recipient_levels": {"delta": "ARRAY", "postgres": "TEXT[]", "gold": False}, @@ -82,25 +222,81 @@ "parent_uei": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "parent_recipient_name_raw": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "parent_recipient_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "parent_recipient_unique_id": {"delta": "STRING", "postgres": "TEXT", "gold": False}, + "parent_recipient_unique_id": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, # Recipient Location - "recipient_location_country_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_country_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_state_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_state_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_state_fips": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - 
"recipient_location_state_population": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "recipient_location_county_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_county_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_county_population": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "recipient_location_congressional_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_congressional_population": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "recipient_location_congressional_code_current": {"delta": "STRING", "postgres": "TEXT", "gold": True}, + "recipient_location_country_code": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_country_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_state_code": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_state_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_state_fips": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_state_population": { + "delta": "INTEGER", + "postgres": "INTEGER", + "gold": False, + }, + "recipient_location_county_code": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_county_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_county_population": { + "delta": "INTEGER", + "postgres": "INTEGER", + "gold": False, + }, + "recipient_location_congressional_code": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_congressional_population": { + "delta": "INTEGER", + "postgres": "INTEGER", + "gold": False, + }, + "recipient_location_congressional_code_current": { + "delta": "STRING", + "postgres": "TEXT", + "gold": True, + }, 
"recipient_location_zip5": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "legal_entity_zip4": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "legal_entity_zip_last4": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "legal_entity_city_code": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "recipient_location_city_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, + "recipient_location_city_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, "legal_entity_address_line1": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "legal_entity_address_line2": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "legal_entity_address_line3": {"delta": "STRING", "postgres": "TEXT", "gold": True}, @@ -108,7 +304,11 @@ "legal_entity_foreign_descr": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "legal_entity_foreign_posta": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "legal_entity_foreign_provi": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "recipient_location_county_fips": {"delta": "STRING", "postgres": "TEXT", "gold": False}, + "recipient_location_county_fips": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, # Place of Performance "place_of_performance_code": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "place_of_performance_scope": {"delta": "STRING", "postgres": "TEXT", "gold": True}, @@ -122,8 +322,16 @@ "pop_county_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "pop_county_population": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, "pop_congressional_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "pop_congressional_population": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "pop_congressional_code_current": {"delta": "STRING", "postgres": "TEXT", "gold": True}, + "pop_congressional_population": { + "delta": "INTEGER", + "postgres": "INTEGER", + "gold": False, + }, + 
"pop_congressional_code_current": { + "delta": "STRING", + "postgres": "TEXT", + "gold": True, + }, "pop_zip5": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "place_of_performance_zip4a": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "place_of_perform_zip_last4": {"delta": "STRING", "postgres": "TEXT", "gold": True}, @@ -131,22 +339,50 @@ "place_of_performance_forei": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "pop_county_fips": {"delta": "STRING", "postgres": "TEXT", "gold": False}, # Accounts - "treasury_account_identifiers": {"delta": "ARRAY", "postgres": "TEXT[]", "gold": False}, + "treasury_account_identifiers": { + "delta": "ARRAY", + "postgres": "TEXT[]", + "gold": False, + }, "tas_paths": {"delta": "ARRAY", "postgres": "TEXT[]", "gold": False}, "tas_components": {"delta": "ARRAY", "postgres": "TEXT[]", "gold": False}, "federal_accounts": {"delta": "STRING", "postgres": "JSONB", "gold": False}, - "disaster_emergency_fund_codes": {"delta": "ARRAY", "postgres": "TEXT[]", "gold": False}, + "disaster_emergency_fund_codes": { + "delta": "ARRAY", + "postgres": "TEXT[]", + "gold": False, + }, # Officer Amounts "officer_1_name": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "officer_1_amount": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": True}, + "officer_1_amount": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": True, + }, "officer_2_name": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "officer_2_amount": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": True}, + "officer_2_amount": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": True, + }, "officer_3_name": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "officer_3_amount": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": True}, + "officer_3_amount": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": True, + }, "officer_4_name": {"delta": 
"STRING", "postgres": "TEXT", "gold": True}, - "officer_4_amount": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": True}, + "officer_4_amount": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": True, + }, "officer_5_name": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "officer_5_amount": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": True}, + "officer_5_amount": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": True, + }, # Exclusively FABS "published_fabs_id": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, "afa_generated_unique": {"delta": "STRING", "postgres": "TEXT", "gold": False}, @@ -167,23 +403,67 @@ "sai_number": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "uri": {"delta": "STRING", "postgres": "TEXT", "gold": False}, # Exclusively FPDS - "detached_award_procurement_id": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "detached_award_proc_unique": {"delta": "STRING", "postgres": "TEXT", "gold": False}, + "detached_award_procurement_id": { + "delta": "INTEGER", + "postgres": "INTEGER", + "gold": False, + }, + "detached_award_proc_unique": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, "a_76_fair_act_action": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "a_76_fair_act_action_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "agency_id": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "airport_authority": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "alaskan_native_owned_corpo": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "alaskan_native_servicing_i": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "american_indian_owned_busi": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "asian_pacific_american_own": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "alaskan_native_owned_corpo": { + "delta": "BOOLEAN", + "postgres": 
"BOOLEAN", + "gold": True, + }, + "alaskan_native_servicing_i": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "american_indian_owned_busi": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "asian_pacific_american_own": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "base_and_all_options_value": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "base_exercised_options_val": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "black_american_owned_busin": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "c1862_land_grant_college": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "c1890_land_grant_college": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "c1994_land_grant_college": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "c8a_program_participant": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "black_american_owned_busin": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "c1862_land_grant_college": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "c1890_land_grant_college": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "c1994_land_grant_college": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "c8a_program_participant": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "cage_code": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "city_local_government": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, "clinger_cohen_act_planning": {"delta": "STRING", "postgres": "TEXT", "gold": True}, @@ -192,8 +472,16 @@ "commercial_item_acquisitio": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "commercial_item_test_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "commercial_item_test_progr": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "community_developed_corpor": 
{"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "community_development_corp": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "community_developed_corpor": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "community_development_corp": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "consolidated_contract": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "consolidated_contract_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "construction_wage_rat_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, @@ -209,8 +497,16 @@ "contracting_officers_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "contracting_officers_deter": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "contracts": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "corporate_entity_not_tax_e": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "corporate_entity_tax_exemp": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "corporate_entity_not_tax_e": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "corporate_entity_tax_exemp": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "cost_accounting_stand_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "cost_accounting_standards": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "cost_or_pricing_data": {"delta": "STRING", "postgres": "TEXT", "gold": True}, @@ -218,17 +514,37 @@ "council_of_governments": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, "country_of_product_or_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "country_of_product_or_serv": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "county_local_government": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "county_local_government": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "current_total_value_award": 
{"delta": "STRING", "postgres": "TEXT", "gold": True}, "dod_claimant_prog_cod_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "dod_claimant_program_code": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "domestic_or_foreign_e_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "domestic_or_foreign_entity": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "domestic_shelter": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "dot_certified_disadvantage": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "economically_disadvantaged": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "educational_institution": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "emerging_small_business": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "dot_certified_disadvantage": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "economically_disadvantaged": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "educational_institution": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "emerging_small_business": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "epa_designated_produc_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "epa_designated_product": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "evaluated_preference": {"delta": "STRING", "postgres": "TEXT", "gold": True}, @@ -240,40 +556,100 @@ "fed_biz_opps": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "fed_biz_opps_description": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "federal_agency": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "federally_funded_research": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "for_profit_organization": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "federally_funded_research": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": 
True, + }, + "for_profit_organization": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "foreign_funding": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "foreign_funding_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "foreign_government": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "foreign_owned_and_located": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "foreign_owned_and_located": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "foundation": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, "government_furnished_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "government_furnished_prope": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "grants": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "hispanic_american_owned_bu": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "hispanic_servicing_institu": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "historically_black_college": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "historically_underutilized": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "hispanic_american_owned_bu": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "hispanic_servicing_institu": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "historically_black_college": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "historically_underutilized": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "hospital_flag": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "housing_authorities_public": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "housing_authorities_public": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "idv_type": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "idv_type_description": {"delta": "STRING", 
"postgres": "TEXT", "gold": True}, - "indian_tribe_federally_rec": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "indian_tribe_federally_rec": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "information_technolog_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "information_technology_com": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "inherently_government_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "inherently_government_func": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "inter_municipal_local_gove": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "inter_municipal_local_gove": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "interagency_contract_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "interagency_contracting_au": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "international_organization": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "international_organization": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "interstate_entity": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "joint_venture_economically": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "joint_venture_women_owned": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "joint_venture_economically": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "joint_venture_women_owned": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "labor_standards": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "labor_standards_descrip": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "labor_surplus_area_firm": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "limited_liability_corporat": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "labor_surplus_area_firm": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", 
+ "gold": True, + }, + "limited_liability_corporat": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "local_area_set_aside": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "local_area_set_aside_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "local_government_owned": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, @@ -282,31 +658,63 @@ "materials_supplies_article": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "materials_supplies_descrip": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "minority_institution": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "minority_owned_business": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "minority_owned_business": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "multi_year_contract": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "multi_year_contract_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "multiple_or_single_aw_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "multiple_or_single_award_i": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "municipality_local_governm": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "municipality_local_governm": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "naics_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "naics_description": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "national_interest_action": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "national_interest_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "native_american_owned_busi": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "native_hawaiian_owned_busi": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "native_hawaiian_servicing": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "native_american_owned_busi": { + "delta": "BOOLEAN", + 
"postgres": "BOOLEAN", + "gold": True, + }, + "native_hawaiian_owned_busi": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "native_hawaiian_servicing": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "nonprofit_organization": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, "number_of_actions": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "number_of_offers_received": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "ordering_period_end_date": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "organizational_type": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "other_minority_owned_busin": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "other_not_for_profit_organ": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "other_minority_owned_busin": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "other_not_for_profit_organ": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "other_statutory_authority": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "other_than_full_and_o_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "other_than_full_and_open_c": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "parent_award_id": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "partnership_or_limited_lia": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "partnership_or_limited_lia": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "performance_based_se_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "performance_based_service": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "period_of_perf_potential_e": {"delta": "STRING", "postgres": "TEXT", "gold": True}, @@ -317,16 +725,28 @@ "port_authority": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, "potential_total_value_awar": {"delta": "STRING", "postgres": "TEXT", "gold": True}, 
"price_evaluation_adjustmen": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "private_university_or_coll": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "private_university_or_coll": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "product_or_service_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "product_or_service_description": {"delta": "STRING", "postgres": "TEXT", "gold": False}, + "product_or_service_description": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, "program_acronym": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "program_system_or_equ_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "program_system_or_equipmen": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "pulled_from": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "purchase_card_as_paym_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "purchase_card_as_payment_m": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "receives_contracts_and_gra": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "receives_contracts_and_gra": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "recovered_materials_s_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "recovered_materials_sustai": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "referenced_idv_agency_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, @@ -340,33 +760,85 @@ "research_description": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "sam_exception": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "sam_exception_description": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "sba_certified_8_a_joint_ve": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "school_district_local_gove": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "sba_certified_8_a_joint_ve": { + "delta": "BOOLEAN", + "postgres": 
"BOOLEAN", + "gold": True, + }, + "school_district_local_gove": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "school_of_forestry": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, "sea_transportation": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "sea_transportation_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "self_certified_small_disad": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "service_disabled_veteran_o": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "small_agricultural_coopera": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "small_business_competitive": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "small_disadvantaged_busine": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "self_certified_small_disad": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "service_disabled_veteran_o": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "small_agricultural_coopera": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "small_business_competitive": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "small_disadvantaged_busine": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "sole_proprietorship": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, "solicitation_date": {"delta": "DATE", "postgres": "DATE", "gold": True}, "solicitation_identifier": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "solicitation_procedur_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "solicitation_procedures": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "state_controlled_instituti": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "subchapter_s_corporation": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "subcontinent_asian_asian_i": {"delta": "BOOLEAN", "postgres": "BOOLEAN", 
"gold": True}, + "state_controlled_instituti": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "subchapter_s_corporation": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "subcontinent_asian_asian_i": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "subcontracting_plan": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "subcontracting_plan_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "the_ability_one_program": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "the_ability_one_program": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "total_obligated_amount": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "township_local_government": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "township_local_government": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "transaction_number": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "transit_authority": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, "tribal_college": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "tribally_owned_business": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "tribally_owned_business": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "type_of_contract_pricing": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "type_of_contract_pric_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "type_of_idc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, @@ -387,7 +859,11 @@ "veterinary_college": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, "veterinary_hospital": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, "woman_owned_business": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "women_owned_small_business": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "women_owned_small_business": { + 
"delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "program_activities": {"delta": "STRING", "postgres": "JSONB", "gold": False}, } DELTA_ONLY_COLUMNS = { @@ -397,14 +873,18 @@ **{k: v["delta"] for k, v in TRANSACTION_SEARCH_COLUMNS.items()}, **DELTA_ONLY_COLUMNS, } -TRANSACTION_SEARCH_POSTGRES_COLUMNS = {k: v["postgres"] for k, v in TRANSACTION_SEARCH_COLUMNS.items() if not v["gold"]} -TRANSACTION_SEARCH_POSTGRES_GOLD_COLUMNS = {k: v["postgres"] for k, v in TRANSACTION_SEARCH_COLUMNS.items()} +TRANSACTION_SEARCH_POSTGRES_COLUMNS = { + k: v["postgres"] for k, v in TRANSACTION_SEARCH_COLUMNS.items() if not v["gold"] +} +TRANSACTION_SEARCH_POSTGRES_GOLD_COLUMNS = { + k: v["postgres"] for k, v in TRANSACTION_SEARCH_COLUMNS.items() +} ALL_AWARD_TYPES = list(award_type_mapping.keys()) transaction_search_create_sql_string = rf""" CREATE OR REPLACE TABLE {{DESTINATION_TABLE}} ( - {", ".join([f'{key} {val}' for key, val in TRANSACTION_SEARCH_DELTA_COLUMNS.items()])} + {", ".join([f"{key} {val}" for key, val in TRANSACTION_SEARCH_DELTA_COLUMNS.items()])} ) USING DELTA LOCATION 's3a://{{SPARK_S3_BUCKET}}/{{DELTA_LAKE_S3_PATH}}/{{DESTINATION_DATABASE}}/{{DESTINATION_TABLE}}' From 8c56255b580f02e1362b9c8290e3728ecd96fe11 Mon Sep 17 00:00:00 2001 From: Seth Stoudenmier Date: Tue, 3 Feb 2026 12:12:35 -0500 Subject: [PATCH 37/59] [DEV-14146] Update use of config for local in Spark --- usaspending_api/common/etl/spark.py | 291 ++++++++++++++++++---------- 1 file changed, 191 insertions(+), 100 deletions(-) diff --git a/usaspending_api/common/etl/spark.py b/usaspending_api/common/etl/spark.py index 30770ef124..5018aa439d 100644 --- a/usaspending_api/common/etl/spark.py +++ b/usaspending_api/common/etl/spark.py @@ -10,19 +10,34 @@ import os import shutil import time -from collections import namedtuple from itertools import chain -from typing import List import duckdb from duckdb.experimental.spark.sql import SparkSession as DuckDBSparkSession from 
duckdb.experimental.spark.sql.dataframe import DataFrame as DuckDBDataFrame from pyspark.sql import DataFrame, SparkSession -from pyspark.sql.functions import col, concat, concat_ws, expr, lit, regexp_replace, to_date, transform, when +from pyspark.sql.functions import ( + col, + concat, + concat_ws, + expr, + lit, + regexp_replace, + to_date, + transform, + when, +) from pyspark.sql.types import ArrayType, DecimalType, StringType, StructType -from usaspending_api.accounts.models import AppropriationAccountBalances, FederalAccount, TreasuryAppropriationAccount -from usaspending_api.common.helpers.s3_helpers import rename_s3_object, retrieve_s3_bucket_object_list +from usaspending_api.accounts.models import ( + AppropriationAccountBalances, + FederalAccount, + TreasuryAppropriationAccount, +) +from usaspending_api.common.helpers.s3_helpers import ( + rename_s3_object, + retrieve_s3_bucket_object_list, +) from usaspending_api.common.helpers.spark_helpers import ( get_broker_jdbc_url, get_jdbc_connection_properties, @@ -30,7 +45,9 @@ ) from usaspending_api.config import CONFIG from usaspending_api.download.filestreaming.download_generation import EXCEL_ROW_LIMIT -from usaspending_api.financial_activities.models import FinancialAccountsByProgramActivityObjectClass +from usaspending_api.financial_activities.models import ( + FinancialAccountsByProgramActivityObjectClass, +) from usaspending_api.recipient.models import StateData from usaspending_api.references.models import ( CGAC, @@ -52,9 +69,15 @@ ToptierAgency, ZipsGrouped, ) -from usaspending_api.reporting.models import ReportingAgencyMissingTas, ReportingAgencyOverview +from usaspending_api.reporting.models import ( + ReportingAgencyMissingTas, + ReportingAgencyOverview, +) from usaspending_api.settings import CSV_LOCAL_PATH, IS_LOCAL, USASPENDING_AWS_REGION -from usaspending_api.submissions.models import DABSSubmissionWindowSchedule, SubmissionAttributes +from usaspending_api.submissions.models import ( + 
DABSSubmissionWindowSchedule, + SubmissionAttributes, +) MAX_PARTITIONS = CONFIG.SPARK_MAX_PARTITIONS _USAS_RDS_REF_TABLES = [ @@ -87,12 +110,17 @@ ZipsGrouped, ] -_BROKER_REF_TABLES = ["cd_state_grouped", "cd_zips_grouped", "cd_county_grouped", "cd_city_grouped"] +_BROKER_REF_TABLES = [ + "cd_state_grouped", + "cd_zips_grouped", + "cd_county_grouped", + "cd_city_grouped", +] logger = logging.getLogger(__name__) -def extract_db_data_frame( +def extract_db_data_frame( # noqa: PLR0912, PLR0913, PLR0915 spark: SparkSession, conn_props: dict, jdbc_url: str, @@ -120,19 +148,25 @@ def extract_db_data_frame( min_max_df = spark.read.jdbc(url=jdbc_url, table=min_max_sql, properties=conn_props) if is_date_partitioning_col: # Ensure it is a date (e.g. if date in string format, convert to date) - min_max_df = min_max_df.withColumn(min_max_df.columns[0], to_date(min_max_df[0])).withColumn( - min_max_df.columns[1], to_date(min_max_df[1]) - ) + min_max_df = min_max_df.withColumn( + min_max_df.columns[0], to_date(min_max_df[0]) + ).withColumn(min_max_df.columns[1], to_date(min_max_df[1])) min_max = min_max_df.first() min_val = min_max[0] max_val = min_max[1] count = min_max[2] if is_numeric_partitioning_col: - logger.info(f"Deriving partitions from numeric ranges across column: {partitioning_col}") + logger.info( + f"Deriving partitions from numeric ranges across column: {partitioning_col}" + ) # Take count as partition if using a spotty range, and count of rows is less than range of IDs - partitions = int(min((int(max_val) - int(min_val)), int(count)) / (partition_rows + 1)) - logger.info(f"Derived {partitions} partitions from numeric ranges across column: {partitioning_col}") + partitions = int( + min((int(max_val) - int(min_val)), int(count)) / (partition_rows + 1) + ) + logger.info( + f"Derived {partitions} partitions from numeric ranges across column: {partitioning_col}" + ) if partitions > MAX_PARTITIONS: fail_msg = ( f"Aborting job run because {partitions} partitions " @@ 
-141,7 +175,9 @@ def extract_db_data_frame( logger.fatal(fail_msg) raise RuntimeError(fail_msg) - logger.info(f"{partitions} partitions to extract at approximately {partition_rows} rows each.") + logger.info( + f"{partitions} partitions to extract at approximately {partition_rows} rows each." + ) data_df = spark.read.options(customSchema=custom_schema).jdbc( url=jdbc_url, @@ -160,7 +196,9 @@ def extract_db_data_frame( # if that distinct count is less than MAX_PARTITIONS date_delta = max_val - min_val partitions = date_delta.days + 1 - if (count / partitions) < 0.6 or True: # Forcing this path, see comment in else below + if ( + count / partitions + ) < 0.6 or True: # Forcing this path, see comment in else below logger.info( f"Partitioning by date in col {partitioning_col} would yield {partitions} but only {count} " f"distinct dates in the dataset. This partition range is too sparse. Going to query the " @@ -179,7 +217,9 @@ def extract_db_data_frame( table=f"(select distinct {partitioning_col} from {table}) distinct_dates", properties=conn_props, ) - partition_sql_predicates = [f"{partitioning_col} = '{str(row[0])}'" for row in date_df.collect()] + partition_sql_predicates = [ + f"{partitioning_col} = '{str(row[0])}'" for row in date_df.collect() + ] logger.info( f"Built {len(partition_sql_predicates)} SQL partition predicates " f"to yield data partitions, based on distinct values of {partitioning_col} " @@ -208,7 +248,9 @@ def extract_db_data_frame( # NOTE: Have to use integer (really a Long) representation of the Date, since that is what the Scala # ... implementation is expecting: https://github.com/apache/spark/blob/c561ee686551690bee689f37ae5bbd75119994d6/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala#L192-L207 # TODO: THIS DOES NOT SEEM TO WORK WITH DATES for lowerBound and upperBound. 
Forcing use of predicates - raise NotImplementedError("Cannot read JDBC partitions with date lower/upper bound") + raise NotImplementedError( + "Cannot read JDBC partitions with date lower/upper bound" + ) data_df = spark.read.jdbc( url=jdbc_url, @@ -235,12 +277,20 @@ def extract_db_data_frame( raise RuntimeError(fail_msg) # SQL usable in Postgres to get a distinct 32-bit int from an md5 hash of text - pg_int_from_hash = f"('x'||substr(md5({partitioning_col}::text),1,8))::bit(32)::int" + pg_int_from_hash = ( + f"('x'||substr(md5({partitioning_col}::text),1,8))::bit(32)::int" + ) # int could be signed. This workaround SQL gets unsigned modulus from the hash int - non_neg_modulo = f"mod({partitions} + mod({pg_int_from_hash}, {partitions}), {partitions})" - partition_sql_predicates = [f"{non_neg_modulo} = {p}" for p in range(0, partitions)] + non_neg_modulo = ( + f"mod({partitions} + mod({pg_int_from_hash}, {partitions}), {partitions})" + ) + partition_sql_predicates = [ + f"{non_neg_modulo} = {p}" for p in range(0, partitions) + ] - logger.info(f"{partitions} partitions to extract by predicates at approximately {partition_rows} rows each.") + logger.info( + f"{partitions} partitions to extract by predicates at approximately {partition_rows} rows each." + ) data_df = spark.read.jdbc( url=jdbc_url, @@ -309,12 +359,19 @@ def load_delta_table( # NOTE: Best to (only?) use .saveAsTable(name=) rather than .insertInto(tableName=) # ... 
The insertInto does not seem to align/merge columns from DataFrame to table columns (defaults to column order) save_mode = "overwrite" if overwrite else "append" - source_df.write.format(source="delta").mode(saveMode=save_mode).saveAsTable(name=delta_table_name) + source_df.write.format(source="delta").mode(saveMode=save_mode).saveAsTable( + name=delta_table_name + ) logger.info(f"LOAD (FINISH): Loaded data into Delta table {delta_table_name}") def load_es_index( - spark: SparkSession, source_df: DataFrame, base_config: dict, index_name: str, routing: str, doc_id: str + spark: SparkSession, + source_df: DataFrame, + base_config: dict, + index_name: str, + routing: str, + doc_id: str, ) -> None: # pragma: no cover -- will be used and tested eventually index_config = base_config.copy() index_config["es.resource.write"] = index_name @@ -328,10 +385,14 @@ def load_es_index( jvm_data_df = source_df._jdf # Call the elasticsearch-hadoop method to write the DF to ES via the _jvm conduit on the SparkContext - spark.sparkContext._jvm.org.elasticsearch.spark.sql.EsSparkSQL.saveToEs(jvm_data_df, jvm_es_config_map) + spark.sparkContext._jvm.org.elasticsearch.spark.sql.EsSparkSQL.saveToEs( + jvm_data_df, jvm_es_config_map + ) -def merge_delta_table(spark: SparkSession, source_df: DataFrame, delta_table_name: str, merge_column: str): +def merge_delta_table( + spark: SparkSession, source_df: DataFrame, delta_table_name: str, merge_column: str +) -> None: source_df.create_or_replace_temporary_view("temp_table") spark.sql( @@ -345,7 +406,11 @@ def merge_delta_table(spark: SparkSession, source_df: DataFrame, delta_table_nam def diff( - left: DataFrame, right: DataFrame, unique_key_col="id", compare_cols=None, include_unchanged_rows=False + left: DataFrame, + right: DataFrame, + unique_key_col: str = "id", + compare_cols: list[str] | None = None, + include_unchanged_rows: bool = False, ) -> DataFrame: """Compares two Spark DataFrames that share a schema and returns row-level 
differences in a DataFrame @@ -413,7 +478,9 @@ def diff( if unique_key_col in compare_cols: compare_cols.remove(unique_key_col) - distinct_stmts = " ".join([f"WHEN l.{c} IS DISTINCT FROM r.{c} THEN 'C'" for c in compare_cols]) + distinct_stmts = " ".join( + [f"WHEN l.{c} IS DISTINCT FROM r.{c} THEN 'C'" for c in compare_cols] + ) compare_expr = f""" CASE WHEN l.exists IS NULL THEN 'I' @@ -426,14 +493,26 @@ def diff( differences = ( left.withColumn("exists", lit(1)) .alias("l") - .join(right.withColumn("exists", lit(1)).alias("r"), left[unique_key_col] == right[unique_key_col], "fullouter") + .join( + right.withColumn("exists", lit(1)).alias("r"), + left[unique_key_col] == right[unique_key_col], + "fullouter", + ) .withColumn("diff", expr(compare_expr)) ) # Put "diff" col first, then follow by the l and r value for each column, for all columns compared cols_to_show = ( ["diff"] + [f"l.{unique_key_col}", f"r.{unique_key_col}"] - + list(chain(*zip([f"l.{c}" for c in compare_cols], [f"r.{c}" for c in compare_cols], strict=False))) + + list( + chain( + *zip( + [f"l.{c}" for c in compare_cols], + [f"r.{c}" for c in compare_cols], + strict=False, + ) + ) + ) ) differences = differences.select(*cols_to_show) if not include_unchanged_rows: @@ -446,14 +525,16 @@ def convert_decimal_cols_to_string(df: DataFrame) -> DataFrame: for f in df.schema.fields: if not isinstance(f.dataType, DecimalType): continue - df_no_decimal = df_no_decimal.withColumn(f.name, df_no_decimal[f.name].cast(StringType())) + df_no_decimal = df_no_decimal.withColumn( + f.name, df_no_decimal[f.name].cast(StringType()) + ) return df_no_decimal def convert_array_cols_to_string( df: DataFrame, - is_postgres_array_format=False, - is_for_csv_export=False, + is_postgres_array_format: bool = False, + is_for_csv_export: bool = False, ) -> DataFrame: """For each column that is an Array of ANYTHING, transform it to a string-ified representation of that Array. @@ -482,7 +563,8 @@ def convert_array_cols_to_string( 2. 
Escape any quotes inside the array element with backslash. - A case that involves all of this will yield CSV field value like this when viewed in a text editor, assuming Spark CSV options are: quote='"', escape='"' (the default is for it to match quote) - ...,"{""{\""simple\"": \""elem1\"", \""other\"": \""elem1\""}"", ""{\""simple\"": \""elem2\"", \""other\"": \""elem2\""}""}",... + ...,"{""{\""simple\"": \""elem1\"", \""other\"": \""elem1\""}"", + ""{\""simple\"": \""elem2\"", \""other\"": \""elem2\""}""}",... """ arr_open_bracket = "[" arr_close_bracket = "]" @@ -517,10 +599,14 @@ def convert_array_cols_to_string( # Special handling in case of data that already has either a quote " or backslash \ # inside an array element # First replace any single backslash character \ with TWO \\ (an escaped backslash) - # Then replace any quote " character with \" (escaped quote, inside a quoted array elem) + # Then replace quote " character with \" (escaped quote, inside a quoted array elem) # NOTE: these regexp_replace get sent down to a Java replaceAll, which will require # FOUR backslashes to represent ONE - regexp_replace(regexp_replace(c, "\\\\", "\\\\\\\\"), '"', '\\\\"'), + regexp_replace( + regexp_replace(c, "\\\\", "\\\\\\\\"), + '"', + '\\\\"', + ), lit('"'), ), ) @@ -534,14 +620,14 @@ def convert_array_cols_to_string( return df_no_arrays -def build_ref_table_name_list(): +def build_ref_table_name_list() -> list[str]: return [rds_ref_table._meta.db_table for rds_ref_table in _USAS_RDS_REF_TABLES] -def _generate_global_view_sql_strings(tables: List[str], jdbc_url: str) -> List[str]: +def _generate_global_view_sql_strings(tables: list[str], jdbc_url: str) -> list[str]: """Generates the CREATE OR REPLACE SQL strings for each of the given tables and JDBC URL""" - sql_strings: List[str] = [] + sql_strings: list[str] = [] jdbc_conn_props = get_jdbc_connection_properties() for table_name in tables: @@ -561,7 +647,9 @@ def _generate_global_view_sql_strings(tables: 
List[str], jdbc_url: str) -> List[ return sql_strings -def create_ref_temp_views(spark: SparkSession | DuckDBSparkSession, create_broker_views: bool = False): +def create_ref_temp_views( # noqa: PLR0912 + spark: SparkSession | DuckDBSparkSession, create_broker_views: bool = False +) -> None: """Create global temporary Spark reference views that sit atop remote PostgreSQL RDS tables Setting create_broker_views to True will create views for all tables list in _BROKER_REF_TABLES Note: They will all be listed under global_temp.{table_name} @@ -574,20 +662,21 @@ def create_ref_temp_views(spark: SparkSession | DuckDBSparkSession, create_broke # Create USAS temp views rds_ref_tables = build_ref_table_name_list() - logger.info(f"Creating the following tables under the global_temp database: {rds_ref_tables}") + logger.info( + f"Creating the following tables under the global_temp database: {rds_ref_tables}" + ) match isinstance(spark, DuckDBSparkSession): case True: logger.info("Creating ref temp views using DuckDB") - if IS_LOCAL: spark.sql( f""" CREATE OR REPLACE SECRET ( TYPE s3, PROVIDER config, - KEY_ID '{CONFIG.AWS_ACCESS_KEY}', - SECRET '{CONFIG.AWS_SECRET_KEY}', + KEY_ID '{CONFIG.AWS_ACCESS_KEY.get_secret_value()}', + SECRET '{CONFIG.AWS_SECRET_KEY.get_secret_value()}', ENDPOINT '{CONFIG.AWS_S3_ENDPOINT}', URL_STYLE 'path', USE_SSL 'false' @@ -596,7 +685,9 @@ def create_ref_temp_views(spark: SparkSession | DuckDBSparkSession, create_broke ) else: # DuckDB will prepend the HTTP or HTTPS so we need to strip it from the AWS endpoint URL - endpoint_url = CONFIG.AWS_S3_ENDPOINT.replace("http://", "").replace("https://", "") + endpoint_url = CONFIG.AWS_S3_ENDPOINT.replace("http://", "").replace( + "https://", "" + ) spark.sql( f""" CREATE OR REPLACE SECRET ( @@ -610,15 +701,16 @@ def create_ref_temp_views(spark: SparkSession | DuckDBSparkSession, create_broke _download_delta_tables = [ {"schema": "rpt", "table_name": "account_balances_download"}, - {"schema": "rpt", 
"table_name": "object_class_program_activity_download"}, + { + "schema": "rpt", + "table_name": "object_class_program_activity_download", + }, ] # The DuckDB Delta extension is needed to interact with DeltaLake tables spark.sql("LOAD delta; CREATE SCHEMA IF NOT EXISTS rpt;") for table in _download_delta_tables: - s3_path = ( - f"s3://{CONFIG.SPARK_S3_BUCKET}/{CONFIG.DELTA_LAKE_S3_PATH}/{table['schema']}/{table['table_name']}" - ) + s3_path = f"s3://{CONFIG.SPARK_S3_BUCKET}/{CONFIG.DELTA_LAKE_S3_PATH}/{table['schema']}/{table['table_name']}" try: spark.sql( f""" @@ -626,21 +718,31 @@ def create_ref_temp_views(spark: SparkSession | DuckDBSparkSession, create_broke SELECT * FROM delta_scan('{s3_path}'); """ ) - logger.info(f"Successfully created table {table['schema']}.{table['table_name']}") - except duckdb.IOException: + logger.info( + f"Successfully created table {table['schema']}.{table['table_name']}" + ) + except duckdb.IOException as exc: logger.exception(f"Failed to create table {table['table_name']}") - raise RuntimeError(f"Failed to create table {table['table_name']}") + raise RuntimeError( + f"Failed to create table {table['table_name']}" + ) from exc # The DuckDB Postgres extension is needed to connect to the USAS Postgres DB spark.sql("LOAD postgres; CREATE SCHEMA IF NOT EXISTS global_temp;") - spark.sql(f"ATTACH '{CONFIG.DATABASE_URL}' AS usas (TYPE postgres, READ_ONLY);") + spark.sql( + f"ATTACH '{CONFIG.DATABASE_URL}' AS usas (TYPE postgres, READ_ONLY);" + ) for table in rds_ref_tables: try: - spark.sql(f"CREATE OR REPLACE VIEW global_temp.{table} AS SELECT * FROM usas.public.{table};") - except duckdb.CatalogException: + spark.sql( + f"CREATE OR REPLACE VIEW global_temp.{table} AS SELECT * FROM usas.public.{table};" + ) + except duckdb.CatalogException as exc: logger.exception(f"Failed to create view {table} for {table}") - raise RuntimeError(f"Failed to create view {table} for {table}") + raise RuntimeError( + f"Failed to create view {table} for 
{table}" + ) from exc if create_broker_views: spark.sql( @@ -653,10 +755,14 @@ def create_ref_temp_views(spark: SparkSession | DuckDBSparkSession, create_broke ) for table in _BROKER_REF_TABLES: try: - spark.sql(f"CREATE OR REPLACE VIEW global_temp.{table} AS SELECT * FROM broker.public.{table};") - except duckdb.CatalogException: + spark.sql( + f"CREATE OR REPLACE VIEW global_temp.{table} AS SELECT * FROM broker.public.{table};" + ) + except duckdb.CatalogException as exc: logger.exception(f"Failed to create view {table} for {table}") - raise RuntimeError(f"Failed to create view {table} for {table}") + raise RuntimeError( + f"Failed to create view {table} for {table}" + ) from exc case False: logger.info("Creating ref temp views using Spark") @@ -682,14 +788,14 @@ def create_ref_temp_views(spark: SparkSession | DuckDBSparkSession, create_broke logger.info("Created the reference views in the global_temp database") -def write_csv_file( +def write_csv_file( # noqa: PLR0913 spark: SparkSession, df: DataFrame, parts_dir: str, - max_records_per_file=EXCEL_ROW_LIMIT, - overwrite=True, - logger=None, - delimiter=",", + max_records_per_file: int = EXCEL_ROW_LIMIT, + overwrite: bool = True, + logger: logging.Logger | None = None, + delimiter: str = ",", ) -> int: """Write DataFrame data to CSV file parts. Args: @@ -712,7 +818,9 @@ def write_csv_file( if fs.exists(parts_dir_path): fs.delete(parts_dir_path, True) start = time.time() - logger.info(f"Writing source data DataFrame to csv part files for file {parts_dir}...") + logger.info( + f"Writing source data DataFrame to csv part files for file {parts_dir}..." 
+ ) df_record_count = df.count() num_partitions = math.ceil(df_record_count / max_records_per_file) or 1 df.repartition(num_partitions).write.options( @@ -730,7 +838,9 @@ def write_csv_file( sep=delimiter, ) logger.info(f"{parts_dir} contains {df_record_count:,} rows of data") - logger.info(f"Wrote source data DataFrame to csv part files in {(time.time() - start):3f}s") + logger.info( + f"Wrote source data DataFrame to csv part files in {(time.time() - start):3f}s" + ) return df_record_count @@ -766,7 +876,9 @@ def write_csv_file_duckdb( full_file_paths = [] - logger.info(f"Writing source data DataFrame to csv files for file {download_file_name}") + logger.info( + f"Writing source data DataFrame to csv files for file {download_file_name}" + ) rel.to_csv( file_name=f"{temp_csv_directory_path}{download_file_name}", sep=delimiter, @@ -785,44 +897,22 @@ def write_csv_file_duckdb( for dir in _partition_dirs: _old_csv_path = f"{dir}/{os.listdir(dir)[0]}" _new_csv_path = ( - f"{temp_csv_directory_path}{download_file_name}/{download_file_name}_{dir.split('=')[1].zfill(2)}.csv" + f"{temp_csv_directory_path}{download_file_name}" + f"/{download_file_name}_{dir.split('=')[1].zfill(2)}.csv" ) shutil.move(_old_csv_path, _new_csv_path) full_file_paths.append(_new_csv_path) os.rmdir(dir) - logger.info(f"{temp_csv_directory_path}{download_file_name} contains {df_record_count:,} rows of data") - logger.info(f"Wrote source data DataFrame to {len(full_file_paths)} CSV files in {(time.time() - start):3f}s") + logger.info( + f"{temp_csv_directory_path}{download_file_name} contains {df_record_count:,} rows of data" + ) + logger.info( + f"Wrote source data DataFrame to {len(full_file_paths)} CSV files in {(time.time() - start):3f}s" + ) return df_record_count, full_file_paths -def _merge_file_parts(fs, out_stream, conf, hadoop, partial_merged_file_path, part_file_list): - """Read-in files in alphabetical order and append them one by one to the merged file""" - - for part_file in 
part_file_list: - in_stream = None - try: - in_stream = fs.open(part_file) - # Write bytes of each file read and keep out_stream open after write for next file - hadoop.io.IOUtils.copyBytes(in_stream, out_stream, conf, False) - finally: - if in_stream: - in_stream.close() - if fs.exists(partial_merged_file_path): - fs.delete(partial_merged_file_path, True) - - -def _merge_grouper(items, group_size): - """Helper to chunk up files into mergeable groups""" - FileMergeGroup = namedtuple("FileMergeGroup", ["part", "file_list"]) - if len(items) <= group_size: - yield FileMergeGroup(None, items) - return - group_generator = (items[i : i + group_size] for i in range(0, len(items), group_size)) - for i, group in enumerate(group_generator, start=1): - yield FileMergeGroup(i, group) - - def rename_part_files( bucket_name: str, destination_file_name: str, @@ -849,7 +939,8 @@ def rename_part_files( [ file.key for file in retrieve_s3_bucket_object_list( - bucket_name, key_prefix=f"{temp_download_dir_name}/{destination_file_name}/part-" + bucket_name, + key_prefix=f"{temp_download_dir_name}/{destination_file_name}/part-", ) if file.key.endswith(file_format) ] From 6b7af17611d9712ea4c4040911664ad2f1b54a2c Mon Sep 17 00:00:00 2001 From: aguest-kc Date: Thu, 5 Feb 2026 10:11:54 -0600 Subject: [PATCH 38/59] [DEV-14451] Add new assistance type codes --- usaspending_api/awards/v2/lookups/lookups.py | 72 +- .../tests/data/spending_by_award_test_data.py | 99 +- .../integration/test_spending_by_award.py | 1545 +++++++++++++---- 3 files changed, 1370 insertions(+), 346 deletions(-) diff --git a/usaspending_api/awards/v2/lookups/lookups.py b/usaspending_api/awards/v2/lookups/lookups.py index 17f052785c..cd64448e49 100644 --- a/usaspending_api/awards/v2/lookups/lookups.py +++ b/usaspending_api/awards/v2/lookups/lookups.py @@ -72,7 +72,11 @@ **direct_payment_award_mapping, **other_award_mapping, } -non_loan_assistance_award_mapping = {**grant_award_mapping, **direct_payment_award_mapping, 
**other_award_mapping} +non_loan_assistance_award_mapping = { + **grant_award_mapping, + **direct_payment_award_mapping, + **other_award_mapping, +} # TODO: include IDV mappings in the award_type_mapping and update award_filter.py award_type_mapping = { @@ -105,9 +109,24 @@ # 'F': 'Cooperative Agreement', # 'G': 'Grant for Research', # 'S': 'Funded Space Act Agreement', - # 'T': 'Training Grant' + # 'T': 'Training Grant', + "F001": "Grant", + "F002": "Cooperative Agreement", + "F003": "Direct Loan", + "F004": "Loan Guarantee", + "F005": "Indemnity / Insurance (non-loan)", + "F006": "Direct Payment for Specified Use", + "F007": "Direct Payment with Unrestricted Use", + "F008": "Asset Forfeiture / Equitable Sharing", + "F009": "Sale, Exchange, or Donation of Property and Goods", + "F010": "Other Financial Assistance", +} +contract_type_mapping = { + "A": "BPA Call", + "B": "Purchase Order", + "C": "Delivery Order", + "D": "Definitive Contract", } -contract_type_mapping = {"A": "BPA Call", "B": "Purchase Order", "C": "Delivery Order", "D": "Definitive Contract"} idv_type_mapping = { "IDV_A": "GWAC Government Wide Acquisition Contract", "IDV_B": "IDC Multi-Agency Contract, Other Indefinite Delivery Contract", @@ -118,18 +137,47 @@ "IDV_D": "BOA Basic Ordering Agreement", "IDV_E": "BPA Blanket Purchase Agreement", } -grant_type_mapping = {"02": "Block Grant", "03": "Formula Grant", "04": "Project Grant", "05": "Cooperative Agreement"} -direct_payment_type_mapping = {"06": "Direct Payment for Specified Use", "10": "Direct Payment with Unrestricted Use"} -loan_type_mapping = {"07": "Direct Loan", "08": "Guaranteed/Insured Loan"} +grant_type_mapping = { + "02": "Block Grant", + "03": "Formula Grant", + "04": "Project Grant", + "05": "Cooperative Agreement", + "F001": "Grant", + "F002": "Cooperative Agreement", +} +direct_payment_type_mapping = { + "06": "Direct Payment for Specified Use", + "10": "Direct Payment with Unrestricted Use", + "F006": "Direct Payment for 
Specified Use", + "F007": "Direct Payment with Unrestricted Use", +} +loan_type_mapping = { + "07": "Direct Loan", + "08": "Guaranteed/Insured Loan", + "F003": "Direct Loan", + "F004": "Loan Guarantee", +} # -1 is a derived type that we added as a "catch-all" for any invalid `type` values -other_type_mapping = {"09": "Insurance", "11": "Other Financial Assistance", "-1": "Not Specified"} +other_type_mapping = { + "09": "Insurance", + "11": "Other Financial Assistance", + "-1": "Not Specified", + "F005": "Indemnity / Insurance (non-loan)", + "F008": "Asset Forfeiture / Equitable Sharing", + "F009": "Sale, Exchange, or Donation of Property and Goods", + "F010": "Other Financial Assistance", +} assistance_type_mapping = { **grant_type_mapping, **direct_payment_type_mapping, **loan_type_mapping, **other_type_mapping, } -non_loan_assistance_type_mapping = {**grant_type_mapping, **direct_payment_type_mapping, **other_type_mapping} +non_loan_assistance_type_mapping = { + **grant_type_mapping, + **direct_payment_type_mapping, + **other_type_mapping, +} procurement_type_mapping = {**contract_type_mapping, **idv_type_mapping} all_award_types_mappings = { "contracts": list(contract_type_mapping), @@ -141,9 +189,13 @@ } all_awards_types_to_category = { - type_code: category for category, type_codes in all_award_types_mappings.items() for type_code in type_codes + type_code: category + for category, type_codes in all_award_types_mappings.items() + for type_code in type_codes } all_subaward_types = ["grant", "procurement"] -SUBAWARD_MAPPING_LOOKUP = {key: value.replace(".keyword", "") for key, value in subaward_mapping.items()} +SUBAWARD_MAPPING_LOOKUP = { + key: value.replace(".keyword", "") for key, value in subaward_mapping.items() +} diff --git a/usaspending_api/search/tests/data/spending_by_award_test_data.py b/usaspending_api/search/tests/data/spending_by_award_test_data.py index f651a2fced..5f49bf0453 100644 --- 
a/usaspending_api/search/tests/data/spending_by_award_test_data.py +++ b/usaspending_api/search/tests/data/spending_by_award_test_data.py @@ -280,7 +280,9 @@ def spending_by_award_test_data(): pop_zip4="9040", cfda_number="64.114", cfda_program_title="VETERANS HOUSING GUARANTEED AND INSURED LOANS", - cfdas=['{"cfda_number": "64.114", "cfda_program_title": "VETERANS HOUSING GUARANTEED AND INSURED LOANS"}'], + cfdas=[ + '{"cfda_number": "64.114", "cfda_program_title": "VETERANS HOUSING GUARANTEED AND INSURED LOANS"}' + ], ) baker.make( @@ -320,7 +322,9 @@ def spending_by_award_test_data(): pop_zip4="9040", cfda_number="64.114", cfda_program_title="VETERANS HOUSING GUARANTEED AND INSURED LOANS", - cfdas=['{"cfda_number": "64.114", "cfda_program_title": "VETERANS HOUSING GUARANTEED AND INSURED LOANS"}'], + cfdas=[ + '{"cfda_number": "64.114", "cfda_program_title": "VETERANS HOUSING GUARANTEED AND INSURED LOANS"}' + ], ) award_6 = baker.make( @@ -360,7 +364,9 @@ def spending_by_award_test_data(): pop_zip4="9040", cfda_number="64.114", cfda_program_title="VETERANS HOUSING GUARANTEED AND INSURED LOANS", - cfdas=['{"cfda_number": "64.114", "cfda_program_title": "VETERANS HOUSING GUARANTEED AND INSURED LOANS"}'], + cfdas=[ + '{"cfda_number": "64.114", "cfda_program_title": "VETERANS HOUSING GUARANTEED AND INSURED LOANS"}' + ], ) baker.make( @@ -669,7 +675,9 @@ def spending_by_award_test_data(): pop_zip4="9040", cfda_number="64.114", cfda_program_title="VETERANS HOUSING GUARANTEED AND INSURED LOANS", - cfdas=['{"cfda_number": "64.114", "cfda_program_title": "VETERANS HOUSING GUARANTEED AND INSURED LOANS"}'], + cfdas=[ + '{"cfda_number": "64.114", "cfda_program_title": "VETERANS HOUSING GUARANTEED AND INSURED LOANS"}' + ], ) baker.make( @@ -723,14 +731,34 @@ def spending_by_award_test_data(): product_or_service_description="PSC description 1", ) + baker.make( + "search.AwardSearch", + award_id=2026, + type="F003", + category="loan", + date_signed="2019-01-01", + 
action_date="2019-01-01", + fain="fain2026", + display_award_id="award2026", + generated_unique_award_id="ASST_NEW_TYPES_2026", + ) + # Toptier Agency ta1 = baker.make( - "references.ToptierAgency", abbreviation="TA1", name="TOPTIER AGENCY 1", toptier_code="ABC", _fill_optional=True + "references.ToptierAgency", + abbreviation="TA1", + name="TOPTIER AGENCY 1", + toptier_code="ABC", + _fill_optional=True, ) # Federal Account baker.make( - "accounts.FederalAccount", id=1, parent_toptier_agency=ta1, agency_identifier="1", main_account_code="0001" + "accounts.FederalAccount", + id=1, + parent_toptier_agency=ta1, + agency_identifier="1", + main_account_code="0001", ) # TAS @@ -750,7 +778,10 @@ def spending_by_award_test_data(): earliest_public_law_enactment_date="2020-03-06", ) defc_q = baker.make( - "references.DisasterEmergencyFundCode", code="Q", group_name=None, earliest_public_law_enactment_date=None + "references.DisasterEmergencyFundCode", + code="Q", + group_name=None, + earliest_public_law_enactment_date=None, ) # Submissions @@ -832,16 +863,38 @@ def spending_by_award_test_data(): ) # Subtier Agency - subtier_agency_1 = {"pk": 1, "abbreviation": "SA1", "name": "SUBTIER AGENCY 1", "subtier_code": "DEF"} - subtier_agency_2 = {"pk": 2, "abbreviation": "SA2", "name": "SUBTIER AGENCY 2", "subtier_code": "1000"} + subtier_agency_1 = { + "pk": 1, + "abbreviation": "SA1", + "name": "SUBTIER AGENCY 1", + "subtier_code": "DEF", + } + subtier_agency_2 = { + "pk": 2, + "abbreviation": "SA2", + "name": "SUBTIER AGENCY 2", + "subtier_code": "1000", + } baker.make("references.SubtierAgency", **subtier_agency_1, _fill_optional=True) baker.make("references.SubtierAgency", **subtier_agency_2, _fill_optional=True) # Agency - baker.make("references.Agency", pk=1, toptier_agency=ta1, subtier_agency_id=1, _fill_optional=True) + baker.make( + "references.Agency", + pk=1, + toptier_agency=ta1, + subtier_agency_id=1, + _fill_optional=True, + ) - baker.make("search.TransactionSearch", 
transaction_id=1, award=award_1, action_date="2020-04-01", is_fpds=True) + baker.make( + "search.TransactionSearch", + transaction_id=1, + award=award_1, + action_date="2020-04-01", + is_fpds=True, + ) baker.make( "search.TransactionSearch", transaction_id=2, @@ -879,8 +932,20 @@ def spending_by_award_test_data(): recipient_location_county_code="012", naics_code="112244", ) - baker.make("search.TransactionSearch", transaction_id=4, award=award_3, action_date="2017-01-01", is_fpds=True) - baker.make("search.TransactionSearch", transaction_id=5, award=award_3, action_date="2018-01-01", is_fpds=True) + baker.make( + "search.TransactionSearch", + transaction_id=4, + award=award_3, + action_date="2017-01-01", + is_fpds=True, + ) + baker.make( + "search.TransactionSearch", + transaction_id=5, + award=award_3, + action_date="2018-01-01", + is_fpds=True, + ) baker.make( "search.TransactionSearch", transaction_id=6, @@ -898,7 +963,13 @@ def spending_by_award_test_data(): cfda_number="10.331", recipient_unique_id="duns_1001", ) - baker.make("search.TransactionSearch", transaction_id=8, award=award_5, action_date="2019-10-1", is_fpds=True) + baker.make( + "search.TransactionSearch", + transaction_id=8, + award=award_5, + action_date="2019-10-1", + is_fpds=True, + ) baker.make( "search.SubawardSearch", diff --git a/usaspending_api/search/tests/integration/test_spending_by_award.py b/usaspending_api/search/tests/integration/test_spending_by_award.py index 1eafd1580f..b7d8981354 100644 --- a/usaspending_api/search/tests/integration/test_spending_by_award.py +++ b/usaspending_api/search/tests/integration/test_spending_by_award.py @@ -7,20 +7,39 @@ from usaspending_api.awards.v2.lookups.lookups import all_award_types_mappings from usaspending_api.common.helpers.generic_helper import get_generic_filters_message -from usaspending_api.search.tests.data.search_filters_test_data import legacy_filters, non_legacy_filters +from usaspending_api.search.tests.data.search_filters_test_data 
import ( + legacy_filters, + non_legacy_filters, +) from usaspending_api.search.tests.data.utilities import setup_elasticsearch_test @pytest.fixture def award_data_fixture(db): - baker.make("search.TransactionSearch", transaction_id=210210210, action_date="2013-09-17") - baker.make("search.TransactionSearch", transaction_id=321032103, action_date="2013-09-17") - baker.make("search.TransactionSearch", transaction_id=432104321, action_date="2013-09-17") - baker.make("search.TransactionSearch", transaction_id=543210543, action_date="2013-09-17") - baker.make("search.TransactionSearch", transaction_id=654321065, action_date="2013-09-17") - baker.make("search.TransactionSearch", transaction_id=765432107, action_date="2013-09-17") - baker.make("search.TransactionSearch", transaction_id=876543210, action_date="2013-09-17") - baker.make("search.TransactionSearch", transaction_id=987654321, action_date="2013-09-17") + baker.make( + "search.TransactionSearch", transaction_id=210210210, action_date="2013-09-17" + ) + baker.make( + "search.TransactionSearch", transaction_id=321032103, action_date="2013-09-17" + ) + baker.make( + "search.TransactionSearch", transaction_id=432104321, action_date="2013-09-17" + ) + baker.make( + "search.TransactionSearch", transaction_id=543210543, action_date="2013-09-17" + ) + baker.make( + "search.TransactionSearch", transaction_id=654321065, action_date="2013-09-17" + ) + baker.make( + "search.TransactionSearch", transaction_id=765432107, action_date="2013-09-17" + ) + baker.make( + "search.TransactionSearch", transaction_id=876543210, action_date="2013-09-17" + ) + baker.make( + "search.TransactionSearch", transaction_id=987654321, action_date="2013-09-17" + ) ref_program_activity1 = baker.make( "references.RefProgramActivity", @@ -35,9 +54,19 @@ def award_data_fixture(db): program_activity_name="PROGRAM_ACTIVITY_2", ) - baker.make("references.DisasterEmergencyFundCode", code="L", group_name="covid_19", public_law="LAW", title="title") 
baker.make( - "references.DisasterEmergencyFundCode", code="Z", group_name="infrastructure", public_law="LAW", title="title" + "references.DisasterEmergencyFundCode", + code="L", + group_name="covid_19", + public_law="LAW", + title="title", + ) + baker.make( + "references.DisasterEmergencyFundCode", + code="Z", + group_name="infrastructure", + public_law="LAW", + title="title", ) award1 = baker.make( @@ -286,7 +315,10 @@ def test_spending_by_award_subaward_success( assert resp.status_code == status.HTTP_200_OK # Testing contents of what is returned - spending_level_filter_list = [{"spending_level": "subawards"}, {"spending_level": "subawards"}] + spending_level_filter_list = [ + {"spending_level": "subawards"}, + {"spending_level": "subawards"}, + ] for spending_level_filter in spending_level_filter_list: resp = client.post( @@ -353,14 +385,21 @@ def test_spending_by_award_subaward_success( @pytest.mark.django_db -def test_spending_by_award_legacy_filters(client, monkeypatch, elasticsearch_award_index): +def test_spending_by_award_legacy_filters( + client, monkeypatch, elasticsearch_award_index +): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) resp = client.post( "/api/v2/search/spending_by_award", content_type="application/json", data=json.dumps( - {"spending_level": "awards", "fields": ["Award ID"], "sort": "Award ID", "filters": legacy_filters()} + { + "spending_level": "awards", + "fields": ["Award ID"], + "sort": "Award ID", + "filters": legacy_filters(), + } ), ) assert resp.status_code == status.HTTP_200_OK @@ -368,9 +407,20 @@ def test_spending_by_award_legacy_filters(client, monkeypatch, elasticsearch_awa @pytest.mark.django_db def test_no_intersection(client, monkeypatch, elasticsearch_award_index): - - baker.make("search.AwardSearch", award_id=1, type="A", latest_transaction_id=1, action_date="2020-10-10") - baker.make("search.TransactionSearch", transaction_id=1, action_date="2010-10-01", award_id=1, is_fpds=True) + baker.make( + 
"search.AwardSearch", + award_id=1, + type="A", + latest_transaction_id=1, + action_date="2020-10-10", + ) + baker.make( + "search.TransactionSearch", + transaction_id=1, + action_date="2010-10-01", + award_id=1, + is_fpds=True, + ) setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) @@ -381,19 +431,34 @@ def test_no_intersection(client, monkeypatch, elasticsearch_award_index): "filters": {"award_type_codes": ["A", "B", "C", "D"]}, } - resp = client.post("/api/v2/search/spending_by_award", content_type="application/json", data=json.dumps(request)) + resp = client.post( + "/api/v2/search/spending_by_award", + content_type="application/json", + data=json.dumps(request), + ) assert resp.status_code == status.HTTP_200_OK assert len(resp.data["results"]) == 1 request["filters"]["award_type_codes"].append("no intersection") - resp = client.post("/api/v2/search/spending_by_award", content_type="application/json", data=json.dumps(request)) + resp = client.post( + "/api/v2/search/spending_by_award", + content_type="application/json", + data=json.dumps(request), + ) assert resp.status_code == status.HTTP_200_OK assert len(resp.data["results"]) == 0, "Results returned, there should be 0" @pytest.fixture def awards_over_different_date_ranges(): - award_category_list = ["contracts", "direct_payments", "grants", "idvs", "loans", "other_financial_assistance"] + award_category_list = [ + "contracts", + "direct_payments", + "grants", + "idvs", + "loans", + "other_financial_assistance", + ] # The date ranges for the different awards are setup to cover possible intersection points by the # different date ranges being searched. 
The comments on each line specify where the date ranges are @@ -402,23 +467,68 @@ def awards_over_different_date_ranges(): # - {"start_date": "2017-02-01", "end_date": "2017-11-30"} date_range_list = [ # Intersect only one of the date ranges searched for - {"date_signed": datetime(2014, 1, 1), "action_date": datetime(2014, 5, 1)}, # Before both - {"date_signed": datetime(2014, 3, 1), "action_date": datetime(2015, 4, 15)}, # Beginning of first - {"date_signed": datetime(2015, 2, 1), "action_date": datetime(2015, 7, 1)}, # Middle of first + { + "date_signed": datetime(2014, 1, 1), + "action_date": datetime(2014, 5, 1), + }, # Before both + { + "date_signed": datetime(2014, 3, 1), + "action_date": datetime(2015, 4, 15), + }, # Beginning of first + { + "date_signed": datetime(2015, 2, 1), + "action_date": datetime(2015, 7, 1), + }, # Middle of first {"date_signed": datetime(2015, 2, 1), "action_date": datetime(2015, 4, 17)}, - {"date_signed": datetime(2014, 12, 1), "action_date": datetime(2016, 1, 1)}, # All of first - {"date_signed": datetime(2015, 11, 1), "action_date": datetime(2016, 3, 1)}, # End of first - {"date_signed": datetime(2016, 2, 23), "action_date": datetime(2016, 7, 19)}, # Between both - {"date_signed": datetime(2016, 11, 26), "action_date": datetime(2017, 3, 1)}, # Beginning of second - {"date_signed": datetime(2017, 5, 1), "action_date": datetime(2017, 7, 1)}, # Middle of second - {"date_signed": datetime(2017, 1, 1), "action_date": datetime(2017, 12, 1)}, # All of second - {"date_signed": datetime(2017, 9, 1), "action_date": datetime(2017, 12, 17)}, # End of second - {"date_signed": datetime(2018, 2, 1), "action_date": datetime(2018, 7, 1)}, # After both + { + "date_signed": datetime(2014, 12, 1), + "action_date": datetime(2016, 1, 1), + }, # All of first + { + "date_signed": datetime(2015, 11, 1), + "action_date": datetime(2016, 3, 1), + }, # End of first + { + "date_signed": datetime(2016, 2, 23), + "action_date": datetime(2016, 7, 19), + }, # 
Between both + { + "date_signed": datetime(2016, 11, 26), + "action_date": datetime(2017, 3, 1), + }, # Beginning of second + { + "date_signed": datetime(2017, 5, 1), + "action_date": datetime(2017, 7, 1), + }, # Middle of second + { + "date_signed": datetime(2017, 1, 1), + "action_date": datetime(2017, 12, 1), + }, # All of second + { + "date_signed": datetime(2017, 9, 1), + "action_date": datetime(2017, 12, 17), + }, # End of second + { + "date_signed": datetime(2018, 2, 1), + "action_date": datetime(2018, 7, 1), + }, # After both # Intersect both date ranges searched for - {"date_signed": datetime(2014, 12, 1), "action_date": datetime(2017, 12, 5)}, # Completely both - {"date_signed": datetime(2015, 7, 1), "action_date": datetime(2017, 5, 1)}, # Partially both - {"date_signed": datetime(2014, 10, 3), "action_date": datetime(2017, 4, 8)}, # All first; partial second - {"date_signed": datetime(2015, 8, 1), "action_date": datetime(2018, 1, 2)}, # Partial first; all second + { + "date_signed": datetime(2014, 12, 1), + "action_date": datetime(2017, 12, 5), + }, # Completely both + { + "date_signed": datetime(2015, 7, 1), + "action_date": datetime(2017, 5, 1), + }, # Partially both + { + "date_signed": datetime(2014, 10, 3), + "action_date": datetime(2017, 4, 8), + }, # All first; partial second + { + "date_signed": datetime(2015, 8, 1), + "action_date": datetime(2018, 1, 2), + }, # Partial first; all second ] award_id = 0 @@ -478,7 +588,9 @@ def test_date_range_search_with_one_range( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_with_contracts) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(request_with_contracts), ) assert resp.status_code == status.HTTP_200_OK assert len(resp.data["results"]) == 9 @@ -497,7 +609,9 @@ def test_date_range_search_with_one_range( } resp = client.post( - "/api/v2/search/spending_by_award/", 
content_type="application/json", data=json.dumps(request_with_grants) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(request_with_grants), ) assert resp.status_code == status.HTTP_200_OK assert len(resp.data["results"]) == 8 @@ -516,11 +630,15 @@ def test_date_range_search_with_one_range( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_for_one_award) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(request_for_one_award), ) assert resp.status_code == status.HTTP_200_OK assert len(resp.data["results"]) == 1 - assert resp.data["results"] == [{"Award ID": "abcdefg1", "internal_id": 1, "generated_internal_id": "AWARD_1"}] + assert resp.data["results"] == [ + {"Award ID": "abcdefg1", "internal_id": 1, "generated_internal_id": "AWARD_1"} + ] # Test with no award showing request_for_no_awards = { @@ -536,7 +654,9 @@ def test_date_range_search_with_one_range( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_for_no_awards) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(request_for_no_awards), ) assert resp.status_code == status.HTTP_200_OK assert len(resp.data["results"]) == 0 @@ -568,7 +688,9 @@ def test_date_range_search_with_two_ranges( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_with_contracts) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(request_with_contracts), ) assert resp.status_code == status.HTTP_200_OK assert len(resp.data["results"]) == 13 @@ -590,7 +712,9 @@ def test_date_range_search_with_two_ranges( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_with_grants) + "/api/v2/search/spending_by_award/", + 
content_type="application/json", + data=json.dumps(request_with_grants), ) assert resp.status_code == status.HTTP_200_OK assert len(resp.data["results"]) == 13 @@ -612,7 +736,9 @@ def test_date_range_search_with_two_ranges( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_for_two_awards) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(request_for_two_awards), ) assert resp.status_code == status.HTTP_200_OK assert len(resp.data["results"]) == 2 @@ -638,13 +764,17 @@ def test_date_range_search_with_two_ranges( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_for_no_awards) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(request_for_no_awards), ) assert resp.status_code == status.HTTP_200_OK @pytest.mark.django_db -def test_date_range_with_date_signed(client, monkeypatch, elasticsearch_award_index, awards_over_different_date_ranges): +def test_date_range_with_date_signed( + client, monkeypatch, elasticsearch_award_index, awards_over_different_date_ranges +): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) contract_type_list = all_award_types_mappings["contracts"] @@ -657,14 +787,20 @@ def test_date_range_with_date_signed(client, monkeypatch, elasticsearch_award_in "page": 1, "filters": { "time_period": [ - {"start_date": "2015-01-01", "end_date": "2015-12-31", "date_type": "date_signed"}, + { + "start_date": "2015-01-01", + "end_date": "2015-12-31", + "date_type": "date_signed", + }, ], "award_type_codes": contract_type_list, }, } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_for_2015) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(request_for_2015), ) assert resp.status_code == status.HTTP_200_OK assert 
len(resp.data["results"]) == 5 @@ -677,21 +813,29 @@ def test_date_range_with_date_signed(client, monkeypatch, elasticsearch_award_in "page": 1, "filters": { "time_period": [ - {"start_date": "2016-01-01", "end_date": "2016-12-31", "date_type": "date_signed"}, + { + "start_date": "2016-01-01", + "end_date": "2016-12-31", + "date_type": "date_signed", + }, ], "award_type_codes": contract_type_list, }, } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_for_2016) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(request_for_2016), ) assert resp.status_code == status.HTTP_200_OK assert len(resp.data["results"]) == 2 @pytest.mark.django_db -def test_messages_not_nested(client, monkeypatch, elasticsearch_award_index, awards_over_different_date_ranges): +def test_messages_not_nested( + client, monkeypatch, elasticsearch_award_index, awards_over_different_date_ranges +): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) contract_type_list = all_award_types_mappings["contracts"] @@ -704,7 +848,11 @@ def test_messages_not_nested(client, monkeypatch, elasticsearch_award_index, awa "page": 1, "filters": { "time_period": [ - {"start_date": "2015-01-01", "end_date": "2015-12-31", "date_type": "date_signed"}, + { + "start_date": "2015-01-01", + "end_date": "2015-12-31", + "date_type": "date_signed", + }, ], "award_type_codes": contract_type_list, "not_a_real_filter": "abc", @@ -712,7 +860,9 @@ def test_messages_not_nested(client, monkeypatch, elasticsearch_award_index, awa } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_for_2015) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(request_for_2015), ) resp_json = resp.json() @@ -749,7 +899,9 @@ def test_success_with_all_filters(client, monkeypatch, elasticsearch_award_index @pytest.mark.django_db 
-def test_inclusive_naics_code(client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index): +def test_inclusive_naics_code( + client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index +): """ Verify use of built query_string boolean logic for NAICS code inclusions/exclusions executes as expected on ES """ @@ -763,7 +915,9 @@ def test_inclusive_naics_code(client, monkeypatch, spending_by_award_test_data, "filters": { "award_type_codes": ["A", "B", "C", "D"], "naics_codes": {"require": ["1122"]}, - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -779,7 +933,9 @@ def test_inclusive_naics_code(client, monkeypatch, spending_by_award_test_data, @pytest.mark.django_db -def test_exclusive_naics_code(client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index): +def test_exclusive_naics_code( + client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index +): """ Verify use of built query_string boolean logic for NAICS code inclusions/exclusions executes as expected on ES """ @@ -793,7 +949,9 @@ def test_exclusive_naics_code(client, monkeypatch, spending_by_award_test_data, "filters": { "award_type_codes": ["A", "B", "C", "D"], "naics_codes": {"require": ["999990"]}, - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -809,7 +967,9 @@ def test_exclusive_naics_code(client, monkeypatch, spending_by_award_test_data, @pytest.mark.django_db -def test_mixed_naics_codes(client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index): +def test_mixed_naics_codes( + client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index +): """ Verify use of built query_string boolean logic for NAICS code 
inclusions/exclusions executes as expected on ES """ @@ -848,8 +1008,13 @@ def test_mixed_naics_codes(client, monkeypatch, spending_by_award_test_data, ela { "filters": { "award_type_codes": ["A", "B", "C", "D"], - "naics_codes": {"require": ["112233", "222233"], "exclude": ["112233"]}, - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "naics_codes": { + "require": ["112233", "222233"], + "exclude": ["112233"], + }, + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -860,15 +1025,27 @@ def test_mixed_naics_codes(client, monkeypatch, spending_by_award_test_data, ela } ), ) - expected_result = [{"internal_id": 5, "Award ID": None, "generated_internal_id": "ASST_NON_TESTING_5"}] + expected_result = [ + { + "internal_id": 5, + "Award ID": None, + "generated_internal_id": "ASST_NON_TESTING_5", + } + ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert resp.json().get("results") == expected_result, "Keyword filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "Keyword filter does not match expected result" + ) @pytest.mark.django_db def test_correct_response_for_each_filter( - client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index, elasticsearch_subaward_index + client, + monkeypatch, + spending_by_award_test_data, + elasticsearch_award_index, + elasticsearch_subaward_index, ): """ Verify the content of the response when using different filters. 
This function creates the ES Index @@ -881,6 +1058,7 @@ def test_correct_response_for_each_filter( _test_correct_response_for_keywords, _test_correct_response_for_time_period, _test_correct_response_for_award_type_codes, + _test_correct_response_for_award_type_codes_loans, _test_correct_response_for_agencies, _test_correct_response_for_tas_components, _test_correct_response_for_pop_location, @@ -924,12 +1102,22 @@ def _test_correct_response_for_keywords(client): ), ) expected_result = [ - {"internal_id": 2, "Award ID": "abc222", "generated_internal_id": "CONT_AWD_TESTING_2"}, - {"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}, + { + "internal_id": 2, + "Award ID": "abc222", + "generated_internal_id": "CONT_AWD_TESTING_2", + }, + { + "internal_id": 1, + "Award ID": "abc111", + "generated_internal_id": "CONT_AWD_TESTING_1", + }, ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 2 - assert resp.json().get("results") == expected_result, "Keyword filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "Keyword filter does not match expected result" + ) def _test_correct_response_for_time_period(client): @@ -940,7 +1128,9 @@ def _test_correct_response_for_time_period(client): { "filters": { "award_type_codes": ["A"], - "time_period": [{"start_date": "2014-01-01", "end_date": "2008-12-31"}], + "time_period": [ + {"start_date": "2014-01-01", "end_date": "2008-12-31"} + ], }, "fields": ["Award ID"], "page": 1, @@ -951,10 +1141,18 @@ def _test_correct_response_for_time_period(client): } ), ) - expected_result = [{"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}] + expected_result = [ + { + "internal_id": 1, + "Award ID": "abc111", + "generated_internal_id": "CONT_AWD_TESTING_1", + } + ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert resp.json().get("results") == 
expected_result, "Time Period filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "Time Period filter does not match expected result" + ) def _test_correct_response_for_award_type_codes(client): @@ -965,7 +1163,9 @@ def _test_correct_response_for_award_type_codes(client): { "filters": { "award_type_codes": ["A", "B", "C", "D"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -977,17 +1177,82 @@ def _test_correct_response_for_award_type_codes(client): ), ) expected_result = [ - {"internal_id": 999, "Award ID": "award999", "generated_internal_id": "ASST_NON_TESTING_999"}, - {"internal_id": 998, "Award ID": "award998", "generated_internal_id": "ASST_NON_TESTING_998"}, - {"internal_id": 997, "Award ID": "award997", "generated_internal_id": "ASST_NON_TESTING_997"}, - {"internal_id": 5, "Award ID": "abcdef123", "generated_internal_id": "CONT_AWD_TESTING_5"}, - {"internal_id": 3, "Award ID": "abc333", "generated_internal_id": "CONT_AWD_TESTING_3"}, - {"internal_id": 2, "Award ID": "abc222", "generated_internal_id": "CONT_AWD_TESTING_2"}, - {"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}, + { + "internal_id": 999, + "Award ID": "award999", + "generated_internal_id": "ASST_NON_TESTING_999", + }, + { + "internal_id": 998, + "Award ID": "award998", + "generated_internal_id": "ASST_NON_TESTING_998", + }, + { + "internal_id": 997, + "Award ID": "award997", + "generated_internal_id": "ASST_NON_TESTING_997", + }, + { + "internal_id": 5, + "Award ID": "abcdef123", + "generated_internal_id": "CONT_AWD_TESTING_5", + }, + { + "internal_id": 3, + "Award ID": "abc333", + "generated_internal_id": "CONT_AWD_TESTING_3", + }, + { + "internal_id": 2, + "Award ID": "abc222", + "generated_internal_id": "CONT_AWD_TESTING_2", + }, + { + "internal_id": 1, + "Award 
ID": "abc111", + "generated_internal_id": "CONT_AWD_TESTING_1", + }, ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 7 - assert resp.json().get("results") == expected_result, "Award Type Codes filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "Award Type Codes filter does not match expected result" + ) + + +def _test_correct_response_for_award_type_codes_loans(client): + resp = client.post( + "/api/v2/search/spending_by_award", + content_type="application/json", + data=json.dumps( + { + "filters": { + "award_type_codes": ["F003"], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], + }, + "fields": ["Award ID"], + "page": 1, + "limit": 60, + "sort": "Award ID", + "order": "desc", + "spending_level": "awards", + } + ), + ) + expected_result = [ + { + "internal_id": 2026, + "Award ID": "award2026", + "generated_internal_id": "ASST_NEW_TYPES_2026", + } + ] + assert resp.status_code == status.HTTP_200_OK + assert len(resp.json().get("results")) == 1 + assert resp.json().get("results") == expected_result, ( + "Award Type Codes filter does not match expected result" + ) def _test_correct_response_for_agencies(client): @@ -999,10 +1264,20 @@ def _test_correct_response_for_agencies(client): "filters": { "award_type_codes": ["A", "B", "C", "D"], "agencies": [ - {"type": "awarding", "tier": "toptier", "name": "TOPTIER AGENCY 1"}, - {"type": "awarding", "tier": "subtier", "name": "SUBTIER AGENCY 1"}, + { + "type": "awarding", + "tier": "toptier", + "name": "TOPTIER AGENCY 1", + }, + { + "type": "awarding", + "tier": "subtier", + "name": "SUBTIER AGENCY 1", + }, + ], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} ], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], }, "fields": ["Award ID"], "page": 1, @@ -1013,10 +1288,18 @@ def _test_correct_response_for_agencies(client): } ), ) - expected_result = 
[{"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}] + expected_result = [ + { + "internal_id": 1, + "Award ID": "abc111", + "generated_internal_id": "CONT_AWD_TESTING_1", + } + ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert resp.json().get("results") == expected_result, "Agency filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "Agency filter does not match expected result" + ) def _test_correct_response_for_tas_components(client): @@ -1028,7 +1311,9 @@ def _test_correct_response_for_tas_components(client): "filters": { "award_type_codes": ["A", "B", "C", "D"], "tas_codes": [{"aid": "097", "main": "4930"}], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1040,12 +1325,22 @@ def _test_correct_response_for_tas_components(client): ), ) expected_result = [ - {"internal_id": 5, "Award ID": "abcdef123", "generated_internal_id": "CONT_AWD_TESTING_5"}, - {"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}, + { + "internal_id": 5, + "Award ID": "abcdef123", + "generated_internal_id": "CONT_AWD_TESTING_5", + }, + { + "internal_id": 1, + "Award ID": "abc111", + "generated_internal_id": "CONT_AWD_TESTING_1", + }, ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 2 - assert resp.json().get("results") == expected_result, "TAS Codes filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "TAS Codes filter does not match expected result" + ) def _test_correct_response_for_pop_location(client): @@ -1056,8 +1351,12 @@ def _test_correct_response_for_pop_location(client): { "filters": { "award_type_codes": ["A", "B", "C", "D"], - "place_of_performance_locations": [{"country": "USA", 
"state": "VA", "county": "014"}], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "place_of_performance_locations": [ + {"country": "USA", "state": "VA", "county": "014"} + ], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1068,10 +1367,18 @@ def _test_correct_response_for_pop_location(client): } ), ) - expected_result = [{"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}] + expected_result = [ + { + "internal_id": 1, + "Award ID": "abc111", + "generated_internal_id": "CONT_AWD_TESTING_1", + } + ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert resp.json().get("results") == expected_result, "Place of Performance filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "Place of Performance filter does not match expected result" + ) def _test_correct_response_for_recipient_location(client): @@ -1086,7 +1393,9 @@ def _test_correct_response_for_recipient_location(client): {"country": "USA", "state": "VA", "county": "012"}, {"country": "USA", "state": "VA", "city": "Arlington"}, ], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1098,12 +1407,22 @@ def _test_correct_response_for_recipient_location(client): ), ) expected_result = [ - {"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}, - {"internal_id": 2, "Award ID": "abc222", "generated_internal_id": "CONT_AWD_TESTING_2"}, + { + "internal_id": 1, + "Award ID": "abc111", + "generated_internal_id": "CONT_AWD_TESTING_1", + }, + { + "internal_id": 2, + "Award ID": "abc222", + "generated_internal_id": "CONT_AWD_TESTING_2", + }, ] assert resp.status_code == status.HTTP_200_OK assert 
len(resp.json().get("results")) == 2 - assert resp.json().get("results") == expected_result, "Recipient Location filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "Recipient Location filter does not match expected result" + ) def _test_correct_response_for_recipient_search_text(client): @@ -1115,7 +1434,9 @@ def _test_correct_response_for_recipient_search_text(client): "filters": { "award_type_codes": ["02", "03", "04", "05"], "recipient_search_text": ["recipient_name_for_award_1001"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1126,10 +1447,18 @@ def _test_correct_response_for_recipient_search_text(client): } ), ) - expected_result = [{"internal_id": 4, "Award ID": "abc444", "generated_internal_id": "ASST_NON_TESTING_4"}] + expected_result = [ + { + "internal_id": 4, + "Award ID": "abc444", + "generated_internal_id": "ASST_NON_TESTING_4", + } + ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert resp.json().get("results") == expected_result, "Recipient Search Text filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "Recipient Search Text filter does not match expected result" + ) # Test the results when searching for a recipient name that ends with a period # A search for `ACME INC` should include ACME INC, ACME INC. 
and ACME INC.XYZ @@ -1141,7 +1470,9 @@ def _test_correct_response_for_recipient_search_text(client): "filters": { "award_type_codes": ["A", "B", "C", "D"], "recipient_search_text": ["ACME INC"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID", "Recipient Name"], "page": 1, @@ -1175,7 +1506,9 @@ def _test_correct_response_for_recipient_search_text(client): assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == len(expected_result) - assert resp.json().get("results") == expected_result, "Recipient Search Text filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "Recipient Search Text filter does not match expected result" + ) # A search for `ACME INC.` should include ACME INC. and ACME INC.XYZ but not ACME INC resp = client.post( @@ -1186,7 +1519,9 @@ def _test_correct_response_for_recipient_search_text(client): "filters": { "award_type_codes": ["A", "B", "C", "D"], "recipient_search_text": ["ACME INC."], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID", "Recipient Name"], "page": 1, @@ -1214,7 +1549,9 @@ def _test_correct_response_for_recipient_search_text(client): assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == len(expected_result) - assert resp.json().get("results") == expected_result, "Recipient Search Text filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "Recipient Search Text filter does not match expected result" + ) def _test_correct_response_for_recipient_type_names(client): @@ -1225,8 +1562,13 @@ def _test_correct_response_for_recipient_type_names(client): { "filters": { "award_type_codes": ["A", "B", "C", "D"], - 
"recipient_type_names": ["business_category_1_3", "business_category_2_8"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "recipient_type_names": [ + "business_category_1_3", + "business_category_2_8", + ], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1238,12 +1580,22 @@ def _test_correct_response_for_recipient_type_names(client): ), ) expected_result = [ - {"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}, - {"internal_id": 3, "Award ID": "abc333", "generated_internal_id": "CONT_AWD_TESTING_3"}, + { + "internal_id": 1, + "Award ID": "abc111", + "generated_internal_id": "CONT_AWD_TESTING_1", + }, + { + "internal_id": 3, + "Award ID": "abc333", + "generated_internal_id": "CONT_AWD_TESTING_3", + }, ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 2 - assert resp.json().get("results") == expected_result, "Recipient Type Names filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "Recipient Type Names filter does not match expected result" + ) def _test_correct_response_for_award_amounts(client): @@ -1254,8 +1606,13 @@ def _test_correct_response_for_award_amounts(client): { "filters": { "award_type_codes": ["A", "B", "C", "D"], - "award_amounts": [{"upper_bound": 1000000}, {"lower_bound": 9013, "upper_bound": 9017}], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "award_amounts": [ + {"upper_bound": 1000000}, + {"lower_bound": 9013, "upper_bound": 9017}, + ], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1267,13 +1624,27 @@ def _test_correct_response_for_award_amounts(client): ), ) expected_result = [ - {"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}, - {"internal_id": 2, "Award ID": "abc222", 
"generated_internal_id": "CONT_AWD_TESTING_2"}, - {"internal_id": 5, "Award ID": "abcdef123", "generated_internal_id": "CONT_AWD_TESTING_5"}, + { + "internal_id": 1, + "Award ID": "abc111", + "generated_internal_id": "CONT_AWD_TESTING_1", + }, + { + "internal_id": 2, + "Award ID": "abc222", + "generated_internal_id": "CONT_AWD_TESTING_2", + }, + { + "internal_id": 5, + "Award ID": "abcdef123", + "generated_internal_id": "CONT_AWD_TESTING_5", + }, ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 3 - assert resp.json().get("results") == expected_result, "Award Amounts filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "Award Amounts filter does not match expected result" + ) def _test_correct_response_for_cfda_program(client): @@ -1285,7 +1656,9 @@ def _test_correct_response_for_cfda_program(client): "filters": { "award_type_codes": ["02", "03", "04", "05"], "program_numbers": ["10.331"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1296,10 +1669,18 @@ def _test_correct_response_for_cfda_program(client): } ), ) - expected_result = [{"internal_id": 4, "Award ID": "abc444", "generated_internal_id": "ASST_NON_TESTING_4"}] + expected_result = [ + { + "internal_id": 4, + "Award ID": "abc444", + "generated_internal_id": "ASST_NON_TESTING_4", + } + ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert resp.json().get("results") == expected_result, "CFDA Program filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "CFDA Program filter does not match expected result" + ) def _test_correct_response_for_cfda_program_subawards(client): @@ -1311,7 +1692,9 @@ def _test_correct_response_for_cfda_program_subawards(client): "filters": { 
"award_type_codes": ["02", "03", "04", "05"], "program_numbers": ["10.331"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Sub-Award ID"], "page": 1, @@ -1332,7 +1715,9 @@ def _test_correct_response_for_cfda_program_subawards(client): ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert resp.json().get("results") == expected_result, "CFDA Program filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "CFDA Program filter does not match expected result" + ) def _test_correct_response_for_naics_codes(client): @@ -1344,7 +1729,9 @@ def _test_correct_response_for_naics_codes(client): "filters": { "award_type_codes": ["A", "B", "C", "D"], "naics_codes": {"require": ["1122"], "exclude": ["112244"]}, - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1355,10 +1742,18 @@ def _test_correct_response_for_naics_codes(client): } ), ) - expected_result = [{"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}] + expected_result = [ + { + "internal_id": 1, + "Award ID": "abc111", + "generated_internal_id": "CONT_AWD_TESTING_1", + } + ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert resp.json().get("results") == expected_result, "NAICS Code filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "NAICS Code filter does not match expected result" + ) def _test_correct_response_for_naics_codes_subawards(client): @@ -1370,7 +1765,9 @@ def _test_correct_response_for_naics_codes_subawards(client): "filters": { "award_type_codes": ["A", "B", "C", "D"], "naics_codes": {"require": ["112233", "112244"]}, - 
"time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Sub-Award ID"], "page": 1, @@ -1397,7 +1794,9 @@ def _test_correct_response_for_naics_codes_subawards(client): ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 2 - assert resp.json().get("results") == expected_result, "NAICS Code filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "NAICS Code filter does not match expected result" + ) def _test_correct_response_for_psc_code_list(client): @@ -1409,7 +1808,9 @@ def _test_correct_response_for_psc_code_list(client): "filters": { "award_type_codes": ["A", "B", "C", "D"], "psc_codes": ["PSC1"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1420,10 +1821,18 @@ def _test_correct_response_for_psc_code_list(client): } ), ) - expected_result = [{"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}] + expected_result = [ + { + "internal_id": 1, + "Award ID": "abc111", + "generated_internal_id": "CONT_AWD_TESTING_1", + } + ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert resp.json().get("results") == expected_result, "PSC Code filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "PSC Code filter does not match expected result" + ) def _test_correct_response_for_psc_code_object(client): @@ -1438,7 +1847,9 @@ def _test_correct_response_for_psc_code_object(client): "require": [["Service", "P", "PSC", "PSC1"]], "exclude": [["Service", "P", "PSC", "PSC0"]], }, - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": 
"2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1449,10 +1860,18 @@ def _test_correct_response_for_psc_code_object(client): } ), ) - expected_result = [{"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}] + expected_result = [ + { + "internal_id": 1, + "Award ID": "abc111", + "generated_internal_id": "CONT_AWD_TESTING_1", + } + ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert resp.json().get("results") == expected_result, "PSC Code filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "PSC Code filter does not match expected result" + ) def _test_correct_response_for_psc_code_list_subawards(client): @@ -1465,7 +1884,9 @@ def _test_correct_response_for_psc_code_list_subawards(client): "filters": { "award_type_codes": ["A", "B", "C", "D"], "psc_codes": ["PSC2"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Sub-Award ID"], "page": 1, @@ -1486,7 +1907,9 @@ def _test_correct_response_for_psc_code_list_subawards(client): ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert resp.json().get("results") == expected_result, "PSC Code filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "PSC Code filter does not match expected result" + ) def _test_correct_response_for_psc_code_object_subawards(client): @@ -1502,7 +1925,9 @@ def _test_correct_response_for_psc_code_object_subawards(client): "require": [["Service", "P", "PSC", "PSC2"]], "exclude": [["Service", "P", "PSC", "PSC0"]], }, - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Sub-Award ID"], "page": 1, @@ -1523,7 +1948,9 @@ def 
_test_correct_response_for_psc_code_object_subawards(client): ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert resp.json().get("results") == expected_result, "PSC Code filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "PSC Code filter does not match expected result" + ) def _test_more_sophisticated_eclipsed_psc_code_1(client): @@ -1538,7 +1965,9 @@ def _test_more_sophisticated_eclipsed_psc_code_1(client): "require": [["Service"], ["Service", "P", "PSC"]], "exclude": [["Service", "P"], ["Service", "P", "PSC", "PSC1"]], }, - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1565,7 +1994,9 @@ def _test_more_sophisticated_eclipsed_psc_code_2(client): "require": [["Service", "P"], ["Service", "P", "PSC", "PSC1"]], "exclude": [["Service"], ["Service", "P", "PSC"]], }, - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1589,7 +2020,9 @@ def _test_correct_response_for_contract_pricing_type_codes(client): "filters": { "award_type_codes": ["A", "B", "C", "D"], "contract_pricing_type_codes": ["contract_pricing_test"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1600,12 +2033,18 @@ def _test_correct_response_for_contract_pricing_type_codes(client): } ), ) - expected_result = [{"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}] + expected_result = [ + { + "internal_id": 1, + "Award ID": "abc111", + "generated_internal_id": "CONT_AWD_TESTING_1", + } + ] assert resp.status_code == status.HTTP_200_OK assert 
len(resp.json().get("results")) == 1 - assert ( - resp.json().get("results") == expected_result - ), "Contract Pricing Type Codes filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "Contract Pricing Type Codes filter does not match expected result" + ) def _test_correct_response_for_set_aside_type_codes(client): @@ -1617,7 +2056,9 @@ def _test_correct_response_for_set_aside_type_codes(client): "filters": { "award_type_codes": ["A", "B", "C", "D"], "set_aside_type_codes": ["type_set_aside_test"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1628,10 +2069,18 @@ def _test_correct_response_for_set_aside_type_codes(client): } ), ) - expected_result = [{"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}] + expected_result = [ + { + "internal_id": 1, + "Award ID": "abc111", + "generated_internal_id": "CONT_AWD_TESTING_1", + } + ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert resp.json().get("results") == expected_result, "Set Aside Type Codes filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "Set Aside Type Codes filter does not match expected result" + ) def _test_correct_response_for_set_extent_competed_type_codes(client): @@ -1643,7 +2092,9 @@ def _test_correct_response_for_set_extent_competed_type_codes(client): "filters": { "award_type_codes": ["A", "B", "C", "D"], "extent_competed_type_codes": ["extent_competed_test"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1654,12 +2105,18 @@ def _test_correct_response_for_set_extent_competed_type_codes(client): } ), ) - expected_result = 
[{"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}] + expected_result = [ + { + "internal_id": 1, + "Award ID": "abc111", + "generated_internal_id": "CONT_AWD_TESTING_1", + } + ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert ( - resp.json().get("results") == expected_result - ), "Extent Competed Type Codes filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "Extent Competed Type Codes filter does not match expected result" + ) def _test_correct_response_for_recipient_id(client): @@ -1671,7 +2128,9 @@ def _test_correct_response_for_recipient_id(client): "filters": { "award_type_codes": ["02", "03", "04", "05"], "recipient_id": "51c7c0ad-a793-de3f-72ba-be5c2895a9ca", - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1682,10 +2141,16 @@ def _test_correct_response_for_recipient_id(client): } ), ) - expected_result = {"internal_id": 4, "Award ID": "abc444", "generated_internal_id": "ASST_NON_TESTING_4"} + expected_result = { + "internal_id": 4, + "Award ID": "abc444", + "generated_internal_id": "ASST_NON_TESTING_4", + } assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 7 - assert resp.json().get("results")[-1] == expected_result, "Recipient ID filter does not match expected result" + assert resp.json().get("results")[-1] == expected_result, ( + "Recipient ID filter does not match expected result" + ) def _test_correct_response_for_def_codes(client): @@ -1697,7 +2162,9 @@ def _test_correct_response_for_def_codes(client): "filters": { "award_type_codes": ["A", "B", "C", "D"], "def_codes": ["L", "Q"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], 
}, "fields": ["Award ID"], "page": 1, @@ -1709,12 +2176,22 @@ def _test_correct_response_for_def_codes(client): ), ) expected_result = [ - {"internal_id": 5, "Award ID": "abcdef123", "generated_internal_id": "CONT_AWD_TESTING_5"}, - {"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}, + { + "internal_id": 5, + "Award ID": "abcdef123", + "generated_internal_id": "CONT_AWD_TESTING_5", + }, + { + "internal_id": 1, + "Award ID": "abc111", + "generated_internal_id": "CONT_AWD_TESTING_1", + }, ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 2 - assert resp.json().get("results") == expected_result, "DEFC filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "DEFC filter does not match expected result" + ) resp = client.post( "/api/v2/search/spending_by_award", @@ -1724,7 +2201,9 @@ def _test_correct_response_for_def_codes(client): "filters": { "award_type_codes": ["A", "B", "C", "D"], "def_codes": ["J"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1738,7 +2217,9 @@ def _test_correct_response_for_def_codes(client): expected_result = [] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 0 - assert resp.json().get("results") == expected_result, "DEFC filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "DEFC filter does not match expected result" + ) def _test_correct_response_for_def_codes_subaward(client): @@ -1750,7 +2231,9 @@ def _test_correct_response_for_def_codes_subaward(client): "filters": { "award_type_codes": ["A", "B", "C", "D"], "def_codes": ["L"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, 
"fields": ["Sub-Award ID"], "page": 1, @@ -1783,7 +2266,9 @@ def _test_correct_response_for_def_codes_subaward(client): ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 3 - assert resp.json().get("results") == expected_result, "DEFC subaward filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "DEFC subaward filter does not match expected result" + ) resp = client.post( "/api/v2/search/spending_by_award", @@ -1793,7 +2278,9 @@ def _test_correct_response_for_def_codes_subaward(client): "filters": { "award_type_codes": ["A", "B", "C", "D"], "def_codes": ["J"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Sub-Award ID"], "page": 1, @@ -1807,7 +2294,9 @@ def _test_correct_response_for_def_codes_subaward(client): expected_result = [] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 0 - assert resp.json().get("results") == expected_result, "DEFC subaward filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "DEFC subaward filter does not match expected result" + ) @pytest.mark.django_db @@ -1815,7 +2304,11 @@ def test_failure_with_invalid_filters(client, monkeypatch, elasticsearch_award_i setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) # Fails with no request data - resp = client.post("/api/v2/search/spending_by_award", content_type="application/json", data=json.dumps({})) + resp = client.post( + "/api/v2/search/spending_by_award", + content_type="application/json", + data=json.dumps({}), + ) assert resp.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY assert resp.json().get("detail") == "Missing value: 'fields' is a required field" @@ -1823,10 +2316,21 @@ def test_failure_with_invalid_filters(client, monkeypatch, elasticsearch_award_i resp = client.post( 
"/api/v2/search/spending_by_award", content_type="application/json", - data=json.dumps({"fields": [], "filters": {}, "page": 1, "limit": 60, "spending_level": "awards"}), + data=json.dumps( + { + "fields": [], + "filters": {}, + "page": 1, + "limit": 60, + "spending_level": "awards", + } + ), ) assert resp.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY - assert resp.json().get("detail") == "Missing value: 'filters|award_type_codes' is a required field" + assert ( + resp.json().get("detail") + == "Missing value: 'filters|award_type_codes' is a required field" + ) # fails with empty field resp = client.post( @@ -1836,7 +2340,9 @@ def test_failure_with_invalid_filters(client, monkeypatch, elasticsearch_award_i { "fields": [], "filters": { - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], "award_type_codes": ["A", "B", "C", "D"], }, "page": 1, @@ -1846,11 +2352,15 @@ def test_failure_with_invalid_filters(client, monkeypatch, elasticsearch_award_i ), ) assert resp.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY - assert resp.json().get("detail") == "Field 'fields' value '[]' is below min '1' items" + assert ( + resp.json().get("detail") == "Field 'fields' value '[]' is below min '1' items" + ) @pytest.mark.django_db -def test_search_after(client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index): +def test_search_after( + client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index +): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) resp = client.post( @@ -1871,20 +2381,48 @@ def test_search_after(client, monkeypatch, spending_by_award_test_data, elastics ), ) expected_result = [ - {"internal_id": 2, "Award ID": "abc222", "generated_internal_id": "CONT_AWD_TESTING_2"}, - {"internal_id": 3, "Award ID": "abc333", "generated_internal_id": "CONT_AWD_TESTING_3"}, - {"internal_id": 5, "Award ID": "abcdef123", 
"generated_internal_id": "CONT_AWD_TESTING_5"}, - {"internal_id": 997, "Award ID": "award997", "generated_internal_id": "ASST_NON_TESTING_997"}, - {"internal_id": 998, "Award ID": "award998", "generated_internal_id": "ASST_NON_TESTING_998"}, - {"internal_id": 999, "Award ID": "award999", "generated_internal_id": "ASST_NON_TESTING_999"}, + { + "internal_id": 2, + "Award ID": "abc222", + "generated_internal_id": "CONT_AWD_TESTING_2", + }, + { + "internal_id": 3, + "Award ID": "abc333", + "generated_internal_id": "CONT_AWD_TESTING_3", + }, + { + "internal_id": 5, + "Award ID": "abcdef123", + "generated_internal_id": "CONT_AWD_TESTING_5", + }, + { + "internal_id": 997, + "Award ID": "award997", + "generated_internal_id": "ASST_NON_TESTING_997", + }, + { + "internal_id": 998, + "Award ID": "award998", + "generated_internal_id": "ASST_NON_TESTING_998", + }, + { + "internal_id": 999, + "Award ID": "award999", + "generated_internal_id": "ASST_NON_TESTING_999", + }, ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == len(expected_result) - assert resp.json().get("results") == expected_result, "Award Type Code filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "Award Type Code filter does not match expected result" + ) @pytest.mark.django_db -def test_no_0_covid_amounts(client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index): +def test_no_0_covid_amounts( + client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index +): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) resp = client.post( @@ -1895,7 +2433,9 @@ def test_no_0_covid_amounts(client, monkeypatch, spending_by_award_test_data, el "filters": { "award_type_codes": ["A", "B", "C", "D"], "def_codes": ["L"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award 
ID"], "page": 1, @@ -1906,14 +2446,24 @@ def test_no_0_covid_amounts(client, monkeypatch, spending_by_award_test_data, el } ), ) - expected_result = [{"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}] + expected_result = [ + { + "internal_id": 1, + "Award ID": "abc111", + "generated_internal_id": "CONT_AWD_TESTING_1", + } + ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert resp.json().get("results") == expected_result, "DEFC filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "DEFC filter does not match expected result" + ) @pytest.mark.django_db -def test_uei_keyword_filter(client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index): +def test_uei_keyword_filter( + client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index +): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) resp = client.post( @@ -1924,7 +2474,9 @@ def test_uei_keyword_filter(client, monkeypatch, spending_by_award_test_data, el "filters": { "award_type_codes": ["A", "B", "C", "D"], "keywords": ["testuei"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1935,14 +2487,24 @@ def test_uei_keyword_filter(client, monkeypatch, spending_by_award_test_data, el } ), ) - expected_result = [{"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}] + expected_result = [ + { + "internal_id": 1, + "Award ID": "abc111", + "generated_internal_id": "CONT_AWD_TESTING_1", + } + ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert resp.json().get("results") == expected_result, "UEI filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "UEI filter does not match 
expected result" + ) @pytest.mark.django_db -def test_parent_uei_keyword_filter(client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index): +def test_parent_uei_keyword_filter( + client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index +): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) resp = client.post( @@ -1953,7 +2515,9 @@ def test_parent_uei_keyword_filter(client, monkeypatch, spending_by_award_test_d "filters": { "award_type_codes": ["A", "B", "C", "D"], "keywords": ["test_parent_uei"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1964,15 +2528,27 @@ def test_parent_uei_keyword_filter(client, monkeypatch, spending_by_award_test_d } ), ) - expected_result = [{"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}] + expected_result = [ + { + "internal_id": 1, + "Award ID": "abc111", + "generated_internal_id": "CONT_AWD_TESTING_1", + } + ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert resp.json().get("results") == expected_result, "UEI filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "UEI filter does not match expected result" + ) @pytest.mark.django_db def test_uei_recipient_filter_subaward( - client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index, elasticsearch_subaward_index + client, + monkeypatch, + spending_by_award_test_data, + elasticsearch_award_index, + elasticsearch_subaward_index, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index) @@ -1983,7 +2559,9 @@ def test_uei_recipient_filter_subaward( data=json.dumps( { "filters": { - "time_period": [{"start_date": "2007-10-01", "end_date": "2022-09-30"}], + 
"time_period": [ + {"start_date": "2007-10-01", "end_date": "2022-09-30"} + ], "award_type_codes": [ "A", "B", @@ -2019,12 +2597,18 @@ def test_uei_recipient_filter_subaward( ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert resp.json().get("results") == expected_result, "UEI Recipient subaward filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "UEI Recipient subaward filter does not match expected result" + ) @pytest.mark.django_db def test_date_range_with_new_awards_only( - client, monkeypatch, elasticsearch_award_index, awards_over_different_date_ranges, elasticsearch_subaward_index + client, + monkeypatch, + elasticsearch_award_index, + awards_over_different_date_ranges, + elasticsearch_subaward_index, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index) @@ -2039,14 +2623,20 @@ def test_date_range_with_new_awards_only( "page": 1, "filters": { "time_period": [ - {"start_date": "2015-01-01", "end_date": "2015-12-31", "date_type": "new_awards_only"}, + { + "start_date": "2015-01-01", + "end_date": "2015-12-31", + "date_type": "new_awards_only", + }, ], "award_type_codes": contract_type_list, }, } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_for_2015) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(request_for_2015), ) assert resp.status_code == status.HTTP_200_OK assert len(resp.data["results"]) == 5 @@ -2059,14 +2649,20 @@ def test_date_range_with_new_awards_only( "page": 1, "filters": { "time_period": [ - {"start_date": "2015-01-01", "end_date": "2015-12-31", "date_type": "new_awards_only"}, + { + "start_date": "2015-01-01", + "end_date": "2015-12-31", + "date_type": "new_awards_only", + }, ], "award_type_codes": contract_type_list, }, } resp = client.post( - 
"/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_for_2015) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(request_for_2015), ) assert resp.status_code == status.HTTP_400_BAD_REQUEST assert ( @@ -2077,7 +2673,11 @@ def test_date_range_with_new_awards_only( @pytest.mark.django_db def test_spending_by_award_program_activity_subawards( - client, monkeypatch, elasticsearch_award_index, spending_by_award_test_data, elasticsearch_subaward_index + client, + monkeypatch, + elasticsearch_award_index, + spending_by_award_test_data, + elasticsearch_subaward_index, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index) @@ -2102,11 +2702,15 @@ def test_spending_by_award_program_activity_subawards( } ] resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK - assert expected_response == resp.json().get("results"), "Unexpected or missing content!" + assert expected_response == resp.json().get("results"), ( + "Unexpected or missing content!" + ) test_payload = { "spending_level": "subawards", @@ -2127,11 +2731,15 @@ def test_spending_by_award_program_activity_subawards( } ] resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK - assert expected_response == resp.json().get("results"), "Unexpected or missing content!" + assert expected_response == resp.json().get("results"), ( + "Unexpected or missing content!" 
+ ) test_payload = { "spending_level": "subawards", @@ -2145,15 +2753,21 @@ def test_spending_by_award_program_activity_subawards( } expected_response = [] resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK - assert expected_response == resp.json().get("results"), "Unexpected or missing content!" + assert expected_response == resp.json().get("results"), ( + "Unexpected or missing content!" + ) @pytest.mark.django_db -def test_spending_by_award_program_activity(client, monkeypatch, elasticsearch_award_index, award_data_fixture): +def test_spending_by_award_program_activity( + client, monkeypatch, elasticsearch_award_index, award_data_fixture +): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) # Program Activites filter test @@ -2175,11 +2789,15 @@ def test_spending_by_award_program_activity(client, monkeypatch, elasticsearch_a } ] resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK - assert expected_response == resp.json().get("results"), "Unexpected or missing content!" + assert expected_response == resp.json().get("results"), ( + "Unexpected or missing content!" 
+ ) test_payload = { "spending_level": "awards", @@ -2193,11 +2811,15 @@ def test_spending_by_award_program_activity(client, monkeypatch, elasticsearch_a } expected_response = [] resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK - assert expected_response == resp.json().get("results"), "Unexpected or missing content!" + assert expected_response == resp.json().get("results"), ( + "Unexpected or missing content!" + ) test_payload = { "spending_level": "awards", @@ -2217,11 +2839,15 @@ def test_spending_by_award_program_activity(client, monkeypatch, elasticsearch_a } ] resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK - assert expected_response == resp.json().get("results"), "Unexpected or missing content!" + assert expected_response == resp.json().get("results"), ( + "Unexpected or missing content!" + ) test_payload = { "spending_level": "awards", @@ -2241,11 +2867,15 @@ def test_spending_by_award_program_activity(client, monkeypatch, elasticsearch_a } ] resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK - assert expected_response == resp.json().get("results"), "Unexpected or missing content!" + assert expected_response == resp.json().get("results"), ( + "Unexpected or missing content!" 
+ ) @pytest.mark.django_db @@ -2271,10 +2901,16 @@ def test_spending_by_award_subawards_award_id_filter( "prime_award_generated_internal_id": "ASST_NON_DECF0000058_8900", } ] - resp = client.post("/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(payload)) + resp = client.post( + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(payload), + ) assert resp.status_code == status.HTTP_200_OK - assert expected_response == resp.json().get("results"), "Unexpected or missing content!" + assert expected_response == resp.json().get("results"), ( + "Unexpected or missing content!" + ) # Test finding a Subaward by it's `award_piid_fain` payload = { @@ -2293,15 +2929,25 @@ def test_spending_by_award_subawards_award_id_filter( "prime_award_generated_internal_id": "ASST_NON_DECF0000058_8900", } ] - resp = client.post("/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(payload)) + resp = client.post( + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(payload), + ) assert resp.status_code == status.HTTP_200_OK - assert expected_response == resp.json().get("results"), "Unexpected or missing content!" + assert expected_response == resp.json().get("results"), ( + "Unexpected or missing content!" 
+ ) @pytest.mark.django_db def test_spending_by_award_unique_id_award( - client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data + client, + monkeypatch, + elasticsearch_award_index, + elasticsearch_subaward_index, + spending_by_award_test_data, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index) @@ -2323,11 +2969,15 @@ def test_spending_by_award_unique_id_award( }, ] resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK - assert expected_response == resp.json().get("results"), "Unexpected or missing content!" + assert expected_response == resp.json().get("results"), ( + "Unexpected or missing content!" + ) # Test with an undefined award_unique_id test_payload = { @@ -2340,16 +2990,24 @@ def test_spending_by_award_unique_id_award( } expected_response = [] resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK - assert expected_response == resp.json().get("results"), "Unexpected or missing content!" + assert expected_response == resp.json().get("results"), ( + "Unexpected or missing content!" 
+ ) @pytest.mark.django_db def test_spending_by_award_unique_id_subaward( - client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data + client, + monkeypatch, + elasticsearch_award_index, + elasticsearch_subaward_index, + spending_by_award_test_data, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index) @@ -2378,11 +3036,15 @@ def test_spending_by_award_unique_id_subaward( }, ] resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK - assert expected_response == resp.json().get("results"), "Unexpected or missing content!" + assert expected_response == resp.json().get("results"), ( + "Unexpected or missing content!" + ) # Test with a single subaward test_payload = { @@ -2402,11 +3064,15 @@ def test_spending_by_award_unique_id_subaward( }, ] resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK - assert expected_response == resp.json().get("results"), "Unexpected or missing content!" + assert expected_response == resp.json().get("results"), ( + "Unexpected or missing content!" 
+ ) # Test with no subawards test_payload = { @@ -2419,15 +3085,23 @@ def test_spending_by_award_unique_id_subaward( } expected_response = [] resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK - assert expected_response == resp.json().get("results"), "Unexpected or missing content!" + assert expected_response == resp.json().get("results"), ( + "Unexpected or missing content!" + ) def test_spending_by_award_description_specificity( - client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data + client, + monkeypatch, + elasticsearch_award_index, + elasticsearch_subaward_index, + spending_by_award_test_data, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index) @@ -2436,7 +3110,10 @@ def test_spending_by_award_description_specificity( test_payload = { "spending_level": "awards", "fields": ["Award ID"], - "filters": {"award_type_codes": ["A", "B", "C", "D"], "description": "the test"}, + "filters": { + "award_type_codes": ["A", "B", "C", "D"], + "description": "the test", + }, } expected_response = [ { @@ -2446,17 +3123,24 @@ def test_spending_by_award_description_specificity( }, ] resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK - assert expected_response == resp.json().get("results"), "Unexpected or missing content!" + assert expected_response == resp.json().get("results"), ( + "Unexpected or missing content!" 
+ ) # get subaward with description "the test test test" and not "the description for test" test_payload = { "spending_level": "subawards", "fields": ["Sub-Award ID"], - "filters": {"award_type_codes": ["A", "B", "C", "D"], "description": "the test"}, + "filters": { + "award_type_codes": ["A", "B", "C", "D"], + "description": "the test", + }, } expected_response = [ { @@ -2467,29 +3151,44 @@ def test_spending_by_award_description_specificity( }, ] resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK - assert expected_response == resp.json().get("results"), "Unexpected or missing content!" + assert expected_response == resp.json().get("results"), ( + "Unexpected or missing content!" + ) # ensure only queries for text in the correct order test_payload = { "spending_level": "subawards", "fields": ["Sub-Award ID"], - "filters": {"award_type_codes": ["A", "B", "C", "D"], "description": "test the"}, + "filters": { + "award_type_codes": ["A", "B", "C", "D"], + "description": "test the", + }, } expected_response = [] resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK - assert expected_response == resp.json().get("results"), "Unexpected or missing content!" + assert expected_response == resp.json().get("results"), ( + "Unexpected or missing content!" 
+ ) def test_spending_by_award_keyword_specificity( - client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data + client, + monkeypatch, + elasticsearch_award_index, + elasticsearch_subaward_index, + spending_by_award_test_data, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index) @@ -2508,11 +3207,15 @@ def test_spending_by_award_keyword_specificity( }, ] resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK - assert expected_response == resp.json().get("results"), "Unexpected or missing content!" + assert expected_response == resp.json().get("results"), ( + "Unexpected or missing content!" + ) # get subaward with product_or_service_description "the test test test" and not # "the description for test" @@ -2530,11 +3233,15 @@ def test_spending_by_award_keyword_specificity( }, ] resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK - assert expected_response == resp.json().get("results"), "Unexpected or missing content!" + assert expected_response == resp.json().get("results"), ( + "Unexpected or missing content!" 
+ ) # ensure only queries for text in the correct order test_payload = { @@ -2544,15 +3251,23 @@ def test_spending_by_award_keyword_specificity( } expected_response = [] resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK - assert expected_response == resp.json().get("results"), "Unexpected or missing content!" + assert expected_response == resp.json().get("results"), ( + "Unexpected or missing content!" + ) def test_spending_by_award_new_subcontract_fields( - client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data + client, + monkeypatch, + elasticsearch_award_index, + elasticsearch_subaward_index, + spending_by_award_test_data, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index) @@ -2614,15 +3329,23 @@ def test_spending_by_award_new_subcontract_fields( }, ] resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK - assert expected_response == resp.json().get("results"), "Unexpected or missing content!" + assert expected_response == resp.json().get("results"), ( + "Unexpected or missing content!" 
+ ) def test_spending_by_award_new_subgrant_fields( - client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data + client, + monkeypatch, + elasticsearch_award_index, + elasticsearch_subaward_index, + spending_by_award_test_data, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index) @@ -2677,27 +3400,45 @@ def test_spending_by_award_new_subgrant_fields( "zip5": "55455", }, "Prime Award Recipient UEI": "uei 1", - "Assistance Listing": {"cfda_number": "1.234", "cfda_program_title": "test cfda"}, + "Assistance Listing": { + "cfda_number": "1.234", + "cfda_program_title": "test cfda", + }, "sub_award_recipient_id": "EXAM-PLE-ID-P", }, ] resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK - assert expected_response == resp.json().get("results"), "Unexpected or missing content!" + assert expected_response == resp.json().get("results"), ( + "Unexpected or missing content!" 
+ ) def test_spending_by_award_new_contract_fields( - client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data + client, + monkeypatch, + elasticsearch_award_index, + elasticsearch_subaward_index, + spending_by_award_test_data, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) # get award with naics_description "the test test test" and not "the description for test" test_payload = { "spending_level": "awards", - "fields": ["Award ID", "Recipient UEI", "Recipient Location", "Primary Place of Performance", "NAICS", "PSC"], + "fields": [ + "Award ID", + "Recipient UEI", + "Recipient Location", + "Primary Place of Performance", + "NAICS", + "PSC", + ], "filters": {"award_type_codes": ["A", "B", "C", "D"], "keyword": "the test"}, } expected_response = [ @@ -2740,15 +3481,23 @@ def test_spending_by_award_new_contract_fields( }, ] resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK - assert expected_response == resp.json().get("results"), "Unexpected or missing content!" + assert expected_response == resp.json().get("results"), ( + "Unexpected or missing content!" 
+ ) def test_spending_by_award_new_assistance_fields( - client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data + client, + monkeypatch, + elasticsearch_award_index, + elasticsearch_subaward_index, + spending_by_award_test_data, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) @@ -2801,7 +3550,10 @@ def test_spending_by_award_new_assistance_fields( "zip5": "55455", }, "Assistance Listings": [ - {"cfda_number": "64.114", "cfda_program_title": "VETERANS HOUSING GUARANTEED AND INSURED LOANS"} + { + "cfda_number": "64.114", + "cfda_program_title": "VETERANS HOUSING GUARANTEED AND INSURED LOANS", + } ], "primary_assistance_listing": { "cfda_number": "64.114", @@ -2810,17 +3562,24 @@ def test_spending_by_award_new_assistance_fields( }, ] resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK - assert expected_response == resp.json().get("results"), "Unexpected or missing content!" + assert expected_response == resp.json().get("results"), ( + "Unexpected or missing content!" 
+ ) def test_spending_by_award_sort_recipient_location( - client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data + client, + monkeypatch, + elasticsearch_award_index, + elasticsearch_subaward_index, + spending_by_award_test_data, ): - setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) test_payload = { @@ -2961,7 +3720,9 @@ def test_spending_by_award_sort_recipient_location( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -2987,7 +3748,9 @@ def test_spending_by_award_sort_recipient_location( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3003,9 +3766,12 @@ def test_spending_by_award_sort_recipient_location( def test_spending_by_primary_place_of_performance( - client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data + client, + monkeypatch, + elasticsearch_award_index, + elasticsearch_subaward_index, + spending_by_award_test_data, ): - setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) test_payload = { @@ -3098,7 +3864,9 @@ def test_spending_by_primary_place_of_performance( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3113,9 +3881,12 @@ def test_spending_by_primary_place_of_performance( def test_spending_by_award_sort_naics( - client, monkeypatch, 
elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data + client, + monkeypatch, + elasticsearch_award_index, + elasticsearch_subaward_index, + spending_by_award_test_data, ): - setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) test_payload = { @@ -3130,7 +3901,9 @@ def test_spending_by_award_sort_naics( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) naics_1 = {"code": "123456", "description": "1"} @@ -3158,7 +3931,9 @@ def test_spending_by_award_sort_naics( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3170,9 +3945,12 @@ def test_spending_by_award_sort_naics( def test_spending_by_award_sort_psc( - client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data + client, + monkeypatch, + elasticsearch_award_index, + elasticsearch_subaward_index, + spending_by_award_test_data, ): - setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) test_payload = { @@ -3187,7 +3965,9 @@ def test_spending_by_award_sort_psc( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) psc1 = {"code": "PSC1", "description": "PSC description 1"} @@ -3215,7 +3995,9 @@ def test_spending_by_award_sort_psc( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + 
data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3227,9 +4009,12 @@ def test_spending_by_award_sort_psc( def test_spending_by_award_assistance_listings( - client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data + client, + monkeypatch, + elasticsearch_award_index, + elasticsearch_subaward_index, + spending_by_award_test_data, ): - setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) test_payload = { @@ -3244,7 +4029,9 @@ def test_spending_by_award_assistance_listings( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assisance_listing1 = [{"cfda_number": "12", "cfda_program_title": "program1"}] @@ -3272,7 +4059,9 @@ def test_spending_by_award_assistance_listings( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3284,7 +4073,11 @@ def test_spending_by_award_assistance_listings( def test_spending_by_award_sort_sub_recipient_locations( - client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data + client, + monkeypatch, + elasticsearch_award_index, + elasticsearch_subaward_index, + spending_by_award_test_data, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index) @@ -3300,7 +4093,9 @@ def test_spending_by_award_sort_sub_recipient_locations( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + 
data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3309,7 +4104,10 @@ def test_spending_by_award_sort_sub_recipient_locations( assert results[0]["Sub-Recipient Location"]["city_name"] == "ARLINGTON" assert results[0]["Sub-Recipient Location"]["address_line1"] == "1 Memorial Drive" assert results[1]["Sub-Recipient Location"]["city_name"] == "ARLINGTON" - assert results[1]["Sub-Recipient Location"]["address_line1"] == "600 CALIFORNIA STREET FL 18" + assert ( + results[1]["Sub-Recipient Location"]["address_line1"] + == "600 CALIFORNIA STREET FL 18" + ) assert results[2]["Sub-Recipient Location"]["city_name"] == "SAN FRANCISCO" assert results[3]["Sub-Recipient Location"]["state_code"] == "CA" assert results[3]["Sub-Recipient Location"]["city_name"] is None @@ -3332,7 +4130,9 @@ def test_spending_by_award_sort_sub_recipient_locations( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3340,7 +4140,10 @@ def test_spending_by_award_sort_sub_recipient_locations( assert len(results) == 7 assert results[0]["Sub-Recipient Location"]["city_name"] == "SAN FRANCISCO" assert results[1]["Sub-Recipient Location"]["city_name"] == "ARLINGTON" - assert results[1]["Sub-Recipient Location"]["address_line1"] == "600 CALIFORNIA STREET FL 18" + assert ( + results[1]["Sub-Recipient Location"]["address_line1"] + == "600 CALIFORNIA STREET FL 18" + ) assert results[2]["Sub-Recipient Location"]["city_name"] == "ARLINGTON" assert results[2]["Sub-Recipient Location"]["address_line1"] == "1 Memorial Drive" assert results[3]["Sub-Recipient Location"]["state_code"] == "NE" @@ -3350,7 +4153,11 @@ def test_spending_by_award_sort_sub_recipient_locations( def test_spending_by_award_sort_sub_pop_location( - client, monkeypatch, 
elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data + client, + monkeypatch, + elasticsearch_award_index, + elasticsearch_subaward_index, + spending_by_award_test_data, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index) @@ -3367,23 +4174,37 @@ def test_spending_by_award_sort_sub_pop_location( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK results = resp.json().get("results") assert len(results) == 7 - assert results[0]["Sub-Award Primary Place of Performance"]["city_name"] == "ARLINGTON" - assert results[1]["Sub-Award Primary Place of Performance"]["city_name"] == "ARLINGTON" - assert results[2]["Sub-Award Primary Place of Performance"]["city_name"] == "LOS ANGELES" + assert ( + results[0]["Sub-Award Primary Place of Performance"]["city_name"] == "ARLINGTON" + ) + assert ( + results[1]["Sub-Award Primary Place of Performance"]["city_name"] == "ARLINGTON" + ) + assert ( + results[2]["Sub-Award Primary Place of Performance"]["city_name"] + == "LOS ANGELES" + ) assert results[3]["Sub-Award Primary Place of Performance"]["city_name"] is None assert results[3]["Sub-Award Primary Place of Performance"]["state_code"] == "IL" assert results[4]["Sub-Award Primary Place of Performance"]["city_name"] is None assert results[4]["Sub-Award Primary Place of Performance"]["state_code"] == "VA" assert results[5]["Sub-Award Primary Place of Performance"]["state_code"] is None - assert results[5]["Sub-Award Primary Place of Performance"]["country_name"] == "LAOS" + assert ( + results[5]["Sub-Award Primary Place of Performance"]["country_name"] == "LAOS" + ) assert results[6]["Sub-Award Primary Place of Performance"]["state_code"] is None - 
assert results[6]["Sub-Award Primary Place of Performance"]["country_name"] == "UNITED STATES" + assert ( + results[6]["Sub-Award Primary Place of Performance"]["country_name"] + == "UNITED STATES" + ) test_payload = { "spending_level": "subawards", @@ -3397,23 +4218,41 @@ def test_spending_by_award_sort_sub_pop_location( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK results = resp.json().get("results") assert len(results) == 7 - assert results[0]["Sub-Award Primary Place of Performance"]["city_name"] == "LOS ANGELES" - assert results[1]["Sub-Award Primary Place of Performance"]["city_name"] == "ARLINGTON" - assert results[2]["Sub-Award Primary Place of Performance"]["city_name"] == "ARLINGTON" + assert ( + results[0]["Sub-Award Primary Place of Performance"]["city_name"] + == "LOS ANGELES" + ) + assert ( + results[1]["Sub-Award Primary Place of Performance"]["city_name"] == "ARLINGTON" + ) + assert ( + results[2]["Sub-Award Primary Place of Performance"]["city_name"] == "ARLINGTON" + ) assert results[3]["Sub-Award Primary Place of Performance"]["state_code"] == "VA" assert results[4]["Sub-Award Primary Place of Performance"]["state_code"] == "IL" - assert results[5]["Sub-Award Primary Place of Performance"]["country_name"] == "UNITED STATES" - assert results[6]["Sub-Award Primary Place of Performance"]["country_name"] == "LAOS" + assert ( + results[5]["Sub-Award Primary Place of Performance"]["country_name"] + == "UNITED STATES" + ) + assert ( + results[6]["Sub-Award Primary Place of Performance"]["country_name"] == "LAOS" + ) def test_spending_by_award_sort_sub_assistance_listing( - client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data + client, + monkeypatch, + 
elasticsearch_award_index, + elasticsearch_subaward_index, + spending_by_award_test_data, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index) @@ -3430,7 +4269,9 @@ def test_spending_by_award_sort_sub_assistance_listing( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3457,7 +4298,9 @@ def test_spending_by_award_sort_sub_assistance_listing( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3472,7 +4315,11 @@ def test_spending_by_award_sort_sub_assistance_listing( def test_spending_by_award_sort_sub_naics( - client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data + client, + monkeypatch, + elasticsearch_award_index, + elasticsearch_subaward_index, + spending_by_award_test_data, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index) @@ -3489,7 +4336,9 @@ def test_spending_by_award_sort_sub_naics( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3514,7 +4363,9 @@ def test_spending_by_award_sort_sub_naics( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + 
content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3529,7 +4380,11 @@ def test_spending_by_award_sort_sub_naics( def test_spending_by_award_sort_sub_psc( - client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data + client, + monkeypatch, + elasticsearch_award_index, + elasticsearch_subaward_index, + spending_by_award_test_data, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index) @@ -3546,7 +4401,9 @@ def test_spending_by_award_sort_sub_psc( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3571,7 +4428,9 @@ def test_spending_by_award_sort_sub_psc( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3586,9 +4445,12 @@ def test_spending_by_award_sort_sub_psc( def test_spending_by_subaward_new_sort_fields( - client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data + client, + monkeypatch, + elasticsearch_award_index, + elasticsearch_subaward_index, + spending_by_award_test_data, ): - setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index) @@ -3604,7 +4466,9 @@ def test_spending_by_subaward_new_sort_fields( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + 
content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3625,7 +4489,9 @@ def test_spending_by_subaward_new_sort_fields( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3646,7 +4512,9 @@ def test_spending_by_subaward_new_sort_fields( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3667,7 +4535,9 @@ def test_spending_by_subaward_new_sort_fields( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3688,7 +4558,9 @@ def test_spending_by_subaward_new_sort_fields( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3709,7 +4581,9 @@ def test_spending_by_subaward_new_sort_fields( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3730,7 +4604,9 @@ def test_spending_by_subaward_new_sort_fields( } resp = client.post( - "/api/v2/search/spending_by_award/", 
content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3741,7 +4617,9 @@ def test_spending_by_subaward_new_sort_fields( @pytest.mark.django_db -def test_covid_and_iija_values(client, monkeypatch, elasticsearch_award_index, award_data_fixture): +def test_covid_and_iija_values( + client, monkeypatch, elasticsearch_award_index, award_data_fixture +): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) request_body = { "spending_level": "awards", @@ -3763,7 +4641,9 @@ def test_covid_and_iija_values(client, monkeypatch, elasticsearch_award_index, a } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_body) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(request_body), ) expected_result = [ { @@ -3799,7 +4679,9 @@ def test_covid_and_iija_values(client, monkeypatch, elasticsearch_award_index, a } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_body) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(request_body), ) expected_result = [ { @@ -3833,7 +4715,9 @@ def test_spending_by_subaward_place_of_perf_zip_filter( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3850,13 +4734,18 @@ def test_spending_by_subaward_recipient_location_zip_filter( test_payload = { "spending_level": "subawards", "fields": ["Sub-Award ID"], - "filters": {"award_type_codes": ["07", "08"], "recipient_locations": [{"country": "USA", "zip": "12345"}]}, + "filters": { + 
"award_type_codes": ["07", "08"], + "recipient_locations": [{"country": "USA", "zip": "12345"}], + }, "sort": "Sub-Award ID", "order": "desc", } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3866,7 +4755,11 @@ def test_spending_by_subaward_recipient_location_zip_filter( def test_spending_by_award_sort_contract_award_type( - client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data + client, + monkeypatch, + elasticsearch_award_index, + elasticsearch_subaward_index, + spending_by_award_test_data, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) test_payload = { @@ -3881,13 +4774,19 @@ def test_spending_by_award_sort_contract_award_type( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK def test_spending_by_award_sort_recipient_uei( - client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data + client, + monkeypatch, + elasticsearch_award_index, + elasticsearch_subaward_index, + spending_by_award_test_data, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) test_payload = { @@ -3902,6 +4801,8 @@ def test_spending_by_award_sort_recipient_uei( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK From aaac2b6402c70f78244b722be620359ae8645a06 Mon Sep 17 00:00:00 
2001 From: aguest-kc Date: Thu, 5 Feb 2026 10:19:09 -0600 Subject: [PATCH 39/59] [DEV-14451] Ruff ignore files with test data --- pyproject.toml | 62 +++++++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 46195c3d32..2e5ff823b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ readme = 'README.md' license = "CC0-1.0" license-files = ["LICENSE"] requires-python = '>=3.10' -classifiers=[ +classifiers = [ "Development Status :: 5 - Production/Stable", "Programming Language :: Python", "Programming Language :: Python :: 3", @@ -137,14 +137,13 @@ DJANGO_SETTINGS_MODULE = "usaspending_api.settings" addopts = "--cov=usaspending_api" markers = [ "signal_handling: Mark all tests that import the signal library and invoke signals. This MUST be done on the main thread, and can cause errors if pytest-xdist subordinates parellel test sessions to background threads.", - # These are "auto" marked based on fixture usage. See conftest.py pytest_collection_modifyitems "spark: Mark all tests using the spark fixture. Can be selected with -m spark or deselected with -m (not spark)", "database: Mark all integration tests using a database. Can be selected with -m database or deselected with -m (not database)", "elasticsearch: Mark all integration tests using Elasticsearch. Can be selected with -m database or deselected with -m (not elasticsearch)", ] pythonpath = [ - "." + "." 
] [tool.coverage.run] @@ -174,25 +173,25 @@ exclude = [ ] select = [ - "PLR0913", # max arguments in function - "PLR0904", # max number of public methods - "PLR0911", # max number of return statements - "PLR0916", # max number of boolean expressions - "PLR0915", # max number of lines in a function - "PLR0912", # max number of logical branches in a function - "PLR1702", # max number of nested blocks - "C901", # cognitive complexity (functions) - "I001", # unsorted imports - "B", # flake8 bugbear - "E", # pycodestyle errors - "F", # pyflakes - "W", # pycodestyle warnings - "ANN001", # missing type annotation for function argument - "ANN201", # missing return type annotation for public function or method - "ANN202", # missing return type annotation for private function or method + "PLR0913", # max arguments in function + "PLR0904", # max number of public methods + "PLR0911", # max number of return statements + "PLR0916", # max number of boolean expressions + "PLR0915", # max number of lines in a function + "PLR0912", # max number of logical branches in a function + "PLR1702", # max number of nested blocks + "C901", # cognitive complexity (functions) + "I001", # unsorted imports + "B", # flake8 bugbear + "E", # pycodestyle errors + "F", # pyflakes + "W", # pycodestyle warnings + "ANN001", # missing type annotation for function argument + "ANN201", # missing return type annotation for public function or method + "ANN202", # missing return type annotation for private function or method ] ignore = [ - "E203", # whitespace before punctuation + "E203", # whitespace before punctuation ] pylint.max-args = 6 @@ -207,16 +206,23 @@ pycodestyle.max-line-length = 120 [tool.ruff.lint.per-file-ignores] "**/tests/**/test*.py" = [ - "ANN001", # missing-type-function-argument; avoid conflict with fixtures - "ANN201", # missing-return-type-undocumented-public-function; avoid conflict with test case return values - "ANN202", # missing-return-type-undocumented-private-function; avoid 
conflict with test case return values + "ANN001", # missing-type-function-argument; avoid conflict with fixtures + "ANN201", # missing-return-type-undocumented-public-function; avoid conflict with test case return values + "ANN202", # missing-return-type-undocumented-private-function; avoid conflict with test case return values "PLR0913", # too-many-arguments; avoid conflict with too many fixtures - "PLR0915", # too-many-statements; avoid conflict with long fixtures + "PLR0915", # too-many-statements; avoid conflict with long fixtures ] "**/conftest*.py" = [ - "ANN001", # missing-type-function-argument; avoid conflict with fixtures - "ANN201", # missing-return-type-undocumented-public-function; avoid conflict with test case return values - "ANN202", # missing-return-type-undocumented-private-function; avoid conflict with test case return values + "ANN001", # missing-type-function-argument; avoid conflict with fixtures + "ANN201", # missing-return-type-undocumented-public-function; avoid conflict with test case return values + "ANN202", # missing-return-type-undocumented-private-function; avoid conflict with test case return values + "PLR0913", # too-many-arguments; avoid conflict with too many fixtures + "PLR0915", # too-many-statements; avoid conflict with long fixtures +] +"**/tests/**/data/**.py" = [ + "ANN001", # missing-type-function-argument; avoid conflict with fixtures + "ANN201", # missing-return-type-undocumented-public-function; avoid conflict with test case return values + "ANN202", # missing-return-type-undocumented-private-function; avoid conflict with test case return values "PLR0913", # too-many-arguments; avoid conflict with too many fixtures - "PLR0915", # too-many-statements; avoid conflict with long fixtures + "PLR0915", # too-many-statements; avoid conflict with long fixtures ] From 9203d0f3630e483b8c75c480732f0d490046a66a Mon Sep 17 00:00:00 2001 From: aguest-kc Date: Thu, 5 Feb 2026 10:40:34 -0600 Subject: [PATCH 40/59] [DEV-14451] Ruff fix 
--- .../search/tests/integration/test_spending_by_award.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/usaspending_api/search/tests/integration/test_spending_by_award.py b/usaspending_api/search/tests/integration/test_spending_by_award.py index b7d8981354..0663d12589 100644 --- a/usaspending_api/search/tests/integration/test_spending_by_award.py +++ b/usaspending_api/search/tests/integration/test_spending_by_award.py @@ -2667,7 +2667,7 @@ def test_date_range_with_new_awards_only( assert resp.status_code == status.HTTP_400_BAD_REQUEST assert ( resp.json().get("detail") - == "Field 'filters|time_period' is outside valid values ['action_date', 'last_modified_date', 'date_signed', 'sub_action_date']" + == "Field 'filters|time_period' is outside valid values ['action_date', 'last_modified_date', 'date_signed', 'sub_action_date']" # noqa: E501 ) From 08169264c4867c869c068c2baa275fc67053c34b Mon Sep 17 00:00:00 2001 From: aguest-kc Date: Thu, 5 Feb 2026 11:17:57 -0600 Subject: [PATCH 41/59] [DEV-14451] Test fixes --- .../tests/integration/test_cfda_count.py | 2 +- .../tests/integration/test_cfda_loans.py | 2 +- .../tests/integration/test_cfda_spending.py | 100 ++++++++++++++---- .../integration/test_disaster_agency_loans.py | 2 +- .../tests/integration/test_recipient_loans.py | 2 +- 5 files changed, 85 insertions(+), 23 deletions(-) diff --git a/usaspending_api/disaster/tests/integration/test_cfda_count.py b/usaspending_api/disaster/tests/integration/test_cfda_count.py index f574229dc3..217c23e761 100644 --- a/usaspending_api/disaster/tests/integration/test_cfda_count.py +++ b/usaspending_api/disaster/tests/integration/test_cfda_count.py @@ -91,5 +91,5 @@ def test_invalid_award_type_codes( assert resp.status_code == status.HTTP_400_BAD_REQUEST assert ( resp.data["detail"] - == "Field 'filter|award_type_codes' is outside valid values ['-1', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11']" + == "Field 'filter|award_type_codes' is outside 
valid values ['-1', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', 'F001', 'F002', 'F003', 'F004', 'F005', 'F006', 'F007', 'F008', 'F009', 'F010']" # noqa: E501 ) diff --git a/usaspending_api/disaster/tests/integration/test_cfda_loans.py b/usaspending_api/disaster/tests/integration/test_cfda_loans.py index 9807903d4e..f998c39f2d 100644 --- a/usaspending_api/disaster/tests/integration/test_cfda_loans.py +++ b/usaspending_api/disaster/tests/integration/test_cfda_loans.py @@ -219,7 +219,7 @@ def test_invalid_award_type_codes( resp = helpers.post_for_spending_endpoint(client, url, award_type_codes=["ZZ", "08"], def_codes=["L", "M"]) assert resp.status_code == status.HTTP_400_BAD_REQUEST - assert resp.data["detail"] == "Field 'filter|award_type_codes' is outside valid values ['07', '08']" + assert resp.data["detail"] == "Field 'filter|award_type_codes' is outside valid values ['07', '08', 'F003', 'F004']" @pytest.mark.django_db diff --git a/usaspending_api/disaster/tests/integration/test_cfda_spending.py b/usaspending_api/disaster/tests/integration/test_cfda_spending.py index 5f3e764516..b4dfc71b85 100644 --- a/usaspending_api/disaster/tests/integration/test_cfda_spending.py +++ b/usaspending_api/disaster/tests/integration/test_cfda_spending.py @@ -20,7 +20,11 @@ def test_correct_response_defc_no_results( @pytest.mark.django_db def test_correct_response_single_defc( - client, monkeypatch, helpers, elasticsearch_award_index, cfda_awards_and_transactions + client, + monkeypatch, + helpers, + elasticsearch_award_index, + cfda_awards_and_transactions, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) @@ -75,7 +79,11 @@ def test_correct_response_single_defc( @pytest.mark.django_db def test_correct_response_multiple_defc( - client, monkeypatch, helpers, elasticsearch_award_index, cfda_awards_and_transactions + client, + monkeypatch, + helpers, + elasticsearch_award_index, + cfda_awards_and_transactions, ): setup_elasticsearch_test(monkeypatch, 
elasticsearch_award_index) @@ -131,16 +139,24 @@ def test_correct_response_multiple_defc( @pytest.mark.django_db def test_correct_response_with_query( - client, monkeypatch, helpers, elasticsearch_award_index, cfda_awards_and_transactions + client, + monkeypatch, + helpers, + elasticsearch_award_index, + cfda_awards_and_transactions, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) - resp = helpers.post_for_spending_endpoint(client, url, def_codes=["L", "M"], query="GIBBERISH") + resp = helpers.post_for_spending_endpoint( + client, url, def_codes=["L", "M"], query="GIBBERISH" + ) expected_results = [] assert resp.status_code == status.HTTP_200_OK assert resp.json()["results"] == expected_results - resp = helpers.post_for_spending_endpoint(client, url, def_codes=["L", "M"], query="3") + resp = helpers.post_for_spending_endpoint( + client, url, def_codes=["L", "M"], query="3" + ) expected_results = [ { "code": "30.300", @@ -163,16 +179,24 @@ def test_correct_response_with_query( @pytest.mark.django_db def test_correct_response_with_award_type_codes( - client, monkeypatch, helpers, elasticsearch_award_index, cfda_awards_and_transactions + client, + monkeypatch, + helpers, + elasticsearch_award_index, + cfda_awards_and_transactions, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) - resp = helpers.post_for_spending_endpoint(client, url, def_codes=["L", "M"], award_type_codes=["11"]) + resp = helpers.post_for_spending_endpoint( + client, url, def_codes=["L", "M"], award_type_codes=["11"] + ) expected_results = [] assert resp.status_code == status.HTTP_200_OK assert resp.json()["results"] == expected_results - resp = helpers.post_for_spending_endpoint(client, url, def_codes=["L", "M"], award_type_codes=["07", "09", "11"]) + resp = helpers.post_for_spending_endpoint( + client, url, def_codes=["L", "M"], award_type_codes=["07", "09", "11"] + ) expected_results = [ { "code": "20.200", @@ -208,53 +232,91 @@ def 
test_correct_response_with_award_type_codes( @pytest.mark.django_db -def test_invalid_defc(client, monkeypatch, helpers, elasticsearch_award_index, cfda_awards_and_transactions): +def test_invalid_defc( + client, + monkeypatch, + helpers, + elasticsearch_award_index, + cfda_awards_and_transactions, +): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) resp = helpers.post_for_spending_endpoint(client, url, def_codes=["ZZ"]) assert resp.status_code == status.HTTP_400_BAD_REQUEST - assert resp.data["detail"] == "Field 'filter|def_codes' is outside valid values ['L', 'M', 'N']" + assert ( + resp.data["detail"] + == "Field 'filter|def_codes' is outside valid values ['L', 'M', 'N']" + ) @pytest.mark.django_db -def test_invalid_defc_type(client, monkeypatch, helpers, elasticsearch_award_index, cfda_awards_and_transactions): +def test_invalid_defc_type( + client, + monkeypatch, + helpers, + elasticsearch_award_index, + cfda_awards_and_transactions, +): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) resp = helpers.post_for_spending_endpoint(client, url, def_codes="100") assert resp.status_code == status.HTTP_400_BAD_REQUEST - assert resp.data["detail"] == "Invalid value in 'filter|def_codes'. '100' is not a valid type (array)" + assert ( + resp.data["detail"] + == "Invalid value in 'filter|def_codes'. 
'100' is not a valid type (array)" + ) @pytest.mark.django_db -def test_missing_defc(client, monkeypatch, helpers, elasticsearch_award_index, cfda_awards_and_transactions): +def test_missing_defc( + client, + monkeypatch, + helpers, + elasticsearch_award_index, + cfda_awards_and_transactions, +): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) resp = helpers.post_for_spending_endpoint(client, url) assert resp.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY - assert resp.data["detail"] == "Missing value: 'filter|def_codes' is a required field" + assert ( + resp.data["detail"] == "Missing value: 'filter|def_codes' is a required field" + ) @pytest.mark.django_db def test_invalid_award_type_codes( - client, monkeypatch, helpers, elasticsearch_award_index, cfda_awards_and_transactions + client, + monkeypatch, + helpers, + elasticsearch_award_index, + cfda_awards_and_transactions, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) - resp = helpers.post_for_spending_endpoint(client, url, award_type_codes=["ZZ", "08"], def_codes=["L", "M"]) + resp = helpers.post_for_spending_endpoint( + client, url, award_type_codes=["ZZ", "08"], def_codes=["L", "M"] + ) assert resp.status_code == status.HTTP_400_BAD_REQUEST assert ( resp.data["detail"] - == "Field 'filter|award_type_codes' is outside valid values ['-1', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11']" + == "Field 'filter|award_type_codes' is outside valid values ['-1', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', 'F001', 'F002', 'F003', 'F004', 'F005', 'F006', 'F007', 'F008', 'F009', 'F010']" # noqa: E501 ) @pytest.mark.django_db def test_pagination_page_and_limit( - client, monkeypatch, helpers, elasticsearch_award_index, cfda_awards_and_transactions + client, + monkeypatch, + helpers, + elasticsearch_award_index, + cfda_awards_and_transactions, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) - resp = 
helpers.post_for_spending_endpoint(client, url, def_codes=["L", "M"], page=2, limit=1, sort="description") + resp = helpers.post_for_spending_endpoint( + client, url, def_codes=["L", "M"], page=2, limit=1, sort="description" + ) expected_results = { "totals": {"award_count": 4, "obligation": 2222.0, "outlay": 1100.0}, "results": [ diff --git a/usaspending_api/disaster/tests/integration/test_disaster_agency_loans.py b/usaspending_api/disaster/tests/integration/test_disaster_agency_loans.py index 1404c0a9da..292ec2b1b5 100644 --- a/usaspending_api/disaster/tests/integration/test_disaster_agency_loans.py +++ b/usaspending_api/disaster/tests/integration/test_disaster_agency_loans.py @@ -111,4 +111,4 @@ def test_invalid_award_type_codes(client, monkeypatch, helpers, elasticsearch_aw resp = helpers.post_for_spending_endpoint(client, url, award_type_codes=["ZZ", "08"], def_codes=["L", "M"]) assert resp.status_code == status.HTTP_400_BAD_REQUEST - assert resp.data["detail"] == "Field 'filter|award_type_codes' is outside valid values ['07', '08']" + assert resp.data["detail"] == "Field 'filter|award_type_codes' is outside valid values ['07', '08', 'F003', 'F004']" diff --git a/usaspending_api/disaster/tests/integration/test_recipient_loans.py b/usaspending_api/disaster/tests/integration/test_recipient_loans.py index 1b047f5186..526a7a6dae 100644 --- a/usaspending_api/disaster/tests/integration/test_recipient_loans.py +++ b/usaspending_api/disaster/tests/integration/test_recipient_loans.py @@ -287,7 +287,7 @@ def test_invalid_award_type_codes(client, monkeypatch, helpers, elasticsearch_aw resp = helpers.post_for_spending_endpoint(client, url, award_type_codes=["ZZ", "08"], def_codes=["L", "M"]) assert resp.status_code == status.HTTP_400_BAD_REQUEST - assert resp.data["detail"] == "Field 'filter|award_type_codes' is outside valid values ['07', '08']" + assert resp.data["detail"] == "Field 'filter|award_type_codes' is outside valid values ['07', '08', 'F003', 'F004']" 
@pytest.mark.django_db From b799dcd9f1ba8c8c7a64e527ee1b27d839e23efe Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Thu, 5 Feb 2026 12:56:34 -0600 Subject: [PATCH 42/59] Updating table spec to use dataclasses --- .../tests/integration/test_spark_jobs.py | 2 +- .../commands/archive_table_in_delta.py | 27 +- .../management/commands/create_delta_table.py | 51 +- .../commands/load_query_to_delta.py | 601 +++++++----------- .../commands/load_table_from_delta.py | 28 +- .../commands/load_table_to_delta.py | 470 +++++++------- usaspending_api/etl/table_specs.py | 53 ++ .../etl/tests/data/delta_model_for_test.py | 36 +- .../integration/test_create_delta_table.py | 2 +- .../integration/test_load_to_from_delta.py | 19 +- usaspending_api/etl/tests/unit/test_spark.py | 17 - 11 files changed, 600 insertions(+), 706 deletions(-) create mode 100644 usaspending_api/etl/table_specs.py delete mode 100644 usaspending_api/etl/tests/unit/test_spark.py diff --git a/usaspending_api/common/tests/integration/test_spark_jobs.py b/usaspending_api/common/tests/integration/test_spark_jobs.py index 389d41371e..063de13a8a 100644 --- a/usaspending_api/common/tests/integration/test_spark_jobs.py +++ b/usaspending_api/common/tests/integration/test_spark_jobs.py @@ -5,7 +5,7 @@ def test_local_spark_jobs_strategy(spark, s3_unittest_data_bucket, hive_unittest_metastore_db): expected_table_name = "award_search" delta_table_spec = TABLE_SPEC[expected_table_name] - expected_db_name = delta_table_spec["destination_database"] + expected_db_name = delta_table_spec.destination_database spark_jobs = SparkJobs(LocalStrategy()) spark_jobs.start( diff --git a/usaspending_api/etl/management/commands/archive_table_in_delta.py b/usaspending_api/etl/management/commands/archive_table_in_delta.py index b26c5d242a..40c00206af 100644 --- a/usaspending_api/etl/management/commands/archive_table_in_delta.py +++ b/usaspending_api/etl/management/commands/archive_table_in_delta.py @@ -13,18 +13,21 @@ get_usas_jdbc_url, ) 
from usaspending_api.download.delta_models.download_job import download_job_create_sql_string +from usaspending_api.etl.table_specs import ArchiveTableSpec logger = logging.getLogger(__name__) TABLE_SPEC = { - "download_job": { - "destination_database": "arc", - "destination_table": "download_job", - "archive_date_field": "update_date", - "source_table": "download_job", - "source_database": "public", - "delta_table_create_sql": download_job_create_sql_string, - } + "download_job": ArchiveTableSpec( + **{ + "destination_database": "arc", + "destination_table": "download_job", + "archive_date_field": "update_date", + "source_table": "download_job", + "source_database": "public", + "delta_table_create_sql": download_job_create_sql_string, + } + ) } @@ -86,12 +89,12 @@ def handle(self, *args, **options): archive_period = options["archive_period"] table_spec = TABLE_SPEC[destination_table] - destination_database = options["alt_db"] or table_spec["destination_database"] + destination_database = options["alt_db"] or table_spec.destination_database destination_table_name = options["alt_name"] or destination_table - source_table = table_spec["source_table"] - source_database = table_spec["source_database"] + source_table = table_spec.source_table + source_database = table_spec.source_database qualified_source_table = f"{source_database}.{source_table}" - archive_date_field = table_spec["archive_date_field"] + archive_date_field = table_spec.archive_date_field archive_date = datetime.now() - timedelta(days=archive_period) archive_date_string = archive_date.strftime("%Y-%m-%d") diff --git a/usaspending_api/etl/management/commands/create_delta_table.py b/usaspending_api/etl/management/commands/create_delta_table.py index cbdfe84f40..991f2f9cec 100644 --- a/usaspending_api/etl/management/commands/create_delta_table.py +++ b/usaspending_api/etl/management/commands/create_delta_table.py @@ -10,23 +10,36 @@ ) from usaspending_api.common.spark.configs import DEFAULT_EXTRA_CONF from 
usaspending_api.config import CONFIG -from usaspending_api.etl.management.commands.archive_table_in_delta import TABLE_SPEC as ARCHIVE_TABLE_SPEC -from usaspending_api.etl.management.commands.load_query_to_delta import TABLE_SPEC as LOAD_QUERY_TABLE_SPEC -from usaspending_api.etl.management.commands.load_table_to_delta import TABLE_SPEC as LOAD_TABLE_TABLE_SPEC -from usaspending_api.transactions.delta_models.transaction_id_lookup import TRANSACTION_ID_LOOKUP_SCHEMA +from usaspending_api.etl.management.commands.archive_table_in_delta import ( + TABLE_SPEC as ARCHIVE_TABLE_SPEC, +) +from usaspending_api.etl.management.commands.load_query_to_delta import ( + TABLE_SPEC as LOAD_QUERY_TABLE_SPEC, +) +from usaspending_api.etl.management.commands.load_table_to_delta import ( + TABLE_SPEC as LOAD_TABLE_TABLE_SPEC, +) +from usaspending_api.etl.table_specs import TableSpec +from usaspending_api.transactions.delta_models.transaction_id_lookup import ( + TRANSACTION_ID_LOOKUP_SCHEMA, +) TABLE_SPEC = { **ARCHIVE_TABLE_SPEC, **LOAD_TABLE_TABLE_SPEC, **LOAD_QUERY_TABLE_SPEC, - "award_id_lookup": { - "destination_database": "int", - "delta_table_create_sql": AWARD_ID_LOOKUP_SCHEMA, - }, - "transaction_id_lookup": { - "destination_database": "int", - "delta_table_create_sql": TRANSACTION_ID_LOOKUP_SCHEMA, - }, + "award_id_lookup": TableSpec( + **{ + "destination_database": "int", + "delta_table_create_sql": AWARD_ID_LOOKUP_SCHEMA, + } + ), + "transaction_id_lookup": TableSpec( + **{ + "destination_database": "int", + "delta_table_create_sql": TRANSACTION_ID_LOOKUP_SCHEMA, + } + ), } logger = logging.getLogger(__name__) @@ -78,27 +91,27 @@ def handle(self, *args, **options): spark_s3_bucket = options["spark_s3_bucket"] table_spec = TABLE_SPEC[destination_table] - destination_database = options["alt_db"] or table_spec["destination_database"] + destination_database = options["alt_db"] or table_spec.destination_database destination_table_name = options["alt_name"] or destination_table 
# Set the database that will be interacted with for all Delta Lake table Spark-based activity logger.info(f"Using Spark Database: {destination_database}") spark.sql(f"create database if not exists {destination_database};") spark.sql(f"use {destination_database};") - if isinstance(table_spec["delta_table_create_sql"], str): + if isinstance(table_spec.delta_table_create_sql, str): # Define Schema Using CREATE TABLE AS command spark.sql( - TABLE_SPEC[destination_table]["delta_table_create_sql"].format( + table_spec.delta_table_create_sql.format( DESTINATION_TABLE=destination_table_name, DESTINATION_DATABASE=destination_database, SPARK_S3_BUCKET=spark_s3_bucket, DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, ) ) - elif isinstance(table_spec["delta_table_create_sql"], StructType): - schema = table_spec["delta_table_create_sql"] - additional_options = table_spec.get("delta_table_create_options") or {} - partition_cols = table_spec.get("delta_table_create_partitions") or [] + elif isinstance(table_spec.delta_table_create_sql, StructType): + schema = table_spec.delta_table_create_sql + additional_options = table_spec.delta_table_create_options or {} + partition_cols = table_spec.delta_table_create_partitions or [] df = spark.createDataFrame([], schema) default_options = { diff --git a/usaspending_api/etl/management/commands/load_query_to_delta.py b/usaspending_api/etl/management/commands/load_query_to_delta.py index aa58341cd9..d9ef19a811 100644 --- a/usaspending_api/etl/management/commands/load_query_to_delta.py +++ b/usaspending_api/etl/management/commands/load_query_to_delta.py @@ -36,6 +36,7 @@ object_class_program_activity_schema, ) from usaspending_api.download.delta_models.transaction_download import transaction_download_schema +from usaspending_api.etl.table_specs import QueryTableSpec from usaspending_api.recipient.delta_models import ( RECIPIENT_LOOKUP_POSTGRES_COLUMNS, RECIPIENT_PROFILE_POSTGRES_COLUMNS, @@ -89,360 +90,243 @@ logger = logging.getLogger(__name__) 
TABLE_SPEC = { - "award_search": { - "model": AwardSearch, - "is_from_broker": False, - "source_query": load_award_search, - "source_query_incremental": load_award_search_incremental, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": "award_search", - "swap_schema": "rpt", - "partition_column": "award_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": award_search_create_sql_string, - "delta_table_create_options": None, - "source_schema": AWARD_SEARCH_POSTGRES_COLUMNS, - "custom_schema": "recipient_hash STRING, federal_accounts STRING, cfdas ARRAY," - " tas_components ARRAY", - "column_names": list(AWARD_SEARCH_COLUMNS), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "award_search_gold": { - "model": AwardSearch, - "is_from_broker": False, - "source_query": load_award_search, - "source_query_incremental": load_award_search_incremental, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": "award_search", - "swap_schema": "rpt", - "partition_column": "award_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": award_search_create_sql_string, - "delta_table_create_options": None, - "source_schema": AWARD_SEARCH_POSTGRES_GOLD_COLUMNS, - "custom_schema": "recipient_hash STRING, federal_accounts STRING, cfdas ARRAY," - " tas_components ARRAY", - "column_names": list(AWARD_SEARCH_POSTGRES_GOLD_COLUMNS), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "recipient_lookup": { - "model": RecipientLookup, - "is_from_broker": False, - "source_query": recipient_lookup_load_sql_string_list, - "source_query_incremental": None, - "source_database": None, - "source_table": None, - "destination_database": 
"rpt", - "swap_table": "recipient_lookup", - "swap_schema": "rpt", - "partition_column": "recipient_hash", - "partition_column_type": "string", - "is_partition_column_unique": True, - "delta_table_create_sql": rpt_recipient_lookup_create_sql_string, - "delta_table_create_options": None, - "source_schema": RECIPIENT_LOOKUP_POSTGRES_COLUMNS, - "custom_schema": "recipient_hash STRING", - "column_names": list(RPT_RECIPIENT_LOOKUP_DELTA_COLUMNS), - "postgres_seq_name": "recipient_lookup_id_seq", - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "recipient_profile": { - "model": RecipientProfile, - "is_from_broker": False, - "source_query": recipient_profile_load_sql_strings, - "source_query_incremental": None, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": "recipient_profile", - "swap_schema": "rpt", - "partition_column": "recipient_hash", # This isn't used for anything - "partition_column_type": "string", - "is_partition_column_unique": False, - "delta_table_create_sql": recipient_profile_create_sql_string, - "delta_table_create_options": None, - "source_schema": RECIPIENT_PROFILE_POSTGRES_COLUMNS, - "custom_schema": "recipient_hash STRING", - "column_names": list(RPT_RECIPIENT_PROFILE_DELTA_COLUMNS), - "postgres_seq_name": "recipient_profile_id_seq", - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "summary_state_view": { - "model": SummaryStateView, - "is_from_broker": False, - "source_query": summary_state_view_load_sql_string, - "source_query_incremental": None, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": "summary_state_view", - "swap_schema": "rpt", - "partition_column": "duh", - "partition_column_type": "string", - "is_partition_column_unique": True, - "delta_table_create_sql": summary_state_view_create_sql_string, - "delta_table_create_options": 
None, - "source_schema": SUMMARY_STATE_VIEW_POSTGRES_COLUMNS, - "custom_schema": "duh STRING", - "column_names": list(SUMMARY_STATE_VIEW_COLUMNS), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "sam_recipient": { - "model": None, - "is_from_broker": True, - "source_query": sam_recipient_load_sql_string, - "source_query_incremental": None, - "source_database": None, - "source_table": None, - "destination_database": "int", - "swap_table": "duns", - "swap_schema": "int", - "partition_column": "broker_duns_id", - "partition_column_type": "string", - "is_partition_column_unique": True, - "delta_table_create_sql": sam_recipient_create_sql_string, - "delta_table_create_options": None, - "source_schema": SAM_RECIPIENT_POSTGRES_COLUMNS, - "custom_schema": None, - "column_names": list(SAM_RECIPIENT_COLUMNS), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "transaction_search": { - "model": TransactionSearch, - "is_from_broker": False, - "source_query": load_transaction_search, - "source_query_incremental": load_transaction_search_incremental, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": "transaction_search", - "swap_schema": "rpt", - "partition_column": "transaction_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": transaction_search_create_sql_string, - "delta_table_create_options": None, - "source_schema": TRANSACTION_SEARCH_POSTGRES_COLUMNS, - "custom_schema": "recipient_hash STRING, federal_accounts STRING, parent_recipient_hash STRING", - "column_names": list(TRANSACTION_SEARCH_POSTGRES_COLUMNS), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "transaction_search_gold": { - "model": TransactionSearch, - 
"is_from_broker": False, - "source_query": load_transaction_search, - "source_query_incremental": load_transaction_search_incremental, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": "transaction_search", - "swap_schema": "rpt", - "partition_column": "transaction_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": transaction_search_create_sql_string, - "delta_table_create_options": None, - "source_schema": TRANSACTION_SEARCH_POSTGRES_GOLD_COLUMNS, - "custom_schema": "recipient_hash STRING, federal_accounts STRING, parent_recipient_hash STRING", - "column_names": list(TRANSACTION_SEARCH_POSTGRES_GOLD_COLUMNS), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": { - "partition_keys": ["is_fpds"], - "partitioning_form": "LIST", - "partitions": [ - {"table_suffix": "_fpds", "partitioning_clause": "FOR VALUES IN (TRUE)"}, - {"table_suffix": "_fabs", "partitioning_clause": "FOR VALUES IN (FALSE)"}, - ], - }, - "delta_table_create_partitions": None, - }, - "transaction_current_cd_lookup": { - "model": None, - "is_from_broker": False, - "source_query": transaction_current_cd_lookup_load_sql_string, - "source_query_incremental": None, - "source_database": None, - "source_table": None, - "destination_database": "int", - "swap_table": "transaction_current_cd_lookup", - "swap_schema": "int", - "partition_column": "transaction_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": transaction_current_cd_lookup_create_sql_string, - "delta_table_create_options": None, - "source_schema": TRANSACTION_CURRENT_CD_LOOKUP_COLUMNS, - "custom_schema": "", - "column_names": list(TRANSACTION_CURRENT_CD_LOOKUP_COLUMNS), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "subaward_search": { - "model": SubawardSearch, - 
"is_from_broker": False, - "source_query": subaward_search_load_sql_string, - "source_query_incremental": None, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": "subaward_search", - "swap_schema": "rpt", - "partition_column": "broker_subaward_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": subaward_search_create_sql_string, - "delta_table_create_options": None, - "source_schema": SUBAWARD_SEARCH_POSTGRES_COLUMNS, - "custom_schema": "treasury_account_identifiers ARRAY", - "column_names": list(SUBAWARD_SEARCH_COLUMNS), - "postgres_seq_name": None, - "tsvectors": SUBAWARD_SEARCH_POSTGRES_VECTORS, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "covid_faba_spending": { - "model": CovidFABASpending, - "is_from_broker": False, - "source_query": covid_faba_spending_load_sql_strings, - "source_query_incremental": None, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": "covid_faba_spending", - "swap_schema": "rpt", - "partition_column": "id", - "partition_column_type": "numeric", - "is_partition_column_unique": False, - "delta_table_create_sql": covid_faba_spending_create_sql_string, - "delta_table_create_options": None, - "source_schema": COVID_FABA_SPENDING_POSTGRES_COLUMNS, - "custom_schema": None, - "column_names": list(COVID_FABA_SPENDING_DELTA_COLUMNS), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "account_balances_download": { - "model": None, - "is_from_broker": False, - "source_query": load_account_balances, - "source_query_incremental": load_account_balances_incremental, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": None, - "swap_schema": None, - "partition_column": "appropriation_account_balances_id", - "partition_column_type": 
"numeric", - "is_partition_column_unique": False, - "delta_table_create_sql": account_balances_schema, - "delta_table_create_options": {"delta.enableChangeDataFeed": True}, - "source_schema": None, - "custom_schema": None, - "column_names": list(), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": ["reporting_fiscal_year", "funding_toptier_agency_id"], - }, - "award_financial_download": { - "model": None, - "is_from_broker": False, - "source_query": load_award_financial, - "source_query_incremental": load_award_financial_incremental, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": None, - "swap_schema": None, - "partition_column": "financial_accounts_by_awards_id", - "partition_column_type": "numeric", - "is_partition_column_unique": False, - "delta_table_create_sql": award_financial_schema, - "delta_table_create_options": {"delta.enableChangeDataFeed": True}, - "source_schema": None, - "custom_schema": None, - "column_names": list(), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": ["reporting_fiscal_year", "funding_toptier_agency_id"], - }, - "object_class_program_activity_download": { - "model": None, - "is_from_broker": False, - "source_query": load_object_class_program_activity, - "source_query_incremental": load_object_class_program_activity_incremental, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": None, - "swap_schema": None, - "partition_column": "financial_accounts_by_program_activity_object_class_id", - "partition_column_type": "numeric", - "is_partition_column_unique": False, - "delta_table_create_sql": object_class_program_activity_schema, - "delta_table_create_options": {"delta.enableChangeDataFeed": True}, - "source_schema": None, - "custom_schema": None, - "column_names": list(), - "postgres_seq_name": 
None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": ["reporting_fiscal_year", "funding_toptier_agency_id"], - }, - "transaction_download": { - "model": None, - "is_from_broker": False, - "source_query": None, - "source_query_incremental": None, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": None, - "swap_schema": None, - "partition_column": "transaction_id", - "partition_column_type": "numeric", - "is_partition_column_unique": False, - "delta_table_create_sql": transaction_download_schema, - "delta_table_create_options": {"delta.enableChangeDataFeed": True}, - "source_schema": None, - "custom_schema": None, - "column_names": list(), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": ["awarding_agency_code", "is_fpds", "action_date_fiscal_year"], - }, + "award_search": QueryTableSpec( + **{ + "model": AwardSearch, + "source_query": load_award_search, + "source_query_incremental": load_award_search_incremental, + "destination_database": "rpt", + "swap_table": "award_search", + "swap_schema": "rpt", + "partition_column": "award_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": award_search_create_sql_string, + "source_schema": AWARD_SEARCH_POSTGRES_COLUMNS, + "custom_schema": "recipient_hash STRING, federal_accounts STRING, cfdas ARRAY," + " tas_components ARRAY", + "column_names": list(AWARD_SEARCH_COLUMNS), + } + ), + "award_search_gold": QueryTableSpec( + **{ + "model": AwardSearch, + "source_query": load_award_search, + "source_query_incremental": load_award_search_incremental, + "destination_database": "rpt", + "swap_table": "award_search", + "swap_schema": "rpt", + "partition_column": "award_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": award_search_create_sql_string, + 
"source_schema": AWARD_SEARCH_POSTGRES_GOLD_COLUMNS, + "custom_schema": "recipient_hash STRING, federal_accounts STRING, cfdas ARRAY," + " tas_components ARRAY", + "column_names": list(AWARD_SEARCH_POSTGRES_GOLD_COLUMNS), + } + ), + "recipient_lookup": QueryTableSpec( + **{ + "model": RecipientLookup, + "source_query": recipient_lookup_load_sql_string_list, + "destination_database": "rpt", + "swap_table": "recipient_lookup", + "swap_schema": "rpt", + "partition_column": "recipient_hash", + "partition_column_type": "string", + "is_partition_column_unique": True, + "delta_table_create_sql": rpt_recipient_lookup_create_sql_string, + "source_schema": RECIPIENT_LOOKUP_POSTGRES_COLUMNS, + "custom_schema": "recipient_hash STRING", + "column_names": list(RPT_RECIPIENT_LOOKUP_DELTA_COLUMNS), + "postgres_seq_name": "recipient_lookup_id_seq", + } + ), + "recipient_profile": QueryTableSpec( + **{ + "model": RecipientProfile, + "source_query": recipient_profile_load_sql_strings, + "destination_database": "rpt", + "swap_table": "recipient_profile", + "swap_schema": "rpt", + "partition_column": "recipient_hash", # This isn't used for anything + "partition_column_type": "string", + "delta_table_create_sql": recipient_profile_create_sql_string, + "source_schema": RECIPIENT_PROFILE_POSTGRES_COLUMNS, + "custom_schema": "recipient_hash STRING", + "column_names": list(RPT_RECIPIENT_PROFILE_DELTA_COLUMNS), + "postgres_seq_name": "recipient_profile_id_seq", + } + ), + "summary_state_view": QueryTableSpec( + **{ + "model": SummaryStateView, + "source_query": summary_state_view_load_sql_string, + "destination_database": "rpt", + "swap_table": "summary_state_view", + "swap_schema": "rpt", + "partition_column": "duh", + "partition_column_type": "string", + "is_partition_column_unique": True, + "delta_table_create_sql": summary_state_view_create_sql_string, + "source_schema": SUMMARY_STATE_VIEW_POSTGRES_COLUMNS, + "custom_schema": "duh STRING", + "column_names": 
list(SUMMARY_STATE_VIEW_COLUMNS), + } + ), + "sam_recipient": QueryTableSpec( + **{ + "is_from_broker": True, + "source_query": sam_recipient_load_sql_string, + "destination_database": "int", + "swap_table": "duns", + "swap_schema": "int", + "partition_column": "broker_duns_id", + "partition_column_type": "string", + "is_partition_column_unique": True, + "delta_table_create_sql": sam_recipient_create_sql_string, + "source_schema": SAM_RECIPIENT_POSTGRES_COLUMNS, + "column_names": list(SAM_RECIPIENT_COLUMNS), + } + ), + "transaction_search": QueryTableSpec( + **{ + "model": TransactionSearch, + "source_query": load_transaction_search, + "source_query_incremental": load_transaction_search_incremental, + "destination_database": "rpt", + "swap_table": "transaction_search", + "swap_schema": "rpt", + "partition_column": "transaction_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": transaction_search_create_sql_string, + "source_schema": TRANSACTION_SEARCH_POSTGRES_COLUMNS, + "custom_schema": "recipient_hash STRING, federal_accounts STRING, parent_recipient_hash STRING", + "column_names": list(TRANSACTION_SEARCH_POSTGRES_COLUMNS), + } + ), + "transaction_search_gold": QueryTableSpec( + **{ + "model": TransactionSearch, + "source_query": load_transaction_search, + "source_query_incremental": load_transaction_search_incremental, + "destination_database": "rpt", + "swap_table": "transaction_search", + "swap_schema": "rpt", + "partition_column": "transaction_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": transaction_search_create_sql_string, + "source_schema": TRANSACTION_SEARCH_POSTGRES_GOLD_COLUMNS, + "custom_schema": "recipient_hash STRING, federal_accounts STRING, parent_recipient_hash STRING", + "column_names": list(TRANSACTION_SEARCH_POSTGRES_GOLD_COLUMNS), + "postgres_partition_spec": { + "partition_keys": ["is_fpds"], + "partitioning_form": "LIST", 
+ "partitions": [ + {"table_suffix": "_fpds", "partitioning_clause": "FOR VALUES IN (TRUE)"}, + {"table_suffix": "_fabs", "partitioning_clause": "FOR VALUES IN (FALSE)"}, + ], + }, + } + ), + "transaction_current_cd_lookup": QueryTableSpec( + **{ + "source_query": transaction_current_cd_lookup_load_sql_string, + "destination_database": "int", + "swap_table": "transaction_current_cd_lookup", + "swap_schema": "int", + "partition_column": "transaction_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": transaction_current_cd_lookup_create_sql_string, + "source_schema": TRANSACTION_CURRENT_CD_LOOKUP_COLUMNS, + "column_names": list(TRANSACTION_CURRENT_CD_LOOKUP_COLUMNS), + } + ), + "subaward_search": QueryTableSpec( + **{ + "model": SubawardSearch, + "source_query": subaward_search_load_sql_string, + "destination_database": "rpt", + "swap_table": "subaward_search", + "swap_schema": "rpt", + "partition_column": "broker_subaward_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": subaward_search_create_sql_string, + "source_schema": SUBAWARD_SEARCH_POSTGRES_COLUMNS, + "custom_schema": "treasury_account_identifiers ARRAY", + "column_names": list(SUBAWARD_SEARCH_COLUMNS), + "tsvectors": SUBAWARD_SEARCH_POSTGRES_VECTORS, + } + ), + "covid_faba_spending": QueryTableSpec( + **{ + "model": CovidFABASpending, + "source_query": covid_faba_spending_load_sql_strings, + "destination_database": "rpt", + "swap_table": "covid_faba_spending", + "swap_schema": "rpt", + "partition_column": "id", + "partition_column_type": "numeric", + "delta_table_create_sql": covid_faba_spending_create_sql_string, + "source_schema": COVID_FABA_SPENDING_POSTGRES_COLUMNS, + "column_names": list(COVID_FABA_SPENDING_DELTA_COLUMNS), + } + ), + "account_balances_download": QueryTableSpec( + **{ + "source_query": load_account_balances, + "source_query_incremental": load_account_balances_incremental, 
+ "destination_database": "rpt", + "partition_column": "appropriation_account_balances_id", + "partition_column_type": "numeric", + "delta_table_create_sql": account_balances_schema, + "delta_table_create_options": {"delta.enableChangeDataFeed": True}, + "column_names": list(), + "delta_table_create_partitions": ["reporting_fiscal_year", "funding_toptier_agency_id"], + } + ), + "award_financial_download": QueryTableSpec( + **{ + "source_query": load_award_financial, + "source_query_incremental": load_award_financial_incremental, + "destination_database": "rpt", + "partition_column": "financial_accounts_by_awards_id", + "partition_column_type": "numeric", + "delta_table_create_sql": award_financial_schema, + "delta_table_create_options": {"delta.enableChangeDataFeed": True}, + "column_names": list(), + "delta_table_create_partitions": ["reporting_fiscal_year", "funding_toptier_agency_id"], + } + ), + "object_class_program_activity_download": QueryTableSpec( + **{ + "source_query": load_object_class_program_activity, + "source_query_incremental": load_object_class_program_activity_incremental, + "destination_database": "rpt", + "partition_column": "financial_accounts_by_program_activity_object_class_id", + "partition_column_type": "numeric", + "delta_table_create_sql": object_class_program_activity_schema, + "delta_table_create_options": {"delta.enableChangeDataFeed": True}, + "column_names": list(), + "delta_table_create_partitions": ["reporting_fiscal_year", "funding_toptier_agency_id"], + } + ), + "transaction_download": QueryTableSpec( + **{ + "destination_database": "rpt", + "partition_column": "transaction_id", + "partition_column_type": "numeric", + "delta_table_create_sql": transaction_download_schema, + "delta_table_create_options": {"delta.enableChangeDataFeed": True}, + "column_names": list(), + "delta_table_create_partitions": ["awarding_agency_code", "is_fpds", "action_date_fiscal_year"], + } + ), } @@ -506,10 +390,10 @@ def handle(self, *args, 
**options): # Resolve Parameters destination_table = options["destination_table"] table_spec = TABLE_SPEC[destination_table] - self.destination_database = options["alt_db"] or table_spec["destination_database"] + self.destination_database = options["alt_db"] or table_spec.destination_database self.destination_table_name = options["alt_name"] or destination_table.split(".")[-1] source_query_key = "source_query_incremental" if options["incremental"] else "source_query" - load_query = table_spec.get(source_query_key) + load_query = getattr(table_spec, source_query_key) if load_query is None: raise ArgumentTypeError(f"Invalid source query. `{source_query_key}` must be specified in the TABLE_SPEC.") @@ -517,11 +401,6 @@ def handle(self, *args, **options): logger.info(f"Using Spark Database: {self.destination_database}") self.spark.sql(f"use {self.destination_database};") - # Create User Defined Functions if needed - if table_spec.get("user_defined_functions"): - for udf_args in table_spec["user_defined_functions"]: - self.spark.udf.register(**udf_args) - create_ref_temp_views(self.spark, create_broker_views=True) if isinstance(load_query, list): diff --git a/usaspending_api/etl/management/commands/load_table_from_delta.py b/usaspending_api/etl/management/commands/load_table_from_delta.py index 79892c5dce..c0af9f0597 100644 --- a/usaspending_api/etl/management/commands/load_table_from_delta.py +++ b/usaspending_api/etl/management/commands/load_table_from_delta.py @@ -169,18 +169,18 @@ def handle(self, *args, **options): table_spec = TABLE_SPEC[delta_table] # Delta side - destination_database = options["alt_delta_db"] or table_spec["destination_database"] + destination_database = options["alt_delta_db"] or table_spec.destination_database delta_table_name = options["alt_delta_name"] or delta_table delta_table = f"{destination_database}.{delta_table_name}" if destination_database else delta_table_name # Postgres side - source postgres_table = None - postgres_model = 
table_spec["model"] - postgres_schema = table_spec["source_database"] or table_spec["swap_schema"] - postgres_table_name = table_spec["source_table"] or table_spec["swap_table"] - postgres_cols = table_spec["source_schema"] - column_names = table_spec.get("column_names") - tsvectors = table_spec.get("tsvectors") or {} + postgres_model = table_spec.model + postgres_schema = table_spec.source_database or table_spec.swap_schema + postgres_table_name = table_spec.source_table or table_spec.swap_table + postgres_cols = table_spec.source_schema + column_names = table_spec.column_names + tsvectors = table_spec.tsvectors or {} if postgres_table_name: postgres_table = f"{postgres_schema}.{postgres_table_name}" if postgres_schema else postgres_table_name @@ -222,7 +222,7 @@ def handle(self, *args, **options): temp_dest_table_exists = False make_new_table = not temp_dest_table_exists - is_postgres_table_partitioned = table_spec.get("postgres_partition_spec") is not None + is_postgres_table_partitioned = table_spec.postgres_partition_spec is not None if postgres_table or postgres_cols: # Recreate the table if it doesn't exist. 
Spark's df.write automatically does this but doesn't account for @@ -234,8 +234,8 @@ def handle(self, *args, **options): partitions_sql = [] if is_postgres_table_partitioned: partition_clause = ( - f"PARTITION BY {table_spec['postgres_partition_spec']['partitioning_form']}" - f"({', '.join(table_spec['postgres_partition_spec']['partition_keys'])})" + f"PARTITION BY {table_spec.postgres_partition_spec['partitioning_form']}" + f"({', '.join(table_spec.postgres_partition_spec['partition_keys'])})" ) storage_parameters = "" partitions_sql = [ @@ -246,7 +246,7 @@ def handle(self, *args, **options): f"PARTITION OF {temp_table} {pt['partitioning_clause']} " f"{storage_parameters}" ) - for pt in table_spec["postgres_partition_spec"]["partitions"] + for pt in table_spec.postgres_partition_spec["partitions"] ] if postgres_table: create_temp_sql = f""" @@ -317,8 +317,8 @@ def handle(self, *args, **options): logger.info(f"{temp_table} truncated.") # Reset the sequence before load for a table if it exists - if options["reset_sequence"] and table_spec.get("postgres_seq_name"): - postgres_seq_last_value = self._set_sequence_value(table_spec["postgres_seq_name"]) + if options["reset_sequence"] and table_spec.postgres_seq_name: + postgres_seq_last_value = self._set_sequence_value(table_spec.postgres_seq_name) else: postgres_seq_last_value = None @@ -359,7 +359,7 @@ def handle(self, *args, **options): logger.error( f"Command failed unexpectedly; resetting the sequence to previous value: {postgres_seq_last_value}" ) - self._set_sequence_value(table_spec["postgres_seq_name"], postgres_seq_last_value) + self._set_sequence_value(table_spec.postgres_seq_name, postgres_seq_last_value) raise Exception(exc) logger.info( diff --git a/usaspending_api/etl/management/commands/load_table_to_delta.py b/usaspending_api/etl/management/commands/load_table_to_delta.py index dd031c7115..b54ab9eb7c 100644 --- a/usaspending_api/etl/management/commands/load_table_to_delta.py +++ 
b/usaspending_api/etl/management/commands/load_table_to_delta.py @@ -10,8 +10,15 @@ BROKER_SUBAWARDS_COLUMNS, broker_subawards_sql_string, ) -from usaspending_api.broker.delta_models.broker_zips import ZIPS_COLUMNS, zips_sql_string -from usaspending_api.common.etl.spark import extract_db_data_frame, get_partition_bounds_sql, load_delta_table +from usaspending_api.broker.delta_models.broker_zips import ( + ZIPS_COLUMNS, + zips_sql_string, +) +from usaspending_api.common.etl.spark import ( + extract_db_data_frame, + get_partition_bounds_sql, + load_delta_table, +) from usaspending_api.common.helpers.spark_helpers import ( configure_spark_session, get_active_spark_session, @@ -20,6 +27,7 @@ get_broker_jdbc_url, ) from usaspending_api.config import CONFIG +from usaspending_api.etl.table_specs import TableSpec from usaspending_api.recipient.delta_models import ( RECIPIENT_LOOKUP_COLUMNS, recipient_lookup_create_sql_string, @@ -45,7 +53,10 @@ ) from usaspending_api.transactions.models import SourceAssistanceTransaction from usaspending_api.transactions.models import SourceProcurementTransaction -from usaspending_api.search.delta_models.award_search import award_search_create_sql_string, AWARD_SEARCH_COLUMNS +from usaspending_api.search.delta_models.award_search import ( + award_search_create_sql_string, + AWARD_SEARCH_COLUMNS, +) from usaspending_api.recipient.models import DUNS, RecipientLookup, RecipientProfile from usaspending_api.awards.models import ( @@ -58,250 +69,195 @@ logger = logging.getLogger(__name__) + TABLE_SPEC = { - "awards": { - "model": Award, - "is_from_broker": False, - "source_table": "vw_awards", - "source_database": "rpt", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": awards_sql_string, - "source_schema": None, - "custom_schema": "", - "column_names": list(AWARDS_COLUMNS), - 
"tsvectors": None, - }, - "detached_award_procurement": { - "model": SourceProcurementTransaction, - "is_from_broker": False, - "source_table": "source_procurement_transaction", - "source_database": "raw", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "detached_award_procurement_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": detached_award_procurement_create_sql_string, - "source_schema": None, - "custom_schema": "", - "column_names": list(DETACHED_AWARD_PROCUREMENT_DELTA_COLUMNS), - "tsvectors": None, - }, - "financial_accounts_by_awards": { - "model": FinancialAccountsByAwards, - "is_from_broker": False, - "source_table": "financial_accounts_by_awards", - "source_database": "public", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "financial_accounts_by_awards_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": financial_accounts_by_awards_sql_string, - "source_schema": None, - "custom_schema": "award_id LONG", - "column_names": list(FINANCIAL_ACCOUNTS_BY_AWARDS_COLUMNS), - "tsvectors": None, - }, - "transaction_fabs": { - "model": TransactionFABS, - "is_from_broker": False, - "source_table": "vw_transaction_fabs", - "source_database": "int", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "transaction_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": transaction_fabs_sql_string, - "source_schema": None, - "custom_schema": "", - "column_names": TRANSACTION_FABS_VIEW_COLUMNS, - "tsvectors": None, - }, - "published_fabs": { - "model": SourceAssistanceTransaction, - "is_from_broker": False, - "source_table": "source_assistance_transaction", - "source_database": "raw", - "destination_database": "raw", - "swap_table": None, - 
"swap_schema": None, - "partition_column": "published_fabs_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": published_fabs_create_sql_string, - "source_schema": None, - "custom_schema": "", - "column_names": list(PUBLISHED_FABS_COLUMNS), - "tsvectors": None, - }, - "transaction_fpds": { - "model": TransactionFPDS, - "is_from_broker": False, - "source_table": "vw_transaction_fpds", - "source_database": "int", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "transaction_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": transaction_fpds_sql_string, - "source_schema": None, - "custom_schema": "", - "column_names": TRANSACTION_FPDS_VIEW_COLUMNS, - "tsvectors": None, - }, - "transaction_normalized": { - "model": TransactionNormalized, - "is_from_broker": False, - "source_table": "vw_transaction_normalized", - "source_database": "int", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": transaction_normalized_sql_string, - "source_schema": None, - "custom_schema": "", - "column_names": list(TRANSACTION_NORMALIZED_COLUMNS), - "tsvectors": None, - }, + "awards": TableSpec( + **{ + "model": Award, + "source_table": "vw_awards", + "source_database": "rpt", + "destination_database": "raw", + "partition_column": "id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": awards_sql_string, + "column_names": list(AWARDS_COLUMNS), + } + ), + "detached_award_procurement": TableSpec( + **{ + "model": SourceProcurementTransaction, + "source_table": "source_procurement_transaction", + "source_database": "raw", + "destination_database": "raw", + "partition_column": "detached_award_procurement_id", + 
"partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": detached_award_procurement_create_sql_string, + "column_names": list(DETACHED_AWARD_PROCUREMENT_DELTA_COLUMNS), + } + ), + "financial_accounts_by_awards": TableSpec( + **{ + "model": FinancialAccountsByAwards, + "source_table": "financial_accounts_by_awards", + "source_database": "public", + "destination_database": "raw", + "partition_column": "financial_accounts_by_awards_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": financial_accounts_by_awards_sql_string, + "custom_schema": "award_id LONG", + "column_names": list(FINANCIAL_ACCOUNTS_BY_AWARDS_COLUMNS), + } + ), + "transaction_fabs": TableSpec( + **{ + "model": TransactionFABS, + "source_table": "vw_transaction_fabs", + "source_database": "int", + "destination_database": "raw", + "partition_column": "transaction_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": transaction_fabs_sql_string, + "column_names": TRANSACTION_FABS_VIEW_COLUMNS, + } + ), + "published_fabs": TableSpec( + **{ + "model": SourceAssistanceTransaction, + "source_table": "source_assistance_transaction", + "source_database": "raw", + "destination_database": "raw", + "partition_column": "published_fabs_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": published_fabs_create_sql_string, + "column_names": list(PUBLISHED_FABS_DELTA_COLUMNS), + } + ), + "transaction_fpds": TableSpec( + **{ + "model": TransactionFPDS, + "source_table": "vw_transaction_fpds", + "source_database": "int", + "destination_database": "raw", + "partition_column": "transaction_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": transaction_fpds_sql_string, + "custom_schema": "", + "column_names": TRANSACTION_FPDS_VIEW_COLUMNS, + } + ), + 
"transaction_normalized": TableSpec( + **{ + "model": TransactionNormalized, + "source_table": "vw_transaction_normalized", + "source_database": "int", + "destination_database": "raw", + "partition_column": "id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": transaction_normalized_sql_string, + "column_names": list(TRANSACTION_NORMALIZED_COLUMNS), + } + ), # Tables loaded in from the Broker - "subaward": { - "model": None, - "is_from_broker": True, - "source_table": "subaward", - "source_database": None, - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": broker_subawards_sql_string, - "source_schema": None, - "custom_schema": "", - "column_names": list(BROKER_SUBAWARDS_COLUMNS), - "tsvectors": None, - }, - "zips": { - "model": None, - "is_from_broker": True, - "source_table": "zips", - "source_database": None, - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "zips_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": zips_sql_string, - "source_schema": None, - "custom_schema": "", - "column_names": list(ZIPS_COLUMNS), - "tsvectors": None, - }, + "subaward": TableSpec( + **{ + "is_from_broker": True, + "source_table": "subaward", + "destination_database": "raw", + "partition_column": "id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": broker_subawards_sql_string, + "column_names": list(BROKER_SUBAWARDS_COLUMNS), + } + ), + "zips": TableSpec( + **{ + "is_from_broker": True, + "source_table": "zips", + "destination_database": "raw", + "partition_column": "zips_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": zips_sql_string, + 
"column_names": list(ZIPS_COLUMNS), + } + ), # Additional definitions for use in testing; # These are copies of Views / Materialized Views / Tables from Postgres to Spark to aid in # data comparison between current Postgres data and the data transformed via Spark. - "award_search_testing": { - "model": AwardSearch, - "is_from_broker": False, - "source_table": "award_search", - "source_database": None, - "destination_database": "rpt", - "swap_table": None, - "swap_schema": None, - "partition_column": "award_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": award_search_create_sql_string, - "source_schema": None, - "custom_schema": "total_covid_outlay NUMERIC(23,2), total_covid_obligation NUMERIC(23,2), recipient_hash " - "STRING, federal_accounts STRING, cfdas ARRAY, tas_components ARRAY", - "column_names": list(AWARD_SEARCH_COLUMNS), - "tsvectors": None, - }, - "recipient_lookup_testing": { - "model": RecipientLookup, - "is_from_broker": False, - "source_table": "recipient_lookup", - "source_database": "rpt", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": recipient_lookup_create_sql_string, - "source_schema": None, - "custom_schema": "recipient_hash STRING", - "column_names": list(RECIPIENT_LOOKUP_COLUMNS), - "tsvectors": None, - }, - "recipient_profile_testing": { - "model": RecipientProfile, - "is_from_broker": False, - "source_table": "recipient_profile", - "source_database": "rpt", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "id", - "partition_column_type": "numeric", - "delta_table_create_sql": recipient_profile_create_sql_string, - "is_partition_column_unique": True, - "source_schema": None, - "custom_schema": "recipient_hash STRING", - "column_names": 
list(RECIPIENT_PROFILE_DELTA_COLUMNS), - "tsvectors": None, - }, - "sam_recipient_testing": { - "model": DUNS, - "is_from_broker": False, - "source_table": "duns", - "source_database": "int", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": None, - "partition_column_type": None, - "is_partition_column_unique": False, - "delta_table_create_sql": sam_recipient_create_sql_string, - "source_schema": None, - "custom_schema": "broker_duns_id STRING, business_types_codes ARRAY", - "column_names": list(SAM_RECIPIENT_COLUMNS), - "tsvectors": None, - }, - "transaction_search_testing": { - "model": TransactionSearch, - "is_from_broker": False, - "source_table": "transaction_search", - "source_database": None, - "destination_database": "test", - "swap_table": None, - "swap_schema": None, - "partition_column": "transaction_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": transaction_search_create_sql_string, - "source_schema": None, - "custom_schema": "recipient_hash STRING, federal_accounts STRING, parent_recipient_hash STRING", - "column_names": list(TRANSACTION_SEARCH_POSTGRES_COLUMNS), - "tsvectors": None, - }, + "award_search_testing": TableSpec( + **{ + "model": AwardSearch, + "source_table": "award_search", + "destination_database": "rpt", + "partition_column": "award_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": award_search_create_sql_string, + "custom_schema": "total_covid_outlay NUMERIC(23,2), total_covid_obligation NUMERIC(23,2), recipient_hash " + "STRING, federal_accounts STRING, cfdas ARRAY, tas_components ARRAY", + "column_names": list(AWARD_SEARCH_COLUMNS), + } + ), + "recipient_lookup_testing": TableSpec( + **{ + "model": RecipientLookup, + "source_table": "recipient_lookup", + "source_database": "rpt", + "destination_database": "raw", + "partition_column": "id", + "partition_column_type": 
"numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": recipient_lookup_create_sql_string, + "custom_schema": "recipient_hash STRING", + "column_names": list(RECIPIENT_LOOKUP_COLUMNS), + } + ), + "recipient_profile_testing": TableSpec( + **{ + "model": RecipientProfile, + "source_table": "recipient_profile", + "source_database": "rpt", + "destination_database": "raw", + "partition_column": "id", + "partition_column_type": "numeric", + "delta_table_create_sql": recipient_profile_create_sql_string, + "is_partition_column_unique": True, + "custom_schema": "recipient_hash STRING", + "column_names": list(RECIPIENT_PROFILE_DELTA_COLUMNS), + } + ), + "sam_recipient_testing": TableSpec( + **{ + "model": DUNS, + "source_table": "duns", + "source_database": "int", + "destination_database": "raw", + "delta_table_create_sql": sam_recipient_create_sql_string, + "custom_schema": "broker_duns_id STRING, business_types_codes ARRAY", + "column_names": list(SAM_RECIPIENT_COLUMNS), + } + ), + "transaction_search_testing": TableSpec( + **{ + "model": TransactionSearch, + "source_table": "transaction_search", + "destination_database": "test", + "partition_column": "transaction_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": transaction_search_create_sql_string, + "custom_schema": "recipient_hash STRING, federal_accounts STRING, parent_recipient_hash STRING", + "column_names": list(TRANSACTION_SEARCH_POSTGRES_COLUMNS), + } + ), } SPARK_PARTITION_ROWS = CONFIG.SPARK_PARTITION_ROWS @@ -358,14 +314,14 @@ def handle(self, *args, **options): destination_table = options["destination_table"] table_spec = TABLE_SPEC[destination_table] - is_from_broker = table_spec["is_from_broker"] - destination_database = options["alt_db"] or table_spec["destination_database"] + is_from_broker = table_spec.is_from_broker + destination_database = options["alt_db"] or table_spec.destination_database destination_table_name = 
options["alt_name"] or destination_table - source_table = table_spec["source_table"] - partition_column = table_spec["partition_column"] - partition_column_type = table_spec["partition_column_type"] - is_partition_column_unique = table_spec["is_partition_column_unique"] - custom_schema = table_spec["custom_schema"] + source_table = table_spec.source_table + partition_column = table_spec.partition_column + partition_column_type = table_spec.partition_column_type + is_partition_column_unique = table_spec.is_partition_column_unique + custom_schema = table_spec.custom_schema # Set the database that will be interacted with for all Delta Lake table Spark-based activity logger.info(f"Using Spark Database: {destination_database}") @@ -374,9 +330,13 @@ def handle(self, *args, **options): # Resolve JDBC URL for Source Database jdbc_url = get_usas_jdbc_url() if not is_from_broker else get_broker_jdbc_url() if not jdbc_url: - raise RuntimeError(f"Couldn't find JDBC url, please properly configure your CONFIG.") + raise RuntimeError( + f"Couldn't find JDBC url, please properly configure your CONFIG." + ) if not jdbc_url.startswith("jdbc:postgresql://"): - raise ValueError("JDBC URL given is not in postgres JDBC URL format (e.g. jdbc:postgresql://...") + raise ValueError( + "JDBC URL given is not in postgres JDBC URL format (e.g. jdbc:postgresql://..." 
+ ) # If a partition_column is present, read from jdbc using partitioning if partition_column: @@ -387,7 +347,9 @@ def handle(self, *args, **options): is_numeric_partitioning_col = False is_date_partitioning_col = True else: - raise ValueError("partition_column_type should be either 'numeric' or 'date'") + raise ValueError( + "partition_column_type should be either 'numeric' or 'date'" + ) # Read from table or view df = extract_db_data_frame( @@ -417,8 +379,8 @@ def handle(self, *args, **options): # Make sure that the column order defined in the Delta table schema matches # that of the Spark dataframe used to pull from the Postgres table. While not # always needed, this should help to prevent any future mismatch between the two. - if table_spec.get("column_names"): - df = df.select(table_spec.get("column_names")) + if table_spec.column_names: + df = df.select(table_spec.column_names) # Write to S3 load_delta_table(spark, df, destination_table_name, True) diff --git a/usaspending_api/etl/table_specs.py b/usaspending_api/etl/table_specs.py new file mode 100644 index 0000000000..a291776115 --- /dev/null +++ b/usaspending_api/etl/table_specs.py @@ -0,0 +1,53 @@ +from dataclasses import dataclass +from typing import Literal, Any, Callable + +from django.db import models +from pyspark.sql import SparkSession +from pyspark.sql.types import StructType + + +@dataclass(kw_only=True) +class TableSpec: + destination_database: Literal["arc", "int", "raw", "rpt", "test"] + delta_table_create_sql: str | StructType + column_names: list[str] | None = None + model: models.Model | None = None + is_from_broker: bool = False + source_table: str | None = None + source_database: Literal["public", "int", "raw", "rpt"] | None = None + swap_table: str | None = None + swap_schema: str | None = None + partition_column: str | None = None + partition_column_type: Literal["date", "numeric"] | None = None + is_partition_column_unique: bool = False + source_schema: dict[str, str] | None = None + 
custom_schema: str | None = None + delta_table_create_options: dict[str, str | bool] | None = None + delta_table_create_partitions: list[str] | None = None + tsvectors: dict[str, list[str]] | None = None + + +@dataclass(kw_only=True) +class QueryTableSpec(TableSpec): + source_query: ( + str + | Callable[[SparkSession, str, str], None] + | list[str] + | list[Callable[[SparkSession, str, str], None]] + | None + ) = None + source_query_incremental: ( + str + | Callable[[SparkSession, str, str], None] + | list[str] + | list[Callable[[SparkSession, str, str], None]] + | None + ) = None + postgres_seq_name: str | None = None + postgres_partition_spec: dict[str, Any] | None = None + + +@dataclass(kw_only=True) +class ArchiveTableSpec(TableSpec): + destination_table: str + archive_date_field: str diff --git a/usaspending_api/etl/tests/data/delta_model_for_test.py b/usaspending_api/etl/tests/data/delta_model_for_test.py index e4d63eb538..15841ed1fd 100644 --- a/usaspending_api/etl/tests/data/delta_model_for_test.py +++ b/usaspending_api/etl/tests/data/delta_model_for_test.py @@ -2,9 +2,13 @@ from django.db import models +from usaspending_api.etl.table_specs import TableSpec + class TestModel(models.Model): - id = models.IntegerField(primary_key=True, help_text="surrogate primary key defined in Broker") + id = models.IntegerField( + primary_key=True, help_text="surrogate primary key defined in Broker" + ) test_timestamp = models.DateTimeField(null=True, blank=True) class Meta: @@ -26,21 +30,17 @@ class Meta: """ TEST_TABLE_SPEC = { - "test_table": { - "model": TestModel, - "is_from_broker": False, - "source_table": "test_table", - "source_database": "temp", - "destination_database": "temp", - "swap_table": None, - "swap_schema": None, - "partition_column": "id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": TEST_TABLE_DELTA, - "source_schema": None, - "custom_schema": "", - "column_names": ["id", "test_timestamp"], - 
"tsvectors": None, - } + "test_table": TableSpec( + **{ + "model": TestModel, + "source_table": "test_table", + "source_database": "temp", + "destination_database": "temp", + "partition_column": "id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": TEST_TABLE_DELTA, + "column_names": ["id", "test_timestamp"], + } + ) } diff --git a/usaspending_api/etl/tests/integration/test_create_delta_table.py b/usaspending_api/etl/tests/integration/test_create_delta_table.py index b36597868f..e7e4106f18 100644 --- a/usaspending_api/etl/tests/integration/test_create_delta_table.py +++ b/usaspending_api/etl/tests/integration/test_create_delta_table.py @@ -21,7 +21,7 @@ def _verify_delta_table_creation( delta_table_spec = TABLE_SPEC[delta_table_name] cmd_args = [f"--destination-table={delta_table_name}", f"--spark-s3-bucket={s3_bucket}"] - expected_db_name = delta_table_spec["destination_database"] + expected_db_name = delta_table_spec.destination_database if alt_db: cmd_args += [f"--alt-db={alt_db}"] expected_db_name = alt_db diff --git a/usaspending_api/etl/tests/integration/test_load_to_from_delta.py b/usaspending_api/etl/tests/integration/test_load_to_from_delta.py index dcd0b38ab8..c66b60fb18 100644 --- a/usaspending_api/etl/tests/integration/test_load_to_from_delta.py +++ b/usaspending_api/etl/tests/integration/test_load_to_from_delta.py @@ -225,11 +225,11 @@ def verify_delta_table_loaded_to_delta( # noqa: PLR0912 else: expected_table_name = delta_table_name.split(".")[-1] - partition_col = TABLE_SPEC[delta_table_name].get("partition_column") + partition_col = TABLE_SPEC[delta_table_name].partition_column if dummy_data is None: # get the postgres data to compare - model = TABLE_SPEC[delta_table_name]["model"] - is_from_broker = TABLE_SPEC[delta_table_name]["is_from_broker"] + model = TABLE_SPEC[delta_table_name].model + is_from_broker = TABLE_SPEC[delta_table_name].is_from_broker if delta_table_name == 
"summary_state_view": dummy_query = f"SELECT * from {expected_table_name}" if partition_col is not None: @@ -243,7 +243,7 @@ def verify_delta_table_loaded_to_delta( # noqa: PLR0912 elif is_from_broker: # model can be None if loading from the Broker broker_connection = connections[settings.BROKER_DB_ALIAS] - source_broker_name = TABLE_SPEC[delta_table_name]["source_table"] + source_broker_name = TABLE_SPEC[delta_table_name].source_table with broker_connection.cursor() as cursor: dummy_query = f"SELECT * from {source_broker_name}" if partition_col is not None: @@ -266,7 +266,7 @@ def verify_delta_table_loaded_to_delta( # noqa: PLR0912 assert equal_datasets( dummy_data, received_data, - TABLE_SPEC[delta_table_name]["custom_schema"], + TABLE_SPEC[delta_table_name].custom_schema, ignore_fields, ) @@ -304,9 +304,10 @@ def verify_delta_table_loaded_from_delta( call_command(load_command, *cmd_args) # get the postgres data to compare + source_table = ( - TABLE_SPEC[delta_table_name]["source_table"] - or TABLE_SPEC[delta_table_name]["swap_table"] + TABLE_SPEC[delta_table_name].source_table + or TABLE_SPEC[delta_table_name].swap_table ) temp_schema = "temp" if source_table: @@ -314,7 +315,7 @@ def verify_delta_table_loaded_from_delta( else: tmp_table_name = f"{temp_schema}.{expected_table_name}_temp" postgres_query = f"SELECT * FROM {tmp_table_name}" - partition_col = TABLE_SPEC[delta_table_name]["partition_column"] + partition_col = TABLE_SPEC[delta_table_name].partition_column if partition_col is not None: postgres_query = f"{postgres_query} ORDER BY {partition_col}" with psycopg2.connect(dsn=get_database_dsn_string()) as connection: @@ -331,7 +332,7 @@ def verify_delta_table_loaded_from_delta( assert equal_datasets( postgres_data, delta_data, - TABLE_SPEC[delta_table_name]["custom_schema"], + TABLE_SPEC[delta_table_name].custom_schema, ignore_fields=ignore_fields, ) diff --git a/usaspending_api/etl/tests/unit/test_spark.py b/usaspending_api/etl/tests/unit/test_spark.py 
deleted file mode 100644 index 3d1c6cf0a3..0000000000 --- a/usaspending_api/etl/tests/unit/test_spark.py +++ /dev/null @@ -1,17 +0,0 @@ -from usaspending_api.etl.management.commands.load_query_to_delta import TABLE_SPEC as LOAD_QUERY_TABLE_SPEC -from usaspending_api.etl.management.commands.load_table_to_delta import TABLE_SPEC as LOAD_TABLE_TABLE_SPEC - - -def test_table_spec_consistency(): - table_spec_config_groups = { - "LOAD_QUERY_TABLE_SPEC": LOAD_QUERY_TABLE_SPEC, - "LOAD_TABLE_TABLE_SPEC": LOAD_TABLE_TABLE_SPEC, - } - for table_spec_group_name, table_spec_config_group in table_spec_config_groups.items(): - unioned_table_spec_keys = set() - for table_name, config in table_spec_config_group.items(): - unioned_table_spec_keys = unioned_table_spec_keys.union(set(list(config.keys()))) - for table_name, config in table_spec_config_group.items(): - diff = unioned_table_spec_keys - set(list(config.keys())) - if diff: - raise Exception(f"{table_name} is missing the following {table_spec_group_name} values: {diff}") From be94f1307a6f770e0bcde35a2a1a617b3f238756 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Thu, 5 Feb 2026 15:25:24 -0600 Subject: [PATCH 43/59] Addressing ruff issues --- .../commands/archive_table_in_delta.py | 40 ++-- .../management/commands/create_delta_table.py | 6 +- .../commands/load_query_to_delta.py | 90 ++++++--- .../commands/load_table_from_delta.py | 191 +++++++++++++----- .../commands/load_table_to_delta.py | 66 +++--- usaspending_api/etl/table_specs.py | 2 +- 6 files changed, 266 insertions(+), 129 deletions(-) diff --git a/usaspending_api/etl/management/commands/archive_table_in_delta.py b/usaspending_api/etl/management/commands/archive_table_in_delta.py index 40c00206af..f80b8a6c44 100644 --- a/usaspending_api/etl/management/commands/archive_table_in_delta.py +++ b/usaspending_api/etl/management/commands/archive_table_in_delta.py @@ -1,10 +1,9 @@ import logging -import psycopg2 - from datetime import datetime, timedelta -from 
django.core.management.base import BaseCommand -from usaspending_api.common.helpers.sql_helpers import get_database_dsn_string +import psycopg2 +from django.core.management.base import BaseCommand, CommandParser + from usaspending_api.common.etl.spark import load_delta_table from usaspending_api.common.helpers.spark_helpers import ( configure_spark_session, @@ -12,7 +11,10 @@ get_jdbc_connection_properties, get_usas_jdbc_url, ) -from usaspending_api.download.delta_models.download_job import download_job_create_sql_string +from usaspending_api.common.helpers.sql_helpers import get_database_dsn_string +from usaspending_api.download.delta_models.download_job import ( + download_job_create_sql_string, +) from usaspending_api.etl.table_specs import ArchiveTableSpec logger = logging.getLogger(__name__) @@ -38,7 +40,8 @@ class Command(BaseCommand): those records from Postgres. """ - def add_arguments(self, parser): + @staticmethod + def add_arguments(parser: CommandParser) -> None: parser.add_argument( "--destination-table", type=str, @@ -57,7 +60,8 @@ def add_arguments(self, parser): "--alt-db", type=str, required=False, - help="An alternate Delta Database (aka schema) in which to archive this table, overriding the TABLE_SPEC's destination_database", + help="An alternate Delta Database (aka schema) in which to archive this table, overriding the TABLE_SPEC's" + " destination_database", ) parser.add_argument( "--alt-name", @@ -66,7 +70,7 @@ def add_arguments(self, parser): help="An alternate Delta Table name which to archive this table, overriding the destination_table", ) - def handle(self, *args, **options): + def handle(self, *args, **options) -> None: extra_conf = { # Config for Delta Lake tables and SQL. 
Need these to keep Dela table metadata in the metastore "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension", @@ -107,14 +111,16 @@ def handle(self, *args, **options): # Resolve JDBC URL for Source Database jdbc_url = get_usas_jdbc_url() if not jdbc_url: - raise RuntimeError(f"Couldn't find JDBC url, please properly configure your CONFIG.") + raise RuntimeError( + "Couldn't find JDBC url, please properly configure your CONFIG." + ) if not jdbc_url.startswith("jdbc:postgresql://"): - raise ValueError("JDBC URL given is not in postgres JDBC URL format (e.g. jdbc:postgresql://...") + raise ValueError( + "JDBC URL given is not in postgres JDBC URL format (e.g. jdbc:postgresql://..." + ) # Retrieve data from Postgres - query_with_predicate = ( - f"(SELECT * FROM {qualified_source_table} WHERE {archive_date_field} < '{archive_date_string}') AS tmp" - ) + query_with_predicate = f"(SELECT * FROM {qualified_source_table} WHERE {archive_date_field} < '{archive_date_string}') AS tmp" df = spark.read.jdbc( url=jdbc_url, @@ -125,7 +131,9 @@ def handle(self, *args, **options): # Write data to Delta Lake in Append Mode load_delta_table(spark, df, destination_table_name, overwrite=False) archived_count = df.count() - logger.info(f"Archived {archived_count} records from the {qualified_source_table}") + logger.info( + f"Archived {archived_count} records from the {qualified_source_table}" + ) # Delete data from with psycopg2.connect(dsn=get_database_dsn_string()) as connection: @@ -135,7 +143,9 @@ def handle(self, *args, **options): ) deleted_count = cursor.rowcount - logger.info(f"Deleted {deleted_count} records from the {qualified_source_table} table") + logger.info( + f"Deleted {deleted_count} records from the {qualified_source_table} table" + ) # Shut down spark if spark_created_by_command: diff --git a/usaspending_api/etl/management/commands/create_delta_table.py b/usaspending_api/etl/management/commands/create_delta_table.py index 991f2f9cec..19978107dc 100644 --- 
a/usaspending_api/etl/management/commands/create_delta_table.py +++ b/usaspending_api/etl/management/commands/create_delta_table.py @@ -1,6 +1,6 @@ import logging -from django.core.management.base import BaseCommand +from django.core.management.base import BaseCommand, CommandParser from pyspark.sql.types import StructType from usaspending_api.awards.delta_models.award_id_lookup import AWARD_ID_LOOKUP_SCHEMA @@ -50,7 +50,7 @@ class Command(BaseCommand): This command creates an empty Delta Table based on the provided --destination-table argument. """ - def add_arguments(self, parser): + def add_arguments(self, parser: CommandParser) -> None: parser.add_argument( "--destination-table", type=str, @@ -79,7 +79,7 @@ def add_arguments(self, parser): "name", ) - def handle(self, *args, **options): + def handle(self, *args, **options) -> None: spark = get_active_spark_session() spark_created_by_command = False if not spark: diff --git a/usaspending_api/etl/management/commands/load_query_to_delta.py b/usaspending_api/etl/management/commands/load_query_to_delta.py index d9ef19a811..ff609a79e2 100644 --- a/usaspending_api/etl/management/commands/load_query_to_delta.py +++ b/usaspending_api/etl/management/commands/load_query_to_delta.py @@ -2,7 +2,7 @@ from argparse import ArgumentTypeError from typing import Callable -from django.core.management.base import BaseCommand +from django.core.management.base import BaseCommand, CommandParser from pyspark.sql import SparkSession from usaspending_api.common.etl.spark import create_ref_temp_views @@ -35,7 +35,9 @@ load_object_class_program_activity_incremental, object_class_program_activity_schema, ) -from usaspending_api.download.delta_models.transaction_download import transaction_download_schema +from usaspending_api.download.delta_models.transaction_download import ( + transaction_download_schema, +) from usaspending_api.etl.table_specs import QueryTableSpec from usaspending_api.recipient.delta_models import ( 
RECIPIENT_LOOKUP_POSTGRES_COLUMNS, @@ -58,7 +60,10 @@ AWARD_SEARCH_POSTGRES_GOLD_COLUMNS, award_search_create_sql_string, ) -from usaspending_api.search.delta_models.dataframes.award_search import load_award_search, load_award_search_incremental +from usaspending_api.search.delta_models.dataframes.award_search import ( + load_award_search, + load_award_search_incremental, +) from usaspending_api.search.delta_models.dataframes.transaction_search import ( load_transaction_search, load_transaction_search_incremental, @@ -70,7 +75,12 @@ subaward_search_create_sql_string, subaward_search_load_sql_string, ) -from usaspending_api.search.models import AwardSearch, SubawardSearch, SummaryStateView, TransactionSearch +from usaspending_api.search.models import ( + AwardSearch, + SubawardSearch, + SummaryStateView, + TransactionSearch, +) from usaspending_api.settings import HOST from usaspending_api.transactions.delta_models import ( SUMMARY_STATE_VIEW_COLUMNS, @@ -226,8 +236,14 @@ "partition_keys": ["is_fpds"], "partitioning_form": "LIST", "partitions": [ - {"table_suffix": "_fpds", "partitioning_clause": "FOR VALUES IN (TRUE)"}, - {"table_suffix": "_fabs", "partitioning_clause": "FOR VALUES IN (FALSE)"}, + { + "table_suffix": "_fpds", + "partitioning_clause": "FOR VALUES IN (TRUE)", + }, + { + "table_suffix": "_fabs", + "partitioning_clause": "FOR VALUES IN (FALSE)", + }, ], }, } @@ -286,8 +302,11 @@ "partition_column_type": "numeric", "delta_table_create_sql": account_balances_schema, "delta_table_create_options": {"delta.enableChangeDataFeed": True}, - "column_names": list(), - "delta_table_create_partitions": ["reporting_fiscal_year", "funding_toptier_agency_id"], + "column_names": [], + "delta_table_create_partitions": [ + "reporting_fiscal_year", + "funding_toptier_agency_id", + ], } ), "award_financial_download": QueryTableSpec( @@ -299,8 +318,11 @@ "partition_column_type": "numeric", "delta_table_create_sql": award_financial_schema, "delta_table_create_options": 
{"delta.enableChangeDataFeed": True}, - "column_names": list(), - "delta_table_create_partitions": ["reporting_fiscal_year", "funding_toptier_agency_id"], + "column_names": [], + "delta_table_create_partitions": [ + "reporting_fiscal_year", + "funding_toptier_agency_id", + ], } ), "object_class_program_activity_download": QueryTableSpec( @@ -312,8 +334,11 @@ "partition_column_type": "numeric", "delta_table_create_sql": object_class_program_activity_schema, "delta_table_create_options": {"delta.enableChangeDataFeed": True}, - "column_names": list(), - "delta_table_create_partitions": ["reporting_fiscal_year", "funding_toptier_agency_id"], + "column_names": [], + "delta_table_create_partitions": [ + "reporting_fiscal_year", + "funding_toptier_agency_id", + ], } ), "transaction_download": QueryTableSpec( @@ -323,8 +348,12 @@ "partition_column_type": "numeric", "delta_table_create_sql": transaction_download_schema, "delta_table_create_options": {"delta.enableChangeDataFeed": True}, - "column_names": list(), - "delta_table_create_partitions": ["awarding_agency_code", "is_fpds", "action_date_fiscal_year"], + "column_names": [], + "delta_table_create_partitions": [ + "awarding_agency_code", + "is_fpds", + "action_date_fiscal_year", + ], } ), } @@ -342,7 +371,8 @@ class Command(BaseCommand): destination_table_name: str spark: SparkSession - def add_arguments(self, parser): + @staticmethod + def add_arguments(parser: CommandParser) -> None: parser.add_argument( "--destination-table", type=str, @@ -370,7 +400,7 @@ def add_arguments(self, parser): help="Whether or not the table will be updated incrementally", ) - def handle(self, *args, **options): + def handle(self, *args, **options) -> None: extra_conf = { # Config for Delta Lake tables and SQL. 
Need these to keep Dela table metadata in the metastore "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension", @@ -385,17 +415,25 @@ def handle(self, *args, **options): spark_created_by_command = False if not self.spark: spark_created_by_command = True - self.spark = configure_spark_session(**extra_conf, spark_context=self.spark) # type: SparkSession + self.spark = configure_spark_session( + **extra_conf, spark_context=self.spark + ) # type: SparkSession # Resolve Parameters destination_table = options["destination_table"] table_spec = TABLE_SPEC[destination_table] self.destination_database = options["alt_db"] or table_spec.destination_database - self.destination_table_name = options["alt_name"] or destination_table.split(".")[-1] - source_query_key = "source_query_incremental" if options["incremental"] else "source_query" + self.destination_table_name = ( + options["alt_name"] or destination_table.split(".")[-1] + ) + source_query_key = ( + "source_query_incremental" if options["incremental"] else "source_query" + ) load_query = getattr(table_spec, source_query_key) if load_query is None: - raise ArgumentTypeError(f"Invalid source query. `{source_query_key}` must be specified in the TABLE_SPEC.") + raise ArgumentTypeError( + f"Invalid source query. `{source_query_key}` must be specified in the TABLE_SPEC." 
+ ) # Set the database that will be interacted with for all Delta Lake table Spark-based activity logger.info(f"Using Spark Database: {self.destination_database}") @@ -405,7 +443,9 @@ def handle(self, *args, **options): if isinstance(load_query, list): for index, query in enumerate(load_query): - logger.info(f"Running query number: {index + 1}\nPreview of query: {query[:100]}") + logger.info( + f"Running query number: {index + 1}\nPreview of query: {query[:100]}" + ) self.run_spark_sql(query) else: self.run_spark_sql(load_query) @@ -413,7 +453,9 @@ def handle(self, *args, **options): if spark_created_by_command: self.spark.stop() - def run_spark_sql(self, query: str | Callable[[SparkSession, str, str], None]): + def run_spark_sql( + self, query: str | Callable[[SparkSession, str, str], None] + ) -> None: if isinstance(query, str): jdbc_conn_props = get_jdbc_connection_properties() self.spark.sql( @@ -430,4 +472,6 @@ def run_spark_sql(self, query: str | Callable[[SparkSession, str, str], None]): elif isinstance(query, Callable): query(self.spark, self.destination_database, self.destination_table_name) else: - raise ArgumentTypeError(f"Invalid query. `{query}` must be a string or a Callable.") + raise ArgumentTypeError( + f"Invalid query. `{query}` must be a string or a Callable." 
+ ) diff --git a/usaspending_api/etl/management/commands/load_table_from_delta.py b/usaspending_api/etl/management/commands/load_table_from_delta.py index c0af9f0597..503d33754c 100644 --- a/usaspending_api/etl/management/commands/load_table_from_delta.py +++ b/usaspending_api/etl/management/commands/load_table_from_delta.py @@ -1,40 +1,45 @@ import itertools import logging +from datetime import datetime +from math import ceil +from typing import Dict, List, Optional, override import boto3 import numpy as np import psycopg2 - from django import db +from django.core.management import CommandParser from django.core.management.base import BaseCommand from django.db.models import Model -from math import ceil -from pyspark.sql import SparkSession, DataFrame -from typing import Dict, Optional, List -from datetime import datetime +from pyspark.sql import Column, DataFrame, SparkSession from usaspending_api.common.csv_stream_s3_to_pg import copy_csvs_from_s3_to_pg from usaspending_api.common.etl.spark import convert_array_cols_to_string -from usaspending_api.common.helpers.sql_helpers import get_database_dsn_string from usaspending_api.common.helpers.spark_helpers import ( configure_spark_session, get_active_spark_session, get_jdbc_connection_properties, get_usas_jdbc_url, ) +from usaspending_api.common.helpers.sql_helpers import get_database_dsn_string from usaspending_api.config import CONFIG -from usaspending_api.settings import DEFAULT_TEXT_SEARCH_CONFIG - from usaspending_api.etl.management.commands.create_delta_table import TABLE_SPEC +from usaspending_api.settings import DEFAULT_TEXT_SEARCH_CONFIG logger = logging.getLogger(__name__) # Note: the `delta` type is not actually in Spark SQL. It's how we're temporarily storing the data before converting it # to the proper postgres type, since pySpark doesn't automatically support this conversion. 
SPECIAL_TYPES_MAPPING = { - db.models.UUIDField: {"postgres": "UUID USING {column_name}::UUID", "delta": "TEXT"}, + db.models.UUIDField: { + "postgres": "UUID USING {column_name}::UUID", + "delta": "TEXT", + }, "UUID": {"postgres": "UUID USING {column_name}::UUID", "delta": "TEXT"}, - db.models.JSONField: {"postgres": "JSONB using {column_name}::JSON", "delta": "TEXT"}, + db.models.JSONField: { + "postgres": "JSONB using {column_name}::JSON", + "delta": "TEXT", + }, "JSONB": {"postgres": "JSONB using {column_name}::JSON", "delta": "TEXT"}, } @@ -46,7 +51,6 @@ class Command(BaseCommand): - help = """ This command reads data from a Delta table and copies it into a corresponding Postgres database table (under a temp name). As of now, it only supports a full reload of a table. If the table with the chosen temp name already @@ -55,7 +59,7 @@ class Command(BaseCommand): if a new table has been made. """ - def add_arguments(self, parser): + def add_arguments(self, parser: CommandParser) -> None: parser.add_argument( "--delta-table", type=str, @@ -73,7 +77,7 @@ def add_arguments(self, parser): "--alt-delta-name", type=str, required=False, - help="An alternate delta table name to load, overriding the TABLE_SPEC destination_table" "name", + help="An alternate delta table name to load, overriding the TABLE_SPEC destination_tablename", ) parser.add_argument( "--jdbc-inserts", @@ -111,7 +115,8 @@ def add_arguments(self, parser): "If the job fails for some unexpected reason then the sequence will be reset to the previous value.", ) - def _split_dfs(self, df, special_columns): + @staticmethod + def _split_dfs(df: DataFrame, special_columns: str | Column) -> [DataFrame]: """Split a DataFrame into DataFrame subsets based on presence of NULL values in certain special columns Unfortunately, pySpark with the JDBC doesn't handle UUIDs/JSON well. 
@@ -129,13 +134,18 @@ def _split_dfs(self, df, special_columns): # Figure all the possible combos of filters filter_batches = [] for subset in itertools.product([True, False], repeat=len(special_columns)): - filter_batches.append({col: subset[i] for i, col in enumerate(special_columns)}) + filter_batches.append( + {col: subset[i] for i, col in enumerate(special_columns)} + ) # Generate all the split dfs based on the filter batches split_dfs = [] for filter_batch in filter_batches: # Apply the filters (True = null column, drop it. False = not null column, keep it) - modified_filters = [df[col].isNull() if val else df[col].isNotNull() for col, val in filter_batch.items()] + modified_filters = [ + df[col].isNull() if val else df[col].isNotNull() + for col, val in filter_batch.items() + ] split_df = df.filter(np.bitwise_and.reduce(modified_filters)) # Drop the columns where it's null **after filtering them out** @@ -145,7 +155,7 @@ def _split_dfs(self, df, special_columns): split_dfs.append(split_df) return split_dfs - def handle(self, *args, **options): + def handle(self, *args, **options) -> None: extra_conf = { # Config for Delta Lake tables and SQL. 
Need these to keep Dela table metadata in the metastore "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension", @@ -160,7 +170,9 @@ def handle(self, *args, **options): spark_created_by_command = False if not spark: spark_created_by_command = True - spark = configure_spark_session(**extra_conf, spark_context=spark) # type: SparkSession + spark = configure_spark_session( + **extra_conf, spark_context=spark + ) # type: SparkSession # Resolve Parameters delta_table = options["delta_table"] @@ -169,9 +181,15 @@ def handle(self, *args, **options): table_spec = TABLE_SPEC[delta_table] # Delta side - destination_database = options["alt_delta_db"] or table_spec.destination_database + destination_database = ( + options["alt_delta_db"] or table_spec.destination_database + ) delta_table_name = options["alt_delta_name"] or delta_table - delta_table = f"{destination_database}.{delta_table_name}" if destination_database else delta_table_name + delta_table = ( + f"{destination_database}.{delta_table_name}" + if destination_database + else delta_table_name + ) # Postgres side - source postgres_table = None @@ -182,19 +200,27 @@ def handle(self, *args, **options): column_names = table_spec.column_names tsvectors = table_spec.tsvectors or {} if postgres_table_name: - postgres_table = f"{postgres_schema}.{postgres_table_name}" if postgres_schema else postgres_table_name + postgres_table = ( + f"{postgres_schema}.{postgres_table_name}" + if postgres_schema + else postgres_table_name + ) # Postgres side - temp temp_schema = "temp" temp_table_suffix = "temp" - temp_table_suffix_appendage = f"_{temp_table_suffix}" if {temp_table_suffix} else "" + temp_table_suffix_appendage = ( + f"_{temp_table_suffix}" if {temp_table_suffix} else "" + ) if postgres_table: temp_table_name = f"{postgres_table_name}{temp_table_suffix_appendage}" else: temp_table_name = f"{delta_table_name}{temp_table_suffix_appendage}" temp_table = f"{temp_schema}.{temp_table_name}" - summary_msg = f"Copying 
delta table {delta_table} to a Postgres temp table {temp_table}." + summary_msg = ( + f"Copying delta table {delta_table} to a Postgres temp table {temp_table}." + ) if postgres_table: summary_msg = f"{summary_msg} The temp table will be based on the postgres table {postgres_table}" logger.info(summary_msg) @@ -213,7 +239,9 @@ def handle(self, *args, **options): # If it does, and we're recreating it, drop it first if temp_dest_table_exists and recreate: - logger.info(f"{temp_table} exists and recreate argument provided. Dropping first.") + logger.info( + f"{temp_table} exists and recreate argument provided. Dropping first." + ) # If the schema has changed and we need to do a complete reload, just drop the table and rebuild it clear_table_sql = f"DROP TABLE {temp_table}" with db.connection.cursor() as cursor: @@ -242,7 +270,8 @@ def handle(self, *args, **options): ( f"CREATE TABLE " # Below: e.g. my_tbl_temp -> my_tbl_part_temp - f"{temp_table[:-len(temp_table_suffix_appendage)]}{pt['table_suffix']}{temp_table_suffix_appendage} " + f"{temp_table[: -len(temp_table_suffix_appendage)]}" + f"{pt['table_suffix']}{temp_table_suffix_appendage} " f"PARTITION OF {temp_table} {pt['partitioning_clause']} " f"{storage_parameters}" ) @@ -257,7 +286,7 @@ def handle(self, *args, **options): elif postgres_cols: create_temp_sql = f""" CREATE TABLE {temp_table} ( - {", ".join([f'{key} {val}' for key, val in postgres_cols.items()])} + {", ".join([f"{key} {val}" for key, val in postgres_cols.items()])} ) {partition_clause} {storage_parameters} """ else: @@ -272,7 +301,9 @@ def handle(self, *args, **options): if is_postgres_table_partitioned and partitions_sql: for create_partition in partitions_sql: - logger.info(f"Creating partition of {temp_table} with SQL:\n{create_partition}") + logger.info( + f"Creating partition of {temp_table} with SQL:\n{create_partition}" + ) cursor.execute(create_partition) logger.info("Partition created.") @@ -284,7 +315,9 @@ def handle(self, *args, 
**options): f"To prevent any confusion or duplicates, dropping the trigger" f" tsvector_update_{tsvector_name} if it exists before potentially recreating it." ) - cursor.execute(f"DROP TRIGGER IF EXISTS tsvector_update_{tsvector_name} ON {temp_table}") + cursor.execute( + f"DROP TRIGGER IF EXISTS tsvector_update_{tsvector_name} ON {temp_table}" + ) logger.info( f"Adding tsvector trigger for column {tsvector_name}" @@ -298,7 +331,9 @@ def handle(self, *args, **options): {derived_from_cols_str}) """ cursor.execute(tsvector_trigger_sql) - logger.info(f"tsvector trigger for column {tsvector_name} added.") + logger.info( + f"tsvector trigger for column {tsvector_name} added." + ) # Read from Delta df = spark.table(delta_table) @@ -318,7 +353,9 @@ def handle(self, *args, **options): # Reset the sequence before load for a table if it exists if options["reset_sequence"] and table_spec.postgres_seq_name: - postgres_seq_last_value = self._set_sequence_value(table_spec.postgres_seq_name) + postgres_seq_last_value = self._set_sequence_value( + table_spec.postgres_seq_name + ) else: postgres_seq_last_value = None @@ -326,7 +363,7 @@ def handle(self, *args, **options): use_jdbc_inserts = options["jdbc_inserts"] strategy = "JDBC INSERTs" if use_jdbc_inserts else "SQL bulk COPY CSV" logger.info( - f"LOAD (START): Loading data from Delta table {delta_table} to {temp_table} using {strategy} " f"strategy" + f"LOAD (START): Loading data from Delta table {delta_table} to {temp_table} using {strategy} strategy" ) try: @@ -342,7 +379,9 @@ def handle(self, *args, **options): ) else: if not column_names: - raise RuntimeError("column_names None or empty, but are required to map CSV cols to table cols") + raise RuntimeError( + "column_names None or empty, but are required to map CSV cols to table cols" + ) spark_s3_bucket_name = options["spark_s3_bucket"] self._write_with_sql_bulk_copy_csv( spark, @@ -359,11 +398,13 @@ def handle(self, *args, **options): logger.error( f"Command failed 
unexpectedly; resetting the sequence to previous value: {postgres_seq_last_value}" ) - self._set_sequence_value(table_spec.postgres_seq_name, postgres_seq_last_value) - raise Exception(exc) + self._set_sequence_value( + table_spec.postgres_seq_name, postgres_seq_last_value + ) + raise exc logger.info( - f"LOAD (FINISH): Loaded data from Delta table {delta_table} to {temp_table} using {strategy} " f"strategy" + f"LOAD (FINISH): Loaded data from Delta table {delta_table} to {temp_table} using {strategy} strategy" ) # We're done with spark at this point @@ -391,9 +432,12 @@ def _set_sequence_value(self, seq_name: str, val: Optional[int] = None) -> int: with db.connection.cursor() as cursor: cursor.execute(f"SELECT last_value FROM {seq_name}") last_value = cursor.fetchone()[0] - cursor.execute(f"ALTER SEQUENCE IF EXISTS {seq_name} RESTART WITH {new_seq_val}") + cursor.execute( + f"ALTER SEQUENCE IF EXISTS {seq_name} RESTART WITH {new_seq_val}" + ) return last_value + @override def _write_with_sql_bulk_copy_csv( self, spark: SparkSession, @@ -403,8 +447,8 @@ def _write_with_sql_bulk_copy_csv( temp_table: str, ordered_col_names: List[str], spark_s3_bucket_name: str, - keep_csv_files=False, - ): + keep_csv_files: bool = False, + ) -> None: """ Write-from-delta-to-postgres strategy that relies on SQL bulk COPY of CSV files to Postgres. It uses the SQL COPY command on CSV files, which are created from the Delta table's underlying parquet files. 
@@ -464,11 +508,15 @@ def _write_with_sql_bulk_copy_csv( aws_secret_access_key=CONFIG.AWS_SECRET_KEY.get_secret_value(), ) s3_resource = boto3_session.resource( - service_name="s3", region_name=CONFIG.AWS_REGION, endpoint_url=f"http://{CONFIG.AWS_S3_ENDPOINT}" + service_name="s3", + region_name=CONFIG.AWS_REGION, + endpoint_url=f"http://{CONFIG.AWS_S3_ENDPOINT}", ) else: s3_resource = boto3.resource( - service_name="s3", region_name=CONFIG.AWS_REGION, endpoint_url=f"https://{CONFIG.AWS_S3_ENDPOINT}" + service_name="s3", + region_name=CONFIG.AWS_REGION, + endpoint_url=f"https://{CONFIG.AWS_S3_ENDPOINT}", ) s3_bucket_name = spark_s3_bucket_name s3_bucket = s3_resource.Bucket(s3_bucket_name) @@ -476,15 +524,25 @@ def _write_with_sql_bulk_copy_csv( initial_size = sum(1 for _ in objs_collection) if initial_size > 0: - logger.info(f"LOAD: Starting to delete {initial_size} previous objects in {s3_bucket_with_csv_path}") + logger.info( + f"LOAD: Starting to delete {initial_size} previous objects in {s3_bucket_with_csv_path}" + ) objs_collection.delete() post_delete_size = sum(1 for _ in objs_collection) - logger.info(f"LOAD: Finished deleting. {post_delete_size} objects remain in {s3_bucket_with_csv_path}") + logger.info( + f"LOAD: Finished deleting. 
{post_delete_size} objects remain in {s3_bucket_with_csv_path}" + ) else: - logger.info(f"LOAD: Target S3 path {s3_bucket_with_csv_path} is empty or yet to be created") + logger.info( + f"LOAD: Target S3 path {s3_bucket_with_csv_path} is empty or yet to be created" + ) - logger.info(f"LOAD: Starting dump of Delta table to temp gzipped CSV files in {s3_bucket_with_csv_path}") - df_no_arrays = convert_array_cols_to_string(df, is_postgres_array_format=True, is_for_csv_export=True) + logger.info( + f"LOAD: Starting dump of Delta table to temp gzipped CSV files in {s3_bucket_with_csv_path}" + ) + df_no_arrays = convert_array_cols_to_string( + df, is_postgres_array_format=True, is_for_csv_export=True + ) df_no_arrays.write.options( maxRecordsPerFile=_SPARK_CSV_WRITE_TO_PG_MAX_RECORDS_PER_FILE, compression="gzip", @@ -493,18 +551,28 @@ def _write_with_sql_bulk_copy_csv( ignoreLeadingWhiteSpace=False, # must set for CSV write, as it defaults to true ignoreTrailingWhiteSpace=False, # must set for CSV write, as it defaults to true timestampFormat=CONFIG.SPARK_CSV_TIMEZONE_FORMAT, - ).mode(saveMode="overwrite" if not keep_csv_files else "errorifexists").csv(s3_bucket_with_csv_path) + ).mode(saveMode="overwrite" if not keep_csv_files else "errorifexists").csv( + s3_bucket_with_csv_path + ) logger.debug( f"Connecting to S3 at endpoint_url={CONFIG.AWS_S3_ENDPOINT}, region_name={CONFIG.AWS_REGION} to " f"get listing of contents of Bucket={spark_s3_bucket_name} with Prefix={csv_path}" ) - gzipped_csv_files = [f.key for f in s3_bucket.objects.filter(Prefix=csv_path) if f.key.endswith(".csv.gz")] + gzipped_csv_files = [ + f.key + for f in s3_bucket.objects.filter(Prefix=csv_path) + if f.key.endswith(".csv.gz") + ] file_count = len(gzipped_csv_files) - logger.info(f"LOAD: Finished dumping {file_count} CSV files in {s3_bucket_with_csv_path}") + logger.info( + f"LOAD: Finished dumping {file_count} CSV files in {s3_bucket_with_csv_path}" + ) - logger.info(f"LOAD: Starting SQL bulk COPY 
of {file_count} CSV files to Postgres {temp_table} table") + logger.info( + f"LOAD: Starting SQL bulk COPY of {file_count} CSV files to Postgres {temp_table} table" + ) db_dsn = get_database_dsn_string() with psycopg2.connect(dsn=db_dsn) as connection: @@ -518,7 +586,10 @@ def _write_with_sql_bulk_copy_csv( # fraction less than 1.0. The final value will be the greater of that or # SPARK_CSV_WRITE_TO_PG_MIN_PARTITIONS partitions = max( - ceil(max_parallel_workers * CONFIG.SPARK_CSV_WRITE_TO_PG_PARALLEL_WORKER_MULTIPLIER), + ceil( + max_parallel_workers + * CONFIG.SPARK_CSV_WRITE_TO_PG_PARALLEL_WORKER_MULTIPLIER + ), CONFIG.SPARK_CSV_WRITE_TO_PG_MIN_PARTITIONS, ) @@ -547,8 +618,11 @@ def _write_with_sql_bulk_copy_csv( ), ).collect() - logger.info(f"LOAD: Finished SQL bulk COPY of {file_count} CSV files to Postgres {temp_table} table") + logger.info( + f"LOAD: Finished SQL bulk COPY of {file_count} CSV files to Postgres {temp_table} table" + ) + @override def _write_with_jdbc_inserts( self, spark: SparkSession, @@ -558,7 +632,7 @@ def _write_with_jdbc_inserts( postgres_model: Optional[Model] = None, postgres_cols: Optional[Dict[str, str]] = None, overwrite: bool = False, - ): + ) -> None: """ Write-from-delta-to-postgres strategy that leverages the native Spark ``DataFrame.write.jdbc`` approach. This will issue a series of individual INSERT statements over a JDBC connection-per-executor. @@ -594,7 +668,10 @@ def _write_with_jdbc_inserts( # special handling. Get those columns and handle each. 
if split_df_by_special_cols: if postgres_model: - col_type_mapping = [(column.name, type(column)) for column in postgres_model._meta.get_fields()] + col_type_mapping = [ + (column.name, type(column)) + for column in postgres_model._meta.get_fields() + ] else: col_type_mapping = list(postgres_cols.items()) for column_name, column_type in col_type_mapping: @@ -609,14 +686,18 @@ def _write_with_jdbc_inserts( ) for i, split_df in enumerate(split_dfs): # Note: we're only appending here as we don't want to re-truncate or overwrite with multiple dataframes - logger.info(f"LOAD: Loading part {i + 1} of {split_df_count} (note: unequal part sizes)") + logger.info( + f"LOAD: Loading part {i + 1} of {split_df_count} (note: unequal part sizes)" + ) split_df.write.jdbc( url=get_usas_jdbc_url(), table=temp_table, mode=save_mode, properties=get_jdbc_connection_properties(), ) - logger.info(f"LOAD: Part {i + 1} of {split_df_count} loaded (note: unequal part sizes)") + logger.info( + f"LOAD: Part {i + 1} of {split_df_count} loaded (note: unequal part sizes)" + ) else: # Do it in one shot df.write.jdbc( diff --git a/usaspending_api/etl/management/commands/load_table_to_delta.py b/usaspending_api/etl/management/commands/load_table_to_delta.py index b54ab9eb7c..13d9d425a0 100644 --- a/usaspending_api/etl/management/commands/load_table_to_delta.py +++ b/usaspending_api/etl/management/commands/load_table_to_delta.py @@ -1,14 +1,21 @@ import logging -from django.core.management import BaseCommand +from django.core.management import BaseCommand, CommandParser from usaspending_api.awards.delta_models import ( AWARDS_COLUMNS, - awards_sql_string, - FINANCIAL_ACCOUNTS_BY_AWARDS_COLUMNS, - financial_accounts_by_awards_sql_string, BROKER_SUBAWARDS_COLUMNS, + FINANCIAL_ACCOUNTS_BY_AWARDS_COLUMNS, + awards_sql_string, broker_subawards_sql_string, + financial_accounts_by_awards_sql_string, +) +from usaspending_api.awards.models import ( + Award, + FinancialAccountsByAwards, + TransactionFABS, + 
TransactionFPDS, + TransactionNormalized, ) from usaspending_api.broker.delta_models.broker_zips import ( ZIPS_COLUMNS, @@ -22,49 +29,43 @@ from usaspending_api.common.helpers.spark_helpers import ( configure_spark_session, get_active_spark_session, + get_broker_jdbc_url, get_jdbc_connection_properties, get_usas_jdbc_url, - get_broker_jdbc_url, ) from usaspending_api.config import CONFIG from usaspending_api.etl.table_specs import TableSpec from usaspending_api.recipient.delta_models import ( RECIPIENT_LOOKUP_COLUMNS, - recipient_lookup_create_sql_string, - recipient_profile_create_sql_string, RECIPIENT_PROFILE_DELTA_COLUMNS, SAM_RECIPIENT_COLUMNS, + recipient_lookup_create_sql_string, + recipient_profile_create_sql_string, sam_recipient_create_sql_string, ) -from usaspending_api.search.models import TransactionSearch, AwardSearch +from usaspending_api.recipient.models import DUNS, RecipientLookup, RecipientProfile +from usaspending_api.search.delta_models.award_search import ( + AWARD_SEARCH_COLUMNS, + award_search_create_sql_string, +) +from usaspending_api.search.models import AwardSearch, TransactionSearch from usaspending_api.transactions.delta_models import ( DETACHED_AWARD_PROCUREMENT_DELTA_COLUMNS, - detached_award_procurement_create_sql_string, + PUBLISHED_FABS_COLUMNS, TRANSACTION_FABS_VIEW_COLUMNS, - transaction_fabs_sql_string, TRANSACTION_FPDS_VIEW_COLUMNS, - transaction_fpds_sql_string, TRANSACTION_NORMALIZED_COLUMNS, - transaction_normalized_sql_string, TRANSACTION_SEARCH_POSTGRES_COLUMNS, - transaction_search_create_sql_string, - PUBLISHED_FABS_COLUMNS, + detached_award_procurement_create_sql_string, published_fabs_create_sql_string, + transaction_fabs_sql_string, + transaction_fpds_sql_string, + transaction_normalized_sql_string, + transaction_search_create_sql_string, ) -from usaspending_api.transactions.models import SourceAssistanceTransaction -from usaspending_api.transactions.models import SourceProcurementTransaction -from 
usaspending_api.search.delta_models.award_search import ( - award_search_create_sql_string, - AWARD_SEARCH_COLUMNS, -) - -from usaspending_api.recipient.models import DUNS, RecipientLookup, RecipientProfile -from usaspending_api.awards.models import ( - Award, - FinancialAccountsByAwards, - TransactionFABS, - TransactionFPDS, - TransactionNormalized, +from usaspending_api.transactions.models import ( + SourceAssistanceTransaction, + SourceProcurementTransaction, ) logger = logging.getLogger(__name__) @@ -134,7 +135,7 @@ "partition_column_type": "numeric", "is_partition_column_unique": True, "delta_table_create_sql": published_fabs_create_sql_string, - "column_names": list(PUBLISHED_FABS_DELTA_COLUMNS), + "column_names": list(PUBLISHED_FABS_COLUMNS), } ), "transaction_fpds": TableSpec( @@ -271,7 +272,8 @@ class Command(BaseCommand): before new data is written. """ - def add_arguments(self, parser): + @staticmethod + def add_arguments(parser: CommandParser) -> None: parser.add_argument( "--destination-table", type=str, @@ -293,7 +295,7 @@ def add_arguments(self, parser): "name", ) - def handle(self, *args, **options): + def handle(self, *args, **options) -> None: extra_conf = { # Config for Delta Lake tables and SQL. Need these to keep Dela table metadata in the metastore "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension", @@ -331,7 +333,7 @@ def handle(self, *args, **options): jdbc_url = get_usas_jdbc_url() if not is_from_broker else get_broker_jdbc_url() if not jdbc_url: raise RuntimeError( - f"Couldn't find JDBC url, please properly configure your CONFIG." + "Couldn't find JDBC url, please properly configure your CONFIG." 
) if not jdbc_url.startswith("jdbc:postgresql://"): raise ValueError( diff --git a/usaspending_api/etl/table_specs.py b/usaspending_api/etl/table_specs.py index a291776115..e313957858 100644 --- a/usaspending_api/etl/table_specs.py +++ b/usaspending_api/etl/table_specs.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Literal, Any, Callable +from typing import Any, Callable, Literal from django.db import models from pyspark.sql import SparkSession From 53d8ff547cb235828d07d8953add1cae64bd60a8 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Thu, 5 Feb 2026 16:28:14 -0600 Subject: [PATCH 44/59] Addressing ruff issues --- .../commands/archive_table_in_delta.py | 5 +- .../commands/load_table_from_delta.py | 352 +++++++++++------- 2 files changed, 222 insertions(+), 135 deletions(-) diff --git a/usaspending_api/etl/management/commands/archive_table_in_delta.py b/usaspending_api/etl/management/commands/archive_table_in_delta.py index f80b8a6c44..439d6cb864 100644 --- a/usaspending_api/etl/management/commands/archive_table_in_delta.py +++ b/usaspending_api/etl/management/commands/archive_table_in_delta.py @@ -120,7 +120,10 @@ def handle(self, *args, **options) -> None: ) # Retrieve data from Postgres - query_with_predicate = f"(SELECT * FROM {qualified_source_table} WHERE {archive_date_field} < '{archive_date_string}') AS tmp" + query_with_predicate = ( + f"(SELECT * FROM {qualified_source_table} " + f"WHERE {archive_date_field} < '{archive_date_string}') AS tmp" + ) df = spark.read.jdbc( url=jdbc_url, diff --git a/usaspending_api/etl/management/commands/load_table_from_delta.py b/usaspending_api/etl/management/commands/load_table_from_delta.py index 503d33754c..1440c9c5d8 100644 --- a/usaspending_api/etl/management/commands/load_table_from_delta.py +++ b/usaspending_api/etl/management/commands/load_table_from_delta.py @@ -24,6 +24,7 @@ from usaspending_api.common.helpers.sql_helpers import get_database_dsn_string from usaspending_api.config 
import CONFIG from usaspending_api.etl.management.commands.create_delta_table import TABLE_SPEC +from usaspending_api.etl.table_specs import QueryTableSpec from usaspending_api.settings import DEFAULT_TEXT_SEARCH_CONFIG logger = logging.getLogger(__name__) @@ -155,7 +156,7 @@ def _split_dfs(df: DataFrame, special_columns: str | Column) -> [DataFrame]: split_dfs.append(split_df) return split_dfs - def handle(self, *args, **options) -> None: + def _get_spark_session(self) -> SparkSession: extra_conf = { # Config for Delta Lake tables and SQL. Need these to keep Dela table metadata in the metastore "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension", @@ -165,7 +166,6 @@ def handle(self, *args, **options) -> None: "spark.sql.legacy.parquet.int96RebaseModeInWrite": "LEGACY", # for timestamps at/before 1900 "spark.sql.jsonGenerator.ignoreNullFields": "false", # keep nulls in our json } - spark = get_active_spark_session() spark_created_by_command = False if not spark: @@ -173,11 +173,14 @@ def handle(self, *args, **options) -> None: spark = configure_spark_session( **extra_conf, spark_context=spark ) # type: SparkSession + return spark, spark_created_by_command + + def handle(self, *args, **options) -> None: + spark, spark_created_by_command = self._get_spark_session() # Resolve Parameters delta_table = options["delta_table"] recreate = options["recreate"] - table_spec = TABLE_SPEC[delta_table] # Delta side @@ -193,7 +196,6 @@ def handle(self, *args, **options) -> None: # Postgres side - source postgres_table = None - postgres_model = table_spec.model postgres_schema = table_spec.source_database or table_spec.swap_schema postgres_table_name = table_spec.source_table or table_spec.swap_table postgres_cols = table_spec.source_schema @@ -212,10 +214,11 @@ def handle(self, *args, **options) -> None: temp_table_suffix_appendage = ( f"_{temp_table_suffix}" if {temp_table_suffix} else "" ) - if postgres_table: - temp_table_name = 
f"{postgres_table_name}{temp_table_suffix_appendage}" - else: - temp_table_name = f"{delta_table_name}{temp_table_suffix_appendage}" + temp_table_name = ( + f"{postgres_table_name}{temp_table_suffix_appendage}" + if postgres_table + else f"{delta_table_name}{temp_table_suffix_appendage}" + ) temp_table = f"{temp_schema}.{temp_table_name}" summary_msg = ( @@ -225,115 +228,30 @@ def handle(self, *args, **options) -> None: summary_msg = f"{summary_msg} The temp table will be based on the postgres table {postgres_table}" logger.info(summary_msg) - # Checking if the temp destination table already exists - temp_dest_table_exists_sql = f""" - SELECT EXISTS ( - SELECT 1 - FROM information_schema.tables - WHERE table_schema = '{temp_schema}' - AND table_name = '{temp_table_name}') - """ - with db.connection.cursor() as cursor: - cursor.execute(temp_dest_table_exists_sql) - temp_dest_table_exists = cursor.fetchone()[0] + temp_dest_table_exists = self._temp_table_exists(temp_schema, temp_table_name) # If it does, and we're recreating it, drop it first if temp_dest_table_exists and recreate: - logger.info( - f"{temp_table} exists and recreate argument provided. Dropping first." - ) - # If the schema has changed and we need to do a complete reload, just drop the table and rebuild it - clear_table_sql = f"DROP TABLE {temp_table}" - with db.connection.cursor() as cursor: - cursor.execute(clear_table_sql) - logger.info(f"{temp_table} dropped.") + self._drop_temp_table(temp_table) temp_dest_table_exists = False make_new_table = not temp_dest_table_exists - is_postgres_table_partitioned = table_spec.postgres_partition_spec is not None + is_postgres_table_partitioned = ( + hasattr(table_spec, "postgres_partition_spec") + and table_spec.postgres_partition_spec is not None + ) if postgres_table or postgres_cols: - # Recreate the table if it doesn't exist. 
Spark's df.write automatically does this but doesn't account for - # the extra metadata (indexes, constraints, defaults) which CREATE TABLE X LIKE Y accounts for. - # If there is no postgres_table to base it on, it just relies on spark to make it and work with delta table - if make_new_table: - partition_clause = "" - storage_parameters = "WITH (autovacuum_enabled=FALSE)" - partitions_sql = [] - if is_postgres_table_partitioned: - partition_clause = ( - f"PARTITION BY {table_spec.postgres_partition_spec['partitioning_form']}" - f"({', '.join(table_spec.postgres_partition_spec['partition_keys'])})" - ) - storage_parameters = "" - partitions_sql = [ - ( - f"CREATE TABLE " - # Below: e.g. my_tbl_temp -> my_tbl_part_temp - f"{temp_table[: -len(temp_table_suffix_appendage)]}" - f"{pt['table_suffix']}{temp_table_suffix_appendage} " - f"PARTITION OF {temp_table} {pt['partitioning_clause']} " - f"{storage_parameters}" - ) - for pt in table_spec.postgres_partition_spec["partitions"] - ] - if postgres_table: - create_temp_sql = f""" - CREATE TABLE {temp_table} ( - LIKE {postgres_table} INCLUDING DEFAULTS INCLUDING GENERATED INCLUDING IDENTITY - ) {partition_clause} {storage_parameters} - """ - elif postgres_cols: - create_temp_sql = f""" - CREATE TABLE {temp_table} ( - {", ".join([f"{key} {val}" for key, val in postgres_cols.items()])} - ) {partition_clause} {storage_parameters} - """ - else: - raise RuntimeError( - "make_new_table=True but neither a postgres_table or postgres_cols are " - "populated for the target delta table in the TABLE_SPEC" - ) - with db.connection.cursor() as cursor: - logger.info(f"Creating {temp_table}") - cursor.execute(create_temp_sql) - logger.info(f"{temp_table} created.") - - if is_postgres_table_partitioned and partitions_sql: - for create_partition in partitions_sql: - logger.info( - f"Creating partition of {temp_table} with SQL:\n{create_partition}" - ) - cursor.execute(create_partition) - logger.info("Partition created.") - - # If there are 
vectors, add the triggers that will populate them based on other calls - # NOTE: Undetermined whether tsvector triggers can be applied on partitioned tables, - # at the top-level virtual/partitioned table (versus having to apply on each partition) - for tsvector_name, derived_from_cols in tsvectors.items(): - logger.info( - f"To prevent any confusion or duplicates, dropping the trigger" - f" tsvector_update_{tsvector_name} if it exists before potentially recreating it." - ) - cursor.execute( - f"DROP TRIGGER IF EXISTS tsvector_update_{tsvector_name} ON {temp_table}" - ) - - logger.info( - f"Adding tsvector trigger for column {tsvector_name}" - f" based on the following columns: {derived_from_cols}" - ) - derived_from_cols_str = ", ".join(derived_from_cols) - tsvector_trigger_sql = f""" - CREATE TRIGGER tsvector_update_{tsvector_name} BEFORE INSERT OR UPDATE - ON {temp_table} FOR EACH ROW EXECUTE PROCEDURE - tsvector_update_trigger({tsvector_name}, '{DEFAULT_TEXT_SEARCH_CONFIG}', - {derived_from_cols_str}) - """ - cursor.execute(tsvector_trigger_sql) - logger.info( - f"tsvector trigger for column {tsvector_name} added." 
- ) + self._recreate_table( + make_new_table=make_new_table, + is_postgres_table_partitioned=is_postgres_table_partitioned, + table_spec=table_spec, + temp_table=temp_table, + temp_table_suffix_appendage=temp_table_suffix_appendage, + postgres_table=postgres_table, + postgres_cols=postgres_cols, + tsvectors=tsvectors, + ) # Read from Delta df = spark.table(delta_table) @@ -352,20 +270,174 @@ def handle(self, *args, **options) -> None: logger.info(f"{temp_table} truncated.") # Reset the sequence before load for a table if it exists - if options["reset_sequence"] and table_spec.postgres_seq_name: - postgres_seq_last_value = self._set_sequence_value( - table_spec.postgres_seq_name - ) - else: - postgres_seq_last_value = None + postgres_seq_last_value = ( + self._set_sequence_value(table_spec.postgres_seq_name) + if options["reset_sequence"] + and hasattr(table_spec, "postgres_seq_name") + and table_spec.postgres_seq_name + else None + ) + + self._write_df( + delta_table=delta_table, + spark=spark, + df=df, + temp_table=temp_table, + postgres_cols=postgres_cols, + options=options, + destination_database=destination_database, + delta_table_name=delta_table_name, + column_names=column_names, + postgres_seq_last_value=postgres_seq_last_value, + table_spec=table_spec, + ) + + self._finish( + delta_table=delta_table, + temp_table=temp_table, + options=options, + spark_created_by_command=spark_created_by_command, + spark=spark, + postgres_table=postgres_table, + ) + + @staticmethod + def _temp_table_exists(temp_schema: str, temp_table_name: str) -> bool: + # Checking if the temp destination table already exists + temp_dest_table_exists_sql = f""" + SELECT EXISTS ( + SELECT 1 + FROM information_schema.tables + WHERE table_schema = '{temp_schema}' + AND table_name = '{temp_table_name}') + """ + with db.connection.cursor() as cursor: + cursor.execute(temp_dest_table_exists_sql) + return bool(cursor.fetchone()[0]) + + @staticmethod + def _drop_temp_table(temp_table: str) -> 
None: + logger.info( + f"{temp_table} exists and recreate argument provided. Dropping first." + ) + # If the schema has changed and we need to do a complete reload, just drop the table and rebuild it + clear_table_sql = f"DROP TABLE {temp_table}" + with db.connection.cursor() as cursor: + cursor.execute(clear_table_sql) + logger.info(f"{temp_table} dropped.") + + @override + @staticmethod + def _recreate_table( + make_new_table: bool, + is_postgres_table_partitioned: bool, + table_spec: QueryTableSpec, + temp_table: str, + temp_table_suffix_appendage: str, + postgres_table: str, + postgres_cols: list, + tsvectors: dict, + ) -> None: + # Recreate the table if it doesn't exist. Spark's df.write automatically does this but doesn't account for + # the extra metadata (indexes, constraints, defaults) which CREATE TABLE X LIKE Y accounts for. + # If there is no postgres_table to base it on, it just relies on spark to make it and work with delta table + if make_new_table: + partition_clause = "" + storage_parameters = "WITH (autovacuum_enabled=FALSE)" + partitions_sql = [] + if is_postgres_table_partitioned: + partition_clause = ( + f"PARTITION BY {table_spec.postgres_partition_spec['partitioning_form']}" + f"({', '.join(table_spec.postgres_partition_spec['partition_keys'])})" + ) + storage_parameters = "" + partitions_sql = [ + ( + f"CREATE TABLE " + # Below: e.g. 
my_tbl_temp -> my_tbl_part_temp + f"{temp_table[: -len(temp_table_suffix_appendage)]}" + f"{pt['table_suffix']}{temp_table_suffix_appendage} " + f"PARTITION OF {temp_table} {pt['partitioning_clause']} " + f"{storage_parameters}" + ) + for pt in table_spec.postgres_partition_spec["partitions"] + ] + if postgres_table: + create_temp_sql = f""" + CREATE TABLE {temp_table} ( + LIKE {postgres_table} INCLUDING DEFAULTS INCLUDING GENERATED INCLUDING IDENTITY + ) {partition_clause} {storage_parameters} + """ + elif postgres_cols: + create_temp_sql = f""" + CREATE TABLE {temp_table} ( + {", ".join([f"{key} {val}" for key, val in postgres_cols.items()])} + ) {partition_clause} {storage_parameters} + """ + else: + raise RuntimeError( + "make_new_table=True but neither a postgres_table or postgres_cols are " + "populated for the target delta table in the TABLE_SPEC" + ) + with db.connection.cursor() as cursor: + logger.info(f"Creating {temp_table}") + cursor.execute(create_temp_sql) + logger.info(f"{temp_table} created.") + + if is_postgres_table_partitioned and partitions_sql: + for create_partition in partitions_sql: + logger.info( + f"Creating partition of {temp_table} with SQL:\n{create_partition}" + ) + cursor.execute(create_partition) + logger.info("Partition created.") + + # If there are vectors, add the triggers that will populate them based on other calls + # NOTE: Undetermined whether tsvector triggers can be applied on partitioned tables, + # at the top-level virtual/partitioned table (versus having to apply on each partition) + for tsvector_name, derived_from_cols in tsvectors.items(): + logger.info( + f"To prevent any confusion or duplicates, dropping the trigger" + f" tsvector_update_{tsvector_name} if it exists before potentially recreating it." 
+ ) + cursor.execute( + f"DROP TRIGGER IF EXISTS tsvector_update_{tsvector_name} ON {temp_table}" + ) - # Write to Postgres + logger.info( + f"Adding tsvector trigger for column {tsvector_name}" + f" based on the following columns: {derived_from_cols}" + ) + derived_from_cols_str = ", ".join(derived_from_cols) + tsvector_trigger_sql = f""" + CREATE TRIGGER tsvector_update_{tsvector_name} BEFORE INSERT OR UPDATE + ON {temp_table} FOR EACH ROW EXECUTE PROCEDURE + tsvector_update_trigger({tsvector_name}, '{DEFAULT_TEXT_SEARCH_CONFIG}', + {derived_from_cols_str}) + """ + cursor.execute(tsvector_trigger_sql) + logger.info(f"tsvector trigger for column {tsvector_name} added.") + + @override + def _write_df( + self, + delta_table: str, + spark: SparkSession, + df: DataFrame, + temp_table: str, + postgres_cols: dict, + options: dict, + destination_database: str, + delta_table_name: str, + column_names: list, + postgres_seq_last_value: int | bool, + table_spec: QueryTableSpec, + ) -> None: use_jdbc_inserts = options["jdbc_inserts"] strategy = "JDBC INSERTs" if use_jdbc_inserts else "SQL bulk COPY CSV" logger.info( f"LOAD (START): Loading data from Delta table {delta_table} to {temp_table} using {strategy} strategy" ) - try: if use_jdbc_inserts: self._write_with_jdbc_inserts( @@ -373,7 +445,7 @@ def handle(self, *args, **options) -> None: df, temp_table, split_df_by_special_cols=True, - postgres_model=postgres_model, + postgres_model=table_spec.model, postgres_cols=postgres_cols, overwrite=False, ) @@ -403,22 +475,6 @@ def handle(self, *args, **options) -> None: ) raise exc - logger.info( - f"LOAD (FINISH): Loaded data from Delta table {delta_table} to {temp_table} using {strategy} strategy" - ) - - # We're done with spark at this point - if spark_created_by_command: - spark.stop() - - if postgres_table: - logger.info( - f"Note: this has merely loaded the data from Delta. 
For various reasons, we've separated the" - f" metadata portion of the table download to a separate script. If not already done so," - f" please run the following additional command to complete the process: " - f" 'copy_table_metadata --source-table {postgres_table} --dest-table {temp_table}'." - ) - def _set_sequence_value(self, seq_name: str, val: Optional[int] = None) -> int: """ Used to reset the value of a Postgres sequence. This function should be used for tables that utilize a @@ -706,3 +762,31 @@ def _write_with_jdbc_inserts( mode=save_mode, properties=get_jdbc_connection_properties(), ) + + @override + def _finish( + self, + delta_table: str, + temp_table: str, + options: dict, + spark_created_by_command: bool, + spark: SparkSession, + postgres_table: str | bool, + ) -> None: + use_jdbc_inserts = options["jdbc_inserts"] + strategy = "JDBC INSERTs" if use_jdbc_inserts else "SQL bulk COPY CSV" + logger.info( + f"LOAD (FINISH): Loaded data from Delta table {delta_table} to {temp_table} using {strategy} strategy" + ) + + # We're done with spark at this point + if spark_created_by_command: + spark.stop() + + if postgres_table: + logger.info( + f"Note: this has merely loaded the data from Delta. For various reasons, we've separated the" + f" metadata portion of the table download to a separate script. If not already done so," + f" please run the following additional command to complete the process: " + f" 'copy_table_metadata --source-table {postgres_table} --dest-table {temp_table}'." 
+ ) From 3800222d1a8d6f5ea02f06b654738aeaeb208c9c Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Fri, 6 Feb 2026 10:01:23 -0600 Subject: [PATCH 45/59] Fixing tests --- ...st_load_transactions_in_delta_fabs_fpds.py | 236 ++++-- ...test_load_transactions_in_delta_lookups.py | 708 +++++++++++++----- 2 files changed, 713 insertions(+), 231 deletions(-) diff --git a/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_fabs_fpds.py b/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_fabs_fpds.py index bf74f2e3fc..2a897a4b07 100644 --- a/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_fabs_fpds.py +++ b/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_fabs_fpds.py @@ -5,22 +5,34 @@ from copy import deepcopy from datetime import datetime, timedelta, timezone + from django.core.management import call_command from model_bakery import baker from pytest import mark -from usaspending_api.broker.helpers.last_load_date import get_last_load_date, update_last_load_date -from usaspending_api.etl.tests.integration.test_load_to_from_delta import load_delta_table_from_postgres, equal_datasets +from usaspending_api.broker.helpers.last_load_date import ( + get_last_load_date, + update_last_load_date, +) +from usaspending_api.config import CONFIG +from usaspending_api.etl.management.commands.load_table_to_delta import TABLE_SPEC +from usaspending_api.etl.tests.integration.test_load_to_from_delta import ( + equal_datasets, + load_delta_table_from_postgres, +) from usaspending_api.etl.tests.integration.test_load_transactions_in_delta_lookups import ( _BEGINNING_OF_TIME, _INITIAL_SOURCE_TABLE_LOAD_DATETIME, _InitialRunWithPostgresLoader, _TableLoadInfo, +) +from usaspending_api.etl.tests.integration.test_load_transactions_in_delta_lookups import ( TestInitialRun as InitialRun, # Remove 'test' prefix to avoid pytest running these tests twice - TestInitialRunNoPostgresLoader as InitialRunNoPostgresLoader, # 
Remove 'test' prefix to avoid pytest running these tests twice ) -from usaspending_api.config import CONFIG -from usaspending_api.etl.management.commands.load_table_to_delta import TABLE_SPEC +from usaspending_api.etl.tests.integration.test_load_transactions_in_delta_lookups import ( + # Remove 'test' prefix to avoid pytest running these tests twice + TestInitialRunNoPostgresLoader as InitialRunNoPostgresLoader, +) class _TransactionFabsFpdsCore: @@ -60,7 +72,7 @@ def unexpected_paths_source_tables_only_test_core(self): self.spark.sql(f"create database if not exists {raw_db};") self.spark.sql(f"use {raw_db};") self.spark.sql( - TABLE_SPEC["published_fabs"]["delta_table_create_sql"].format( + TABLE_SPEC["published_fabs"].delta_table_create_sql.format( DESTINATION_TABLE="published_fabs", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=self.s3_data_bucket, @@ -68,7 +80,7 @@ def unexpected_paths_source_tables_only_test_core(self): ) ) self.spark.sql( - TABLE_SPEC["detached_award_procurement"]["delta_table_create_sql"].format( + TABLE_SPEC["detached_award_procurement"].delta_table_create_sql.format( DESTINATION_TABLE="detached_award_procurement", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=self.s3_data_bucket, @@ -92,7 +104,9 @@ def unexpected_paths_source_tables_only_test_core(self): } # Even though nothing will have been loaded to that table, the table whose etl_level has been called will # have its last load date set to the date of the source tables' load. - kwargs[f"expected_last_load_{self.etl_level}"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME + kwargs[f"expected_last_load_{self.etl_level}"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) InitialRun.verify(self.spark, [], [], **kwargs) # 2. 
With raw.transaction_normalized and raw.awards still not created, call load_transactions_in_delta @@ -102,18 +116,26 @@ def unexpected_paths_source_tables_only_test_core(self): # need to reset the last load date on transaction_fabs update_last_load_date(self.etl_level, _BEGINNING_OF_TIME) - call_command("load_transactions_in_delta", "--etl-level", "transaction_id_lookup") + call_command( + "load_transactions_in_delta", "--etl-level", "transaction_id_lookup" + ) call_command("load_transactions_in_delta", "--etl-level", self.etl_level) # The expected transaction_id_lookup table should be the same as in _InitialRunWithPostgresLoader, # but all of the transaction ids should be 1 larger than expected there. - expected_transaction_id_lookup = deepcopy(_InitialRunWithPostgresLoader.expected_initial_transaction_id_lookup) + expected_transaction_id_lookup = deepcopy( + _InitialRunWithPostgresLoader.expected_initial_transaction_id_lookup + ) for item in expected_transaction_id_lookup: item["transaction_id"] += 1 # Also, the last load date of the transaction_id_lookup table and of the table whose etl_level is being # called should be updated to the load time of the source tables - kwargs["expected_last_load_transaction_id_lookup"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME - kwargs[f"expected_last_load_{self.etl_level}"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME + kwargs["expected_last_load_transaction_id_lookup"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) + kwargs[f"expected_last_load_{self.etl_level}"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) InitialRun.verify( self.spark, expected_transaction_id_lookup, @@ -130,20 +152,31 @@ def unexpected_paths_source_tables_only_test_core(self): delta_data = [row.asDict() for row in self.spark.sql(query).collect()] if len(self.expected_initial_transaction_fabs) > 0: - expected_transaction_fabs_fpds = deepcopy(self.expected_initial_transaction_fabs) + expected_transaction_fabs_fpds = deepcopy( + self.expected_initial_transaction_fabs + ) 
else: - expected_transaction_fabs_fpds = deepcopy(self.expected_initial_transaction_fpds) + expected_transaction_fabs_fpds = deepcopy( + self.expected_initial_transaction_fpds + ) for item in expected_transaction_fabs_fpds: item["transaction_id"] += 1 assert equal_datasets(expected_transaction_fabs_fpds, delta_data, "") def unexpected_paths_test_core( - self, load_other_raw_tables, expected_initial_transaction_id_lookup, expected_initial_award_id_lookup + self, + load_other_raw_tables, + expected_initial_transaction_id_lookup, + expected_initial_award_id_lookup, ): # 1. Call load_transactions_in_delta with etl-level of initial_run first, making sure to load # raw.transaction_normalized along with the source tables, but don't copy the raw tables to int. # Then immediately call load_transactions_in_delta with etl-level of transaction_f[ab|pd]s. - InitialRun.initial_run(self.s3_data_bucket, load_other_raw_tables=load_other_raw_tables, initial_copy=False) + InitialRun.initial_run( + self.s3_data_bucket, + load_other_raw_tables=load_other_raw_tables, + initial_copy=False, + ) call_command("load_transactions_in_delta", "--etl-level", self.etl_level) # Even without the call to load_transactions_in_delta with etl-level of transaction_id_lookup, the appropriate @@ -157,7 +190,9 @@ def unexpected_paths_test_core( "expected_last_load_transaction_fabs": _BEGINNING_OF_TIME, "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, } - kwargs[f"expected_last_load_{self.etl_level}"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME + kwargs[f"expected_last_load_{self.etl_level}"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) InitialRun.verify( self.spark, expected_initial_transaction_id_lookup, @@ -172,9 +207,13 @@ def unexpected_paths_test_core( query = f"SELECT {', '.join(self.compare_fields)} FROM int.{self.etl_level} ORDER BY {self.pk_field}" delta_data = [row.asDict() for row in self.spark.sql(query).collect()] if len(self.expected_initial_transaction_fabs) > 0: - assert 
equal_datasets(self.expected_initial_transaction_fabs, delta_data, "") + assert equal_datasets( + self.expected_initial_transaction_fabs, delta_data, "" + ) else: - assert equal_datasets(self.expected_initial_transaction_fpds, delta_data, "") + assert equal_datasets( + self.expected_initial_transaction_fpds, delta_data, "" + ) # 2. Test inserting, updating, and deleting without calling load_transactions_in_delta with etl-level # of transaction_id_lookup before calling load_transactions_in_delta with etl-level of transaction_f[ab|pd]s. @@ -233,9 +272,13 @@ def unexpected_paths_test_core( # However, this call should *NOT* pick up the inserts or deletes, since those transactions will not # have changed in the transaction_id_lookup table. if len(self.expected_initial_transaction_fabs) > 0: - expected_transaction_fabs_fpds = deepcopy(self.expected_initial_transaction_fabs) + expected_transaction_fabs_fpds = deepcopy( + self.expected_initial_transaction_fabs + ) else: - expected_transaction_fabs_fpds = deepcopy(self.expected_initial_transaction_fpds) + expected_transaction_fabs_fpds = deepcopy( + self.expected_initial_transaction_fpds + ) expected_transaction_fabs_fpds[-2]["updated_at"] = insert_update_datetime expected_transaction_fabs_fpds[-1]["updated_at"] = insert_update_datetime assert equal_datasets(expected_transaction_fabs_fpds, delta_data, "") @@ -264,8 +307,12 @@ def happy_paths_test_core( ): # 1, Test calling load_transactions_in_delta with etl-level of transaction_f[ab|pd]s after calling with # etl-levels of initial_run and transaction_id_lookup. 
- InitialRun.initial_run(self.s3_data_bucket, load_other_raw_tables=load_other_raw_tables) - call_command("load_transactions_in_delta", "--etl-level", "transaction_id_lookup") + InitialRun.initial_run( + self.s3_data_bucket, load_other_raw_tables=load_other_raw_tables + ) + call_command( + "load_transactions_in_delta", "--etl-level", "transaction_id_lookup" + ) call_command("load_transactions_in_delta", "--etl-level", self.etl_level) # Verify the tables. The transaction and award id lookup tables should be the same as during the initial run. @@ -277,7 +324,9 @@ def happy_paths_test_core( "expected_last_load_transaction_fabs": _BEGINNING_OF_TIME, "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, } - kwargs[f"expected_last_load_{self.etl_level}"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME + kwargs[f"expected_last_load_{self.etl_level}"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) InitialRun.verify( self.spark, expected_initial_transaction_id_lookup, @@ -289,14 +338,23 @@ def happy_paths_test_core( ) # Verify key fields in transaction_fabs table - transaction_fabs_fpds_query = ( - f"SELECT {', '.join(self.compare_fields)} FROM int.{self.etl_level} ORDER BY {self.pk_field}" - ) - delta_data = [row.asDict() for row in self.spark.sql(transaction_fabs_fpds_query).collect()] + transaction_fabs_fpds_query = f""" + SELECT {', '.join(self.compare_fields)} + FROM int.{self.etl_level} + ORDER BY {self.pk_field} + """ + delta_data = [ + row.asDict() + for row in self.spark.sql(transaction_fabs_fpds_query).collect() + ] if len(self.expected_initial_transaction_fabs) > 0: - assert equal_datasets(self.expected_initial_transaction_fabs, delta_data, "") + assert equal_datasets( + self.expected_initial_transaction_fabs, delta_data, "" + ) else: - assert equal_datasets(self.expected_initial_transaction_fpds, delta_data, "") + assert equal_datasets( + self.expected_initial_transaction_fpds, delta_data, "" + ) # 2. 
Test inserting, updating, and deleting records followed by calling load_transactions_in_delta with # etl-levels of transaction_id_lookup and then transaction_f[ab|pd]s. @@ -364,14 +422,18 @@ def happy_paths_test_core( ) # Need to load changes into the transaction_id_lookup table. - call_command("load_transactions_in_delta", "--etl-level", "transaction_id_lookup") + call_command( + "load_transactions_in_delta", "--etl-level", "transaction_id_lookup" + ) call_command("load_transactions_in_delta", "--etl-level", self.etl_level) # Verify transaction_id_lookup table query = "SELECT * FROM int.transaction_id_lookup ORDER BY transaction_id" delta_data = [row.asDict() for row in self.spark.sql(query).collect()] - expected_transaction_id_lookup = deepcopy(expected_initial_transaction_id_lookup) + expected_transaction_id_lookup = deepcopy( + expected_initial_transaction_id_lookup + ) for pop_index in expected_transaction_id_lookup_pops: expected_transaction_id_lookup.pop(pop_index) expected_transaction_id_lookup_append.update( @@ -383,26 +445,41 @@ def happy_paths_test_core( assert equal_datasets(expected_transaction_id_lookup, delta_data, "") # Verify country code scalar transformation - query = f"SELECT DISTINCT legal_entity_country_code, place_of_perform_country_c FROM int.{self.etl_level} WHERE {self.pk_field} = 4 OR {self.pk_field} = 5" + query = f""" + SELECT DISTINCT legal_entity_country_code, place_of_perform_country_c + FROM int.{self.etl_level} + WHERE {self.pk_field} = 4 OR {self.pk_field} = 5 + """ delta_data = [row.asDict() for row in self.spark.sql(query).collect()] assert len(delta_data) == 1 assert delta_data[0]["legal_entity_country_code"] == "USA" assert delta_data[0]["place_of_perform_country_c"] == "USA" # Verify country name scalar transformation - query = f"SELECT DISTINCT legal_entity_country_name, place_of_perform_country_n FROM int.{self.etl_level} WHERE {self.pk_field} = 4 OR {self.pk_field} = 5" + query = f""" + SELECT DISTINCT 
legal_entity_country_name, place_of_perform_country_n + FROM int.{self.etl_level} + WHERE {self.pk_field} = 4 OR {self.pk_field} = 5 + """ delta_data = [row.asDict() for row in self.spark.sql(query).collect()] assert len(delta_data) == 1 assert delta_data[0]["legal_entity_country_name"] == "UNITED STATES" assert delta_data[0]["place_of_perform_country_n"] == "UNITED STATES" # Verify key fields in transaction_f[ab|pd]s table - delta_data = [row.asDict() for row in self.spark.sql(transaction_fabs_fpds_query).collect()] + delta_data = [ + row.asDict() + for row in self.spark.sql(transaction_fabs_fpds_query).collect() + ] if len(self.expected_initial_transaction_fabs) > 0: - expected_transaction_fabs_fpds = deepcopy(self.expected_initial_transaction_fabs) + expected_transaction_fabs_fpds = deepcopy( + self.expected_initial_transaction_fabs + ) else: - expected_transaction_fabs_fpds = deepcopy(self.expected_initial_transaction_fpds) + expected_transaction_fabs_fpds = deepcopy( + self.expected_initial_transaction_fpds + ) expected_transaction_fabs_fpds.pop(1) expected_transaction_fabs_fpds.pop(1) expected_transaction_fabs_fpds[-2]["updated_at"] = insert_update_datetime @@ -420,7 +497,10 @@ def happy_paths_test_core( # Verify that the last_load_dates of the transaction_id_lookup table and the table whose etl_level has been # called did NOT change, since only one of the broker source tables' last load date was changed. 
- assert get_last_load_date("transaction_id_lookup") == _INITIAL_SOURCE_TABLE_LOAD_DATETIME + assert ( + get_last_load_date("transaction_id_lookup") + == _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) assert get_last_load_date(self.etl_level) == _INITIAL_SOURCE_TABLE_LOAD_DATETIME def happy_paths_no_pg_loader_test_core( @@ -442,7 +522,9 @@ def happy_paths_no_pg_loader_test_core( self.etl_level, initial_transaction_fabs_fpds, ), - _TableLoadInfo(self.spark, "awards", InitialRunNoPostgresLoader.initial_awards), + _TableLoadInfo( + self.spark, "awards", InitialRunNoPostgresLoader.initial_awards + ), ), InitialRunNoPostgresLoader.expected_initial_transaction_id_lookup, InitialRunNoPostgresLoader.expected_initial_award_id_lookup, @@ -459,7 +541,9 @@ class TestTransactionFabs: usas_source_table_name = "published_fabs" broker_source_table_name = "source_assistance_transaction" baker_table = "transactions.SourceAssistanceTransaction" - compare_fields = _InitialRunWithPostgresLoader.expected_initial_transaction_fabs[0].keys() + compare_fields = _InitialRunWithPostgresLoader.expected_initial_transaction_fabs[ + 0 + ].keys() new_afa_generated_unique = "award_assist_0004_trans_0001" new_unique_award_key = "award_assist_0004" baker_kwargs = { @@ -479,7 +563,9 @@ class TestTransactionFabs: "unique_award_key": new_unique_award_key.upper(), } - def _generate_transaction_fabs_fpds_core(self, spark, s3_data_bucket, expected_initial_transaction_fabs): + def _generate_transaction_fabs_fpds_core( + self, spark, s3_data_bucket, expected_initial_transaction_fabs + ): return _TransactionFabsFpdsCore( spark, s3_data_bucket, @@ -496,28 +582,46 @@ def _generate_transaction_fabs_fpds_core(self, spark, s3_data_bucket, expected_i @mark.django_db(transaction=True) def test_unexpected_paths_source_tables_only( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + 
_populate_initial_source_tables_pg, ): transaction_fabs_fpds_core = self._generate_transaction_fabs_fpds_core( - spark, s3_unittest_data_bucket, _InitialRunWithPostgresLoader.expected_initial_transaction_fabs + spark, + s3_unittest_data_bucket, + _InitialRunWithPostgresLoader.expected_initial_transaction_fabs, ) transaction_fabs_fpds_core.unexpected_paths_source_tables_only_test_core() @mark.django_db(transaction=True) def test_unexpected_paths_no_pg_loader( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, ): transaction_fabs_fpds_core = self._generate_transaction_fabs_fpds_core( - spark, s3_unittest_data_bucket, InitialRunNoPostgresLoader.initial_transaction_fabs + spark, + s3_unittest_data_bucket, + InitialRunNoPostgresLoader.initial_transaction_fabs, ) transaction_fabs_fpds_core.unexpected_paths_no_pg_loader_test_core() @mark.django_db(transaction=True) def test_happy_paths_no_pg_loader( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, ): transaction_fabs_fpds_core = self._generate_transaction_fabs_fpds_core( - spark, s3_unittest_data_bucket, InitialRunNoPostgresLoader.initial_transaction_fabs + spark, + s3_unittest_data_bucket, + InitialRunNoPostgresLoader.initial_transaction_fabs, ) transaction_fabs_fpds_core.happy_paths_no_pg_loader_test_core( InitialRunNoPostgresLoader.initial_transaction_fabs, @@ -534,7 +638,9 @@ class TestTransactionFpds: usas_source_table_name = "detached_award_procurement" broker_source_table_name = "source_procurement_transaction" baker_table = "transactions.SourceProcurementTransaction" - compare_fields = _InitialRunWithPostgresLoader.expected_initial_transaction_fpds[0].keys() + compare_fields = 
_InitialRunWithPostgresLoader.expected_initial_transaction_fpds[ + 0 + ].keys() new_detached_award_proc_unique = "award_procure_0004_trans_0001" new_unique_award_key = "award_procure_0004" baker_kwargs = { @@ -552,7 +658,9 @@ class TestTransactionFpds: "unique_award_key": new_unique_award_key.upper(), } - def _generate_transaction_fabs_fpds_core(self, spark, s3_data_bucket, expected_initial_transaction_fpds): + def _generate_transaction_fabs_fpds_core( + self, spark, s3_data_bucket, expected_initial_transaction_fpds + ): return _TransactionFabsFpdsCore( spark, s3_data_bucket, @@ -569,28 +677,46 @@ def _generate_transaction_fabs_fpds_core(self, spark, s3_data_bucket, expected_i @mark.django_db(transaction=True) def test_unexpected_paths_source_tables_only( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, ): transaction_fabs_fpds_core = self._generate_transaction_fabs_fpds_core( - spark, s3_unittest_data_bucket, _InitialRunWithPostgresLoader.expected_initial_transaction_fpds + spark, + s3_unittest_data_bucket, + _InitialRunWithPostgresLoader.expected_initial_transaction_fpds, ) transaction_fabs_fpds_core.unexpected_paths_source_tables_only_test_core() @mark.django_db(transaction=True) def test_unexpected_paths_no_pg_loader( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, ): transaction_fabs_fpds_core = self._generate_transaction_fabs_fpds_core( - spark, s3_unittest_data_bucket, InitialRunNoPostgresLoader.initial_transaction_fpds + spark, + s3_unittest_data_bucket, + InitialRunNoPostgresLoader.initial_transaction_fpds, ) transaction_fabs_fpds_core.unexpected_paths_no_pg_loader_test_core() @mark.django_db(transaction=True) def 
test_happy_paths_no_pg_loader( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, ): transaction_fabs_fpds_core = self._generate_transaction_fabs_fpds_core( - spark, s3_unittest_data_bucket, InitialRunNoPostgresLoader.initial_transaction_fpds + spark, + s3_unittest_data_bucket, + InitialRunNoPostgresLoader.initial_transaction_fpds, ) transaction_fabs_fpds_core.happy_paths_no_pg_loader_test_core( InitialRunNoPostgresLoader.initial_transaction_fpds, diff --git a/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_lookups.py b/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_lookups.py index 978cfa345d..b623129860 100644 --- a/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_lookups.py +++ b/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_lookups.py @@ -3,29 +3,41 @@ NOTE: Uses Pytest Fixtures from immediate parent conftest.py: usaspending_api/etl/tests/conftest.py """ -import dateutil import re -import pyspark - from copy import deepcopy from dataclasses import dataclass from datetime import datetime, timedelta, timezone -from django.db import connection +from typing import Any, Dict, Optional, Sequence +from unittest.mock import patch + +import dateutil +import pyspark from django.core.management import call_command +from django.db import connection from model_bakery import baker from pyspark.sql import SparkSession from pytest import mark, raises -from typing import Any, Dict, Optional, Sequence -from unittest.mock import patch -from usaspending_api.broker.helpers.last_load_date import get_last_load_date, update_last_load_date +from usaspending_api.broker.helpers.last_load_date import ( + get_last_load_date, + update_last_load_date, +) from usaspending_api.common.helpers.spark_helpers import load_dict_to_delta_table -from 
usaspending_api.etl.tests.integration.test_load_to_from_delta import load_delta_table_from_postgres, equal_datasets -from usaspending_api.transactions.delta_models.transaction_fabs import TRANSACTION_FABS_COLUMNS -from usaspending_api.transactions.delta_models.transaction_fpds import TRANSACTION_FPDS_COLUMNS -from usaspending_api.transactions.delta_models.transaction_normalized import TRANSACTION_NORMALIZED_COLUMNS from usaspending_api.config import CONFIG from usaspending_api.etl.management.commands.load_table_to_delta import TABLE_SPEC +from usaspending_api.etl.tests.integration.test_load_to_from_delta import ( + equal_datasets, + load_delta_table_from_postgres, +) +from usaspending_api.transactions.delta_models.transaction_fabs import ( + TRANSACTION_FABS_COLUMNS, +) +from usaspending_api.transactions.delta_models.transaction_fpds import ( + TRANSACTION_FPDS_COLUMNS, +) +from usaspending_api.transactions.delta_models.transaction_normalized import ( + TRANSACTION_NORMALIZED_COLUMNS, +) _BEGINNING_OF_TIME = datetime(1970, 1, 1, tzinfo=timezone.utc) _INITIAL_DATETIME = datetime(2022, 10, 31, tzinfo=timezone.utc) @@ -144,7 +156,9 @@ class _TableLoadInfo: overwrite: Optional[bool] = False -def _load_tables_to_delta(s3_data_bucket, load_source_tables=True, load_other_raw_tables=None): +def _load_tables_to_delta( + s3_data_bucket, load_source_tables=True, load_other_raw_tables=None +): if load_source_tables: load_delta_table_from_postgres("published_fabs", s3_data_bucket) load_delta_table_from_postgres("detached_award_procurement", s3_data_bucket) @@ -152,22 +166,42 @@ def _load_tables_to_delta(s3_data_bucket, load_source_tables=True, load_other_ra if load_other_raw_tables: for item in load_other_raw_tables: if isinstance(item, _TableLoadInfo): - load_dict_to_delta_table(item.spark, s3_data_bucket, "raw", item.table_name, item.data, item.overwrite) + load_dict_to_delta_table( + item.spark, + s3_data_bucket, + "raw", + item.table_name, + item.data, + item.overwrite, + ) 
else: load_delta_table_from_postgres(item, s3_data_bucket) class TestInitialRun: @staticmethod - def initial_run(s3_data_bucket, load_source_tables=True, load_other_raw_tables=None, initial_copy=True): + def initial_run( + s3_data_bucket, + load_source_tables=True, + load_other_raw_tables=None, + initial_copy=True, + ): _load_tables_to_delta(s3_data_bucket, load_source_tables, load_other_raw_tables) - call_params = ["load_transactions_in_delta", "--etl-level", "initial_run", "--spark-s3-bucket", s3_data_bucket] + call_params = [ + "load_transactions_in_delta", + "--etl-level", + "initial_run", + "--spark-s3-bucket", + s3_data_bucket, + ] if not initial_copy: call_params.append("--no-initial-copy") call_command(*call_params) @staticmethod - def verify_transaction_ids(spark, expected_transaction_id_lookup, expected_last_load=None): + def verify_transaction_ids( + spark, expected_transaction_id_lookup, expected_last_load=None + ): # Verify transaction_id_lookup table query = "SELECT * FROM int.transaction_id_lookup ORDER BY transaction_id" delta_data = [row.asDict() for row in spark.sql(query).collect()] @@ -180,7 +214,10 @@ def verify_transaction_ids(spark, expected_transaction_id_lookup, expected_last_ max_transaction_id = cursor.fetchone()[0] if expected_transaction_id_lookup: assert max_transaction_id == max( - [transaction["transaction_id"] for transaction in expected_transaction_id_lookup] + [ + transaction["transaction_id"] + for transaction in expected_transaction_id_lookup + ] ) else: assert max_transaction_id == 1 @@ -188,12 +225,16 @@ def verify_transaction_ids(spark, expected_transaction_id_lookup, expected_last_ # Since this test just called nextval(), need to reset the sequence with the is_called flag set to false # so that the next call to nextval() will return the same value. 
with connection.cursor() as cursor: - cursor.execute(f"SELECT setval('transaction_id_seq', {max_transaction_id}, false)") + cursor.execute( + f"SELECT setval('transaction_id_seq', {max_transaction_id}, false)" + ) @staticmethod def verify_award_ids(spark, expected_award_id_lookup, expected_last_load=None): # Verify award_id_lookup table - query = "SELECT * FROM int.award_id_lookup ORDER BY award_id, transaction_unique_id" + query = ( + "SELECT * FROM int.award_id_lookup ORDER BY award_id, transaction_unique_id" + ) delta_data = [row.asDict() for row in spark.sql(query).collect()] assert equal_datasets(expected_award_id_lookup, delta_data, "") @@ -203,7 +244,9 @@ def verify_award_ids(spark, expected_award_id_lookup, expected_last_load=None): # Since all calls to setval() set the is_called flag to false, nextval() returns the actual maximum id max_award_id = cursor.fetchone()[0] if expected_award_id_lookup: - assert max_award_id == max([award["award_id"] for award in expected_award_id_lookup]) + assert max_award_id == max( + [award["award_id"] for award in expected_award_id_lookup] + ) else: assert max_award_id == 1 @@ -221,9 +264,13 @@ def verify_lookup_info( expected_load_load_award_id_lookup=None, ): TestInitialRun.verify_transaction_ids( - spark, expected_transaction_id_lookup, expected_last_load_transaction_id_lookup + spark, + expected_transaction_id_lookup, + expected_last_load_transaction_id_lookup, + ) + TestInitialRun.verify_award_ids( + spark, expected_award_id_lookup, expected_load_load_award_id_lookup ) - TestInitialRun.verify_award_ids(spark, expected_award_id_lookup, expected_load_load_award_id_lookup) @staticmethod def verify_raw_vs_int_tables(spark, table_name, col_names): @@ -269,11 +316,13 @@ def verify( ) # int.award_ids_delete_modified should exist, but be empty - actual_count = spark.sql("SELECT COUNT(*) AS count from int.award_ids_delete_modified").collect()[0]["count"] + actual_count = spark.sql( + "SELECT COUNT(*) AS count from 
int.award_ids_delete_modified" + ).collect()[0]["count"] assert actual_count == 0 # Make sure int.transaction_[normalized,fabs,fpds] tables have been created and have the expected sizes. - for table_name, expected_count, expected_last_load, col_names in zip( + for table_name, expected_count, _expected_last_load, col_names in zip( (f"transaction_{t}" for t in ("normalized", "fabs", "fpds")), (expected_normalized_count, expected_fabs_count, expected_fpds_count), ( @@ -281,9 +330,16 @@ def verify( expected_last_load_transaction_fabs, expected_last_load_transaction_fpds, ), - (list(TRANSACTION_NORMALIZED_COLUMNS), TRANSACTION_FABS_COLUMNS, TRANSACTION_FPDS_COLUMNS), + ( + list(TRANSACTION_NORMALIZED_COLUMNS), + TRANSACTION_FABS_COLUMNS, + TRANSACTION_FPDS_COLUMNS, + ), + strict=False, ): - actual_count = spark.sql(f"SELECT COUNT(*) AS count from int.{table_name}").collect()[0]["count"] + actual_count = spark.sql( + f"SELECT COUNT(*) AS count from int.{table_name}" + ).collect()[0]["count"] assert actual_count == expected_count if expected_count > 0: @@ -300,16 +356,20 @@ def verify( else: raise e else: - TestInitialRun.verify_raw_vs_int_tables(spark, table_name, col_names) + TestInitialRun.verify_raw_vs_int_tables( + spark, table_name, col_names + ) @mark.django_db(transaction=True) - def test_edge_cases_using_only_source_tables(self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db): + def test_edge_cases_using_only_source_tables( + self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db + ): # Setup some source tables without data, this test does not require these tables to be populated raw_db = "raw" spark.sql(f"create database if not exists {raw_db};") spark.sql(f"use {raw_db};") spark.sql( - TABLE_SPEC["published_fabs"]["delta_table_create_sql"].format( + TABLE_SPEC["published_fabs"].delta_table_create_sql.format( DESTINATION_TABLE="published_fabs", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -317,7 +377,7 @@ def 
test_edge_cases_using_only_source_tables(self, spark, s3_unittest_data_bucke ) ) spark.sql( - TABLE_SPEC["detached_award_procurement"]["delta_table_create_sql"].format( + TABLE_SPEC["detached_award_procurement"].delta_table_create_sql.format( DESTINATION_TABLE="detached_award_procurement", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -349,16 +409,23 @@ class _InitialRunWithPostgresLoader: { "transaction_id": id, "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[id - 1]["afa_generated_unique"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[id - 1][ + "afa_generated_unique" + ].upper(), } for id in range(1, len(_INITIAL_ASSISTS) + 1) ] + [ { "transaction_id": id, "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[id - 6]["detached_award_proc_unique"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[id - 6][ + "detached_award_proc_unique" + ].upper(), } - for id in range(len(_INITIAL_ASSISTS) + 1, len(_INITIAL_ASSISTS) + len(_INITIAL_PROCURES) + 1) + for id in range( + len(_INITIAL_ASSISTS) + 1, + len(_INITIAL_ASSISTS) + len(_INITIAL_PROCURES) + 1, + ) ] expected_initial_award_id_lookup = [ @@ -373,7 +440,12 @@ class _InitialRunWithPostgresLoader: { "award_id": ( int(procure["unique_award_key"].split("_")[-1]) - + max([int(assist["unique_award_key"].split("_")[-1]) for assist in _INITIAL_ASSISTS]) + + max( + [ + int(assist["unique_award_key"].split("_")[-1]) + for assist in _INITIAL_ASSISTS + ] + ) ), "is_fpds": True, "transaction_unique_id": procure["detached_award_proc_unique"].upper(), @@ -385,7 +457,9 @@ class _InitialRunWithPostgresLoader: expected_initial_transaction_fabs = [ { **assist, - "action_date": dateutil.parser.parse(assist["action_date"]).date().isoformat(), + "action_date": dateutil.parser.parse(assist["action_date"]) + .date() + .isoformat(), "afa_generated_unique": assist["afa_generated_unique"].upper(), "transaction_id": assist["published_fabs_id"], "unique_award_key": 
assist["unique_award_key"].upper(), @@ -396,9 +470,12 @@ class _InitialRunWithPostgresLoader: expected_initial_transaction_fpds = [ { **procure, - "action_date": dateutil.parser.parse(procure["action_date"]).date().isoformat(), + "action_date": dateutil.parser.parse(procure["action_date"]) + .date() + .isoformat(), "detached_award_proc_unique": procure["detached_award_proc_unique"].upper(), - "transaction_id": procure["detached_award_procurement_id"] + len(_INITIAL_ASSISTS), + "transaction_id": procure["detached_award_procurement_id"] + + len(_INITIAL_ASSISTS), "unique_award_key": procure["unique_award_key"].upper(), } for procure in _INITIAL_PROCURES @@ -410,52 +487,72 @@ class TestInitialRunNoPostgresLoader: { "transaction_id": 1, "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[0]["afa_generated_unique"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[0][ + "afa_generated_unique" + ].upper(), }, { "transaction_id": 2, "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[0]["detached_award_proc_unique"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[0][ + "detached_award_proc_unique" + ].upper(), }, { "transaction_id": 3, "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[1]["afa_generated_unique"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[1][ + "afa_generated_unique" + ].upper(), }, { "transaction_id": 4, "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[1]["detached_award_proc_unique"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[1][ + "detached_award_proc_unique" + ].upper(), }, { "transaction_id": 5, "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[2]["afa_generated_unique"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[2][ + "afa_generated_unique" + ].upper(), }, { "transaction_id": 6, "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[2]["detached_award_proc_unique"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[2][ + 
"detached_award_proc_unique" + ].upper(), }, { "transaction_id": 7, "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[3]["afa_generated_unique"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[3][ + "afa_generated_unique" + ].upper(), }, { "transaction_id": 8, "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[4]["afa_generated_unique"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[4][ + "afa_generated_unique" + ].upper(), }, { "transaction_id": 9, "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[3]["detached_award_proc_unique"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[3][ + "detached_award_proc_unique" + ].upper(), }, { "transaction_id": 10, "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[4]["detached_award_proc_unique"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[4][ + "detached_award_proc_unique" + ].upper(), }, ] @@ -463,62 +560,102 @@ class TestInitialRunNoPostgresLoader: { "award_id": 1, "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[0]["afa_generated_unique"].upper(), - "generated_unique_award_id": _INITIAL_ASSISTS[0]["unique_award_key"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[0][ + "afa_generated_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_ASSISTS[0][ + "unique_award_key" + ].upper(), }, { "award_id": 2, "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[1]["afa_generated_unique"].upper(), - "generated_unique_award_id": _INITIAL_ASSISTS[1]["unique_award_key"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[1][ + "afa_generated_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_ASSISTS[1][ + "unique_award_key" + ].upper(), }, { "award_id": 2, "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[2]["afa_generated_unique"].upper(), - "generated_unique_award_id": _INITIAL_ASSISTS[2]["unique_award_key"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[2][ + "afa_generated_unique" + ].upper(), + 
"generated_unique_award_id": _INITIAL_ASSISTS[2][ + "unique_award_key" + ].upper(), }, { "award_id": 3, "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[0]["detached_award_proc_unique"].upper(), - "generated_unique_award_id": _INITIAL_PROCURES[0]["unique_award_key"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[0][ + "detached_award_proc_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_PROCURES[0][ + "unique_award_key" + ].upper(), }, { "award_id": 4, "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[1]["detached_award_proc_unique"].upper(), - "generated_unique_award_id": _INITIAL_PROCURES[1]["unique_award_key"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[1][ + "detached_award_proc_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_PROCURES[1][ + "unique_award_key" + ].upper(), }, { "award_id": 4, "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[2]["detached_award_proc_unique"].upper(), - "generated_unique_award_id": _INITIAL_PROCURES[2]["unique_award_key"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[2][ + "detached_award_proc_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_PROCURES[2][ + "unique_award_key" + ].upper(), }, { "award_id": 5, "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[3]["afa_generated_unique"].upper(), - "generated_unique_award_id": _INITIAL_ASSISTS[3]["unique_award_key"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[3][ + "afa_generated_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_ASSISTS[3][ + "unique_award_key" + ].upper(), }, { "award_id": 5, "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[4]["afa_generated_unique"].upper(), - "generated_unique_award_id": _INITIAL_ASSISTS[4]["unique_award_key"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[4][ + "afa_generated_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_ASSISTS[4][ + "unique_award_key" + ].upper(), }, { "award_id": 6, 
"is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[3]["detached_award_proc_unique"].upper(), - "generated_unique_award_id": _INITIAL_PROCURES[3]["unique_award_key"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[3][ + "detached_award_proc_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_PROCURES[3][ + "unique_award_key" + ].upper(), }, { "award_id": 6, "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[4]["detached_award_proc_unique"].upper(), - "generated_unique_award_id": _INITIAL_PROCURES[4]["unique_award_key"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[4][ + "detached_award_proc_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_PROCURES[4][ + "unique_award_key" + ].upper(), }, ] @@ -528,49 +665,73 @@ class TestInitialRunNoPostgresLoader: { "id": 1, "update_date": initial_award_trans_norm_update_create_date, - "generated_unique_award_id": _INITIAL_ASSISTS[0]["unique_award_key"].upper(), + "generated_unique_award_id": _INITIAL_ASSISTS[0][ + "unique_award_key" + ].upper(), "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[0]["afa_generated_unique"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[0][ + "afa_generated_unique" + ].upper(), "subaward_count": 0, }, { "id": 2, "update_date": initial_award_trans_norm_update_create_date, - "generated_unique_award_id": _INITIAL_ASSISTS[1]["unique_award_key"].upper(), + "generated_unique_award_id": _INITIAL_ASSISTS[1][ + "unique_award_key" + ].upper(), "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[1]["afa_generated_unique"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[1][ + "afa_generated_unique" + ].upper(), "subaward_count": 0, }, { "id": 3, "update_date": initial_award_trans_norm_update_create_date, - "generated_unique_award_id": _INITIAL_PROCURES[0]["unique_award_key"].upper(), + "generated_unique_award_id": _INITIAL_PROCURES[0][ + "unique_award_key" + ].upper(), "is_fpds": True, - "transaction_unique_id": 
_INITIAL_PROCURES[0]["detached_award_proc_unique"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[0][ + "detached_award_proc_unique" + ].upper(), "subaward_count": 0, }, { "id": 4, "update_date": initial_award_trans_norm_update_create_date, - "generated_unique_award_id": _INITIAL_PROCURES[1]["unique_award_key"].upper(), + "generated_unique_award_id": _INITIAL_PROCURES[1][ + "unique_award_key" + ].upper(), "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[1]["detached_award_proc_unique"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[1][ + "detached_award_proc_unique" + ].upper(), "subaward_count": 0, }, { "id": 5, "update_date": initial_award_trans_norm_update_create_date, - "generated_unique_award_id": _INITIAL_ASSISTS[3]["unique_award_key"].upper(), + "generated_unique_award_id": _INITIAL_ASSISTS[3][ + "unique_award_key" + ].upper(), "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[3]["afa_generated_unique"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[3][ + "afa_generated_unique" + ].upper(), "subaward_count": 0, }, { "id": 6, "update_date": initial_award_trans_norm_update_create_date, - "generated_unique_award_id": _INITIAL_PROCURES[3]["unique_award_key"].upper(), + "generated_unique_award_id": _INITIAL_PROCURES[3][ + "unique_award_key" + ].upper(), "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[3]["detached_award_proc_unique"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[3][ + "detached_award_proc_unique" + ].upper(), "subaward_count": 0, }, ] @@ -580,9 +741,13 @@ class TestInitialRunNoPostgresLoader: "id": 1, "award_id": 1, "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_ASSISTS[0]["action_date"]).date(), + "action_date": dateutil.parser.parse( + _INITIAL_ASSISTS[0]["action_date"] + ).date(), "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_ASSISTS[0]["afa_generated_unique"].upper(), + "transaction_unique_id": 
_INITIAL_ASSISTS[0][ + "afa_generated_unique" + ].upper(), "update_date": initial_award_trans_norm_update_create_date, "is_fpds": False, "unique_award_key": _INITIAL_ASSISTS[0]["unique_award_key"].upper(), @@ -591,9 +756,13 @@ class TestInitialRunNoPostgresLoader: "id": 2, "award_id": 3, "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_PROCURES[0]["action_date"]).date(), + "action_date": dateutil.parser.parse( + _INITIAL_PROCURES[0]["action_date"] + ).date(), "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_PROCURES[0]["detached_award_proc_unique"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[0][ + "detached_award_proc_unique" + ].upper(), "update_date": initial_award_trans_norm_update_create_date, "is_fpds": True, "unique_award_key": _INITIAL_PROCURES[0]["unique_award_key"].upper(), @@ -602,9 +771,13 @@ class TestInitialRunNoPostgresLoader: "id": 3, "award_id": 2, "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_ASSISTS[1]["action_date"]).date(), + "action_date": dateutil.parser.parse( + _INITIAL_ASSISTS[1]["action_date"] + ).date(), "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_ASSISTS[1]["afa_generated_unique"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[1][ + "afa_generated_unique" + ].upper(), "update_date": initial_award_trans_norm_update_create_date, "is_fpds": False, "unique_award_key": _INITIAL_ASSISTS[1]["unique_award_key"].upper(), @@ -613,9 +786,13 @@ class TestInitialRunNoPostgresLoader: "id": 4, "award_id": 4, "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_PROCURES[1]["action_date"]).date(), + "action_date": dateutil.parser.parse( + _INITIAL_PROCURES[1]["action_date"] + ).date(), "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_PROCURES[1]["detached_award_proc_unique"].upper(), + "transaction_unique_id": 
_INITIAL_PROCURES[1][ + "detached_award_proc_unique" + ].upper(), "update_date": initial_award_trans_norm_update_create_date, "is_fpds": True, "unique_award_key": _INITIAL_PROCURES[1]["unique_award_key"].upper(), @@ -624,9 +801,13 @@ class TestInitialRunNoPostgresLoader: "id": 5, "award_id": 2, "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_ASSISTS[2]["action_date"]).date(), + "action_date": dateutil.parser.parse( + _INITIAL_ASSISTS[2]["action_date"] + ).date(), "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_ASSISTS[2]["afa_generated_unique"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[2][ + "afa_generated_unique" + ].upper(), "update_date": initial_award_trans_norm_update_create_date, "is_fpds": False, "unique_award_key": _INITIAL_ASSISTS[2]["unique_award_key"].upper(), @@ -635,9 +816,13 @@ class TestInitialRunNoPostgresLoader: "id": 6, "award_id": 4, "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_PROCURES[2]["action_date"]).date(), + "action_date": dateutil.parser.parse( + _INITIAL_PROCURES[2]["action_date"] + ).date(), "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_PROCURES[2]["detached_award_proc_unique"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[2][ + "detached_award_proc_unique" + ].upper(), "update_date": initial_award_trans_norm_update_create_date, "is_fpds": True, "unique_award_key": _INITIAL_PROCURES[2]["unique_award_key"].upper(), @@ -646,9 +831,13 @@ class TestInitialRunNoPostgresLoader: "id": 7, "award_id": 5, "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_ASSISTS[3]["action_date"]).date(), + "action_date": dateutil.parser.parse( + _INITIAL_ASSISTS[3]["action_date"] + ).date(), "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_ASSISTS[3]["afa_generated_unique"].upper(), + "transaction_unique_id": 
_INITIAL_ASSISTS[3][ + "afa_generated_unique" + ].upper(), "update_date": initial_award_trans_norm_update_create_date, "is_fpds": False, "unique_award_key": _INITIAL_ASSISTS[3]["unique_award_key"].upper(), @@ -657,9 +846,13 @@ class TestInitialRunNoPostgresLoader: "id": 8, "award_id": 5, "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_ASSISTS[4]["action_date"]).date(), + "action_date": dateutil.parser.parse( + _INITIAL_ASSISTS[4]["action_date"] + ).date(), "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_ASSISTS[4]["afa_generated_unique"].upper(), + "transaction_unique_id": _INITIAL_ASSISTS[4][ + "afa_generated_unique" + ].upper(), "update_date": initial_award_trans_norm_update_create_date, "is_fpds": False, "unique_award_key": _INITIAL_ASSISTS[4]["unique_award_key"].upper(), @@ -668,9 +861,13 @@ class TestInitialRunNoPostgresLoader: "id": 9, "award_id": 6, "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_PROCURES[3]["action_date"]).date(), + "action_date": dateutil.parser.parse( + _INITIAL_PROCURES[3]["action_date"] + ).date(), "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_PROCURES[3]["detached_award_proc_unique"].upper(), + "transaction_unique_id": _INITIAL_PROCURES[3][ + "detached_award_proc_unique" + ].upper(), "update_date": initial_award_trans_norm_update_create_date, "is_fpds": True, "unique_award_key": _INITIAL_PROCURES[3]["unique_award_key"].upper(), @@ -679,9 +876,13 @@ class TestInitialRunNoPostgresLoader: "id": 10, "award_id": 6, "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_PROCURES[3]["action_date"]).date(), + "action_date": dateutil.parser.parse( + _INITIAL_PROCURES[3]["action_date"] + ).date(), "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_PROCURES[4]["detached_award_proc_unique"].upper(), + "transaction_unique_id": 
_INITIAL_PROCURES[4][ + "detached_award_proc_unique" + ].upper(), "update_date": initial_award_trans_norm_update_create_date, "is_fpds": True, "unique_award_key": _INITIAL_PROCURES[4]["unique_award_key"].upper(), @@ -691,7 +892,9 @@ class TestInitialRunNoPostgresLoader: initial_transaction_fabs = [ { **assist, - "action_date": dateutil.parser.parse(assist["action_date"]).date().isoformat(), + "action_date": dateutil.parser.parse(assist["action_date"]) + .date() + .isoformat(), "afa_generated_unique": assist["afa_generated_unique"].upper(), "transaction_id": (assist["published_fabs_id"] - 1) * 2 + 1, "unique_award_key": assist["unique_award_key"].upper(), @@ -700,7 +903,9 @@ class TestInitialRunNoPostgresLoader: ] + [ { **_INITIAL_ASSISTS[4], - "action_date": dateutil.parser.parse(_INITIAL_ASSISTS[4]["action_date"]).date().isoformat(), + "action_date": dateutil.parser.parse(_INITIAL_ASSISTS[4]["action_date"]) + .date() + .isoformat(), "afa_generated_unique": _INITIAL_ASSISTS[4]["afa_generated_unique"].upper(), "transaction_id": 8, "unique_award_key": _INITIAL_ASSISTS[4]["unique_award_key"].upper(), @@ -710,7 +915,9 @@ class TestInitialRunNoPostgresLoader: initial_transaction_fpds = [ { **procure, - "action_date": dateutil.parser.parse(procure["action_date"]).date().isoformat(), + "action_date": dateutil.parser.parse(procure["action_date"]) + .date() + .isoformat(), "detached_award_proc_unique": procure["detached_award_proc_unique"].upper(), "transaction_id": procure["detached_award_procurement_id"] * 2, "unique_award_key": procure["unique_award_key"].upper(), @@ -719,15 +926,23 @@ class TestInitialRunNoPostgresLoader: ] + [ { **_INITIAL_PROCURES[3], - "action_date": dateutil.parser.parse(_INITIAL_PROCURES[3]["action_date"]).date().isoformat(), - "detached_award_proc_unique": _INITIAL_PROCURES[3]["detached_award_proc_unique"].upper(), + "action_date": dateutil.parser.parse(_INITIAL_PROCURES[3]["action_date"]) + .date() + .isoformat(), + "detached_award_proc_unique": 
_INITIAL_PROCURES[3][ + "detached_award_proc_unique" + ].upper(), "transaction_id": 9, "unique_award_key": _INITIAL_PROCURES[3]["unique_award_key"].upper(), }, { **_INITIAL_PROCURES[4], - "action_date": dateutil.parser.parse(_INITIAL_PROCURES[4]["action_date"]).date().isoformat(), - "detached_award_proc_unique": _INITIAL_PROCURES[4]["detached_award_proc_unique"].upper(), + "action_date": dateutil.parser.parse(_INITIAL_PROCURES[4]["action_date"]) + .date() + .isoformat(), + "detached_award_proc_unique": _INITIAL_PROCURES[4][ + "detached_award_proc_unique" + ].upper(), "transaction_id": 10, "unique_award_key": _INITIAL_PROCURES[4]["unique_award_key"].upper(), }, @@ -736,7 +951,9 @@ class TestInitialRunNoPostgresLoader: # This test will only load the source tables from postgres, and NOT use the Postgres transaction loader # to populate any other Delta tables, so can only test for NULLs originating in Delta. @mark.django_db(transaction=True) - @patch("usaspending_api.etl.management.commands.load_transactions_in_delta.Command._insert_orphaned_transactions") + @patch( + "usaspending_api.etl.management.commands.load_transactions_in_delta.Command._insert_orphaned_transactions" + ) def test_nulls_in_trans_norm_unique_award_key_from_delta( self, orphaned_txns_patch, @@ -749,7 +966,7 @@ def test_nulls_in_trans_norm_unique_award_key_from_delta( spark.sql(f"create database if not exists {raw_db};") spark.sql(f"use {raw_db};") spark.sql( - TABLE_SPEC["published_fabs"]["delta_table_create_sql"].format( + TABLE_SPEC["published_fabs"].delta_table_create_sql.format( DESTINATION_TABLE="published_fabs", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -757,7 +974,7 @@ def test_nulls_in_trans_norm_unique_award_key_from_delta( ) ) spark.sql( - TABLE_SPEC["detached_award_procurement"]["delta_table_create_sql"].format( + TABLE_SPEC["detached_award_procurement"].delta_table_create_sql.format( DESTINATION_TABLE="detached_award_procurement", 
DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -765,7 +982,7 @@ def test_nulls_in_trans_norm_unique_award_key_from_delta( ) ) spark.sql( - TABLE_SPEC["transaction_normalized"]["delta_table_create_sql"].format( + TABLE_SPEC["transaction_normalized"].delta_table_create_sql.format( DESTINATION_TABLE="transaction_normalized", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -808,9 +1025,16 @@ def test_nulls_in_trans_norm_unique_award_key_from_delta( """ ) - with raises(ValueError, match="Found 1 NULL in 'unique_award_key' in table raw.transaction_normalized!"): + with raises( + ValueError, + match="Found 1 NULL in 'unique_award_key' in table raw.transaction_normalized!", + ): call_command( - "load_transactions_in_delta", "--etl-level", "initial_run", "--spark-s3-bucket", s3_unittest_data_bucket + "load_transactions_in_delta", + "--etl-level", + "initial_run", + "--spark-s3-bucket", + s3_unittest_data_bucket, ) spark.sql( @@ -849,19 +1073,32 @@ def test_nulls_in_trans_norm_unique_award_key_from_delta( """ ) - with raises(ValueError, match="Found 2 NULLs in 'unique_award_key' in table raw.transaction_normalized!"): + with raises( + ValueError, + match="Found 2 NULLs in 'unique_award_key' in table raw.transaction_normalized!", + ): call_command( - "load_transactions_in_delta", "--etl-level", "initial_run", "--spark-s3-bucket", s3_unittest_data_bucket + "load_transactions_in_delta", + "--etl-level", + "initial_run", + "--spark-s3-bucket", + s3_unittest_data_bucket, ) @mark.django_db(transaction=True) def test_happy_path_scenarios( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, ): # Since we're not using the Postgres transaction loader, load raw.transaction_normalized and raw.awards # from expected data when making initial run load_other_raw_tables = [ - 
_TableLoadInfo(spark, "transaction_normalized", self.initial_transaction_normalized), + _TableLoadInfo( + spark, "transaction_normalized", self.initial_transaction_normalized + ), _TableLoadInfo(spark, "awards", self.initial_awards), ] # Setup some source tables with data, without loading these Delta Tables from Postgres @@ -870,7 +1107,7 @@ def test_happy_path_scenarios( spark.sql(f"create database if not exists {raw_db};") spark.sql(f"use {raw_db};") spark.sql( - TABLE_SPEC["published_fabs"]["delta_table_create_sql"].format( + TABLE_SPEC["published_fabs"].delta_table_create_sql.format( DESTINATION_TABLE="published_fabs", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -878,7 +1115,7 @@ def test_happy_path_scenarios( ) ) spark.sql( - TABLE_SPEC["detached_award_procurement"]["delta_table_create_sql"].format( + TABLE_SPEC["detached_award_procurement"].delta_table_create_sql.format( DESTINATION_TABLE="detached_award_procurement", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -915,7 +1152,10 @@ def test_happy_path_scenarios( "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, } TestInitialRun.verify( - spark, self.expected_initial_transaction_id_lookup, self.expected_initial_award_id_lookup, **kwargs + spark, + self.expected_initial_transaction_id_lookup, + self.expected_initial_award_id_lookup, + **kwargs, ) # 2. Call initial_run with initial-copy, and have all raw tables populated @@ -927,10 +1167,18 @@ def test_happy_path_scenarios( _TableLoadInfo(spark, "transaction_fpds", self.initial_transaction_fpds), ] # Don't call Postgres loader or re-load the source tables, though. 
- TestInitialRun.initial_run(s3_unittest_data_bucket, False, load_other_raw_tables) - kwargs["expected_last_load_transaction_normalized"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME - kwargs["expected_last_load_transaction_fabs"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME - kwargs["expected_last_load_transaction_fpds"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME + TestInitialRun.initial_run( + s3_unittest_data_bucket, False, load_other_raw_tables + ) + kwargs["expected_last_load_transaction_normalized"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) + kwargs["expected_last_load_transaction_fabs"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) + kwargs["expected_last_load_transaction_fpds"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) TestInitialRun.verify( spark, self.expected_initial_transaction_id_lookup, @@ -945,7 +1193,11 @@ def test_happy_path_scenarios( class TestTransactionIdLookup: @mark.django_db(transaction=True) def test_unexpected_paths( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, ): # Setup some source tables with data, without loading these Delta Tables from Postgres # for efficiency reasons. 
@@ -953,7 +1205,7 @@ def test_unexpected_paths( spark.sql(f"create database if not exists {raw_db};") spark.sql(f"use {raw_db};") spark.sql( - TABLE_SPEC["published_fabs"]["delta_table_create_sql"].format( + TABLE_SPEC["published_fabs"].delta_table_create_sql.format( DESTINATION_TABLE="published_fabs", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -961,7 +1213,7 @@ def test_unexpected_paths( ) ) spark.sql( - TABLE_SPEC["detached_award_procurement"]["delta_table_create_sql"].format( + TABLE_SPEC["detached_award_procurement"].delta_table_create_sql.format( DESTINATION_TABLE="detached_award_procurement", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -991,7 +1243,7 @@ def test_unexpected_paths( # First, create blank raw.transaction_normalized and raw.awards tables spark.sql( - TABLE_SPEC["transaction_normalized"]["delta_table_create_sql"].format( + TABLE_SPEC["transaction_normalized"].delta_table_create_sql.format( DESTINATION_TABLE="transaction_normalized", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -999,7 +1251,7 @@ def test_unexpected_paths( ) ) spark.sql( - TABLE_SPEC["awards"]["delta_table_create_sql"].format( + TABLE_SPEC["awards"].delta_table_create_sql.format( DESTINATION_TABLE="awards", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -1009,7 +1261,9 @@ def test_unexpected_paths( # Then, call load_transactions_in_delta with etl-level of initial_run and verify. # Don't reload the source tables, and don't do initial copy of transaction tables, though. 
- TestInitialRun.initial_run(s3_unittest_data_bucket, load_source_tables=False, initial_copy=False) + TestInitialRun.initial_run( + s3_unittest_data_bucket, load_source_tables=False, initial_copy=False + ) kwargs = { "expected_last_load_transaction_id_lookup": _BEGINNING_OF_TIME, "expected_last_load_award_id_lookup": _BEGINNING_OF_TIME, @@ -1020,16 +1274,22 @@ def test_unexpected_paths( TestInitialRun.verify(spark, [], [], **kwargs) # Then, call load_transactions_in_delta with etl-level of transaction_id_lookup. - call_command("load_transactions_in_delta", "--etl-level", "transaction_id_lookup") + call_command( + "load_transactions_in_delta", "--etl-level", "transaction_id_lookup" + ) # The expected transaction_id_lookup table should be the same as in _InitialRunWithPostgresLoader, # but all of the transaction ids should be 1 larger than expected there. - expected_transaction_id_lookup = deepcopy(_InitialRunWithPostgresLoader.expected_initial_transaction_id_lookup) + expected_transaction_id_lookup = deepcopy( + _InitialRunWithPostgresLoader.expected_initial_transaction_id_lookup + ) for item in expected_transaction_id_lookup: item["transaction_id"] += 1 # Also, the last load date for the transaction_id_lookup table should be updated to the date of the # initial loads. 
- kwargs["expected_last_load_transaction_id_lookup"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME + kwargs["expected_last_load_transaction_id_lookup"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) TestInitialRun.verify(spark, expected_transaction_id_lookup, [], **kwargs) @staticmethod @@ -1047,7 +1307,7 @@ def _happy_path_test_core( spark.sql(f"create database if not exists {raw_db};") spark.sql(f"use {raw_db};") spark.sql( - TABLE_SPEC["published_fabs"]["delta_table_create_sql"].format( + TABLE_SPEC["published_fabs"].delta_table_create_sql.format( DESTINATION_TABLE="published_fabs", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_data_bucket, @@ -1055,7 +1315,7 @@ def _happy_path_test_core( ) ) spark.sql( - TABLE_SPEC["detached_award_procurement"]["delta_table_create_sql"].format( + TABLE_SPEC["detached_award_procurement"].delta_table_create_sql.format( DESTINATION_TABLE="detached_award_procurement", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_data_bucket, @@ -1080,7 +1340,10 @@ def _happy_path_test_core( ) # Trigger initial run of load transactions in delta. This step is required as it creates various data sources. TestInitialRun.initial_run( - s3_data_bucket, load_source_tables=False, load_other_raw_tables=load_other_raw_tables, initial_copy=False + s3_data_bucket, + load_source_tables=False, + load_other_raw_tables=load_other_raw_tables, + initial_copy=False, ) # 1. 
Test deleting the transaction(s) with the last transaction ID(s) from the appropriate raw table, @@ -1094,13 +1357,17 @@ def _happy_path_test_core( WHERE detached_award_procurement_id = 4 OR detached_award_procurement_id = 5 """ ) - call_command("load_transactions_in_delta", "--etl-level", "transaction_id_lookup") + call_command( + "load_transactions_in_delta", "--etl-level", "transaction_id_lookup" + ) # Verify transaction_id_lookup table query = "SELECT * FROM int.transaction_id_lookup ORDER BY transaction_id" delta_data = [row.asDict() for row in spark.sql(query).collect()] - expected_transaction_id_lookup = deepcopy(expected_initial_transaction_id_lookup) + expected_transaction_id_lookup = deepcopy( + expected_initial_transaction_id_lookup + ) expected_transaction_id_lookup.pop() expected_transaction_id_lookup.pop() assert equal_datasets(expected_transaction_id_lookup, delta_data, "") @@ -1115,7 +1382,9 @@ def _happy_path_test_core( # Since this test just called nextval(), need to reset the sequence with the is_called flag set to false # so that the next call to nextval() will return the same value as previously. with connection.cursor() as cursor: - cursor.execute(f"SELECT setval('transaction_id_seq', {max_transaction_id}, false)") + cursor.execute( + f"SELECT setval('transaction_id_seq', {max_transaction_id}, false)" + ) # 3. Test for a single inserted transaction, and another call to load_transaction_in_delta with etl-level of # transaction_id_lookup. 
@@ -1126,18 +1395,28 @@ def _happy_path_test_core( insert_datetime = last_assist_load_datetime + timedelta(minutes=-15) assist = deepcopy(_NEW_ASSIST) assist.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceAssistanceTransaction", **assist) - update_last_load_date("source_assistance_transaction", last_assist_load_datetime) + update_last_load_date( + "source_assistance_transaction", last_assist_load_datetime + ) load_delta_table_from_postgres("published_fabs", s3_data_bucket) - call_command("load_transactions_in_delta", "--etl-level", "transaction_id_lookup") + call_command( + "load_transactions_in_delta", "--etl-level", "transaction_id_lookup" + ) # Verify transaction_id_lookup table query = "SELECT * FROM int.transaction_id_lookup ORDER BY transaction_id" delta_data = [row.asDict() for row in spark.sql(query).collect()] - expected_transaction_id_lookup = deepcopy(expected_initial_transaction_id_lookup) + expected_transaction_id_lookup = deepcopy( + expected_initial_transaction_id_lookup + ) expected_transaction_id_lookup.pop() expected_transaction_id_lookup.pop() @@ -1160,19 +1439,28 @@ def _happy_path_test_core( "expected_last_load_transaction_fabs": _BEGINNING_OF_TIME, "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, } - TestInitialRun.verify(spark, expected_transaction_id_lookup, expected_initial_award_id_lookup, **kwargs) + TestInitialRun.verify( + spark, + expected_transaction_id_lookup, + expected_initial_award_id_lookup, + **kwargs, + ) # Also, make sure transaction_id_seq hasn't gone backwards with connection.cursor() as cursor: cursor.execute("SELECT nextval('transaction_id_seq')") # Since all calls to setval() set the is_called flag to false, nextval() returns the actual maximum id max_transaction_id = cursor.fetchone()[0] - 
assert max_transaction_id == (len(_INITIAL_ASSISTS) + len(_INITIAL_PROCURES) + 1) # Add one for the insert + assert max_transaction_id == ( + len(_INITIAL_ASSISTS) + len(_INITIAL_PROCURES) + 1 + ) # Add one for the insert # Since this test just called nextval(), need to reset the sequence with the is_called flag set to false # so that the next call to nextval() will return the same value as previously. with connection.cursor() as cursor: - cursor.execute(f"SELECT setval('transaction_id_seq', {max_transaction_id}, false)") + cursor.execute( + f"SELECT setval('transaction_id_seq', {max_transaction_id}, false)" + ) # 3. Make inserts to and deletes from the raw tables, call load_transaction_in_delta with etl-level of # transaction_id_lookup, and test that the results are as expected. @@ -1180,10 +1468,16 @@ def _happy_path_test_core( insert_datetime = last_procure_load_datetime + timedelta(minutes=-15) procure = deepcopy(_NEW_PROCURE) procure.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceProcurementTransaction", **procure) - update_last_load_date("source_procurement_transaction", last_procure_load_datetime) + update_last_load_date( + "source_procurement_transaction", last_procure_load_datetime + ) load_delta_table_from_postgres("detached_award_procurement", s3_data_bucket) spark.sql( @@ -1199,7 +1493,9 @@ def _happy_path_test_core( """ ) - call_command("load_transactions_in_delta", "--etl-level", "transaction_id_lookup") + call_command( + "load_transactions_in_delta", "--etl-level", "transaction_id_lookup" + ) # Verify transaction_id_lookup table query = "SELECT * FROM int.transaction_id_lookup ORDER BY transaction_id" @@ -1211,7 +1507,9 @@ def _happy_path_test_core( { "transaction_id": 12, "is_fpds": True, - "transaction_unique_id": 
_NEW_PROCURE["detached_award_proc_unique"].upper(), + "transaction_unique_id": _NEW_PROCURE[ + "detached_award_proc_unique" + ].upper(), } ) assert equal_datasets(expected_transaction_id_lookup, delta_data, "") @@ -1220,15 +1518,23 @@ def _happy_path_test_core( @mark.django_db(transaction=True) def test_happy_path_scenarios_no_pg_loader( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, ): # Since we're not using the Postgres transaction loader, load raw.transaction_normalized and raw.awards # from expected data when making initial run load_other_raw_tables = [ _TableLoadInfo( - spark, "transaction_normalized", TestInitialRunNoPostgresLoader.initial_transaction_normalized + spark, + "transaction_normalized", + TestInitialRunNoPostgresLoader.initial_transaction_normalized, + ), + _TableLoadInfo( + spark, "awards", TestInitialRunNoPostgresLoader.initial_awards ), - _TableLoadInfo(spark, "awards", TestInitialRunNoPostgresLoader.initial_awards), ] self._happy_path_test_core( @@ -1244,7 +1550,11 @@ def test_happy_path_scenarios_no_pg_loader( class TestAwardIdLookup: @mark.django_db(transaction=True) def test_unexpected_paths( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, ): # First, setup some source tables with data, without loading these Delta Tables from Postgres # for efficiency reasons. 
@@ -1252,7 +1562,7 @@ def test_unexpected_paths( spark.sql(f"create database if not exists {raw_db};") spark.sql(f"use {raw_db};") spark.sql( - TABLE_SPEC["published_fabs"]["delta_table_create_sql"].format( + TABLE_SPEC["published_fabs"].delta_table_create_sql.format( DESTINATION_TABLE="published_fabs", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -1260,7 +1570,7 @@ def test_unexpected_paths( ) ) spark.sql( - TABLE_SPEC["detached_award_procurement"]["delta_table_create_sql"].format( + TABLE_SPEC["detached_award_procurement"].delta_table_create_sql.format( DESTINATION_TABLE="detached_award_procurement", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -1290,7 +1600,7 @@ def test_unexpected_paths( # First, create blank raw.transaction_normalized and raw.awards tables spark.sql( - TABLE_SPEC["transaction_normalized"]["delta_table_create_sql"].format( + TABLE_SPEC["transaction_normalized"].delta_table_create_sql.format( DESTINATION_TABLE="transaction_normalized", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -1298,7 +1608,7 @@ def test_unexpected_paths( ) ) spark.sql( - TABLE_SPEC["awards"]["delta_table_create_sql"].format( + TABLE_SPEC["awards"].delta_table_create_sql.format( DESTINATION_TABLE="awards", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_unittest_data_bucket, @@ -1308,7 +1618,9 @@ def test_unexpected_paths( # Then, call load_transactions_in_delta with etl-level of initial_run and verify. # Don't reload the source tables, and don't do initial copy of transaction tables, though. 
- TestInitialRun.initial_run(s3_unittest_data_bucket, load_source_tables=False, initial_copy=False) + TestInitialRun.initial_run( + s3_unittest_data_bucket, load_source_tables=False, initial_copy=False + ) kwargs = { "expected_last_load_transaction_id_lookup": _BEGINNING_OF_TIME, "expected_last_load_award_id_lookup": _BEGINNING_OF_TIME, @@ -1323,11 +1635,15 @@ def test_unexpected_paths( # The expected award_id_lookup table should be the same as in TestInitialRunWithPostgresLoader, # but all of the award ids should be 1 larger than expected there. - expected_award_id_lookup = deepcopy(_InitialRunWithPostgresLoader.expected_initial_award_id_lookup) + expected_award_id_lookup = deepcopy( + _InitialRunWithPostgresLoader.expected_initial_award_id_lookup + ) for item in expected_award_id_lookup: item["award_id"] += 1 # Also, the last load date for the award_id_lookup table should be updated to the date of the initial loads. - kwargs["expected_last_load_award_id_lookup"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME + kwargs["expected_last_load_award_id_lookup"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) TestInitialRun.verify(spark, [], expected_award_id_lookup, **kwargs) @staticmethod @@ -1346,7 +1662,7 @@ def _happy_path_test_core( spark.sql(f"create database if not exists {raw_db};") spark.sql(f"use {raw_db};") spark.sql( - TABLE_SPEC["published_fabs"]["delta_table_create_sql"].format( + TABLE_SPEC["published_fabs"].delta_table_create_sql.format( DESTINATION_TABLE="published_fabs", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_data_bucket, @@ -1354,7 +1670,7 @@ def _happy_path_test_core( ) ) spark.sql( - TABLE_SPEC["detached_award_procurement"]["delta_table_create_sql"].format( + TABLE_SPEC["detached_award_procurement"].delta_table_create_sql.format( DESTINATION_TABLE="detached_award_procurement", DESTINATION_DATABASE=raw_db, SPARK_S3_BUCKET=s3_data_bucket, @@ -1379,7 +1695,10 @@ def _happy_path_test_core( ) # Trigger initial run of load transactions in delta. 
This step is required as it creates various data sources. TestInitialRun.initial_run( - s3_data_bucket, load_source_tables=False, load_other_raw_tables=load_other_raw_tables, initial_copy=False + s3_data_bucket, + load_source_tables=False, + load_other_raw_tables=load_other_raw_tables, + initial_copy=False, ) # 1. Test deleting the transactions with the last award ID from the appropriate raw table, @@ -1400,15 +1719,23 @@ def _happy_path_test_core( insert_datetime = last_assist_load_datetime + timedelta(minutes=-15) assist = deepcopy(_NEW_ASSIST) assist.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceAssistanceTransaction", **assist) - update_last_load_date("source_assistance_transaction", last_assist_load_datetime) + update_last_load_date( + "source_assistance_transaction", last_assist_load_datetime + ) load_delta_table_from_postgres("published_fabs", s3_data_bucket) call_command("load_transactions_in_delta", "--etl-level", "award_id_lookup") # Verify award_id_lookup table - query = "SELECT * FROM int.award_id_lookup ORDER BY award_id, transaction_unique_id" + query = ( + "SELECT * FROM int.award_id_lookup ORDER BY award_id, transaction_unique_id" + ) delta_data = [row.asDict() for row in spark.sql(query).collect()] expected_award_id_lookup = deepcopy(expected_initial_award_id_lookup) @@ -1435,7 +1762,12 @@ def _happy_path_test_core( "expected_last_load_transaction_fabs": _BEGINNING_OF_TIME, "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, } - TestInitialRun.verify(spark, expected_initial_transaction_id_lookup, expected_award_id_lookup, **kwargs) + TestInitialRun.verify( + spark, + expected_initial_transaction_id_lookup, + expected_award_id_lookup, + **kwargs, + ) # Make sure award_id_seq hasn't gone backwards with connection.cursor() 
as cursor: @@ -1443,7 +1775,11 @@ def _happy_path_test_core( # Since all calls to setval() set the is_called flag to false, nextval() returns the actual maximum id max_award_id = cursor.fetchone()[0] assert ( - max_award_id == max([award["id"] for award in TestInitialRunNoPostgresLoader.initial_awards]) + 1 + max_award_id + == max( + [award["id"] for award in TestInitialRunNoPostgresLoader.initial_awards] + ) + + 1 ) # Add one for the insert # Since this test just called nextval(), need to reset the sequence with the is_called flag set to false @@ -1458,10 +1794,16 @@ def _happy_path_test_core( insert_datetime = last_procure_load_datetime + timedelta(minutes=-15) procure = deepcopy(_NEW_PROCURE) procure.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceProcurementTransaction", **procure) - update_last_load_date("source_procurement_transaction", last_procure_load_datetime) + update_last_load_date( + "source_procurement_transaction", last_procure_load_datetime + ) load_delta_table_from_postgres("detached_award_procurement", s3_data_bucket) spark.sql( @@ -1480,7 +1822,9 @@ def _happy_path_test_core( call_command("load_transactions_in_delta", "--etl-level", "award_id_lookup") # Verify award_id_lookup table - query = "SELECT * FROM int.award_id_lookup ORDER BY award_id, transaction_unique_id" + query = ( + "SELECT * FROM int.award_id_lookup ORDER BY award_id, transaction_unique_id" + ) delta_data = [row.asDict() for row in spark.sql(query).collect()] for pop in expected_award_id_lookup_pops: @@ -1489,7 +1833,9 @@ def _happy_path_test_core( { "award_id": 8, "is_fpds": True, - "transaction_unique_id": _NEW_PROCURE["detached_award_proc_unique"].upper(), + "transaction_unique_id": _NEW_PROCURE[ + "detached_award_proc_unique" + ].upper(), 
"generated_unique_award_id": _NEW_PROCURE["unique_award_key"].upper(), } ) @@ -1500,19 +1846,29 @@ def _happy_path_test_core( # Verify award_ids_delete_modified table query = "SELECT * FROM int.award_ids_delete_modified ORDER BY award_id" delta_data = [row.asDict() for row in spark.sql(query).collect()] - assert equal_datasets([{"award_id": partially_deleted_award_id}], delta_data, "") + assert equal_datasets( + [{"award_id": partially_deleted_award_id}], delta_data, "" + ) @mark.django_db(transaction=True) def test_happy_path_scenarios_no_pg_loader( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, ): # Since we're not using the Postgres transaction loader, load raw.transaction_normalized and raw.awards # from expected data when making initial run load_other_raw_tables = [ _TableLoadInfo( - spark, "transaction_normalized", TestInitialRunNoPostgresLoader.initial_transaction_normalized + spark, + "transaction_normalized", + TestInitialRunNoPostgresLoader.initial_transaction_normalized, + ), + _TableLoadInfo( + spark, "awards", TestInitialRunNoPostgresLoader.initial_awards ), - _TableLoadInfo(spark, "awards", TestInitialRunNoPostgresLoader.initial_awards), ] self._happy_path_test_core( From cbdf3a07aa03f5486f6260478501cc7107e5b82a Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Fri, 6 Feb 2026 11:15:07 -0600 Subject: [PATCH 46/59] Updating default custom_schema value --- usaspending_api/etl/table_specs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/usaspending_api/etl/table_specs.py b/usaspending_api/etl/table_specs.py index e313957858..d3fc405e7c 100644 --- a/usaspending_api/etl/table_specs.py +++ b/usaspending_api/etl/table_specs.py @@ -21,7 +21,7 @@ class TableSpec: partition_column_type: Literal["date", "numeric"] | None = None is_partition_column_unique: bool = False 
source_schema: dict[str, str] | None = None - custom_schema: str | None = None + custom_schema: str = "" delta_table_create_options: dict[str, str | bool] | None = None delta_table_create_partitions: list[str] | None = None tsvectors: dict[str, list[str]] | None = None From f89c21bf70cf5ef11cd44cebf173c45e7c9577be Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Fri, 6 Feb 2026 13:56:50 -0600 Subject: [PATCH 47/59] cleaning up tests --- .../commands/load_table_from_delta.py | 208 ++++++++---------- 1 file changed, 89 insertions(+), 119 deletions(-) diff --git a/usaspending_api/etl/management/commands/load_table_from_delta.py b/usaspending_api/etl/management/commands/load_table_from_delta.py index 1440c9c5d8..6672723b14 100644 --- a/usaspending_api/etl/management/commands/load_table_from_delta.py +++ b/usaspending_api/etl/management/commands/load_table_from_delta.py @@ -2,7 +2,7 @@ import logging from datetime import datetime from math import ceil -from typing import Dict, List, Optional, override +from typing import Dict, Optional import boto3 import numpy as np @@ -60,6 +60,19 @@ class Command(BaseCommand): if a new table has been made. 
""" + delta_table: str + delta_table_name: str + destination_database: str + column_names: list + + postgres_table: str + postgres_table_name: str + postgres_schema: str + postgres_cols: dict + + temp_table: str + temp_table_name: str + def add_arguments(self, parser: CommandParser) -> None: parser.add_argument( "--delta-table", @@ -179,33 +192,31 @@ def handle(self, *args, **options) -> None: spark, spark_created_by_command = self._get_spark_session() # Resolve Parameters - delta_table = options["delta_table"] + self.delta_table = options["delta_table"] recreate = options["recreate"] - table_spec = TABLE_SPEC[delta_table] + table_spec = TABLE_SPEC[self.delta_table] # Delta side - destination_database = ( + self.destination_database = ( options["alt_delta_db"] or table_spec.destination_database ) - delta_table_name = options["alt_delta_name"] or delta_table - delta_table = ( - f"{destination_database}.{delta_table_name}" - if destination_database - else delta_table_name + self.delta_table_name = options["alt_delta_name"] or self.delta_table + self.delta_table = ( + f"{self.destination_database}.{self.delta_table_name}" + if self.destination_database + else self.delta_table_name ) # Postgres side - source - postgres_table = None - postgres_schema = table_spec.source_database or table_spec.swap_schema - postgres_table_name = table_spec.source_table or table_spec.swap_table - postgres_cols = table_spec.source_schema - column_names = table_spec.column_names - tsvectors = table_spec.tsvectors or {} - if postgres_table_name: - postgres_table = ( - f"{postgres_schema}.{postgres_table_name}" - if postgres_schema - else postgres_table_name + self.postgres_schema = table_spec.source_database or table_spec.swap_schema + self.postgres_table_name = table_spec.source_table or table_spec.swap_table + self.postgres_cols = table_spec.source_schema + self.column_names = table_spec.column_names + if self.postgres_table_name: + self.postgres_table = ( + 
f"{self.postgres_schema}.{self.postgres_table_name}" + if self.postgres_schema + else self.postgres_table_name ) # Postgres side - temp @@ -214,60 +225,50 @@ def handle(self, *args, **options) -> None: temp_table_suffix_appendage = ( f"_{temp_table_suffix}" if {temp_table_suffix} else "" ) - temp_table_name = ( - f"{postgres_table_name}{temp_table_suffix_appendage}" - if postgres_table - else f"{delta_table_name}{temp_table_suffix_appendage}" + self.temp_table_name = ( + f"{self.postgres_table_name}{temp_table_suffix_appendage}" + if self.postgres_table + else f"{self.delta_table_name}{temp_table_suffix_appendage}" ) - temp_table = f"{temp_schema}.{temp_table_name}" + self.temp_table = f"{temp_schema}.{self.temp_table_name}" - summary_msg = ( - f"Copying delta table {delta_table} to a Postgres temp table {temp_table}." - ) - if postgres_table: - summary_msg = f"{summary_msg} The temp table will be based on the postgres table {postgres_table}" + summary_msg = f"Copying delta table {self.delta_table} to a Postgres temp table {self.temp_table}." 
+ if self.postgres_table: + summary_msg = f"{summary_msg} The temp table will be based on the postgres table {self.postgres_table}" logger.info(summary_msg) - temp_dest_table_exists = self._temp_table_exists(temp_schema, temp_table_name) + temp_dest_table_exists = self._temp_table_exists( + temp_schema, self.temp_table_name + ) # If it does, and we're recreating it, drop it first if temp_dest_table_exists and recreate: - self._drop_temp_table(temp_table) + self._drop_temp_table() temp_dest_table_exists = False make_new_table = not temp_dest_table_exists - is_postgres_table_partitioned = ( - hasattr(table_spec, "postgres_partition_spec") - and table_spec.postgres_partition_spec is not None - ) - - if postgres_table or postgres_cols: + if self.postgres_table or self.postgres_cols: self._recreate_table( make_new_table=make_new_table, - is_postgres_table_partitioned=is_postgres_table_partitioned, table_spec=table_spec, - temp_table=temp_table, temp_table_suffix_appendage=temp_table_suffix_appendage, - postgres_table=postgres_table, - postgres_cols=postgres_cols, - tsvectors=tsvectors, ) # Read from Delta - df = spark.table(delta_table) + df = spark.table(self.delta_table) # Make sure that the column order defined in the Delta table schema matches # that of the Spark dataframe used to pull from the Postgres table. While not # always needed, this should help to prevent any future mismatch between the two. 
- if column_names: - df = df.select(column_names) + if self.column_names: + df = df.select(self.column_names) # If we're working off an existing table, truncate before loading in all the data if not make_new_table: - logger.info(f"Truncating existing table {temp_table}") + logger.info(f"Truncating existing table {self.temp_table}") with db.connection.cursor() as cursor: - cursor.execute(f"TRUNCATE {temp_table}") - logger.info(f"{temp_table} truncated.") + cursor.execute(f"TRUNCATE {self.temp_table}") + logger.info(f"{self.temp_table} truncated.") # Reset the sequence before load for a table if it exists postgres_seq_last_value = ( @@ -279,26 +280,17 @@ def handle(self, *args, **options) -> None: ) self._write_df( - delta_table=delta_table, spark=spark, df=df, - temp_table=temp_table, - postgres_cols=postgres_cols, options=options, - destination_database=destination_database, - delta_table_name=delta_table_name, - column_names=column_names, postgres_seq_last_value=postgres_seq_last_value, table_spec=table_spec, ) self._finish( - delta_table=delta_table, - temp_table=temp_table, options=options, spark_created_by_command=spark_created_by_command, spark=spark, - postgres_table=postgres_table, ) @staticmethod @@ -315,32 +307,31 @@ def _temp_table_exists(temp_schema: str, temp_table_name: str) -> bool: cursor.execute(temp_dest_table_exists_sql) return bool(cursor.fetchone()[0]) - @staticmethod - def _drop_temp_table(temp_table: str) -> None: + def _drop_temp_table(self) -> None: logger.info( - f"{temp_table} exists and recreate argument provided. Dropping first." + f"{self.temp_table} exists and recreate argument provided. Dropping first." 
) # If the schema has changed and we need to do a complete reload, just drop the table and rebuild it - clear_table_sql = f"DROP TABLE {temp_table}" + clear_table_sql = f"DROP TABLE {self.temp_table}" with db.connection.cursor() as cursor: cursor.execute(clear_table_sql) - logger.info(f"{temp_table} dropped.") + logger.info(f"{self.temp_table} dropped.") - @override - @staticmethod def _recreate_table( + self, make_new_table: bool, - is_postgres_table_partitioned: bool, table_spec: QueryTableSpec, - temp_table: str, temp_table_suffix_appendage: str, - postgres_table: str, - postgres_cols: list, - tsvectors: dict, ) -> None: # Recreate the table if it doesn't exist. Spark's df.write automatically does this but doesn't account for # the extra metadata (indexes, constraints, defaults) which CREATE TABLE X LIKE Y accounts for. # If there is no postgres_table to base it on, it just relies on spark to make it and work with delta table + is_postgres_table_partitioned = ( + hasattr(table_spec, "postgres_partition_spec") + and table_spec.postgres_partition_spec is not None + ) + tsvectors = table_spec.tsvectors or {} + if make_new_table: partition_clause = "" storage_parameters = "WITH (autovacuum_enabled=FALSE)" @@ -355,23 +346,23 @@ def _recreate_table( ( f"CREATE TABLE " # Below: e.g. 
my_tbl_temp -> my_tbl_part_temp - f"{temp_table[: -len(temp_table_suffix_appendage)]}" + f"{self.temp_table[: -len(temp_table_suffix_appendage)]}" f"{pt['table_suffix']}{temp_table_suffix_appendage} " - f"PARTITION OF {temp_table} {pt['partitioning_clause']} " + f"PARTITION OF {self.temp_table} {pt['partitioning_clause']} " f"{storage_parameters}" ) for pt in table_spec.postgres_partition_spec["partitions"] ] - if postgres_table: + if self.postgres_table: create_temp_sql = f""" - CREATE TABLE {temp_table} ( - LIKE {postgres_table} INCLUDING DEFAULTS INCLUDING GENERATED INCLUDING IDENTITY + CREATE TABLE {self.temp_table} ( + LIKE {self.postgres_table} INCLUDING DEFAULTS INCLUDING GENERATED INCLUDING IDENTITY ) {partition_clause} {storage_parameters} """ - elif postgres_cols: + elif self.postgres_cols: create_temp_sql = f""" - CREATE TABLE {temp_table} ( - {", ".join([f"{key} {val}" for key, val in postgres_cols.items()])} + CREATE TABLE {self.temp_table} ( + {", ".join([f"{key} {val}" for key, val in self.postgres_cols.items()])} ) {partition_clause} {storage_parameters} """ else: @@ -380,14 +371,14 @@ def _recreate_table( "populated for the target delta table in the TABLE_SPEC" ) with db.connection.cursor() as cursor: - logger.info(f"Creating {temp_table}") + logger.info(f"Creating {self.temp_table}") cursor.execute(create_temp_sql) - logger.info(f"{temp_table} created.") + logger.info(f"{self.temp_table} created.") if is_postgres_table_partitioned and partitions_sql: for create_partition in partitions_sql: logger.info( - f"Creating partition of {temp_table} with SQL:\n{create_partition}" + f"Creating partition of {self.temp_table} with SQL:\n{create_partition}" ) cursor.execute(create_partition) logger.info("Partition created.") @@ -401,7 +392,7 @@ def _recreate_table( f" tsvector_update_{tsvector_name} if it exists before potentially recreating it." 
) cursor.execute( - f"DROP TRIGGER IF EXISTS tsvector_update_{tsvector_name} ON {temp_table}" + f"DROP TRIGGER IF EXISTS tsvector_update_{tsvector_name} ON {self.temp_table}" ) logger.info( @@ -411,46 +402,39 @@ def _recreate_table( derived_from_cols_str = ", ".join(derived_from_cols) tsvector_trigger_sql = f""" CREATE TRIGGER tsvector_update_{tsvector_name} BEFORE INSERT OR UPDATE - ON {temp_table} FOR EACH ROW EXECUTE PROCEDURE + ON {self.temp_table} FOR EACH ROW EXECUTE PROCEDURE tsvector_update_trigger({tsvector_name}, '{DEFAULT_TEXT_SEARCH_CONFIG}', {derived_from_cols_str}) """ cursor.execute(tsvector_trigger_sql) logger.info(f"tsvector trigger for column {tsvector_name} added.") - @override def _write_df( self, - delta_table: str, spark: SparkSession, df: DataFrame, - temp_table: str, - postgres_cols: dict, options: dict, - destination_database: str, - delta_table_name: str, - column_names: list, postgres_seq_last_value: int | bool, table_spec: QueryTableSpec, ) -> None: use_jdbc_inserts = options["jdbc_inserts"] strategy = "JDBC INSERTs" if use_jdbc_inserts else "SQL bulk COPY CSV" logger.info( - f"LOAD (START): Loading data from Delta table {delta_table} to {temp_table} using {strategy} strategy" + f"LOAD (START): Loading data from Delta table {self.delta_table} " + f"to {self.temp_table} using {strategy} strategy" ) try: if use_jdbc_inserts: self._write_with_jdbc_inserts( - spark, df, - temp_table, + self.temp_table, split_df_by_special_cols=True, postgres_model=table_spec.model, - postgres_cols=postgres_cols, + postgres_cols=self.postgres_cols, overwrite=False, ) else: - if not column_names: + if not self.column_names: raise RuntimeError( "column_names None or empty, but are required to map CSV cols to table cols" ) @@ -458,10 +442,6 @@ def _write_df( self._write_with_sql_bulk_copy_csv( spark, df, - delta_db=destination_database, - delta_table_name=delta_table_name, - temp_table=temp_table, - ordered_col_names=column_names, 
spark_s3_bucket_name=spark_s3_bucket_name, keep_csv_files=True if options["keep_csv_files"] else False, ) @@ -493,15 +473,10 @@ def _set_sequence_value(self, seq_name: str, val: Optional[int] = None) -> int: ) return last_value - @override def _write_with_sql_bulk_copy_csv( self, spark: SparkSession, df: DataFrame, - delta_db: str, - delta_table_name: str, - temp_table: str, - ordered_col_names: List[str], spark_s3_bucket_name: str, keep_csv_files: bool = False, ) -> None: @@ -548,10 +523,10 @@ def _write_with_sql_bulk_copy_csv( sub-folder of a "temp" folder. Be mindful of cleaning these up if setting to True. If False, the same output path is used for each write and nukes-and-paves the files in that output path. """ - csv_path = f"{CONFIG.SPARK_CSV_S3_PATH}/{delta_db}/{delta_table_name}/" + csv_path = f"{CONFIG.SPARK_CSV_S3_PATH}/{self.destination_database}/{self.delta_table_name}/" if keep_csv_files: csv_path = ( - f"{CONFIG.SPARK_CSV_S3_PATH}/temp/{delta_db}/{delta_table_name}/" + f"{CONFIG.SPARK_CSV_S3_PATH}/temp/{self.destination_database}/{self.delta_table_name}/" f"{datetime.strftime(datetime.utcnow(), '%Y%m%d%H%M%S')}/" ) s3_bucket_with_csv_path = f"s3a://{spark_s3_bucket_name}/{csv_path}" @@ -627,7 +602,7 @@ def _write_with_sql_bulk_copy_csv( ) logger.info( - f"LOAD: Starting SQL bulk COPY of {file_count} CSV files to Postgres {temp_table} table" + f"LOAD: Starting SQL bulk COPY of {file_count} CSV files to Postgres {self.temp_table} table" ) db_dsn = get_database_dsn_string() @@ -667,21 +642,19 @@ def _write_with_sql_bulk_copy_csv( s3_bucket_name=s3_bucket_name, s3_obj_keys=s3_obj_keys, db_dsn=db_dsn, - target_pg_table=temp_table, - ordered_col_names=ordered_col_names, + target_pg_table=self.temp_table, + ordered_col_names=self.column_names, gzipped=True, work_mem_override=_PG_WORK_MEM_FOR_LARGE_CSV_COPY, ), ).collect() logger.info( - f"LOAD: Finished SQL bulk COPY of {file_count} CSV files to Postgres {temp_table} table" + f"LOAD: Finished SQL bulk COPY 
of {file_count} CSV files to Postgres {self.temp_table} table" ) - @override def _write_with_jdbc_inserts( self, - spark: SparkSession, df: DataFrame, temp_table: str, split_df_by_special_cols: bool = False, @@ -763,30 +736,27 @@ def _write_with_jdbc_inserts( properties=get_jdbc_connection_properties(), ) - @override def _finish( self, - delta_table: str, - temp_table: str, options: dict, spark_created_by_command: bool, spark: SparkSession, - postgres_table: str | bool, ) -> None: use_jdbc_inserts = options["jdbc_inserts"] strategy = "JDBC INSERTs" if use_jdbc_inserts else "SQL bulk COPY CSV" logger.info( - f"LOAD (FINISH): Loaded data from Delta table {delta_table} to {temp_table} using {strategy} strategy" + f"LOAD (FINISH): Loaded data from Delta table {self.delta_table} " + f"to {self.temp_table} using {strategy} strategy" ) # We're done with spark at this point if spark_created_by_command: spark.stop() - if postgres_table: + if self.postgres_table: logger.info( f"Note: this has merely loaded the data from Delta. For various reasons, we've separated the" f" metadata portion of the table download to a separate script. If not already done so," f" please run the following additional command to complete the process: " - f" 'copy_table_metadata --source-table {postgres_table} --dest-table {temp_table}'." + f" 'copy_table_metadata --source-table {self.postgres_table} --dest-table {self.temp_table}'." 
) From c37c31a87fcc69f0a0e41ff8ac78f6c331848c6d Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Fri, 6 Feb 2026 14:58:27 -0600 Subject: [PATCH 48/59] resolving variables for rdd.map --- .../etl/management/commands/load_table_from_delta.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/usaspending_api/etl/management/commands/load_table_from_delta.py b/usaspending_api/etl/management/commands/load_table_from_delta.py index 6672723b14..05ca3e78cd 100644 --- a/usaspending_api/etl/management/commands/load_table_from_delta.py +++ b/usaspending_api/etl/management/commands/load_table_from_delta.py @@ -636,14 +636,16 @@ def _write_with_sql_bulk_copy_csv( # into the mapped function, its module, or an arg of it ... that is not pickle-able, this will throw an error. # One way to help is to resolve all arguments to primitive types (int, string) that can be passed # to the mapped function + temp_table = self.temp_table + ordered_col_names = self.column_names rdd.mapPartitionsWithIndex( lambda partition_idx, s3_obj_keys: copy_csvs_from_s3_to_pg( batch_num=partition_idx, s3_bucket_name=s3_bucket_name, s3_obj_keys=s3_obj_keys, db_dsn=db_dsn, - target_pg_table=self.temp_table, - ordered_col_names=self.column_names, + target_pg_table=temp_table, + ordered_col_names=ordered_col_names, gzipped=True, work_mem_override=_PG_WORK_MEM_FOR_LARGE_CSV_COPY, ), From 2f103365598fb0c452afb355795e6e90b6c40489 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Wed, 11 Feb 2026 17:04:54 -0600 Subject: [PATCH 49/59] [DEV-14453] WIP --- usaspending_api/common/data_classes.py | 4 +- usaspending_api/common/etl/spark.py | 164 ++-- .../commands/load_table_to_delta.py | 170 ++-- .../commands/load_transaction_normalized.py | 4 +- usaspending_api/etl/table_specs.py | 15 +- .../etl/transaction_delta_loaders/loaders.py | 277 +++---- .../detached_award_procurement.py | 9 +- .../delta_models/published_fabs.py | 31 +- .../delta_models/transaction_fpds.py | 725 +++++------------- 
.../delta_models/transaction_normalized.py | 4 + 10 files changed, 537 insertions(+), 866 deletions(-) diff --git a/usaspending_api/common/data_classes.py b/usaspending_api/common/data_classes.py index ede54ed88c..9a7416814d 100644 --- a/usaspending_api/common/data_classes.py +++ b/usaspending_api/common/data_classes.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Optional +from typing import Callable, Optional from typing_extensions import Literal @@ -47,4 +47,4 @@ class TransactionColumn: # calling code to format the string with a input. You should expect the scalar transformation # to be applied on this input. For example, a valid scalar_transformation string is # "CASE {input} WHEN 'UNITED STATES' THEN 'USA' ELSE {input} END" - scalar_transformation: str = None + scalar_transformation: Callable | None= None diff --git a/usaspending_api/common/etl/spark.py b/usaspending_api/common/etl/spark.py index 5018aa439d..8d84201e52 100644 --- a/usaspending_api/common/etl/spark.py +++ b/usaspending_api/common/etl/spark.py @@ -11,11 +11,13 @@ import shutil import time from itertools import chain +from typing import Literal import duckdb +from delta import DeltaTable from duckdb.experimental.spark.sql import SparkSession as DuckDBSparkSession from duckdb.experimental.spark.sql.dataframe import DataFrame as DuckDBDataFrame -from pyspark.sql import DataFrame, SparkSession +from pyspark.sql import Column, DataFrame, SparkSession from pyspark.sql.functions import ( col, concat, @@ -148,25 +150,19 @@ def extract_db_data_frame( # noqa: PLR0912, PLR0913, PLR0915 min_max_df = spark.read.jdbc(url=jdbc_url, table=min_max_sql, properties=conn_props) if is_date_partitioning_col: # Ensure it is a date (e.g. 
if date in string format, convert to date) - min_max_df = min_max_df.withColumn( - min_max_df.columns[0], to_date(min_max_df[0]) - ).withColumn(min_max_df.columns[1], to_date(min_max_df[1])) + min_max_df = min_max_df.withColumn(min_max_df.columns[0], to_date(min_max_df[0])).withColumn( + min_max_df.columns[1], to_date(min_max_df[1]) + ) min_max = min_max_df.first() min_val = min_max[0] max_val = min_max[1] count = min_max[2] if is_numeric_partitioning_col: - logger.info( - f"Deriving partitions from numeric ranges across column: {partitioning_col}" - ) + logger.info(f"Deriving partitions from numeric ranges across column: {partitioning_col}") # Take count as partition if using a spotty range, and count of rows is less than range of IDs - partitions = int( - min((int(max_val) - int(min_val)), int(count)) / (partition_rows + 1) - ) - logger.info( - f"Derived {partitions} partitions from numeric ranges across column: {partitioning_col}" - ) + partitions = int(min((int(max_val) - int(min_val)), int(count)) / (partition_rows + 1)) + logger.info(f"Derived {partitions} partitions from numeric ranges across column: {partitioning_col}") if partitions > MAX_PARTITIONS: fail_msg = ( f"Aborting job run because {partitions} partitions " @@ -175,9 +171,7 @@ def extract_db_data_frame( # noqa: PLR0912, PLR0913, PLR0915 logger.fatal(fail_msg) raise RuntimeError(fail_msg) - logger.info( - f"{partitions} partitions to extract at approximately {partition_rows} rows each." 
- ) + logger.info(f"{partitions} partitions to extract at approximately {partition_rows} rows each.") data_df = spark.read.options(customSchema=custom_schema).jdbc( url=jdbc_url, @@ -196,9 +190,7 @@ def extract_db_data_frame( # noqa: PLR0912, PLR0913, PLR0915 # if that distinct count is less than MAX_PARTITIONS date_delta = max_val - min_val partitions = date_delta.days + 1 - if ( - count / partitions - ) < 0.6 or True: # Forcing this path, see comment in else below + if (count / partitions) < 0.6 or True: # Forcing this path, see comment in else below logger.info( f"Partitioning by date in col {partitioning_col} would yield {partitions} but only {count} " f"distinct dates in the dataset. This partition range is too sparse. Going to query the " @@ -217,9 +209,7 @@ def extract_db_data_frame( # noqa: PLR0912, PLR0913, PLR0915 table=f"(select distinct {partitioning_col} from {table}) distinct_dates", properties=conn_props, ) - partition_sql_predicates = [ - f"{partitioning_col} = '{str(row[0])}'" for row in date_df.collect() - ] + partition_sql_predicates = [f"{partitioning_col} = '{str(row[0])}'" for row in date_df.collect()] logger.info( f"Built {len(partition_sql_predicates)} SQL partition predicates " f"to yield data partitions, based on distinct values of {partitioning_col} " @@ -248,9 +238,7 @@ def extract_db_data_frame( # noqa: PLR0912, PLR0913, PLR0915 # NOTE: Have to use integer (really a Long) representation of the Date, since that is what the Scala # ... implementation is expecting: https://github.com/apache/spark/blob/c561ee686551690bee689f37ae5bbd75119994d6/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala#L192-L207 # TODO: THIS DOES NOT SEEM TO WORK WITH DATES for lowerBound and upperBound. 
Forcing use of predicates - raise NotImplementedError( - "Cannot read JDBC partitions with date lower/upper bound" - ) + raise NotImplementedError("Cannot read JDBC partitions with date lower/upper bound") data_df = spark.read.jdbc( url=jdbc_url, @@ -277,20 +265,12 @@ def extract_db_data_frame( # noqa: PLR0912, PLR0913, PLR0915 raise RuntimeError(fail_msg) # SQL usable in Postgres to get a distinct 32-bit int from an md5 hash of text - pg_int_from_hash = ( - f"('x'||substr(md5({partitioning_col}::text),1,8))::bit(32)::int" - ) + pg_int_from_hash = f"('x'||substr(md5({partitioning_col}::text),1,8))::bit(32)::int" # int could be signed. This workaround SQL gets unsigned modulus from the hash int - non_neg_modulo = ( - f"mod({partitions} + mod({pg_int_from_hash}, {partitions}), {partitions})" - ) - partition_sql_predicates = [ - f"{non_neg_modulo} = {p}" for p in range(0, partitions) - ] + non_neg_modulo = f"mod({partitions} + mod({pg_int_from_hash}, {partitions}), {partitions})" + partition_sql_predicates = [f"{non_neg_modulo} = {p}" for p in range(0, partitions)] - logger.info( - f"{partitions} partitions to extract by predicates at approximately {partition_rows} rows each." - ) + logger.info(f"{partitions} partitions to extract by predicates at approximately {partition_rows} rows each.") data_df = spark.read.jdbc( url=jdbc_url, @@ -343,26 +323,39 @@ def load_delta_table( spark: SparkSession, source_df: DataFrame, delta_table_name: str, - overwrite: bool = False, + save_mode: Literal["append", "merge", "overwrite"] = "append", + merge_condition: str | Column | None = None, + partition_columns: list[str] | None = None, ) -> None: """ Write DataFrame data to a table in Delta format. Args: spark: the SparkSession source_df: DataFrame with data to write - delta_table_name: table to write into. Currently this function requires the table to already exist. - overwrite: If True, will replace all existing data with that of the DataFrame, while append will add new data. 
- If left False (the default), the DataFrame data will be appended to existing data. + delta_table_name: table to write into. Currently, this function requires the table to already exist. + save_mode: one of "append", "merge", "overwrite" + merge_condition: merge_condition must be provided if save_mode is "merge" + partition_columns: list of column names to partition by Returns: None """ + start = time.perf_counter() logger.info(f"LOAD (START): Loading data into Delta table {delta_table_name}") # NOTE: Best to (only?) use .saveAsTable(name=) rather than .insertInto(tableName=) # ... The insertInto does not seem to align/merge columns from DataFrame to table columns (defaults to column order) - save_mode = "overwrite" if overwrite else "append" - source_df.write.format(source="delta").mode(saveMode=save_mode).saveAsTable( - name=delta_table_name - ) - logger.info(f"LOAD (FINISH): Loaded data into Delta table {delta_table_name}") + if save_mode == "merge": + if merge_condition is None: + raise ValueError("merge_condition cannot be None when save_mode is 'merge'") + target = DeltaTable.forName(spark, delta_table_name).alias("t") + ( + target.merge(source_df.alias("s"), merge_condition) + .whenNotMatchedInsertAll() + .whenNotMatchedBySourceDelete() + .execute() + ) + else: + source_df.write.format(source="delta").mode(saveMode=save_mode).saveAsTable(name=delta_table_name) + end = time.perf_counter() + logger.info(f"LOAD (FINISH): Loaded data into Delta table {delta_table_name} in {end - start:.2f} seconds.") def load_es_index( @@ -385,14 +378,10 @@ def load_es_index( jvm_data_df = source_df._jdf # Call the elasticsearch-hadoop method to write the DF to ES via the _jvm conduit on the SparkContext - spark.sparkContext._jvm.org.elasticsearch.spark.sql.EsSparkSQL.saveToEs( - jvm_data_df, jvm_es_config_map - ) + spark.sparkContext._jvm.org.elasticsearch.spark.sql.EsSparkSQL.saveToEs(jvm_data_df, jvm_es_config_map) -def merge_delta_table( - spark: SparkSession, source_df: 
DataFrame, delta_table_name: str, merge_column: str -) -> None: +def merge_delta_table(spark: SparkSession, source_df: DataFrame, delta_table_name: str, merge_column: str) -> None: source_df.create_or_replace_temporary_view("temp_table") spark.sql( @@ -478,9 +467,7 @@ def diff( if unique_key_col in compare_cols: compare_cols.remove(unique_key_col) - distinct_stmts = " ".join( - [f"WHEN l.{c} IS DISTINCT FROM r.{c} THEN 'C'" for c in compare_cols] - ) + distinct_stmts = " ".join([f"WHEN l.{c} IS DISTINCT FROM r.{c} THEN 'C'" for c in compare_cols]) compare_expr = f""" CASE WHEN l.exists IS NULL THEN 'I' @@ -525,9 +512,7 @@ def convert_decimal_cols_to_string(df: DataFrame) -> DataFrame: for f in df.schema.fields: if not isinstance(f.dataType, DecimalType): continue - df_no_decimal = df_no_decimal.withColumn( - f.name, df_no_decimal[f.name].cast(StringType()) - ) + df_no_decimal = df_no_decimal.withColumn(f.name, df_no_decimal[f.name].cast(StringType())) return df_no_decimal @@ -662,9 +647,7 @@ def create_ref_temp_views( # noqa: PLR0912 # Create USAS temp views rds_ref_tables = build_ref_table_name_list() - logger.info( - f"Creating the following tables under the global_temp database: {rds_ref_tables}" - ) + logger.info(f"Creating the following tables under the global_temp database: {rds_ref_tables}") match isinstance(spark, DuckDBSparkSession): case True: @@ -685,9 +668,7 @@ def create_ref_temp_views( # noqa: PLR0912 ) else: # DuckDB will prepend the HTTP or HTTPS so we need to strip it from the AWS endpoint URL - endpoint_url = CONFIG.AWS_S3_ENDPOINT.replace("http://", "").replace( - "https://", "" - ) + endpoint_url = CONFIG.AWS_S3_ENDPOINT.replace("http://", "").replace("https://", "") spark.sql( f""" CREATE OR REPLACE SECRET ( @@ -710,7 +691,9 @@ def create_ref_temp_views( # noqa: PLR0912 # The DuckDB Delta extension is needed to interact with DeltaLake tables spark.sql("LOAD delta; CREATE SCHEMA IF NOT EXISTS rpt;") for table in _download_delta_tables: - s3_path 
= f"s3://{CONFIG.SPARK_S3_BUCKET}/{CONFIG.DELTA_LAKE_S3_PATH}/{table['schema']}/{table['table_name']}" + s3_path = ( + f"s3://{CONFIG.SPARK_S3_BUCKET}/{CONFIG.DELTA_LAKE_S3_PATH}/{table['schema']}/{table['table_name']}" + ) try: spark.sql( f""" @@ -718,31 +701,21 @@ def create_ref_temp_views( # noqa: PLR0912 SELECT * FROM delta_scan('{s3_path}'); """ ) - logger.info( - f"Successfully created table {table['schema']}.{table['table_name']}" - ) + logger.info(f"Successfully created table {table['schema']}.{table['table_name']}") except duckdb.IOException as exc: logger.exception(f"Failed to create table {table['table_name']}") - raise RuntimeError( - f"Failed to create table {table['table_name']}" - ) from exc + raise RuntimeError(f"Failed to create table {table['table_name']}") from exc # The DuckDB Postgres extension is needed to connect to the USAS Postgres DB spark.sql("LOAD postgres; CREATE SCHEMA IF NOT EXISTS global_temp;") - spark.sql( - f"ATTACH '{CONFIG.DATABASE_URL}' AS usas (TYPE postgres, READ_ONLY);" - ) + spark.sql(f"ATTACH '{CONFIG.DATABASE_URL}' AS usas (TYPE postgres, READ_ONLY);") for table in rds_ref_tables: try: - spark.sql( - f"CREATE OR REPLACE VIEW global_temp.{table} AS SELECT * FROM usas.public.{table};" - ) + spark.sql(f"CREATE OR REPLACE VIEW global_temp.{table} AS SELECT * FROM usas.public.{table};") except duckdb.CatalogException as exc: logger.exception(f"Failed to create view {table} for {table}") - raise RuntimeError( - f"Failed to create view {table} for {table}" - ) from exc + raise RuntimeError(f"Failed to create view {table} for {table}") from exc if create_broker_views: spark.sql( @@ -755,14 +728,10 @@ def create_ref_temp_views( # noqa: PLR0912 ) for table in _BROKER_REF_TABLES: try: - spark.sql( - f"CREATE OR REPLACE VIEW global_temp.{table} AS SELECT * FROM broker.public.{table};" - ) + spark.sql(f"CREATE OR REPLACE VIEW global_temp.{table} AS SELECT * FROM broker.public.{table};") except duckdb.CatalogException as exc: 
logger.exception(f"Failed to create view {table} for {table}") - raise RuntimeError( - f"Failed to create view {table} for {table}" - ) from exc + raise RuntimeError(f"Failed to create view {table} for {table}") from exc case False: logger.info("Creating ref temp views using Spark") @@ -818,9 +787,7 @@ def write_csv_file( # noqa: PLR0913 if fs.exists(parts_dir_path): fs.delete(parts_dir_path, True) start = time.time() - logger.info( - f"Writing source data DataFrame to csv part files for file {parts_dir}..." - ) + logger.info(f"Writing source data DataFrame to csv part files for file {parts_dir}...") df_record_count = df.count() num_partitions = math.ceil(df_record_count / max_records_per_file) or 1 df.repartition(num_partitions).write.options( @@ -838,9 +805,7 @@ def write_csv_file( # noqa: PLR0913 sep=delimiter, ) logger.info(f"{parts_dir} contains {df_record_count:,} rows of data") - logger.info( - f"Wrote source data DataFrame to csv part files in {(time.time() - start):3f}s" - ) + logger.info(f"Wrote source data DataFrame to csv part files in {(time.time() - start):3f}s") return df_record_count @@ -876,9 +841,7 @@ def write_csv_file_duckdb( full_file_paths = [] - logger.info( - f"Writing source data DataFrame to csv files for file {download_file_name}" - ) + logger.info(f"Writing source data DataFrame to csv files for file {download_file_name}") rel.to_csv( file_name=f"{temp_csv_directory_path}{download_file_name}", sep=delimiter, @@ -897,19 +860,14 @@ def write_csv_file_duckdb( for dir in _partition_dirs: _old_csv_path = f"{dir}/{os.listdir(dir)[0]}" _new_csv_path = ( - f"{temp_csv_directory_path}{download_file_name}" - f"/{download_file_name}_{dir.split('=')[1].zfill(2)}.csv" + f"{temp_csv_directory_path}{download_file_name}" f"/{download_file_name}_{dir.split('=')[1].zfill(2)}.csv" ) shutil.move(_old_csv_path, _new_csv_path) full_file_paths.append(_new_csv_path) os.rmdir(dir) - logger.info( - f"{temp_csv_directory_path}{download_file_name} contains 
{df_record_count:,} rows of data" - ) - logger.info( - f"Wrote source data DataFrame to {len(full_file_paths)} CSV files in {(time.time() - start):3f}s" - ) + logger.info(f"{temp_csv_directory_path}{download_file_name} contains {df_record_count:,} rows of data") + logger.info(f"Wrote source data DataFrame to {len(full_file_paths)} CSV files in {(time.time() - start):3f}s") return df_record_count, full_file_paths diff --git a/usaspending_api/etl/management/commands/load_table_to_delta.py b/usaspending_api/etl/management/commands/load_table_to_delta.py index c541240646..615001881f 100644 --- a/usaspending_api/etl/management/commands/load_table_to_delta.py +++ b/usaspending_api/etl/management/commands/load_table_to_delta.py @@ -52,7 +52,6 @@ from usaspending_api.search.models import AwardSearch, TransactionSearch from usaspending_api.transactions.delta_models import ( DETACHED_AWARD_PROCUREMENT_DELTA_COLUMNS, - PUBLISHED_FABS_COLUMNS, PUBLISHED_FABS_DELTA_COLUMNS, TRANSACTION_FABS_VIEW_COLUMNS, TRANSACTION_FPDS_VIEW_COLUMNS, @@ -74,88 +73,89 @@ TABLE_SPEC = { "awards": TableSpec( - **{ - "model": Award, - "source_table": "vw_awards", - "source_database": "rpt", - "destination_database": "raw", - "partition_column": "id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": awards_sql_string, - "column_names": list(AWARDS_COLUMNS), - } + model=Award, + source_table="vw_awards", + source_database="rpt", + destination_database="raw", + partition_column="id", + partition_column_type="numeric", + is_partition_column_unique=True, + delta_table_create_sql=awards_sql_string, + column_names=list(AWARDS_COLUMNS), + # delta_table_create_partitions=["fiscal_year"], ), "detached_award_procurement": TableSpec( - **{ - "model": SourceProcurementTransaction, - "source_table": "source_procurement_transaction", - "source_database": "raw", - "destination_database": "raw", - "partition_column": "detached_award_procurement_id", - 
"partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": detached_award_procurement_create_sql_string, - "column_names": list(DETACHED_AWARD_PROCUREMENT_DELTA_COLUMNS), - "add_hash_field": True, - } + model=SourceProcurementTransaction, + source_table="source_procurement_transaction", + source_database="raw", + destination_database="raw", + partition_column="detached_award_procurement_id", + partition_column_type="numeric", + is_partition_column_unique=True, + delta_table_create_sql=detached_award_procurement_create_sql_string, + column_names=list(DETACHED_AWARD_PROCUREMENT_DELTA_COLUMNS), + extra_columns={ + "hash": lambda: sf.xxhash64("*"), + "action_year": lambda: sf.year(sf.to_date("action_date")), + "action_month": lambda: sf.month(sf.to_date("action_date")), + }, + save_mode="merge", + merge_condition="s.detached_award_procurement_id = t.detached_award_procurement_id and s.hash = t.hash", ), "financial_accounts_by_awards": TableSpec( - **{ - "model": FinancialAccountsByAwards, - "source_table": "financial_accounts_by_awards", - "source_database": "public", - "destination_database": "raw", - "partition_column": "financial_accounts_by_awards_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": financial_accounts_by_awards_sql_string, - "custom_schema": "award_id LONG", - "column_names": list(FINANCIAL_ACCOUNTS_BY_AWARDS_COLUMNS), - } + model=FinancialAccountsByAwards, + source_table="financial_accounts_by_awards", + source_database="public", + destination_database="raw", + partition_column="financial_accounts_by_awards_id", + partition_column_type="numeric", + is_partition_column_unique=True, + delta_table_create_sql=financial_accounts_by_awards_sql_string, + custom_schema="award_id LONG", + column_names=list(FINANCIAL_ACCOUNTS_BY_AWARDS_COLUMNS), ), "transaction_fabs": TableSpec( - **{ - "model": TransactionFABS, - "source_table": "vw_transaction_fabs", - 
"source_database": "int", - "destination_database": "raw", - "partition_column": "transaction_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": transaction_fabs_sql_string, - "column_names": TRANSACTION_FABS_VIEW_COLUMNS, - "add_hash_field": True, - } + model=TransactionFABS, + source_table="vw_transaction_fabs", + source_database="int", + destination_database="raw", + partition_column="transaction_id", + partition_column_type="numeric", + is_partition_column_unique=True, + delta_table_create_sql=transaction_fabs_sql_string, + column_names=TRANSACTION_FABS_VIEW_COLUMNS, + # add_hash_field=True, + # delta_table_create_partitions=["action_year", "action_month"], ), "published_fabs": TableSpec( - **{ - "model": SourceAssistanceTransaction, - "source_table": "source_assistance_transaction", - "source_database": "raw", - "destination_database": "raw", - "partition_column": "published_fabs_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": published_fabs_create_sql_string, - "column_names": list(PUBLISHED_FABS_COLUMNS), - "add_hash_field": True, - } + model=SourceAssistanceTransaction, + source_table="source_assistance_transaction", + source_database="raw", + destination_database="raw", + partition_column="published_fabs_id", + partition_column_type="numeric", + is_partition_column_unique=True, + delta_table_create_sql=published_fabs_create_sql_string, + column_names=list(PUBLISHED_FABS_DELTA_COLUMNS), + extra_columns={ + "hash": lambda: sf.xxhash64("*"), + "action_year": lambda: sf.year(sf.to_date("action_date")), + "action_month": lambda: sf.month(sf.to_date("action_date")), + }, + # add_hash_field=True, + # delta_table_create_partitions=["action_year", "action_month"], ), "transaction_fpds": TableSpec( - **{ - "model": TransactionFPDS, - "source_table": "vw_transaction_fpds", - "source_database": "int", - "destination_database": "raw", - 
"partition_column": "transaction_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": transaction_fpds_sql_string, - "custom_schema": "", - "column_names": TRANSACTION_FPDS_VIEW_COLUMNS, - "add_hash_field": True, - } + model=TransactionFPDS, + source_table="vw_transaction_fpds", + source_database="int", + destination_database="raw", + partition_column="transaction_id", + partition_column_type="numeric", + is_partition_column_unique=True, + delta_table_create_sql=transaction_fpds_sql_string, + custom_schema="", + column_names=TRANSACTION_FPDS_VIEW_COLUMNS, ), "transaction_normalized": TableSpec( **{ @@ -168,7 +168,7 @@ "is_partition_column_unique": True, "delta_table_create_sql": transaction_normalized_sql_string, "column_names": list(TRANSACTION_NORMALIZED_COLUMNS), - "add_hash_field": True, + # "add_hash_field": True, } ), # Tables loaded in from the Broker @@ -330,7 +330,8 @@ def handle(self, *args, **options) -> None: partition_column_type = table_spec.partition_column_type is_partition_column_unique = table_spec.is_partition_column_unique custom_schema = table_spec.custom_schema - add_hash_field = table_spec.add_hash_field + save_mode = table_spec.save_mode + merge_condition = table_spec.merge_condition # Set the database that will be interacted with for all Delta Lake table Spark-based activity logger.info(f"Using Spark Database: {destination_database}") @@ -339,13 +340,9 @@ def handle(self, *args, **options) -> None: # Resolve JDBC URL for Source Database jdbc_url = get_usas_jdbc_url() if not is_from_broker else get_broker_jdbc_url() if not jdbc_url: - raise RuntimeError( - "Couldn't find JDBC url, please properly configure your CONFIG." - ) + raise RuntimeError("Couldn't find JDBC url, please properly configure your CONFIG.") if not jdbc_url.startswith("jdbc:postgresql://"): - raise ValueError( - "JDBC URL given is not in postgres JDBC URL format (e.g. jdbc:postgresql://..." 
- ) + raise ValueError("JDBC URL given is not in postgres JDBC URL format (e.g. jdbc:postgresql://...") # If a partition_column is present, read from jdbc using partitioning if partition_column: @@ -356,9 +353,7 @@ def handle(self, *args, **options) -> None: is_numeric_partitioning_col = False is_date_partitioning_col = True else: - raise ValueError( - "partition_column_type should be either 'numeric' or 'date'" - ) + raise ValueError("partition_column_type should be either 'numeric' or 'date'") # Read from table or view df = extract_db_data_frame( @@ -385,8 +380,9 @@ def handle(self, *args, **options) -> None: properties=get_jdbc_connection_properties(), ) - if add_hash_field: - df = df.withColumn("hash", sf.xxhash64("*")) + extra_columns = table_spec.extra_columns if table_spec.extra_columns else {} + for name, column in extra_columns.items(): + df = df.withColumn(name, column()) # Make sure that the column order defined in the Delta table schema matches # that of the Spark dataframe used to pull from the Postgres table. 
While not @@ -395,6 +391,6 @@ def handle(self, *args, **options) -> None: df = df.select(table_spec.column_names) # Write to S3 - load_delta_table(spark, df, destination_table_name, True) + load_delta_table(spark, df, destination_table_name, save_mode=save_mode, merge_condition=merge_condition) if spark_created_by_command: spark.stop() diff --git a/usaspending_api/etl/management/commands/load_transaction_normalized.py b/usaspending_api/etl/management/commands/load_transaction_normalized.py index dd89e851af..937a04f2d0 100644 --- a/usaspending_api/etl/management/commands/load_transaction_normalized.py +++ b/usaspending_api/etl/management/commands/load_transaction_normalized.py @@ -30,7 +30,7 @@ def add_arguments(parser): @staticmethod def handle(*args, **options): with prepare_spark() as spark: - fabs_loader = FABSNormalizedDeltaTransactionLoader(spark=spark, spark_s3_bucket=options["spark_s3_bucket"]) + # fabs_loader = FABSNormalizedDeltaTransactionLoader(spark=spark, spark_s3_bucket=options["spark_s3_bucket"]) fpds_loader = FPDSNormalizedDeltaTransactionLoader(spark=spark, spark_s3_bucket=options["spark_s3_bucket"]) - fabs_loader.load_transactions() + # fabs_loader.load_transactions() fpds_loader.load_transactions() diff --git a/usaspending_api/etl/table_specs.py b/usaspending_api/etl/table_specs.py index d3fc405e7c..46579952fa 100644 --- a/usaspending_api/etl/table_specs.py +++ b/usaspending_api/etl/table_specs.py @@ -2,7 +2,7 @@ from typing import Any, Callable, Literal from django.db import models -from pyspark.sql import SparkSession +from pyspark.sql import Column, SparkSession from pyspark.sql.types import StructType @@ -10,6 +10,8 @@ class TableSpec: destination_database: Literal["arc", "int", "raw", "rpt", "test"] delta_table_create_sql: str | StructType + save_mode: Literal["append", "merge", "overwrite"] = "overwrite" + merge_condition: str | Column | None = None column_names: list[str] | None = None model: models.Model | None = None is_from_broker: bool 
= False @@ -25,6 +27,17 @@ class TableSpec: delta_table_create_options: dict[str, str | bool] | None = None delta_table_create_partitions: list[str] | None = None tsvectors: dict[str, list[str]] | None = None + extra_columns: dict[str, Column] | None = None + + def __post_init__(self): + if isinstance(self.delta_table_create_sql, str): + if self.delta_table_create_partitions is not None or self.delta_table_create_options is not None: + raise TypeError( + "delta_table_create_partitions and delta_table_create_options can only be used when " + "delta_table_create_sql is a StructType." + ) + if self.save_mode == "merge" and self.merge_condition is None: + raise TypeError("merge_condition must be used when save_mode is merge") @dataclass(kw_only=True) diff --git a/usaspending_api/etl/transaction_delta_loaders/loaders.py b/usaspending_api/etl/transaction_delta_loaders/loaders.py index dc04dd0795..b8db37febd 100644 --- a/usaspending_api/etl/transaction_delta_loaders/loaders.py +++ b/usaspending_api/etl/transaction_delta_loaders/loaders.py @@ -5,14 +5,16 @@ from typing import Callable, Literal from delta import DeltaTable -from pyspark.sql import functions as sf, SparkSession, Window +from pyspark.sql import DataFrame, functions as sf, SparkSession, Window from usaspending_api.broker.helpers.build_business_categories_boolean_dict import fpds_boolean_columns from usaspending_api.broker.helpers.last_load_date import ( get_earliest_load_date, + get_latest_load_date, update_last_load_date, + get_last_load_date, ) from usaspending_api.common.data_classes import TransactionColumn from usaspending_api.common.etl.spark import create_ref_temp_views @@ -37,17 +39,20 @@ class AbstractDeltaTransactionLoader(ABC): id_col: str source_table: str col_info = list[TransactionColumn] + last_etl_load_date: datetime def __init__(self, spark, etl_level: Literal["fabs", "fpds", "normalized"], spark_s3_bucket: str) -> None: self.etl_level = etl_level + self.last_etl_load_date = 
get_last_load_date(f"transaction_{self.etl_level}") self.spark_s3_bucket: spark_s3_bucket self.spark = spark def load_transactions(self) -> None: + logger.info(f"LOADING TRANSACTIONS -- level: {self.etl_level}, last load date: {self.last_etl_load_date}") if not self.spark._jsparkSession.catalog().tableExists(f"int.transaction_{self.etl_level}"): raise Exception(f"Table: int.transaction_{self.etl_level} does not exist.") logger.info(f"Running UPSERT SQL for transaction_{self.etl_level} ETL") - self.spark.sql(self.transaction_merge_into_sql()) + self.transaction_merge() next_last_load = get_earliest_load_date( ("source_procurement_transaction", "source_assistance_transaction"), datetime.utcfromtimestamp(0) ) @@ -59,44 +64,37 @@ def build_date_format_sql(self, col: TransactionColumn, is_casted_to_date: bool regexp_mmddYYYY = r"(\\d{2})(?[-/])(\\d{2})(\\k)(\\d{4})(.\\d{2}:\\d{2}:\\d{2}([+-]\\d{2}:\\d{2})?)?" regexp_YYYYmmdd = r"(\\d{4})(?[-/]?)(\\d{2})(\\k)(\\d{2})(.\\d{2}:\\d{2}:\\d{2}([+-]\\d{2}:\\d{2})?)?" 
- mmddYYYY_fmt = f""" - (regexp_extract({self.source_table}.{col.source}, '{regexp_mmddYYYY}', 5) - || '-' || - regexp_extract({self.source_table}.{col.source}, '{regexp_mmddYYYY}', 1) - || '-' || - regexp_extract({self.source_table}.{col.source}, '{regexp_mmddYYYY}', 3)) - """ - YYYYmmdd_fmt = f""" - (regexp_extract({self.source_table}.{col.source}, '{regexp_YYYYmmdd}', 1) - || '-' || - regexp_extract({self.source_table}.{col.source}, '{regexp_YYYYmmdd}', 3) - || '-' || - regexp_extract({self.source_table}.{col.source}, '{regexp_YYYYmmdd}', 5)) - """ + mmddYYYY_fmt = sf.concat( + sf.regexp_extract(sf.col(f"{self.source_table}.{col.source}"), regexp_mmddYYYY, 5), + sf.lit("-"), + sf.regexp_extract(sf.col(f"{self.source_table}.{col.source}"), regexp_mmddYYYY, 1), + sf.lit("-"), + sf.regexp_extract(sf.col(f"{self.source_table}.{col.source}"), regexp_mmddYYYY, 3), + ) + YYYYmmdd_fmt = sf.concat( + sf.regexp_extract(sf.col(f"{self.source_table}.{col.source}"), regexp_YYYYmmdd, 1), + sf.lit("-"), + sf.regexp_extract(sf.col(f"{self.source_table}.{col.source}"), regexp_YYYYmmdd, 3), + sf.lit("-"), + sf.regexp_extract(sf.col(f"{self.source_table}.{col.source}"), regexp_YYYYmmdd, 5), + ) if is_casted_to_date: - mmddYYYY_fmt = f"""CAST({mmddYYYY_fmt} - AS DATE) - """ - YYYYmmdd_fmt = f"""CAST({YYYYmmdd_fmt} - AS DATE) - """ - - sql_snippet = f""" - CASE WHEN regexp({self.source_table}.{col.source}, '{regexp_mmddYYYY}') - THEN {mmddYYYY_fmt} - ELSE {YYYYmmdd_fmt} - END - """ - - return sql_snippet + mmddYYYY_fmt = mmddYYYY_fmt.cast("date") + YYYYmmdd_fmt = YYYYmmdd_fmt.cast("date") + + snippet = sf.when( + sf.regexp(f"{self.source_table}.{col.source}", sf.lit(regexp_mmddYYYY)), mmddYYYY_fmt + ).otherwise(YYYYmmdd_fmt) + + return snippet def handle_column(self, col: TransactionColumn, is_result_aliased=True) -> str: if col.handling == "cast": - retval = f"CAST({self.source_table}.{col.source} AS {col.delta_type})" + retval = 
sf.col(f"{self.source_table}.{col.source}").cast(col.delta_type) elif col.handling == "literal": # Use col.source directly as the value - retval = f"{col.source}" + retval = sf.lit(col.source) elif col.handling == "parse_string_datetime_to_date": # These are string fields that actually hold DATES/TIMESTAMPS and need to be cast as dates. # However, they may not be properly parsed when calling CAST(... AS DATE). @@ -107,54 +105,54 @@ def handle_column(self, col: TransactionColumn, is_result_aliased=True) -> str: retval = self.build_date_format_sql(col, is_casted_to_date=False) elif col.delta_type.upper() == "STRING": # Capitalize and remove leading & trailing whitespace from all string values - retval = f"ucase(trim({self.source_table}.{col.source}))" + retval = sf.ucase(sf.trim(sf.col(f"{self.source_table}.{col.source}"))) elif col.delta_type.upper() == "BOOLEAN" and not col.handling == "leave_null": # Unless specified, convert any nulls to false for boolean columns - retval = f"COALESCE({self.source_table}.{col.source}, FALSE)" + retval = sf.coalesce(sf.col(f"{self.source_table}.{col.source}"), sf.lit(False)) else: - retval = f"{self.source_table}.{col.source}" + retval = sf.col(f"{self.source_table}.{col.source}") # Handle scalar transformations if the column requires it if col.scalar_transformation is not None: - retval = col.scalar_transformation.format(input=retval) + retval = col.scalar_transformation(retval) - retval = f"{retval}{' AS ' + col.dest_name if is_result_aliased else ''}" + retval = retval.alias(col.dest_name) if is_result_aliased else retval return retval @property def select_columns(self) -> list[str]: - return ["CAST(NULL AS LONG) AS transaction_id"] + [ + return [sf.lit(None).cast("LONG").alias("transaction_id")] + [ self.handle_column(col) for col in self.col_info if col.dest_name != "transaction_id" ] - def source_subquery_sql(self) -> str: - select_columns_str = ",\n ".join(self.select_columns) - sql = f""" - SELECT - {select_columns_str} - 
FROM {self.source_table} - """ - return sql - - def transaction_merge_into_sql(self) -> str: - silver_table_cols = ", ".join([col.dest_name for col in self.col_info if col.dest_name != "transaction_id"]) - sql = f""" - MERGE INTO int.transaction_{self.etl_level} AS silver_table - USING ( - {self.source_subquery_sql()} - ) AS source_subquery - ON - silver_table.{self.id_col} = source_subquery.{self.id_col} - AND silver_table.hash = source_subquery.hash - WHEN NOT MATCHED - THEN INSERT - ({silver_table_cols}) - VALUES ({silver_table_cols}) - WHEN NOT MATCHED BY SOURCE - THEN DELETE - """ - - return sql + def source_subquery_df(self) -> DataFrame: + return ( + self.spark.read.format("delta") + .option("readChangeFeed", "true") + .option("startingTimestamp", self.last_etl_load_date.strftime("%Y-%m-%d %H:%M:%S")) + .table(self.source_table) + .select(self.select_columns) + ) + + def transaction_merge(self) -> None: + source = self.source_subquery_df().alias("s") + target = DeltaTable.forName(self.spark, f"int.transaction_{self.etl_level}").alias("t") + id_match_condition = f"t.{self.id_col} == s.{self.id_col}" + row_not_updated_condition = "t.hash == s.hash" + partition_pruning_conditions = "t.action_year == s.action_year AND t.action_month == s.action_month" + ( + target.merge( + source, " AND ".join([id_match_condition, row_not_updated_condition, partition_pruning_conditions]) + ) + .whenNotMatchedInsert( + values={ + col.dest_name: sf.col(f"s.{col.dest_name}") + for col in self.col_info + if col.dest_name != "transaction_id" + }, + ) + .execute() + ) class FPDSDeltaTransactionLoader(AbstractDeltaTransactionLoader): @@ -181,39 +179,51 @@ class NormalizedMixin: handle_column: Callable source_table: str etl_level: str + last_etl_load_date: datetime select_columns: list[str] to_normalized_col_info: list[TransactionColumn] normalization_type: Literal["fabs", "fpds"] - prepare_spark: Callable - def source_subquery_sql(self) -> str: - additional_joins = f""" - LEFT OUTER 
JOIN global_temp.subtier_agency AS funding_subtier_agency ON ( - funding_subtier_agency.subtier_code = {self.source_table}.funding_sub_tier_agency_co + def source_subquery_df(self) -> DataFrame: + funding_subtier_agency = self.spark.table("global_temp.subtier_agency").alias("funding_subtier_agency") + funding_agency = self.spark.table("global_temp.agency").alias("funding_agency") + awarding_subtier_agency = ( + self.spark.table("global_temp.subtier_agency") + .withColumn("awarding_subtier_agency_id", sf.col("subtier_agency_id")) + .alias("awarding_subtier_agency") + ) + awarding_agency = self.spark.table("global_temp.agency").alias("awarding_agency") + df = ( + self.spark.read.format("delta") + .option("readChangeFeed", "true") + .option("startingTimestamp", self.last_etl_load_date.strftime("%Y-%m-%d %H:%M:%S")) + .table(self.source_table) + ) + return ( + df.join( + funding_subtier_agency, + funding_subtier_agency.subtier_code == df.funding_sub_tier_agency_co, + how="leftouter", ) - LEFT OUTER JOIN global_temp.agency AS funding_agency ON ( - funding_agency.subtier_agency_id = funding_subtier_agency.subtier_agency_id + .join( + funding_agency, + funding_agency.subtier_agency_id == funding_subtier_agency.subtier_agency_id, + how="leftouter", ) - LEFT OUTER JOIN global_temp.subtier_agency AS awarding_subtier_agency ON ( - awarding_subtier_agency.subtier_code = {self.source_table}.awarding_sub_tier_agency_c + .join( + awarding_subtier_agency, + awarding_subtier_agency.subtier_code == df.awarding_sub_tier_agency_c, + how="leftouter", ) - LEFT OUTER JOIN global_temp.agency AS awarding_agency ON ( - awarding_agency.subtier_agency_id = awarding_subtier_agency.subtier_agency_id + .join( + awarding_agency, + awarding_agency.subtier_agency_id == awarding_subtier_agency.awarding_subtier_agency_id, + how="leftouter", ) - """ - - # Since the select columns may have complicated logic, put them on separate lines for debugging. 
- # However, strings inside {} expressions in f-strings can't contain backslashes, so will join them first - # before inserting into overall sql statement. - select_columns_str = ",\n ".join(self.select_columns) - return f""" - SELECT - {select_columns_str} - FROM {self.source_table} - {additional_joins} - """ - - def transaction_merge_into_sql(self) -> str: + .select(self.select_columns) + ) + + def transaction_merge(self) -> None: create_ref_temp_views(self.spark) load_datetime = datetime.now(timezone.utc) special_columns = ["create_date", "update_date"] @@ -227,33 +237,29 @@ def transaction_merge_into_sql(self) -> str: set_cols.append(f"""int.transaction_normalized.update_date = '{load_datetime.isoformat(" ")}'""") # Move create_date and update_date to the end of the list of column names for ease of handling # during record insert - insert_col_name_list = [ - col_name for col_name in TRANSACTION_NORMALIZED_COLUMNS if col_name not in special_columns - ] - insert_col_name_list.extend(special_columns) - insert_col_names = ", ".join([col_name for col_name in insert_col_name_list]) + insert_col_names = [col_name for col_name in TRANSACTION_NORMALIZED_COLUMNS if col_name not in special_columns] + insert_col_names.extend(special_columns) # On insert, all values except for create_date and update_date will come from the subquery - insert_value_list = insert_col_name_list[:-2] - insert_value_list.extend([f"""'{load_datetime.isoformat(" ")}'"""] * 2) - insert_values = ", ".join([value for value in insert_value_list]) - - sql = f""" - MERGE INTO int.transaction_normalized - USING ( - {self.source_subquery_sql()} - ) AS source_subquery - ON transaction_normalized.transaction_unique_id = source_subquery.transaction_unique_id - AND transaction_normalized.hash = source_subquery.hash - WHEN NOT MATCHED - THEN INSERT - ({insert_col_names}) - VALUES ({insert_values}) - WHEN NOT MATCHED BY SOURCE AND {'NOT' if self.normalization_type== 'fabs' else ''} transaction_normalized.is_fpds 
-        THEN DELETE
-        """
-
-        return sql
+        insert_values = [sf.col(col) for col in insert_col_names[:-2]]
+        insert_values.extend([sf.lit(load_datetime.isoformat(" "))] * 2)
+
+        target = DeltaTable.forName(self.spark, "int.transaction_normalized").alias("t")
+        id_condition = "t.transaction_unique_id = s.transaction_unique_id"
+        row_not_updated_condition = "t.hash == s.hash"
+        type_partition_condition = f"{'NOT' if self.normalization_type == 'fabs' else ''} t.is_fpds"
+        date_partition_conditions = "t.action_year == s.action_year AND t.action_month == s.action_month"
+        (
+            target.merge(
+                self.source_subquery_df().alias("s"),
+                " AND ".join(
+                    [id_condition, row_not_updated_condition, type_partition_condition, date_partition_conditions]
+                ),
+            )
+            .whenNotMatchedInsert(dict(zip(insert_col_names, insert_values)))
+            .whenNotMatchedBySourceDelete(f"{'NOT' if self.normalization_type == 'fabs' else ''} t.is_fpds")
+            .execute()
+        )
 
     def populate_transaction_normalized_ids(self) -> None:
         target = DeltaTable.forName(self.spark, "int.transaction_normalized").alias("t")
@@ -382,27 +388,27 @@ def select_columns(self) -> list[str]:
         action_date_col = next(
             filter(lambda c: c.dest_name == "action_date" and c.source == "action_date", DAP_TO_NORMALIZED_COLUMN_INFO)
         )
-        parse_action_date_sql_snippet = self.handle_column(action_date_col, is_result_aliased=False)
+        parse_action_date_snippet = self.handle_column(action_date_col, is_result_aliased=False)
         select_cols = [
-            "CAST(NULL AS LONG) AS id",
-            "CAST(NULL AS LONG) AS award_id",
-            "awarding_agency.id AS awarding_agency_id",
-            f"""CASE WHEN month({parse_action_date_sql_snippet}) > 9
-                    THEN year({parse_action_date_sql_snippet}) + 1
-                    ELSE year({parse_action_date_sql_snippet})
-                END AS fiscal_year""",
-            "funding_agency.id AS funding_agency_id",
+            sf.lit(None).cast("LONG").alias("id"),
+            sf.lit(None).cast("LONG").alias("award_id"),
+            sf.col("awarding_agency.id").alias("awarding_agency_id"),
+            
sf.when(sf.month(parse_action_date_snippet) > sf.lit(9), sf.year(parse_action_date_snippet) + sf.lit(1)) + .otherwise(sf.year(parse_action_date_snippet)) + .alias("fiscal_year"), + sf.col("funding_agency.id").alias("funding_agency_id"), + ] + fpds_business_category_columns = [ + sf.col(col) for col in fpds_boolean_columns + ["contracting_officers_deter", "domestic_or_foreign_entity"] ] - fpds_business_category_columns = copy.copy(fpds_boolean_columns) - # Add a couple of non-boolean columns that are needed in the business category logic - fpds_business_category_columns.extend(["contracting_officers_deter", "domestic_or_foreign_entity"]) named_struct_text = ", ".join([f"'{col}', {self.source_table}.{col}" for col in fpds_business_category_columns]) select_cols.extend( [ # business_categories - f"get_business_categories_fpds(named_struct({named_struct_text})) AS business_categories", + sf.expr(f"get_business_categories_fpds(named_struct({named_struct_text})) AS business_categories"), # type - f""" + sf.expr( + f""" CASE WHEN {self.source_table}.pulled_from <> 'IDV' THEN {self.source_table}.contract_award_type WHEN {self.source_table}.idv_type = 'B' AND {self.source_table}.type_of_idc IS NOT NULL THEN 'IDV_B_' || {self.source_table}.type_of_idc @@ -419,9 +425,11 @@ def select_columns(self) -> list[str]: THEN 'IDV_B_C' ELSE 'IDV_' || {self.source_table}.idv_type END AS type - """, + """ + ), # type_description - f""" + sf.expr( + f""" CASE WHEN {self.source_table}.pulled_from <> 'IDV' THEN {self.source_table}.contract_award_type_desc WHEN {self.source_table}.idv_type = 'B' @@ -432,7 +440,8 @@ def select_columns(self) -> list[str]: THEN 'INDEFINITE DELIVERY CONTRACT' ELSE {self.source_table}.idv_type_description END AS type_description - """, + """ + ), ] ) for col in DAP_TO_NORMALIZED_COLUMN_INFO: diff --git a/usaspending_api/transactions/delta_models/detached_award_procurement.py b/usaspending_api/transactions/delta_models/detached_award_procurement.py index 
b2bc834687..facbd2682a 100644
--- a/usaspending_api/transactions/delta_models/detached_award_procurement.py
+++ b/usaspending_api/transactions/delta_models/detached_award_procurement.py
@@ -93,7 +93,10 @@
     "fair_opportunity_limited_s": {"delta": "STRING", "postgres": "TEXT"},
     "fed_biz_opps": {"delta": "STRING", "postgres": "TEXT"},
     "fed_biz_opps_description": {"delta": "STRING", "postgres": "TEXT"},
-    "federal_action_obligation": {"delta": "NUMERIC(38, 18)", "postgres": "NUMERIC(38,18"},
+    "federal_action_obligation": {
+        "delta": "NUMERIC(38, 18)",
+        "postgres": "NUMERIC(38,18)",
+    },
     "federal_agency": {"delta": "BOOLEAN", "postgres": "BOOLEAN"},
     "federally_funded_research": {"delta": "BOOLEAN", "postgres": "BOOLEAN"},
     "for_profit_organization": {"delta": "BOOLEAN", "postgres": "BOOLEAN"},
@@ -311,6 +314,8 @@
 }
 DELTA_ONLY_COLUMNS = {
     "hash": "LONG",
+    "action_year": "INTEGER",
+    "action_month": "INTEGER",
 }
 DETACHED_AWARD_PROCUREMENT_DELTA_COLUMNS = {
     **{k: v["delta"] for k, v in DETACHED_AWARD_PROCUREMENT_COLUMNS.items()},
@@ -323,5 +328,7 @@
         {", ".join([f"{key} {val}" for key, val in DETACHED_AWARD_PROCUREMENT_DELTA_COLUMNS.items()])}
     )
     USING DELTA
+    PARTITIONED BY (action_year, action_month)
     LOCATION 's3a://{{SPARK_S3_BUCKET}}/{{DELTA_LAKE_S3_PATH}}/{{DESTINATION_DATABASE}}/{{DESTINATION_TABLE}}'
+    TBLPROPERTIES (delta.enableChangeDataFeed = true)
 """
diff --git a/usaspending_api/transactions/delta_models/published_fabs.py b/usaspending_api/transactions/delta_models/published_fabs.py
index 21ac24bd4d..6ad119c542 100644
--- a/usaspending_api/transactions/delta_models/published_fabs.py
+++ b/usaspending_api/transactions/delta_models/published_fabs.py
@@ -26,9 +26,15 @@
     "correction_delete_ind_desc": {"delta": "STRING", "postgres": "TEXT"},
     "correction_delete_indicatr": {"delta": "STRING", "postgres": "TEXT"},
     "created_at": {"delta": "TIMESTAMP", "postgres": "TIMESTAMP"},
-    "face_value_loan_guarantee": {"delta": "NUMERIC(38,18)", "postgres": "NUMERIC(38,18"},
+    
"face_value_loan_guarantee": { + "delta": "NUMERIC(38,18)", + "postgres": "NUMERIC(38,18", + }, "fain": {"delta": "STRING", "postgres": "TEXT"}, - "federal_action_obligation": {"delta": "NUMERIC(38,18)", "postgres": "NUMERIC(38,18"}, + "federal_action_obligation": { + "delta": "NUMERIC(38,18)", + "postgres": "NUMERIC(38,18", + }, "fiscal_year_and_quarter_co": {"delta": "STRING", "postgres": "TEXT"}, "funding_agency_code": {"delta": "STRING", "postgres": "TEXT"}, "funding_agency_name": {"delta": "STRING", "postgres": "TEXT"}, @@ -48,7 +54,10 @@ "high_comp_officer4_full_na": {"delta": "STRING", "postgres": "TEXT"}, "high_comp_officer5_amount": {"delta": "STRING", "postgres": "TEXT"}, "high_comp_officer5_full_na": {"delta": "STRING", "postgres": "TEXT"}, - "indirect_federal_sharing": {"delta": "NUMERIC(38,18)", "postgres": "NUMERIC(38,18)"}, + "indirect_federal_sharing": { + "delta": "NUMERIC(38,18)", + "postgres": "NUMERIC(38,18)", + }, "is_active": {"delta": "BOOLEAN", "postgres": "BOOLEAN"}, "is_historical": {"delta": "BOOLEAN", "postgres": "BOOLEAN"}, "legal_entity_address_line1": {"delta": "STRING", "postgres": "TEXT"}, @@ -70,8 +79,14 @@ "legal_entity_zip5": {"delta": "STRING", "postgres": "TEXT"}, "legal_entity_zip_last4": {"delta": "STRING", "postgres": "TEXT"}, "modified_at": {"delta": "TIMESTAMP", "postgres": "TIMESTAMP"}, - "non_federal_funding_amount": {"delta": "NUMERIC(38,18)", "postgres": "NUMERIC(38,18)"}, - "original_loan_subsidy_cost": {"delta": "NUMERIC(38,18)", "postgres": "NUMERIC(38,18)"}, + "non_federal_funding_amount": { + "delta": "NUMERIC(38,18)", + "postgres": "NUMERIC(38,18)", + }, + "original_loan_subsidy_cost": { + "delta": "NUMERIC(38,18)", + "postgres": "NUMERIC(38,18)", + }, "period_of_performance_curr": {"delta": "STRING", "postgres": "TEXT"}, "period_of_performance_star": {"delta": "STRING", "postgres": "TEXT"}, "place_of_perfor_state_code": {"delta": "STRING", "postgres": "TEXT"}, @@ -103,12 +118,16 @@ } DELTA_ONLY_COLUMNS = { 
"hash": "LONG", + "action_year": "INTEGER", + "action_month": "INTEGER", } PUBLISHED_FABS_DELTA_COLUMNS = { **{k: v["delta"] for k, v in PUBLISHED_FABS_COLUMNS.items()}, **DELTA_ONLY_COLUMNS, } -PUBLISHED_FABS_POSTGRES_COLUMNS = {k: v["postgres"] for k, v in PUBLISHED_FABS_COLUMNS.items()} +PUBLISHED_FABS_POSTGRES_COLUMNS = { + k: v["postgres"] for k, v in PUBLISHED_FABS_COLUMNS.items() +} published_fabs_create_sql_string = rf""" CREATE OR REPLACE TABLE {{DESTINATION_TABLE}} ( diff --git a/usaspending_api/transactions/delta_models/transaction_fpds.py b/usaspending_api/transactions/delta_models/transaction_fpds.py index 41d07e60b1..e9452d6aa2 100644 --- a/usaspending_api/transactions/delta_models/transaction_fpds.py +++ b/usaspending_api/transactions/delta_models/transaction_fpds.py @@ -1,412 +1,215 @@ +from pyspark.sql import functions as sf + from usaspending_api.common.data_classes import TransactionColumn TRANSACTION_FPDS_COLUMN_INFO = [ TransactionColumn("a_76_fair_act_action", "a_76_fair_act_action", "STRING"), - TransactionColumn( - "a_76_fair_act_action_desc", "a_76_fair_act_action_desc", "STRING" - ), - TransactionColumn( - "action_date", "action_date", "STRING", "string_datetime_remove_timestamp" - ), + TransactionColumn("a_76_fair_act_action_desc", "a_76_fair_act_action_desc", "STRING"), + TransactionColumn("action_date", "action_date", "STRING", "string_datetime_remove_timestamp"), TransactionColumn("action_type", "action_type", "STRING"), TransactionColumn("action_type_description", "action_type_description", "STRING"), TransactionColumn("agency_id", "agency_id", "STRING"), TransactionColumn("airport_authority", "airport_authority", "BOOLEAN"), - TransactionColumn( - "alaskan_native_owned_corpo", "alaskan_native_owned_corpo", "BOOLEAN" - ), - TransactionColumn( - "alaskan_native_servicing_i", "alaskan_native_servicing_i", "BOOLEAN" - ), - TransactionColumn( - "american_indian_owned_busi", "american_indian_owned_busi", "BOOLEAN" - ), + 
TransactionColumn("alaskan_native_owned_corpo", "alaskan_native_owned_corpo", "BOOLEAN"), + TransactionColumn("alaskan_native_servicing_i", "alaskan_native_servicing_i", "BOOLEAN"), + TransactionColumn("american_indian_owned_busi", "american_indian_owned_busi", "BOOLEAN"), TransactionColumn("annual_revenue", "annual_revenue", "STRING"), - TransactionColumn( - "asian_pacific_american_own", "asian_pacific_american_own", "BOOLEAN" - ), + TransactionColumn("asian_pacific_american_own", "asian_pacific_american_own", "BOOLEAN"), TransactionColumn("award_description", "award_description", "STRING"), - TransactionColumn( - "award_modification_amendme", "award_modification_amendme", "STRING" - ), + TransactionColumn("award_modification_amendme", "award_modification_amendme", "STRING"), TransactionColumn("award_or_idv_flag", "award_or_idv_flag", "STRING"), - TransactionColumn( - "awardee_or_recipient_legal", "awardee_or_recipient_legal", "STRING" - ), + TransactionColumn("awardee_or_recipient_legal", "awardee_or_recipient_legal", "STRING"), TransactionColumn("awardee_or_recipient_uei", "awardee_or_recipient_uei", "STRING"), - TransactionColumn( - "awardee_or_recipient_uniqu", "awardee_or_recipient_uniqu", "STRING" - ), + TransactionColumn("awardee_or_recipient_uniqu", "awardee_or_recipient_uniqu", "STRING"), TransactionColumn("awarding_agency_code", "awarding_agency_code", "STRING"), TransactionColumn("awarding_agency_name", "awarding_agency_name", "STRING"), TransactionColumn("awarding_office_code", "awarding_office_code", "STRING"), TransactionColumn("awarding_office_name", "awarding_office_name", "STRING"), - TransactionColumn( - "awarding_sub_tier_agency_c", "awarding_sub_tier_agency_c", "STRING" - ), - TransactionColumn( - "awarding_sub_tier_agency_n", "awarding_sub_tier_agency_n", "STRING" - ), - TransactionColumn( - "base_and_all_options_value", "base_and_all_options_value", "STRING" - ), - TransactionColumn( - "base_exercised_options_val", 
"base_exercised_options_val", "STRING" - ), - TransactionColumn( - "black_american_owned_busin", "black_american_owned_busin", "BOOLEAN" - ), - TransactionColumn( - "c1862_land_grant_college", "c1862_land_grant_college", "BOOLEAN" - ), - TransactionColumn( - "c1890_land_grant_college", "c1890_land_grant_college", "BOOLEAN" - ), - TransactionColumn( - "c1994_land_grant_college", "c1994_land_grant_college", "BOOLEAN" - ), + TransactionColumn("awarding_sub_tier_agency_c", "awarding_sub_tier_agency_c", "STRING"), + TransactionColumn("awarding_sub_tier_agency_n", "awarding_sub_tier_agency_n", "STRING"), + TransactionColumn("base_and_all_options_value", "base_and_all_options_value", "STRING"), + TransactionColumn("base_exercised_options_val", "base_exercised_options_val", "STRING"), + TransactionColumn("black_american_owned_busin", "black_american_owned_busin", "BOOLEAN"), + TransactionColumn("c1862_land_grant_college", "c1862_land_grant_college", "BOOLEAN"), + TransactionColumn("c1890_land_grant_college", "c1890_land_grant_college", "BOOLEAN"), + TransactionColumn("c1994_land_grant_college", "c1994_land_grant_college", "BOOLEAN"), TransactionColumn("c8a_program_participant", "c8a_program_participant", "BOOLEAN"), TransactionColumn("cage_code", "cage_code", "STRING"), TransactionColumn("city_local_government", "city_local_government", "BOOLEAN"), - TransactionColumn( - "clinger_cohen_act_pla_desc", "clinger_cohen_act_pla_desc", "STRING" - ), - TransactionColumn( - "clinger_cohen_act_planning", "clinger_cohen_act_planning", "STRING" - ), - TransactionColumn( - "commercial_item_acqui_desc", "commercial_item_acqui_desc", "STRING" - ), - TransactionColumn( - "commercial_item_acquisitio", "commercial_item_acquisitio", "STRING" - ), - TransactionColumn( - "commercial_item_test_desc", "commercial_item_test_desc", "STRING" - ), - TransactionColumn( - "commercial_item_test_progr", "commercial_item_test_progr", "STRING" - ), - TransactionColumn( - "community_developed_corpor", 
"community_developed_corpor", "BOOLEAN" - ), - TransactionColumn( - "community_development_corp", "community_development_corp", "BOOLEAN" - ), + TransactionColumn("clinger_cohen_act_pla_desc", "clinger_cohen_act_pla_desc", "STRING"), + TransactionColumn("clinger_cohen_act_planning", "clinger_cohen_act_planning", "STRING"), + TransactionColumn("commercial_item_acqui_desc", "commercial_item_acqui_desc", "STRING"), + TransactionColumn("commercial_item_acquisitio", "commercial_item_acquisitio", "STRING"), + TransactionColumn("commercial_item_test_desc", "commercial_item_test_desc", "STRING"), + TransactionColumn("commercial_item_test_progr", "commercial_item_test_progr", "STRING"), + TransactionColumn("community_developed_corpor", "community_developed_corpor", "BOOLEAN"), + TransactionColumn("community_development_corp", "community_development_corp", "BOOLEAN"), TransactionColumn("consolidated_contract", "consolidated_contract", "STRING"), - TransactionColumn( - "consolidated_contract_desc", "consolidated_contract_desc", "STRING" - ), - TransactionColumn( - "construction_wage_rat_desc", "construction_wage_rat_desc", "STRING" - ), - TransactionColumn( - "construction_wage_rate_req", "construction_wage_rate_req", "STRING" - ), - TransactionColumn( - "contingency_humanitar_desc", "contingency_humanitar_desc", "STRING" - ), - TransactionColumn( - "contingency_humanitarian_o", "contingency_humanitarian_o", "STRING" - ), + TransactionColumn("consolidated_contract_desc", "consolidated_contract_desc", "STRING"), + TransactionColumn("construction_wage_rat_desc", "construction_wage_rat_desc", "STRING"), + TransactionColumn("construction_wage_rate_req", "construction_wage_rate_req", "STRING"), + TransactionColumn("contingency_humanitar_desc", "contingency_humanitar_desc", "STRING"), + TransactionColumn("contingency_humanitarian_o", "contingency_humanitarian_o", "STRING"), TransactionColumn("contract_award_type", "contract_award_type", "STRING"), 
TransactionColumn("contract_award_type_desc", "contract_award_type_desc", "STRING"), TransactionColumn("contract_bundling", "contract_bundling", "STRING"), - TransactionColumn( - "contract_bundling_descrip", "contract_bundling_descrip", "STRING" - ), + TransactionColumn("contract_bundling_descrip", "contract_bundling_descrip", "STRING"), TransactionColumn("contract_financing", "contract_financing", "STRING"), - TransactionColumn( - "contract_financing_descrip", "contract_financing_descrip", "STRING" - ), - TransactionColumn( - "contracting_officers_desc", "contracting_officers_desc", "STRING" - ), - TransactionColumn( - "contracting_officers_deter", "contracting_officers_deter", "STRING" - ), + TransactionColumn("contract_financing_descrip", "contract_financing_descrip", "STRING"), + TransactionColumn("contracting_officers_desc", "contracting_officers_desc", "STRING"), + TransactionColumn("contracting_officers_deter", "contracting_officers_deter", "STRING"), TransactionColumn("contracts", "contracts", "BOOLEAN"), - TransactionColumn( - "corporate_entity_not_tax_e", "corporate_entity_not_tax_e", "BOOLEAN" - ), - TransactionColumn( - "corporate_entity_tax_exemp", "corporate_entity_tax_exemp", "BOOLEAN" - ), - TransactionColumn( - "cost_accounting_stand_desc", "cost_accounting_stand_desc", "STRING" - ), - TransactionColumn( - "cost_accounting_standards", "cost_accounting_standards", "STRING" - ), + TransactionColumn("corporate_entity_not_tax_e", "corporate_entity_not_tax_e", "BOOLEAN"), + TransactionColumn("corporate_entity_tax_exemp", "corporate_entity_tax_exemp", "BOOLEAN"), + TransactionColumn("cost_accounting_stand_desc", "cost_accounting_stand_desc", "STRING"), + TransactionColumn("cost_accounting_standards", "cost_accounting_standards", "STRING"), TransactionColumn("cost_or_pricing_data", "cost_or_pricing_data", "STRING"), - TransactionColumn( - "cost_or_pricing_data_desc", "cost_or_pricing_data_desc", "STRING" - ), + 
TransactionColumn("cost_or_pricing_data_desc", "cost_or_pricing_data_desc", "STRING"), TransactionColumn("council_of_governments", "council_of_governments", "BOOLEAN"), - TransactionColumn( - "country_of_product_or_desc", "country_of_product_or_desc", "STRING" - ), - TransactionColumn( - "country_of_product_or_serv", "country_of_product_or_serv", "STRING" - ), + TransactionColumn("country_of_product_or_desc", "country_of_product_or_desc", "STRING"), + TransactionColumn("country_of_product_or_serv", "country_of_product_or_serv", "STRING"), TransactionColumn("county_local_government", "county_local_government", "BOOLEAN"), TransactionColumn("created_at", "created_at", "TIMESTAMP"), - TransactionColumn( - "current_total_value_award", "current_total_value_award", "STRING" - ), - TransactionColumn( - "detached_award_proc_unique", "detached_award_proc_unique", "STRING" - ), - TransactionColumn( - "detached_award_procurement_id", "detached_award_procurement_id", "INTEGER" - ), + TransactionColumn("current_total_value_award", "current_total_value_award", "STRING"), + TransactionColumn("detached_award_proc_unique", "detached_award_proc_unique", "STRING"), + TransactionColumn("detached_award_procurement_id", "detached_award_procurement_id", "INTEGER"), TransactionColumn("division_name", "division_name", "STRING"), - TransactionColumn( - "division_number_or_office", "division_number_or_office", "STRING" - ), - TransactionColumn( - "dod_claimant_prog_cod_desc", "dod_claimant_prog_cod_desc", "STRING" - ), - TransactionColumn( - "dod_claimant_program_code", "dod_claimant_program_code", "STRING" - ), - TransactionColumn( - "domestic_or_foreign_e_desc", "domestic_or_foreign_e_desc", "STRING" - ), - TransactionColumn( - "domestic_or_foreign_entity", "domestic_or_foreign_entity", "STRING" - ), + TransactionColumn("division_number_or_office", "division_number_or_office", "STRING"), + TransactionColumn("dod_claimant_prog_cod_desc", "dod_claimant_prog_cod_desc", "STRING"), + 
TransactionColumn("dod_claimant_program_code", "dod_claimant_program_code", "STRING"), + TransactionColumn("domestic_or_foreign_e_desc", "domestic_or_foreign_e_desc", "STRING"), + TransactionColumn("domestic_or_foreign_entity", "domestic_or_foreign_entity", "STRING"), TransactionColumn("domestic_shelter", "domestic_shelter", "BOOLEAN"), - TransactionColumn( - "dot_certified_disadvantage", "dot_certified_disadvantage", "BOOLEAN" - ), - TransactionColumn( - "economically_disadvantaged", "economically_disadvantaged", "BOOLEAN" - ), + TransactionColumn("dot_certified_disadvantage", "dot_certified_disadvantage", "BOOLEAN"), + TransactionColumn("economically_disadvantaged", "economically_disadvantaged", "BOOLEAN"), TransactionColumn("educational_institution", "educational_institution", "BOOLEAN"), TransactionColumn("emerging_small_business", "emerging_small_business", "BOOLEAN"), TransactionColumn("entity_data_source", "entity_data_source", "STRING"), - TransactionColumn( - "epa_designated_produc_desc", "epa_designated_produc_desc", "STRING" - ), + TransactionColumn("epa_designated_produc_desc", "epa_designated_produc_desc", "STRING"), TransactionColumn("epa_designated_product", "epa_designated_product", "STRING"), TransactionColumn("evaluated_preference", "evaluated_preference", "STRING"), - TransactionColumn( - "evaluated_preference_desc", "evaluated_preference_desc", "STRING" - ), - TransactionColumn( - "extent_compete_description", "extent_compete_description", "STRING" - ), + TransactionColumn("evaluated_preference_desc", "evaluated_preference_desc", "STRING"), + TransactionColumn("extent_compete_description", "extent_compete_description", "STRING"), TransactionColumn("extent_competed", "extent_competed", "STRING"), - TransactionColumn( - "fair_opportunity_limi_desc", "fair_opportunity_limi_desc", "STRING" - ), - TransactionColumn( - "fair_opportunity_limited_s", "fair_opportunity_limited_s", "STRING" - ), + TransactionColumn("fair_opportunity_limi_desc", 
"fair_opportunity_limi_desc", "STRING"), + TransactionColumn("fair_opportunity_limited_s", "fair_opportunity_limited_s", "STRING"), TransactionColumn("fed_biz_opps", "fed_biz_opps", "STRING"), TransactionColumn("fed_biz_opps_description", "fed_biz_opps_description", "STRING"), - TransactionColumn( - "federal_action_obligation", "federal_action_obligation", "NUMERIC(23,2)" - ), + TransactionColumn("federal_action_obligation", "federal_action_obligation", "NUMERIC(23,2)"), TransactionColumn("federal_agency", "federal_agency", "BOOLEAN"), - TransactionColumn( - "federally_funded_research", "federally_funded_research", "BOOLEAN" - ), + TransactionColumn("federally_funded_research", "federally_funded_research", "BOOLEAN"), TransactionColumn("for_profit_organization", "for_profit_organization", "BOOLEAN"), TransactionColumn("foreign_funding", "foreign_funding", "STRING"), TransactionColumn("foreign_funding_desc", "foreign_funding_desc", "STRING"), TransactionColumn("foreign_government", "foreign_government", "BOOLEAN"), - TransactionColumn( - "foreign_owned_and_located", "foreign_owned_and_located", "BOOLEAN" - ), + TransactionColumn("foreign_owned_and_located", "foreign_owned_and_located", "BOOLEAN"), TransactionColumn("foundation", "foundation", "BOOLEAN"), TransactionColumn("funding_agency_code", "funding_agency_code", "STRING"), TransactionColumn("funding_agency_name", "funding_agency_name", "STRING"), TransactionColumn("funding_office_code", "funding_office_code", "STRING"), TransactionColumn("funding_office_name", "funding_office_name", "STRING"), - TransactionColumn( - "funding_sub_tier_agency_co", "funding_sub_tier_agency_co", "STRING" - ), - TransactionColumn( - "funding_sub_tier_agency_na", "funding_sub_tier_agency_na", "STRING" - ), - TransactionColumn( - "government_furnished_desc", "government_furnished_desc", "STRING" - ), - TransactionColumn( - "government_furnished_prope", "government_furnished_prope", "STRING" - ), + 
TransactionColumn("funding_sub_tier_agency_co", "funding_sub_tier_agency_co", "STRING"), + TransactionColumn("funding_sub_tier_agency_na", "funding_sub_tier_agency_na", "STRING"), + TransactionColumn("government_furnished_desc", "government_furnished_desc", "STRING"), + TransactionColumn("government_furnished_prope", "government_furnished_prope", "STRING"), TransactionColumn("grants", "grants", "BOOLEAN"), - TransactionColumn( - "hispanic_american_owned_bu", "hispanic_american_owned_bu", "BOOLEAN" - ), - TransactionColumn( - "hispanic_servicing_institu", "hispanic_servicing_institu", "BOOLEAN" - ), - TransactionColumn( - "historically_black_college", "historically_black_college", "BOOLEAN" - ), - TransactionColumn( - "historically_underutilized", "historically_underutilized", "BOOLEAN" - ), + TransactionColumn("hispanic_american_owned_bu", "hispanic_american_owned_bu", "BOOLEAN"), + TransactionColumn("hispanic_servicing_institu", "hispanic_servicing_institu", "BOOLEAN"), + TransactionColumn("historically_black_college", "historically_black_college", "BOOLEAN"), + TransactionColumn("historically_underutilized", "historically_underutilized", "BOOLEAN"), TransactionColumn("hospital_flag", "hospital_flag", "BOOLEAN"), - TransactionColumn( - "housing_authorities_public", "housing_authorities_public", "BOOLEAN" - ), + TransactionColumn("housing_authorities_public", "housing_authorities_public", "BOOLEAN"), TransactionColumn("idv_type", "idv_type", "STRING"), TransactionColumn("idv_type_description", "idv_type_description", "STRING"), - TransactionColumn( - "indian_tribe_federally_rec", "indian_tribe_federally_rec", "BOOLEAN" - ), - TransactionColumn( - "information_technolog_desc", "information_technolog_desc", "STRING" - ), - TransactionColumn( - "information_technology_com", "information_technology_com", "STRING" - ), - TransactionColumn( - "inherently_government_desc", "inherently_government_desc", "STRING" - ), - TransactionColumn( - "inherently_government_func", 
"inherently_government_func", "STRING" - ), + TransactionColumn("indian_tribe_federally_rec", "indian_tribe_federally_rec", "BOOLEAN"), + TransactionColumn("information_technolog_desc", "information_technolog_desc", "STRING"), + TransactionColumn("information_technology_com", "information_technology_com", "STRING"), + TransactionColumn("inherently_government_desc", "inherently_government_desc", "STRING"), + TransactionColumn("inherently_government_func", "inherently_government_func", "STRING"), TransactionColumn( "initial_report_date", "initial_report_date", "STRING", "string_datetime_remove_timestamp", ), - TransactionColumn( - "inter_municipal_local_gove", "inter_municipal_local_gove", "BOOLEAN" - ), - TransactionColumn( - "interagency_contract_desc", "interagency_contract_desc", "STRING" - ), - TransactionColumn( - "interagency_contracting_au", "interagency_contracting_au", "STRING" - ), - TransactionColumn( - "international_organization", "international_organization", "BOOLEAN" - ), + TransactionColumn("inter_municipal_local_gove", "inter_municipal_local_gove", "BOOLEAN"), + TransactionColumn("interagency_contract_desc", "interagency_contract_desc", "STRING"), + TransactionColumn("interagency_contracting_au", "interagency_contracting_au", "STRING"), + TransactionColumn("international_organization", "international_organization", "BOOLEAN"), TransactionColumn("interstate_entity", "interstate_entity", "BOOLEAN"), - TransactionColumn( - "joint_venture_economically", "joint_venture_economically", "BOOLEAN" - ), - TransactionColumn( - "joint_venture_women_owned", "joint_venture_women_owned", "BOOLEAN" - ), + TransactionColumn("joint_venture_economically", "joint_venture_economically", "BOOLEAN"), + TransactionColumn("joint_venture_women_owned", "joint_venture_women_owned", "BOOLEAN"), TransactionColumn("labor_standards", "labor_standards", "STRING"), TransactionColumn("labor_standards_descrip", "labor_standards_descrip", "STRING"), 
TransactionColumn("labor_surplus_area_firm", "labor_surplus_area_firm", "BOOLEAN"), TransactionColumn("last_modified", "last_modified", "STRING"), - TransactionColumn( - "legal_entity_address_line1", "legal_entity_address_line1", "STRING" - ), - TransactionColumn( - "legal_entity_address_line2", "legal_entity_address_line2", "STRING" - ), - TransactionColumn( - "legal_entity_address_line3", "legal_entity_address_line3", "STRING" - ), + TransactionColumn("legal_entity_address_line1", "legal_entity_address_line1", "STRING"), + TransactionColumn("legal_entity_address_line2", "legal_entity_address_line2", "STRING"), + TransactionColumn("legal_entity_address_line3", "legal_entity_address_line3", "STRING"), TransactionColumn("legal_entity_city_name", "legal_entity_city_name", "STRING"), - TransactionColumn( - "legal_entity_congressional", "legal_entity_congressional", "STRING" - ), + TransactionColumn("legal_entity_congressional", "legal_entity_congressional", "STRING"), TransactionColumn( "legal_entity_country_code", "legal_entity_country_code", "STRING", - scalar_transformation="CASE {input} \ - WHEN 'UNITED STATES' THEN 'USA' \ - ELSE {input} \ - END", + scalar_transformation=lambda col: sf.when(col == "UNITED STATES", "USA").otherwise(col), ), TransactionColumn( "legal_entity_country_name", "legal_entity_country_name", "STRING", - scalar_transformation="CASE \ - WHEN {input} = 'USA' THEN 'UNITED STATES' \ - WHEN COALESCE({input}, '') = '' AND legal_entity_country_code = 'UNITED STATES' THEN 'UNITED STATES' \ - ELSE {input} \ - END", + scalar_transformation=lambda col: ( + sf.when(col == "USA", sf.lit("UNITED STATES")) + .when( + (sf.coalesce(col, sf.lit("")) == sf.lit("")) + & (sf.col("legal_entity_country_code") == sf.lit("UNITED STATES")), + sf.lit("UNITED STATES"), + ) + .otherwise(col) + ), ), TransactionColumn("legal_entity_county_code", "legal_entity_county_code", "STRING"), TransactionColumn("legal_entity_county_name", "legal_entity_county_name", "STRING"), 
TransactionColumn("legal_entity_state_code", "legal_entity_state_code", "STRING"), - TransactionColumn( - "legal_entity_state_descrip", "legal_entity_state_descrip", "STRING" - ), + TransactionColumn("legal_entity_state_descrip", "legal_entity_state_descrip", "STRING"), TransactionColumn("legal_entity_zip4", "legal_entity_zip4", "STRING"), TransactionColumn("legal_entity_zip5", "legal_entity_zip5", "STRING"), TransactionColumn("legal_entity_zip_last4", "legal_entity_zip_last4", "STRING"), - TransactionColumn( - "limited_liability_corporat", "limited_liability_corporat", "BOOLEAN" - ), + TransactionColumn("limited_liability_corporat", "limited_liability_corporat", "BOOLEAN"), TransactionColumn("local_area_set_aside", "local_area_set_aside", "STRING"), - TransactionColumn( - "local_area_set_aside_desc", "local_area_set_aside_desc", "STRING" - ), + TransactionColumn("local_area_set_aside_desc", "local_area_set_aside_desc", "STRING"), TransactionColumn("local_government_owned", "local_government_owned", "BOOLEAN"), TransactionColumn("major_program", "major_program", "STRING"), TransactionColumn("manufacturer_of_goods", "manufacturer_of_goods", "BOOLEAN"), - TransactionColumn( - "materials_supplies_article", "materials_supplies_article", "STRING" - ), - TransactionColumn( - "materials_supplies_descrip", "materials_supplies_descrip", "STRING" - ), + TransactionColumn("materials_supplies_article", "materials_supplies_article", "STRING"), + TransactionColumn("materials_supplies_descrip", "materials_supplies_descrip", "STRING"), TransactionColumn("minority_institution", "minority_institution", "BOOLEAN"), TransactionColumn("minority_owned_business", "minority_owned_business", "BOOLEAN"), TransactionColumn("multi_year_contract", "multi_year_contract", "STRING"), TransactionColumn("multi_year_contract_desc", "multi_year_contract_desc", "STRING"), - TransactionColumn( - "multiple_or_single_aw_desc", "multiple_or_single_aw_desc", "STRING" - ), - TransactionColumn( - 
"multiple_or_single_award_i", "multiple_or_single_award_i", "STRING" - ), - TransactionColumn( - "municipality_local_governm", "municipality_local_governm", "BOOLEAN" - ), + TransactionColumn("multiple_or_single_aw_desc", "multiple_or_single_aw_desc", "STRING"), + TransactionColumn("multiple_or_single_award_i", "multiple_or_single_award_i", "STRING"), + TransactionColumn("municipality_local_governm", "municipality_local_governm", "BOOLEAN"), TransactionColumn("naics", "naics", "STRING"), TransactionColumn("naics_description", "naics_description", "STRING"), TransactionColumn("national_interest_action", "national_interest_action", "STRING"), TransactionColumn("national_interest_desc", "national_interest_desc", "STRING"), - TransactionColumn( - "native_american_owned_busi", "native_american_owned_busi", "BOOLEAN" - ), - TransactionColumn( - "native_hawaiian_owned_busi", "native_hawaiian_owned_busi", "BOOLEAN" - ), - TransactionColumn( - "native_hawaiian_servicing", "native_hawaiian_servicing", "BOOLEAN" - ), + TransactionColumn("native_american_owned_busi", "native_american_owned_busi", "BOOLEAN"), + TransactionColumn("native_hawaiian_owned_busi", "native_hawaiian_owned_busi", "BOOLEAN"), + TransactionColumn("native_hawaiian_servicing", "native_hawaiian_servicing", "BOOLEAN"), TransactionColumn("nonprofit_organization", "nonprofit_organization", "BOOLEAN"), TransactionColumn("number_of_actions", "number_of_actions", "STRING"), TransactionColumn("number_of_employees", "number_of_employees", "STRING"), - TransactionColumn( - "number_of_offers_received", "number_of_offers_received", "STRING" - ), - TransactionColumn( - "officer_1_amount", "high_comp_officer1_amount", "NUMERIC(23,2)", "cast" - ), + TransactionColumn("number_of_offers_received", "number_of_offers_received", "STRING"), + TransactionColumn("officer_1_amount", "high_comp_officer1_amount", "NUMERIC(23,2)", "cast"), TransactionColumn("officer_1_name", "high_comp_officer1_full_na", "STRING"), - 
TransactionColumn( - "officer_2_amount", "high_comp_officer2_amount", "NUMERIC(23,2)", "cast" - ), + TransactionColumn("officer_2_amount", "high_comp_officer2_amount", "NUMERIC(23,2)", "cast"), TransactionColumn("officer_2_name", "high_comp_officer2_full_na", "STRING"), - TransactionColumn( - "officer_3_amount", "high_comp_officer3_amount", "NUMERIC(23,2)", "cast" - ), + TransactionColumn("officer_3_amount", "high_comp_officer3_amount", "NUMERIC(23,2)", "cast"), TransactionColumn("officer_3_name", "high_comp_officer3_full_na", "STRING"), - TransactionColumn( - "officer_4_amount", "high_comp_officer4_amount", "NUMERIC(23,2)", "cast" - ), + TransactionColumn("officer_4_amount", "high_comp_officer4_amount", "NUMERIC(23,2)", "cast"), TransactionColumn("officer_4_name", "high_comp_officer4_full_na", "STRING"), - TransactionColumn( - "officer_5_amount", "high_comp_officer5_amount", "NUMERIC(23,2)", "cast" - ), + TransactionColumn("officer_5_amount", "high_comp_officer5_amount", "NUMERIC(23,2)", "cast"), TransactionColumn("officer_5_name", "high_comp_officer5_full_na", "STRING"), TransactionColumn( "ordering_period_end_date", @@ -415,245 +218,123 @@ "string_datetime_remove_timestamp", ), TransactionColumn("organizational_type", "organizational_type", "STRING"), - TransactionColumn( - "other_minority_owned_busin", "other_minority_owned_busin", "BOOLEAN" - ), - TransactionColumn( - "other_not_for_profit_organ", "other_not_for_profit_organ", "BOOLEAN" - ), - TransactionColumn( - "other_statutory_authority", "other_statutory_authority", "STRING" - ), - TransactionColumn( - "other_than_full_and_o_desc", "other_than_full_and_o_desc", "STRING" - ), - TransactionColumn( - "other_than_full_and_open_c", "other_than_full_and_open_c", "STRING" - ), + TransactionColumn("other_minority_owned_busin", "other_minority_owned_busin", "BOOLEAN"), + TransactionColumn("other_not_for_profit_organ", "other_not_for_profit_organ", "BOOLEAN"), + TransactionColumn("other_statutory_authority", 
"other_statutory_authority", "STRING"), + TransactionColumn("other_than_full_and_o_desc", "other_than_full_and_o_desc", "STRING"), + TransactionColumn("other_than_full_and_open_c", "other_than_full_and_open_c", "STRING"), TransactionColumn("parent_award_id", "parent_award_id", "STRING"), - TransactionColumn( - "partnership_or_limited_lia", "partnership_or_limited_lia", "BOOLEAN" - ), - TransactionColumn( - "performance_based_se_desc", "performance_based_se_desc", "STRING" - ), - TransactionColumn( - "performance_based_service", "performance_based_service", "STRING" - ), - TransactionColumn( - "period_of_perf_potential_e", "period_of_perf_potential_e", "STRING" - ), - TransactionColumn( - "period_of_performance_curr", "period_of_performance_curr", "STRING" - ), - TransactionColumn( - "period_of_performance_star", "period_of_performance_star", "STRING" - ), + TransactionColumn("partnership_or_limited_lia", "partnership_or_limited_lia", "BOOLEAN"), + TransactionColumn("performance_based_se_desc", "performance_based_se_desc", "STRING"), + TransactionColumn("performance_based_service", "performance_based_service", "STRING"), + TransactionColumn("period_of_perf_potential_e", "period_of_perf_potential_e", "STRING"), + TransactionColumn("period_of_performance_curr", "period_of_performance_curr", "STRING"), + TransactionColumn("period_of_performance_star", "period_of_performance_star", "STRING"), TransactionColumn("piid", "piid", "STRING"), TransactionColumn("place_of_manufacture", "place_of_manufacture", "STRING"), - TransactionColumn( - "place_of_manufacture_desc", "place_of_manufacture_desc", "STRING" - ), - TransactionColumn( - "place_of_perf_country_desc", "place_of_perf_country_desc", "STRING" - ), - TransactionColumn( - "place_of_perfor_state_desc", "place_of_perfor_state_desc", "STRING" - ), - TransactionColumn( - "place_of_perform_city_name", "place_of_perform_city_name", "STRING" - ), + TransactionColumn("place_of_manufacture_desc", "place_of_manufacture_desc", 
"STRING"), + TransactionColumn("place_of_perf_country_desc", "place_of_perf_country_desc", "STRING"), + TransactionColumn("place_of_perfor_state_desc", "place_of_perfor_state_desc", "STRING"), + TransactionColumn("place_of_perform_city_name", "place_of_perform_city_name", "STRING"), TransactionColumn( "place_of_perform_country_c", "place_of_perform_country_c", "STRING", - scalar_transformation="CASE {input} \ - WHEN 'UNITED STATES' THEN 'USA' \ - ELSE {input} \ - END", + scalar_transformation=lambda col: sf.when(col == sf.lit("UNITED STATES"), sf.lit("USA")).otherwise(col), ), TransactionColumn( "place_of_perform_country_n", "place_of_perform_country_n", "STRING", - scalar_transformation="CASE \ - WHEN {input} = 'USA' THEN 'UNITED STATES' \ - WHEN COALESCE({input}, '') = '' AND place_of_perform_country_c = 'UNITED STATES' THEN 'UNITED STATES' \ - ELSE {input} \ - END", - ), - TransactionColumn( - "place_of_perform_county_co", "place_of_perform_county_co", "STRING" - ), - TransactionColumn( - "place_of_perform_county_na", "place_of_perform_county_na", "STRING" - ), - TransactionColumn( - "place_of_perform_state_nam", "place_of_perform_state_nam", "STRING" - ), - TransactionColumn( - "place_of_perform_zip_last4", "place_of_perform_zip_last4", "STRING" - ), - TransactionColumn( - "place_of_performance_congr", "place_of_performance_congr", "STRING" - ), - TransactionColumn( - "place_of_performance_locat", "place_of_performance_locat", "STRING" - ), - TransactionColumn( - "place_of_performance_state", "place_of_performance_state", "STRING" - ), - TransactionColumn( - "place_of_performance_zip4a", "place_of_performance_zip4a", "STRING" - ), - TransactionColumn( - "place_of_performance_zip5", "place_of_performance_zip5", "STRING" - ), + scalar_transformation=lambda col: ( + sf.when(col == sf.lit("USA"), sf.lit("UNITED STATES")) + .when( + (sf.coalesce(col, sf.lit("")) == sf.lit("")) + & (sf.col("place_of_perform_country_c") == sf.lit("UNITED STATES")), + sf.lit("UNITED 
STATES"), + ) + .otherwise(col) + ), + ), + TransactionColumn("place_of_perform_county_co", "place_of_perform_county_co", "STRING"), + TransactionColumn("place_of_perform_county_na", "place_of_perform_county_na", "STRING"), + TransactionColumn("place_of_perform_state_nam", "place_of_perform_state_nam", "STRING"), + TransactionColumn("place_of_perform_zip_last4", "place_of_perform_zip_last4", "STRING"), + TransactionColumn("place_of_performance_congr", "place_of_performance_congr", "STRING"), + TransactionColumn("place_of_performance_locat", "place_of_performance_locat", "STRING"), + TransactionColumn("place_of_performance_state", "place_of_performance_state", "STRING"), + TransactionColumn("place_of_performance_zip4a", "place_of_performance_zip4a", "STRING"), + TransactionColumn("place_of_performance_zip5", "place_of_performance_zip5", "STRING"), TransactionColumn("planning_commission", "planning_commission", "BOOLEAN"), TransactionColumn("port_authority", "port_authority", "BOOLEAN"), - TransactionColumn( - "potential_total_value_awar", "potential_total_value_awar", "STRING" - ), - TransactionColumn( - "price_evaluation_adjustmen", "price_evaluation_adjustmen", "STRING" - ), - TransactionColumn( - "private_university_or_coll", "private_university_or_coll", "BOOLEAN" - ), - TransactionColumn( - "product_or_service_co_desc", "product_or_service_co_desc", "STRING" - ), + TransactionColumn("potential_total_value_awar", "potential_total_value_awar", "STRING"), + TransactionColumn("price_evaluation_adjustmen", "price_evaluation_adjustmen", "STRING"), + TransactionColumn("private_university_or_coll", "private_university_or_coll", "BOOLEAN"), + TransactionColumn("product_or_service_co_desc", "product_or_service_co_desc", "STRING"), TransactionColumn("product_or_service_code", "product_or_service_code", "STRING"), TransactionColumn("program_acronym", "program_acronym", "STRING"), - TransactionColumn( - "program_system_or_equ_desc", "program_system_or_equ_desc", "STRING" - 
), - TransactionColumn( - "program_system_or_equipmen", "program_system_or_equipmen", "STRING" - ), + TransactionColumn("program_system_or_equ_desc", "program_system_or_equ_desc", "STRING"), + TransactionColumn("program_system_or_equipmen", "program_system_or_equipmen", "STRING"), TransactionColumn("pulled_from", "pulled_from", "STRING"), - TransactionColumn( - "purchase_card_as_paym_desc", "purchase_card_as_paym_desc", "STRING" - ), - TransactionColumn( - "purchase_card_as_payment_m", "purchase_card_as_payment_m", "STRING" - ), - TransactionColumn( - "receives_contracts_and_gra", "receives_contracts_and_gra", "BOOLEAN" - ), - TransactionColumn( - "recovered_materials_s_desc", "recovered_materials_s_desc", "STRING" - ), - TransactionColumn( - "recovered_materials_sustai", "recovered_materials_sustai", "STRING" - ), - TransactionColumn( - "referenced_idv_agency_desc", "referenced_idv_agency_desc", "STRING" - ), - TransactionColumn( - "referenced_idv_agency_iden", "referenced_idv_agency_iden", "STRING" - ), - TransactionColumn( - "referenced_idv_agency_name", "referenced_idv_agency_name", "STRING" - ), - TransactionColumn( - "referenced_idv_modificatio", "referenced_idv_modificatio", "STRING" - ), + TransactionColumn("purchase_card_as_paym_desc", "purchase_card_as_paym_desc", "STRING"), + TransactionColumn("purchase_card_as_payment_m", "purchase_card_as_payment_m", "STRING"), + TransactionColumn("receives_contracts_and_gra", "receives_contracts_and_gra", "BOOLEAN"), + TransactionColumn("recovered_materials_s_desc", "recovered_materials_s_desc", "STRING"), + TransactionColumn("recovered_materials_sustai", "recovered_materials_sustai", "STRING"), + TransactionColumn("referenced_idv_agency_desc", "referenced_idv_agency_desc", "STRING"), + TransactionColumn("referenced_idv_agency_iden", "referenced_idv_agency_iden", "STRING"), + TransactionColumn("referenced_idv_agency_name", "referenced_idv_agency_name", "STRING"), + TransactionColumn("referenced_idv_modificatio", 
"referenced_idv_modificatio", "STRING"), TransactionColumn("referenced_idv_type", "referenced_idv_type", "STRING"), TransactionColumn("referenced_idv_type_desc", "referenced_idv_type_desc", "STRING"), - TransactionColumn( - "referenced_mult_or_si_desc", "referenced_mult_or_si_desc", "STRING" - ), - TransactionColumn( - "referenced_mult_or_single", "referenced_mult_or_single", "STRING" - ), + TransactionColumn("referenced_mult_or_si_desc", "referenced_mult_or_si_desc", "STRING"), + TransactionColumn("referenced_mult_or_single", "referenced_mult_or_single", "STRING"), # The referenced_multi_or_single field does not appear in the django model and may have been created inadvertently # in the Delta model previously. Since it is always NULL, it is a candidate for elimination. TransactionColumn("referenced_multi_or_single", "NULL", "STRING", "literal"), TransactionColumn("research", "research", "STRING"), TransactionColumn("research_description", "research_description", "STRING"), TransactionColumn("sam_exception", "sam_exception", "STRING"), - TransactionColumn( - "sam_exception_description", "sam_exception_description", "STRING" - ), - TransactionColumn( - "sba_certified_8_a_joint_ve", "sba_certified_8_a_joint_ve", "BOOLEAN" - ), - TransactionColumn( - "school_district_local_gove", "school_district_local_gove", "BOOLEAN" - ), + TransactionColumn("sam_exception_description", "sam_exception_description", "STRING"), + TransactionColumn("sba_certified_8_a_joint_ve", "sba_certified_8_a_joint_ve", "BOOLEAN"), + TransactionColumn("school_district_local_gove", "school_district_local_gove", "BOOLEAN"), TransactionColumn("school_of_forestry", "school_of_forestry", "BOOLEAN"), TransactionColumn("sea_transportation", "sea_transportation", "STRING"), TransactionColumn("sea_transportation_desc", "sea_transportation_desc", "STRING"), - TransactionColumn( - "self_certified_small_disad", "self_certified_small_disad", "BOOLEAN" - ), - TransactionColumn( - "service_disabled_veteran_o", 
"service_disabled_veteran_o", "BOOLEAN" - ), - TransactionColumn( - "small_agricultural_coopera", "small_agricultural_coopera", "BOOLEAN" - ), - TransactionColumn( - "small_business_competitive", "small_business_competitive", "BOOLEAN" - ), - TransactionColumn( - "small_disadvantaged_busine", "small_disadvantaged_busine", "BOOLEAN" - ), + TransactionColumn("self_certified_small_disad", "self_certified_small_disad", "BOOLEAN"), + TransactionColumn("service_disabled_veteran_o", "service_disabled_veteran_o", "BOOLEAN"), + TransactionColumn("small_agricultural_coopera", "small_agricultural_coopera", "BOOLEAN"), + TransactionColumn("small_business_competitive", "small_business_competitive", "BOOLEAN"), + TransactionColumn("small_disadvantaged_busine", "small_disadvantaged_busine", "BOOLEAN"), TransactionColumn("sole_proprietorship", "sole_proprietorship", "BOOLEAN"), TransactionColumn("solicitation_date", "solicitation_date", "DATE", "cast"), TransactionColumn("solicitation_identifier", "solicitation_identifier", "STRING"), - TransactionColumn( - "solicitation_procedur_desc", "solicitation_procedur_desc", "STRING" - ), + TransactionColumn("solicitation_procedur_desc", "solicitation_procedur_desc", "STRING"), TransactionColumn("solicitation_procedures", "solicitation_procedures", "STRING"), - TransactionColumn( - "state_controlled_instituti", "state_controlled_instituti", "BOOLEAN" - ), - TransactionColumn( - "subchapter_s_corporation", "subchapter_s_corporation", "BOOLEAN" - ), - TransactionColumn( - "subcontinent_asian_asian_i", "subcontinent_asian_asian_i", "BOOLEAN" - ), + TransactionColumn("state_controlled_instituti", "state_controlled_instituti", "BOOLEAN"), + TransactionColumn("subchapter_s_corporation", "subchapter_s_corporation", "BOOLEAN"), + TransactionColumn("subcontinent_asian_asian_i", "subcontinent_asian_asian_i", "BOOLEAN"), TransactionColumn("subcontracting_plan", "subcontracting_plan", "STRING"), TransactionColumn("subcontracting_plan_desc", 
"subcontracting_plan_desc", "STRING"), TransactionColumn("the_ability_one_program", "the_ability_one_program", "BOOLEAN"), TransactionColumn("total_obligated_amount", "total_obligated_amount", "STRING"), -<<<<<<< HEAD TransactionColumn("township_local_government", "township_local_government", "BOOLEAN"), TransactionColumn("transaction_id", None, "LONG"), -======= - TransactionColumn( - "township_local_government", "township_local_government", "BOOLEAN" - ), - TransactionColumn("transaction_id", None, "LONG NOT NULL"), ->>>>>>> ftr/table-spec-dataclass TransactionColumn("transaction_number", "transaction_number", "STRING"), TransactionColumn("transit_authority", "transit_authority", "BOOLEAN"), TransactionColumn("tribal_college", "tribal_college", "BOOLEAN"), TransactionColumn("tribally_owned_business", "tribally_owned_business", "BOOLEAN"), - TransactionColumn( - "type_of_contract_pric_desc", "type_of_contract_pric_desc", "STRING" - ), + TransactionColumn("type_of_contract_pric_desc", "type_of_contract_pric_desc", "STRING"), TransactionColumn("type_of_contract_pricing", "type_of_contract_pricing", "STRING"), TransactionColumn("type_of_idc", "type_of_idc", "STRING"), TransactionColumn("type_of_idc_description", "type_of_idc_description", "STRING"), TransactionColumn("type_set_aside", "type_set_aside", "STRING"), - TransactionColumn( - "type_set_aside_description", "type_set_aside_description", "STRING" - ), - TransactionColumn( - "ultimate_parent_legal_enti", "ultimate_parent_legal_enti", "STRING" - ), + TransactionColumn("type_set_aside_description", "type_set_aside_description", "STRING"), + TransactionColumn("ultimate_parent_legal_enti", "ultimate_parent_legal_enti", "STRING"), TransactionColumn("ultimate_parent_uei", "ultimate_parent_uei", "STRING"), - TransactionColumn( - "ultimate_parent_unique_ide", "ultimate_parent_unique_ide", "STRING" - ), + TransactionColumn("ultimate_parent_unique_ide", "ultimate_parent_unique_ide", "STRING"), 
TransactionColumn("undefinitized_action", "undefinitized_action", "STRING"), - TransactionColumn( - "undefinitized_action_desc", "undefinitized_action_desc", "STRING" - ), + TransactionColumn("undefinitized_action_desc", "undefinitized_action_desc", "STRING"), TransactionColumn("unique_award_key", "unique_award_key", "STRING"), TransactionColumn("updated_at", "updated_at", "TIMESTAMP"), TransactionColumn("us_federal_government", "us_federal_government", "BOOLEAN"), @@ -662,32 +343,22 @@ TransactionColumn("us_state_government", "us_state_government", "BOOLEAN"), TransactionColumn("us_tribal_government", "us_tribal_government", "BOOLEAN"), TransactionColumn("vendor_alternate_name", "vendor_alternate_name", "STRING"), - TransactionColumn( - "vendor_alternate_site_code", "vendor_alternate_site_code", "STRING" - ), - TransactionColumn( - "vendor_doing_as_business_n", "vendor_doing_as_business_n", "STRING" - ), + TransactionColumn("vendor_alternate_site_code", "vendor_alternate_site_code", "STRING"), + TransactionColumn("vendor_doing_as_business_n", "vendor_doing_as_business_n", "STRING"), TransactionColumn("vendor_enabled", "vendor_enabled", "STRING"), TransactionColumn("vendor_fax_number", "vendor_fax_number", "STRING"), TransactionColumn("vendor_legal_org_name", "vendor_legal_org_name", "STRING"), - TransactionColumn( - "vendor_location_disabled_f", "vendor_location_disabled_f", "STRING" - ), + TransactionColumn("vendor_location_disabled_f", "vendor_location_disabled_f", "STRING"), TransactionColumn("vendor_phone_number", "vendor_phone_number", "STRING"), TransactionColumn("vendor_site_code", "vendor_site_code", "STRING"), TransactionColumn("veteran_owned_business", "veteran_owned_business", "BOOLEAN"), TransactionColumn("veterinary_college", "veterinary_college", "BOOLEAN"), TransactionColumn("veterinary_hospital", "veterinary_hospital", "BOOLEAN"), TransactionColumn("woman_owned_business", "woman_owned_business", "BOOLEAN"), -<<<<<<< HEAD 
TransactionColumn("women_owned_small_business", "women_owned_small_business", "BOOLEAN"), TransactionColumn("hash", "hash", "LONG"), -======= - TransactionColumn( - "women_owned_small_business", "women_owned_small_business", "BOOLEAN" - ), ->>>>>>> ftr/table-spec-dataclass + TransactionColumn("action_year", "action_year", "INTEGER"), + TransactionColumn("action_month", "action_month", "INTEGER"), ] TRANSACTION_FPDS_COLUMNS = [col.dest_name for col in TRANSACTION_FPDS_COLUMN_INFO] @@ -716,9 +387,7 @@ ] TRANSACTION_FPDS_VIEW_COLUMNS = [ - col.dest_name - for col in TRANSACTION_FPDS_COLUMN_INFO - if col.dest_name not in delta_columns_not_in_view + col.dest_name for col in TRANSACTION_FPDS_COLUMN_INFO if col.dest_name not in delta_columns_not_in_view ] transaction_fpds_sql_string = rf""" @@ -726,33 +395,27 @@ {", ".join([f"{col.dest_name} {col.delta_type}" for col in TRANSACTION_FPDS_COLUMN_INFO])} ) USING DELTA + PARTITIONED BY (action_year, action_month) LOCATION 's3a://{{SPARK_S3_BUCKET}}/{{DELTA_LAKE_S3_PATH}}/{{DESTINATION_DATABASE}}/{{DESTINATION_TABLE}}' + TBLPROPERTIES (delta.enableChangeDataFeed = true) """ # Mapping from raw.detached_award_procurement to int.transaction_normalized columns, where a simple mapping exists DAP_TO_NORMALIZED_COLUMN_INFO = [ - TransactionColumn( - "action_date", "action_date", "DATE", "parse_string_datetime_to_date" - ), + TransactionColumn("action_date", "action_date", "DATE", "parse_string_datetime_to_date"), TransactionColumn("action_type", "action_type", "STRING"), TransactionColumn("action_type_description", "action_type_description", "STRING"), TransactionColumn("certified_date", "NULL", "DATE", "literal"), TransactionColumn("description", "award_description", "STRING"), TransactionColumn("face_value_loan_guarantee", "NULL", "NUMERIC(23, 2)", "literal"), - TransactionColumn( - "federal_action_obligation", "federal_action_obligation", "NUMERIC(23,2)" - ), + TransactionColumn("federal_action_obligation", 
"federal_action_obligation", "NUMERIC(23,2)"), TransactionColumn("funding_amount", "NULL", "NUMERIC(23, 2)", "literal"), TransactionColumn("indirect_federal_sharing", "NULL", "NUMERIC(23, 2)", "literal"), TransactionColumn("is_fpds", "TRUE", "BOOLEAN", "literal"), TransactionColumn("last_modified_date", "last_modified", "TIMESTAMP", "cast"), TransactionColumn("modification_number", "award_modification_amendme", "STRING"), - TransactionColumn( - "non_federal_funding_amount", "NULL", "NUMERIC(23, 2)", "literal" - ), - TransactionColumn( - "original_loan_subsidy_cost", "NULL", "NUMERIC(23, 2)", "literal" - ), + TransactionColumn("non_federal_funding_amount", "NULL", "NUMERIC(23, 2)", "literal"), + TransactionColumn("original_loan_subsidy_cost", "NULL", "NUMERIC(23, 2)", "literal"), # All period_of_performance_* fields seen as: YYYY-MM-DD 00:00:00, so cast works # BUT it's still just a string and could morph, so defensively smart-date-parsing the string TransactionColumn( @@ -771,4 +434,6 @@ TransactionColumn("unique_award_key", "unique_award_key", "STRING"), TransactionColumn("usaspending_unique_transaction_id", "NULL", "STRING", "literal"), TransactionColumn("hash", "hash", "LONG"), + TransactionColumn("action_year", "action_year", "INTEGER"), + TransactionColumn("action_month", "action_month", "INTEGER"), ] diff --git a/usaspending_api/transactions/delta_models/transaction_normalized.py b/usaspending_api/transactions/delta_models/transaction_normalized.py index 91471e146c..c685f467f2 100644 --- a/usaspending_api/transactions/delta_models/transaction_normalized.py +++ b/usaspending_api/transactions/delta_models/transaction_normalized.py @@ -29,6 +29,8 @@ "update_date": "TIMESTAMP", "usaspending_unique_transaction_id": "STRING", "hash": "LONG", + "action_year": "INTEGER", + "action_month": "INTEGER", } transaction_normalized_sql_string = rf""" @@ -36,5 +38,7 @@ {", ".join([f"{key} {val}" for key, val in TRANSACTION_NORMALIZED_COLUMNS.items()])} ) USING DELTA + 
PARTITIONED BY (is_fpds, action_year, action_month) LOCATION 's3a://{{SPARK_S3_BUCKET}}/{{DELTA_LAKE_S3_PATH}}/{{DESTINATION_DATABASE}}/{{DESTINATION_TABLE}}' + TBLPROPERTIES (delta.enableChangeDataFeed = true) """ From d2bb7cf91329b934f787b4ee33dcb186f4a0d959 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Thu, 19 Feb 2026 16:19:54 -0600 Subject: [PATCH 50/59] [DEV-14453] - Reworking loaders to add partitions and use CDF --- usaspending_api/awards/delta_models/awards.py | 1 + usaspending_api/common/data_classes.py | 2 +- .../commands/load_table_to_delta.py | 169 ++++++++---------- .../commands/load_transaction_normalized.py | 4 +- .../etl/transaction_delta_loaders/loaders.py | 165 +++++++++-------- .../etl/transaction_delta_loaders/utils.py | 25 +++ .../delta_models/published_fabs.py | 6 +- .../delta_models/transaction_fabs.py | 52 +++--- .../delta_models/transaction_fpds.py | 16 +- 9 files changed, 232 insertions(+), 208 deletions(-) create mode 100644 usaspending_api/etl/transaction_delta_loaders/utils.py diff --git a/usaspending_api/awards/delta_models/awards.py b/usaspending_api/awards/delta_models/awards.py index bee808269e..d87124862b 100644 --- a/usaspending_api/awards/delta_models/awards.py +++ b/usaspending_api/awards/delta_models/awards.py @@ -55,4 +55,5 @@ ) USING DELTA LOCATION 's3a://{{SPARK_S3_BUCKET}}/{{DELTA_LAKE_S3_PATH}}/{{DESTINATION_DATABASE}}/{{DESTINATION_TABLE}}' + TBLPROPERTIES (delta.enableChangeDataFeed = true) """ diff --git a/usaspending_api/common/data_classes.py b/usaspending_api/common/data_classes.py index 9a7416814d..ebef5ade96 100644 --- a/usaspending_api/common/data_classes.py +++ b/usaspending_api/common/data_classes.py @@ -35,7 +35,7 @@ def robust_order_by_fields(self) -> tuple[str] | tuple[str, str]: @dataclass class TransactionColumn: dest_name: str - source: Optional[str] + source: str | bool | None delta_type: str handling: Literal[ "cast", "leave_null", "literal", "normal", "parse_string_datetime_to_date", 
"string_datetime_remove_timestamp" diff --git a/usaspending_api/etl/management/commands/load_table_to_delta.py b/usaspending_api/etl/management/commands/load_table_to_delta.py index 615001881f..8b9397a08f 100644 --- a/usaspending_api/etl/management/commands/load_table_to_delta.py +++ b/usaspending_api/etl/management/commands/load_table_to_delta.py @@ -36,6 +36,7 @@ ) from usaspending_api.config import CONFIG from usaspending_api.etl.table_specs import TableSpec +from usaspending_api.etl.transaction_delta_loaders.utils import parse_date_column from usaspending_api.recipient.delta_models import ( RECIPIENT_LOOKUP_COLUMNS, RECIPIENT_PROFILE_DELTA_COLUMNS, @@ -82,7 +83,6 @@ is_partition_column_unique=True, delta_table_create_sql=awards_sql_string, column_names=list(AWARDS_COLUMNS), - # delta_table_create_partitions=["fiscal_year"], ), "detached_award_procurement": TableSpec( model=SourceProcurementTransaction, @@ -124,8 +124,6 @@ is_partition_column_unique=True, delta_table_create_sql=transaction_fabs_sql_string, column_names=TRANSACTION_FABS_VIEW_COLUMNS, - # add_hash_field=True, - # delta_table_create_partitions=["action_year", "action_month"], ), "published_fabs": TableSpec( model=SourceAssistanceTransaction, @@ -139,11 +137,9 @@ column_names=list(PUBLISHED_FABS_DELTA_COLUMNS), extra_columns={ "hash": lambda: sf.xxhash64("*"), - "action_year": lambda: sf.year(sf.to_date("action_date")), - "action_month": lambda: sf.month(sf.to_date("action_date")), + "action_year": lambda: sf.year(parse_date_column("action_date")), + "action_month": lambda: sf.month(parse_date_column("action_date")), }, - # add_hash_field=True, - # delta_table_create_partitions=["action_year", "action_month"], ), "transaction_fpds": TableSpec( model=TransactionFPDS, @@ -158,112 +154,95 @@ column_names=TRANSACTION_FPDS_VIEW_COLUMNS, ), "transaction_normalized": TableSpec( - **{ - "model": TransactionNormalized, - "source_table": "vw_transaction_normalized", - "source_database": "int", - 
"destination_database": "raw", - "partition_column": "id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": transaction_normalized_sql_string, - "column_names": list(TRANSACTION_NORMALIZED_COLUMNS), - # "add_hash_field": True, - } + model=TransactionNormalized, + source_table="vw_transaction_normalized", + source_database="int", + destination_database="raw", + partition_column="id", + partition_column_type="numeric", + is_partition_column_unique=True, + delta_table_create_sql=transaction_normalized_sql_string, + column_names=list(TRANSACTION_NORMALIZED_COLUMNS), ), # Tables loaded in from the Broker "subaward": TableSpec( - **{ - "is_from_broker": True, - "source_table": "subaward", - "destination_database": "raw", - "partition_column": "id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": broker_subawards_sql_string, - "column_names": list(BROKER_SUBAWARDS_COLUMNS), - } + is_from_broker=True, + source_table="subaward", + destination_database="raw", + partition_column="id", + partition_column_type="numeric", + is_partition_column_unique=True, + delta_table_create_sql=broker_subawards_sql_string, + column_names=list(BROKER_SUBAWARDS_COLUMNS), ), "zips": TableSpec( - **{ - "is_from_broker": True, - "source_table": "zips", - "destination_database": "raw", - "partition_column": "zips_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": zips_sql_string, - "column_names": list(ZIPS_COLUMNS), - } + is_from_broker=True, + source_table="zips", + destination_database="raw", + partition_column="zips_id", + partition_column_type="numeric", + is_partition_column_unique=True, + delta_table_create_sql=zips_sql_string, + column_names=list(ZIPS_COLUMNS), ), # Additional definitions for use in testing; # These are copies of Views / Materialized Views / Tables from Postgres to Spark to aid in # data comparison between 
current Postgres data and the data transformed via Spark. "award_search_testing": TableSpec( - **{ - "model": AwardSearch, - "source_table": "award_search", - "destination_database": "rpt", - "partition_column": "award_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": award_search_create_sql_string, - "custom_schema": "total_covid_outlay NUMERIC(23,2), total_covid_obligation NUMERIC(23,2), recipient_hash " - "STRING, federal_accounts STRING, cfdas ARRAY, tas_components ARRAY", - "column_names": list(AWARD_SEARCH_COLUMNS), - } + model=AwardSearch, + source_table="award_search", + destination_database="rpt", + partition_column="award_id", + partition_column_type="numeric", + is_partition_column_unique=True, + delta_table_create_sql=award_search_create_sql_string, + custom_schema="total_covid_outlay NUMERIC(23,2), total_covid_obligation NUMERIC(23,2), recipient_hash " + "STRING, federal_accounts STRING, cfdas ARRAY, tas_components ARRAY", + column_names=list(AWARD_SEARCH_COLUMNS), ), "recipient_lookup_testing": TableSpec( - **{ - "model": RecipientLookup, - "source_table": "recipient_lookup", - "source_database": "rpt", - "destination_database": "raw", - "partition_column": "id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": recipient_lookup_create_sql_string, - "custom_schema": "recipient_hash STRING", - "column_names": list(RECIPIENT_LOOKUP_COLUMNS), - } + model=RecipientLookup, + source_table="recipient_lookup", + source_database="rpt", + destination_database="raw", + partition_column="id", + partition_column_type="numeric", + is_partition_column_unique=True, + delta_table_create_sql=recipient_lookup_create_sql_string, + custom_schema="recipient_hash STRING", + column_names=list(RECIPIENT_LOOKUP_COLUMNS), ), "recipient_profile_testing": TableSpec( - **{ - "model": RecipientProfile, - "source_table": "recipient_profile", - "source_database": "rpt", - 
"destination_database": "raw", - "partition_column": "id", - "partition_column_type": "numeric", - "delta_table_create_sql": recipient_profile_create_sql_string, - "is_partition_column_unique": True, - "custom_schema": "recipient_hash STRING", - "column_names": list(RECIPIENT_PROFILE_DELTA_COLUMNS), - } + model=RecipientProfile, + source_table="recipient_profile", + source_database="rpt", + destination_database="raw", + partition_column="id", + partition_column_type="numeric", + delta_table_create_sql=recipient_profile_create_sql_string, + is_partition_column_unique=True, + custom_schema="recipient_hash STRING", + column_names=list(RECIPIENT_PROFILE_DELTA_COLUMNS), ), "sam_recipient_testing": TableSpec( - **{ - "model": DUNS, - "source_table": "duns", - "source_database": "int", - "destination_database": "raw", - "delta_table_create_sql": sam_recipient_create_sql_string, - "custom_schema": "broker_duns_id STRING, business_types_codes ARRAY", - "column_names": list(SAM_RECIPIENT_COLUMNS), - } + model=DUNS, + source_table="duns", + source_database="int", + destination_database="raw", + delta_table_create_sql=sam_recipient_create_sql_string, + custom_schema="broker_duns_id STRING, business_types_codes ARRAY", + column_names=list(SAM_RECIPIENT_COLUMNS), ), "transaction_search_testing": TableSpec( - **{ - "model": TransactionSearch, - "source_table": "transaction_search", - "destination_database": "test", - "partition_column": "transaction_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": transaction_search_create_sql_string, - "custom_schema": "recipient_hash STRING, federal_accounts STRING, parent_recipient_hash STRING", - "column_names": list(TRANSACTION_SEARCH_POSTGRES_COLUMNS), - } + model=TransactionSearch, + source_table="transaction_search", + destination_database="test", + partition_column="transaction_id", + partition_column_type="numeric", + is_partition_column_unique=True, + 
delta_table_create_sql=transaction_search_create_sql_string, + custom_schema="recipient_hash STRING, federal_accounts STRING, parent_recipient_hash STRING", + column_names=list(TRANSACTION_SEARCH_POSTGRES_COLUMNS), ), } diff --git a/usaspending_api/etl/management/commands/load_transaction_normalized.py b/usaspending_api/etl/management/commands/load_transaction_normalized.py index 937a04f2d0..dd89e851af 100644 --- a/usaspending_api/etl/management/commands/load_transaction_normalized.py +++ b/usaspending_api/etl/management/commands/load_transaction_normalized.py @@ -30,7 +30,7 @@ def add_arguments(parser): @staticmethod def handle(*args, **options): with prepare_spark() as spark: - # fabs_loader = FABSNormalizedDeltaTransactionLoader(spark=spark, spark_s3_bucket=options["spark_s3_bucket"]) + fabs_loader = FABSNormalizedDeltaTransactionLoader(spark=spark, spark_s3_bucket=options["spark_s3_bucket"]) fpds_loader = FPDSNormalizedDeltaTransactionLoader(spark=spark, spark_s3_bucket=options["spark_s3_bucket"]) - # fabs_loader.load_transactions() + fabs_loader.load_transactions() fpds_loader.load_transactions() diff --git a/usaspending_api/etl/transaction_delta_loaders/loaders.py b/usaspending_api/etl/transaction_delta_loaders/loaders.py index b8db37febd..5f2370cf27 100644 --- a/usaspending_api/etl/transaction_delta_loaders/loaders.py +++ b/usaspending_api/etl/transaction_delta_loaders/loaders.py @@ -1,29 +1,33 @@ -import copy import logging from abc import ABC from datetime import datetime, timezone from typing import Callable, Literal from delta import DeltaTable -from pyspark.sql import DataFrame, functions as sf, SparkSession, Window +from pyspark.sql import Column, DataFrame, functions as sf, SparkSession, Window +from pyspark.sql.types import ArrayType, StringType from usaspending_api.broker.helpers.build_business_categories_boolean_dict import fpds_boolean_columns +from usaspending_api.broker.helpers.get_business_categories import ( + get_business_categories_fabs, + 
get_business_categories_fpds, +) from usaspending_api.broker.helpers.last_load_date import ( get_earliest_load_date, - get_latest_load_date, - update_last_load_date, get_last_load_date, + update_last_load_date, ) from usaspending_api.common.data_classes import TransactionColumn from usaspending_api.common.etl.spark import create_ref_temp_views - +from usaspending_api.etl.transaction_delta_loaders.utils import parse_date_column from usaspending_api.transactions.delta_models.transaction_fabs import ( FABS_TO_NORMALIZED_COLUMN_INFO, TRANSACTION_FABS_COLUMN_INFO, ) + from usaspending_api.transactions.delta_models.transaction_fpds import ( DAP_TO_NORMALIZED_COLUMN_INFO, TRANSACTION_FPDS_COLUMN_INFO, @@ -58,51 +62,20 @@ def load_transactions(self) -> None: ) update_last_load_date(f"transaction_{self.etl_level}", next_last_load) - def build_date_format_sql(self, col: TransactionColumn, is_casted_to_date: bool = True) -> str: - # Each of these regexps allows for an optional timestamp portion, separated from the date by some character, - # and the timestamp allows for an optional UTC offset. In any case, the timestamp is ignored, though. - regexp_mmddYYYY = r"(\\d{2})(?[-/])(\\d{2})(\\k)(\\d{4})(.\\d{2}:\\d{2}:\\d{2}([+-]\\d{2}:\\d{2})?)?" - regexp_YYYYmmdd = r"(\\d{4})(?[-/]?)(\\d{2})(\\k)(\\d{2})(.\\d{2}:\\d{2}:\\d{2}([+-]\\d{2}:\\d{2})?)?" 
- - mmddYYYY_fmt = sf.concat( - sf.regexp_extract(sf.col(f"{self.source_table}.{col.source}"), regexp_mmddYYYY, 5), - sf.lit("-"), - sf.regexp_extract(sf.col(f"{self.source_table}.{col.source}"), regexp_mmddYYYY, 1), - sf.lit("-"), - sf.regexp_extract(sf.col(f"{self.source_table}.{col.source}"), regexp_mmddYYYY, 3), - ) - YYYYmmdd_fmt = sf.concat( - sf.regexp_extract(sf.col(f"{self.source_table}.{col.source}"), regexp_YYYYmmdd, 1), - sf.lit("-"), - sf.regexp_extract(sf.col(f"{self.source_table}.{col.source}"), regexp_YYYYmmdd, 3), - sf.lit("-"), - sf.regexp_extract(sf.col(f"{self.source_table}.{col.source}"), regexp_YYYYmmdd, 5), - ) - - if is_casted_to_date: - mmddYYYY_fmt = mmddYYYY_fmt.cast("date") - YYYYmmdd_fmt = YYYYmmdd_fmt.cast("date") - - snippet = sf.when( - sf.regexp(f"{self.source_table}.{col.source}", sf.lit(regexp_mmddYYYY)), mmddYYYY_fmt - ).otherwise(YYYYmmdd_fmt) - - return snippet - - def handle_column(self, col: TransactionColumn, is_result_aliased=True) -> str: + def handle_column(self, col: TransactionColumn, is_result_aliased=True) -> Column: if col.handling == "cast": retval = sf.col(f"{self.source_table}.{col.source}").cast(col.delta_type) elif col.handling == "literal": # Use col.source directly as the value - retval = sf.lit(col.source) + retval = sf.lit(col.source).cast(col.delta_type) elif col.handling == "parse_string_datetime_to_date": # These are string fields that actually hold DATES/TIMESTAMPS and need to be cast as dates. # However, they may not be properly parsed when calling CAST(... AS DATE). 
- retval = self.build_date_format_sql(col, is_casted_to_date=True) + retval = parse_date_column(col.source, table=self.source_table, is_casted_to_date=True) elif col.handling == "string_datetime_remove_timestamp": # These are string fields that actually hold DATES/TIMESTAMPS, but need the non-DATE part discarded, # even though they remain as strings - retval = self.build_date_format_sql(col, is_casted_to_date=False) + retval = parse_date_column(col.source, table=self.source_table, is_casted_to_date=False) elif col.delta_type.upper() == "STRING": # Capitalize and remove leading & trailing whitespace from all string values retval = sf.ucase(sf.trim(sf.col(f"{self.source_table}.{col.source}"))) @@ -120,30 +93,53 @@ def handle_column(self, col: TransactionColumn, is_result_aliased=True) -> str: return retval @property - def select_columns(self) -> list[str]: + def select_columns(self) -> list[Column]: return [sf.lit(None).cast("LONG").alias("transaction_id")] + [ self.handle_column(col) for col in self.col_info if col.dest_name != "transaction_id" ] - def source_subquery_df(self) -> DataFrame: + def to_insert_df(self) -> DataFrame: + window_spec = Window.partitionBy(self.id_col) return ( self.spark.read.format("delta") .option("readChangeFeed", "true") .option("startingTimestamp", self.last_etl_load_date.strftime("%Y-%m-%d %H:%M:%S")) .table(self.source_table) + .withColumn("latest_version", sf.max("_commit_version").over(window_spec)) + .filter( + sf.col("_change_type").isin(["insert", "update_postimage"]) + & (sf.col("_commit_version") == sf.col("latest_version")) + ) .select(self.select_columns) ) + def to_delete_df(self) -> DataFrame: + version_window = Window.partitionBy(self.id_col, "hash", "_commit_version") + transaction_window = Window.partitionBy(self.id_col, "hash") + return ( + self.spark.read.format("delta") + .option("readChangeFeed", "true") + .option("startingTimestamp", self.last_etl_load_date.strftime("%Y-%m-%d %H:%M:%S")) + .table(self.source_table) 
+ .withColumn("latest_version", sf.max(sf.col("_commit_version")).over(transaction_window)) + .withColumn("has_insert", sf.max(sf.col("_change_type") == "insert").over(version_window)) + .filter( + (sf.col("_change_type") == sf.lit("delete")) + & (sf.col("_commit_version") == sf.col("latest_version")) + & ~sf.col("has_insert") + ) + .select(self.id_col, "hash", "action_year", "action_month") + ) + def transaction_merge(self) -> None: - source = self.source_subquery_df().alias("s") + source = self.to_insert_df().alias("s") + logger.info(f"number of rows: {source.count()}") target = DeltaTable.forName(self.spark, f"int.transaction_{self.etl_level}").alias("t") - id_match_condition = f"t.{self.id_col} == s.{self.id_col}" - row_not_updated_condition = "t.hash == s.hash" + id_condition = f"t.{self.id_col} == s.{self.id_col}" + hash_condition = "t.hash == s.hash" partition_pruning_conditions = "t.action_year == s.action_year AND t.action_month == s.action_month" ( - target.merge( - source, " AND ".join([id_match_condition, row_not_updated_condition, partition_pruning_conditions]) - ) + target.merge(source, " AND ".join([id_condition, hash_condition, partition_pruning_conditions])) .whenNotMatchedInsert( values={ col.dest_name: sf.col(f"s.{col.dest_name}") @@ -153,6 +149,14 @@ def transaction_merge(self) -> None: ) .execute() ) + ( + target.merge( + self.to_delete_df().alias("s"), + " AND ".join([id_condition, hash_condition, partition_pruning_conditions]), + ) + .whenMatchedDelete() + .execute() + ) class FPDSDeltaTransactionLoader(AbstractDeltaTransactionLoader): @@ -178,6 +182,8 @@ class NormalizedMixin: spark: SparkSession handle_column: Callable source_table: str + id_col: str + source_id_col: str etl_level: str last_etl_load_date: datetime select_columns: list[str] @@ -193,13 +199,19 @@ def source_subquery_df(self) -> DataFrame: .alias("awarding_subtier_agency") ) awarding_agency = self.spark.table("global_temp.agency").alias("awarding_agency") + window_spec = 
Window.partitionBy(self.source_id_col) df = ( self.spark.read.format("delta") .option("readChangeFeed", "true") .option("startingTimestamp", self.last_etl_load_date.strftime("%Y-%m-%d %H:%M:%S")) .table(self.source_table) + .withColumn("latest_version", sf.max("_commit_version").over(window_spec)) + .filter( + sf.col("_change_type").isin(["insert", "update_postimage"]) + & (sf.col("_commit_version") == sf.col("latest_version")) + ) ) - return ( + result = ( df.join( funding_subtier_agency, funding_subtier_agency.subtier_code == df.funding_sub_tier_agency_co, @@ -222,6 +234,7 @@ def source_subquery_df(self) -> DataFrame: ) .select(self.select_columns) ) + return result def transaction_merge(self) -> None: create_ref_temp_views(self.spark) @@ -242,7 +255,7 @@ def transaction_merge(self) -> None: # On insert, all values except for create_date and update_date will come from the subquery insert_values = [sf.col(col) for col in insert_col_names[:-2]] - insert_values.extend([sf.lit(f"""'{load_datetime.isoformat(" ")}'""")] * 2) + insert_values.extend([sf.lit(f"{load_datetime.isoformat(sep=' ')}")] * 2) target = DeltaTable.forName(self.spark, "int.transaction_normalized").alias("t") id_condition = "t.transaction_unique_id = s.transaction_unique_id" @@ -256,8 +269,7 @@ def transaction_merge(self) -> None: [id_condition, row_not_updated_condition, type_partition_condition, date_partition_conditions] ), ) - .whenNotMatchedInsert(dict(zip(insert_col_names, insert_values))) - .whenNotMatchedBySourceDelete(f"{'NOT' if self.normalization_type== 'fabs' else ''} t.is_fpds") + .whenNotMatchedInsert(values=dict(zip(insert_col_names, insert_values))) .execute() ) @@ -333,6 +345,7 @@ class FABSNormalizedDeltaTransactionLoader(NormalizedMixin, AbstractDeltaTransac def __init__(self, spark: SparkSession, spark_s3_bucket: str) -> None: super().__init__(spark=spark, etl_level="normalized", spark_s3_bucket=spark_s3_bucket) self.id_col = "transaction_unique_id" + self.source_id_col = 
"afa_generated_unique" self.source_table = "raw.published_fabs" self.to_normalized_col_info = FABS_TO_NORMALIZED_COLUMN_INFO self.normalization_type = "fabs" @@ -342,32 +355,26 @@ def select_columns(self) -> list[str]: action_date_col = next( filter(lambda c: c.dest_name == "action_date" and c.source == "action_date", FABS_TO_NORMALIZED_COLUMN_INFO) ) - parse_action_date_sql_snippet = self.handle_column(action_date_col, is_result_aliased=False) + parse_action_date_snippet = self.handle_column(action_date_col, is_result_aliased=False) select_cols = [ - "CAST(NULL AS LONG) AS id", - "CAST(NULL AS LONG) AS award_id", - "awarding_agency.id AS awarding_agency_id", - f"""CASE WHEN month({parse_action_date_sql_snippet}) > 9 - THEN year({parse_action_date_sql_snippet}) + 1 - ELSE year({parse_action_date_sql_snippet}) - END AS fiscal_year""", - "funding_agency.id AS funding_agency_id", + sf.lit(None).cast("LONG").alias("id"), + sf.lit(None).cast("LONG").alias("award_id"), + sf.col("awarding_agency.id").alias("awarding_agency_id"), + sf.when(sf.month(parse_action_date_snippet) > sf.lit(9), sf.year(parse_action_date_snippet) + sf.lit(1)) + .otherwise(sf.year(parse_action_date_snippet)) + .alias("fiscal_year"), + sf.col("funding_agency.id").alias("funding_agency_id"), ] - select_cols.extend( - [ - # business_categories - f"get_business_categories_fabs({self.source_table}.business_types) AS business_categories", - # funding_amount - # In theory, this should be equal to - # CAST(COALESCE({bronze_table_name}.federal_action_obligation, 0) - # + COALESCE({bronze_table_name}.non_federal_funding_amount, 0) - # AS NUMERIC(23, 2)) - # However, for some historical records, this isn't true. 
- f""" - CAST({self.source_table}.total_funding_amount AS NUMERIC(23, 2)) AS funding_amount - """, - ] + get_business_categories_fabs_udf = sf.udf( + lambda x: get_business_categories_fabs(x), + ArrayType(StringType()), ) + select_cols = select_cols + [ + get_business_categories_fabs_udf(sf.col(f"{self.source_table}.business_types")).alias( + "business_categories" + ), + sf.expr(f"CAST({self.source_table}.total_funding_amount AS NUMERIC(23, 2)) AS funding_amount"), + ] for col in FABS_TO_NORMALIZED_COLUMN_INFO: select_cols.append(self.handle_column(col)) @@ -379,6 +386,7 @@ class FPDSNormalizedDeltaTransactionLoader(NormalizedMixin, AbstractDeltaTransac def __init__(self, spark, spark_s3_bucket: str) -> None: super().__init__(spark=spark, etl_level="normalized", spark_s3_bucket=spark_s3_bucket) self.id_col = "transaction_unique_id" + self.source_id_col = "detached_award_proc_unique" self.source_table = "raw.detached_award_procurement" self.to_normalized_col_info = DAP_TO_NORMALIZED_COLUMN_INFO self.normalization_type = "fpds" @@ -401,11 +409,12 @@ def select_columns(self) -> list[str]: fpds_business_category_columns = [ sf.col(col) for col in fpds_boolean_columns + ["contracting_officers_deter", "domestic_or_foreign_entity"] ] - named_struct_text = ", ".join([f"'{col}', {self.source_table}.{col}" for col in fpds_business_category_columns]) + get_business_categories_fpds_udf = sf.udf(lambda x: get_business_categories_fpds(x), ArrayType(StringType())) select_cols.extend( [ - # business_categories - sf.expr(f"get_business_categories_fpds(named_struct({named_struct_text})) AS business_categories"), + get_business_categories_fpds_udf(sf.struct(*fpds_business_category_columns)).alias( + "business_categories" + ), # type sf.expr( f""" diff --git a/usaspending_api/etl/transaction_delta_loaders/utils.py b/usaspending_api/etl/transaction_delta_loaders/utils.py new file mode 100644 index 0000000000..2517f8d9c2 --- /dev/null +++ 
b/usaspending_api/etl/transaction_delta_loaders/utils.py @@ -0,0 +1,25 @@ +from pyspark.sql import Column, functions as sf + + +def parse_date_column(column: str, table: str | None = None, is_casted_to_date: bool = True) -> Column: + column_ref = sf.col((f"{table}." if table else "") + column) + regexp_mmddYYYY = r"(\d{2})(?[-/])(\d{2})(\k)(\d{4})(.\d{2}:\d{2}:\d{2}([+-]\d{2}:\d{2})?)?" + regexp_YYYYmmdd = r"(\d{4})(?[-/]?)(\d{2})(\k)(\d{2})(.\d{2}:\d{2}:\d{2}([+-]\d{2}:\d{2})?)?" + mmddYYYY_fmt = sf.concat( + sf.regexp_extract(column_ref, regexp_mmddYYYY, 5), + sf.lit("-"), + sf.regexp_extract(column_ref, regexp_mmddYYYY, 1), + sf.lit("-"), + sf.regexp_extract(column_ref, regexp_mmddYYYY, 3), + ) + YYYYmmdd_fmt = sf.concat( + sf.regexp_extract(column_ref, regexp_YYYYmmdd, 1), + sf.lit("-"), + sf.regexp_extract(column_ref, regexp_YYYYmmdd, 3), + sf.lit("-"), + sf.regexp_extract(column_ref, regexp_YYYYmmdd, 5), + ) + if is_casted_to_date: + mmddYYYY_fmt = mmddYYYY_fmt.cast("date") + YYYYmmdd_fmt = YYYYmmdd_fmt.cast("date") + return sf.when(sf.regexp(column_ref, sf.lit(regexp_mmddYYYY)), mmddYYYY_fmt).otherwise(YYYYmmdd_fmt) diff --git a/usaspending_api/transactions/delta_models/published_fabs.py b/usaspending_api/transactions/delta_models/published_fabs.py index 6ad119c542..2b06b2e88d 100644 --- a/usaspending_api/transactions/delta_models/published_fabs.py +++ b/usaspending_api/transactions/delta_models/published_fabs.py @@ -125,14 +125,14 @@ **{k: v["delta"] for k, v in PUBLISHED_FABS_COLUMNS.items()}, **DELTA_ONLY_COLUMNS, } -PUBLISHED_FABS_POSTGRES_COLUMNS = { - k: v["postgres"] for k, v in PUBLISHED_FABS_COLUMNS.items() -} +PUBLISHED_FABS_POSTGRES_COLUMNS = {k: v["postgres"] for k, v in PUBLISHED_FABS_COLUMNS.items()} published_fabs_create_sql_string = rf""" CREATE OR REPLACE TABLE {{DESTINATION_TABLE}} ( {", ".join([f'{key} {val}' for key, val in PUBLISHED_FABS_DELTA_COLUMNS.items()])} ) USING DELTA + PARTITIONED BY (action_year, action_month) LOCATION 
's3a://{{SPARK_S3_BUCKET}}/{{DELTA_LAKE_S3_PATH}}/{{DESTINATION_DATABASE}}/{{DESTINATION_TABLE}}' + TBLPROPERTIES (delta.enableChangeDataFeed = true) """ diff --git a/usaspending_api/transactions/delta_models/transaction_fabs.py b/usaspending_api/transactions/delta_models/transaction_fabs.py index d6cc32481c..c674d628ae 100644 --- a/usaspending_api/transactions/delta_models/transaction_fabs.py +++ b/usaspending_api/transactions/delta_models/transaction_fabs.py @@ -1,3 +1,5 @@ +from pyspark.sql import functions as sf + from usaspending_api.common.data_classes import TransactionColumn TRANSACTION_FABS_COLUMN_INFO = [ @@ -51,20 +53,21 @@ "legal_entity_country_code", "legal_entity_country_code", "STRING", - scalar_transformation="CASE {input} \ - WHEN 'UNITED STATES' THEN 'USA' \ - ELSE {input} \ - END", + scalar_transformation=lambda col: sf.when(col == sf.lit("UNITED STATES"), sf.lit("USA")).otherwise(col), ), TransactionColumn( "legal_entity_country_name", "legal_entity_country_name", "STRING", - scalar_transformation="CASE \ - WHEN {input} = 'USA' THEN 'UNITED STATES' \ - WHEN COALESCE({input}, '') = '' AND legal_entity_country_code = 'UNITED STATES' THEN 'UNITED STATES' \ - ELSE {input} \ - END", + scalar_transformation=lambda col: ( + sf.when(col == sf.lit("USA"), sf.lit("UNITED STATES")) + .when( + (sf.coalesce(col, sf.lit("")) == sf.lit("")) + & (sf.col("legal_entity_country_code") == sf.lit("UNITED STATES")), + sf.lit("UNITED STATES"), + ) + .otherwise(col) + ), ), TransactionColumn("legal_entity_county_code", "legal_entity_county_code", "STRING"), TransactionColumn("legal_entity_county_name", "legal_entity_county_name", "STRING"), @@ -96,20 +99,21 @@ "place_of_perform_country_c", "place_of_perform_country_c", "STRING", - scalar_transformation="CASE {input} \ - WHEN 'UNITED STATES' THEN 'USA' \ - ELSE {input} \ - END", + scalar_transformation=lambda col: sf.when(col == sf.lit("UNITED STATES"), sf.lit("USA")).otherwise(col), ), TransactionColumn( 
"place_of_perform_country_n", "place_of_perform_country_n", "STRING", - scalar_transformation="CASE \ - WHEN {input} = 'USA' THEN 'UNITED STATES' \ - WHEN COALESCE({input}, '') = '' AND place_of_perform_country_c = 'UNITED STATES' THEN 'UNITED STATES' \ - ELSE {input} \ - END", + scalar_transformation=lambda col: ( + sf.when(col == sf.lit("USA"), sf.lit("UNITED STATES")) + .when( + (sf.coalesce(col, sf.lit("")) == sf.lit("")) + & (sf.col("place_of_perform_country_c") == sf.lit("UNITED STATES")), + sf.lit("UNITED STATES"), + ) + .otherwise(col) + ), ), TransactionColumn("place_of_perform_county_co", "place_of_perform_county_co", "STRING"), TransactionColumn("place_of_perform_county_na", "place_of_perform_county_na", "STRING"), @@ -137,6 +141,8 @@ TransactionColumn("updated_at", "updated_at", "TIMESTAMP"), TransactionColumn("uri", "uri", "STRING"), TransactionColumn("hash", "hash", "LONG"), + TransactionColumn("action_year", "action_year", "INTEGER"), + TransactionColumn("action_month", "action_month", "INTEGER"), ] TRANSACTION_FABS_COLUMNS = [col.dest_name for col in TRANSACTION_FABS_COLUMN_INFO] @@ -159,7 +165,9 @@ {", ".join([f'{col.dest_name} {col.delta_type}' for col in TRANSACTION_FABS_COLUMN_INFO])} ) USING DELTA + PARTITIONED BY (action_year, action_month) LOCATION 's3a://{{SPARK_S3_BUCKET}}/{{DELTA_LAKE_S3_PATH}}/{{DESTINATION_DATABASE}}/{{DESTINATION_TABLE}}' + TBLPROPERTIES (delta.enableChangeDataFeed = true) """ # Mapping from raw.published_fabs to int.transaction_normalized columns, where a simple mapping exists @@ -168,12 +176,12 @@ TransactionColumn("action_date", "action_date", "DATE", "parse_string_datetime_to_date"), TransactionColumn("action_type", "action_type", "STRING"), TransactionColumn("action_type_description", "action_type_description", "STRING"), - TransactionColumn("certified_date", "NULL", "DATE", "literal"), + TransactionColumn("certified_date", None, "DATE", "literal"), TransactionColumn("description", "award_description", "STRING"), 
TransactionColumn("face_value_loan_guarantee", "face_value_loan_guarantee", "NUMERIC(23,2)"), TransactionColumn("federal_action_obligation", "federal_action_obligation", "NUMERIC(23,2)"), TransactionColumn("indirect_federal_sharing", "indirect_federal_sharing", "NUMERIC(23, 2)", "cast"), - TransactionColumn("is_fpds", "FALSE", "BOOLEAN", "literal"), + TransactionColumn("is_fpds", False, "BOOLEAN", "literal"), TransactionColumn("last_modified_date", "modified_at", "DATE", "cast"), TransactionColumn("modification_number", "award_modification_amendme", "STRING"), TransactionColumn("non_federal_funding_amount", "non_federal_funding_amount", "NUMERIC(23,2)"), @@ -189,6 +197,8 @@ TransactionColumn("type", "assistance_type", "STRING"), TransactionColumn("type_description", "assistance_type_desc", "STRING"), TransactionColumn("unique_award_key", "unique_award_key", "STRING"), - TransactionColumn("usaspending_unique_transaction_id", "NULL", "STRING", "literal"), + TransactionColumn("usaspending_unique_transaction_id", None, "STRING", "literal"), TransactionColumn("hash", "hash", "LONG"), + TransactionColumn("action_year", "action_year", "INTEGER"), + TransactionColumn("action_month", "action_month", "INTEGER"), ] diff --git a/usaspending_api/transactions/delta_models/transaction_fpds.py b/usaspending_api/transactions/delta_models/transaction_fpds.py index e9452d6aa2..e6063f9792 100644 --- a/usaspending_api/transactions/delta_models/transaction_fpds.py +++ b/usaspending_api/transactions/delta_models/transaction_fpds.py @@ -291,7 +291,7 @@ TransactionColumn("referenced_mult_or_single", "referenced_mult_or_single", "STRING"), # The referenced_multi_or_single field does not appear in the django model and may have been created inadvertently # in the Delta model previously. Since it is always NULL, it is a candidate for elimination. 
- TransactionColumn("referenced_multi_or_single", "NULL", "STRING", "literal"), + TransactionColumn("referenced_multi_or_single", None, "STRING", "literal"), TransactionColumn("research", "research", "STRING"), TransactionColumn("research_description", "research_description", "STRING"), TransactionColumn("sam_exception", "sam_exception", "STRING"), @@ -405,17 +405,17 @@ TransactionColumn("action_date", "action_date", "DATE", "parse_string_datetime_to_date"), TransactionColumn("action_type", "action_type", "STRING"), TransactionColumn("action_type_description", "action_type_description", "STRING"), - TransactionColumn("certified_date", "NULL", "DATE", "literal"), + TransactionColumn("certified_date", None, "DATE", "literal"), TransactionColumn("description", "award_description", "STRING"), - TransactionColumn("face_value_loan_guarantee", "NULL", "NUMERIC(23, 2)", "literal"), + TransactionColumn("face_value_loan_guarantee", None, "NUMERIC(23, 2)", "literal"), TransactionColumn("federal_action_obligation", "federal_action_obligation", "NUMERIC(23,2)"), TransactionColumn("funding_amount", "NULL", "NUMERIC(23, 2)", "literal"), - TransactionColumn("indirect_federal_sharing", "NULL", "NUMERIC(23, 2)", "literal"), - TransactionColumn("is_fpds", "TRUE", "BOOLEAN", "literal"), + TransactionColumn("indirect_federal_sharing", None, "NUMERIC(23, 2)", "literal"), + TransactionColumn("is_fpds", True, "BOOLEAN", "literal"), TransactionColumn("last_modified_date", "last_modified", "TIMESTAMP", "cast"), TransactionColumn("modification_number", "award_modification_amendme", "STRING"), - TransactionColumn("non_federal_funding_amount", "NULL", "NUMERIC(23, 2)", "literal"), - TransactionColumn("original_loan_subsidy_cost", "NULL", "NUMERIC(23, 2)", "literal"), + TransactionColumn("non_federal_funding_amount", None, "NUMERIC(23, 2)", "literal"), + TransactionColumn("original_loan_subsidy_cost", None, "NUMERIC(23, 2)", "literal"), # All period_of_performance_* fields seen as: YYYY-MM-DD 
00:00:00, so cast works # BUT it's still just a string and could morph, so defensively smart-date-parsing the string TransactionColumn( @@ -432,7 +432,7 @@ ), TransactionColumn("transaction_unique_id", "detached_award_proc_unique", "STRING"), TransactionColumn("unique_award_key", "unique_award_key", "STRING"), - TransactionColumn("usaspending_unique_transaction_id", "NULL", "STRING", "literal"), + TransactionColumn("usaspending_unique_transaction_id", None, "STRING", "literal"), TransactionColumn("hash", "hash", "LONG"), TransactionColumn("action_year", "action_year", "INTEGER"), TransactionColumn("action_month", "action_month", "INTEGER"), From 19539c3ccfc7fbc1e5fe8d530bcd5718357a44f8 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Mon, 23 Feb 2026 10:32:00 -0600 Subject: [PATCH 51/59] [DEV-14453] - Adding deletes to transaction normalized mixin --- .../etl/transaction_delta_loaders/loaders.py | 32 ++++++++++++------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/usaspending_api/etl/transaction_delta_loaders/loaders.py b/usaspending_api/etl/transaction_delta_loaders/loaders.py index 5f2370cf27..73a55df8f6 100644 --- a/usaspending_api/etl/transaction_delta_loaders/loaders.py +++ b/usaspending_api/etl/transaction_delta_loaders/loaders.py @@ -113,9 +113,9 @@ def to_insert_df(self) -> DataFrame: .select(self.select_columns) ) - def to_delete_df(self) -> DataFrame: - version_window = Window.partitionBy(self.id_col, "hash", "_commit_version") - transaction_window = Window.partitionBy(self.id_col, "hash") + def to_delete_df(self, id_col) -> DataFrame: + version_window = Window.partitionBy(id_col, "hash", "_commit_version") + transaction_window = Window.partitionBy(id_col, "hash") return ( self.spark.read.format("delta") .option("readChangeFeed", "true") @@ -128,7 +128,7 @@ def to_delete_df(self) -> DataFrame: & (sf.col("_commit_version") == sf.col("latest_version")) & ~sf.col("has_insert") ) - .select(self.id_col, "hash", "action_year", 
"action_month") + .select(id_col, "hash", "action_year", "action_month") ) def transaction_merge(self) -> None: @@ -151,7 +151,7 @@ def transaction_merge(self) -> None: ) ( target.merge( - self.to_delete_df().alias("s"), + self.to_delete_df(self.id_col).alias("s"), " AND ".join([id_condition, hash_condition, partition_pruning_conditions]), ) .whenMatchedDelete() @@ -181,6 +181,7 @@ class NormalizedMixin: spark: SparkSession handle_column: Callable + to_delete_df: Callable source_table: str id_col: str source_id_col: str @@ -190,7 +191,7 @@ class NormalizedMixin: to_normalized_col_info: list[TransactionColumn] normalization_type: Literal["fabs", "fpds"] - def source_subquery_df(self) -> DataFrame: + def to_insert_df(self) -> DataFrame: funding_subtier_agency = self.spark.table("global_temp.subtier_agency").alias("funding_subtier_agency") funding_agency = self.spark.table("global_temp.agency").alias("funding_agency") awarding_subtier_agency = ( @@ -259,19 +260,26 @@ def transaction_merge(self) -> None: target = DeltaTable.forName(self.spark, "int.transaction_normalized").alias("t") id_condition = "t.transaction_unique_id = s.transaction_unique_id" - row_not_updated_condition = "t.hash == s.hash" + hash_condition = "t.hash == s.hash" type_partition_condition = f"{'NOT' if self.normalization_type == 'fabs' else ''} t.is_fpds" - date_partition_conditions = "t.action_year == s.action_year AND t.action_month == s.action_month" + partition_pruning_conditions = "t.action_year == s.action_year AND t.action_month == s.action_month" ( target.merge( - self.source_subquery_df().alias("s"), - " AND ".join( - [id_condition, row_not_updated_condition, type_partition_condition, date_partition_conditions] - ), + self.to_insert_df().alias("s"), + " AND ".join([id_condition, hash_condition, type_partition_condition, partition_pruning_conditions]), ) .whenNotMatchedInsert(values=dict(zip(insert_col_names, insert_values))) .execute() ) + delete_id_condition = f"t.transaction_unique_id = 
s.{self.source_id_col}" + ( + target.merge( + self.to_delete_df(self.source_id_col).alias("s"), + " AND ".join([delete_id_condition, hash_condition, partition_pruning_conditions]), + ) + .whenMatchedDelete() + .execute() + ) def populate_transaction_normalized_ids(self) -> None: target = DeltaTable.forName(self.spark, "int.transaction_normalized").alias("t") From 679cfd878ac6d264b17bc606e17973e7ce59b50e Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Mon, 23 Feb 2026 15:58:55 -0600 Subject: [PATCH 52/59] [DEV-14453] - Update date parse util --- usaspending_api/etl/transaction_delta_loaders/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/usaspending_api/etl/transaction_delta_loaders/utils.py b/usaspending_api/etl/transaction_delta_loaders/utils.py index 2517f8d9c2..23849b92d0 100644 --- a/usaspending_api/etl/transaction_delta_loaders/utils.py +++ b/usaspending_api/etl/transaction_delta_loaders/utils.py @@ -22,4 +22,4 @@ def parse_date_column(column: str, table: str | None = None, is_casted_to_date: if is_casted_to_date: mmddYYYY_fmt = mmddYYYY_fmt.cast("date") YYYYmmdd_fmt = YYYYmmdd_fmt.cast("date") - return sf.when(sf.regexp(column_ref, sf.lit(regexp_mmddYYYY)), mmddYYYY_fmt).otherwise(YYYYmmdd_fmt) + return sf.when(sf.regexp_extract(column_ref, sf.lit(regexp_mmddYYYY)), mmddYYYY_fmt).otherwise(YYYYmmdd_fmt) From e67750734f8d2b260440c2c1426f70859d64279e Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Wed, 25 Feb 2026 12:40:28 -0600 Subject: [PATCH 53/59] [DEV-14453] - Update date parse util --- usaspending_api/etl/transaction_delta_loaders/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/usaspending_api/etl/transaction_delta_loaders/utils.py b/usaspending_api/etl/transaction_delta_loaders/utils.py index 23849b92d0..2fc2273b68 100644 --- a/usaspending_api/etl/transaction_delta_loaders/utils.py +++ b/usaspending_api/etl/transaction_delta_loaders/utils.py @@ -22,4 +22,4 @@ def parse_date_column(column: str, 
table: str | None = None, is_casted_to_date: if is_casted_to_date: mmddYYYY_fmt = mmddYYYY_fmt.cast("date") YYYYmmdd_fmt = YYYYmmdd_fmt.cast("date") - return sf.when(sf.regexp_extract(column_ref, sf.lit(regexp_mmddYYYY)), mmddYYYY_fmt).otherwise(YYYYmmdd_fmt) + return sf.when(sf.rlike(column_ref, sf.lit(regexp_mmddYYYY)), mmddYYYY_fmt).otherwise(YYYYmmdd_fmt) From 33534d78a024f734ee801b81918c526e636015cd Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Wed, 25 Feb 2026 14:17:48 -0600 Subject: [PATCH 54/59] [DEV-14453] - Update save mode for detached award procurement --- usaspending_api/etl/management/commands/load_table_to_delta.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/usaspending_api/etl/management/commands/load_table_to_delta.py b/usaspending_api/etl/management/commands/load_table_to_delta.py index 8b9397a08f..35ba00d1f3 100644 --- a/usaspending_api/etl/management/commands/load_table_to_delta.py +++ b/usaspending_api/etl/management/commands/load_table_to_delta.py @@ -99,8 +99,6 @@ "action_year": lambda: sf.year(sf.to_date("action_date")), "action_month": lambda: sf.month(sf.to_date("action_date")), }, - save_mode="merge", - merge_condition="s.detached_award_procurement_id = t.detached_award_procurement_id and s.hash = t.hash", ), "financial_accounts_by_awards": TableSpec( model=FinancialAccountsByAwards, From 4d7678ac7d4a7e43c9cf185784a803b21c696a80 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Wed, 25 Feb 2026 14:43:29 -0600 Subject: [PATCH 55/59] [DEV-14453] - Update date parse util --- usaspending_api/etl/transaction_delta_loaders/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/usaspending_api/etl/transaction_delta_loaders/utils.py b/usaspending_api/etl/transaction_delta_loaders/utils.py index 2fc2273b68..fbc12ab88c 100644 --- a/usaspending_api/etl/transaction_delta_loaders/utils.py +++ b/usaspending_api/etl/transaction_delta_loaders/utils.py @@ -22,4 +22,6 @@ def parse_date_column(column: str, table: str | 
None = None, is_casted_to_date: if is_casted_to_date: mmddYYYY_fmt = mmddYYYY_fmt.cast("date") YYYYmmdd_fmt = YYYYmmdd_fmt.cast("date") - return sf.when(sf.rlike(column_ref, sf.lit(regexp_mmddYYYY)), mmddYYYY_fmt).otherwise(YYYYmmdd_fmt) + return sf.when(sf.regexp_extract(column_ref, regexp_mmddYYYY, 0) != sf.lit(""), mmddYYYY_fmt).otherwise( + YYYYmmdd_fmt + ) From b3877e07f275b2fa6c6cd4997a74831780789241 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Thu, 26 Feb 2026 14:26:59 -0600 Subject: [PATCH 56/59] [DEV-14453] - Update column handling for pyspark 3.4 --- usaspending_api/etl/transaction_delta_loaders/loaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/usaspending_api/etl/transaction_delta_loaders/loaders.py b/usaspending_api/etl/transaction_delta_loaders/loaders.py index 73a55df8f6..fee3e08faa 100644 --- a/usaspending_api/etl/transaction_delta_loaders/loaders.py +++ b/usaspending_api/etl/transaction_delta_loaders/loaders.py @@ -78,7 +78,7 @@ def handle_column(self, col: TransactionColumn, is_result_aliased=True) -> Colum retval = parse_date_column(col.source, table=self.source_table, is_casted_to_date=False) elif col.delta_type.upper() == "STRING": # Capitalize and remove leading & trailing whitespace from all string values - retval = sf.ucase(sf.trim(sf.col(f"{self.source_table}.{col.source}"))) + retval = sf.upper(sf.trim(sf.col(f"{self.source_table}.{col.source}"))) elif col.delta_type.upper() == "BOOLEAN" and not col.handling == "leave_null": # Unless specified, convert any nulls to false for boolean columns retval = sf.coalesce(sf.col(f"{self.source_table}.{col.source}"), sf.lit(False)) From c359d4ab2d2e0bd1e23f3c87c99e872ae9319013 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Mon, 2 Mar 2026 16:51:17 -0600 Subject: [PATCH 57/59] [DEV-14453] - Adding more logging --- .../etl/transaction_delta_loaders/loaders.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git 
a/usaspending_api/etl/transaction_delta_loaders/loaders.py b/usaspending_api/etl/transaction_delta_loaders/loaders.py index fee3e08faa..24a662cb4e 100644 --- a/usaspending_api/etl/transaction_delta_loaders/loaders.py +++ b/usaspending_api/etl/transaction_delta_loaders/loaders.py @@ -1,6 +1,7 @@ import logging from abc import ABC from datetime import datetime, timezone +from time import perf_counter from typing import Callable, Literal from delta import DeltaTable @@ -336,16 +337,30 @@ def populate_award_ids(self) -> None: w = Window.orderBy(needs_ids.unique_award_key) with_ids = needs_ids.withColumn("award_id", (max_id + sf.row_number().over(w)).cast("LONG")).alias("s") ( - target.merge(with_ids, f"t.unique_award_key = s.unique_award_key") + target.merge(with_ids, "t.unique_award_key = s.unique_award_key") .whenMatchedUpdate(set={"t.award_id": "s.award_id"}) .execute() ) def load_transactions(self) -> None: + start = perf_counter() + logger.info("Loading transactions...") super().load_transactions() + s1 = perf_counter() + logger.info(f"Loading transactions took {s1 - start:.2f} seconds.") + logger.info(f"populating award ids...") self.populate_award_ids() + s2 = perf_counter() + logger.info(f"Populating awards took {s2 - s1:.2f} seconds.") + logger.info("populating transaction normalized ids...") self.populate_transaction_normalized_ids() + s3 = perf_counter() + logger.info(f"Populating normalized ids took {s3 - s2:.2f} seconds.") + logger.info("linking transactions to normalized...") self.link_transactions_to_normalized() + s4 = perf_counter() + logger.info(f"Linking took {s4 - s3:.2f} seconds.") + logger.info(f"total time {s4 - start:.2f} seconds.") class FABSNormalizedDeltaTransactionLoader(NormalizedMixin, AbstractDeltaTransactionLoader): From 7f0724219af32fea6d4910411fb3b2e36f36c72e Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Thu, 19 Mar 2026 17:17:54 -0500 Subject: [PATCH 58/59] [DEV-14453] - Update insert and delete dfs to just get new rows --- 
.../commands/load_table_to_delta.py | 4 ++ .../load_transaction_fabs_in_delta.py | 13 +++- .../load_transaction_fpds_in_delta.py | 13 +++- .../commands/load_transaction_normalized.py | 19 +++++- .../etl/transaction_delta_loaders/loaders.py | 63 ++++++++++++------- 5 files changed, 84 insertions(+), 28 deletions(-) diff --git a/usaspending_api/etl/management/commands/load_table_to_delta.py b/usaspending_api/etl/management/commands/load_table_to_delta.py index 35ba00d1f3..c1c611a4cf 100644 --- a/usaspending_api/etl/management/commands/load_table_to_delta.py +++ b/usaspending_api/etl/management/commands/load_table_to_delta.py @@ -86,6 +86,8 @@ ), "detached_award_procurement": TableSpec( model=SourceProcurementTransaction, + save_mode="merge", + merge_condition="t.detached_award_procurement_id == s.detached_award_procurement_id and t.hash == s.hash", source_table="source_procurement_transaction", source_database="raw", destination_database="raw", @@ -126,6 +128,8 @@ "published_fabs": TableSpec( model=SourceAssistanceTransaction, source_table="source_assistance_transaction", + save_mode="merge", + merge_condition="t.published_fabs_id == s.published_fabs_id and t.hash == s.hash", source_database="raw", destination_database="raw", partition_column="published_fabs_id", diff --git a/usaspending_api/etl/management/commands/load_transaction_fabs_in_delta.py b/usaspending_api/etl/management/commands/load_transaction_fabs_in_delta.py index 9dbb3d64ae..2531592a5b 100644 --- a/usaspending_api/etl/management/commands/load_transaction_fabs_in_delta.py +++ b/usaspending_api/etl/management/commands/load_transaction_fabs_in_delta.py @@ -17,6 +17,13 @@ class Command(BaseCommand): @staticmethod def add_arguments(parser): + parser.add_argument( + "--alt-last-load-date", + type=str, + required=False, + default=None, + help="Alternative last load datetime in %Y-%m-%d %H:%M:%S (e.g. 
2026-03-19 14:00:00) format.", + ) parser.add_argument( "--spark-s3-bucket", type=str, @@ -28,5 +35,9 @@ def add_arguments(parser): @staticmethod def handle(*args, **options): with prepare_spark() as spark: - loader = FABSDeltaTransactionLoader(spark=spark, spark_s3_bucket=options["spark_s3_bucket"]) + loader = FABSDeltaTransactionLoader( + spark=spark, + alt_last_load_date=options["alt_last_load_date"], + spark_s3_bucket=options["spark_s3_bucket"], + ) loader.load_transactions() diff --git a/usaspending_api/etl/management/commands/load_transaction_fpds_in_delta.py b/usaspending_api/etl/management/commands/load_transaction_fpds_in_delta.py index 783ab20ca1..ed8665ab9b 100644 --- a/usaspending_api/etl/management/commands/load_transaction_fpds_in_delta.py +++ b/usaspending_api/etl/management/commands/load_transaction_fpds_in_delta.py @@ -17,6 +17,13 @@ class Command(BaseCommand): @staticmethod def add_arguments(parser): + parser.add_argument( + "--alt-last-load-date", + type=str, + required=False, + default=None, + help="Alternative last load datetime in %Y-%m-%d %H:%M:%S (e.g. 
2026-03-19 14:00:00) format.", + ) parser.add_argument( "--spark-s3-bucket", type=str, @@ -28,5 +35,9 @@ def add_arguments(parser): @staticmethod def handle(*args, **options): with prepare_spark() as spark: - loader = FPDSDeltaTransactionLoader(spark=spark, spark_s3_bucket=options["spark_s3_bucket"]) + loader = FPDSDeltaTransactionLoader( + spark=spark, + alt_last_load_date=options["alt_last_load_date"], + spark_s3_bucket=options["spark_s3_bucket"], + ) loader.load_transactions() diff --git a/usaspending_api/etl/management/commands/load_transaction_normalized.py b/usaspending_api/etl/management/commands/load_transaction_normalized.py index dd89e851af..590a30d22d 100644 --- a/usaspending_api/etl/management/commands/load_transaction_normalized.py +++ b/usaspending_api/etl/management/commands/load_transaction_normalized.py @@ -19,6 +19,13 @@ class Command(BaseCommand): @staticmethod def add_arguments(parser): + parser.add_argument( + "--alt-last-load-date", + type=str, + required=False, + default=None, + help="Alternative last load datetime in %Y-%m-%d %H:%M:%S (e.g. 
2026-03-19 14:00:00) format.", + ) parser.add_argument( "--spark-s3-bucket", type=str, @@ -30,7 +37,15 @@ def add_arguments(parser): @staticmethod def handle(*args, **options): with prepare_spark() as spark: - fabs_loader = FABSNormalizedDeltaTransactionLoader(spark=spark, spark_s3_bucket=options["spark_s3_bucket"]) - fpds_loader = FPDSNormalizedDeltaTransactionLoader(spark=spark, spark_s3_bucket=options["spark_s3_bucket"]) + fabs_loader = FABSNormalizedDeltaTransactionLoader( + spark=spark, + alt_last_load_date=options["alt_last_load_date"], + spark_s3_bucket=options["spark_s3_bucket"], + ) + fpds_loader = FPDSNormalizedDeltaTransactionLoader( + spark=spark, + alt_last_load_date=options["alt_last_load_date"], + spark_s3_bucket=options["spark_s3_bucket"], + ) fabs_loader.load_transactions() fpds_loader.load_transactions() diff --git a/usaspending_api/etl/transaction_delta_loaders/loaders.py b/usaspending_api/etl/transaction_delta_loaders/loaders.py index 24a662cb4e..0b66394fea 100644 --- a/usaspending_api/etl/transaction_delta_loaders/loaders.py +++ b/usaspending_api/etl/transaction_delta_loaders/loaders.py @@ -46,9 +46,18 @@ class AbstractDeltaTransactionLoader(ABC): col_info = list[TransactionColumn] last_etl_load_date: datetime - def __init__(self, spark, etl_level: Literal["fabs", "fpds", "normalized"], spark_s3_bucket: str) -> None: + def __init__( + self, + spark, + etl_level: Literal["fabs", "fpds", "normalized"], + alt_last_load_date: str | None, + spark_s3_bucket: str, + ) -> None: self.etl_level = etl_level - self.last_etl_load_date = get_last_load_date(f"transaction_{self.etl_level}") + if alt_last_load_date is not None: + self.last_etl_load_date = datetime.strptime(alt_last_load_date, "%Y-%m-%d %H:%M:%S") + else: + self.last_etl_load_date = get_last_load_date(f"transaction_{self.etl_level}") self.spark_s3_bucket: spark_s3_bucket self.spark = spark @@ -100,37 +109,37 @@ def select_columns(self) -> list[Column]: ] def to_insert_df(self) -> DataFrame: - 
window_spec = Window.partitionBy(self.id_col) - return ( + df = ( self.spark.read.format("delta") .option("readChangeFeed", "true") - .option("startingTimestamp", self.last_etl_load_date.strftime("%Y-%m-%d %H:%M:%S")) + .option("startingVersion", 0) .table(self.source_table) - .withColumn("latest_version", sf.max("_commit_version").over(window_spec)) .filter( sf.col("_change_type").isin(["insert", "update_postimage"]) - & (sf.col("_commit_version") == sf.col("latest_version")) + & (sf.col("_commit_timestamp") > self.last_etl_load_date) ) .select(self.select_columns) ) + logger.info(f"Inserting {df.count()} rows.") + return df def to_delete_df(self, id_col) -> DataFrame: version_window = Window.partitionBy(id_col, "hash", "_commit_version") - transaction_window = Window.partitionBy(id_col, "hash") - return ( + df = ( self.spark.read.format("delta") .option("readChangeFeed", "true") - .option("startingTimestamp", self.last_etl_load_date.strftime("%Y-%m-%d %H:%M:%S")) + .option("startingVersion", 0) .table(self.source_table) - .withColumn("latest_version", sf.max(sf.col("_commit_version")).over(transaction_window)) .withColumn("has_insert", sf.max(sf.col("_change_type") == "insert").over(version_window)) .filter( (sf.col("_change_type") == sf.lit("delete")) - & (sf.col("_commit_version") == sf.col("latest_version")) + & (sf.col("_commit_timestamp") > self.last_etl_load_date) & ~sf.col("has_insert") ) .select(id_col, "hash", "action_year", "action_month") ) + logger.info(f"Deleting {df.count()} rows.") + return df def transaction_merge(self) -> None: source = self.to_insert_df().alias("s") @@ -162,8 +171,10 @@ def transaction_merge(self) -> None: class FPDSDeltaTransactionLoader(AbstractDeltaTransactionLoader): - def __init__(self, spark: SparkSession, spark_s3_bucket: str) -> None: - super().__init__(spark=spark, etl_level="fpds", spark_s3_bucket=spark_s3_bucket) + def __init__(self, spark: SparkSession, alt_last_load_date: str | None, spark_s3_bucket: str) -> None: + 
super().__init__( + spark=spark, etl_level="fpds", alt_last_load_date=alt_last_load_date, spark_s3_bucket=spark_s3_bucket + ) self.id_col = "detached_award_proc_unique" self.source_table = "raw.detached_award_procurement" self.col_info = TRANSACTION_FPDS_COLUMN_INFO @@ -171,8 +182,10 @@ def __init__(self, spark: SparkSession, spark_s3_bucket: str) -> None: class FABSDeltaTransactionLoader(AbstractDeltaTransactionLoader): - def __init__(self, spark: SparkSession, spark_s3_bucket: str) -> None: - super().__init__(spark=spark, etl_level="fabs", spark_s3_bucket=spark_s3_bucket) + def __init__(self, spark: SparkSession, alt_last_load_date: str | None, spark_s3_bucket: str) -> None: + super().__init__( + spark=spark, etl_level="fabs", alt_last_load_date=alt_last_load_date, spark_s3_bucket=spark_s3_bucket + ) self.id_col = "afa_generated_unique" self.source_table = "raw.published_fabs" self.col_info = TRANSACTION_FABS_COLUMN_INFO @@ -201,16 +214,14 @@ def to_insert_df(self) -> DataFrame: .alias("awarding_subtier_agency") ) awarding_agency = self.spark.table("global_temp.agency").alias("awarding_agency") - window_spec = Window.partitionBy(self.source_id_col) df = ( self.spark.read.format("delta") .option("readChangeFeed", "true") - .option("startingTimestamp", self.last_etl_load_date.strftime("%Y-%m-%d %H:%M:%S")) + .option("startingVersion", 0) .table(self.source_table) - .withColumn("latest_version", sf.max("_commit_version").over(window_spec)) .filter( sf.col("_change_type").isin(["insert", "update_postimage"]) - & (sf.col("_commit_version") == sf.col("latest_version")) + & (sf.col("_commit_timestamp") > self.last_etl_load_date) ) ) result = ( @@ -365,8 +376,10 @@ def load_transactions(self) -> None: class FABSNormalizedDeltaTransactionLoader(NormalizedMixin, AbstractDeltaTransactionLoader): - def __init__(self, spark: SparkSession, spark_s3_bucket: str) -> None: - super().__init__(spark=spark, etl_level="normalized", spark_s3_bucket=spark_s3_bucket) + def 
__init__(self, spark: SparkSession, alt_last_load_date: str | None, spark_s3_bucket: str) -> None: + super().__init__( + spark=spark, etl_level="normalized", alt_last_load_date=alt_last_load_date, spark_s3_bucket=spark_s3_bucket + ) self.id_col = "transaction_unique_id" self.source_id_col = "afa_generated_unique" self.source_table = "raw.published_fabs" @@ -406,8 +419,10 @@ def select_columns(self) -> list[str]: class FPDSNormalizedDeltaTransactionLoader(NormalizedMixin, AbstractDeltaTransactionLoader): - def __init__(self, spark, spark_s3_bucket: str) -> None: - super().__init__(spark=spark, etl_level="normalized", spark_s3_bucket=spark_s3_bucket) + def __init__(self, spark, alt_last_load_date: str | None, spark_s3_bucket: str) -> None: + super().__init__( + spark=spark, etl_level="normalized", alt_last_load_date=alt_last_load_date, spark_s3_bucket=spark_s3_bucket + ) self.id_col = "transaction_unique_id" self.source_id_col = "detached_award_proc_unique" self.source_table = "raw.detached_award_procurement" From be1ff6d190140a0797f62b40cafb7ce86eac779a Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Fri, 20 Mar 2026 16:00:39 -0500 Subject: [PATCH 59/59] [DEV-14453] - removing counts to speed up job --- usaspending_api/etl/transaction_delta_loaders/loaders.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/usaspending_api/etl/transaction_delta_loaders/loaders.py b/usaspending_api/etl/transaction_delta_loaders/loaders.py index 0b66394fea..d19978876d 100644 --- a/usaspending_api/etl/transaction_delta_loaders/loaders.py +++ b/usaspending_api/etl/transaction_delta_loaders/loaders.py @@ -120,7 +120,6 @@ def to_insert_df(self) -> DataFrame: ) .select(self.select_columns) ) - logger.info(f"Inserting {df.count()} rows.") return df def to_delete_df(self, id_col) -> DataFrame: @@ -138,12 +137,10 @@ def to_delete_df(self, id_col) -> DataFrame: ) .select(id_col, "hash", "action_year", "action_month") ) - logger.info(f"Deleting {df.count()} rows.") return df def 
transaction_merge(self) -> None: source = self.to_insert_df().alias("s") - logger.info(f"number of rows: {source.count()}") target = DeltaTable.forName(self.spark, f"int.transaction_{self.etl_level}").alias("t") id_condition = f"t.{self.id_col} == s.{self.id_col}" hash_condition = "t.hash == s.hash"