diff --git a/.github/workflows/code-style-checks.yaml b/.github/workflows/code-style-checks.yaml index 488458f2ec..41963673e1 100644 --- a/.github/workflows/code-style-checks.yaml +++ b/.github/workflows/code-style-checks.yaml @@ -31,6 +31,18 @@ jobs: - name: Checkout Source Repository uses: actions/checkout@v4 + - name: Get Changed Python Files + id: changed-python-files + uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 + with: + files: | + **.py + separator: " " + + - name: Check If `changed-python-files` Is An Empty String + if: steps.changed-python-files.outputs.all_changed_files == '' + run: echo "No changed files detected" + - name: Set Combined ENV run: | echo "DATA_BROKER_DATABASE_URL=postgres://$BROKER_DB_USER:$BROKER_DB_PASSWORD@$BROKER_DB_HOST:$BROKER_DB_PORT/$BROKER_DB_NAME" >> $GITHUB_ENV @@ -52,11 +64,10 @@ jobs: - name: Init Python Environment uses: ./.github/actions/init-python-environment - - name: Run Flake8 - run: flake8 - - - name: Run Black - run: black --check --diff . + # changed-python-files could be an empty string, which would cause `ruff check` to be run against the entire project. + - name: Run Ruff Linter + if: steps.changed-python-files.outputs.all_changed_files != '' + run: ruff check ${{ steps.changed-python-files.outputs.all_changed_files }} - name: Run Check For Endpoint Documentation run: python manage.py check_for_endpoint_documentation diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7ad8d1eb64..850453ca2c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,16 +3,9 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.5.0 hooks: - - id: debug-statements -- repo: https://github.com/pycqa/flake8.git - rev: 7.1.0 + - id: debug-statements +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.14.14 hooks: - - id: flake8 - language_version: python3.10.12 -- repo: https://github.com/psf/black - rev: 24.10.0 - hooks: - - id: black - language_version: python3.10.12 - additional_dependencies: - - "click==8.0.4" + - id: ruff-check + types_or: [ python, pyi ] # avoid linting other Python file types like Jupyter notebooks
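Editor's note: the new `Run Ruff Linter` step above is guarded with `!= ''` because, as the workflow comment says, `ruff check` with no path arguments falls back to linting the entire project. A minimal Python sketch of that guard, assuming the space-separated file list produced by `separator: " "` and that `ruff` is installed; the `lint_changed_files` helper is hypothetical, not part of the repo:

```python
import shlex
import subprocess


def lint_changed_files(all_changed_files: str) -> None:
    """Hypothetical mirror of the workflow's guard around `ruff check`."""
    if not all_changed_files.strip():
        # Mirrors the `== ''` step: bail out instead of linting the whole project.
        print("No changed files detected")
        return
    # shlex.split handles the space-separated list emitted by changed-files.
    subprocess.run(["ruff", "check", *shlex.split(all_changed_files)], check=True)


if __name__ == "__main__":
    lint_changed_files("")  # prints "No changed files detected"
```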
diff --git a/pyproject.toml b/pyproject.toml index fcc33952fc..2e5ff823b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ readme = 'README.md' license = "CC0-1.0" license-files = ["LICENSE"] requires-python = '>=3.10' -classifiers=[ +classifiers = [ "Development Status :: 5 - Production/Stable", "Programming Language :: Python", "Programming Language :: Python :: 3", @@ -97,11 +97,9 @@ dependencies = [ [project.optional-dependencies] dev = [ - "black==24.10.0", "click==8.1.7", "docker==7.0.0", "dredd-hooks==0.2.0", - "flake8==7.1.0", "importlib-metadata==8.5.0", "mock==5.1.*", "model-bakery==1.17.*", @@ -112,6 +110,7 @@ dev = [ "pytest-django==4.8.*", "pytest-pretty==1.2.*", "pytest-xdist==3.5.*", + "ruff==0.14.14", ] spark = [ "delta-spark==3.2.*", @@ -138,14 +137,13 @@ DJANGO_SETTINGS_MODULE = "usaspending_api.settings" addopts = "--cov=usaspending_api" markers = [ "signal_handling: Mark all tests that import the signal library and invoke signals. This MUST be done on the main thread, and can cause errors if pytest-xdist subordinates parallel test sessions to background threads.", - # These are "auto" marked based on fixture usage. See conftest.py pytest_collection_modifyitems "spark: Mark all tests using the spark fixture. Can be selected with -m spark or deselected with -m (not spark)", "database: Mark all integration tests using a database. Can be selected with -m database or deselected with -m (not database)", "elasticsearch: Mark all integration tests using Elasticsearch. Can be selected with -m elasticsearch or deselected with -m (not elasticsearch)", ] pythonpath = [ - "." + "." ] [tool.coverage.run] @@ -163,7 +161,68 @@ exclude_lines = [ "pragma: no cover" ] -[tool.black] -line-length = 120 -target-version = ['py310'] -exclude = '/(\.git|\.venv|venv|migrations)/' +[tool.ruff.lint] +preview = true # enable new rules +exclude = [ + '.git', + '.venv', + 'venv', + '**/migrations/**', + 'build', + 'usaspending_api.egg-info' +] + +select = [ + "PLR0913", # max arguments in function + "PLR0904", # max number of public methods + "PLR0911", # max number of return statements + "PLR0916", # max number of boolean expressions + "PLR0915", # max number of statements in a function + "PLR0912", # max number of logical branches in a function + "PLR1702", # max number of nested blocks + "C901", # McCabe cyclomatic complexity (functions) + "I001", # unsorted imports + "B", # flake8-bugbear + "E", # pycodestyle errors + "F", # pyflakes + "W", # pycodestyle warnings + "ANN001", # missing type annotation for function argument + "ANN201", # missing return type annotation for public function or method + "ANN202", # missing return type annotation for private function or method +] +ignore = [ + "E203", # whitespace before punctuation +] + +pylint.max-args = 6 +pylint.max-public-methods = 20 +pylint.max-returns = 3 +pylint.max-bool-expr = 8 +pylint.max-statements = 45 +pylint.max-branches = 10 +pylint.max-nested-blocks = 5 +mccabe.max-complexity = 15 +pycodestyle.max-line-length = 120 + +[tool.ruff.lint.per-file-ignores] +"**/tests/**/test*.py" = [ + "ANN001", # missing-type-function-argument; avoid conflict with fixtures + "ANN201", # missing-return-type-undocumented-public-function; avoid conflict with test case return values + "ANN202", # missing-return-type-undocumented-private-function; avoid conflict with test case return values + "PLR0913", # too-many-arguments; avoid conflict with too many fixtures + "PLR0915", # too-many-statements; avoid conflict with long fixtures +] +"**/conftest*.py" = [ + "ANN001", # missing-type-function-argument; avoid conflict with fixtures + "ANN201", # missing-return-type-undocumented-public-function; avoid conflict with test case return values + "ANN202", # missing-return-type-undocumented-private-function; avoid conflict with test case return values + "PLR0913", # too-many-arguments; avoid conflict with too many fixtures + "PLR0915", # too-many-statements; avoid conflict with long fixtures +] +"**/tests/**/data/**.py" = [ + "ANN001", # missing-type-function-argument; avoid conflict with fixtures + "ANN201", # missing-return-type-undocumented-public-function; avoid conflict with test case return values + "ANN202", # missing-return-type-undocumented-private-function; avoid conflict with test case return values + "PLR0913", # too-many-arguments; avoid conflict with too many fixtures + "PLR0915", # too-many-statements; avoid conflict with long fixtures +] diff --git a/usaspending_api/accounts/helpers.py b/usaspending_api/accounts/helpers.py index a978574d32..c4c1ab3532 100644 --- a/usaspending_api/accounts/helpers.py +++ b/usaspending_api/accounts/helpers.py @@ -1,6 +1,5 @@ import datetime - TAS_COMPONENT_TO_FIELD_MAPPING = { "ata": "allocation_transfer_agency_id", "aid": "agency_id", @@ -12,7 +11,9 @@ } -def 
start_and_end_dates_from_fyq(fiscal_year, fiscal_quarter): +def start_and_end_dates_from_fyq( + fiscal_year: int, fiscal_quarter: int +) -> tuple[datetime.date, datetime.date]: if fiscal_quarter == 1: start_date = datetime.date(fiscal_year - 1, 10, 1) end_date = datetime.date(fiscal_year - 1, 12, 31) diff --git a/usaspending_api/awards/delta_models/awards.py b/usaspending_api/awards/delta_models/awards.py index 1b5119cf21..d87124862b 100644 --- a/usaspending_api/awards/delta_models/awards.py +++ b/usaspending_api/awards/delta_models/awards.py @@ -17,7 +17,7 @@ "generated_unique_award_id": "STRING NOT NULL", "id": "LONG", "is_fpds": "BOOLEAN NOT NULL", - "last_modified_date": "DATE", + "last_modified_date": "TIMESTAMP", "latest_transaction_id": "LONG", "non_federal_funding_amount": "NUMERIC(23,2)", "officer_1_amount": "NUMERIC(23,2)", @@ -55,4 +55,5 @@ ) USING DELTA LOCATION 's3a://{{SPARK_S3_BUCKET}}/{{DELTA_LAKE_S3_PATH}}/{{DESTINATION_DATABASE}}/{{DESTINATION_TABLE}}' + TBLPROPERTIES (delta.enableChangeDataFeed = true) """ diff --git a/usaspending_api/awards/migrations/0115_alter_transactionnormalized_last_modified_date.py b/usaspending_api/awards/migrations/0115_alter_transactionnormalized_last_modified_date.py new file mode 100644 index 0000000000..dc4df9a4ce --- /dev/null +++ b/usaspending_api/awards/migrations/0115_alter_transactionnormalized_last_modified_date.py @@ -0,0 +1,15 @@ +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("awards", "0114_alter_ctodlinkageupdates_award_id"), + ] + + operations = [ + migrations.AlterField( + model_name="transactionnormalized", + name="last_modified_date", + field=models.DateTimeField(null=True), + ), + ] diff --git a/usaspending_api/awards/models/transaction_normalized.py b/usaspending_api/awards/models/transaction_normalized.py index 7ab1180ebf..cc0324a5f0 100644 --- a/usaspending_api/awards/models/transaction_normalized.py +++ b/usaspending_api/awards/models/transaction_normalized.py @@ -1,14 +1,15 @@ import os -from django.db import models - from django.contrib.postgres.fields import ArrayField +from django.db import models class TransactionNormalized(models.Model): id = models.BigAutoField(primary_key=True) award = models.ForeignKey( - "search.AwardSearch", on_delete=models.DO_NOTHING, help_text="The award which this transaction is contained in" + "search.AwardSearch", + on_delete=models.DO_NOTHING, + help_text="The award which this transaction is contained in", ) usaspending_unique_transaction_id = models.TextField( blank=True, @@ -29,7 +30,9 @@ class TransactionNormalized(models.Model): help_text="The plain text description of the transaction type", ) period_of_performance_start_date = models.DateField( - verbose_name="Period of Performance Start Date", null=True, help_text="The period of performance start date" + verbose_name="Period of Performance Start Date", + null=True, + help_text="The period of performance start date", ) period_of_performance_current_end_date = models.DateField( verbose_name="Period of Performance Current End Date", @@ -37,9 +40,15 @@ class TransactionNormalized(models.Model): help_text="The current end date of the period of performance", ) action_date = models.DateField( - verbose_name="Transaction Date", help_text="The date this transaction was actioned", db_index=True + verbose_name="Transaction Date", + help_text="The date this transaction was actioned", + db_index=True, + ) + action_type = models.TextField( + blank=True, + null=True, + help_text="The 
type of transaction. For example, A, B, C, D", ) - action_type = models.TextField(blank=True, null=True, help_text="The type of transaction. For example, A, B, C, D") action_type_description = models.TextField(blank=True, null=True) federal_action_obligation = models.DecimalField( max_digits=23, @@ -90,22 +99,36 @@ class TransactionNormalized(models.Model): null=True, help_text="The agency which is funding this transaction", ) - description = models.TextField(null=True, help_text="The description of this transaction") - last_modified_date = models.DateField( + description = models.TextField( + null=True, help_text="The description of this transaction" + ) + last_modified_date = models.DateTimeField( blank=True, null=True, help_text="The date this transaction was last modified" ) - certified_date = models.DateField(blank=True, null=True, help_text="The date this transaction was certified") + certified_date = models.DateField( + blank=True, null=True, help_text="The date this transaction was certified" + ) create_date = models.DateTimeField( - auto_now_add=True, blank=True, null=True, help_text="The date this transaction was created in the API" + auto_now_add=True, + blank=True, + null=True, + help_text="The date this transaction was created in the API", ) update_date = models.DateTimeField( - auto_now=True, null=True, help_text="The last time this transaction was updated in the API", db_index=True + auto_now=True, + null=True, + help_text="The last time this transaction was updated in the API", + db_index=True, + ) + fiscal_year = models.IntegerField( + blank=True, null=True, help_text="Fiscal Year calculated based on Action Date" ) - fiscal_year = models.IntegerField(blank=True, null=True, help_text="Fiscal Year calculated based on Action Date") transaction_unique_id = models.TextField( blank=False, null=False, default="NONE", verbose_name="Transaction Unique ID" ) - is_fpds = models.BooleanField(blank=False, null=False, default=False, verbose_name="Is FPDS") + is_fpds = models.BooleanField( + blank=False, null=False, default=False, verbose_name="Is FPDS" + ) funding_amount = models.DecimalField( max_digits=23, decimal_places=2, @@ -114,7 +137,11 @@ class TransactionNormalized(models.Model): help_text="Assistance data variable. non_federal_funding_amount + federal_action_obligation", ) non_federal_funding_amount = models.DecimalField( - max_digits=23, decimal_places=2, blank=True, null=True, help_text="Assistance Data variable." + max_digits=23, + decimal_places=2, + blank=True, + null=True, + help_text="Assistance Data variable.", ) unique_award_key = models.TextField(null=True, db_index=True) # From broker. 
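Editor's note: `last_modified_date` moves from `DateField` to `DateTimeField` here (matching the `DATE` to `TIMESTAMP` change in the Delta model above), which is why the test fixtures later in this diff switch to values like `"2000-01-02 00:00:00+0000"`. Endpoints that still expose a plain date truncate via `get_date_from_datetime`, newly imported in `orm.py` below. A minimal sketch of that truncation under an assumed helper behavior; the body shown is for illustration only, not the repo's actual implementation:

```python
import datetime


def get_date_from_datetime(date_time, **kwargs):
    # Assumed behavior: truncate datetimes to a date, otherwise fall back
    # to an optional default (e.g., for None inputs).
    if isinstance(date_time, datetime.datetime):
        return date_time.date()
    return kwargs.get("default", date_time)


last_modified = datetime.datetime(2000, 1, 2, tzinfo=datetime.timezone.utc)
assert get_date_from_datetime(last_modified) == datetime.date(2000, 1, 2)
```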
business_categories = ArrayField(models.TextField(), default=list) @@ -149,9 +176,18 @@ class Meta: vw_transaction_normalized_sql = f""" CREATE OR REPLACE VIEW rpt.vw_transaction_normalized AS SELECT - {(','+os.linesep+' '*12).join([ - (v+(f'::{NORM_CASTED_COL_MAP[k]}' if k in NORM_CASTED_COL_MAP else '')).ljust(62)+' AS '+k.ljust(48) - for k, v in NORM_TO_TRANSACTION_SEARCH_COL_MAP.items()])} + { + ("," + os.linesep + " " * 12).join( + [ + ( + v + (f"::{NORM_CASTED_COL_MAP[k]}" if k in NORM_CASTED_COL_MAP else "") + ).ljust(62) + + " AS " + + k.ljust(48) + for k, v in NORM_TO_TRANSACTION_SEARCH_COL_MAP.items() + ] + ) +} FROM rpt.transaction_search; """ diff --git a/usaspending_api/awards/tests/integration/test_awards_v2.py b/usaspending_api/awards/tests/integration/test_awards_v2.py index a695be16c8..1e7a7b582c 100644 --- a/usaspending_api/awards/tests/integration/test_awards_v2.py +++ b/usaspending_api/awards/tests/integration/test_awards_v2.py @@ -2,6 +2,7 @@ import json import pytest +from django.test import Client from model_bakery import baker from rest_framework import status @@ -14,15 +15,29 @@ def awards_and_transactions(db): baker.make("recipient.DUNS", **duns) # Recipient Lookup - parent_recipient_lookup = {"duns": "123", "uei": "ABC", "recipient_hash": "cfd3f3f5-2162-7679-9f6b-429cecaa3e1e"} - recipient_lookup = {"duns": "456", "uei": "DEF", "recipient_hash": "66545a8d-bf37-3eda-cce5-29c6170c9aab"} + parent_recipient_lookup = { + "duns": "123", + "uei": "ABC", + "recipient_hash": "cfd3f3f5-2162-7679-9f6b-429cecaa3e1e", + } + recipient_lookup = { + "duns": "456", + "uei": "DEF", + "recipient_hash": "66545a8d-bf37-3eda-cce5-29c6170c9aab", + } baker.make("recipient.RecipientLookup", **parent_recipient_lookup) baker.make("recipient.RecipientLookup", **recipient_lookup) # Recipient Profile - parent_recipient_profile = {"recipient_hash": "cfd3f3f5-2162-7679-9f6b-429cecaa3e1e", "recipient_level": "P"} - recipient_profile = {"recipient_hash": "66545a8d-bf37-3eda-cce5-29c6170c9aab", "recipient_level": "C"} + parent_recipient_profile = { + "recipient_hash": "cfd3f3f5-2162-7679-9f6b-429cecaa3e1e", + "recipient_level": "P", + } + recipient_profile = { + "recipient_hash": "66545a8d-bf37-3eda-cce5-29c6170c9aab", + "recipient_level": "C", + } baker.make("recipient.RecipientProfile", **parent_recipient_profile) baker.make("recipient.RecipientProfile", **recipient_profile) @@ -59,30 +74,66 @@ def awards_and_transactions(db): baker.make("references.PSC", code="1005", description="More specific whatever") baker.make("references.PSC", code="A", description="R&D") baker.make("references.PSC", code="A1", description="R&D - Steak Sauce") - baker.make("references.PSC", code="A13", description="R&D - Brand specific steak condiments") - baker.make("references.PSC", code="A136", description="R&D - Very specific steak research") + baker.make( + "references.PSC", + code="A13", + description="R&D - Brand specific steak condiments", + ) + baker.make( + "references.PSC", code="A136", description="R&D - Very specific steak research" + ) baker.make("references.PSC", code="M", description="Something") baker.make("references.PSC", code="M1", description="Something More Specific") baker.make("references.PSC", code="M123", description="Something Most Specific") # NAICS baker.make("references.NAICS", code="11", description="Agriculture") - baker.make("references.NAICS", code="1111", description="Soybean & Oilseed Agriculture") + baker.make( + "references.NAICS", code="1111", description="Soybean & Oilseed 
Agriculture" + ) baker.make("references.NAICS", code="111120", description="Soybean Harvesting") # Toptier Agency - toptier_agency_1 = {"pk": 1, "abbreviation": "TA1", "name": "TOPTIER AGENCY 1", "toptier_code": "ABC"} - toptier_agency_2 = {"pk": 2, "abbreviation": "TA2", "name": "TOPTIER AGENCY 2", "toptier_code": "002"} + toptier_agency_1 = { + "pk": 1, + "abbreviation": "TA1", + "name": "TOPTIER AGENCY 1", + "toptier_code": "ABC", + } + toptier_agency_2 = { + "pk": 2, + "abbreviation": "TA2", + "name": "TOPTIER AGENCY 2", + "toptier_code": "002", + } - ta1 = baker.make("references.ToptierAgency", **toptier_agency_1, _fill_optional=True) - ta2 = baker.make("references.ToptierAgency", **toptier_agency_2, _fill_optional=True) + ta1 = baker.make( + "references.ToptierAgency", **toptier_agency_1, _fill_optional=True + ) + ta2 = baker.make( + "references.ToptierAgency", **toptier_agency_2, _fill_optional=True + ) # Subtier Agency - subtier_agency_1 = {"pk": 1, "abbreviation": "SA1", "name": "SUBTIER AGENCY 1", "subtier_code": "DEF"} - subtier_agency_2 = {"pk": 2, "abbreviation": "SA2", "name": "SUBTIER AGENCY 2", "subtier_code": "1000"} + subtier_agency_1 = { + "pk": 1, + "abbreviation": "SA1", + "name": "SUBTIER AGENCY 1", + "subtier_code": "DEF", + } + subtier_agency_2 = { + "pk": 2, + "abbreviation": "SA2", + "name": "SUBTIER AGENCY 2", + "subtier_code": "1000", + } - sa1 = baker.make("references.SubtierAgency", **subtier_agency_1, _fill_optional=True) - sa2 = baker.make("references.SubtierAgency", **subtier_agency_2, _fill_optional=True) + sa1 = baker.make( + "references.SubtierAgency", **subtier_agency_1, _fill_optional=True + ) + sa2 = baker.make( + "references.SubtierAgency", **subtier_agency_2, _fill_optional=True + ) # Agency agency = { @@ -130,7 +181,7 @@ def awards_and_transactions(db): "recipient_location_state_name": "North Carolina", "legal_entity_zip_last4": "5312", "recipient_location_zip5": "12204", - "last_modified_date": "2000-01-02", + "last_modified_date": "2000-01-02 00:00:00+0000", "officer_1_amount": 50000.00, "officer_1_name": "John Apple", "officer_2_amount": 4623.00, @@ -183,7 +234,7 @@ def awards_and_transactions(db): "recipient_location_state_name": "North Carolina", "legal_entity_zip_last4": "5312", "recipient_location_zip5": "12204", - "last_modified_date": "2000-01-02", + "last_modified_date": "2000-01-02 00:00:00+0000", "non_federal_funding_amount": 0, "officer_1_amount": 50000.00, "officer_1_name": "John Apple", @@ -236,7 +287,7 @@ def awards_and_transactions(db): "recipient_location_state_name": "North Carolina", "legal_entity_zip_last4": "5312", "recipient_location_zip5": "12204", - "last_modified_date": "2000-01-02", + "last_modified_date": "2000-01-02 00:00:00+0000", "non_federal_funding_amount": 0, "officer_1_amount": 50000.00, "officer_1_name": "John Apple", @@ -289,7 +340,7 @@ def awards_and_transactions(db): "recipient_location_state_name": "North Carolina", "legal_entity_zip_last4": "5312", "recipient_location_zip5": "12204", - "last_modified_date": "2000-01-02", + "last_modified_date": "2000-01-02 00:00:00+0000", "non_federal_funding_amount": 0, "officer_1_amount": 50000.00, "officer_1_name": "John Apple", @@ -344,7 +395,7 @@ def awards_and_transactions(db): "recipient_location_state_name": None, "legal_entity_zip_last4": "5312", "recipient_location_zip5": "12204", - "last_modified_date": "2000-01-02", + "last_modified_date": "2000-01-02 00:00:00+0000", "non_federal_funding_amount": 0, "officer_1_amount": 50000.00, "officer_1_name": "John Apple", @@ 
-405,7 +456,7 @@ def awards_and_transactions(db): "information_technolog_desc": "NOT IT PRODUCTS OR SERVICES", "interagency_contract_desc": "NOT APPLICABLE", "labor_standards_descrip": "NO", - "last_modified_date": "2001-02-03", + "last_modified_date": "2001-02-03 00:00:00+0000", "legal_entity_address_line1": "123 main st", "legal_entity_address_line2": None, "legal_entity_address_line3": None, @@ -495,7 +546,7 @@ def awards_and_transactions(db): "information_technolog_desc": "NOT IT PRODUCTS OR SERVICES", "interagency_contract_desc": "NOT APPLICABLE", "labor_standards_descrip": "NO", - "last_modified_date": "2001-02-03", + "last_modified_date": "2001-02-03 00:00:00+0000", "legal_entity_address_line1": "123 main st", "legal_entity_address_line2": None, "legal_entity_address_line3": None, @@ -585,7 +636,7 @@ def awards_and_transactions(db): "information_technolog_desc": "NOT IT PRODUCTS OR SERVICES", "interagency_contract_desc": "NOT APPLICABLE", "labor_standards_descrip": "NO", - "last_modified_date": "2001-02-03", + "last_modified_date": "2001-02-03 00:00:00+0000", "legal_entity_address_line1": "123 main st", "legal_entity_address_line2": None, "legal_entity_address_line3": None, @@ -971,15 +1022,27 @@ def awards_and_transactions(db): "rollup_total_obligation": 4500, "parent_award_id": None, } - parent_award_2 = {"award_id": 8, "generated_unique_award_id": "CONT_IDV_AWARD8_1000", "parent_award_id": 9} - parent_award_3 = {"award_id": 9, "generated_unique_award_id": "CONT_IDV_AWARD9_1000", "parent_award_id": None} + parent_award_2 = { + "award_id": 8, + "generated_unique_award_id": "CONT_IDV_AWARD8_1000", + "parent_award_id": 9, + } + parent_award_3 = { + "award_id": 9, + "generated_unique_award_id": "CONT_IDV_AWARD9_1000", + "parent_award_id": None, + } baker.make("awards.ParentAward", **parent_award_1) baker.make("awards.ParentAward", **parent_award_2) baker.make("awards.ParentAward", **parent_award_3) - dsws1 = baker.make("submissions.DABSSubmissionWindowSchedule", submission_reveal_date="2020-01-01") - baker.make("submissions.SubmissionAttributes", toptier_code="ABC", submission_window=dsws1) + dsws1 = baker.make( + "submissions.DABSSubmissionWindowSchedule", submission_reveal_date="2020-01-01" + ) + baker.make( + "submissions.SubmissionAttributes", toptier_code="ABC", submission_window=dsws1 + ) @pytest.fixture @@ -988,14 +1051,14 @@ def update_awards(db): baker.make("search.AwardSearch", award_id=12) -def test_award_last_updated_endpoint(client, update_awards): +def test_award_last_updated_endpoint(client: Client, update_awards): """Test the awards endpoint.""" resp = client.get("/api/v2/awards/last_updated/") assert resp.status_code == status.HTTP_200_OK assert resp.data["last_updated"] == datetime.datetime.now().strftime("%m/%d/%Y") -def test_award_endpoint_generated_id(client, awards_and_transactions): +def test_award_endpoint_generated_id(client: Client, awards_and_transactions): resp = client.get("/api/v2/awards/ASST_AGG_1830212.0481163_3620/") assert resp.status_code == status.HTTP_200_OK assert json.loads(resp.content.decode("utf-8")) == expected_response_asst @@ -1017,15 +1080,24 @@ def test_award_endpoint_generated_id(client, awards_and_transactions): assert json.loads(resp.content.decode("utf-8")) == expected_response_asst -def test_award_endpoint_parent_award(client, awards_and_transactions): - dsws1 = baker.make("submissions.DABSSubmissionWindowSchedule", submission_reveal_date="2020-01-01") - baker.make("submissions.SubmissionAttributes", toptier_code="ABC", 
submission_window=dsws1) - baker.make("submissions.SubmissionAttributes", toptier_code="002", submission_window=dsws1) +def test_award_endpoint_parent_award(client: Client, awards_and_transactions): + dsws1 = baker.make( + "submissions.DABSSubmissionWindowSchedule", submission_reveal_date="2020-01-01" + ) + baker.make( + "submissions.SubmissionAttributes", toptier_code="ABC", submission_window=dsws1 + ) + baker.make( + "submissions.SubmissionAttributes", toptier_code="002", submission_window=dsws1 + ) # Test contract award with parent resp = client.get("/api/v2/awards/7/") assert resp.status_code == status.HTTP_200_OK - assert json.loads(resp.content.decode("utf-8"))["parent_award"] == expected_contract_award_parent() + assert ( + json.loads(resp.content.decode("utf-8"))["parent_award"] + == expected_contract_award_parent() + ) # Test contract award without parent resp = client.get("/api/v2/awards/10/") @@ -1035,7 +1107,10 @@ def test_award_endpoint_parent_award(client, awards_and_transactions): # Test idv award with parent resp = client.get("/api/v2/awards/8/") assert resp.status_code == status.HTTP_200_OK - assert json.loads(resp.content.decode("utf-8"))["parent_award"] == expected_idv_award_parent() + assert ( + json.loads(resp.content.decode("utf-8"))["parent_award"] + == expected_idv_award_parent() + ) # Test idv award without parent resp = client.get("/api/v2/awards/9/") @@ -1043,13 +1118,15 @@ def test_award_endpoint_parent_award(client, awards_and_transactions): assert json.loads(resp.content.decode("utf-8"))["parent_award"] is None -def test_award_endpoint_parent_award_no_submissions(client, awards_and_transactions): +def test_award_endpoint_parent_award_no_submissions( + client: Client, awards_and_transactions +): # Test contract award with parent resp = client.get("/api/v2/awards/7/") assert resp.status_code == status.HTTP_200_OK - assert json.loads(resp.content.decode("utf-8"))["parent_award"] == expected_contract_award_parent( - include_slug=False - ) + assert json.loads(resp.content.decode("utf-8"))[ + "parent_award" + ] == expected_contract_award_parent(include_slug=False) # Test contract award without parent resp = client.get("/api/v2/awards/10/") @@ -1059,7 +1136,9 @@ def test_award_endpoint_parent_award_no_submissions(client, awards_and_transacti # Test idv award with parent resp = client.get("/api/v2/awards/8/") assert resp.status_code == status.HTTP_200_OK - assert json.loads(resp.content.decode("utf-8"))["parent_award"] == expected_idv_award_parent(include_slug=False) + assert json.loads(resp.content.decode("utf-8"))[ + "parent_award" + ] == expected_idv_award_parent(include_slug=False) # Test idv award without parent resp = client.get("/api/v2/awards/9/") @@ -1067,7 +1146,7 @@ def test_award_endpoint_parent_award_no_submissions(client, awards_and_transacti assert json.loads(resp.content.decode("utf-8"))["parent_award"] is None -def test_award_multiple_cfdas(client, awards_and_transactions): +def test_award_multiple_cfdas(client: Client, awards_and_transactions): resp = client.get("/api/v2/awards/3/") assert resp.status_code == status.HTTP_200_OK assert json.loads(resp.content.decode("utf-8"))["cfda_info"] == [ @@ -1104,14 +1183,20 @@ def test_award_multiple_cfdas(client, awards_and_transactions): ] -def test_award_psc_hierarchy_types(client, awards_and_transactions): +def test_award_psc_hierarchy_types(client: Client, awards_and_transactions): resp = client.get("/api/v2/awards/5/") assert resp.status_code == status.HTTP_200_OK assert 
json.loads(resp.content.decode("utf-8"))["psc_hierarchy"] == { "toptier_code": {"description": "R&D", "code": "A"}, "midtier_code": {"description": "R&D - Steak Sauce", "code": "A1"}, - "subtier_code": {"description": "R&D - Brand specific steak condiments", "code": "A13"}, - "base_code": {"description": "R&D - Very specific steak research", "code": "A136"}, + "subtier_code": { + "description": "R&D - Brand specific steak condiments", + "code": "A13", + }, + "base_code": { + "description": "R&D - Very specific steak research", + "code": "A136", + }, } resp = client.get("/api/v2/awards/6/") @@ -1124,7 +1209,7 @@ def test_award_psc_hierarchy_types(client, awards_and_transactions): } -def test_foreign_city(client, awards_and_transactions): +def test_foreign_city(client: Client, awards_and_transactions): resp = client.get("/api/v2/awards/13/") assert resp.status_code == status.HTTP_200_OK assert json.loads(resp.content.decode("utf-8"))["recipient"]["location"] == { @@ -1146,25 +1231,34 @@ def test_foreign_city(client, awards_and_transactions): } -def test_special_characters(client, awards_and_transactions): +def test_special_characters(client: Client, awards_and_transactions): resp = client.get("/api/v2/awards/ASST_NON_:~$@*\"()%23/,^&+=`!'%/_. -_9700/") assert resp.status_code == status.HTTP_200_OK - resp = client.get("/api/v2/awards/count/transaction/ASST_NON_:~$@*\"()%23/,^&+=`!'%/_. -_9700/") + resp = client.get( + "/api/v2/awards/count/transaction/ASST_NON_:~$@*\"()%23/,^&+=`!'%/_. -_9700/" + ) assert resp.status_code == status.HTTP_200_OK - resp = client.get("/api/v2/awards/count/subaward/ASST_NON_:~$@*\"()%23/,^&+=`!'%/_. -_9700/") + resp = client.get( + "/api/v2/awards/count/subaward/ASST_NON_:~$@*\"()%23/,^&+=`!'%/_. -_9700/" + ) assert resp.status_code == status.HTTP_200_OK - resp = client.get("/api/v2/awards/count/federal_account/ASST_NON_:~$@*\"()%23/,^&+=`!'%/_. -_9700/") + resp = client.get( + "/api/v2/awards/count/federal_account/ASST_NON_:~$@*\"()%23/,^&+=`!'%/_. 
-_9700/" + ) assert resp.status_code == status.HTTP_200_OK -def test_zip4_switch(client, awards_and_transactions): +def test_zip4_switch(client: Client, awards_and_transactions): resp = client.get("/api/v2/awards/10/") assert resp.status_code == status.HTTP_200_OK - assert json.loads(resp.content.decode("utf-8"))["recipient"]["location"]["zip4"] == "0000" + assert ( + json.loads(resp.content.decode("utf-8"))["recipient"]["location"]["zip4"] + == "0000" + ) -def test_file_c_data(client, awards_and_transactions): +def test_file_c_data(client: Client, awards_and_transactions): defc = baker.make("references.DisasterEmergencyFundCode", code="L") baker.make( "submissions.DABSSubmissionWindowSchedule", @@ -1211,8 +1305,12 @@ def test_file_c_data(client, awards_and_transactions): # fiscal period is not 12 & is not after 2020-04-01, so we expect no data to come back resp = client.get("/api/v2/awards/1/") assert resp.status_code == status.HTTP_200_OK - assert json.loads(resp.content.decode("utf-8"))["account_obligations_by_defc"] == [{"code": "L", "amount": 100.0}] - assert json.loads(resp.content.decode("utf-8"))["account_outlays_by_defc"] == [{"code": "L", "amount": 0.0}] + assert json.loads(resp.content.decode("utf-8"))["account_obligations_by_defc"] == [ + {"code": "L", "amount": 100.0} + ] + assert json.loads(resp.content.decode("utf-8"))["account_outlays_by_defc"] == [ + {"code": "L", "amount": 0.0} + ] assert json.loads(resp.content.decode("utf-8"))["total_account_obligation"] == 100.0 assert json.loads(resp.content.decode("utf-8"))["total_account_outlay"] == 0.0 baker.make( @@ -1236,8 +1334,12 @@ def test_file_c_data(client, awards_and_transactions): resp = client.get("/api/v2/awards/1/") # now we have the period 12 data, so we expect outlays here assert resp.status_code == status.HTTP_200_OK - assert json.loads(resp.content.decode("utf-8"))["account_obligations_by_defc"] == [{"code": "L", "amount": 200.0}] - assert json.loads(resp.content.decode("utf-8"))["account_outlays_by_defc"] == [{"code": "L", "amount": 100.0}] + assert json.loads(resp.content.decode("utf-8"))["account_obligations_by_defc"] == [ + {"code": "L", "amount": 200.0} + ] + assert json.loads(resp.content.decode("utf-8"))["account_outlays_by_defc"] == [ + {"code": "L", "amount": 100.0} + ] assert json.loads(resp.content.decode("utf-8"))["total_account_obligation"] == 200.0 assert json.loads(resp.content.decode("utf-8"))["total_account_outlay"] == 100.0 baker.make( @@ -1261,8 +1363,12 @@ def test_file_c_data(client, awards_and_transactions): # again, period is not 12, no data reported resp = client.get("/api/v2/awards/1/") assert resp.status_code == status.HTTP_200_OK - assert json.loads(resp.content.decode("utf-8"))["account_obligations_by_defc"] == [{"code": "L", "amount": 210.0}] - assert json.loads(resp.content.decode("utf-8"))["account_outlays_by_defc"] == [{"code": "L", "amount": 100.0}] + assert json.loads(resp.content.decode("utf-8"))["account_obligations_by_defc"] == [ + {"code": "L", "amount": 210.0} + ] + assert json.loads(resp.content.decode("utf-8"))["account_outlays_by_defc"] == [ + {"code": "L", "amount": 100.0} + ] assert json.loads(resp.content.decode("utf-8"))["total_account_obligation"] == 210.0 assert json.loads(resp.content.decode("utf-8"))["total_account_outlay"] == 100.0 baker.make( @@ -1286,8 +1392,12 @@ def test_file_c_data(client, awards_and_transactions): # expect outlays here resp = client.get("/api/v2/awards/1/") assert resp.status_code == status.HTTP_200_OK - assert 
json.loads(resp.content.decode("utf-8"))["account_obligations_by_defc"] == [{"code": "L", "amount": 220.0}] - assert json.loads(resp.content.decode("utf-8"))["account_outlays_by_defc"] == [{"code": "L", "amount": 110.0}] + assert json.loads(resp.content.decode("utf-8"))["account_obligations_by_defc"] == [ + {"code": "L", "amount": 220.0} + ] + assert json.loads(resp.content.decode("utf-8"))["account_outlays_by_defc"] == [ + {"code": "L", "amount": 110.0} + ] assert json.loads(resp.content.decode("utf-8"))["total_account_obligation"] == 220.0 assert json.loads(resp.content.decode("utf-8"))["total_account_outlay"] == 110.0 baker.make( @@ -1310,13 +1420,17 @@ def test_file_c_data(client, awards_and_transactions): # period is 12 but amounts are 0, so we expect no change resp = client.get("/api/v2/awards/1/") assert resp.status_code == status.HTTP_200_OK - assert json.loads(resp.content.decode("utf-8"))["account_obligations_by_defc"] == [{"code": "L", "amount": 220.0}] - assert json.loads(resp.content.decode("utf-8"))["account_outlays_by_defc"] == [{"code": "L", "amount": 110.0}] + assert json.loads(resp.content.decode("utf-8"))["account_obligations_by_defc"] == [ + {"code": "L", "amount": 220.0} + ] + assert json.loads(resp.content.decode("utf-8"))["account_outlays_by_defc"] == [ + {"code": "L", "amount": 110.0} + ] assert json.loads(resp.content.decode("utf-8"))["total_account_obligation"] == 220.0 assert json.loads(resp.content.decode("utf-8"))["total_account_outlay"] == 110.0 -def test_outlay_calculations(client, awards_and_transactions): +def test_outlay_calculations(client: Client, awards_and_transactions: None): defc = baker.make("references.DisasterEmergencyFundCode", code="L") baker.make( "submissions.DABSSubmissionWindowSchedule", @@ -1374,8 +1488,12 @@ def test_outlay_calculations(client, awards_and_transactions): resp = client.get("/api/v2/awards/1/") assert resp.status_code == status.HTTP_200_OK - assert json.loads(resp.content.decode("utf-8"))["account_obligations_by_defc"] == [{"code": "L", "amount": 10.0}] - assert json.loads(resp.content.decode("utf-8"))["account_outlays_by_defc"] == [{"code": "L", "amount": 7.0}] + assert json.loads(resp.content.decode("utf-8"))["account_obligations_by_defc"] == [ + {"code": "L", "amount": 10.0} + ] + assert json.loads(resp.content.decode("utf-8"))["account_outlays_by_defc"] == [ + {"code": "L", "amount": 7.0} + ] assert json.loads(resp.content.decode("utf-8"))["total_account_obligation"] == 10.0 assert json.loads(resp.content.decode("utf-8"))["total_account_outlay"] == 7.0 assert json.loads(resp.content.decode("utf-8"))["total_outlay"] == 7.0 @@ -1435,7 +1553,11 @@ def test_outlay_calculations(client, awards_and_transactions): "code": "ABC", "slug": "toptier-agency-1", }, - "subtier_agency": {"name": "SUBTIER AGENCY 1", "abbreviation": "SA1", "code": "DEF"}, + "subtier_agency": { + "name": "SUBTIER AGENCY 1", + "abbreviation": "SA1", + "code": "DEF", + }, "office_agency_name": "awarding_office", }, "funding_agency": { @@ -1447,7 +1569,11 @@ def test_outlay_calculations(client, awards_and_transactions): "code": "ABC", "slug": "toptier-agency-1", }, - "subtier_agency": {"name": "SUBTIER AGENCY 1", "abbreviation": "SA1", "code": "DEF"}, + "subtier_agency": { + "name": "SUBTIER AGENCY 1", + "abbreviation": "SA1", + "code": "DEF", + }, "office_agency_name": "funding_office", }, "recipient": { @@ -1489,7 +1615,11 @@ def test_outlay_calculations(client, awards_and_transactions): {"name": None, "amount": None}, ] }, - "period_of_performance": 
{"start_date": "2004-02-04", "end_date": "2005-02-04", "last_modified_date": "2000-01-02"}, + "period_of_performance": { + "start_date": "2004-02-04", + "end_date": "2005-02-04", + "last_modified_date": "2000-01-02", + }, "place_of_performance": { "address_line1": None, "address_line2": None, @@ -1537,7 +1667,11 @@ def test_outlay_calculations(client, awards_and_transactions): "code": "ABC", "slug": "toptier-agency-1", }, - "subtier_agency": {"name": "SUBTIER AGENCY 1", "abbreviation": "SA1", "code": "DEF"}, + "subtier_agency": { + "name": "SUBTIER AGENCY 1", + "abbreviation": "SA1", + "code": "DEF", + }, "office_agency_name": "awarding_office", }, "funding_agency": { @@ -1549,7 +1683,11 @@ def test_outlay_calculations(client, awards_and_transactions): "code": "ABC", "slug": "toptier-agency-1", }, - "subtier_agency": {"name": "SUBTIER AGENCY 1", "abbreviation": "SA1", "code": "DEF"}, + "subtier_agency": { + "name": "SUBTIER AGENCY 1", + "abbreviation": "SA1", + "code": "DEF", + }, "office_agency_name": "funding_office", }, "recipient": { @@ -1691,7 +1829,10 @@ def test_outlay_calculations(client, awards_and_transactions): "date_signed": "2004-03-02", "naics_hierarchy": { "toptier_code": {"description": "Agriculture", "code": "11"}, - "midtier_code": {"description": "Soybean & Oilseed Agriculture", "code": "1111"}, + "midtier_code": { + "description": "Soybean & Oilseed Agriculture", + "code": "1111", + }, "base_code": {"description": "Soybean Harvesting", "code": "111120"}, }, "psc_hierarchy": { @@ -1721,7 +1862,7 @@ def test_outlay_calculations(client, awards_and_transactions): } -def expected_contract_award_parent(include_slug=True): +def expected_contract_award_parent(include_slug: bool = True) -> dict[str, any]: return { "agency_id": 2, "agency_name": "TOPTIER AGENCY 2", @@ -1737,7 +1878,7 @@ def expected_contract_award_parent(include_slug=True): } -def expected_idv_award_parent(include_slug=True): +def expected_idv_award_parent(include_slug: bool = True) -> dict[str, any]: return { "agency_id": 2, "agency_name": "TOPTIER AGENCY 2", diff --git a/usaspending_api/awards/v2/data_layer/orm.py b/usaspending_api/awards/v2/data_layer/orm.py index 289ffc5662..617324b652 100644 --- a/usaspending_api/awards/v2/data_layer/orm.py +++ b/usaspending_api/awards/v2/data_layer/orm.py @@ -1,12 +1,11 @@ import copy import logging - from collections import OrderedDict from decimal import Decimal -from django.db.models import Sum, F, Subquery -from django.utils.text import slugify from typing import Optional +from django.db.models import F, Subquery, Sum +from django.utils.text import slugify from usaspending_api.awards.models import ( Award, @@ -22,22 +21,30 @@ FPDS_AWARD_FIELDS, FPDS_CONTRACT_FIELDS, ) -from usaspending_api.awards.v2.data_layer.orm_utils import delete_keys_from_dict, split_mapper_into_qs -from usaspending_api.common.helpers.business_categories_helper import get_business_category_display_names -from usaspending_api.common.helpers.data_constants import state_code_from_name, state_name_from_code +from usaspending_api.awards.v2.data_layer.orm_utils import ( + delete_keys_from_dict, + split_mapper_into_qs, +) +from usaspending_api.awards.v2.data_layer.sql import defc_sql +from usaspending_api.common.helpers.business_categories_helper import ( + get_business_category_display_names, +) +from usaspending_api.common.helpers.data_constants import ( + state_code_from_name, + state_name_from_code, +) from usaspending_api.common.helpers.date_helper import get_date_from_datetime from 
usaspending_api.common.helpers.sql_helpers import execute_sql_to_ordered_dictionary from usaspending_api.common.recipient_lookups import obtain_recipient_uri from usaspending_api.references.models import ( + NAICS, + PSC, Agency, Cfda, DisasterEmergencyFundCode, - NAICS, - PSC, SubtierAgency, ToptierAgencyPublishedDABSView, ) -from usaspending_api.awards.v2.data_layer.sql import defc_sql from usaspending_api.search.models import AwardSearch logger = logging.getLogger("console") @@ -59,13 +66,19 @@ def construct_assistance_response(requested_award_dict: dict) -> OrderedDict: response["record_type"] = transaction["record_type"] response["cfda_info"] = fetch_all_cfda_details(award) - response["transaction_obligated_amount"] = fetch_transaction_obligated_amount_by_internal_award_id(award["id"]) + response["transaction_obligated_amount"] = ( + fetch_transaction_obligated_amount_by_internal_award_id(award["id"]) + ) response["funding_agency"] = fetch_agency_details(response["_funding_agency_id"]) if response["funding_agency"]: - response["funding_agency"]["office_agency_name"] = transaction["_funding_office_name"] + response["funding_agency"]["office_agency_name"] = transaction[ + "_funding_office_name" + ] response["awarding_agency"] = fetch_agency_details(response["_awarding_agency_id"]) if response["awarding_agency"]: - response["awarding_agency"]["office_agency_name"] = transaction["_awarding_office_name"] + response["awarding_agency"]["office_agency_name"] = transaction[ + "_awarding_office_name" + ] response["period_of_performance"] = OrderedDict( [ ("start_date", award["_start_date"]), @@ -105,15 +118,22 @@ def construct_contract_response(requested_award_dict: dict) -> OrderedDict: response["latest_transaction_contract_data"] = transaction response["funding_agency"] = fetch_agency_details(response["_funding_agency_id"]) if response["funding_agency"]: - response["funding_agency"]["office_agency_name"] = transaction["_funding_office_name"] + response["funding_agency"]["office_agency_name"] = transaction[ + "_funding_office_name" + ] response["awarding_agency"] = fetch_agency_details(response["_awarding_agency_id"]) if response["awarding_agency"]: - response["awarding_agency"]["office_agency_name"] = transaction["_awarding_office_name"] + response["awarding_agency"]["office_agency_name"] = transaction[ + "_awarding_office_name" + ] response["period_of_performance"] = OrderedDict( [ ("start_date", award["_start_date"]), ("end_date", award["_end_date"]), - ("last_modified_date", transaction["_last_modified"]), + ( + "last_modified_date", + get_date_from_datetime(transaction["_last_modified"]), + ), ("potential_end_date", transaction["_period_of_perf_potential_e"]), ] ) @@ -121,7 +141,9 @@ def construct_contract_response(requested_award_dict: dict) -> OrderedDict: response["executive_details"] = create_officers_object(award) response["place_of_performance"] = create_place_of_performance_object(transaction) if transaction["product_or_service_code"]: - response["psc_hierarchy"] = fetch_psc_hierarchy(transaction["product_or_service_code"]) + response["psc_hierarchy"] = fetch_psc_hierarchy( + transaction["product_or_service_code"] + ) if transaction["naics"]: response["naics_hierarchy"] = fetch_naics_hierarchy(transaction["naics"]) response["total_outlay"] = fetch_total_outlays(award["id"]) @@ -153,19 +175,28 @@ def construct_idv_response(requested_award_dict: dict) -> OrderedDict: transaction = fetch_fpds_details_by_pk(award["_trx"], mapper) - response["parent_award"] = 
fetch_idv_parent_award_details(award["generated_unique_award_id"]) + response["parent_award"] = fetch_idv_parent_award_details( + award["generated_unique_award_id"] + ) response["latest_transaction_contract_data"] = transaction response["funding_agency"] = fetch_agency_details(response["_funding_agency_id"]) if response["funding_agency"]: - response["funding_agency"]["office_agency_name"] = transaction["_funding_office_name"] + response["funding_agency"]["office_agency_name"] = transaction[ + "_funding_office_name" + ] response["awarding_agency"] = fetch_agency_details(response["_awarding_agency_id"]) if response["awarding_agency"]: - response["awarding_agency"]["office_agency_name"] = transaction["_awarding_office_name"] + response["awarding_agency"]["office_agency_name"] = transaction[ + "_awarding_office_name" + ] response["period_of_performance"] = OrderedDict( [ ("start_date", award["_start_date"]), ("end_date", transaction["_end_date"]), - ("last_modified_date", transaction["_last_modified_date"]), + ( + "last_modified_date", + get_date_from_datetime(transaction["_last_modified_date"]), + ), ("potential_end_date", transaction["_period_of_perf_potential_e"]), ] ) @@ -173,7 +204,9 @@ def construct_idv_response(requested_award_dict: dict) -> OrderedDict: response["executive_details"] = create_officers_object(award) response["place_of_performance"] = create_place_of_performance_object(transaction) if transaction["product_or_service_code"]: - response["psc_hierarchy"] = fetch_psc_hierarchy(transaction["product_or_service_code"]) + response["psc_hierarchy"] = fetch_psc_hierarchy( + transaction["product_or_service_code"] + ) if transaction["naics"]: response["naics_hierarchy"] = fetch_naics_hierarchy(transaction["naics"]) response["total_outlay"] = fetch_total_outlays(award["id"]) @@ -213,27 +246,46 @@ def create_recipient_object(db_row_dict: dict) -> OrderedDict: ( "business_categories", get_business_category_display_names( - fetch_business_categories_by_transaction_id(db_row_dict["_transaction_id"]) + fetch_business_categories_by_transaction_id( + db_row_dict["_transaction_id"] + ) ), ), ( "location", OrderedDict( [ - ("location_country_code", db_row_dict["_rl_location_country_code"]), + ( + "location_country_code", + db_row_dict["_rl_location_country_code"], + ), ("country_name", db_row_dict["_rl_country_name"]), ("state_code", db_row_dict["_rl_state_code"]), ("state_name", db_row_dict["_rl_state_name"]), - ("city_name", db_row_dict["_rl_city_name"] or db_row_dict.get("_rl_foreign_city")), + ( + "city_name", + db_row_dict["_rl_city_name"] + or db_row_dict.get("_rl_foreign_city"), + ), ("county_code", db_row_dict["_rl_county_code"]), ("county_name", db_row_dict["_rl_county_name"]), ("address_line1", db_row_dict["_rl_address_line1"]), ("address_line2", db_row_dict["_rl_address_line2"]), ("address_line3", db_row_dict["_rl_address_line3"]), - ("congressional_code", db_row_dict["_rl_congressional_code_current"]), - ("zip4", db_row_dict.get("_rl_zip_last_4") or db_row_dict.get("_rl_zip4")), + ( + "congressional_code", + db_row_dict["_rl_congressional_code_current"], + ), + ( + "zip4", + db_row_dict.get("_rl_zip_last_4") + or db_row_dict.get("_rl_zip4"), + ), ("zip5", db_row_dict["_rl_zip5"]), - ("foreign_postal_code", db_row_dict.get("_rl_foreign_postal_code")), + ( + "foreign_postal_code", + db_row_dict.get("_rl_foreign_postal_code"), + ), ("foreign_province", db_row_dict.get("_rl_foreign_province")), ] ), @@ -296,12 +348,19 @@ def fetch_award_details(filter_q: dict, mapper_fields: 
OrderedDict) -> dict: return Award.objects.filter(**filter_q).values(*vals).annotate(**ann).first() -def fetch_contract_parent_award_details(parent_piid: str, parent_fpds_agency: str) -> Optional[OrderedDict]: - parent_guai = "CONT_IDV_{}_{}".format(parent_piid or "NONE", parent_fpds_agency or "NONE") +def fetch_contract_parent_award_details( + parent_piid: str, parent_fpds_agency: str +) -> Optional[OrderedDict]: + parent_guai = "CONT_IDV_{}_{}".format( + parent_piid or "NONE", parent_fpds_agency or "NONE" + ) parent_award_ids = ( ParentAward.objects.filter(generated_unique_award_id=parent_guai) - .annotate(parent_award_award_id=F("award_id"), parent_award_guai=F("generated_unique_award_id")) + .annotate( + parent_award_award_id=F("award_id"), + parent_award_guai=F("generated_unique_award_id"), + ) .values("parent_award_award_id", "parent_award_guai") .first() ) @@ -311,7 +370,9 @@ def fetch_contract_parent_award_details(parent_piid: str, parent_fpds_agency: st def fetch_idv_parent_award_details(guai: str) -> Optional[OrderedDict]: parent_award_ids = ( - ParentAward.objects.filter(generated_unique_award_id=guai, parent_award__isnull=False) + ParentAward.objects.filter( + generated_unique_award_id=guai, parent_award__isnull=False + ) .annotate( parent_award_award_id=F("parent_award__award_id"), parent_award_guai=F("parent_award__generated_unique_award_id"), @@ -348,7 +409,9 @@ def _fetch_parent_award_details(parent_award_ids: dict) -> Optional[OrderedDict] return None parent_sub_agency = ( - SubtierAgency.objects.filter(subtier_code=parent_award["latest_transaction__contract_data__agency_id"]) + SubtierAgency.objects.filter( + subtier_code=parent_award["latest_transaction__contract_data__agency_id"] + ) .values("name", "subtier_agency_id") .first() ) @@ -358,7 +421,8 @@ def _fetch_parent_award_details(parent_award_ids: dict) -> Optional[OrderedDict] toptier_flag=True, toptier_agency_id=Subquery( Agency.objects.filter( - subtier_agency_id__isnull=False, subtier_agency_id=parent_sub_agency["subtier_agency_id"] + subtier_agency_id__isnull=False, + subtier_agency_id=parent_sub_agency["subtier_agency_id"], ).values("toptier_agency_id") ), ) @@ -379,17 +443,33 @@ def _fetch_parent_award_details(parent_award_ids: dict) -> Optional[OrderedDict] ("agency_id", parent_agency["id"] if parent_agency else None), ("agency_name", agency_name), ("agency_slug", slugify(agency_name) if has_agency_page else None), - ("sub_agency_id", parent_award["latest_transaction__contract_data__agency_id"]), - ("sub_agency_name", parent_sub_agency["name"] if parent_sub_agency else None), + ( + "sub_agency_id", + parent_award["latest_transaction__contract_data__agency_id"], + ), + ( + "sub_agency_name", + parent_sub_agency["name"] if parent_sub_agency else None, + ), ("award_id", parent_award_award_id), ("generated_unique_award_id", parent_award_guai), - ("idv_type_description", parent_award["latest_transaction__contract_data__idv_type_description"]), + ( + "idv_type_description", + parent_award["latest_transaction__contract_data__idv_type_description"], + ), ( "multiple_or_single_aw_desc", - parent_award["latest_transaction__contract_data__multiple_or_single_aw_desc"], + parent_award[ + "latest_transaction__contract_data__multiple_or_single_aw_desc" + ], ), ("piid", parent_award["latest_transaction__contract_data__piid"]), - ("type_of_idc_description", parent_award["latest_transaction__contract_data__type_of_idc_description"]), + ( + "type_of_idc_description", + parent_award[ + 
"latest_transaction__contract_data__type_of_idc_description" + ], + ), ] ) @@ -398,19 +478,33 @@ def _fetch_parent_award_details(parent_award_ids: dict) -> Optional[OrderedDict] def fetch_fabs_details_by_pk(primary_key: int, mapper: OrderedDict) -> dict: vals, ann = split_mapper_into_qs(mapper) - return TransactionFABS.objects.filter(pk=primary_key).values(*vals).annotate(**ann).first() + return ( + TransactionFABS.objects.filter(pk=primary_key) + .values(*vals) + .annotate(**ann) + .first() + ) def fetch_fpds_details_by_pk(primary_key: int, mapper: OrderedDict) -> dict: vals, ann = split_mapper_into_qs(mapper) - return TransactionFPDS.objects.filter(pk=primary_key).values(*vals).annotate(**ann).first() + return ( + TransactionFPDS.objects.filter(pk=primary_key) + .values(*vals) + .annotate(**ann) + .first() + ) -def fetch_latest_ec_details(award_id: int, mapper: OrderedDict, transaction_type: str) -> dict: +def fetch_latest_ec_details( + award_id: int, mapper: OrderedDict, transaction_type: str +) -> dict: vals, ann = split_mapper_into_qs(mapper) model = TransactionFPDS if transaction_type == "fpds" else TransactionFABS retval = ( - model.objects.filter(transaction__award_id=award_id, officer_1_name__isnull=False) + model.objects.filter( + transaction__award_id=award_id, officer_1_name__isnull=False + ) .values(*vals) .annotate(**ann) .order_by("-action_date") @@ -418,8 +512,10 @@ def fetch_latest_ec_details(award_id: int, mapper: OrderedDict, transaction_type return retval.first() -def agency_has_file_c_submission(toptier_agency_id): - return ToptierAgencyPublishedDABSView.objects.filter(toptier_agency_id=toptier_agency_id).exists() +def agency_has_file_c_submission(toptier_agency_id: int) -> bool: + return ToptierAgencyPublishedDABSView.objects.filter( + toptier_agency_id=toptier_agency_id + ).exists() def fetch_agency_details(agency_id: int) -> Optional[dict]: @@ -444,7 +540,9 @@ def fetch_agency_details(agency_id: int) -> Optional[dict]: "name": agency["toptier_agency__name"], "code": agency["toptier_agency__toptier_code"], "abbreviation": agency["toptier_agency__abbreviation"], - "slug": slugify(agency["toptier_agency__name"]) if has_agency_page else None, + "slug": slugify(agency["toptier_agency__name"]) + if has_agency_page + else None, }, "subtier_agency": { "name": agency["subtier_agency__name"], @@ -456,7 +554,11 @@ def fetch_agency_details(agency_id: int) -> Optional[dict]: def fetch_business_categories_by_transaction_id(transaction_id: int) -> list: - tn = TransactionNormalized.objects.filter(pk=transaction_id).values("business_categories").first() + tn = ( + TransactionNormalized.objects.filter(pk=transaction_id) + .values("business_categories") + .first() + ) if tn: return tn["business_categories"] @@ -473,8 +575,15 @@ def normalize_cfda_number_format(fabs_transaction: dict) -> str: def fetch_all_cfda_details(award: dict) -> list: - fabs_values = ["cfda_number", "federal_action_obligation", "non_federal_funding_amount", "total_funding_amount"] - queryset = TransactionFABS.objects.filter(transaction__award_id=award["id"]).values(*fabs_values) + fabs_values = [ + "cfda_number", + "federal_action_obligation", + "non_federal_funding_amount", + "total_funding_amount", + ] + queryset = TransactionFABS.objects.filter(transaction__award_id=award["id"]).values( + *fabs_values + ) cfda_dicts = {} for transaction in queryset: clean_cfda_number_str = normalize_cfda_number_format(transaction) @@ -482,11 +591,17 @@ def fetch_all_cfda_details(award: dict) -> list: cfda_dicts.update( { 
clean_cfda_number_str: { - "federal_action_obligation": cfda_dicts[clean_cfda_number_str]["federal_action_obligation"] + "federal_action_obligation": cfda_dicts[clean_cfda_number_str][ + "federal_action_obligation" + ] + Decimal(transaction["federal_action_obligation"] or 0), - "non_federal_funding_amount": cfda_dicts[clean_cfda_number_str]["non_federal_funding_amount"] + "non_federal_funding_amount": cfda_dicts[clean_cfda_number_str][ + "non_federal_funding_amount" + ] + Decimal(transaction["non_federal_funding_amount"] or 0), - "total_funding_amount": cfda_dicts[clean_cfda_number_str]["total_funding_amount"] + "total_funding_amount": cfda_dicts[clean_cfda_number_str][ + "total_funding_amount" + ] + Decimal(transaction["total_funding_amount"] or 0), } } @@ -495,9 +610,15 @@ def fetch_all_cfda_details(award: dict) -> list: cfda_dicts.update( { clean_cfda_number_str: { - "federal_action_obligation": Decimal(transaction["federal_action_obligation"] or 0), - "non_federal_funding_amount": Decimal(transaction["non_federal_funding_amount"] or 0), - "total_funding_amount": Decimal(transaction["total_funding_amount"] or 0), + "federal_action_obligation": Decimal( + transaction["federal_action_obligation"] or 0 + ), + "non_federal_funding_amount": Decimal( + transaction["non_federal_funding_amount"] or 0 + ), + "total_funding_amount": Decimal( + transaction["total_funding_amount"] or 0 + ), } } ) @@ -519,10 +640,19 @@ def fetch_all_cfda_details(award: dict) -> list: ("cfda_popular_name", details.get("popular_name")), ("cfda_title", details.get("program_title")), ("cfda_website", details.get("website_address")), - ("federal_action_obligation_amount", cfda_dicts[cfda_number]["federal_action_obligation"]), - ("non_federal_funding_amount", cfda_dicts[cfda_number]["non_federal_funding_amount"]), + ( + "federal_action_obligation_amount", + cfda_dicts[cfda_number]["federal_action_obligation"], + ), + ( + "non_federal_funding_amount", + cfda_dicts[cfda_number]["non_federal_funding_amount"], + ), ("sam_website", details.get("url")), - ("total_funding_amount", cfda_dicts[cfda_number]["total_funding_amount"]), + ( + "total_funding_amount", + cfda_dicts[cfda_number]["total_funding_amount"], + ), ] ) ) @@ -547,10 +677,12 @@ def fetch_cfda_details_using_cfda_number(cfda: str) -> dict: return cfda_details or {} -def fetch_transaction_obligated_amount_by_internal_award_id(internal_award_id: int) -> Optional[Decimal]: - _sum = FinancialAccountsByAwards.objects.filter(award_id=internal_award_id).aggregate( - Sum("transaction_obligated_amount") - ) +def fetch_transaction_obligated_amount_by_internal_award_id( + internal_award_id: int, +) -> Optional[Decimal]: + _sum = FinancialAccountsByAwards.objects.filter( + award_id=internal_award_id + ).aggregate(Sum("transaction_obligated_amount")) if _sum: return _sum["transaction_obligated_amount__sum"] @@ -558,12 +690,19 @@ def fetch_transaction_obligated_amount_by_internal_award_id(internal_award_id: i def fetch_psc_hierarchy(psc_code: str) -> dict: - codes = [psc_code, psc_code[:2], psc_code[:1], psc_code[:3] if psc_code[0] == "A" else None] + codes = [ + psc_code, + psc_code[:2], + psc_code[:1], + psc_code[:3] if psc_code[0] == "A" else None, + ] toptier_code = {} midtier_code = {} subtier_code = {} # only used for R&D codes which start with "A" base_code = {} - if psc_code[0].isalpha(): # we only want to look for the toptier code for services, which start with letters + if psc_code[ + 0 + ].isalpha(): # we only want to look for the toptier code for services, which start 
with letters try: psc_top = PSC.objects.get(code=codes[2]) toptier_code = {"code": psc_top.code, "description": psc_top.description} @@ -579,7 +718,9 @@ def fetch_psc_hierarchy(psc_code: str) -> dict: base_code = {"code": psc.code, "description": psc.description} except PSC.DoesNotExist: pass - if codes[3] is not None: # don't bother looking for 3 digit codes unless they start with "A" + if ( + codes[3] is not None + ): # don't bother looking for 3 digit codes unless they start with "A" try: psc_rd = PSC.objects.get(code=codes[3]) subtier_code = {"code": psc_rd.code, "description": psc_rd.description} @@ -615,13 +756,19 @@ def fetch_naics_hierarchy(naics: str) -> dict: base_code = {"code": base.code, "description": base.description} except NAICS.DoesNotExist: pass - results = {"toptier_code": toptier_code, "midtier_code": midtier_code, "base_code": base_code} + results = { + "toptier_code": toptier_code, + "midtier_code": midtier_code, + "base_code": base_code, + } return results def fetch_account_details_award(award_id: int) -> dict: award_id_sql = "faba.award_id = {award_id}".format(award_id=award_id) - results = execute_sql_to_ordered_dictionary(defc_sql.format(award_id_sql=award_id_sql)) + results = execute_sql_to_ordered_dictionary( + defc_sql.format(award_id_sql=award_id_sql) + ) outlay_by_code = [] obligation_by_code = [] total_outlay = 0 @@ -631,8 +778,15 @@ def fetch_account_details_award(award_id: int) -> dict: if row["disaster_emergency_fund_code"] in defcs: total_outlay += row["total_outlay"] total_obligations += row["obligated_amount"] - outlay_by_code.append({"code": row["disaster_emergency_fund_code"], "amount": row["total_outlay"]}) - obligation_by_code.append({"code": row["disaster_emergency_fund_code"], "amount": row["obligated_amount"]}) + outlay_by_code.append( + {"code": row["disaster_emergency_fund_code"], "amount": row["total_outlay"]} + ) + obligation_by_code.append( + { + "code": row["disaster_emergency_fund_code"], + "amount": row["obligated_amount"], + } + ) results = { "total_account_outlay": total_outlay, "total_account_obligation": total_obligations, @@ -681,7 +835,9 @@ def fetch_total_outlays(award_id: int) -> dict: faba.gross_outlay_amount_by_award_cpe != 0 ); """ - results = execute_sql_to_ordered_dictionary(sql.format(award_id_sql=f"faba.award_id = {award_id}")) + results = execute_sql_to_ordered_dictionary( + sql.format(award_id_sql=f"faba.award_id = {award_id}") + ) if len(results) > 0: return results[0]["total_outlay"] return None diff --git a/usaspending_api/awards/v2/lookups/lookups.py b/usaspending_api/awards/v2/lookups/lookups.py index 17f052785c..cd64448e49 100644 --- a/usaspending_api/awards/v2/lookups/lookups.py +++ b/usaspending_api/awards/v2/lookups/lookups.py @@ -72,7 +72,11 @@ **direct_payment_award_mapping, **other_award_mapping, } -non_loan_assistance_award_mapping = {**grant_award_mapping, **direct_payment_award_mapping, **other_award_mapping} +non_loan_assistance_award_mapping = { + **grant_award_mapping, + **direct_payment_award_mapping, + **other_award_mapping, +} # TODO: include IDV mappings in the award_type_mapping and update award_filter.py award_type_mapping = { @@ -105,9 +109,24 @@ # 'F': 'Cooperative Agreement', # 'G': 'Grant for Research', # 'S': 'Funded Space Act Agreement', - # 'T': 'Training Grant' + # 'T': 'Training Grant', + "F001": "Grant", + "F002": "Cooperative Agreement", + "F003": "Direct Loan", + "F004": "Loan Guarantee", + "F005": "Indemnity / Insurance (non-loan)", + "F006": "Direct Payment for Specified Use", + 
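Editor's sketch: the new FABS "F00x" codes being added here feed the merged assistance mappings and the code-to-category lookup that this file builds further down. A minimal, trimmed restatement (two codes per dict; the full mappings live in lookups.py):

# Minimal sketch; trimmed to a subset of the real dicts in lookups.py.
grant_type_mapping = {"02": "Block Grant", "F001": "Grant"}
loan_type_mapping = {"07": "Direct Loan", "F003": "Direct Loan"}

assistance_type_mapping = {**grant_type_mapping, **loan_type_mapping}

all_award_types_mappings = {
    "grants": list(grant_type_mapping),
    "loans": list(loan_type_mapping),
}
# Same comprehension as the real module: invert category -> codes into code -> category.
all_awards_types_to_category = {
    type_code: category
    for category, type_codes in all_award_types_mappings.items()
    for type_code in type_codes
}
assert all_awards_types_to_category["F003"] == "loans"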
"F007": "Direct Payment with Unrestricted Use", + "F008": "Asset Forfeiture / Equitable Sharing", + "F009": "Sale, Exchange, or Donation of Property and Goods", + "F010": "Other Financial Assistance", +} +contract_type_mapping = { + "A": "BPA Call", + "B": "Purchase Order", + "C": "Delivery Order", + "D": "Definitive Contract", } -contract_type_mapping = {"A": "BPA Call", "B": "Purchase Order", "C": "Delivery Order", "D": "Definitive Contract"} idv_type_mapping = { "IDV_A": "GWAC Government Wide Acquisition Contract", "IDV_B": "IDC Multi-Agency Contract, Other Indefinite Delivery Contract", @@ -118,18 +137,47 @@ "IDV_D": "BOA Basic Ordering Agreement", "IDV_E": "BPA Blanket Purchase Agreement", } -grant_type_mapping = {"02": "Block Grant", "03": "Formula Grant", "04": "Project Grant", "05": "Cooperative Agreement"} -direct_payment_type_mapping = {"06": "Direct Payment for Specified Use", "10": "Direct Payment with Unrestricted Use"} -loan_type_mapping = {"07": "Direct Loan", "08": "Guaranteed/Insured Loan"} +grant_type_mapping = { + "02": "Block Grant", + "03": "Formula Grant", + "04": "Project Grant", + "05": "Cooperative Agreement", + "F001": "Grant", + "F002": "Cooperative Agreement", +} +direct_payment_type_mapping = { + "06": "Direct Payment for Specified Use", + "10": "Direct Payment with Unrestricted Use", + "F006": "Direct Payment for Specified Use", + "F007": "Direct Payment with Unrestricted Use", +} +loan_type_mapping = { + "07": "Direct Loan", + "08": "Guaranteed/Insured Loan", + "F003": "Direct Loan", + "F004": "Loan Guarantee", +} # -1 is a derived type that we added as a "catch-all" for any invalid `type` values -other_type_mapping = {"09": "Insurance", "11": "Other Financial Assistance", "-1": "Not Specified"} +other_type_mapping = { + "09": "Insurance", + "11": "Other Financial Assistance", + "-1": "Not Specified", + "F005": "Indemnity / Insurance (non-loan)", + "F008": "Asset Forfeiture / Equitable Sharing", + "F009": "Sale, Exchange, or Donation of Property and Goods", + "F010": "Other Financial Assistance", +} assistance_type_mapping = { **grant_type_mapping, **direct_payment_type_mapping, **loan_type_mapping, **other_type_mapping, } -non_loan_assistance_type_mapping = {**grant_type_mapping, **direct_payment_type_mapping, **other_type_mapping} +non_loan_assistance_type_mapping = { + **grant_type_mapping, + **direct_payment_type_mapping, + **other_type_mapping, +} procurement_type_mapping = {**contract_type_mapping, **idv_type_mapping} all_award_types_mappings = { "contracts": list(contract_type_mapping), @@ -141,9 +189,13 @@ } all_awards_types_to_category = { - type_code: category for category, type_codes in all_award_types_mappings.items() for type_code in type_codes + type_code: category + for category, type_codes in all_award_types_mappings.items() + for type_code in type_codes } all_subaward_types = ["grant", "procurement"] -SUBAWARD_MAPPING_LOOKUP = {key: value.replace(".keyword", "") for key, value in subaward_mapping.items()} +SUBAWARD_MAPPING_LOOKUP = { + key: value.replace(".keyword", "") for key, value in subaward_mapping.items() +} diff --git a/usaspending_api/common/data_classes.py b/usaspending_api/common/data_classes.py index ede54ed88c..ebef5ade96 100644 --- a/usaspending_api/common/data_classes.py +++ b/usaspending_api/common/data_classes.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Optional +from typing import Callable, Optional from typing_extensions import Literal @@ -35,7 +35,7 @@ def robust_order_by_fields(self) -> 
tuple[str] | tuple[str, str]: @dataclass class TransactionColumn: dest_name: str - source: Optional[str] + source: str | bool | None delta_type: str handling: Literal[ "cast", "leave_null", "literal", "normal", "parse_string_datetime_to_date", "string_datetime_remove_timestamp" @@ -47,4 +47,4 @@ class TransactionColumn: # calling code to format the string with a input. You should expect the scalar transformation # to be applied on this input. For example, a valid scalar_transformation string is # "CASE {input} WHEN 'UNITED STATES' THEN 'USA' ELSE {input} END" - scalar_transformation: str = None + scalar_transformation: Callable | None= None diff --git a/usaspending_api/common/etl/spark.py b/usaspending_api/common/etl/spark.py index 30770ef124..8d84201e52 100644 --- a/usaspending_api/common/etl/spark.py +++ b/usaspending_api/common/etl/spark.py @@ -10,19 +10,36 @@ import os import shutil import time -from collections import namedtuple from itertools import chain -from typing import List +from typing import Literal import duckdb +from delta import DeltaTable from duckdb.experimental.spark.sql import SparkSession as DuckDBSparkSession from duckdb.experimental.spark.sql.dataframe import DataFrame as DuckDBDataFrame -from pyspark.sql import DataFrame, SparkSession -from pyspark.sql.functions import col, concat, concat_ws, expr, lit, regexp_replace, to_date, transform, when +from pyspark.sql import Column, DataFrame, SparkSession +from pyspark.sql.functions import ( + col, + concat, + concat_ws, + expr, + lit, + regexp_replace, + to_date, + transform, + when, +) from pyspark.sql.types import ArrayType, DecimalType, StringType, StructType -from usaspending_api.accounts.models import AppropriationAccountBalances, FederalAccount, TreasuryAppropriationAccount -from usaspending_api.common.helpers.s3_helpers import rename_s3_object, retrieve_s3_bucket_object_list +from usaspending_api.accounts.models import ( + AppropriationAccountBalances, + FederalAccount, + TreasuryAppropriationAccount, +) +from usaspending_api.common.helpers.s3_helpers import ( + rename_s3_object, + retrieve_s3_bucket_object_list, +) from usaspending_api.common.helpers.spark_helpers import ( get_broker_jdbc_url, get_jdbc_connection_properties, @@ -30,7 +47,9 @@ ) from usaspending_api.config import CONFIG from usaspending_api.download.filestreaming.download_generation import EXCEL_ROW_LIMIT -from usaspending_api.financial_activities.models import FinancialAccountsByProgramActivityObjectClass +from usaspending_api.financial_activities.models import ( + FinancialAccountsByProgramActivityObjectClass, +) from usaspending_api.recipient.models import StateData from usaspending_api.references.models import ( CGAC, @@ -52,9 +71,15 @@ ToptierAgency, ZipsGrouped, ) -from usaspending_api.reporting.models import ReportingAgencyMissingTas, ReportingAgencyOverview +from usaspending_api.reporting.models import ( + ReportingAgencyMissingTas, + ReportingAgencyOverview, +) from usaspending_api.settings import CSV_LOCAL_PATH, IS_LOCAL, USASPENDING_AWS_REGION -from usaspending_api.submissions.models import DABSSubmissionWindowSchedule, SubmissionAttributes +from usaspending_api.submissions.models import ( + DABSSubmissionWindowSchedule, + SubmissionAttributes, +) MAX_PARTITIONS = CONFIG.SPARK_MAX_PARTITIONS _USAS_RDS_REF_TABLES = [ @@ -87,12 +112,17 @@ ZipsGrouped, ] -_BROKER_REF_TABLES = ["cd_state_grouped", "cd_zips_grouped", "cd_county_grouped", "cd_city_grouped"] +_BROKER_REF_TABLES = [ + "cd_state_grouped", + "cd_zips_grouped", + 
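Editor's sketch: the TransactionColumn docstring above still describes the string-template form of scalar_transformation, where the caller formats the template with an input expression. A short illustration of that form; "legal_entity_country_name" is a hypothetical input column, not taken from this diff:

# Sketch only: the column name is a made-up example.
template = "CASE {input} WHEN 'UNITED STATES' THEN 'USA' ELSE {input} END"
sql_fragment = template.format(input="legal_entity_country_name")
# -> "CASE legal_entity_country_name WHEN 'UNITED STATES' THEN 'USA'
#     ELSE legal_entity_country_name END"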
"cd_county_grouped", + "cd_city_grouped", +] logger = logging.getLogger(__name__) -def extract_db_data_frame( +def extract_db_data_frame( # noqa: PLR0912, PLR0913, PLR0915 spark: SparkSession, conn_props: dict, jdbc_url: str, @@ -293,28 +323,48 @@ def load_delta_table( spark: SparkSession, source_df: DataFrame, delta_table_name: str, - overwrite: bool = False, + save_mode: Literal["append", "merge", "overwrite"] = "append", + merge_condition: str | Column | None = None, + partition_columns: list[str] | None = None, ) -> None: """ Write DataFrame data to a table in Delta format. Args: spark: the SparkSession source_df: DataFrame with data to write - delta_table_name: table to write into. Currently this function requires the table to already exist. - overwrite: If True, will replace all existing data with that of the DataFrame, while append will add new data. - If left False (the default), the DataFrame data will be appended to existing data. + delta_table_name: table to write into. Currently, this function requires the table to already exist. + save_mode: one of "append", "merge", "overwrite" + merge_condition: merge_condition must be provided if save_mode is "merge" + partition_columns: list of column names to partition by Returns: None """ + start = time.perf_counter() logger.info(f"LOAD (START): Loading data into Delta table {delta_table_name}") # NOTE: Best to (only?) use .saveAsTable(name=) rather than .insertInto(tableName=) # ... The insertInto does not seem to align/merge columns from DataFrame to table columns (defaults to column order) - save_mode = "overwrite" if overwrite else "append" - source_df.write.format(source="delta").mode(saveMode=save_mode).saveAsTable(name=delta_table_name) - logger.info(f"LOAD (FINISH): Loaded data into Delta table {delta_table_name}") + if save_mode == "merge": + if merge_condition is None: + raise ValueError("merge_condition cannot be None when save_mode is 'merge'") + target = DeltaTable.forName(spark, delta_table_name).alias("t") + ( + target.merge(source_df.alias("s"), merge_condition) + .whenNotMatchedInsertAll() + .whenNotMatchedBySourceDelete() + .execute() + ) + else: + source_df.write.format(source="delta").mode(saveMode=save_mode).saveAsTable(name=delta_table_name) + end = time.perf_counter() + logger.info(f"LOAD (FINISH): Loaded data into Delta table {delta_table_name} in {end - start:.2f} seconds.") def load_es_index( - spark: SparkSession, source_df: DataFrame, base_config: dict, index_name: str, routing: str, doc_id: str + spark: SparkSession, + source_df: DataFrame, + base_config: dict, + index_name: str, + routing: str, + doc_id: str, ) -> None: # pragma: no cover -- will be used and tested eventually index_config = base_config.copy() index_config["es.resource.write"] = index_name @@ -331,7 +381,7 @@ def load_es_index( spark.sparkContext._jvm.org.elasticsearch.spark.sql.EsSparkSQL.saveToEs(jvm_data_df, jvm_es_config_map) -def merge_delta_table(spark: SparkSession, source_df: DataFrame, delta_table_name: str, merge_column: str): +def merge_delta_table(spark: SparkSession, source_df: DataFrame, delta_table_name: str, merge_column: str) -> None: source_df.create_or_replace_temporary_view("temp_table") spark.sql( @@ -345,7 +395,11 @@ def merge_delta_table(spark: SparkSession, source_df: DataFrame, delta_table_nam def diff( - left: DataFrame, right: DataFrame, unique_key_col="id", compare_cols=None, include_unchanged_rows=False + left: DataFrame, + right: DataFrame, + unique_key_col: str = "id", + compare_cols: list[str] | None = None, + 
include_unchanged_rows: bool = False, ) -> DataFrame: """Compares two Spark DataFrames that share a schema and returns row-level differences in a DataFrame @@ -426,14 +480,26 @@ def diff( differences = ( left.withColumn("exists", lit(1)) .alias("l") - .join(right.withColumn("exists", lit(1)).alias("r"), left[unique_key_col] == right[unique_key_col], "fullouter") + .join( + right.withColumn("exists", lit(1)).alias("r"), + left[unique_key_col] == right[unique_key_col], + "fullouter", + ) .withColumn("diff", expr(compare_expr)) ) # Put "diff" col first, then follow by the l and r value for each column, for all columns compared cols_to_show = ( ["diff"] + [f"l.{unique_key_col}", f"r.{unique_key_col}"] - + list(chain(*zip([f"l.{c}" for c in compare_cols], [f"r.{c}" for c in compare_cols], strict=False))) + + list( + chain( + *zip( + [f"l.{c}" for c in compare_cols], + [f"r.{c}" for c in compare_cols], + strict=False, + ) + ) + ) ) differences = differences.select(*cols_to_show) if not include_unchanged_rows: @@ -452,8 +518,8 @@ def convert_decimal_cols_to_string(df: DataFrame) -> DataFrame: def convert_array_cols_to_string( df: DataFrame, - is_postgres_array_format=False, - is_for_csv_export=False, + is_postgres_array_format: bool = False, + is_for_csv_export: bool = False, ) -> DataFrame: """For each column that is an Array of ANYTHING, transform it to a string-ified representation of that Array. @@ -482,7 +548,8 @@ def convert_array_cols_to_string( 2. Escape any quotes inside the array element with backslash. - A case that involves all of this will yield CSV field value like this when viewed in a text editor, assuming Spark CSV options are: quote='"', escape='"' (the default is for it to match quote) - ...,"{""{\""simple\"": \""elem1\"", \""other\"": \""elem1\""}"", ""{\""simple\"": \""elem2\"", \""other\"": \""elem2\""}""}",... + ...,"{""{\""simple\"": \""elem1\"", \""other\"": \""elem1\""}"", + ""{\""simple\"": \""elem2\"", \""other\"": \""elem2\""}""}",... 
""" arr_open_bracket = "[" arr_close_bracket = "]" @@ -517,10 +584,14 @@ def convert_array_cols_to_string( # Special handling in case of data that already has either a quote " or backslash \ # inside an array element # First replace any single backslash character \ with TWO \\ (an escaped backslash) - # Then replace any quote " character with \" (escaped quote, inside a quoted array elem) + # Then replace quote " character with \" (escaped quote, inside a quoted array elem) # NOTE: these regexp_replace get sent down to a Java replaceAll, which will require # FOUR backslashes to represent ONE - regexp_replace(regexp_replace(c, "\\\\", "\\\\\\\\"), '"', '\\\\"'), + regexp_replace( + regexp_replace(c, "\\\\", "\\\\\\\\"), + '"', + '\\\\"', + ), lit('"'), ), ) @@ -534,14 +605,14 @@ def convert_array_cols_to_string( return df_no_arrays -def build_ref_table_name_list(): +def build_ref_table_name_list() -> list[str]: return [rds_ref_table._meta.db_table for rds_ref_table in _USAS_RDS_REF_TABLES] -def _generate_global_view_sql_strings(tables: List[str], jdbc_url: str) -> List[str]: +def _generate_global_view_sql_strings(tables: list[str], jdbc_url: str) -> list[str]: """Generates the CREATE OR REPLACE SQL strings for each of the given tables and JDBC URL""" - sql_strings: List[str] = [] + sql_strings: list[str] = [] jdbc_conn_props = get_jdbc_connection_properties() for table_name in tables: @@ -561,7 +632,9 @@ def _generate_global_view_sql_strings(tables: List[str], jdbc_url: str) -> List[ return sql_strings -def create_ref_temp_views(spark: SparkSession | DuckDBSparkSession, create_broker_views: bool = False): +def create_ref_temp_views( # noqa: PLR0912 + spark: SparkSession | DuckDBSparkSession, create_broker_views: bool = False +) -> None: """Create global temporary Spark reference views that sit atop remote PostgreSQL RDS tables Setting create_broker_views to True will create views for all tables list in _BROKER_REF_TABLES Note: They will all be listed under global_temp.{table_name} @@ -579,15 +652,14 @@ def create_ref_temp_views(spark: SparkSession | DuckDBSparkSession, create_broke match isinstance(spark, DuckDBSparkSession): case True: logger.info("Creating ref temp views using DuckDB") - if IS_LOCAL: spark.sql( f""" CREATE OR REPLACE SECRET ( TYPE s3, PROVIDER config, - KEY_ID '{CONFIG.AWS_ACCESS_KEY}', - SECRET '{CONFIG.AWS_SECRET_KEY}', + KEY_ID '{CONFIG.AWS_ACCESS_KEY.get_secret_value()}', + SECRET '{CONFIG.AWS_SECRET_KEY.get_secret_value()}', ENDPOINT '{CONFIG.AWS_S3_ENDPOINT}', URL_STYLE 'path', USE_SSL 'false' @@ -610,7 +682,10 @@ def create_ref_temp_views(spark: SparkSession | DuckDBSparkSession, create_broke _download_delta_tables = [ {"schema": "rpt", "table_name": "account_balances_download"}, - {"schema": "rpt", "table_name": "object_class_program_activity_download"}, + { + "schema": "rpt", + "table_name": "object_class_program_activity_download", + }, ] # The DuckDB Delta extension is needed to interact with DeltaLake tables @@ -627,9 +702,9 @@ def create_ref_temp_views(spark: SparkSession | DuckDBSparkSession, create_broke """ ) logger.info(f"Successfully created table {table['schema']}.{table['table_name']}") - except duckdb.IOException: + except duckdb.IOException as exc: logger.exception(f"Failed to create table {table['table_name']}") - raise RuntimeError(f"Failed to create table {table['table_name']}") + raise RuntimeError(f"Failed to create table {table['table_name']}") from exc # The DuckDB Postgres extension is needed to connect to the USAS Postgres DB spark.sql("LOAD 
postgres; CREATE SCHEMA IF NOT EXISTS global_temp;") @@ -638,9 +713,9 @@ def create_ref_temp_views(spark: SparkSession | DuckDBSparkSession, create_broke for table in rds_ref_tables: try: spark.sql(f"CREATE OR REPLACE VIEW global_temp.{table} AS SELECT * FROM usas.public.{table};") - except duckdb.CatalogException: + except duckdb.CatalogException as exc: logger.exception(f"Failed to create view {table} for {table}") - raise RuntimeError(f"Failed to create view {table} for {table}") + raise RuntimeError(f"Failed to create view {table} for {table}") from exc if create_broker_views: spark.sql( @@ -654,9 +729,9 @@ def create_ref_temp_views(spark: SparkSession | DuckDBSparkSession, create_broke for table in _BROKER_REF_TABLES: try: spark.sql(f"CREATE OR REPLACE VIEW global_temp.{table} AS SELECT * FROM broker.public.{table};") - except duckdb.CatalogException: + except duckdb.CatalogException as exc: logger.exception(f"Failed to create view {table} for {table}") - raise RuntimeError(f"Failed to create view {table} for {table}") + raise RuntimeError(f"Failed to create view {table} for {table}") from exc case False: logger.info("Creating ref temp views using Spark") @@ -682,14 +757,14 @@ def create_ref_temp_views(spark: SparkSession | DuckDBSparkSession, create_broke logger.info("Created the reference views in the global_temp database") -def write_csv_file( +def write_csv_file( # noqa: PLR0913 spark: SparkSession, df: DataFrame, parts_dir: str, - max_records_per_file=EXCEL_ROW_LIMIT, - overwrite=True, - logger=None, - delimiter=",", + max_records_per_file: int = EXCEL_ROW_LIMIT, + overwrite: bool = True, + logger: logging.Logger | None = None, + delimiter: str = ",", ) -> int: """Write DataFrame data to CSV file parts. Args: @@ -785,7 +860,7 @@ def write_csv_file_duckdb( for dir in _partition_dirs: _old_csv_path = f"{dir}/{os.listdir(dir)[0]}" _new_csv_path = ( - f"{temp_csv_directory_path}{download_file_name}/{download_file_name}_{dir.split('=')[1].zfill(2)}.csv" + f"{temp_csv_directory_path}{download_file_name}" f"/{download_file_name}_{dir.split('=')[1].zfill(2)}.csv" ) shutil.move(_old_csv_path, _new_csv_path) full_file_paths.append(_new_csv_path) @@ -796,33 +871,6 @@ def write_csv_file_duckdb( return df_record_count, full_file_paths -def _merge_file_parts(fs, out_stream, conf, hadoop, partial_merged_file_path, part_file_list): - """Read-in files in alphabetical order and append them one by one to the merged file""" - - for part_file in part_file_list: - in_stream = None - try: - in_stream = fs.open(part_file) - # Write bytes of each file read and keep out_stream open after write for next file - hadoop.io.IOUtils.copyBytes(in_stream, out_stream, conf, False) - finally: - if in_stream: - in_stream.close() - if fs.exists(partial_merged_file_path): - fs.delete(partial_merged_file_path, True) - - -def _merge_grouper(items, group_size): - """Helper to chunk up files into mergeable groups""" - FileMergeGroup = namedtuple("FileMergeGroup", ["part", "file_list"]) - if len(items) <= group_size: - yield FileMergeGroup(None, items) - return - group_generator = (items[i : i + group_size] for i in range(0, len(items), group_size)) - for i, group in enumerate(group_generator, start=1): - yield FileMergeGroup(i, group) - - def rename_part_files( bucket_name: str, destination_file_name: str, @@ -849,7 +897,8 @@ def rename_part_files( [ file.key for file in retrieve_s3_bucket_object_list( - bucket_name, key_prefix=f"{temp_download_dir_name}/{destination_file_name}/part-" + bucket_name, + 
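Editor's sketch: a minimal call for the newly annotated write_csv_file above, assuming a SparkSession and DataFrame are already in scope; the parts directory is a placeholder path, not a real location:

# Sketch: "s3a://bucket/temp_download/example" is a placeholder.
written = write_csv_file(
    spark,
    df,
    parts_dir="s3a://bucket/temp_download/example",
    max_records_per_file=EXCEL_ROW_LIMIT,  # imported at the top of spark.py
    delimiter=",",
)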
key_prefix=f"{temp_download_dir_name}/{destination_file_name}/part-", ) if file.key.endswith(file_format) ] diff --git a/usaspending_api/common/helpers/date_helper.py b/usaspending_api/common/helpers/date_helper.py index 141037f9d1..5bf8a92c3f 100644 --- a/usaspending_api/common/helpers/date_helper.py +++ b/usaspending_api/common/helpers/date_helper.py @@ -1,37 +1,36 @@ import operator - from argparse import ArgumentTypeError -from datetime import datetime, timezone -from dateutil import parser +from datetime import date, datetime, timezone from typing import Callable +from dateutil import parser -def now(): + +def now() -> datetime: """Now now() is a standardized function to obtain "now" when you need it now.""" return datetime.now(timezone.utc) -def cast_datetime_to_naive(datetime): +def cast_datetime_to_naive(datetime_to_cast: datetime) -> datetime: """ Removes timezone information, but converts non-UTC datetimes to UTC - beforehand so that the returned datetime will be naive but will also be UTC. - """ - if datetime.tzinfo is not None: - datetime = datetime.astimezone(timezone.utc) - return datetime.replace(tzinfo=None) + beforehand so that the returned datetime will be naive but will also be UTC.""" + if datetime_to_cast.tzinfo is not None: + datetime_to_cast = datetime_to_cast.astimezone(timezone.utc) + return datetime_to_cast.replace(tzinfo=None) -def cast_datetime_to_utc(datetime): +def cast_datetime_to_utc(datetime_to_cast: datetime) -> datetime: """ If datetime has no tzinfo, assume it is UTC, otherwise convert the datetime to UTC. """ - if datetime.tzinfo is None: - return datetime.replace(tzinfo=timezone.utc) - return datetime.astimezone(timezone.utc) + if datetime_to_cast.tzinfo is None: + return datetime_to_cast.replace(tzinfo=timezone.utc) + return datetime_to_cast.astimezone(timezone.utc) -def datetime_command_line_argument_type(naive): +def datetime_command_line_argument_type(naive: bool) -> datetime: """ This function is designed to be used as a date/time type for argparse command line parameters. argparse parameter types need to be passed @@ -45,7 +44,7 @@ def datetime_command_line_argument_type(naive): is timezone aware. If it is timezone naive, it is assumed to be UTC. """ - def _datetime_command_line_argument_type(input_string): + def _datetime_command_line_argument_type(input_string: str) -> datetime: """ A very flexible date/time parser to be used as a command line argument parser. See wrapper for timezone handling instructions. @@ -62,25 +61,29 @@ def _datetime_command_line_argument_type(input_string): else: return cast_datetime_to_utc(parsed) - except (OverflowError, TypeError, ValueError): - raise ArgumentTypeError("Unable to convert provided value to date/time") + except (OverflowError, TypeError, ValueError) as exc: + raise ArgumentTypeError( + "Unable to convert provided value to date/time" + ) from exc return _datetime_command_line_argument_type -def get_date_from_datetime(date_time, **kwargs): +def get_date_from_datetime(date_time: datetime | str, **kwargs) -> date: """ Pass a keyword argument called "default" if you wish to have a specific value returned when the date cannot be extracted from date_time, otherwise date_time will be returned. 
""" try: + if isinstance(date_time, str): + date_time = parser.parse(date_time) return date_time.date() except Exception: return kwargs.get("default", date_time) -def fy(raw_date): +def fy(raw_date: str | date | None) -> int | None: """Federal fiscal year corresponding to date""" if raw_date is None: @@ -93,8 +96,8 @@ def fy(raw_date): result = raw_date.year if raw_date.month > 9: result += 1 - except AttributeError: - raise TypeError("{} needs year and month attributes".format(raw_date)) + except AttributeError as exc: + raise TypeError(f"{raw_date} needs year and month attributes") from exc return result @@ -109,7 +112,9 @@ def datetime_is_lt(first_datetime: datetime, second_datetime: datetime) -> bool: return _compare_datetimes(first_datetime, second_datetime, operator.lt) -def _compare_datetimes(first_datetime: datetime, second_datetime: datetime, op_func: Callable) -> bool: +def _compare_datetimes( + first_datetime: datetime, second_datetime: datetime, op_func: Callable +) -> bool: """Comparison of datetimes using provided function. If TZ-unaware, assumes UTC""" dt_1 = cast_datetime_to_utc(first_datetime) dt_2 = cast_datetime_to_utc(second_datetime) diff --git a/usaspending_api/common/spark/jobs.py b/usaspending_api/common/spark/jobs.py index fe397c2667..5e3f95e23e 100644 --- a/usaspending_api/common/spark/jobs.py +++ b/usaspending_api/common/spark/jobs.py @@ -13,6 +13,7 @@ from django.conf import settings from django.core.management import call_command from duckdb.experimental.spark.sql import SparkSession as DuckDBSparkSession +from usaspending_api.config import CONFIG from usaspending_api.common.spark.configs import LOCAL_EXTENDED_EXTRA_CONF, OPTIONAL_SPARK_HIVE_JAR, SPARK_SESSION_JARS @@ -165,10 +166,12 @@ def handle_start(self, job_name: str, command_name: str, command_options: list[s mode="BATCH", jobDriver={ "sparkSubmit": { - "entryPoint": command_name, - "entryPointArguments": command_options, + "entryPoint": f"s3://{CONFIG.SPARK_S3_BUCKET}/master/manage.py", + "entryPointArguments": [command_name, *command_options], } }, + # TODO: Requires updating to EMR 7 + # retryPolicy={"maxAttempts": 2}, ) return response diff --git a/usaspending_api/common/tests/integration/test_spark_jobs.py b/usaspending_api/common/tests/integration/test_spark_jobs.py index 389d41371e..063de13a8a 100644 --- a/usaspending_api/common/tests/integration/test_spark_jobs.py +++ b/usaspending_api/common/tests/integration/test_spark_jobs.py @@ -5,7 +5,7 @@ def test_local_spark_jobs_strategy(spark, s3_unittest_data_bucket, hive_unittest_metastore_db): expected_table_name = "award_search" delta_table_spec = TABLE_SPEC[expected_table_name] - expected_db_name = delta_table_spec["destination_database"] + expected_db_name = delta_table_spec.destination_database spark_jobs = SparkJobs(LocalStrategy()) spark_jobs.start( diff --git a/usaspending_api/disaster/tests/integration/test_cfda_count.py b/usaspending_api/disaster/tests/integration/test_cfda_count.py index f574229dc3..217c23e761 100644 --- a/usaspending_api/disaster/tests/integration/test_cfda_count.py +++ b/usaspending_api/disaster/tests/integration/test_cfda_count.py @@ -91,5 +91,5 @@ def test_invalid_award_type_codes( assert resp.status_code == status.HTTP_400_BAD_REQUEST assert ( resp.data["detail"] - == "Field 'filter|award_type_codes' is outside valid values ['-1', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11']" + == "Field 'filter|award_type_codes' is outside valid values ['-1', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', 
'F001', 'F002', 'F003', 'F004', 'F005', 'F006', 'F007', 'F008', 'F009', 'F010']" # noqa: E501 ) diff --git a/usaspending_api/disaster/tests/integration/test_cfda_loans.py b/usaspending_api/disaster/tests/integration/test_cfda_loans.py index 9807903d4e..f998c39f2d 100644 --- a/usaspending_api/disaster/tests/integration/test_cfda_loans.py +++ b/usaspending_api/disaster/tests/integration/test_cfda_loans.py @@ -219,7 +219,7 @@ def test_invalid_award_type_codes( resp = helpers.post_for_spending_endpoint(client, url, award_type_codes=["ZZ", "08"], def_codes=["L", "M"]) assert resp.status_code == status.HTTP_400_BAD_REQUEST - assert resp.data["detail"] == "Field 'filter|award_type_codes' is outside valid values ['07', '08']" + assert resp.data["detail"] == "Field 'filter|award_type_codes' is outside valid values ['07', '08', 'F003', 'F004']" @pytest.mark.django_db diff --git a/usaspending_api/disaster/tests/integration/test_cfda_spending.py b/usaspending_api/disaster/tests/integration/test_cfda_spending.py index 5f3e764516..b4dfc71b85 100644 --- a/usaspending_api/disaster/tests/integration/test_cfda_spending.py +++ b/usaspending_api/disaster/tests/integration/test_cfda_spending.py @@ -20,7 +20,11 @@ def test_correct_response_defc_no_results( @pytest.mark.django_db def test_correct_response_single_defc( - client, monkeypatch, helpers, elasticsearch_award_index, cfda_awards_and_transactions + client, + monkeypatch, + helpers, + elasticsearch_award_index, + cfda_awards_and_transactions, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) @@ -75,7 +79,11 @@ def test_correct_response_single_defc( @pytest.mark.django_db def test_correct_response_multiple_defc( - client, monkeypatch, helpers, elasticsearch_award_index, cfda_awards_and_transactions + client, + monkeypatch, + helpers, + elasticsearch_award_index, + cfda_awards_and_transactions, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) @@ -131,16 +139,24 @@ def test_correct_response_multiple_defc( @pytest.mark.django_db def test_correct_response_with_query( - client, monkeypatch, helpers, elasticsearch_award_index, cfda_awards_and_transactions + client, + monkeypatch, + helpers, + elasticsearch_award_index, + cfda_awards_and_transactions, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) - resp = helpers.post_for_spending_endpoint(client, url, def_codes=["L", "M"], query="GIBBERISH") + resp = helpers.post_for_spending_endpoint( + client, url, def_codes=["L", "M"], query="GIBBERISH" + ) expected_results = [] assert resp.status_code == status.HTTP_200_OK assert resp.json()["results"] == expected_results - resp = helpers.post_for_spending_endpoint(client, url, def_codes=["L", "M"], query="3") + resp = helpers.post_for_spending_endpoint( + client, url, def_codes=["L", "M"], query="3" + ) expected_results = [ { "code": "30.300", @@ -163,16 +179,24 @@ def test_correct_response_with_query( @pytest.mark.django_db def test_correct_response_with_award_type_codes( - client, monkeypatch, helpers, elasticsearch_award_index, cfda_awards_and_transactions + client, + monkeypatch, + helpers, + elasticsearch_award_index, + cfda_awards_and_transactions, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) - resp = helpers.post_for_spending_endpoint(client, url, def_codes=["L", "M"], award_type_codes=["11"]) + resp = helpers.post_for_spending_endpoint( + client, url, def_codes=["L", "M"], award_type_codes=["11"] + ) expected_results = [] assert resp.status_code == status.HTTP_200_OK assert 
resp.json()["results"] == expected_results - resp = helpers.post_for_spending_endpoint(client, url, def_codes=["L", "M"], award_type_codes=["07", "09", "11"]) + resp = helpers.post_for_spending_endpoint( + client, url, def_codes=["L", "M"], award_type_codes=["07", "09", "11"] + ) expected_results = [ { "code": "20.200", @@ -208,53 +232,91 @@ def test_correct_response_with_award_type_codes( @pytest.mark.django_db -def test_invalid_defc(client, monkeypatch, helpers, elasticsearch_award_index, cfda_awards_and_transactions): +def test_invalid_defc( + client, + monkeypatch, + helpers, + elasticsearch_award_index, + cfda_awards_and_transactions, +): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) resp = helpers.post_for_spending_endpoint(client, url, def_codes=["ZZ"]) assert resp.status_code == status.HTTP_400_BAD_REQUEST - assert resp.data["detail"] == "Field 'filter|def_codes' is outside valid values ['L', 'M', 'N']" + assert ( + resp.data["detail"] + == "Field 'filter|def_codes' is outside valid values ['L', 'M', 'N']" + ) @pytest.mark.django_db -def test_invalid_defc_type(client, monkeypatch, helpers, elasticsearch_award_index, cfda_awards_and_transactions): +def test_invalid_defc_type( + client, + monkeypatch, + helpers, + elasticsearch_award_index, + cfda_awards_and_transactions, +): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) resp = helpers.post_for_spending_endpoint(client, url, def_codes="100") assert resp.status_code == status.HTTP_400_BAD_REQUEST - assert resp.data["detail"] == "Invalid value in 'filter|def_codes'. '100' is not a valid type (array)" + assert ( + resp.data["detail"] + == "Invalid value in 'filter|def_codes'. '100' is not a valid type (array)" + ) @pytest.mark.django_db -def test_missing_defc(client, monkeypatch, helpers, elasticsearch_award_index, cfda_awards_and_transactions): +def test_missing_defc( + client, + monkeypatch, + helpers, + elasticsearch_award_index, + cfda_awards_and_transactions, +): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) resp = helpers.post_for_spending_endpoint(client, url) assert resp.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY - assert resp.data["detail"] == "Missing value: 'filter|def_codes' is a required field" + assert ( + resp.data["detail"] == "Missing value: 'filter|def_codes' is a required field" + ) @pytest.mark.django_db def test_invalid_award_type_codes( - client, monkeypatch, helpers, elasticsearch_award_index, cfda_awards_and_transactions + client, + monkeypatch, + helpers, + elasticsearch_award_index, + cfda_awards_and_transactions, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) - resp = helpers.post_for_spending_endpoint(client, url, award_type_codes=["ZZ", "08"], def_codes=["L", "M"]) + resp = helpers.post_for_spending_endpoint( + client, url, award_type_codes=["ZZ", "08"], def_codes=["L", "M"] + ) assert resp.status_code == status.HTTP_400_BAD_REQUEST assert ( resp.data["detail"] - == "Field 'filter|award_type_codes' is outside valid values ['-1', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11']" + == "Field 'filter|award_type_codes' is outside valid values ['-1', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', 'F001', 'F002', 'F003', 'F004', 'F005', 'F006', 'F007', 'F008', 'F009', 'F010']" # noqa: E501 ) @pytest.mark.django_db def test_pagination_page_and_limit( - client, monkeypatch, helpers, elasticsearch_award_index, cfda_awards_and_transactions + client, + monkeypatch, + helpers, + elasticsearch_award_index, + 
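Editor's note: the validation tests in this file all share one request/assert shape, condensed here for reference; "client", "helpers", and "url" are the module's fixtures and globals:

resp = helpers.post_for_spending_endpoint(
    client, url, award_type_codes=["ZZ", "08"], def_codes=["L", "M"]
)
assert resp.status_code == status.HTTP_400_BAD_REQUEST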
cfda_awards_and_transactions, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) - resp = helpers.post_for_spending_endpoint(client, url, def_codes=["L", "M"], page=2, limit=1, sort="description") + resp = helpers.post_for_spending_endpoint( + client, url, def_codes=["L", "M"], page=2, limit=1, sort="description" + ) expected_results = { "totals": {"award_count": 4, "obligation": 2222.0, "outlay": 1100.0}, "results": [ diff --git a/usaspending_api/disaster/tests/integration/test_disaster_agency_loans.py b/usaspending_api/disaster/tests/integration/test_disaster_agency_loans.py index 1404c0a9da..292ec2b1b5 100644 --- a/usaspending_api/disaster/tests/integration/test_disaster_agency_loans.py +++ b/usaspending_api/disaster/tests/integration/test_disaster_agency_loans.py @@ -111,4 +111,4 @@ def test_invalid_award_type_codes(client, monkeypatch, helpers, elasticsearch_aw resp = helpers.post_for_spending_endpoint(client, url, award_type_codes=["ZZ", "08"], def_codes=["L", "M"]) assert resp.status_code == status.HTTP_400_BAD_REQUEST - assert resp.data["detail"] == "Field 'filter|award_type_codes' is outside valid values ['07', '08']" + assert resp.data["detail"] == "Field 'filter|award_type_codes' is outside valid values ['07', '08', 'F003', 'F004']" diff --git a/usaspending_api/disaster/tests/integration/test_recipient_loans.py b/usaspending_api/disaster/tests/integration/test_recipient_loans.py index 1b047f5186..526a7a6dae 100644 --- a/usaspending_api/disaster/tests/integration/test_recipient_loans.py +++ b/usaspending_api/disaster/tests/integration/test_recipient_loans.py @@ -287,7 +287,7 @@ def test_invalid_award_type_codes(client, monkeypatch, helpers, elasticsearch_aw resp = helpers.post_for_spending_endpoint(client, url, award_type_codes=["ZZ", "08"], def_codes=["L", "M"]) assert resp.status_code == status.HTTP_400_BAD_REQUEST - assert resp.data["detail"] == "Field 'filter|award_type_codes' is outside valid values ['07', '08']" + assert resp.data["detail"] == "Field 'filter|award_type_codes' is outside valid values ['07', '08', 'F003', 'F004']" @pytest.mark.django_db diff --git a/usaspending_api/download/delta_downloads/account_balances.py b/usaspending_api/download/delta_downloads/account_balances.py index b8b0ceb61d..21eb9378c0 100644 --- a/usaspending_api/download/delta_downloads/account_balances.py +++ b/usaspending_api/download/delta_downloads/account_balances.py @@ -1,5 +1,7 @@ -from pyspark.sql import functions as sf, Column, DataFrame, SparkSession -from usaspending_api.config import CONFIG +from duckdb.experimental.spark.sql import SparkSession as DuckDBSparkSession +from duckdb.experimental.spark.sql.column import Column as DuckDBSparkColumn +from duckdb.experimental.spark.sql.dataframe import DataFrame as DuckDBSparkDataFrame +from pyspark.sql import Column, DataFrame, SparkSession from usaspending_api.common.spark.utils import collect_concat from usaspending_api.download.delta_downloads.abstract_downloads.account_download import ( @@ -11,34 +13,42 @@ AbstractAccountDownloadFactory, AccountDownloadConditionName, ) -from usaspending_api.download.delta_downloads.filters.account_filters import AccountDownloadFilters +from usaspending_api.download.delta_downloads.filters.account_filters import ( + AccountDownloadFilters, +) from usaspending_api.submissions.helpers import get_submission_ids_for_periods class AccountBalancesMixin: """Shared code between concrete implementations of the AbstractAccountDownload""" - spark: SparkSession + spark: SparkSession | 
DuckDBSparkSession filters: AccountDownloadFilters - dynamic_filters: Column + dynamic_filters: Column | DuckDBSparkColumn group_by_cols: list[str] - agg_cols: list[Column] - select_cols: list[Column] + agg_cols: list[Column | DuckDBSparkColumn] + select_cols: list[Column | DuckDBSparkColumn] + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if isinstance(self.spark, DuckDBSparkSession): + from duckdb.experimental.spark.sql import functions + else: + from pyspark.sql import functions + + self.sf = functions @property - def download_table(self) -> DataFrame: - # TODO: This should be reverted back after Spark downloads are migrated to EMR - # return self.spark.table("rpt.account_balances_download") - return self.spark.read.format("delta").load( - f"s3a://{CONFIG.SPARK_S3_BUCKET}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/account_balances_download" - ) + def download_table(self) -> DataFrame | DuckDBSparkDataFrame: + return self.spark.table("rpt.account_balances_download") - def _build_dataframes(self) -> list[DataFrame]: + def _build_dataframes(self) -> list[DataFrame | DuckDBSparkDataFrame]: return [ self.download_table.filter( - sf.col("submission_id").isin( + self.sf.col("submission_id").isin( get_submission_ids_for_periods( self.filters.reporting_fiscal_year, self.filters.reporting_fiscal_quarter, @@ -54,7 +64,6 @@ def _build_dataframes(self) -> list[DataFrame]: class FederalAccountDownload(AccountBalancesMixin, AbstractAccountDownload): - @property def account_level(self) -> AccountLevel: return AccountLevel.FEDERAL_ACCOUNT @@ -65,84 +74,115 @@ def submission_type(self) -> SubmissionType: @property def group_by_cols(self) -> list[str]: - return ["federal_account_symbol", "owning_agency_name", "federal_account_name", "submission_period"] + return [ + "federal_account_symbol", + "owning_agency_name", + "federal_account_name", + "submission_period", + ] @property - def agg_cols(self) -> list[Column]: + def agg_cols(self) -> list[Column | DuckDBSparkColumn]: return [ collect_concat("reporting_agency_name", spark=self.spark), collect_concat("agency_identifier_name", spark=self.spark), collect_concat("budget_function", spark=self.spark), collect_concat("budget_subfunction", spark=self.spark), - sf.sum(sf.col("budget_authority_unobligated_balance_brought_forward")).alias( - "budget_authority_unobligated_balance_brought_forward" + self.sf.sum( + self.sf.col("budget_authority_unobligated_balance_brought_forward") + ).alias("budget_authority_unobligated_balance_brought_forward"), + self.sf.sum( + self.sf.col("adjustments_to_unobligated_balance_brought_forward_cpe") + ).alias("adjustments_to_unobligated_balance_brought_forward_cpe"), + self.sf.sum(self.sf.col("budget_authority_appropriated_amount")).alias( + "budget_authority_appropriated_amount" + ), + self.sf.sum(self.sf.col("borrowing_authority_amount")).alias( + "borrowing_authority_amount" ), - sf.sum(sf.col("adjustments_to_unobligated_balance_brought_forward_cpe")).alias( - "adjustments_to_unobligated_balance_brought_forward_cpe" + self.sf.sum(self.sf.col("contract_authority_amount")).alias( + "contract_authority_amount" ), - sf.sum(sf.col("budget_authority_appropriated_amount")).alias("budget_authority_appropriated_amount"), - sf.sum(sf.col("borrowing_authority_amount")).alias("borrowing_authority_amount"), - sf.sum(sf.col("contract_authority_amount")).alias("contract_authority_amount"), - sf.sum(sf.col("spending_authority_from_offsetting_collections_amount")).alias( - 
"spending_authority_from_offsetting_collections_amount" + self.sf.sum( + self.sf.col("spending_authority_from_offsetting_collections_amount") + ).alias("spending_authority_from_offsetting_collections_amount"), + self.sf.sum(self.sf.col("total_other_budgetary_resources_amount")).alias( + "total_other_budgetary_resources_amount" ), - sf.sum(sf.col("total_other_budgetary_resources_amount")).alias("total_other_budgetary_resources_amount"), - sf.sum(sf.col("total_budgetary_resources")).alias("total_budgetary_resources"), - sf.sum(sf.col("obligations_incurred")).alias("obligations_incurred"), - sf.sum(sf.col("deobligations_or_recoveries_or_refunds_from_prior_year")).alias( - "deobligations_or_recoveries_or_refunds_from_prior_year" + self.sf.sum(self.sf.col("total_budgetary_resources")).alias( + "total_budgetary_resources" ), - sf.sum(sf.col("unobligated_balance")).alias("unobligated_balance"), - sf.sum( - sf.when( + self.sf.sum(self.sf.col("obligations_incurred")).alias( + "obligations_incurred" + ), + self.sf.sum( + self.sf.col("deobligations_or_recoveries_or_refunds_from_prior_year") + ).alias("deobligations_or_recoveries_or_refunds_from_prior_year"), + self.sf.sum(self.sf.col("unobligated_balance")).alias( + "unobligated_balance" + ), + self.sf.sum( + self.sf.when( ( ( - sf.col("quarter_format_flag") - & (sf.col("reporting_fiscal_quarter") == self.filters.reporting_fiscal_quarter) + self.sf.col("quarter_format_flag") + & ( + self.sf.col("reporting_fiscal_quarter") + == self.filters.reporting_fiscal_quarter + ) ) | ( - ~sf.col("quarter_format_flag") - & (sf.col("reporting_fiscal_period") == self.filters.reporting_fiscal_period) + ~self.sf.col("quarter_format_flag") + & ( + self.sf.col("reporting_fiscal_period") + == self.filters.reporting_fiscal_period + ) ) ) - & (sf.col("reporting_fiscal_year") == self.filters.reporting_fiscal_year), - sf.col("gross_outlay_amount"), + & ( + self.sf.col("reporting_fiscal_year") + == self.filters.reporting_fiscal_year + ), + self.sf.col("gross_outlay_amount"), ).otherwise(0) ).alias("gross_outlay_amount"), - sf.sum(sf.col("status_of_budgetary_resources_total")).alias("status_of_budgetary_resources_total"), - sf.max(sf.date_format("last_modified_date", "yyyy-MM-dd")).alias("last_modified_date"), + self.sf.sum(self.sf.col("status_of_budgetary_resources_total")).alias( + "status_of_budgetary_resources_total" + ), + self.sf.max(self.sf.col("last_modified_date")).alias( + "max_last_modified_date" + ), ] @property def select_cols(self) -> list[Column]: return [ - sf.col("owning_agency_name"), - sf.col("reporting_agency_name"), - sf.col("submission_period"), - sf.col("federal_account_symbol"), - sf.col("federal_account_name"), - sf.col("agency_identifier_name"), - sf.col("budget_function"), - sf.col("budget_subfunction"), - sf.col("budget_authority_unobligated_balance_brought_forward"), - sf.col("adjustments_to_unobligated_balance_brought_forward_cpe"), - sf.col("budget_authority_appropriated_amount"), - sf.col("borrowing_authority_amount"), - sf.col("contract_authority_amount"), - sf.col("spending_authority_from_offsetting_collections_amount"), - sf.col("total_other_budgetary_resources_amount"), - sf.col("total_budgetary_resources"), - sf.col("obligations_incurred"), - sf.col("deobligations_or_recoveries_or_refunds_from_prior_year"), - sf.col("unobligated_balance"), - sf.col("gross_outlay_amount"), - sf.col("status_of_budgetary_resources_total"), - sf.col("last_modified_date"), + self.sf.col("owning_agency_name"), + self.sf.col("reporting_agency_name"), + 
self.sf.col("submission_period"), + self.sf.col("federal_account_symbol"), + self.sf.col("federal_account_name"), + self.sf.col("agency_identifier_name"), + self.sf.col("budget_function"), + self.sf.col("budget_subfunction"), + self.sf.col("budget_authority_unobligated_balance_brought_forward"), + self.sf.col("adjustments_to_unobligated_balance_brought_forward_cpe"), + self.sf.col("budget_authority_appropriated_amount"), + self.sf.col("borrowing_authority_amount"), + self.sf.col("contract_authority_amount"), + self.sf.col("spending_authority_from_offsetting_collections_amount"), + self.sf.col("total_other_budgetary_resources_amount"), + self.sf.col("total_budgetary_resources"), + self.sf.col("obligations_incurred"), + self.sf.col("deobligations_or_recoveries_or_refunds_from_prior_year"), + self.sf.col("unobligated_balance"), + self.sf.col("gross_outlay_amount"), + self.sf.col("status_of_budgetary_resources_total"), + self.sf.col("max_last_modified_date").alias("last_modified_date"), ] class TreasuryAccountDownload(AccountBalancesMixin, AbstractAccountDownload): - @property def account_level(self) -> AccountLevel: return AccountLevel.TREASURY_ACCOUNT @@ -152,98 +192,100 @@ def submission_type(self) -> SubmissionType: return SubmissionType.ACCOUNT_BALANCES @property - def group_by_cols(self) -> list[Column]: + def group_by_cols(self) -> list[Column | DuckDBSparkColumn]: return [ - sf.col("data_source"), - sf.col("appropriation_account_balances_id"), - sf.col("budget_authority_unobligated_balance_brought_forward"), - sf.col("adjustments_to_unobligated_balance_brought_forward_cpe"), - sf.col("budget_authority_appropriated_amount"), - sf.col("borrowing_authority_amount"), - sf.col("contract_authority_amount"), - sf.col("spending_authority_from_offsetting_collections_amount"), - sf.col("total_other_budgetary_resources_amount"), - sf.col("total_budgetary_resources"), - sf.col("gross_outlay_amount"), - sf.col("deobligations_or_recoveries_or_refunds_from_prior_year"), - sf.col("unobligated_balance"), - sf.col("status_of_budgetary_resources_total"), - sf.col("obligations_incurred"), - sf.col("drv_appropriation_availability_period_start_date"), - sf.col("drv_appropriation_availability_period_end_date"), - sf.col("drv_appropriation_account_expired_status"), - sf.col("drv_obligations_unpaid_amount"), - sf.col("drv_other_obligated_amount"), - sf.col("reporting_period_start"), - sf.col("reporting_period_end"), - sf.col("appropriation_account_last_modified"), - sf.col("certified_date"), - sf.col("create_date"), - sf.col("update_date"), - sf.col("final_of_fy"), - sf.col("submission_id"), - sf.col("treasury_account_identifier"), - sf.col("owning_agency_name"), - sf.col("reporting_agency_name"), - sf.col("allocation_transfer_agency_identifier_code"), - sf.col("agency_identifier_code"), - sf.col("beginning_period_of_availability"), - sf.col("ending_period_of_availability"), - sf.col("availability_type_code"), - sf.col("main_account_code"), - sf.col("sub_account_code"), - sf.col("treasury_account_symbol"), - sf.col("treasury_account_name"), - sf.col("budget_function"), - sf.col("budget_subfunction"), - sf.col("federal_account_symbol"), - sf.col("federal_account_name"), - sf.col("agency_identifier_name"), - sf.col("allocation_transfer_agency_identifier_name"), - sf.col("submission_period"), + self.sf.col("data_source"), + self.sf.col("appropriation_account_balances_id"), + self.sf.col("budget_authority_unobligated_balance_brought_forward"), + 
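Editor's sketch: the self.sf indirection used throughout these column lists comes from the dispatch shown earlier in AccountBalancesMixin.__init__, which picks the functions module matching the active session type. Restated standalone:

# Standalone restatement of the dispatch in AccountBalancesMixin.__init__.
from duckdb.experimental.spark.sql import SparkSession as DuckDBSparkSession


def spark_functions_for(spark):
    """Return the functions module matching a DuckDB or PySpark session."""
    if isinstance(spark, DuckDBSparkSession):
        from duckdb.experimental.spark.sql import functions
    else:
        from pyspark.sql import functions
    return functions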
self.sf.col("adjustments_to_unobligated_balance_brought_forward_cpe"), + self.sf.col("budget_authority_appropriated_amount"), + self.sf.col("borrowing_authority_amount"), + self.sf.col("contract_authority_amount"), + self.sf.col("spending_authority_from_offsetting_collections_amount"), + self.sf.col("total_other_budgetary_resources_amount"), + self.sf.col("total_budgetary_resources"), + self.sf.col("gross_outlay_amount"), + self.sf.col("deobligations_or_recoveries_or_refunds_from_prior_year"), + self.sf.col("unobligated_balance"), + self.sf.col("status_of_budgetary_resources_total"), + self.sf.col("obligations_incurred"), + self.sf.col("drv_appropriation_availability_period_start_date"), + self.sf.col("drv_appropriation_availability_period_end_date"), + self.sf.col("drv_appropriation_account_expired_status"), + self.sf.col("drv_obligations_unpaid_amount"), + self.sf.col("drv_other_obligated_amount"), + self.sf.col("reporting_period_start"), + self.sf.col("reporting_period_end"), + self.sf.col("appropriation_account_last_modified"), + self.sf.col("certified_date"), + self.sf.col("create_date"), + self.sf.col("update_date"), + self.sf.col("final_of_fy"), + self.sf.col("submission_id"), + self.sf.col("treasury_account_identifier"), + self.sf.col("owning_agency_name"), + self.sf.col("reporting_agency_name"), + self.sf.col("allocation_transfer_agency_identifier_code"), + self.sf.col("agency_identifier_code"), + self.sf.col("beginning_period_of_availability"), + self.sf.col("ending_period_of_availability"), + self.sf.col("availability_type_code"), + self.sf.col("main_account_code"), + self.sf.col("sub_account_code"), + self.sf.col("treasury_account_symbol"), + self.sf.col("treasury_account_name"), + self.sf.col("budget_function"), + self.sf.col("budget_subfunction"), + self.sf.col("federal_account_symbol"), + self.sf.col("federal_account_name"), + self.sf.col("agency_identifier_name"), + self.sf.col("allocation_transfer_agency_identifier_name"), + self.sf.col("submission_period"), ] @property - def agg_cols(self) -> list[Column]: + def agg_cols(self) -> list[Column | DuckDBSparkColumn]: return [ - sf.max(sf.date_format("last_modified_date", "yyyy-MM-dd")).alias("max_last_modified_date"), + self.sf.max(self.sf.col("last_modified_date")).alias( + "max_last_modified_date" + ), ] @property - def select_cols(self) -> list[Column]: + def select_cols(self) -> list[Column | DuckDBSparkColumn]: return [ - sf.col("owning_agency_name"), - sf.col("reporting_agency_name"), - sf.col("submission_period"), - sf.col("allocation_transfer_agency_identifier_code"), - sf.col("agency_identifier_code"), - sf.col("beginning_period_of_availability"), - sf.col("ending_period_of_availability"), - sf.col("availability_type_code"), - sf.col("main_account_code"), - sf.col("sub_account_code"), - sf.col("treasury_account_symbol"), - sf.col("treasury_account_name"), - sf.col("agency_identifier_name"), - sf.col("allocation_transfer_agency_identifier_name"), - sf.col("budget_function"), - sf.col("budget_subfunction"), - sf.col("federal_account_symbol"), - sf.col("federal_account_name"), - sf.col("budget_authority_unobligated_balance_brought_forward"), - sf.col("adjustments_to_unobligated_balance_brought_forward_cpe"), - sf.col("budget_authority_appropriated_amount"), - sf.col("borrowing_authority_amount"), - sf.col("contract_authority_amount"), - sf.col("spending_authority_from_offsetting_collections_amount"), - sf.col("total_other_budgetary_resources_amount"), - sf.col("total_budgetary_resources"), - 
sf.col("obligations_incurred"), - sf.col("deobligations_or_recoveries_or_refunds_from_prior_year"), - sf.col("unobligated_balance"), - sf.col("gross_outlay_amount"), - sf.col("status_of_budgetary_resources_total"), - sf.col("max_last_modified_date").alias("last_modified_date"), + self.sf.col("owning_agency_name"), + self.sf.col("reporting_agency_name"), + self.sf.col("submission_period"), + self.sf.col("allocation_transfer_agency_identifier_code"), + self.sf.col("agency_identifier_code"), + self.sf.col("beginning_period_of_availability"), + self.sf.col("ending_period_of_availability"), + self.sf.col("availability_type_code"), + self.sf.col("main_account_code"), + self.sf.col("sub_account_code"), + self.sf.col("treasury_account_symbol"), + self.sf.col("treasury_account_name"), + self.sf.col("agency_identifier_name"), + self.sf.col("allocation_transfer_agency_identifier_name"), + self.sf.col("budget_function"), + self.sf.col("budget_subfunction"), + self.sf.col("federal_account_symbol"), + self.sf.col("federal_account_name"), + self.sf.col("budget_authority_unobligated_balance_brought_forward"), + self.sf.col("adjustments_to_unobligated_balance_brought_forward_cpe"), + self.sf.col("budget_authority_appropriated_amount"), + self.sf.col("borrowing_authority_amount"), + self.sf.col("contract_authority_amount"), + self.sf.col("spending_authority_from_offsetting_collections_amount"), + self.sf.col("total_other_budgetary_resources_amount"), + self.sf.col("total_budgetary_resources"), + self.sf.col("obligations_incurred"), + self.sf.col("deobligations_or_recoveries_or_refunds_from_prior_year"), + self.sf.col("unobligated_balance"), + self.sf.col("gross_outlay_amount"), + self.sf.col("status_of_budgetary_resources_total"), + self.sf.col("max_last_modified_date").alias("last_modified_date"), ] diff --git a/usaspending_api/download/delta_downloads/award_financial.py b/usaspending_api/download/delta_downloads/award_financial.py index b0990b4b3b..1a59ab932d 100644 --- a/usaspending_api/download/delta_downloads/award_financial.py +++ b/usaspending_api/download/delta_downloads/award_financial.py @@ -2,7 +2,6 @@ from pyspark.sql import functions as sf, Column, DataFrame, SparkSession -from usaspending_api.config import CONFIG from usaspending_api.common.spark.utils import collect_concat, filter_submission_and_sum from usaspending_api.download.delta_downloads.abstract_downloads.account_download import ( @@ -31,11 +30,7 @@ class AwardFinancialMixin: @property def download_table(self) -> DataFrame: - # TODO: This should be reverted back after Spark downloads are migrated to EMR - # return self.spark.table("rpt.award_financial_download") - return self.spark.read.format("delta").load( - f"s3a://{CONFIG.SPARK_S3_BUCKET}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/award_financial_download" - ) + return self.spark.table("rpt.award_financial_download") @property def non_zero_filters(self) -> Column: @@ -49,7 +44,7 @@ def non_zero_filters(self) -> Column: @property def award_categories(self) -> dict[str, Column]: return { - "Assistance": (sf.isnotnull(sf.col("is_fpds")) & ~sf.col("is_fpds")), + "Assistance": (~sf.isnull(sf.col("is_fpds")) & ~sf.col("is_fpds")), "Contracts": sf.col("is_fpds"), "Unlinked": sf.isnull(sf.col("is_fpds")), } diff --git a/usaspending_api/download/delta_downloads/object_class_program_activity.py b/usaspending_api/download/delta_downloads/object_class_program_activity.py index 3cec5a6272..82fd05018e 100644 --- a/usaspending_api/download/delta_downloads/object_class_program_activity.py +++ 
b/usaspending_api/download/delta_downloads/object_class_program_activity.py @@ -4,7 +4,6 @@ from pyspark.sql import Column, DataFrame, SparkSession from usaspending_api.common.spark.utils import collect_concat, filter_submission_and_sum -from usaspending_api.config import CONFIG from usaspending_api.download.delta_downloads.abstract_downloads.account_download import ( AbstractAccountDownload, AccountLevel, @@ -13,7 +12,9 @@ from usaspending_api.download.delta_downloads.abstract_factories.account_download_factory import ( AbstractAccountDownloadFactory, ) -from usaspending_api.download.delta_downloads.filters.account_filters import AccountDownloadFilters +from usaspending_api.download.delta_downloads.filters.account_filters import ( + AccountDownloadFilters, +) from usaspending_api.download.v2.download_column_historical_lookups import query_paths from usaspending_api.submissions.helpers import get_submission_ids_for_periods @@ -33,7 +34,7 @@ class ObjectClassProgramActivityMixin: def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - if type(self.spark) is DuckDBSparkSession: + if isinstance(self.spark, DuckDBSparkSession): from duckdb.experimental.spark.sql import functions else: from pyspark.sql import functions @@ -42,13 +43,7 @@ def __init__(self, *args, **kwargs): @property def download_table(self) -> DataFrame | DuckDBSparkDataFrame: - if isinstance(self.spark, DuckDBSparkSession): - return self.spark.table("rpt.object_class_program_activity_download") - else: - # TODO: This should be reverted back after Spark downloads are migrated to EMR - return self.spark.read.format("delta").load( - f"s3a://{CONFIG.SPARK_S3_BUCKET}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/object_class_program_activity_download" - ) + return self.spark.table("rpt.object_class_program_activity_download") def _build_dataframes(self) -> list[DataFrame | DuckDBSparkDataFrame]: return [ @@ -64,7 +59,12 @@ def _build_dataframes(self) -> list[DataFrame | DuckDBSparkDataFrame]: .filter(self.dynamic_filters) .groupby(self.group_by_cols) .agg(*[agg_func(col) for col, agg_func in self.agg_cols.items()]) - .drop(*[self.sf.col(f"object_class_program_activity_download.{col}") for col in self.agg_cols]) + .drop( + *[ + self.sf.col(f"object_class_program_activity_download.{col}") + for col in self.agg_cols + ] + ) .select(*self.select_cols) # Sorting by a value that is repeated often will help improve compression during the zipping step .sort(self.sort_by_cols), @@ -104,51 +104,109 @@ def agg_cols(self) -> dict[str, callable]: "budget_function": lambda col: collect_concat(col, spark=self.spark), "budget_subfunction": lambda col: collect_concat(col, spark=self.spark), "obligations_incurred": lambda col: self.sf.sum(col).alias(col), - "obligations_undelivered_orders_unpaid_total": lambda col: self.sf.sum(col).alias(col), - "obligations_undelivered_orders_unpaid_total_FYB": lambda col: self.sf.sum(col).alias(col), - "USSGL480100_undelivered_orders_obligations_unpaid": lambda col: self.sf.sum(col).alias(col), - "USSGL480100_undelivered_orders_obligations_unpaid_FYB": lambda col: self.sf.sum(col).alias(col), - "USSGL488100_upward_adj_prior_year_undeliv_orders_oblig_unpaid": lambda col: self.sf.sum(col).alias(col), - "obligations_delivered_orders_unpaid_total": lambda col: self.sf.sum(col).alias(col), - "obligations_delivered_orders_unpaid_total_FYB": lambda col: self.sf.sum(col).alias(col), - "USSGL490100_delivered_orders_obligations_unpaid": lambda col: self.sf.sum(col).alias(col), - 
"USSGL490100_delivered_orders_obligations_unpaid_FYB": lambda col: self.sf.sum(col).alias(col), - "USSGL498100_upward_adj_of_prior_year_deliv_orders_oblig_unpaid": lambda col: self.sf.sum(col).alias(col), + "obligations_undelivered_orders_unpaid_total": lambda col: self.sf.sum( + col + ).alias(col), + "obligations_undelivered_orders_unpaid_total_FYB": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL480100_undelivered_orders_obligations_unpaid": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL480100_undelivered_orders_obligations_unpaid_FYB": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL488100_upward_adj_prior_year_undeliv_orders_oblig_unpaid": lambda col: self.sf.sum( + col + ).alias(col), + "obligations_delivered_orders_unpaid_total": lambda col: self.sf.sum( + col + ).alias(col), + "obligations_delivered_orders_unpaid_total_FYB": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL490100_delivered_orders_obligations_unpaid": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL490100_delivered_orders_obligations_unpaid_FYB": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL498100_upward_adj_of_prior_year_deliv_orders_oblig_unpaid": lambda col: self.sf.sum( + col + ).alias(col), "gross_outlay_amount_FYB_to_period_end": lambda col: filter_submission_and_sum( col, self.filters, spark=self.spark ), "gross_outlay_amount_FYB": lambda col: self.sf.sum(col).alias(col), - "gross_outlays_undelivered_orders_prepaid_total": lambda col: self.sf.sum(col).alias(col), - "gross_outlays_undelivered_orders_prepaid_total_FYB": lambda col: self.sf.sum(col).alias(col), - "USSGL480200_undelivered_orders_obligations_prepaid_advanced": lambda col: self.sf.sum(col).alias(col), - "USSGL480200_undelivered_orders_obligations_prepaid_advanced_FYB": lambda col: self.sf.sum(col).alias(col), - "USSGL488200_upward_adj_prior_year_undeliv_orders_oblig_prepaid": lambda col: self.sf.sum(col).alias(col), - "gross_outlays_delivered_orders_paid_total": lambda col: self.sf.sum(col).alias(col), - "gross_outlays_delivered_orders_paid_total_FYB": lambda col: self.sf.sum(col).alias(col), - "USSGL490200_delivered_orders_obligations_paid": lambda col: self.sf.sum(col).alias(col), - "USSGL490800_authority_outlayed_not_yet_disbursed": lambda col: self.sf.sum(col).alias(col), - "USSGL490800_authority_outlayed_not_yet_disbursed_FYB": lambda col: self.sf.sum(col).alias(col), - "USSGL498200_upward_adj_of_prior_year_deliv_orders_oblig_paid": lambda col: self.sf.sum(col).alias(col), - "deobligations_or_recoveries_or_refunds_from_prior_year": lambda col: self.sf.sum(col).alias(col), - "USSGL487100_downward_adj_prior_year_unpaid_undeliv_orders_oblig": lambda col: self.sf.sum(col).alias(col), - "USSGL497100_downward_adj_prior_year_unpaid_deliv_orders_oblig": lambda col: self.sf.sum(col).alias(col), + "gross_outlays_undelivered_orders_prepaid_total": lambda col: self.sf.sum( + col + ).alias(col), + "gross_outlays_undelivered_orders_prepaid_total_FYB": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL480200_undelivered_orders_obligations_prepaid_advanced": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL480200_undelivered_orders_obligations_prepaid_advanced_FYB": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL488200_upward_adj_prior_year_undeliv_orders_oblig_prepaid": lambda col: self.sf.sum( + col + ).alias(col), + "gross_outlays_delivered_orders_paid_total": lambda col: self.sf.sum( + col + ).alias(col), + "gross_outlays_delivered_orders_paid_total_FYB": lambda col: self.sf.sum( + col + 
).alias(col), + "USSGL490200_delivered_orders_obligations_paid": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL490800_authority_outlayed_not_yet_disbursed": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL490800_authority_outlayed_not_yet_disbursed_FYB": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL498200_upward_adj_of_prior_year_deliv_orders_oblig_paid": lambda col: self.sf.sum( + col + ).alias(col), + "deobligations_or_recoveries_or_refunds_from_prior_year": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL487100_downward_adj_prior_year_unpaid_undeliv_orders_oblig": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL497100_downward_adj_prior_year_unpaid_deliv_orders_oblig": lambda col: self.sf.sum( + col + ).alias(col), "USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig": lambda col: filter_submission_and_sum( col, self.filters, spark=self.spark ), "USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig": lambda col: filter_submission_and_sum( col, self.filters, spark=self.spark ), - "USSGL483100_undelivered_orders_obligations_transferred_unpaid": lambda col: self.sf.sum(col).alias(col), - "USSGL493100_delivered_orders_obligations_transferred_unpaid": lambda col: self.sf.sum(col).alias(col), - "USSGL483200_undeliv_orders_oblig_transferred_prepaid_advanced": lambda col: self.sf.sum(col).alias(col), - "last_modified_date": lambda col: self.sf.max(col).alias("max_last_modified_date"), + "USSGL483100_undelivered_orders_obligations_transferred_unpaid": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL493100_delivered_orders_obligations_transferred_unpaid": lambda col: self.sf.sum( + col + ).alias(col), + "USSGL483200_undeliv_orders_oblig_transferred_prepaid_advanced": lambda col: self.sf.sum( + col + ).alias(col), + "last_modified_date": lambda col: self.sf.max(col).alias( + "max_last_modified_date" + ), } @property def select_cols(self) -> list[Column | DuckDBSparkColumn]: return [ self.sf.col(col) - for col in query_paths["object_class_program_activity"]["federal_account"].keys() + for col in query_paths["object_class_program_activity"][ + "federal_account" + ].keys() if not col.startswith("last_modified_date") ] + [self.sf.col("max_last_modified_date").alias("last_modified_date")] @@ -267,7 +325,9 @@ def sort_by_cols(self) -> list[str]: def select_cols(self) -> list[Column | DuckDBSparkColumn]: return [ self.sf.col(col) - for col in query_paths["object_class_program_activity"]["treasury_account"].keys() + for col in query_paths["object_class_program_activity"][ + "treasury_account" + ].keys() if not col.startswith("last_modified_date") ] + [self.sf.col("max_last_modified_date").alias("last_modified_date")] diff --git a/usaspending_api/download/management/commands/download_sqs_worker.py b/usaspending_api/download/management/commands/download_sqs_worker.py index 879cea7688..a8261897c3 100644 --- a/usaspending_api/download/management/commands/download_sqs_worker.py +++ b/usaspending_api/download/management/commands/download_sqs_worker.py @@ -5,6 +5,8 @@ import traceback from typing import Callable +import boto3 + # Third-party library imports from opentelemetry.trace import SpanKind, Status, StatusCode @@ -14,7 +16,7 @@ # Application imports from usaspending_api.common.logging import configure_logging -from usaspending_api.common.spark.jobs import SparkJobs, LocalStrategy, DatabricksStrategy +from usaspending_api.common.spark.jobs import SparkJobs, LocalStrategy, EmrServerlessStrategy from usaspending_api.common.sqs.sqs_handler 
import DownloadLogic, get_sqs_queue from usaspending_api.common.sqs.sqs_job_logging import log_job_message from usaspending_api.common.sqs.sqs_work_dispatcher import ( @@ -165,9 +167,23 @@ def _run_spark_download(download_job_id: int, job_name: str) -> None: command_options = [f"--skip-local-cleanup"] extra_options = {"run_as_container": True} else: - strategy = DatabricksStrategy() + strategy = EmrServerlessStrategy() command_options = [] - extra_options = {} + + ssm_client = boto3.client("ssm", settings.USASPENDING_AWS_REGION) + param_resp = ssm_client.get_parameters( + Names=[settings.EMR_DOWNLOAD_APP_PARAM_NAME, settings.EMR_DOWNLOAD_ROLE_PARAM_NAME], WithDecryption=True + ) + if param_resp.get("InvalidParameters"): + logger.error(f"Invalid parameters: {param_resp['InvalidParameters']}") + raise ValueError("Invalid parameters") + param_values = {param["Name"]: param["Value"] for param in param_resp["Parameters"]} + + extra_options = { + "application_id": param_values[settings.EMR_DOWNLOAD_APP_PARAM_NAME], + "execution_role_arn": param_values[settings.EMR_DOWNLOAD_ROLE_PARAM_NAME], + } + spark_jobs = SparkJobs(strategy) spark_jobs.start( job_name=job_name, diff --git a/usaspending_api/download/tests/integration/test_account_download_factories.py b/usaspending_api/download/tests/integration/test_account_download_factories.py index b554a3b4ca..091a2b445a 100644 --- a/usaspending_api/download/tests/integration/test_account_download_factories.py +++ b/usaspending_api/download/tests/integration/test_account_download_factories.py @@ -6,7 +6,6 @@ import pytest from django.core.management import call_command from model_bakery import baker -from usaspending_api.config import CONFIG from usaspending_api.common.etl.spark import create_ref_temp_views from usaspending_api.download.delta_downloads.account_balances import AccountBalancesDownloadFactory @@ -23,18 +22,12 @@ @pytest.fixture(scope="function") -def award_financial_table(spark, s3_unittest_data_bucket, hive_unittest_metastore_db, monkeypatch): +def award_financial_table(spark, s3_unittest_data_bucket, hive_unittest_metastore_db): call_command( "create_delta_table", "--destination-table=award_financial_download", f"--spark-s3-bucket={s3_unittest_data_bucket}", ) - monkeypatch.setattr( - f"usaspending_api.download.delta_downloads.award_financial.AwardFinancialMixin.download_table", - spark.read.format("delta").load( - f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/award_financial_download" - ), - ) column_placeholders = {field.name: [None] * 5 for field in award_financial_schema} test_data_df = pd.DataFrame( data={ @@ -75,18 +68,12 @@ def award_financial_table(spark, s3_unittest_data_bucket, hive_unittest_metastor @pytest.fixture(scope="function") -def award_financial_table_award_category(spark, s3_unittest_data_bucket, hive_unittest_metastore_db, monkeypatch): +def award_financial_table_award_category(spark, s3_unittest_data_bucket, hive_unittest_metastore_db): call_command( "create_delta_table", "--destination-table=award_financial_download", f"--spark-s3-bucket={s3_unittest_data_bucket}", ) - monkeypatch.setattr( - f"usaspending_api.download.delta_downloads.award_financial.AwardFinancialMixin.download_table", - spark.read.format("delta").load( - f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/award_financial_download" - ), - ) column_placeholders = {field.name: [None] * 5 for field in award_financial_schema} test_data_df = pd.DataFrame( data={ @@ -127,18 +114,12 @@ def 
award_financial_table_award_category(spark, s3_unittest_data_bucket, hive_un @pytest.fixture(scope="function") -def account_balances_download_table(spark, s3_unittest_data_bucket, hive_unittest_metastore_db, monkeypatch): +def account_balances_download_table(spark, s3_unittest_data_bucket, hive_unittest_metastore_db): call_command( "create_delta_table", "--destination-table=account_balances_download", f"--spark-s3-bucket={s3_unittest_data_bucket}", ) - monkeypatch.setattr( - f"usaspending_api.download.delta_downloads.account_balances.AccountBalancesMixin.download_table", - spark.read.format("delta").load( - f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/account_balances_download" - ), - ) column_placeholders = {field.name: [None] * 5 for field in account_balances_schema} test_data_df = pd.DataFrame( data={ @@ -175,20 +156,12 @@ def account_balances_download_table(spark, s3_unittest_data_bucket, hive_unittes @pytest.fixture(scope="function") -def object_class_by_program_activity_download_table( - spark, s3_unittest_data_bucket, hive_unittest_metastore_db, monkeypatch -): +def object_class_by_program_activity_download_table(spark, s3_unittest_data_bucket, hive_unittest_metastore_db): call_command( "create_delta_table", "--destination-table=object_class_program_activity_download", f"--spark-s3-bucket={s3_unittest_data_bucket}", ) - monkeypatch.setattr( - f"usaspending_api.download.delta_downloads.object_class_program_activity.ObjectClassProgramActivityMixin.download_table", - spark.read.format("delta").load( - f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/object_class_program_activity_download" - ), - ) column_placeholders = {field.name: [None] * 5 for field in object_class_program_activity_schema} test_data_df = pd.DataFrame( data={ diff --git a/usaspending_api/download/tests/integration/test_download_accounts.py b/usaspending_api/download/tests/integration/test_download_accounts.py index 62123cb966..08560f6d95 100644 --- a/usaspending_api/download/tests/integration/test_download_accounts.py +++ b/usaspending_api/download/tests/integration/test_download_accounts.py @@ -10,7 +10,6 @@ from django.core.management import call_command from model_bakery import baker from rest_framework import status -from usaspending_api.config import CONFIG from usaspending_api.accounts.models import FederalAccount, TreasuryAppropriationAccount from usaspending_api.awards.models import FinancialAccountsByAwards @@ -23,42 +22,24 @@ @pytest.fixture -def create_download_delta_tables(spark, s3_unittest_data_bucket, hive_unittest_metastore_db, monkeypatch): +def create_download_delta_tables(spark, s3_unittest_data_bucket, hive_unittest_metastore_db): call_command( "create_delta_table", f"--spark-s3-bucket={s3_unittest_data_bucket}", f"--destination-table=award_financial_download", ) - monkeypatch.setattr( - f"usaspending_api.download.delta_downloads.award_financial.AwardFinancialMixin.download_table", - spark.read.format("delta").load( - f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/award_financial_download" - ), - ) call_command( "create_delta_table", f"--spark-s3-bucket={s3_unittest_data_bucket}", f"--destination-table=object_class_program_activity_download", ) - monkeypatch.setattr( - f"usaspending_api.download.delta_downloads.object_class_program_activity.ObjectClassProgramActivityMixin.download_table", - spark.read.format("delta").load( - f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/object_class_program_activity_download" - ), - ) 
call_command( "create_delta_table", f"--spark-s3-bucket={s3_unittest_data_bucket}", f"--destination-table=account_balances_download", ) - monkeypatch.setattr( - f"usaspending_api.download.delta_downloads.account_balances.AccountBalancesMixin.download_table", - spark.read.format("delta").load( - f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/account_balances_download" - ), - ) yield diff --git a/usaspending_api/download/tests/integration/test_populate_monthly_delta_files.py b/usaspending_api/download/tests/integration/test_populate_monthly_delta_files.py index c23c057c59..00669b306c 100644 --- a/usaspending_api/download/tests/integration/test_populate_monthly_delta_files.py +++ b/usaspending_api/download/tests/integration/test_populate_monthly_delta_files.py @@ -1,18 +1,17 @@ -import zipfile import datetime -import pytest import os +import zipfile +from csv import reader +from os import listdir +import pytest from django.core.management import call_command -from os import listdir from model_bakery import baker -from csv import reader -from usaspending_api.settings import HOST from usaspending_api.awards.models import TransactionDelta from usaspending_api.common.helpers.sql_helpers import get_database_dsn_string from usaspending_api.download.v2.download_column_historical_lookups import query_paths - +from usaspending_api.settings import HOST # Make sure UTC or test will fail later in the day TODAY = datetime.datetime.strftime(datetime.datetime.utcnow(), "%Y%m%d") @@ -22,11 +21,19 @@ @pytest.mark.django_db(transaction=True) def monthly_download_delta_data(db, monkeypatch): baker.make( - "references.ToptierAgency", toptier_agency_id=1, toptier_code="001", name="Test_Agency", _fill_optional=True + "references.ToptierAgency", + toptier_agency_id=1, + toptier_code="001", + name="Test_Agency", + _fill_optional=True, ) baker.make("references.Agency", pk=1, toptier_agency_id=1, _fill_optional=True) baker.make( - "references.ToptierAgency", toptier_agency_id=2, toptier_code="002", name="Test_Agency 2", _fill_optional=True + "references.ToptierAgency", + toptier_agency_id=2, + toptier_code="002", + name="Test_Agency 2", + _fill_optional=True, ) baker.make("references.Agency", pk=2, toptier_agency_id=2, _fill_optional=True) i = 1 @@ -92,10 +99,16 @@ def monthly_download_delta_data(db, monkeypatch): @pytest.mark.django_db(transaction=True) def test_all_agencies(monthly_download_delta_data, monkeypatch): - call_command("populate_monthly_delta_files", "--debugging_skip_deleted", "--last_date=2020-12-31") + call_command( + "populate_monthly_delta_files", + "--debugging_skip_deleted", + "--last_date=2020-12-31", + ) file_list = listdir("csv_downloads") assert f"FY(All)_All_Contracts_Delta_{TODAY}.zip" in file_list - os.remove(os.path.normpath(f"csv_downloads/FY(All)_All_Contracts_Delta_{TODAY}.zip")) + os.remove( + os.path.normpath(f"csv_downloads/FY(All)_All_Contracts_Delta_{TODAY}.zip") + ) @pytest.mark.django_db(transaction=True) @@ -397,23 +410,37 @@ def test_specific_agency(monthly_download_delta_data, monkeypatch): "", "", "", - f"{HOST}/award/CONT_AWD_1_0_0/" if "localhost" in HOST else f"https://{HOST}/award/CONT_AWD_1_0_0/", + f"{HOST}/award/CONT_AWD_1_0_0/" + if "localhost" in HOST + else f"https://{HOST}/award/CONT_AWD_1_0_0/", "", - "2020-05-07", + "2020-05-07 00:00:00+00", ] - call_command("populate_monthly_delta_files", "--agencies=1", "--debugging_skip_deleted", "--last_date=2020-12-31") + call_command( + "populate_monthly_delta_files", + "--agencies=1", + 
"--debugging_skip_deleted", + "--last_date=2020-12-31", + ) file_list = listdir("csv_downloads") assert f"FY(All)_001_Contracts_Delta_{TODAY}.zip" in file_list - with zipfile.ZipFile(os.path.normpath(f"csv_downloads/FY(All)_001_Contracts_Delta_{TODAY}.zip"), "r") as zip_ref: + with zipfile.ZipFile( + os.path.normpath(f"csv_downloads/FY(All)_001_Contracts_Delta_{TODAY}.zip"), "r" + ) as zip_ref: zip_ref.extractall("csv_downloads") assert f"FY(All)_001_Contracts_Delta_{TODAY}_1.csv" in listdir("csv_downloads") - with open(os.path.normpath(f"csv_downloads/FY(All)_001_Contracts_Delta_{TODAY}_1.csv"), "r") as contract_file: + with open( + os.path.normpath(f"csv_downloads/FY(All)_001_Contracts_Delta_{TODAY}_1.csv"), + "r", + ) as contract_file: csv_reader = reader(contract_file) row_count = 0 for row in csv_reader: if row_count == 0: # 63 is the character limit for column names - expected_row = [s[:63] for s in query_paths["transaction_search"]["d1"].keys()] + expected_row = [ + s[:63] for s in query_paths["transaction_search"]["d1"].keys() + ] # These cols are prepended during file processing expected_row = ["correction_delete_ind", "agency_id"] + expected_row assert row == expected_row @@ -421,8 +448,12 @@ def test_specific_agency(monthly_download_delta_data, monkeypatch): assert row == contract_data row_count += 1 assert row_count == 2 - os.remove(os.path.normpath(f"csv_downloads/FY(All)_001_Contracts_Delta_{TODAY}.zip")) - os.remove(os.path.normpath(f"csv_downloads/FY(All)_001_Contracts_Delta_{TODAY}_1.csv")) + os.remove( + os.path.normpath(f"csv_downloads/FY(All)_001_Contracts_Delta_{TODAY}.zip") + ) + os.remove( + os.path.normpath(f"csv_downloads/FY(All)_001_Contracts_Delta_{TODAY}_1.csv") + ) @pytest.mark.django_db(transaction=True) @@ -474,7 +505,9 @@ def test_award_types(client, monthly_download_delta_data, monkeypatch): awarding_toptier_agency_name="Test_Agency", awarding_subtier_agency_name="Test_Agency", ) - baker.make("awards.TransactionDelta", transaction_id=2, created_at=datetime.datetime.now()) + baker.make( + "awards.TransactionDelta", transaction_id=2, created_at=datetime.datetime.now() + ) call_command( "populate_monthly_delta_files", "--agencies=1", @@ -484,4 +517,6 @@ def test_award_types(client, monthly_download_delta_data, monkeypatch): ) file_list = listdir("csv_downloads") assert f"FY(All)_001_Assistance_Delta_{TODAY}.zip" in file_list - os.remove(os.path.normpath(f"csv_downloads/FY(All)_001_Assistance_Delta_{TODAY}.zip")) + os.remove( + os.path.normpath(f"csv_downloads/FY(All)_001_Assistance_Delta_{TODAY}.zip") + ) diff --git a/usaspending_api/etl/management/commands/archive_table_in_delta.py b/usaspending_api/etl/management/commands/archive_table_in_delta.py index b26c5d242a..439d6cb864 100644 --- a/usaspending_api/etl/management/commands/archive_table_in_delta.py +++ b/usaspending_api/etl/management/commands/archive_table_in_delta.py @@ -1,10 +1,9 @@ import logging -import psycopg2 - from datetime import datetime, timedelta -from django.core.management.base import BaseCommand -from usaspending_api.common.helpers.sql_helpers import get_database_dsn_string +import psycopg2 +from django.core.management.base import BaseCommand, CommandParser + from usaspending_api.common.etl.spark import load_delta_table from usaspending_api.common.helpers.spark_helpers import ( configure_spark_session, @@ -12,19 +11,25 @@ get_jdbc_connection_properties, get_usas_jdbc_url, ) -from usaspending_api.download.delta_models.download_job import download_job_create_sql_string +from 
usaspending_api.common.helpers.sql_helpers import get_database_dsn_string +from usaspending_api.download.delta_models.download_job import ( + download_job_create_sql_string, +) +from usaspending_api.etl.table_specs import ArchiveTableSpec logger = logging.getLogger(__name__) TABLE_SPEC = { - "download_job": { - "destination_database": "arc", - "destination_table": "download_job", - "archive_date_field": "update_date", - "source_table": "download_job", - "source_database": "public", - "delta_table_create_sql": download_job_create_sql_string, - } + "download_job": ArchiveTableSpec( + **{ + "destination_database": "arc", + "destination_table": "download_job", + "archive_date_field": "update_date", + "source_table": "download_job", + "source_database": "public", + "delta_table_create_sql": download_job_create_sql_string, + } + ) } @@ -35,7 +40,8 @@ class Command(BaseCommand): those records from Postgres. """ - def add_arguments(self, parser): + @staticmethod + def add_arguments(parser: CommandParser) -> None: parser.add_argument( "--destination-table", type=str, @@ -54,7 +60,8 @@ def add_arguments(self, parser): "--alt-db", type=str, required=False, - help="An alternate Delta Database (aka schema) in which to archive this table, overriding the TABLE_SPEC's destination_database", + help="An alternate Delta Database (aka schema) in which to archive this table, overriding the TABLE_SPEC's" + " destination_database", ) parser.add_argument( "--alt-name", @@ -63,7 +70,7 @@ def add_arguments(self, parser): help="An alternate Delta Table name under which to archive this table, overriding the destination_table", ) - def handle(self, *args, **options): + def handle(self, *args, **options) -> None: extra_conf = { # Config for Delta Lake tables and SQL. Need these to keep Delta table metadata in the metastore "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension", @@ -86,12 +93,12 @@ def handle(self, *args, **options): archive_period = options["archive_period"] table_spec = TABLE_SPEC[destination_table] - destination_database = options["alt_db"] or table_spec["destination_database"] + destination_database = options["alt_db"] or table_spec.destination_database destination_table_name = options["alt_name"] or destination_table - source_table = table_spec["source_table"] - source_database = table_spec["source_database"] + source_table = table_spec.source_table + source_database = table_spec.source_database qualified_source_table = f"{source_database}.{source_table}" - archive_date_field = table_spec["archive_date_field"] + archive_date_field = table_spec.archive_date_field archive_date = datetime.now() - timedelta(days=archive_period) archive_date_string = archive_date.strftime("%Y-%m-%d") @@ -104,13 +111,18 @@ def handle(self, *args, **options): # Resolve JDBC URL for Source Database jdbc_url = get_usas_jdbc_url() if not jdbc_url: - raise RuntimeError(f"Couldn't find JDBC url, please properly configure your CONFIG.") + raise RuntimeError( + "Couldn't find JDBC url, please properly configure your CONFIG." + ) if not jdbc_url.startswith("jdbc:postgresql://"): - raise ValueError("JDBC URL given is not in postgres JDBC URL format (e.g. jdbc:postgresql://...") + raise ValueError( + "JDBC URL given is not in postgres JDBC URL format (e.g. jdbc:postgresql://...)"
+ ) # Retrieve data from Postgres query_with_predicate = ( - f"(SELECT * FROM {qualified_source_table} WHERE {archive_date_field} < '{archive_date_string}') AS tmp" + f"(SELECT * FROM {qualified_source_table} " + f"WHERE {archive_date_field} < '{archive_date_string}') AS tmp" ) df = spark.read.jdbc( @@ -122,7 +134,9 @@ def handle(self, *args, **options): # Write data to Delta Lake in Append Mode load_delta_table(spark, df, destination_table_name, overwrite=False) archived_count = df.count() - logger.info(f"Archived {archived_count} records from the {qualified_source_table}") + logger.info( + f"Archived {archived_count} records from the {qualified_source_table}" + ) # Delete the archived data from Postgres with psycopg2.connect(dsn=get_database_dsn_string()) as connection: @@ -132,7 +146,9 @@ def handle(self, *args, **options): ) deleted_count = cursor.rowcount - logger.info(f"Deleted {deleted_count} records from the {qualified_source_table} table") + logger.info( + f"Deleted {deleted_count} records from the {qualified_source_table} table" + ) # Shut down spark if spark_created_by_command: diff --git a/usaspending_api/etl/management/commands/create_delta_table.py b/usaspending_api/etl/management/commands/create_delta_table.py index cbdfe84f40..19978107dc 100644 --- a/usaspending_api/etl/management/commands/create_delta_table.py +++ b/usaspending_api/etl/management/commands/create_delta_table.py @@ -1,6 +1,6 @@ import logging -from django.core.management.base import BaseCommand +from django.core.management.base import BaseCommand, CommandParser from pyspark.sql.types import StructType from usaspending_api.awards.delta_models.award_id_lookup import AWARD_ID_LOOKUP_SCHEMA @@ -10,23 +10,36 @@ ) from usaspending_api.common.spark.configs import DEFAULT_EXTRA_CONF from usaspending_api.config import CONFIG -from usaspending_api.etl.management.commands.archive_table_in_delta import TABLE_SPEC as ARCHIVE_TABLE_SPEC -from usaspending_api.etl.management.commands.load_query_to_delta import TABLE_SPEC as LOAD_QUERY_TABLE_SPEC -from usaspending_api.etl.management.commands.load_table_to_delta import TABLE_SPEC as LOAD_TABLE_TABLE_SPEC -from usaspending_api.transactions.delta_models.transaction_id_lookup import TRANSACTION_ID_LOOKUP_SCHEMA +from usaspending_api.etl.management.commands.archive_table_in_delta import ( + TABLE_SPEC as ARCHIVE_TABLE_SPEC, +) +from usaspending_api.etl.management.commands.load_query_to_delta import ( + TABLE_SPEC as LOAD_QUERY_TABLE_SPEC, +) +from usaspending_api.etl.management.commands.load_table_to_delta import ( + TABLE_SPEC as LOAD_TABLE_TABLE_SPEC, +) +from usaspending_api.etl.table_specs import TableSpec +from usaspending_api.transactions.delta_models.transaction_id_lookup import ( + TRANSACTION_ID_LOOKUP_SCHEMA, +) TABLE_SPEC = { **ARCHIVE_TABLE_SPEC, **LOAD_TABLE_TABLE_SPEC, **LOAD_QUERY_TABLE_SPEC, - "award_id_lookup": { - "destination_database": "int", - "delta_table_create_sql": AWARD_ID_LOOKUP_SCHEMA, - }, - "transaction_id_lookup": { - "destination_database": "int", - "delta_table_create_sql": TRANSACTION_ID_LOOKUP_SCHEMA, - }, + "award_id_lookup": TableSpec( + **{ + "destination_database": "int", + "delta_table_create_sql": AWARD_ID_LOOKUP_SCHEMA, + } + ), + "transaction_id_lookup": TableSpec( + **{ + "destination_database": "int", + "delta_table_create_sql": TRANSACTION_ID_LOOKUP_SCHEMA, + } + ), } logger = logging.getLogger(__name__) @@ -37,7 +50,7 @@ class Command(BaseCommand): This command creates an empty Delta Table based on the provided --destination-table argument.
""" - def add_arguments(self, parser): + def add_arguments(self, parser: CommandParser) -> None: parser.add_argument( "--destination-table", type=str, @@ -66,7 +79,7 @@ def add_arguments(self, parser): "name", ) - def handle(self, *args, **options): + def handle(self, *args, **options) -> None: spark = get_active_spark_session() spark_created_by_command = False if not spark: @@ -78,27 +91,27 @@ def handle(self, *args, **options): spark_s3_bucket = options["spark_s3_bucket"] table_spec = TABLE_SPEC[destination_table] - destination_database = options["alt_db"] or table_spec["destination_database"] + destination_database = options["alt_db"] or table_spec.destination_database destination_table_name = options["alt_name"] or destination_table # Set the database that will be interacted with for all Delta Lake table Spark-based activity logger.info(f"Using Spark Database: {destination_database}") spark.sql(f"create database if not exists {destination_database};") spark.sql(f"use {destination_database};") - if isinstance(table_spec["delta_table_create_sql"], str): + if isinstance(table_spec.delta_table_create_sql, str): # Define Schema Using CREATE TABLE AS command spark.sql( - TABLE_SPEC[destination_table]["delta_table_create_sql"].format( + table_spec.delta_table_create_sql.format( DESTINATION_TABLE=destination_table_name, DESTINATION_DATABASE=destination_database, SPARK_S3_BUCKET=spark_s3_bucket, DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, ) ) - elif isinstance(table_spec["delta_table_create_sql"], StructType): - schema = table_spec["delta_table_create_sql"] - additional_options = table_spec.get("delta_table_create_options") or {} - partition_cols = table_spec.get("delta_table_create_partitions") or [] + elif isinstance(table_spec.delta_table_create_sql, StructType): + schema = table_spec.delta_table_create_sql + additional_options = table_spec.delta_table_create_options or {} + partition_cols = table_spec.delta_table_create_partitions or [] df = spark.createDataFrame([], schema) default_options = { diff --git a/usaspending_api/etl/management/commands/load_query_to_delta.py b/usaspending_api/etl/management/commands/load_query_to_delta.py index aa58341cd9..ff609a79e2 100644 --- a/usaspending_api/etl/management/commands/load_query_to_delta.py +++ b/usaspending_api/etl/management/commands/load_query_to_delta.py @@ -2,7 +2,7 @@ from argparse import ArgumentTypeError from typing import Callable -from django.core.management.base import BaseCommand +from django.core.management.base import BaseCommand, CommandParser from pyspark.sql import SparkSession from usaspending_api.common.etl.spark import create_ref_temp_views @@ -35,7 +35,10 @@ load_object_class_program_activity_incremental, object_class_program_activity_schema, ) -from usaspending_api.download.delta_models.transaction_download import transaction_download_schema +from usaspending_api.download.delta_models.transaction_download import ( + transaction_download_schema, +) +from usaspending_api.etl.table_specs import QueryTableSpec from usaspending_api.recipient.delta_models import ( RECIPIENT_LOOKUP_POSTGRES_COLUMNS, RECIPIENT_PROFILE_POSTGRES_COLUMNS, @@ -57,7 +60,10 @@ AWARD_SEARCH_POSTGRES_GOLD_COLUMNS, award_search_create_sql_string, ) -from usaspending_api.search.delta_models.dataframes.award_search import load_award_search, load_award_search_incremental +from usaspending_api.search.delta_models.dataframes.award_search import ( + load_award_search, + load_award_search_incremental, +) from 
usaspending_api.search.delta_models.dataframes.transaction_search import ( load_transaction_search, load_transaction_search_incremental, @@ -69,7 +75,12 @@ subaward_search_create_sql_string, subaward_search_load_sql_string, ) -from usaspending_api.search.models import AwardSearch, SubawardSearch, SummaryStateView, TransactionSearch +from usaspending_api.search.models import ( + AwardSearch, + SubawardSearch, + SummaryStateView, + TransactionSearch, +) from usaspending_api.settings import HOST from usaspending_api.transactions.delta_models import ( SUMMARY_STATE_VIEW_COLUMNS, @@ -89,360 +100,262 @@ logger = logging.getLogger(__name__) TABLE_SPEC = { - "award_search": { - "model": AwardSearch, - "is_from_broker": False, - "source_query": load_award_search, - "source_query_incremental": load_award_search_incremental, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": "award_search", - "swap_schema": "rpt", - "partition_column": "award_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": award_search_create_sql_string, - "delta_table_create_options": None, - "source_schema": AWARD_SEARCH_POSTGRES_COLUMNS, - "custom_schema": "recipient_hash STRING, federal_accounts STRING, cfdas ARRAY," - " tas_components ARRAY", - "column_names": list(AWARD_SEARCH_COLUMNS), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "award_search_gold": { - "model": AwardSearch, - "is_from_broker": False, - "source_query": load_award_search, - "source_query_incremental": load_award_search_incremental, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": "award_search", - "swap_schema": "rpt", - "partition_column": "award_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": award_search_create_sql_string, - "delta_table_create_options": None, - "source_schema": AWARD_SEARCH_POSTGRES_GOLD_COLUMNS, - "custom_schema": "recipient_hash STRING, federal_accounts STRING, cfdas ARRAY," - " tas_components ARRAY", - "column_names": list(AWARD_SEARCH_POSTGRES_GOLD_COLUMNS), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "recipient_lookup": { - "model": RecipientLookup, - "is_from_broker": False, - "source_query": recipient_lookup_load_sql_string_list, - "source_query_incremental": None, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": "recipient_lookup", - "swap_schema": "rpt", - "partition_column": "recipient_hash", - "partition_column_type": "string", - "is_partition_column_unique": True, - "delta_table_create_sql": rpt_recipient_lookup_create_sql_string, - "delta_table_create_options": None, - "source_schema": RECIPIENT_LOOKUP_POSTGRES_COLUMNS, - "custom_schema": "recipient_hash STRING", - "column_names": list(RPT_RECIPIENT_LOOKUP_DELTA_COLUMNS), - "postgres_seq_name": "recipient_lookup_id_seq", - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "recipient_profile": { - "model": RecipientProfile, - "is_from_broker": False, - "source_query": recipient_profile_load_sql_strings, - "source_query_incremental": None, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": "recipient_profile", - "swap_schema": "rpt", - 
"partition_column": "recipient_hash", # This isn't used for anything - "partition_column_type": "string", - "is_partition_column_unique": False, - "delta_table_create_sql": recipient_profile_create_sql_string, - "delta_table_create_options": None, - "source_schema": RECIPIENT_PROFILE_POSTGRES_COLUMNS, - "custom_schema": "recipient_hash STRING", - "column_names": list(RPT_RECIPIENT_PROFILE_DELTA_COLUMNS), - "postgres_seq_name": "recipient_profile_id_seq", - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "summary_state_view": { - "model": SummaryStateView, - "is_from_broker": False, - "source_query": summary_state_view_load_sql_string, - "source_query_incremental": None, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": "summary_state_view", - "swap_schema": "rpt", - "partition_column": "duh", - "partition_column_type": "string", - "is_partition_column_unique": True, - "delta_table_create_sql": summary_state_view_create_sql_string, - "delta_table_create_options": None, - "source_schema": SUMMARY_STATE_VIEW_POSTGRES_COLUMNS, - "custom_schema": "duh STRING", - "column_names": list(SUMMARY_STATE_VIEW_COLUMNS), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "sam_recipient": { - "model": None, - "is_from_broker": True, - "source_query": sam_recipient_load_sql_string, - "source_query_incremental": None, - "source_database": None, - "source_table": None, - "destination_database": "int", - "swap_table": "duns", - "swap_schema": "int", - "partition_column": "broker_duns_id", - "partition_column_type": "string", - "is_partition_column_unique": True, - "delta_table_create_sql": sam_recipient_create_sql_string, - "delta_table_create_options": None, - "source_schema": SAM_RECIPIENT_POSTGRES_COLUMNS, - "custom_schema": None, - "column_names": list(SAM_RECIPIENT_COLUMNS), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "transaction_search": { - "model": TransactionSearch, - "is_from_broker": False, - "source_query": load_transaction_search, - "source_query_incremental": load_transaction_search_incremental, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": "transaction_search", - "swap_schema": "rpt", - "partition_column": "transaction_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": transaction_search_create_sql_string, - "delta_table_create_options": None, - "source_schema": TRANSACTION_SEARCH_POSTGRES_COLUMNS, - "custom_schema": "recipient_hash STRING, federal_accounts STRING, parent_recipient_hash STRING", - "column_names": list(TRANSACTION_SEARCH_POSTGRES_COLUMNS), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "transaction_search_gold": { - "model": TransactionSearch, - "is_from_broker": False, - "source_query": load_transaction_search, - "source_query_incremental": load_transaction_search_incremental, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": "transaction_search", - "swap_schema": "rpt", - "partition_column": "transaction_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": transaction_search_create_sql_string, - 
"delta_table_create_options": None, - "source_schema": TRANSACTION_SEARCH_POSTGRES_GOLD_COLUMNS, - "custom_schema": "recipient_hash STRING, federal_accounts STRING, parent_recipient_hash STRING", - "column_names": list(TRANSACTION_SEARCH_POSTGRES_GOLD_COLUMNS), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": { - "partition_keys": ["is_fpds"], - "partitioning_form": "LIST", - "partitions": [ - {"table_suffix": "_fpds", "partitioning_clause": "FOR VALUES IN (TRUE)"}, - {"table_suffix": "_fabs", "partitioning_clause": "FOR VALUES IN (FALSE)"}, + "award_search": QueryTableSpec( + **{ + "model": AwardSearch, + "source_query": load_award_search, + "source_query_incremental": load_award_search_incremental, + "destination_database": "rpt", + "swap_table": "award_search", + "swap_schema": "rpt", + "partition_column": "award_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": award_search_create_sql_string, + "source_schema": AWARD_SEARCH_POSTGRES_COLUMNS, + "custom_schema": "recipient_hash STRING, federal_accounts STRING, cfdas ARRAY," + " tas_components ARRAY", + "column_names": list(AWARD_SEARCH_COLUMNS), + } + ), + "award_search_gold": QueryTableSpec( + **{ + "model": AwardSearch, + "source_query": load_award_search, + "source_query_incremental": load_award_search_incremental, + "destination_database": "rpt", + "swap_table": "award_search", + "swap_schema": "rpt", + "partition_column": "award_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": award_search_create_sql_string, + "source_schema": AWARD_SEARCH_POSTGRES_GOLD_COLUMNS, + "custom_schema": "recipient_hash STRING, federal_accounts STRING, cfdas ARRAY," + " tas_components ARRAY", + "column_names": list(AWARD_SEARCH_POSTGRES_GOLD_COLUMNS), + } + ), + "recipient_lookup": QueryTableSpec( + **{ + "model": RecipientLookup, + "source_query": recipient_lookup_load_sql_string_list, + "destination_database": "rpt", + "swap_table": "recipient_lookup", + "swap_schema": "rpt", + "partition_column": "recipient_hash", + "partition_column_type": "string", + "is_partition_column_unique": True, + "delta_table_create_sql": rpt_recipient_lookup_create_sql_string, + "source_schema": RECIPIENT_LOOKUP_POSTGRES_COLUMNS, + "custom_schema": "recipient_hash STRING", + "column_names": list(RPT_RECIPIENT_LOOKUP_DELTA_COLUMNS), + "postgres_seq_name": "recipient_lookup_id_seq", + } + ), + "recipient_profile": QueryTableSpec( + **{ + "model": RecipientProfile, + "source_query": recipient_profile_load_sql_strings, + "destination_database": "rpt", + "swap_table": "recipient_profile", + "swap_schema": "rpt", + "partition_column": "recipient_hash", # This isn't used for anything + "partition_column_type": "string", + "delta_table_create_sql": recipient_profile_create_sql_string, + "source_schema": RECIPIENT_PROFILE_POSTGRES_COLUMNS, + "custom_schema": "recipient_hash STRING", + "column_names": list(RPT_RECIPIENT_PROFILE_DELTA_COLUMNS), + "postgres_seq_name": "recipient_profile_id_seq", + } + ), + "summary_state_view": QueryTableSpec( + **{ + "model": SummaryStateView, + "source_query": summary_state_view_load_sql_string, + "destination_database": "rpt", + "swap_table": "summary_state_view", + "swap_schema": "rpt", + "partition_column": "duh", + "partition_column_type": "string", + "is_partition_column_unique": True, + "delta_table_create_sql": summary_state_view_create_sql_string, + "source_schema": 
SUMMARY_STATE_VIEW_POSTGRES_COLUMNS, + "custom_schema": "duh STRING", + "column_names": list(SUMMARY_STATE_VIEW_COLUMNS), + } + ), + "sam_recipient": QueryTableSpec( + **{ + "is_from_broker": True, + "source_query": sam_recipient_load_sql_string, + "destination_database": "int", + "swap_table": "duns", + "swap_schema": "int", + "partition_column": "broker_duns_id", + "partition_column_type": "string", + "is_partition_column_unique": True, + "delta_table_create_sql": sam_recipient_create_sql_string, + "source_schema": SAM_RECIPIENT_POSTGRES_COLUMNS, + "column_names": list(SAM_RECIPIENT_COLUMNS), + } + ), + "transaction_search": QueryTableSpec( + **{ + "model": TransactionSearch, + "source_query": load_transaction_search, + "source_query_incremental": load_transaction_search_incremental, + "destination_database": "rpt", + "swap_table": "transaction_search", + "swap_schema": "rpt", + "partition_column": "transaction_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": transaction_search_create_sql_string, + "source_schema": TRANSACTION_SEARCH_POSTGRES_COLUMNS, + "custom_schema": "recipient_hash STRING, federal_accounts STRING, parent_recipient_hash STRING", + "column_names": list(TRANSACTION_SEARCH_POSTGRES_COLUMNS), + } + ), + "transaction_search_gold": QueryTableSpec( + **{ + "model": TransactionSearch, + "source_query": load_transaction_search, + "source_query_incremental": load_transaction_search_incremental, + "destination_database": "rpt", + "swap_table": "transaction_search", + "swap_schema": "rpt", + "partition_column": "transaction_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": transaction_search_create_sql_string, + "source_schema": TRANSACTION_SEARCH_POSTGRES_GOLD_COLUMNS, + "custom_schema": "recipient_hash STRING, federal_accounts STRING, parent_recipient_hash STRING", + "column_names": list(TRANSACTION_SEARCH_POSTGRES_GOLD_COLUMNS), + "postgres_partition_spec": { + "partition_keys": ["is_fpds"], + "partitioning_form": "LIST", + "partitions": [ + { + "table_suffix": "_fpds", + "partitioning_clause": "FOR VALUES IN (TRUE)", + }, + { + "table_suffix": "_fabs", + "partitioning_clause": "FOR VALUES IN (FALSE)", + }, + ], + }, + } + ), + "transaction_current_cd_lookup": QueryTableSpec( + **{ + "source_query": transaction_current_cd_lookup_load_sql_string, + "destination_database": "int", + "swap_table": "transaction_current_cd_lookup", + "swap_schema": "int", + "partition_column": "transaction_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": transaction_current_cd_lookup_create_sql_string, + "source_schema": TRANSACTION_CURRENT_CD_LOOKUP_COLUMNS, + "column_names": list(TRANSACTION_CURRENT_CD_LOOKUP_COLUMNS), + } + ), + "subaward_search": QueryTableSpec( + **{ + "model": SubawardSearch, + "source_query": subaward_search_load_sql_string, + "destination_database": "rpt", + "swap_table": "subaward_search", + "swap_schema": "rpt", + "partition_column": "broker_subaward_id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": subaward_search_create_sql_string, + "source_schema": SUBAWARD_SEARCH_POSTGRES_COLUMNS, + "custom_schema": "treasury_account_identifiers ARRAY", + "column_names": list(SUBAWARD_SEARCH_COLUMNS), + "tsvectors": SUBAWARD_SEARCH_POSTGRES_VECTORS, + } + ), + "covid_faba_spending": QueryTableSpec( + **{ + "model": CovidFABASpending, + "source_query": 
covid_faba_spending_load_sql_strings, + "destination_database": "rpt", + "swap_table": "covid_faba_spending", + "swap_schema": "rpt", + "partition_column": "id", + "partition_column_type": "numeric", + "delta_table_create_sql": covid_faba_spending_create_sql_string, + "source_schema": COVID_FABA_SPENDING_POSTGRES_COLUMNS, + "column_names": list(COVID_FABA_SPENDING_DELTA_COLUMNS), + } + ), + "account_balances_download": QueryTableSpec( + **{ + "source_query": load_account_balances, + "source_query_incremental": load_account_balances_incremental, + "destination_database": "rpt", + "partition_column": "appropriation_account_balances_id", + "partition_column_type": "numeric", + "delta_table_create_sql": account_balances_schema, + "delta_table_create_options": {"delta.enableChangeDataFeed": True}, + "column_names": [], + "delta_table_create_partitions": [ + "reporting_fiscal_year", + "funding_toptier_agency_id", ], - }, - "delta_table_create_partitions": None, - }, - "transaction_current_cd_lookup": { - "model": None, - "is_from_broker": False, - "source_query": transaction_current_cd_lookup_load_sql_string, - "source_query_incremental": None, - "source_database": None, - "source_table": None, - "destination_database": "int", - "swap_table": "transaction_current_cd_lookup", - "swap_schema": "int", - "partition_column": "transaction_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": transaction_current_cd_lookup_create_sql_string, - "delta_table_create_options": None, - "source_schema": TRANSACTION_CURRENT_CD_LOOKUP_COLUMNS, - "custom_schema": "", - "column_names": list(TRANSACTION_CURRENT_CD_LOOKUP_COLUMNS), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "subaward_search": { - "model": SubawardSearch, - "is_from_broker": False, - "source_query": subaward_search_load_sql_string, - "source_query_incremental": None, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": "subaward_search", - "swap_schema": "rpt", - "partition_column": "broker_subaward_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": subaward_search_create_sql_string, - "delta_table_create_options": None, - "source_schema": SUBAWARD_SEARCH_POSTGRES_COLUMNS, - "custom_schema": "treasury_account_identifiers ARRAY", - "column_names": list(SUBAWARD_SEARCH_COLUMNS), - "postgres_seq_name": None, - "tsvectors": SUBAWARD_SEARCH_POSTGRES_VECTORS, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "covid_faba_spending": { - "model": CovidFABASpending, - "is_from_broker": False, - "source_query": covid_faba_spending_load_sql_strings, - "source_query_incremental": None, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": "covid_faba_spending", - "swap_schema": "rpt", - "partition_column": "id", - "partition_column_type": "numeric", - "is_partition_column_unique": False, - "delta_table_create_sql": covid_faba_spending_create_sql_string, - "delta_table_create_options": None, - "source_schema": COVID_FABA_SPENDING_POSTGRES_COLUMNS, - "custom_schema": None, - "column_names": list(COVID_FABA_SPENDING_DELTA_COLUMNS), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": None, - }, - "account_balances_download": { - "model": None, - "is_from_broker": False, - 
"source_query": load_account_balances, - "source_query_incremental": load_account_balances_incremental, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": None, - "swap_schema": None, - "partition_column": "appropriation_account_balances_id", - "partition_column_type": "numeric", - "is_partition_column_unique": False, - "delta_table_create_sql": account_balances_schema, - "delta_table_create_options": {"delta.enableChangeDataFeed": True}, - "source_schema": None, - "custom_schema": None, - "column_names": list(), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": ["reporting_fiscal_year", "funding_toptier_agency_id"], - }, - "award_financial_download": { - "model": None, - "is_from_broker": False, - "source_query": load_award_financial, - "source_query_incremental": load_award_financial_incremental, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": None, - "swap_schema": None, - "partition_column": "financial_accounts_by_awards_id", - "partition_column_type": "numeric", - "is_partition_column_unique": False, - "delta_table_create_sql": award_financial_schema, - "delta_table_create_options": {"delta.enableChangeDataFeed": True}, - "source_schema": None, - "custom_schema": None, - "column_names": list(), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": ["reporting_fiscal_year", "funding_toptier_agency_id"], - }, - "object_class_program_activity_download": { - "model": None, - "is_from_broker": False, - "source_query": load_object_class_program_activity, - "source_query_incremental": load_object_class_program_activity_incremental, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": None, - "swap_schema": None, - "partition_column": "financial_accounts_by_program_activity_object_class_id", - "partition_column_type": "numeric", - "is_partition_column_unique": False, - "delta_table_create_sql": object_class_program_activity_schema, - "delta_table_create_options": {"delta.enableChangeDataFeed": True}, - "source_schema": None, - "custom_schema": None, - "column_names": list(), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": ["reporting_fiscal_year", "funding_toptier_agency_id"], - }, - "transaction_download": { - "model": None, - "is_from_broker": False, - "source_query": None, - "source_query_incremental": None, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": None, - "swap_schema": None, - "partition_column": "transaction_id", - "partition_column_type": "numeric", - "is_partition_column_unique": False, - "delta_table_create_sql": transaction_download_schema, - "delta_table_create_options": {"delta.enableChangeDataFeed": True}, - "source_schema": None, - "custom_schema": None, - "column_names": list(), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - "delta_table_create_partitions": ["awarding_agency_code", "is_fpds", "action_date_fiscal_year"], - }, + } + ), + "award_financial_download": QueryTableSpec( + **{ + "source_query": load_award_financial, + "source_query_incremental": load_award_financial_incremental, + "destination_database": "rpt", + "partition_column": "financial_accounts_by_awards_id", + "partition_column_type": "numeric", + 
"delta_table_create_sql": award_financial_schema, + "delta_table_create_options": {"delta.enableChangeDataFeed": True}, + "column_names": [], + "delta_table_create_partitions": [ + "reporting_fiscal_year", + "funding_toptier_agency_id", + ], + } + ), + "object_class_program_activity_download": QueryTableSpec( + **{ + "source_query": load_object_class_program_activity, + "source_query_incremental": load_object_class_program_activity_incremental, + "destination_database": "rpt", + "partition_column": "financial_accounts_by_program_activity_object_class_id", + "partition_column_type": "numeric", + "delta_table_create_sql": object_class_program_activity_schema, + "delta_table_create_options": {"delta.enableChangeDataFeed": True}, + "column_names": [], + "delta_table_create_partitions": [ + "reporting_fiscal_year", + "funding_toptier_agency_id", + ], + } + ), + "transaction_download": QueryTableSpec( + **{ + "destination_database": "rpt", + "partition_column": "transaction_id", + "partition_column_type": "numeric", + "delta_table_create_sql": transaction_download_schema, + "delta_table_create_options": {"delta.enableChangeDataFeed": True}, + "column_names": [], + "delta_table_create_partitions": [ + "awarding_agency_code", + "is_fpds", + "action_date_fiscal_year", + ], + } + ), } @@ -458,7 +371,8 @@ class Command(BaseCommand): destination_table_name: str spark: SparkSession - def add_arguments(self, parser): + @staticmethod + def add_arguments(parser: CommandParser) -> None: parser.add_argument( "--destination-table", type=str, @@ -486,7 +400,7 @@ def add_arguments(self, parser): help="Whether or not the table will be updated incrementally", ) - def handle(self, *args, **options): + def handle(self, *args, **options) -> None: extra_conf = { # Config for Delta Lake tables and SQL. Need these to keep Dela table metadata in the metastore "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension", @@ -501,32 +415,37 @@ def handle(self, *args, **options): spark_created_by_command = False if not self.spark: spark_created_by_command = True - self.spark = configure_spark_session(**extra_conf, spark_context=self.spark) # type: SparkSession + self.spark = configure_spark_session( + **extra_conf, spark_context=self.spark + ) # type: SparkSession # Resolve Parameters destination_table = options["destination_table"] table_spec = TABLE_SPEC[destination_table] - self.destination_database = options["alt_db"] or table_spec["destination_database"] - self.destination_table_name = options["alt_name"] or destination_table.split(".")[-1] - source_query_key = "source_query_incremental" if options["incremental"] else "source_query" - load_query = table_spec.get(source_query_key) + self.destination_database = options["alt_db"] or table_spec.destination_database + self.destination_table_name = ( + options["alt_name"] or destination_table.split(".")[-1] + ) + source_query_key = ( + "source_query_incremental" if options["incremental"] else "source_query" + ) + load_query = getattr(table_spec, source_query_key) if load_query is None: - raise ArgumentTypeError(f"Invalid source query. `{source_query_key}` must be specified in the TABLE_SPEC.") + raise ArgumentTypeError( + f"Invalid source query. `{source_query_key}` must be specified in the TABLE_SPEC." 
+ ) # Set the database that will be interacted with for all Delta Lake table Spark-based activity logger.info(f"Using Spark Database: {self.destination_database}") self.spark.sql(f"use {self.destination_database};") - # Create User Defined Functions if needed - if table_spec.get("user_defined_functions"): - for udf_args in table_spec["user_defined_functions"]: - self.spark.udf.register(**udf_args) - create_ref_temp_views(self.spark, create_broker_views=True) if isinstance(load_query, list): for index, query in enumerate(load_query): - logger.info(f"Running query number: {index + 1}\nPreview of query: {query[:100]}") + logger.info( + f"Running query number: {index + 1}\nPreview of query: {query[:100]}" + ) self.run_spark_sql(query) else: self.run_spark_sql(load_query) @@ -534,7 +453,9 @@ def handle(self, *args, **options): if spark_created_by_command: self.spark.stop() - def run_spark_sql(self, query: str | Callable[[SparkSession, str, str], None]): + def run_spark_sql( + self, query: str | Callable[[SparkSession, str, str], None] + ) -> None: if isinstance(query, str): jdbc_conn_props = get_jdbc_connection_properties() self.spark.sql( @@ -551,4 +472,6 @@ def run_spark_sql(self, query: str | Callable[[SparkSession, str, str], None]): elif isinstance(query, Callable): query(self.spark, self.destination_database, self.destination_table_name) else: - raise ArgumentTypeError(f"Invalid query. `{query}` must be a string or a Callable.") + raise ArgumentTypeError( + f"Invalid query. `{query}` must be a string or a Callable." + ) diff --git a/usaspending_api/etl/management/commands/load_table_from_delta.py b/usaspending_api/etl/management/commands/load_table_from_delta.py index 79892c5dce..05ca3e78cd 100644 --- a/usaspending_api/etl/management/commands/load_table_from_delta.py +++ b/usaspending_api/etl/management/commands/load_table_from_delta.py @@ -1,40 +1,46 @@ import itertools import logging +from datetime import datetime +from math import ceil +from typing import Dict, Optional import boto3 import numpy as np import psycopg2 - from django import db +from django.core.management import CommandParser from django.core.management.base import BaseCommand from django.db.models import Model -from math import ceil -from pyspark.sql import SparkSession, DataFrame -from typing import Dict, Optional, List -from datetime import datetime +from pyspark.sql import Column, DataFrame, SparkSession from usaspending_api.common.csv_stream_s3_to_pg import copy_csvs_from_s3_to_pg from usaspending_api.common.etl.spark import convert_array_cols_to_string -from usaspending_api.common.helpers.sql_helpers import get_database_dsn_string from usaspending_api.common.helpers.spark_helpers import ( configure_spark_session, get_active_spark_session, get_jdbc_connection_properties, get_usas_jdbc_url, ) +from usaspending_api.common.helpers.sql_helpers import get_database_dsn_string from usaspending_api.config import CONFIG -from usaspending_api.settings import DEFAULT_TEXT_SEARCH_CONFIG - from usaspending_api.etl.management.commands.create_delta_table import TABLE_SPEC +from usaspending_api.etl.table_specs import QueryTableSpec +from usaspending_api.settings import DEFAULT_TEXT_SEARCH_CONFIG logger = logging.getLogger(__name__) # Note: the `delta` type is not actually in Spark SQL. It's how we're temporarily storing the data before converting it # to the proper postgres type, since pySpark doesn't automatically support this conversion. 
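# To make the staging pattern above concrete: a hedged sketch (not part of
# this patch; the table and column names are hypothetical) of how one of
# these "postgres" templates can be expanded into an ALTER statement once
# the TEXT-staged data has landed in Postgres.

TYPE_TEMPLATES = {
    "UUID": "UUID USING {column_name}::UUID",
    "JSONB": "JSONB using {column_name}::JSON",
}


def alter_column_sql(table: str, column_name: str, col_type: str) -> str:
    # The column arrives from Spark staged as TEXT; the USING clause casts
    # it to its real Postgres type in place.
    target = TYPE_TEMPLATES[col_type].format(column_name=column_name)
    return f"ALTER TABLE {table} ALTER COLUMN {column_name} TYPE {target}"


# alter_column_sql("temp.my_table_temp", "funding_uuid", "UUID") produces:
# ALTER TABLE temp.my_table_temp ALTER COLUMN funding_uuid TYPE UUID USING funding_uuid::UUID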
SPECIAL_TYPES_MAPPING = { - db.models.UUIDField: {"postgres": "UUID USING {column_name}::UUID", "delta": "TEXT"}, + db.models.UUIDField: { + "postgres": "UUID USING {column_name}::UUID", + "delta": "TEXT", + }, "UUID": {"postgres": "UUID USING {column_name}::UUID", "delta": "TEXT"}, - db.models.JSONField: {"postgres": "JSONB using {column_name}::JSON", "delta": "TEXT"}, + db.models.JSONField: { + "postgres": "JSONB using {column_name}::JSON", + "delta": "TEXT", + }, "JSONB": {"postgres": "JSONB using {column_name}::JSON", "delta": "TEXT"}, } @@ -46,7 +52,6 @@ class Command(BaseCommand): - help = """ This command reads data from a Delta table and copies it into a corresponding Postgres database table (under a temp name). As of now, it only supports a full reload of a table. If the table with the chosen temp name already @@ -55,7 +60,20 @@ class Command(BaseCommand): if a new table has been made. """ - def add_arguments(self, parser): + delta_table: str + delta_table_name: str + destination_database: str + column_names: list + + postgres_table: str + postgres_table_name: str + postgres_schema: str + postgres_cols: dict + + temp_table: str + temp_table_name: str + + def add_arguments(self, parser: CommandParser) -> None: parser.add_argument( "--delta-table", type=str, @@ -73,7 +91,7 @@ def add_arguments(self, parser): "--alt-delta-name", type=str, required=False, - help="An alternate delta table name to load, overriding the TABLE_SPEC destination_table" "name", + help="An alternate delta table name to load, overriding the TABLE_SPEC destination_tablename", ) parser.add_argument( "--jdbc-inserts", @@ -111,7 +129,8 @@ def add_arguments(self, parser): "If the job fails for some unexpected reason then the sequence will be reset to the previous value.", ) - def _split_dfs(self, df, special_columns): + @staticmethod + def _split_dfs(df: DataFrame, special_columns: str | Column) -> [DataFrame]: """Split a DataFrame into DataFrame subsets based on presence of NULL values in certain special columns Unfortunately, pySpark with the JDBC doesn't handle UUIDs/JSON well. @@ -129,13 +148,18 @@ def _split_dfs(self, df, special_columns): # Figure all the possible combos of filters filter_batches = [] for subset in itertools.product([True, False], repeat=len(special_columns)): - filter_batches.append({col: subset[i] for i, col in enumerate(special_columns)}) + filter_batches.append( + {col: subset[i] for i, col in enumerate(special_columns)} + ) # Generate all the split dfs based on the filter batches split_dfs = [] for filter_batch in filter_batches: # Apply the filters (True = null column, drop it. False = not null column, keep it) - modified_filters = [df[col].isNull() if val else df[col].isNotNull() for col, val in filter_batch.items()] + modified_filters = [ + df[col].isNull() if val else df[col].isNotNull() + for col, val in filter_batch.items() + ] split_df = df.filter(np.bitwise_and.reduce(modified_filters)) # Drop the columns where it's null **after filtering them out** @@ -145,7 +169,7 @@ def _split_dfs(self, df, special_columns): split_dfs.append(split_df) return split_dfs - def handle(self, *args, **options): + def _get_spark_session(self) -> SparkSession: extra_conf = { # Config for Delta Lake tables and SQL. 
Need these to keep Dela table metadata in the metastore "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension", @@ -155,202 +179,269 @@ def handle(self, *args, **options): "spark.sql.legacy.parquet.int96RebaseModeInWrite": "LEGACY", # for timestamps at/before 1900 "spark.sql.jsonGenerator.ignoreNullFields": "false", # keep nulls in our json } - spark = get_active_spark_session() spark_created_by_command = False if not spark: spark_created_by_command = True - spark = configure_spark_session(**extra_conf, spark_context=spark) # type: SparkSession + spark = configure_spark_session( + **extra_conf, spark_context=spark + ) # type: SparkSession + return spark, spark_created_by_command + + def handle(self, *args, **options) -> None: + spark, spark_created_by_command = self._get_spark_session() # Resolve Parameters - delta_table = options["delta_table"] + self.delta_table = options["delta_table"] recreate = options["recreate"] - - table_spec = TABLE_SPEC[delta_table] + table_spec = TABLE_SPEC[self.delta_table] # Delta side - destination_database = options["alt_delta_db"] or table_spec["destination_database"] - delta_table_name = options["alt_delta_name"] or delta_table - delta_table = f"{destination_database}.{delta_table_name}" if destination_database else delta_table_name + self.destination_database = ( + options["alt_delta_db"] or table_spec.destination_database + ) + self.delta_table_name = options["alt_delta_name"] or self.delta_table + self.delta_table = ( + f"{self.destination_database}.{self.delta_table_name}" + if self.destination_database + else self.delta_table_name + ) # Postgres side - source - postgres_table = None - postgres_model = table_spec["model"] - postgres_schema = table_spec["source_database"] or table_spec["swap_schema"] - postgres_table_name = table_spec["source_table"] or table_spec["swap_table"] - postgres_cols = table_spec["source_schema"] - column_names = table_spec.get("column_names") - tsvectors = table_spec.get("tsvectors") or {} - if postgres_table_name: - postgres_table = f"{postgres_schema}.{postgres_table_name}" if postgres_schema else postgres_table_name + self.postgres_schema = table_spec.source_database or table_spec.swap_schema + self.postgres_table_name = table_spec.source_table or table_spec.swap_table + self.postgres_cols = table_spec.source_schema + self.column_names = table_spec.column_names + if self.postgres_table_name: + self.postgres_table = ( + f"{self.postgres_schema}.{self.postgres_table_name}" + if self.postgres_schema + else self.postgres_table_name + ) # Postgres side - temp temp_schema = "temp" temp_table_suffix = "temp" - temp_table_suffix_appendage = f"_{temp_table_suffix}" if {temp_table_suffix} else "" - if postgres_table: - temp_table_name = f"{postgres_table_name}{temp_table_suffix_appendage}" - else: - temp_table_name = f"{delta_table_name}{temp_table_suffix_appendage}" - temp_table = f"{temp_schema}.{temp_table_name}" + temp_table_suffix_appendage = ( + f"_{temp_table_suffix}" if {temp_table_suffix} else "" + ) + self.temp_table_name = ( + f"{self.postgres_table_name}{temp_table_suffix_appendage}" + if self.postgres_table + else f"{self.delta_table_name}{temp_table_suffix_appendage}" + ) + self.temp_table = f"{temp_schema}.{self.temp_table_name}" - summary_msg = f"Copying delta table {delta_table} to a Postgres temp table {temp_table}." 
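# One nuance in the temp-table naming above: the carried-over expression
# `f"_{temp_table_suffix}" if {temp_table_suffix} else ""` tests a
# one-element set literal, which is always truthy, so the empty-suffix
# branch can never run. A minimal sketch of the intended convention using a
# plain truthiness check (names here are illustrative, not part of the patch):

def temp_table_for(
    delta_table_name: str,
    postgres_table_name: str | None = None,
    temp_schema: str = "temp",
    suffix: str = "temp",
) -> str:
    # Prefer the Postgres table name when one exists; otherwise fall back
    # to the Delta table name, e.g. "awards" -> "temp.awards_temp".
    appendage = f"_{suffix}" if suffix else ""  # plain check, not a set literal
    base = postgres_table_name or delta_table_name
    return f"{temp_schema}.{base}{appendage}"


assert temp_table_for("awards") == "temp.awards_temp"
assert temp_table_for("awards", "vw_awards") == "temp.vw_awards_temp"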
- if postgres_table: - summary_msg = f"{summary_msg} The temp table will be based on the postgres table {postgres_table}" + summary_msg = f"Copying delta table {self.delta_table} to a Postgres temp table {self.temp_table}." + if self.postgres_table: + summary_msg = f"{summary_msg} The temp table will be based on the postgres table {self.postgres_table}" logger.info(summary_msg) - # Checking if the temp destination table already exists - temp_dest_table_exists_sql = f""" - SELECT EXISTS ( - SELECT 1 - FROM information_schema.tables - WHERE table_schema = '{temp_schema}' - AND table_name = '{temp_table_name}') - """ - with db.connection.cursor() as cursor: - cursor.execute(temp_dest_table_exists_sql) - temp_dest_table_exists = cursor.fetchone()[0] + temp_dest_table_exists = self._temp_table_exists( + temp_schema, self.temp_table_name + ) # If it does, and we're recreating it, drop it first if temp_dest_table_exists and recreate: - logger.info(f"{temp_table} exists and recreate argument provided. Dropping first.") - # If the schema has changed and we need to do a complete reload, just drop the table and rebuild it - clear_table_sql = f"DROP TABLE {temp_table}" - with db.connection.cursor() as cursor: - cursor.execute(clear_table_sql) - logger.info(f"{temp_table} dropped.") + self._drop_temp_table() temp_dest_table_exists = False make_new_table = not temp_dest_table_exists - is_postgres_table_partitioned = table_spec.get("postgres_partition_spec") is not None - - if postgres_table or postgres_cols: - # Recreate the table if it doesn't exist. Spark's df.write automatically does this but doesn't account for - # the extra metadata (indexes, constraints, defaults) which CREATE TABLE X LIKE Y accounts for. - # If there is no postgres_table to base it on, it just relies on spark to make it and work with delta table - if make_new_table: - partition_clause = "" - storage_parameters = "WITH (autovacuum_enabled=FALSE)" - partitions_sql = [] - if is_postgres_table_partitioned: - partition_clause = ( - f"PARTITION BY {table_spec['postgres_partition_spec']['partitioning_form']}" - f"({', '.join(table_spec['postgres_partition_spec']['partition_keys'])})" - ) - storage_parameters = "" - partitions_sql = [ - ( - f"CREATE TABLE " - # Below: e.g. 
my_tbl_temp -> my_tbl_part_temp - f"{temp_table[:-len(temp_table_suffix_appendage)]}{pt['table_suffix']}{temp_table_suffix_appendage} " - f"PARTITION OF {temp_table} {pt['partitioning_clause']} " - f"{storage_parameters}" - ) - for pt in table_spec["postgres_partition_spec"]["partitions"] - ] - if postgres_table: - create_temp_sql = f""" - CREATE TABLE {temp_table} ( - LIKE {postgres_table} INCLUDING DEFAULTS INCLUDING GENERATED INCLUDING IDENTITY - ) {partition_clause} {storage_parameters} - """ - elif postgres_cols: - create_temp_sql = f""" - CREATE TABLE {temp_table} ( - {", ".join([f'{key} {val}' for key, val in postgres_cols.items()])} - ) {partition_clause} {storage_parameters} - """ - else: - raise RuntimeError( - "make_new_table=True but neither a postgres_table or postgres_cols are " - "populated for the target delta table in the TABLE_SPEC" - ) - with db.connection.cursor() as cursor: - logger.info(f"Creating {temp_table}") - cursor.execute(create_temp_sql) - logger.info(f"{temp_table} created.") - - if is_postgres_table_partitioned and partitions_sql: - for create_partition in partitions_sql: - logger.info(f"Creating partition of {temp_table} with SQL:\n{create_partition}") - cursor.execute(create_partition) - logger.info("Partition created.") - - # If there are vectors, add the triggers that will populate them based on other calls - # NOTE: Undetermined whether tsvector triggers can be applied on partitioned tables, - # at the top-level virtual/partitioned table (versus having to apply on each partition) - for tsvector_name, derived_from_cols in tsvectors.items(): - logger.info( - f"To prevent any confusion or duplicates, dropping the trigger" - f" tsvector_update_{tsvector_name} if it exists before potentially recreating it." - ) - cursor.execute(f"DROP TRIGGER IF EXISTS tsvector_update_{tsvector_name} ON {temp_table}") - - logger.info( - f"Adding tsvector trigger for column {tsvector_name}" - f" based on the following columns: {derived_from_cols}" - ) - derived_from_cols_str = ", ".join(derived_from_cols) - tsvector_trigger_sql = f""" - CREATE TRIGGER tsvector_update_{tsvector_name} BEFORE INSERT OR UPDATE - ON {temp_table} FOR EACH ROW EXECUTE PROCEDURE - tsvector_update_trigger({tsvector_name}, '{DEFAULT_TEXT_SEARCH_CONFIG}', - {derived_from_cols_str}) - """ - cursor.execute(tsvector_trigger_sql) - logger.info(f"tsvector trigger for column {tsvector_name} added.") + if self.postgres_table or self.postgres_cols: + self._recreate_table( + make_new_table=make_new_table, + table_spec=table_spec, + temp_table_suffix_appendage=temp_table_suffix_appendage, + ) # Read from Delta - df = spark.table(delta_table) + df = spark.table(self.delta_table) # Make sure that the column order defined in the Delta table schema matches # that of the Spark dataframe used to pull from the Postgres table. While not # always needed, this should help to prevent any future mismatch between the two. 
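# The select() below is what enforces that column order. A tiny
# self-contained illustration (local session, invented data; not part of
# the patch):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(1, "a")], ["id", "name"])

column_names = ["name", "id"]  # order dictated by the Delta table schema
assert df.select(column_names).columns == ["name", "id"]

spark.stop()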
- if column_names: - df = df.select(column_names) + if self.column_names: + df = df.select(self.column_names) # If we're working off an existing table, truncate before loading in all the data if not make_new_table: - logger.info(f"Truncating existing table {temp_table}") + logger.info(f"Truncating existing table {self.temp_table}") with db.connection.cursor() as cursor: - cursor.execute(f"TRUNCATE {temp_table}") - logger.info(f"{temp_table} truncated.") + cursor.execute(f"TRUNCATE {self.temp_table}") + logger.info(f"{self.temp_table} truncated.") # Reset the sequence before load for a table if it exists - if options["reset_sequence"] and table_spec.get("postgres_seq_name"): - postgres_seq_last_value = self._set_sequence_value(table_spec["postgres_seq_name"]) - else: - postgres_seq_last_value = None + postgres_seq_last_value = ( + self._set_sequence_value(table_spec.postgres_seq_name) + if options["reset_sequence"] + and hasattr(table_spec, "postgres_seq_name") + and table_spec.postgres_seq_name + else None + ) + + self._write_df( + spark=spark, + df=df, + options=options, + postgres_seq_last_value=postgres_seq_last_value, + table_spec=table_spec, + ) + + self._finish( + options=options, + spark_created_by_command=spark_created_by_command, + spark=spark, + ) + + @staticmethod + def _temp_table_exists(temp_schema: str, temp_table_name: str) -> bool: + # Checking if the temp destination table already exists + temp_dest_table_exists_sql = f""" + SELECT EXISTS ( + SELECT 1 + FROM information_schema.tables + WHERE table_schema = '{temp_schema}' + AND table_name = '{temp_table_name}') + """ + with db.connection.cursor() as cursor: + cursor.execute(temp_dest_table_exists_sql) + return bool(cursor.fetchone()[0]) + + def _drop_temp_table(self) -> None: + logger.info( + f"{self.temp_table} exists and recreate argument provided. Dropping first." + ) + # If the schema has changed and we need to do a complete reload, just drop the table and rebuild it + clear_table_sql = f"DROP TABLE {self.temp_table}" + with db.connection.cursor() as cursor: + cursor.execute(clear_table_sql) + logger.info(f"{self.temp_table} dropped.") + + def _recreate_table( + self, + make_new_table: bool, + table_spec: QueryTableSpec, + temp_table_suffix_appendage: str, + ) -> None: + # Recreate the table if it doesn't exist. Spark's df.write automatically does this but doesn't account for + # the extra metadata (indexes, constraints, defaults) which CREATE TABLE X LIKE Y accounts for. + # If there is no postgres_table to base it on, it just relies on spark to make it and work with delta table + is_postgres_table_partitioned = ( + hasattr(table_spec, "postgres_partition_spec") + and table_spec.postgres_partition_spec is not None + ) + tsvectors = table_spec.tsvectors or {} + + if make_new_table: + partition_clause = "" + storage_parameters = "WITH (autovacuum_enabled=FALSE)" + partitions_sql = [] + if is_postgres_table_partitioned: + partition_clause = ( + f"PARTITION BY {table_spec.postgres_partition_spec['partitioning_form']}" + f"({', '.join(table_spec.postgres_partition_spec['partition_keys'])})" + ) + storage_parameters = "" + partitions_sql = [ + ( + f"CREATE TABLE " + # Below: e.g. 
my_tbl_temp -> my_tbl_part_temp + f"{self.temp_table[: -len(temp_table_suffix_appendage)]}" + f"{pt['table_suffix']}{temp_table_suffix_appendage} " + f"PARTITION OF {self.temp_table} {pt['partitioning_clause']} " + f"{storage_parameters}" + ) + for pt in table_spec.postgres_partition_spec["partitions"] + ] + if self.postgres_table: + create_temp_sql = f""" + CREATE TABLE {self.temp_table} ( + LIKE {self.postgres_table} INCLUDING DEFAULTS INCLUDING GENERATED INCLUDING IDENTITY + ) {partition_clause} {storage_parameters} + """ + elif self.postgres_cols: + create_temp_sql = f""" + CREATE TABLE {self.temp_table} ( + {", ".join([f"{key} {val}" for key, val in self.postgres_cols.items()])} + ) {partition_clause} {storage_parameters} + """ + else: + raise RuntimeError( + "make_new_table=True but neither a postgres_table or postgres_cols are " + "populated for the target delta table in the TABLE_SPEC" + ) + with db.connection.cursor() as cursor: + logger.info(f"Creating {self.temp_table}") + cursor.execute(create_temp_sql) + logger.info(f"{self.temp_table} created.") + + if is_postgres_table_partitioned and partitions_sql: + for create_partition in partitions_sql: + logger.info( + f"Creating partition of {self.temp_table} with SQL:\n{create_partition}" + ) + cursor.execute(create_partition) + logger.info("Partition created.") + + # If there are vectors, add the triggers that will populate them based on other calls + # NOTE: Undetermined whether tsvector triggers can be applied on partitioned tables, + # at the top-level virtual/partitioned table (versus having to apply on each partition) + for tsvector_name, derived_from_cols in tsvectors.items(): + logger.info( + f"To prevent any confusion or duplicates, dropping the trigger" + f" tsvector_update_{tsvector_name} if it exists before potentially recreating it." 
+ ) + cursor.execute( + f"DROP TRIGGER IF EXISTS tsvector_update_{tsvector_name} ON {self.temp_table}" + ) - # Write to Postgres + logger.info( + f"Adding tsvector trigger for column {tsvector_name}" + f" based on the following columns: {derived_from_cols}" + ) + derived_from_cols_str = ", ".join(derived_from_cols) + tsvector_trigger_sql = f""" + CREATE TRIGGER tsvector_update_{tsvector_name} BEFORE INSERT OR UPDATE + ON {self.temp_table} FOR EACH ROW EXECUTE PROCEDURE + tsvector_update_trigger({tsvector_name}, '{DEFAULT_TEXT_SEARCH_CONFIG}', + {derived_from_cols_str}) + """ + cursor.execute(tsvector_trigger_sql) + logger.info(f"tsvector trigger for column {tsvector_name} added.") + + def _write_df( + self, + spark: SparkSession, + df: DataFrame, + options: dict, + postgres_seq_last_value: int | bool, + table_spec: QueryTableSpec, + ) -> None: use_jdbc_inserts = options["jdbc_inserts"] strategy = "JDBC INSERTs" if use_jdbc_inserts else "SQL bulk COPY CSV" logger.info( - f"LOAD (START): Loading data from Delta table {delta_table} to {temp_table} using {strategy} " f"strategy" + f"LOAD (START): Loading data from Delta table {self.delta_table} " + f"to {self.temp_table} using {strategy} strategy" ) - try: if use_jdbc_inserts: self._write_with_jdbc_inserts( - spark, df, - temp_table, + self.temp_table, split_df_by_special_cols=True, - postgres_model=postgres_model, - postgres_cols=postgres_cols, + postgres_model=table_spec.model, + postgres_cols=self.postgres_cols, overwrite=False, ) else: - if not column_names: - raise RuntimeError("column_names None or empty, but are required to map CSV cols to table cols") + if not self.column_names: + raise RuntimeError( + "column_names None or empty, but are required to map CSV cols to table cols" + ) spark_s3_bucket_name = options["spark_s3_bucket"] self._write_with_sql_bulk_copy_csv( spark, df, - delta_db=destination_database, - delta_table_name=delta_table_name, - temp_table=temp_table, - ordered_col_names=column_names, spark_s3_bucket_name=spark_s3_bucket_name, keep_csv_files=True if options["keep_csv_files"] else False, ) @@ -359,24 +450,10 @@ def handle(self, *args, **options): logger.error( f"Command failed unexpectedly; resetting the sequence to previous value: {postgres_seq_last_value}" ) - self._set_sequence_value(table_spec["postgres_seq_name"], postgres_seq_last_value) - raise Exception(exc) - - logger.info( - f"LOAD (FINISH): Loaded data from Delta table {delta_table} to {temp_table} using {strategy} " f"strategy" - ) - - # We're done with spark at this point - if spark_created_by_command: - spark.stop() - - if postgres_table: - logger.info( - f"Note: this has merely loaded the data from Delta. For various reasons, we've separated the" - f" metadata portion of the table download to a separate script. If not already done so," - f" please run the following additional command to complete the process: " - f" 'copy_table_metadata --source-table {postgres_table} --dest-table {temp_table}'." 
- ) + self._set_sequence_value( + table_spec.postgres_seq_name, postgres_seq_last_value + ) + raise exc def _set_sequence_value(self, seq_name: str, val: Optional[int] = None) -> int: """ @@ -391,20 +468,18 @@ def _set_sequence_value(self, seq_name: str, val: Optional[int] = None) -> int: with db.connection.cursor() as cursor: cursor.execute(f"SELECT last_value FROM {seq_name}") last_value = cursor.fetchone()[0] - cursor.execute(f"ALTER SEQUENCE IF EXISTS {seq_name} RESTART WITH {new_seq_val}") + cursor.execute( + f"ALTER SEQUENCE IF EXISTS {seq_name} RESTART WITH {new_seq_val}" + ) return last_value def _write_with_sql_bulk_copy_csv( self, spark: SparkSession, df: DataFrame, - delta_db: str, - delta_table_name: str, - temp_table: str, - ordered_col_names: List[str], spark_s3_bucket_name: str, - keep_csv_files=False, - ): + keep_csv_files: bool = False, + ) -> None: """ Write-from-delta-to-postgres strategy that relies on SQL bulk COPY of CSV files to Postgres. It uses the SQL COPY command on CSV files, which are created from the Delta table's underlying parquet files. @@ -448,10 +523,10 @@ def _write_with_sql_bulk_copy_csv( sub-folder of a "temp" folder. Be mindful of cleaning these up if setting to True. If False, the same output path is used for each write and nukes-and-paves the files in that output path. """ - csv_path = f"{CONFIG.SPARK_CSV_S3_PATH}/{delta_db}/{delta_table_name}/" + csv_path = f"{CONFIG.SPARK_CSV_S3_PATH}/{self.destination_database}/{self.delta_table_name}/" if keep_csv_files: csv_path = ( - f"{CONFIG.SPARK_CSV_S3_PATH}/temp/{delta_db}/{delta_table_name}/" + f"{CONFIG.SPARK_CSV_S3_PATH}/temp/{self.destination_database}/{self.delta_table_name}/" f"{datetime.strftime(datetime.utcnow(), '%Y%m%d%H%M%S')}/" ) s3_bucket_with_csv_path = f"s3a://{spark_s3_bucket_name}/{csv_path}" @@ -464,11 +539,15 @@ def _write_with_sql_bulk_copy_csv( aws_secret_access_key=CONFIG.AWS_SECRET_KEY.get_secret_value(), ) s3_resource = boto3_session.resource( - service_name="s3", region_name=CONFIG.AWS_REGION, endpoint_url=f"http://{CONFIG.AWS_S3_ENDPOINT}" + service_name="s3", + region_name=CONFIG.AWS_REGION, + endpoint_url=f"http://{CONFIG.AWS_S3_ENDPOINT}", ) else: s3_resource = boto3.resource( - service_name="s3", region_name=CONFIG.AWS_REGION, endpoint_url=f"https://{CONFIG.AWS_S3_ENDPOINT}" + service_name="s3", + region_name=CONFIG.AWS_REGION, + endpoint_url=f"https://{CONFIG.AWS_S3_ENDPOINT}", ) s3_bucket_name = spark_s3_bucket_name s3_bucket = s3_resource.Bucket(s3_bucket_name) @@ -476,15 +555,25 @@ def _write_with_sql_bulk_copy_csv( initial_size = sum(1 for _ in objs_collection) if initial_size > 0: - logger.info(f"LOAD: Starting to delete {initial_size} previous objects in {s3_bucket_with_csv_path}") + logger.info( + f"LOAD: Starting to delete {initial_size} previous objects in {s3_bucket_with_csv_path}" + ) objs_collection.delete() post_delete_size = sum(1 for _ in objs_collection) - logger.info(f"LOAD: Finished deleting. {post_delete_size} objects remain in {s3_bucket_with_csv_path}") + logger.info( + f"LOAD: Finished deleting. 
{post_delete_size} objects remain in {s3_bucket_with_csv_path}" + ) else: - logger.info(f"LOAD: Target S3 path {s3_bucket_with_csv_path} is empty or yet to be created") + logger.info( + f"LOAD: Target S3 path {s3_bucket_with_csv_path} is empty or yet to be created" + ) - logger.info(f"LOAD: Starting dump of Delta table to temp gzipped CSV files in {s3_bucket_with_csv_path}") - df_no_arrays = convert_array_cols_to_string(df, is_postgres_array_format=True, is_for_csv_export=True) + logger.info( + f"LOAD: Starting dump of Delta table to temp gzipped CSV files in {s3_bucket_with_csv_path}" + ) + df_no_arrays = convert_array_cols_to_string( + df, is_postgres_array_format=True, is_for_csv_export=True + ) df_no_arrays.write.options( maxRecordsPerFile=_SPARK_CSV_WRITE_TO_PG_MAX_RECORDS_PER_FILE, compression="gzip", @@ -493,18 +582,28 @@ def _write_with_sql_bulk_copy_csv( ignoreLeadingWhiteSpace=False, # must set for CSV write, as it defaults to true ignoreTrailingWhiteSpace=False, # must set for CSV write, as it defaults to true timestampFormat=CONFIG.SPARK_CSV_TIMEZONE_FORMAT, - ).mode(saveMode="overwrite" if not keep_csv_files else "errorifexists").csv(s3_bucket_with_csv_path) + ).mode(saveMode="overwrite" if not keep_csv_files else "errorifexists").csv( + s3_bucket_with_csv_path + ) logger.debug( f"Connecting to S3 at endpoint_url={CONFIG.AWS_S3_ENDPOINT}, region_name={CONFIG.AWS_REGION} to " f"get listing of contents of Bucket={spark_s3_bucket_name} with Prefix={csv_path}" ) - gzipped_csv_files = [f.key for f in s3_bucket.objects.filter(Prefix=csv_path) if f.key.endswith(".csv.gz")] + gzipped_csv_files = [ + f.key + for f in s3_bucket.objects.filter(Prefix=csv_path) + if f.key.endswith(".csv.gz") + ] file_count = len(gzipped_csv_files) - logger.info(f"LOAD: Finished dumping {file_count} CSV files in {s3_bucket_with_csv_path}") + logger.info( + f"LOAD: Finished dumping {file_count} CSV files in {s3_bucket_with_csv_path}" + ) - logger.info(f"LOAD: Starting SQL bulk COPY of {file_count} CSV files to Postgres {temp_table} table") + logger.info( + f"LOAD: Starting SQL bulk COPY of {file_count} CSV files to Postgres {self.temp_table} table" + ) db_dsn = get_database_dsn_string() with psycopg2.connect(dsn=db_dsn) as connection: @@ -518,7 +617,10 @@ def _write_with_sql_bulk_copy_csv( # fraction less than 1.0. The final value will be the greater of that or # SPARK_CSV_WRITE_TO_PG_MIN_PARTITIONS partitions = max( - ceil(max_parallel_workers * CONFIG.SPARK_CSV_WRITE_TO_PG_PARALLEL_WORKER_MULTIPLIER), + ceil( + max_parallel_workers + * CONFIG.SPARK_CSV_WRITE_TO_PG_PARALLEL_WORKER_MULTIPLIER + ), CONFIG.SPARK_CSV_WRITE_TO_PG_MIN_PARTITIONS, ) @@ -534,6 +636,8 @@ def _write_with_sql_bulk_copy_csv( # into the mapped function, its module, or an arg of it ... that is not pickle-able, this will throw an error. 
# One way to help is to resolve all arguments to primitive types (int, string) that can be passed # to the mapped function + temp_table = self.temp_table + ordered_col_names = self.column_names rdd.mapPartitionsWithIndex( lambda partition_idx, s3_obj_keys: copy_csvs_from_s3_to_pg( batch_num=partition_idx, @@ -547,18 +651,19 @@ def _write_with_sql_bulk_copy_csv( ), ).collect() - logger.info(f"LOAD: Finished SQL bulk COPY of {file_count} CSV files to Postgres {temp_table} table") + logger.info( + f"LOAD: Finished SQL bulk COPY of {file_count} CSV files to Postgres {self.temp_table} table" + ) def _write_with_jdbc_inserts( self, - spark: SparkSession, df: DataFrame, temp_table: str, split_df_by_special_cols: bool = False, postgres_model: Optional[Model] = None, postgres_cols: Optional[Dict[str, str]] = None, overwrite: bool = False, - ): + ) -> None: """ Write-from-delta-to-postgres strategy that leverages the native Spark ``DataFrame.write.jdbc`` approach. This will issue a series of individual INSERT statements over a JDBC connection-per-executor. @@ -594,7 +699,10 @@ def _write_with_jdbc_inserts( # special handling. Get those columns and handle each. if split_df_by_special_cols: if postgres_model: - col_type_mapping = [(column.name, type(column)) for column in postgres_model._meta.get_fields()] + col_type_mapping = [ + (column.name, type(column)) + for column in postgres_model._meta.get_fields() + ] else: col_type_mapping = list(postgres_cols.items()) for column_name, column_type in col_type_mapping: @@ -609,14 +717,18 @@ def _write_with_jdbc_inserts( ) for i, split_df in enumerate(split_dfs): # Note: we're only appending here as we don't want to re-truncate or overwrite with multiple dataframes - logger.info(f"LOAD: Loading part {i + 1} of {split_df_count} (note: unequal part sizes)") + logger.info( + f"LOAD: Loading part {i + 1} of {split_df_count} (note: unequal part sizes)" + ) split_df.write.jdbc( url=get_usas_jdbc_url(), table=temp_table, mode=save_mode, properties=get_jdbc_connection_properties(), ) - logger.info(f"LOAD: Part {i + 1} of {split_df_count} loaded (note: unequal part sizes)") + logger.info( + f"LOAD: Part {i + 1} of {split_df_count} loaded (note: unequal part sizes)" + ) else: # Do it in one shot df.write.jdbc( @@ -625,3 +737,28 @@ def _write_with_jdbc_inserts( mode=save_mode, properties=get_jdbc_connection_properties(), ) + + def _finish( + self, + options: dict, + spark_created_by_command: bool, + spark: SparkSession, + ) -> None: + use_jdbc_inserts = options["jdbc_inserts"] + strategy = "JDBC INSERTs" if use_jdbc_inserts else "SQL bulk COPY CSV" + logger.info( + f"LOAD (FINISH): Loaded data from Delta table {self.delta_table} " + f"to {self.temp_table} using {strategy} strategy" + ) + + # We're done with spark at this point + if spark_created_by_command: + spark.stop() + + if self.postgres_table: + logger.info( + f"Note: this has merely loaded the data from Delta. For various reasons, we've separated the" + f" metadata portion of the table download to a separate script. If not already done so," + f" please run the following additional command to complete the process: " + f" 'copy_table_metadata --source-table {self.postgres_table} --dest-table {self.temp_table}'." 
+ ) diff --git a/usaspending_api/etl/management/commands/load_table_to_delta.py b/usaspending_api/etl/management/commands/load_table_to_delta.py index efe4bdd332..c1c611a4cf 100644 --- a/usaspending_api/etl/management/commands/load_table_to_delta.py +++ b/usaspending_api/etl/management/commands/load_table_to_delta.py @@ -1,322 +1,251 @@ import logging -from django.core.management import BaseCommand +from django.core.management import BaseCommand, CommandParser from pyspark.sql import functions as sf from usaspending_api.awards.delta_models import ( AWARDS_COLUMNS, - awards_sql_string, - FINANCIAL_ACCOUNTS_BY_AWARDS_COLUMNS, - financial_accounts_by_awards_sql_string, BROKER_SUBAWARDS_COLUMNS, + FINANCIAL_ACCOUNTS_BY_AWARDS_COLUMNS, + awards_sql_string, broker_subawards_sql_string, + financial_accounts_by_awards_sql_string, +) +from usaspending_api.awards.models import ( + Award, + FinancialAccountsByAwards, + TransactionFABS, + TransactionFPDS, + TransactionNormalized, +) +from usaspending_api.broker.delta_models.broker_zips import ( + ZIPS_COLUMNS, + zips_sql_string, +) +from usaspending_api.common.etl.spark import ( + extract_db_data_frame, + get_partition_bounds_sql, + load_delta_table, ) -from usaspending_api.broker.delta_models.broker_zips import ZIPS_COLUMNS, zips_sql_string -from usaspending_api.common.etl.spark import extract_db_data_frame, get_partition_bounds_sql, load_delta_table from usaspending_api.common.helpers.spark_helpers import ( configure_spark_session, get_active_spark_session, + get_broker_jdbc_url, get_jdbc_connection_properties, get_usas_jdbc_url, - get_broker_jdbc_url, ) from usaspending_api.config import CONFIG +from usaspending_api.etl.table_specs import TableSpec +from usaspending_api.etl.transaction_delta_loaders.utils import parse_date_column from usaspending_api.recipient.delta_models import ( RECIPIENT_LOOKUP_COLUMNS, - recipient_lookup_create_sql_string, - recipient_profile_create_sql_string, RECIPIENT_PROFILE_DELTA_COLUMNS, SAM_RECIPIENT_COLUMNS, + recipient_lookup_create_sql_string, + recipient_profile_create_sql_string, sam_recipient_create_sql_string, ) -from usaspending_api.search.models import TransactionSearch, AwardSearch +from usaspending_api.recipient.models import DUNS, RecipientLookup, RecipientProfile +from usaspending_api.search.delta_models.award_search import ( + AWARD_SEARCH_COLUMNS, + award_search_create_sql_string, +) +from usaspending_api.search.models import AwardSearch, TransactionSearch from usaspending_api.transactions.delta_models import ( DETACHED_AWARD_PROCUREMENT_DELTA_COLUMNS, - detached_award_procurement_create_sql_string, + PUBLISHED_FABS_DELTA_COLUMNS, TRANSACTION_FABS_VIEW_COLUMNS, - transaction_fabs_sql_string, TRANSACTION_FPDS_VIEW_COLUMNS, - transaction_fpds_sql_string, TRANSACTION_NORMALIZED_COLUMNS, - transaction_normalized_sql_string, TRANSACTION_SEARCH_POSTGRES_COLUMNS, - transaction_search_create_sql_string, - PUBLISHED_FABS_DELTA_COLUMNS, + detached_award_procurement_create_sql_string, published_fabs_create_sql_string, + transaction_fabs_sql_string, + transaction_fpds_sql_string, + transaction_normalized_sql_string, + transaction_search_create_sql_string, ) -from usaspending_api.transactions.models import SourceAssistanceTransaction -from usaspending_api.transactions.models import SourceProcurementTransaction -from usaspending_api.search.delta_models.award_search import award_search_create_sql_string, AWARD_SEARCH_COLUMNS - -from usaspending_api.recipient.models import DUNS, RecipientLookup, RecipientProfile -from 
usaspending_api.awards.models import ( - Award, - FinancialAccountsByAwards, - TransactionFABS, - TransactionFPDS, - TransactionNormalized, +from usaspending_api.transactions.models import ( + SourceAssistanceTransaction, + SourceProcurementTransaction, ) logger = logging.getLogger(__name__) TABLE_SPEC = { - "awards": { - "model": Award, - "is_from_broker": False, - "source_table": "vw_awards", - "source_database": "rpt", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": awards_sql_string, - "source_schema": None, - "custom_schema": "", - "column_names": list(AWARDS_COLUMNS), - "tsvectors": None, - "add_hash_field": False, - }, - "detached_award_procurement": { - "model": SourceProcurementTransaction, - "is_from_broker": False, - "source_table": "source_procurement_transaction", - "source_database": "raw", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "detached_award_procurement_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": detached_award_procurement_create_sql_string, - "source_schema": None, - "custom_schema": "", - "column_names": list(DETACHED_AWARD_PROCUREMENT_DELTA_COLUMNS), - "tsvectors": None, - "add_hash_field": True, - }, - "financial_accounts_by_awards": { - "model": FinancialAccountsByAwards, - "is_from_broker": False, - "source_table": "financial_accounts_by_awards", - "source_database": "public", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "financial_accounts_by_awards_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": financial_accounts_by_awards_sql_string, - "source_schema": None, - "custom_schema": "award_id LONG", - "column_names": list(FINANCIAL_ACCOUNTS_BY_AWARDS_COLUMNS), - "tsvectors": None, - "add_hash_field": False, - }, - "transaction_fabs": { - "model": TransactionFABS, - "is_from_broker": False, - "source_table": "vw_transaction_fabs", - "source_database": "int", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "transaction_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": transaction_fabs_sql_string, - "source_schema": None, - "custom_schema": "", - "column_names": TRANSACTION_FABS_VIEW_COLUMNS, - "tsvectors": None, - "add_hash_field": True, - }, - "published_fabs": { - "model": SourceAssistanceTransaction, - "is_from_broker": False, - "source_table": "source_assistance_transaction", - "source_database": "raw", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "published_fabs_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": published_fabs_create_sql_string, - "source_schema": None, - "custom_schema": "", - "column_names": list(PUBLISHED_FABS_DELTA_COLUMNS), - "tsvectors": None, - "add_hash_field": True, - }, - "transaction_fpds": { - "model": TransactionFPDS, - "is_from_broker": False, - "source_table": "vw_transaction_fpds", - "source_database": "int", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "transaction_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - 
"delta_table_create_sql": transaction_fpds_sql_string, - "source_schema": None, - "custom_schema": "", - "column_names": TRANSACTION_FPDS_VIEW_COLUMNS, - "tsvectors": None, - "add_hash_field": True, - }, - "transaction_normalized": { - "model": TransactionNormalized, - "is_from_broker": False, - "source_table": "vw_transaction_normalized", - "source_database": "int", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": transaction_normalized_sql_string, - "source_schema": None, - "custom_schema": "", - "column_names": list(TRANSACTION_NORMALIZED_COLUMNS), - "tsvectors": None, - "add_hash_field": True, - }, + "awards": TableSpec( + model=Award, + source_table="vw_awards", + source_database="rpt", + destination_database="raw", + partition_column="id", + partition_column_type="numeric", + is_partition_column_unique=True, + delta_table_create_sql=awards_sql_string, + column_names=list(AWARDS_COLUMNS), + ), + "detached_award_procurement": TableSpec( + model=SourceProcurementTransaction, + save_mode="merge", + merge_condition="t.detached_award_procurement_id == s.detached_award_procurement_id and t.hash == s.hash", + source_table="source_procurement_transaction", + source_database="raw", + destination_database="raw", + partition_column="detached_award_procurement_id", + partition_column_type="numeric", + is_partition_column_unique=True, + delta_table_create_sql=detached_award_procurement_create_sql_string, + column_names=list(DETACHED_AWARD_PROCUREMENT_DELTA_COLUMNS), + extra_columns={ + "hash": lambda: sf.xxhash64("*"), + "action_year": lambda: sf.year(sf.to_date("action_date")), + "action_month": lambda: sf.month(sf.to_date("action_date")), + }, + ), + "financial_accounts_by_awards": TableSpec( + model=FinancialAccountsByAwards, + source_table="financial_accounts_by_awards", + source_database="public", + destination_database="raw", + partition_column="financial_accounts_by_awards_id", + partition_column_type="numeric", + is_partition_column_unique=True, + delta_table_create_sql=financial_accounts_by_awards_sql_string, + custom_schema="award_id LONG", + column_names=list(FINANCIAL_ACCOUNTS_BY_AWARDS_COLUMNS), + ), + "transaction_fabs": TableSpec( + model=TransactionFABS, + source_table="vw_transaction_fabs", + source_database="int", + destination_database="raw", + partition_column="transaction_id", + partition_column_type="numeric", + is_partition_column_unique=True, + delta_table_create_sql=transaction_fabs_sql_string, + column_names=TRANSACTION_FABS_VIEW_COLUMNS, + ), + "published_fabs": TableSpec( + model=SourceAssistanceTransaction, + source_table="source_assistance_transaction", + save_mode="merge", + merge_condition="t.published_fabs_id == s.published_fabs_id and t.hash == s.hash", + source_database="raw", + destination_database="raw", + partition_column="published_fabs_id", + partition_column_type="numeric", + is_partition_column_unique=True, + delta_table_create_sql=published_fabs_create_sql_string, + column_names=list(PUBLISHED_FABS_DELTA_COLUMNS), + extra_columns={ + "hash": lambda: sf.xxhash64("*"), + "action_year": lambda: sf.year(parse_date_column("action_date")), + "action_month": lambda: sf.month(parse_date_column("action_date")), + }, + ), + "transaction_fpds": TableSpec( + model=TransactionFPDS, + source_table="vw_transaction_fpds", + source_database="int", + destination_database="raw", + partition_column="transaction_id", 
+ partition_column_type="numeric", + is_partition_column_unique=True, + delta_table_create_sql=transaction_fpds_sql_string, + custom_schema="", + column_names=TRANSACTION_FPDS_VIEW_COLUMNS, + ), + "transaction_normalized": TableSpec( + model=TransactionNormalized, + source_table="vw_transaction_normalized", + source_database="int", + destination_database="raw", + partition_column="id", + partition_column_type="numeric", + is_partition_column_unique=True, + delta_table_create_sql=transaction_normalized_sql_string, + column_names=list(TRANSACTION_NORMALIZED_COLUMNS), + ), # Tables loaded in from the Broker - "subaward": { - "model": None, - "is_from_broker": True, - "source_table": "subaward", - "source_database": None, - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": broker_subawards_sql_string, - "source_schema": None, - "custom_schema": "", - "column_names": list(BROKER_SUBAWARDS_COLUMNS), - "tsvectors": None, - "add_hash_field": False, - }, - "zips": { - "model": None, - "is_from_broker": True, - "source_table": "zips", - "source_database": None, - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "zips_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": zips_sql_string, - "source_schema": None, - "custom_schema": "", - "column_names": list(ZIPS_COLUMNS), - "tsvectors": None, - "add_hash_field": False, - }, + "subaward": TableSpec( + is_from_broker=True, + source_table="subaward", + destination_database="raw", + partition_column="id", + partition_column_type="numeric", + is_partition_column_unique=True, + delta_table_create_sql=broker_subawards_sql_string, + column_names=list(BROKER_SUBAWARDS_COLUMNS), + ), + "zips": TableSpec( + is_from_broker=True, + source_table="zips", + destination_database="raw", + partition_column="zips_id", + partition_column_type="numeric", + is_partition_column_unique=True, + delta_table_create_sql=zips_sql_string, + column_names=list(ZIPS_COLUMNS), + ), # Additional definitions for use in testing; # These are copies of Views / Materialized Views / Tables from Postgres to Spark to aid in # data comparison between current Postgres data and the data transformed via Spark. 
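# The save_mode="merge" specs above (detached_award_procurement,
# published_fabs) pair an xxhash64-derived "hash" column with a merge
# condition on the primary key plus that hash. load_delta_table itself is
# not shown in this diff, so the following is only a hedged sketch of how
# such a hash-keyed upsert can be written with the delta-spark API: match
# on the key, and rewrite only rows whose fingerprint changed.

from delta.tables import DeltaTable
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql import functions as sf


def merge_by_hash(spark: SparkSession, source: DataFrame, target_table: str, id_col: str) -> None:
    # Unchanged rows hash to the same value, so the update clause skips them.
    source = source.withColumn("hash", sf.xxhash64("*"))
    (
        DeltaTable.forName(spark, target_table)
        .alias("t")
        .merge(source.alias("s"), f"t.{id_col} = s.{id_col}")
        .whenMatchedUpdateAll(condition="t.hash != s.hash")
        .whenNotMatchedInsertAll()
        .execute()
    )

# Whether this matches load_delta_table's exact semantics is an assumption;
# the point is how the hash column turns a full overwrite into a
# change-only write.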
- "award_search_testing": { - "model": AwardSearch, - "is_from_broker": False, - "source_table": "award_search", - "source_database": None, - "destination_database": "rpt", - "swap_table": None, - "swap_schema": None, - "partition_column": "award_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": award_search_create_sql_string, - "source_schema": None, - "custom_schema": "total_covid_outlay NUMERIC(23,2), total_covid_obligation NUMERIC(23,2), recipient_hash " + "award_search_testing": TableSpec( + model=AwardSearch, + source_table="award_search", + destination_database="rpt", + partition_column="award_id", + partition_column_type="numeric", + is_partition_column_unique=True, + delta_table_create_sql=award_search_create_sql_string, + custom_schema="total_covid_outlay NUMERIC(23,2), total_covid_obligation NUMERIC(23,2), recipient_hash " "STRING, federal_accounts STRING, cfdas ARRAY, tas_components ARRAY", - "column_names": list(AWARD_SEARCH_COLUMNS), - "tsvectors": None, - "add_hash_field": False, - }, - "recipient_lookup_testing": { - "model": RecipientLookup, - "is_from_broker": False, - "source_table": "recipient_lookup", - "source_database": "rpt", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": recipient_lookup_create_sql_string, - "source_schema": None, - "custom_schema": "recipient_hash STRING", - "column_names": list(RECIPIENT_LOOKUP_COLUMNS), - "tsvectors": None, - "add_hash_field": False, - }, - "recipient_profile_testing": { - "model": RecipientProfile, - "is_from_broker": False, - "source_table": "recipient_profile", - "source_database": "rpt", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": "id", - "partition_column_type": "numeric", - "delta_table_create_sql": recipient_profile_create_sql_string, - "is_partition_column_unique": True, - "source_schema": None, - "custom_schema": "recipient_hash STRING", - "column_names": list(RECIPIENT_PROFILE_DELTA_COLUMNS), - "tsvectors": None, - "add_hash_field": False, - }, - "sam_recipient_testing": { - "model": DUNS, - "is_from_broker": False, - "source_table": "duns", - "source_database": "int", - "destination_database": "raw", - "swap_table": None, - "swap_schema": None, - "partition_column": None, - "partition_column_type": None, - "is_partition_column_unique": False, - "delta_table_create_sql": sam_recipient_create_sql_string, - "source_schema": None, - "custom_schema": "broker_duns_id STRING, business_types_codes ARRAY", - "column_names": list(SAM_RECIPIENT_COLUMNS), - "tsvectors": None, - "add_hash_field": False, - }, - "transaction_search_testing": { - "model": TransactionSearch, - "is_from_broker": False, - "source_table": "transaction_search", - "source_database": None, - "destination_database": "test", - "swap_table": None, - "swap_schema": None, - "partition_column": "transaction_id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": transaction_search_create_sql_string, - "source_schema": None, - "custom_schema": "recipient_hash STRING, federal_accounts STRING, parent_recipient_hash STRING", - "column_names": list(TRANSACTION_SEARCH_POSTGRES_COLUMNS), - "tsvectors": None, - "add_hash_field": False, - }, + column_names=list(AWARD_SEARCH_COLUMNS), + ), + "recipient_lookup_testing": TableSpec( + 
model=RecipientLookup, + source_table="recipient_lookup", + source_database="rpt", + destination_database="raw", + partition_column="id", + partition_column_type="numeric", + is_partition_column_unique=True, + delta_table_create_sql=recipient_lookup_create_sql_string, + custom_schema="recipient_hash STRING", + column_names=list(RECIPIENT_LOOKUP_COLUMNS), + ), + "recipient_profile_testing": TableSpec( + model=RecipientProfile, + source_table="recipient_profile", + source_database="rpt", + destination_database="raw", + partition_column="id", + partition_column_type="numeric", + delta_table_create_sql=recipient_profile_create_sql_string, + is_partition_column_unique=True, + custom_schema="recipient_hash STRING", + column_names=list(RECIPIENT_PROFILE_DELTA_COLUMNS), + ), + "sam_recipient_testing": TableSpec( + model=DUNS, + source_table="duns", + source_database="int", + destination_database="raw", + delta_table_create_sql=sam_recipient_create_sql_string, + custom_schema="broker_duns_id STRING, business_types_codes ARRAY", + column_names=list(SAM_RECIPIENT_COLUMNS), + ), + "transaction_search_testing": TableSpec( + model=TransactionSearch, + source_table="transaction_search", + destination_database="test", + partition_column="transaction_id", + partition_column_type="numeric", + is_partition_column_unique=True, + delta_table_create_sql=transaction_search_create_sql_string, + custom_schema="recipient_hash STRING, federal_accounts STRING, parent_recipient_hash STRING", + column_names=list(TRANSACTION_SEARCH_POSTGRES_COLUMNS), + ), } SPARK_PARTITION_ROWS = CONFIG.SPARK_PARTITION_ROWS @@ -330,7 +259,8 @@ class Command(BaseCommand): before new data is written. """ - def add_arguments(self, parser): + @staticmethod + def add_arguments(parser: CommandParser) -> None: parser.add_argument( "--destination-table", type=str, @@ -352,7 +282,7 @@ def add_arguments(self, parser): "name", ) - def handle(self, *args, **options): + def handle(self, *args, **options) -> None: extra_conf = { # Config for Delta Lake tables and SQL. 
Need these to keep Dela table metadata in the metastore
            "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
@@ -373,15 +303,16 @@ def handle(self, *args, **options):
         destination_table = options["destination_table"]
         table_spec = TABLE_SPEC[destination_table]
-        is_from_broker = table_spec["is_from_broker"]
-        destination_database = options["alt_db"] or table_spec["destination_database"]
+        is_from_broker = table_spec.is_from_broker
+        destination_database = options["alt_db"] or table_spec.destination_database
         destination_table_name = options["alt_name"] or destination_table
-        source_table = table_spec["source_table"]
-        partition_column = table_spec["partition_column"]
-        partition_column_type = table_spec["partition_column_type"]
-        is_partition_column_unique = table_spec["is_partition_column_unique"]
-        custom_schema = table_spec["custom_schema"]
-        add_hash_field = table_spec["add_hash_field"]
+        source_table = table_spec.source_table
+        partition_column = table_spec.partition_column
+        partition_column_type = table_spec.partition_column_type
+        is_partition_column_unique = table_spec.is_partition_column_unique
+        custom_schema = table_spec.custom_schema
+        save_mode = table_spec.save_mode
+        merge_condition = table_spec.merge_condition

         # Set the database that will be interacted with for all Delta Lake table Spark-based activity
         logger.info(f"Using Spark Database: {destination_database}")
@@ -390,7 +321,7 @@ def handle(self, *args, **options):
         # Resolve JDBC URL for Source Database
         jdbc_url = get_usas_jdbc_url() if not is_from_broker else get_broker_jdbc_url()
         if not jdbc_url:
-            raise RuntimeError(f"Couldn't find JDBC url, please properly configure your CONFIG.")
+            raise RuntimeError("Couldn't find JDBC url, please properly configure your CONFIG.")
         if not jdbc_url.startswith("jdbc:postgresql://"):
             raise ValueError("JDBC URL given is not in postgres JDBC URL format (e.g. jdbc:postgresql://...")
@@ -430,16 +361,17 @@ def handle(self, *args, **options):
             properties=get_jdbc_connection_properties(),
         )

-        if add_hash_field:
-            df = df.withColumn("hash", sf.xxhash64("*"))
+        extra_columns = table_spec.extra_columns if table_spec.extra_columns else {}
+        for name, column in extra_columns.items():
+            df = df.withColumn(name, column())

         # Make sure that the column order defined in the Delta table schema matches
         # that of the Spark dataframe used to pull from the Postgres table. While not
         # always needed, this should help to prevent any future mismatch between the two.
-        if table_spec.get("column_names"):
-            df = df.select(table_spec.get("column_names"))
+        if table_spec.column_names:
+            df = df.select(table_spec.column_names)

         # Write to S3
-        load_delta_table(spark, df, destination_table_name, True)
+        load_delta_table(spark, df, destination_table_name, save_mode=save_mode, merge_condition=merge_condition)

         if spark_created_by_command:
             spark.stop()
diff --git a/usaspending_api/etl/management/commands/load_transaction_fabs_in_delta.py b/usaspending_api/etl/management/commands/load_transaction_fabs_in_delta.py
index 9dbb3d64ae..2531592a5b 100644
--- a/usaspending_api/etl/management/commands/load_transaction_fabs_in_delta.py
+++ b/usaspending_api/etl/management/commands/load_transaction_fabs_in_delta.py
@@ -17,6 +17,13 @@ class Command(BaseCommand):

     @staticmethod
     def add_arguments(parser):
+        parser.add_argument(
+            "--alt-last-load-date",
+            type=str,
+            required=False,
+            default=None,
+            help="Alternative last load datetime in %%Y-%%m-%%d %%H:%%M:%%S (e.g. 2026-03-19 14:00:00) format.",
+        )
         parser.add_argument(
             "--spark-s3-bucket",
             type=str,
@@ -28,5 +35,9 @@ def add_arguments(parser):

     @staticmethod
     def handle(*args, **options):
         with prepare_spark() as spark:
-            loader = FABSDeltaTransactionLoader(spark=spark, spark_s3_bucket=options["spark_s3_bucket"])
+            loader = FABSDeltaTransactionLoader(
+                spark=spark,
+                alt_last_load_date=options["alt_last_load_date"],
+                spark_s3_bucket=options["spark_s3_bucket"],
+            )
             loader.load_transactions()
diff --git a/usaspending_api/etl/management/commands/load_transaction_fpds_in_delta.py b/usaspending_api/etl/management/commands/load_transaction_fpds_in_delta.py
index 783ab20ca1..ed8665ab9b 100644
--- a/usaspending_api/etl/management/commands/load_transaction_fpds_in_delta.py
+++ b/usaspending_api/etl/management/commands/load_transaction_fpds_in_delta.py
@@ -17,6 +17,13 @@ class Command(BaseCommand):

     @staticmethod
     def add_arguments(parser):
+        parser.add_argument(
+            "--alt-last-load-date",
+            type=str,
+            required=False,
+            default=None,
+            help="Alternative last load datetime in %%Y-%%m-%%d %%H:%%M:%%S (e.g. 2026-03-19 14:00:00) format.",
+        )
         parser.add_argument(
             "--spark-s3-bucket",
             type=str,
@@ -28,5 +35,9 @@ def add_arguments(parser):

     @staticmethod
     def handle(*args, **options):
         with prepare_spark() as spark:
-            loader = FPDSDeltaTransactionLoader(spark=spark, spark_s3_bucket=options["spark_s3_bucket"])
+            loader = FPDSDeltaTransactionLoader(
+                spark=spark,
+                alt_last_load_date=options["alt_last_load_date"],
+                spark_s3_bucket=options["spark_s3_bucket"],
+            )
             loader.load_transactions()
diff --git a/usaspending_api/etl/management/commands/load_transaction_normalized.py b/usaspending_api/etl/management/commands/load_transaction_normalized.py
index dd89e851af..590a30d22d 100644
--- a/usaspending_api/etl/management/commands/load_transaction_normalized.py
+++ b/usaspending_api/etl/management/commands/load_transaction_normalized.py
@@ -19,6 +19,13 @@ class Command(BaseCommand):

     @staticmethod
     def add_arguments(parser):
+        parser.add_argument(
+            "--alt-last-load-date",
+            type=str,
+            required=False,
+            default=None,
+            help="Alternative last load datetime in %%Y-%%m-%%d %%H:%%M:%%S (e.g.
2026-03-19 14:00:00) format.", + ) parser.add_argument( "--spark-s3-bucket", type=str, @@ -30,7 +37,15 @@ def add_arguments(parser): @staticmethod def handle(*args, **options): with prepare_spark() as spark: - fabs_loader = FABSNormalizedDeltaTransactionLoader(spark=spark, spark_s3_bucket=options["spark_s3_bucket"]) - fpds_loader = FPDSNormalizedDeltaTransactionLoader(spark=spark, spark_s3_bucket=options["spark_s3_bucket"]) + fabs_loader = FABSNormalizedDeltaTransactionLoader( + spark=spark, + alt_last_load_date=options["alt_last_load_date"], + spark_s3_bucket=options["spark_s3_bucket"], + ) + fpds_loader = FPDSNormalizedDeltaTransactionLoader( + spark=spark, + alt_last_load_date=options["alt_last_load_date"], + spark_s3_bucket=options["spark_s3_bucket"], + ) fabs_loader.load_transactions() fpds_loader.load_transactions() diff --git a/usaspending_api/etl/table_specs.py b/usaspending_api/etl/table_specs.py new file mode 100644 index 0000000000..46579952fa --- /dev/null +++ b/usaspending_api/etl/table_specs.py @@ -0,0 +1,66 @@ +from dataclasses import dataclass +from typing import Any, Callable, Literal + +from django.db import models +from pyspark.sql import Column, SparkSession +from pyspark.sql.types import StructType + + +@dataclass(kw_only=True) +class TableSpec: + destination_database: Literal["arc", "int", "raw", "rpt", "test"] + delta_table_create_sql: str | StructType + save_mode: Literal["append", "merge", "overwrite"] = "overwrite" + merge_condition: str | Column | None = None + column_names: list[str] | None = None + model: models.Model | None = None + is_from_broker: bool = False + source_table: str | None = None + source_database: Literal["public", "int", "raw", "rpt"] | None = None + swap_table: str | None = None + swap_schema: str | None = None + partition_column: str | None = None + partition_column_type: Literal["date", "numeric"] | None = None + is_partition_column_unique: bool = False + source_schema: dict[str, str] | None = None + custom_schema: str = "" + delta_table_create_options: dict[str, str | bool] | None = None + delta_table_create_partitions: list[str] | None = None + tsvectors: dict[str, list[str]] | None = None + extra_columns: dict[str, Column] | None = None + + def __post_init__(self): + if isinstance(self.delta_table_create_sql, str): + if self.delta_table_create_partitions is not None or self.delta_table_create_options is not None: + raise TypeError( + "delta_table_create_partitions and delta_table_create_options can only be used when " + "delta_table_create_sql is a StructType." 
+ ) + if self.save_mode == "merge" and self.merge_condition is None: + raise TypeError("merge_condition must be used when save_mode is merge") + + +@dataclass(kw_only=True) +class QueryTableSpec(TableSpec): + source_query: ( + str + | Callable[[SparkSession, str, str], None] + | list[str] + | list[Callable[[SparkSession, str, str], None]] + | None + ) = None + source_query_incremental: ( + str + | Callable[[SparkSession, str, str], None] + | list[str] + | list[Callable[[SparkSession, str, str], None]] + | None + ) = None + postgres_seq_name: str | None = None + postgres_partition_spec: dict[str, Any] | None = None + + +@dataclass(kw_only=True) +class ArchiveTableSpec(TableSpec): + destination_table: str + archive_date_field: str diff --git a/usaspending_api/etl/tests/data/delta_model_for_test.py b/usaspending_api/etl/tests/data/delta_model_for_test.py index a65411ee37..15841ed1fd 100644 --- a/usaspending_api/etl/tests/data/delta_model_for_test.py +++ b/usaspending_api/etl/tests/data/delta_model_for_test.py @@ -2,9 +2,13 @@ from django.db import models +from usaspending_api.etl.table_specs import TableSpec + class TestModel(models.Model): - id = models.IntegerField(primary_key=True, help_text="surrogate primary key defined in Broker") + id = models.IntegerField( + primary_key=True, help_text="surrogate primary key defined in Broker" + ) test_timestamp = models.DateTimeField(null=True, blank=True) class Meta: @@ -26,22 +30,17 @@ class Meta: """ TEST_TABLE_SPEC = { - "test_table": { - "model": TestModel, - "is_from_broker": False, - "source_table": "test_table", - "source_database": "temp", - "destination_database": "temp", - "swap_table": None, - "swap_schema": None, - "partition_column": "id", - "partition_column_type": "numeric", - "is_partition_column_unique": True, - "delta_table_create_sql": TEST_TABLE_DELTA, - "source_schema": None, - "custom_schema": "", - "column_names": ["id", "test_timestamp"], - "tsvectors": None, - "add_hash_field": False, - } + "test_table": TableSpec( + **{ + "model": TestModel, + "source_table": "test_table", + "source_database": "temp", + "destination_database": "temp", + "partition_column": "id", + "partition_column_type": "numeric", + "is_partition_column_unique": True, + "delta_table_create_sql": TEST_TABLE_DELTA, + "column_names": ["id", "test_timestamp"], + } + ) } diff --git a/usaspending_api/etl/tests/integration/test_create_delta_table.py b/usaspending_api/etl/tests/integration/test_create_delta_table.py index b36597868f..e7e4106f18 100644 --- a/usaspending_api/etl/tests/integration/test_create_delta_table.py +++ b/usaspending_api/etl/tests/integration/test_create_delta_table.py @@ -21,7 +21,7 @@ def _verify_delta_table_creation( delta_table_spec = TABLE_SPEC[delta_table_name] cmd_args = [f"--destination-table={delta_table_name}", f"--spark-s3-bucket={s3_bucket}"] - expected_db_name = delta_table_spec["destination_database"] + expected_db_name = delta_table_spec.destination_database if alt_db: cmd_args += [f"--alt-db={alt_db}"] expected_db_name = alt_db diff --git a/usaspending_api/etl/tests/integration/test_load_to_from_delta.py b/usaspending_api/etl/tests/integration/test_load_to_from_delta.py index ac383cc9e4..c66b60fb18 100644 --- a/usaspending_api/etl/tests/integration/test_load_to_from_delta.py +++ b/usaspending_api/etl/tests/integration/test_load_to_from_delta.py @@ -4,7 +4,7 @@ """ import json - +from copy import deepcopy from datetime import date, datetime, timedelta, timezone from typing import Any, Dict, List, Optional, Union @@ -12,24 
+12,25 @@ import psycopg2 import pytest import pytz - +from django.conf import settings +from django.core.management import call_command +from django.db import connection, connections, models, transaction from model_bakery import baker from pyspark.sql import SparkSession -from django.conf import settings -from django.core.management import call_command -from django.db import connection, connections, transaction, models -from usaspending_api.config import CONFIG from usaspending_api.common.helpers.sql_helpers import get_database_dsn_string from usaspending_api.etl.award_helpers import update_awards from usaspending_api.etl.broker_etl_helpers import dictfetchall from usaspending_api.etl.management.commands.create_delta_table import ( TABLE_SPEC, ) -from usaspending_api.etl.tests.data.delta_model_for_test import TestModel, TEST_TABLE_POSTGRES, TEST_TABLE_SPEC +from usaspending_api.etl.tests.data.delta_model_for_test import ( + TEST_TABLE_POSTGRES, + TEST_TABLE_SPEC, + TestModel, +) from usaspending_api.recipient.models import RecipientLookup from usaspending_api.tests.conftest_spark import create_and_load_all_delta_tables -from copy import deepcopy _NEW_ASSIST = { "published_fabs_id": 6, @@ -55,12 +56,18 @@ def _handle_string_cast(val: str) -> Union[str, dict, list]: """ if isinstance(val, list): try: - casted = [json.loads(element) if isinstance(element, str) else element for element in val] + casted = [ + json.loads(element) if isinstance(element, str) else element + for element in val + ] except (TypeError, json.decoder.JSONDecodeError): casted = [str(element) for element in val] elif isinstance(val, dict): try: - casted = {k: json.loads(element) if isinstance(element, str) else element for k, element in val.items()} + casted = { + k: json.loads(element) if isinstance(element, str) else element + for k, element in val.items() + } except (TypeError, json.decoder.JSONDecodeError): casted = {k: str(element) for k, element in val.items()} else: @@ -97,10 +104,17 @@ def equal_datasets( # Parsing custom_schema to specify schema_changes = {} - schema_type_converters = {"INT": int, "STRING": _handle_string_cast, "ARRAY": _handle_string_cast} + schema_type_converters = { + "INT": int, + "STRING": _handle_string_cast, + "ARRAY": _handle_string_cast, + } if custom_schema: for schema_change in custom_schema.split(", "): - col, new_col_type = schema_change.split()[0].strip(), schema_change.split()[1].strip() + col, new_col_type = ( + schema_change.split()[0].strip(), + schema_change.split()[1].strip(), + ) schema_changes[col] = new_col_type # Iterating through the values and finding any differences @@ -140,7 +154,9 @@ def equal_datasets( if isinstance(psql_val, list): psql_val = sorted_deep(psql_val) if isinstance(spark_val, str): - spark_val = [json.loads(idx.replace("'", '"')) for idx in [spark_val]][0] + spark_val = [ + json.loads(idx.replace("'", '"')) for idx in [spark_val] + ][0] spark_val = sorted_deep(spark_val) if psql_val != spark_val: @@ -174,7 +190,8 @@ def load_delta_table_from_postgres( call_command(load_command, *cmd_args) -def verify_delta_table_loaded_to_delta( +# TODO: Refactor and remove the "noqa" for PLR0912 +def verify_delta_table_loaded_to_delta( # noqa: PLR0912 spark: SparkSession, delta_table_name: str, s3_bucket: str, @@ -199,18 +216,20 @@ def verify_delta_table_loaded_to_delta( call_command("create_delta_table", f"--spark-s3-bucket={s3_bucket}", *cmd_args) call_command(load_command, *cmd_args) else: - load_delta_table_from_postgres(delta_table_name, s3_bucket, alt_db, 
alt_name, load_command) + load_delta_table_from_postgres( + delta_table_name, s3_bucket, alt_db, alt_name, load_command + ) if alt_name: expected_table_name = alt_name else: expected_table_name = delta_table_name.split(".")[-1] - partition_col = TABLE_SPEC[delta_table_name].get("partition_column") + partition_col = TABLE_SPEC[delta_table_name].partition_column if dummy_data is None: # get the postgres data to compare - model = TABLE_SPEC[delta_table_name]["model"] - is_from_broker = TABLE_SPEC[delta_table_name]["is_from_broker"] + model = TABLE_SPEC[delta_table_name].model + is_from_broker = TABLE_SPEC[delta_table_name].is_from_broker if delta_table_name == "summary_state_view": dummy_query = f"SELECT * from {expected_table_name}" if partition_col is not None: @@ -224,7 +243,7 @@ def verify_delta_table_loaded_to_delta( elif is_from_broker: # model can be None if loading from the Broker broker_connection = connections[settings.BROKER_DB_ALIAS] - source_broker_name = TABLE_SPEC[delta_table_name]["source_table"] + source_broker_name = TABLE_SPEC[delta_table_name].source_table with broker_connection.cursor() as cursor: dummy_query = f"SELECT * from {source_broker_name}" if partition_col is not None: @@ -244,7 +263,12 @@ def verify_delta_table_loaded_to_delta( received_query = f"{received_query} ORDER BY {partition_col}" received_data = [row.asDict() for row in spark.sql(received_query).collect()] - assert equal_datasets(dummy_data, received_data, TABLE_SPEC[delta_table_name]["custom_schema"], ignore_fields) + assert equal_datasets( + dummy_data, + received_data, + TABLE_SPEC[delta_table_name].custom_schema, + ignore_fields, + ) def verify_delta_table_loaded_from_delta( @@ -268,7 +292,7 @@ def verify_delta_table_loaded_from_delta( cmd_args += [f"--alt-delta-name={alt_name}"] expected_table_name = alt_name if jdbc_inserts: - cmd_args += [f"--jdbc-inserts"] + cmd_args += ["--jdbc-inserts"] else: if not spark_s3_bucket: raise RuntimeError( @@ -280,14 +304,18 @@ def verify_delta_table_loaded_from_delta( call_command(load_command, *cmd_args) # get the postgres data to compare - source_table = TABLE_SPEC[delta_table_name]["source_table"] or TABLE_SPEC[delta_table_name]["swap_table"] + + source_table = ( + TABLE_SPEC[delta_table_name].source_table + or TABLE_SPEC[delta_table_name].swap_table + ) temp_schema = "temp" if source_table: tmp_table_name = f"{temp_schema}.{source_table}_temp" else: tmp_table_name = f"{temp_schema}.{expected_table_name}_temp" postgres_query = f"SELECT * FROM {tmp_table_name}" - partition_col = TABLE_SPEC[delta_table_name]["partition_column"] + partition_col = TABLE_SPEC[delta_table_name].partition_column if partition_col is not None: postgres_query = f"{postgres_query} ORDER BY {partition_col}" with psycopg2.connect(dsn=get_database_dsn_string()) as connection: @@ -302,13 +330,21 @@ def verify_delta_table_loaded_from_delta( delta_data = [row.asDict() for row in spark.sql(delta_query).collect()] assert equal_datasets( - postgres_data, delta_data, TABLE_SPEC[delta_table_name]["custom_schema"], ignore_fields=ignore_fields + postgres_data, + delta_data, + TABLE_SPEC[delta_table_name].custom_schema, + ignore_fields=ignore_fields, ) -@pytest.mark.django_db(databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True) +@pytest.mark.django_db( + databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True +) def test_load_table_to_from_delta_for_recipient_lookup( - spark, s3_unittest_data_bucket, populate_usas_data_and_recipients_from_broker, 
hive_unittest_metastore_db + spark, + s3_unittest_data_bucket, + populate_usas_data_and_recipients_from_broker, + hive_unittest_metastore_db, ): # Since changes to the source tables will go to the Postgres table first, use model baker to add new rows to # Postgres table, and then push the updated table to Delta. @@ -316,20 +352,35 @@ def test_load_table_to_from_delta_for_recipient_lookup( insert_datetime = last_load_datetime + timedelta(minutes=-15) assist = deepcopy(_NEW_ASSIST) assist.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceAssistanceTransaction", **assist) load_delta_table_from_postgres("published_fabs", s3_unittest_data_bucket) procure = deepcopy(_NEW_PROCURE) procure.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceProcurementTransaction", **procure) - load_delta_table_from_postgres("detached_award_procurement", s3_unittest_data_bucket) + load_delta_table_from_postgres( + "detached_award_procurement", s3_unittest_data_bucket + ) ignore_fields = ["id", "update_date"] - tables_to_load = ["sam_recipient", "transaction_fabs", "transaction_fpds", "transaction_normalized"] + tables_to_load = [ + "sam_recipient", + "transaction_fabs", + "transaction_fpds", + "transaction_normalized", + ] create_and_load_all_delta_tables(spark, s3_unittest_data_bucket, tables_to_load) # Test initial load of Recipient Lookup @@ -364,7 +415,7 @@ def test_load_table_to_from_delta_for_recipient_lookup( award_id=new_award.award_id, is_fpds=False, type="07", - last_modified_date="2021-01-01", + last_modified_date="2021-01-01 00:00:00+00", cfda_number="12.456", recipient_uei="FABSUEI12345", recipient_unique_id="FABSDUNS12345", @@ -397,7 +448,12 @@ def test_load_table_to_from_delta_for_recipient_lookup( # Verify that the update alternate name exists expected_result = ["FABS RECIPIENT 12345"] - assert sorted(RecipientLookup.objects.filter(uei="FABSUEI12345").first().alternate_names) == expected_result + assert ( + sorted( + RecipientLookup.objects.filter(uei="FABSUEI12345").first().alternate_names + ) + == expected_result + ) tables_to_load = ["transaction_fabs", "transaction_normalized"] create_and_load_all_delta_tables(spark, s3_unittest_data_bucket, tables_to_load) @@ -409,7 +465,10 @@ def test_load_table_to_from_delta_for_recipient_lookup( ignore_fields=ignore_fields, ) verify_delta_table_loaded_from_delta( - spark, "recipient_lookup", spark_s3_bucket=s3_unittest_data_bucket, ignore_fields=ignore_fields + spark, + "recipient_lookup", + spark_s3_bucket=s3_unittest_data_bucket, + ignore_fields=ignore_fields, ) verify_delta_table_loaded_from_delta( spark, "recipient_lookup", jdbc_inserts=True, ignore_fields=ignore_fields @@ -417,7 +476,9 @@ def test_load_table_to_from_delta_for_recipient_lookup( @pytest.mark.django_db(transaction=True) -def test_load_table_to_delta_for_published_fabs(spark, s3_unittest_data_bucket, hive_unittest_metastore_db): +def test_load_table_to_delta_for_published_fabs( + spark, s3_unittest_data_bucket, hive_unittest_metastore_db +): baker.make( "transactions.SourceAssistanceTransaction", published_fabs_id=7, @@ -436,9 +497,14 @@ def 
test_load_table_to_delta_for_published_fabs(spark, s3_unittest_data_bucket, verify_delta_table_loaded_to_delta(spark, "published_fabs", s3_unittest_data_bucket) -@pytest.mark.django_db(databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True) +@pytest.mark.django_db( + databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True +) def test_load_table_to_from_delta_for_recipient_profile( - spark, s3_unittest_data_bucket, populate_usas_data_and_recipients_from_broker, hive_unittest_metastore_db + spark, + s3_unittest_data_bucket, + populate_usas_data_and_recipients_from_broker, + hive_unittest_metastore_db, ): # Since changes to the source tables will go to the Postgres table first, use model baker to add new rows to # Postgres table, and then push the updated table to Delta. @@ -446,17 +512,27 @@ def test_load_table_to_from_delta_for_recipient_profile( insert_datetime = last_load_datetime + timedelta(minutes=-15) assist = deepcopy(_NEW_ASSIST) assist.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceAssistanceTransaction", **assist) load_delta_table_from_postgres("published_fabs", s3_unittest_data_bucket) procure = deepcopy(_NEW_PROCURE) procure.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceProcurementTransaction", **procure) - load_delta_table_from_postgres("detached_award_procurement", s3_unittest_data_bucket) + load_delta_table_from_postgres( + "detached_award_procurement", s3_unittest_data_bucket + ) tables_to_load = [ "awards", @@ -469,13 +545,21 @@ def test_load_table_to_from_delta_for_recipient_profile( ] create_and_load_all_delta_tables(spark, s3_unittest_data_bucket, tables_to_load) verify_delta_table_loaded_to_delta( - spark, "recipient_profile", s3_unittest_data_bucket, load_command="load_query_to_delta", ignore_fields=["id"] + spark, + "recipient_profile", + s3_unittest_data_bucket, + load_command="load_query_to_delta", + ignore_fields=["id"], + ) + verify_delta_table_loaded_from_delta( + spark, "recipient_profile", jdbc_inserts=True, ignore_fields=["id"] ) - verify_delta_table_loaded_from_delta(spark, "recipient_profile", jdbc_inserts=True, ignore_fields=["id"]) @pytest.mark.django_db(transaction=True) -def test_load_table_to_delta_timezone_aware(spark, monkeypatch, s3_unittest_data_bucket, hive_unittest_metastore_db): +def test_load_table_to_delta_timezone_aware( + spark, monkeypatch, s3_unittest_data_bucket, hive_unittest_metastore_db +): """Test that timestamps are not inadvertently shifted due to loss of timezone during reads and writes. 
The big takeaways from this are: @@ -513,7 +597,10 @@ def test_load_table_to_delta_timezone_aware(spark, monkeypatch, s3_unittest_data with new_psycopg2_conn.cursor() as cursor: cursor.execute(TEST_TABLE_POSTGRES) TABLE_SPEC.update(TEST_TABLE_SPEC) - monkeypatch.setattr("usaspending_api.etl.management.commands.load_table_to_delta.TABLE_SPEC", TABLE_SPEC) + monkeypatch.setattr( + "usaspending_api.etl.management.commands.load_table_to_delta.TABLE_SPEC", + TABLE_SPEC, + ) # Prepare a model object without saving it, but do save the related fields # - https://model-bakery.readthedocs.io/en/latest/basic_usage.html#non-persistent-objects @@ -528,14 +615,22 @@ def test_load_table_to_delta_timezone_aware(spark, monkeypatch, s3_unittest_data populated_columns = ("id", "test_timestamp") def _get_sql_insert_from_model(model, populated_columns): - values = [value for value in model._meta.local_fields if value.column in populated_columns] + values = [ + value + for value in model._meta.local_fields + if value.column in populated_columns + ] q = models.sql.InsertQuery(model) q.insert_values(values, [model]) compiler = q.get_compiler("default") - setattr(compiler, "return_id", False) + compiler.return_id = False stmts = compiler.as_sql() stmt = [ - stmt % tuple(f"'{param}'" if type(param) in [str, date, datetime] else param for param in params) + stmt + % tuple( + f"'{param}'" if type(param) in [str, date, datetime] else param + for param in params + ) for stmt, params in stmts ] return stmt[0] @@ -544,7 +639,9 @@ def _get_sql_insert_from_model(model, populated_columns): with psycopg2.connect(get_database_dsn_string()) as new_psycopg2_conn: with new_psycopg2_conn.cursor() as cursor: cursor.execute("set session time zone 'HST'") - fabs_insert_sql = _get_sql_insert_from_model(model_with_tz, populated_columns) + fabs_insert_sql = _get_sql_insert_from_model( + model_with_tz, populated_columns + ) cursor.execute(fabs_insert_sql) assert cursor.rowcount == 1 new_psycopg2_conn.commit() @@ -567,14 +664,24 @@ def _get_sql_insert_from_model(model, populated_columns): # or with raw SQL), it will apply those time zone settings assert model_datetime.tzname() != "HST" assert model_datetime.tzname() == "UTC" - assert model_datetime.hour == 21 # shifted +10 to counteract the UTC offset by django upon saving it - assert model_datetime.utctimetuple().tm_hour == 21 # already shifted to UTC, so this just matches .hour (== 21) - assert dt_naive.utctimetuple().tm_hour == dt_naive.hour # naive, so stays the same - assert dt_with_utc.utctimetuple().tm_hour == dt_with_utc.hour # already UTC, so stays the same + assert ( + model_datetime.hour == 21 + ) # shifted +10 to counteract the UTC offset by django upon saving it + assert ( + model_datetime.utctimetuple().tm_hour == 21 + ) # already shifted to UTC, so this just matches .hour (== 21) + assert ( + dt_naive.utctimetuple().tm_hour == dt_naive.hour + ) # naive, so stays the same + assert ( + dt_with_utc.utctimetuple().tm_hour == dt_with_utc.hour + ) # already UTC, so stays the same # Confirm also that this is the case in the DB (i.e. 
it was at write-time that UTC was set, not read-time) with connection.cursor() as cursor: - cursor.execute("select test_table.test_timestamp from test_table where id = 3") + cursor.execute( + "select test_table.test_timestamp from test_table where id = 3" + ) dt_from_db = [row[0] for row in cursor.fetchall()][0] # type: datetime assert dt_from_db.tzinfo is not None assert dt_from_db.tzname() == "UTC" @@ -590,7 +697,9 @@ def _get_sql_insert_from_model(model, populated_columns): with psycopg2.connect(get_database_dsn_string()) as new_psycopg2_conn: with new_psycopg2_conn.cursor() as cursor: cursor.execute("set session time zone 'HST'") - cursor.execute("select test_table.test_timestamp from test_table where id = 3") + cursor.execute( + "select test_table.test_timestamp from test_table where id = 3" + ) dt_from_db = [row[0] for row in cursor.fetchall()][0] # type: datetime assert dt_from_db.tzinfo is not None # Can't use traditional time zone names with tzname() since psycopg2 uses its own time zone infos. @@ -614,7 +723,9 @@ def _get_sql_insert_from_model(model, populated_columns): @pytest.mark.django_db(transaction=True) -def test_load_table_to_delta_for_detached_award_procurement(spark, s3_unittest_data_bucket, hive_unittest_metastore_db): +def test_load_table_to_delta_for_detached_award_procurement( + spark, s3_unittest_data_bucket, hive_unittest_metastore_db +): baker.make( "transactions.SourceProcurementTransaction", detached_award_procurement_id="4", @@ -632,13 +743,20 @@ def test_load_table_to_delta_for_detached_award_procurement(spark, s3_unittest_d _fill_optional=True, ) - verify_delta_table_loaded_to_delta(spark, "detached_award_procurement", s3_unittest_data_bucket) + verify_delta_table_loaded_to_delta( + spark, "detached_award_procurement", s3_unittest_data_bucket + ) @pytest.mark.django_db(transaction=True) -@pytest.mark.skip(reason="Due to the nature of the views with all the transformations, this will be out of date") +@pytest.mark.skip( + reason="Due to the nature of the views with all the transformations, this will be out of date" +) def test_load_table_to_from_delta_for_recipient_profile_testing( - spark, s3_unittest_data_bucket, populate_usas_data_and_recipients_from_broker, hive_unittest_metastore_db + spark, + s3_unittest_data_bucket, + populate_usas_data_and_recipients_from_broker, + hive_unittest_metastore_db, ): tables_to_load = [ "recipient_lookup", @@ -649,13 +767,21 @@ def test_load_table_to_from_delta_for_recipient_profile_testing( ] create_and_load_all_delta_tables(spark, s3_unittest_data_bucket, tables_to_load) verify_delta_table_loaded_to_delta( - spark, "recipient_profile_testing", s3_unittest_data_bucket, load_command="load_table_to_delta" + spark, + "recipient_profile_testing", + s3_unittest_data_bucket, + load_command="load_table_to_delta", ) -@pytest.mark.django_db(databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True) +@pytest.mark.django_db( + databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True +) def test_load_table_to_from_delta_for_transaction_search( - spark, s3_unittest_data_bucket, populate_usas_data_and_recipients_from_broker, hive_unittest_metastore_db + spark, + s3_unittest_data_bucket, + populate_usas_data_and_recipients_from_broker, + hive_unittest_metastore_db, ): # Since changes to the source tables will go to the Postgres table first, use model baker to add new rows to # Postgres table, and then push the updated table to Delta. 
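# Illustrative aside, not part of this patch: a minimal standalone sketch of the
# timezone behavior that test_load_table_to_delta_timezone_aware asserts above.
# utctimetuple() shifts timezone-aware datetimes to UTC but leaves naive ones
# untouched; the date and the fixed-offset stand-in for 'HST' are assumptions.
from datetime import datetime, timedelta, timezone

hst = timezone(timedelta(hours=-10))  # fixed-offset stand-in for the 'HST' session zone
dt_naive = datetime(2022, 6, 11, 11, 0, 0)
dt_with_utc = datetime(2022, 6, 11, 11, 0, 0, tzinfo=timezone.utc)
dt_with_hst = datetime(2022, 6, 11, 11, 0, 0, tzinfo=hst)

assert dt_naive.utctimetuple().tm_hour == dt_naive.hour        # naive: unchanged
assert dt_with_utc.utctimetuple().tm_hour == dt_with_utc.hour  # already UTC: unchanged
assert dt_with_hst.utctimetuple().tm_hour == 21                # 11:00 at UTC-10 -> 21:00 UTC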
@@ -663,17 +789,27 @@ def test_load_table_to_from_delta_for_transaction_search( insert_datetime = last_load_datetime + timedelta(minutes=-15) assist = deepcopy(_NEW_ASSIST) assist.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceAssistanceTransaction", **assist) load_delta_table_from_postgres("published_fabs", s3_unittest_data_bucket) procure = deepcopy(_NEW_PROCURE) procure.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceProcurementTransaction", **procure) - load_delta_table_from_postgres("detached_award_procurement", s3_unittest_data_bucket) + load_delta_table_from_postgres( + "detached_award_procurement", s3_unittest_data_bucket + ) tables_to_load = [ "awards", @@ -707,7 +843,10 @@ def test_load_table_to_from_delta_for_transaction_search( ) @pytest.mark.django_db(transaction=True) def test_load_table_to_from_delta_for_transaction_search_testing( - spark, s3_unittest_data_bucket, populate_usas_data_and_recipients_from_broker, hive_unittest_metastore_db + spark, + s3_unittest_data_bucket, + populate_usas_data_and_recipients_from_broker, + hive_unittest_metastore_db, ): # TODO: Commenting these out while we have `transaction_search_gold` vs `transaction_search` in the TABLE_SPEC # as by design the data in delta will be different from the data in postgres @@ -723,8 +862,12 @@ def test_load_table_to_from_delta_for_transaction_search_testing( def test_load_table_to_delta_for_transaction_normalized_alt_db_and_name( spark, s3_unittest_data_bucket, hive_unittest_metastore_db ): - baker.make("search.TransactionSearch", transaction_id="1", award_id=1, _fill_optional=True) - baker.make("search.TransactionSearch", transaction_id="2", award_id=2, _fill_optional=True) + baker.make( + "search.TransactionSearch", transaction_id="1", award_id=1, _fill_optional=True + ) + baker.make( + "search.TransactionSearch", transaction_id="2", award_id=2, _fill_optional=True + ) verify_delta_table_loaded_to_delta( spark, "transaction_normalized", @@ -735,9 +878,14 @@ def test_load_table_to_delta_for_transaction_normalized_alt_db_and_name( @pytest.mark.django_db(transaction=True) -@pytest.mark.skip(reason="Due to the nature of the views with all the transformations, this will be out of date") +@pytest.mark.skip( + reason="Due to the nature of the views with all the transformations, this will be out of date" +) def test_load_table_to_from_delta_for_transaction_search_alt_db_and_name( - spark, s3_unittest_data_bucket, populate_usas_data_and_recipients_from_broker, hive_unittest_metastore_db + spark, + s3_unittest_data_bucket, + populate_usas_data_and_recipients_from_broker, + hive_unittest_metastore_db, ): tables_to_load = [ "awards", @@ -771,9 +919,14 @@ def test_load_table_to_from_delta_for_transaction_search_alt_db_and_name( # ) -@pytest.mark.django_db(databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True) +@pytest.mark.django_db( + databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True +) def test_load_table_to_from_delta_for_award_search( - spark, s3_unittest_data_bucket, populate_usas_data_and_recipients_from_broker, 
hive_unittest_metastore_db + spark, + s3_unittest_data_bucket, + populate_usas_data_and_recipients_from_broker, + hive_unittest_metastore_db, ): # Since changes to the source tables will go to the Postgres table first, use model baker to add new rows to # Postgres table, and then push the updated table to Delta. @@ -781,17 +934,27 @@ def test_load_table_to_from_delta_for_award_search( insert_datetime = last_load_datetime + timedelta(minutes=-15) assist = deepcopy(_NEW_ASSIST) assist.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceAssistanceTransaction", **assist) load_delta_table_from_postgres("published_fabs", s3_unittest_data_bucket) procure = deepcopy(_NEW_PROCURE) procure.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceProcurementTransaction", **procure) - load_delta_table_from_postgres("detached_award_procurement", s3_unittest_data_bucket) + load_delta_table_from_postgres( + "detached_award_procurement", s3_unittest_data_bucket + ) tables_to_load = [ "awards", @@ -807,32 +970,54 @@ def test_load_table_to_from_delta_for_award_search( ] create_and_load_all_delta_tables(spark, s3_unittest_data_bucket, tables_to_load) verify_delta_table_loaded_to_delta( - spark, "award_search", s3_unittest_data_bucket, load_command="load_query_to_delta" + spark, + "award_search", + s3_unittest_data_bucket, + load_command="load_query_to_delta", ) - verify_delta_table_loaded_from_delta(spark, "award_search", spark_s3_bucket=s3_unittest_data_bucket) - verify_delta_table_loaded_from_delta(spark, "award_search", jdbc_inserts=True) # test alt write strategy + verify_delta_table_loaded_from_delta( + spark, "award_search", spark_s3_bucket=s3_unittest_data_bucket + ) + verify_delta_table_loaded_from_delta( + spark, "award_search", jdbc_inserts=True + ) # test alt write strategy -@pytest.mark.django_db(databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True) +@pytest.mark.django_db( + databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True +) def test_incremental_load_table_to_delta_for_award_search( - spark, s3_unittest_data_bucket, populate_usas_data_and_recipients_from_broker, hive_unittest_metastore_db + spark, + s3_unittest_data_bucket, + populate_usas_data_and_recipients_from_broker, + hive_unittest_metastore_db, ): # Load in data that award_search depends on last_load_datetime = datetime.now(timezone.utc) insert_datetime = last_load_datetime + timedelta(minutes=-15) assist = deepcopy(_NEW_ASSIST) assist.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceAssistanceTransaction", **assist) load_delta_table_from_postgres("published_fabs", s3_unittest_data_bucket) procure = deepcopy(_NEW_PROCURE) procure.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + 
"updated_at": insert_datetime, + } ) baker.make("transactions.SourceProcurementTransaction", **procure) - load_delta_table_from_postgres("detached_award_procurement", s3_unittest_data_bucket) + load_delta_table_from_postgres( + "detached_award_procurement", s3_unittest_data_bucket + ) tables_to_load = [ "awards", @@ -852,14 +1037,14 @@ def test_incremental_load_table_to_delta_for_award_search( call_command( "create_delta_table", f"--spark-s3-bucket={s3_unittest_data_bucket}", - f"--destination-table=award_search", + "--destination-table=award_search", "--alt-db=int", ) # load in award_search data call_command( "load_query_to_delta", - f"--destination-table=award_search", + "--destination-table=award_search", "--incremental", "--alt-db=int", ) @@ -870,7 +1055,7 @@ def test_incremental_load_table_to_delta_for_award_search( # Reload the data call_command( "load_query_to_delta", - f"--destination-table=award_search", + "--destination-table=award_search", "--incremental", "--alt-db=int", ) @@ -886,33 +1071,55 @@ def test_incremental_load_table_to_delta_for_award_search( expected = pd.DataFrame( { "award_id": [4, 4, 1, 3, 2, 4], - "_change_type": ["delete", "insert", "insert", "insert", "insert", "insert"], + "_change_type": [ + "delete", + "insert", + "insert", + "insert", + "insert", + "insert", + ], "_commit_version": [2, 3, 1, 1, 1, 1], } ) pd.testing.assert_frame_equal(result, expected) -@pytest.mark.django_db(databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True) +@pytest.mark.django_db( + databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True +) def test_incremental_load_table_to_delta_for_transaction_search( - spark, s3_unittest_data_bucket, populate_usas_data_and_recipients_from_broker, hive_unittest_metastore_db + spark, + s3_unittest_data_bucket, + populate_usas_data_and_recipients_from_broker, + hive_unittest_metastore_db, ): # Load in data that transaction_search depends on last_load_datetime = datetime.now(timezone.utc) insert_datetime = last_load_datetime + timedelta(minutes=-15) assist = deepcopy(_NEW_ASSIST) assist.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceAssistanceTransaction", **assist) load_delta_table_from_postgres("published_fabs", s3_unittest_data_bucket) procure = deepcopy(_NEW_PROCURE) procure.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } ) baker.make("transactions.SourceProcurementTransaction", **procure) - load_delta_table_from_postgres("detached_award_procurement", s3_unittest_data_bucket) + load_delta_table_from_postgres( + "detached_award_procurement", s3_unittest_data_bucket + ) tables_to_load = [ "awards", @@ -932,14 +1139,14 @@ def test_incremental_load_table_to_delta_for_transaction_search( call_command( "create_delta_table", f"--spark-s3-bucket={s3_unittest_data_bucket}", - f"--destination-table=transaction_search", + "--destination-table=transaction_search", "--alt-db=int", ) # load in award_search data call_command( "load_query_to_delta", - f"--destination-table=transaction_search", + "--destination-table=transaction_search", "--incremental", "--alt-db=int", ) @@ -950,7 +1157,7 @@ def 
test_incremental_load_table_to_delta_for_transaction_search( # Reload the data call_command( "load_query_to_delta", - f"--destination-table=transaction_search", + "--destination-table=transaction_search", "--incremental", "--alt-db=int", ) @@ -966,14 +1173,25 @@ def test_incremental_load_table_to_delta_for_transaction_search( expected = pd.DataFrame( { "transaction_id": [4, 4, 1, 2, 434, 3, 4, 5], - "_change_type": ["delete", "insert", "insert", "insert", "insert", "insert", "insert", "insert"], + "_change_type": [ + "delete", + "insert", + "insert", + "insert", + "insert", + "insert", + "insert", + "insert", + ], "_commit_version": [2, 3, 1, 1, 1, 1, 1, 1], } ) pd.testing.assert_frame_equal(result, expected) -@pytest.mark.django_db(databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True) +@pytest.mark.django_db( + databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True +) def test_load_table_to_delta_for_sam_recipient( spark, s3_unittest_data_bucket, populate_broker_data, hive_unittest_metastore_db ): @@ -1001,7 +1219,11 @@ def test_load_table_to_delta_for_sam_recipient( } ] verify_delta_table_loaded_to_delta( - spark, "sam_recipient", s3_unittest_data_bucket, load_command="load_query_to_delta", dummy_data=expected_data + spark, + "sam_recipient", + s3_unittest_data_bucket, + load_command="load_query_to_delta", + dummy_data=expected_data, ) @@ -1009,15 +1231,21 @@ def test_load_table_to_delta_for_sam_recipient( settings.BROKER_DB_ALIAS not in settings.DATABASES, reason="'data_broker' database not configured in django settings.DATABASES.", ) -@pytest.mark.django_db(databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True) +@pytest.mark.django_db( + databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True +) def test_load_table_to_delta_for_summary_state_view( - spark, s3_unittest_data_bucket, populate_usas_data_and_recipients_from_broker, hive_unittest_metastore_db + spark, + s3_unittest_data_bucket, + populate_usas_data_and_recipients_from_broker, + hive_unittest_metastore_db, ): - # We need the award_search table to create the summary_state_view in delta # And in order to create the award_search table, we need the following load_delta_table_from_postgres("published_fabs", s3_unittest_data_bucket) - load_delta_table_from_postgres("detached_award_procurement", s3_unittest_data_bucket) + load_delta_table_from_postgres( + "detached_award_procurement", s3_unittest_data_bucket + ) tables_to_load = [ "awards", @@ -1033,39 +1261,47 @@ def test_load_table_to_delta_for_summary_state_view( ] create_and_load_all_delta_tables(spark, s3_unittest_data_bucket, tables_to_load) verify_delta_table_loaded_to_delta( - spark, "award_search", s3_unittest_data_bucket, load_command="load_query_to_delta" + spark, + "award_search", + s3_unittest_data_bucket, + load_command="load_query_to_delta", ) - # We now want to load the award_search table that we created above along with other tables needed to create award_search - # Then create the summay_state_view table and populate it using the load_query_to_delta command - tables_to_load = ["transaction_fabs", "transaction_fpds", "transaction_normalized", "award_search"] + # We now want to load the award_search table that we created above along with other tables needed to create + # award_search. Then create the summary_state_view table and populate it using the load_query_to_delta command. 
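# Illustrative aside, not part of this patch: a hypothetical driver sketching the
# two-step flow the comment above describes. The command names and flags are the
# ones these tests already invoke; the build_summary_state_view() wrapper itself
# is an invented name, shown only to make the create-then-populate ordering explicit.
from django.core.management import call_command

def build_summary_state_view(s3_bucket: str) -> None:
    # Step 1: create the empty Delta table from its TABLE_SPEC definition.
    call_command(
        "create_delta_table",
        "--destination-table=summary_state_view",
        f"--spark-s3-bucket={s3_bucket}",
    )
    # Step 2: populate it by running its source query against the upstream Delta tables.
    call_command("load_query_to_delta", "--destination-table=summary_state_view")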
+ tables_to_load = [ + "transaction_fabs", + "transaction_fpds", + "transaction_normalized", + "award_search", + ] create_and_load_all_delta_tables(spark, s3_unittest_data_bucket, tables_to_load) verify_delta_table_loaded_to_delta( - spark, "summary_state_view", s3_unittest_data_bucket, load_command="load_query_to_delta" + spark, + "summary_state_view", + s3_unittest_data_bucket, + load_command="load_query_to_delta", ) # Lastly, check using verify_delta_table_loaded_from_delta function which will run the load_table_from_delta command - verify_delta_table_loaded_from_delta(spark, "summary_state_view", spark_s3_bucket=s3_unittest_data_bucket) + verify_delta_table_loaded_from_delta( + spark, "summary_state_view", spark_s3_bucket=s3_unittest_data_bucket + ) -@pytest.mark.django_db(databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True) +@pytest.mark.django_db( + databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True +) def test_load_object_class_program_activity_class( spark, s3_unittest_data_bucket, hive_unittest_metastore_db, populate_usas_data_and_recipients_from_broker, - monkeypatch, ): call_command( "create_delta_table", "--destination-table=object_class_program_activity_download", f"--spark-s3-bucket={s3_unittest_data_bucket}", ) - monkeypatch.setattr( - f"usaspending_api.download.delta_downloads.object_class_program_activity.ObjectClassProgramActivityMixin.download_table", - spark.read.format("delta").load( - f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/object_class_program_activity_download" - ), - ) verify_delta_table_loaded_to_delta( spark, @@ -1076,17 +1312,19 @@ def test_load_object_class_program_activity_class( ) -@pytest.mark.django_db(databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True) +@pytest.mark.django_db( + databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True +) def test_load_award_financial_download( spark, s3_unittest_data_bucket, populate_usas_data_and_recipients_from_broker, hive_unittest_metastore_db, - monkeypatch, ): - load_delta_table_from_postgres("published_fabs", s3_unittest_data_bucket) - load_delta_table_from_postgres("detached_award_procurement", s3_unittest_data_bucket) + load_delta_table_from_postgres( + "detached_award_procurement", s3_unittest_data_bucket + ) tables_to_load = [ "awards", @@ -1103,7 +1341,10 @@ def test_load_award_financial_download( create_and_load_all_delta_tables(spark, s3_unittest_data_bucket, tables_to_load) verify_delta_table_loaded_to_delta( - spark, "award_search", s3_unittest_data_bucket, load_command="load_query_to_delta" + spark, + "award_search", + s3_unittest_data_bucket, + load_command="load_query_to_delta", ) verify_delta_table_loaded_to_delta( spark, @@ -1124,12 +1365,6 @@ def test_load_award_financial_download( "--destination-table=award_financial_download", f"--spark-s3-bucket={s3_unittest_data_bucket}", ) - monkeypatch.setattr( - f"usaspending_api.download.delta_downloads.award_financial.AwardFinancialMixin.download_table", - spark.read.format("delta").load( - f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/award_financial_download" - ), - ) expected_data = [ { @@ -1232,12 +1467,13 @@ def test_load_award_financial_download( ) -@pytest.mark.django_db(databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True) +@pytest.mark.django_db( + databases=[settings.BROKER_DB_ALIAS, settings.DEFAULT_DB_ALIAS], transaction=True +) def 
test_load_account_balances_download( spark, s3_unittest_data_bucket, hive_unittest_metastore_db, - monkeypatch, populate_usas_data_and_recipients_from_broker, ): call_command( @@ -1245,12 +1481,6 @@ def test_load_account_balances_download( "--destination-table=account_balances_download", f"--spark-s3-bucket={s3_unittest_data_bucket}", ) - monkeypatch.setattr( - f"usaspending_api.download.delta_downloads.account_balances.AccountBalancesMixin.download_table", - spark.read.format("delta").load( - f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/rpt/account_balances_download" - ), - ) verify_delta_table_loaded_to_delta( spark, diff --git a/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_fabs_fpds.py b/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_fabs_fpds.py new file mode 100644 index 0000000000..2a897a4b07 --- /dev/null +++ b/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_fabs_fpds.py @@ -0,0 +1,726 @@ +"""Automated Unit Tests for the loading of transaction and award tables in Delta Lake. + +NOTE: Uses Pytest Fixtures from immediate parent conftest.py: usaspending_api/etl/tests/conftest.py +""" + +from copy import deepcopy +from datetime import datetime, timedelta, timezone + +from django.core.management import call_command +from model_bakery import baker +from pytest import mark + +from usaspending_api.broker.helpers.last_load_date import ( + get_last_load_date, + update_last_load_date, +) +from usaspending_api.config import CONFIG +from usaspending_api.etl.management.commands.load_table_to_delta import TABLE_SPEC +from usaspending_api.etl.tests.integration.test_load_to_from_delta import ( + equal_datasets, + load_delta_table_from_postgres, +) +from usaspending_api.etl.tests.integration.test_load_transactions_in_delta_lookups import ( + _BEGINNING_OF_TIME, + _INITIAL_SOURCE_TABLE_LOAD_DATETIME, + _InitialRunWithPostgresLoader, + _TableLoadInfo, +) +from usaspending_api.etl.tests.integration.test_load_transactions_in_delta_lookups import ( + TestInitialRun as InitialRun, # Remove 'test' prefix to avoid pytest running these tests twice +) +from usaspending_api.etl.tests.integration.test_load_transactions_in_delta_lookups import ( + # Remove 'test' prefix to avoid pytest running these tests twice + TestInitialRunNoPostgresLoader as InitialRunNoPostgresLoader, +) + + +class _TransactionFabsFpdsCore: + + new_transaction_fabs_fpds_id = 6 + new_transaction_id = 11 + + def __init__( + self, + spark, + s3_data_bucket, + etl_level, + pk_field, + compare_fields, + usas_source_table_name, + broker_source_table_name, + baker_table, + baker_kwargs, + expected_initial_transaction_fabs, + expected_initial_transaction_fpds, + ): + self.spark = spark + self.s3_data_bucket = s3_data_bucket + self.etl_level = etl_level + self.pk_field = pk_field + self.usas_source_table_name = usas_source_table_name + self.broker_source_table_name = broker_source_table_name + self.baker_table = baker_table + self.compare_fields = compare_fields + self.baker_kwargs = baker_kwargs + self.expected_initial_transaction_fabs = expected_initial_transaction_fabs + self.expected_initial_transaction_fpds = expected_initial_transaction_fpds + + def unexpected_paths_source_tables_only_test_core(self): + # Setup some source tables without data, this test does not require these tables to be populated + raw_db = "raw" + self.spark.sql(f"create database if not exists {raw_db};") + self.spark.sql(f"use {raw_db};") + self.spark.sql( + 
TABLE_SPEC["published_fabs"].delta_table_create_sql.format( + DESTINATION_TABLE="published_fabs", + DESTINATION_DATABASE=raw_db, + SPARK_S3_BUCKET=self.s3_data_bucket, + DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, + ) + ) + self.spark.sql( + TABLE_SPEC["detached_award_procurement"].delta_table_create_sql.format( + DESTINATION_TABLE="detached_award_procurement", + DESTINATION_DATABASE=raw_db, + SPARK_S3_BUCKET=self.s3_data_bucket, + DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, + ) + ) + + # 1. Call load_transactions_in_delta with etl-level of initial_run first, but without first loading + # raw.transaction_normalized or raw.awards. Then immediately call load_transactions_in_delta with + # etl-level of transaction_f[ab|pd]s. + InitialRun.initial_run(self.s3_data_bucket) + call_command("load_transactions_in_delta", "--etl-level", self.etl_level) + + # Verify the transaction and award id lookup tables and other int transaction tables. They should all be empty. + kwargs = { + "expected_last_load_transaction_id_lookup": _BEGINNING_OF_TIME, + "expected_last_load_award_id_lookup": _BEGINNING_OF_TIME, + "expected_last_load_transaction_normalized": _BEGINNING_OF_TIME, + "expected_last_load_transaction_fabs": _BEGINNING_OF_TIME, + "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, + } + # Even though nothing will have been loaded to that table, the table whose etl_level has been called will + # have its last load date set to the date of the source tables' load. + kwargs[f"expected_last_load_{self.etl_level}"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) + InitialRun.verify(self.spark, [], [], **kwargs) + + # 2. With raw.transaction_normalized and raw.awards still not created, call load_transactions_in_delta + # with etl-level of transaction_id_lookup, and then again with etl-level of transaction_f[ab|pd]s. + + # Since the call to load_transactions_in_delta with etl-level of transaction_f[ab|pd]s above succeeded, we first + # need to reset the last load date on transaction_fabs + update_last_load_date(self.etl_level, _BEGINNING_OF_TIME) + + call_command( + "load_transactions_in_delta", "--etl-level", "transaction_id_lookup" + ) + call_command("load_transactions_in_delta", "--etl-level", self.etl_level) + + # The expected transaction_id_lookup table should be the same as in _InitialRunWithPostgresLoader, + # but all of the transaction ids should be 1 larger than expected there. + expected_transaction_id_lookup = deepcopy( + _InitialRunWithPostgresLoader.expected_initial_transaction_id_lookup + ) + for item in expected_transaction_id_lookup: + item["transaction_id"] += 1 + # Also, the last load date of the transaction_id_lookup table and of the table whose etl_level is being + # called should be updated to the load time of the source tables + kwargs["expected_last_load_transaction_id_lookup"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) + kwargs[f"expected_last_load_{self.etl_level}"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) + InitialRun.verify( + self.spark, + expected_transaction_id_lookup, + [], + 0, + len(self.expected_initial_transaction_fabs), + len(self.expected_initial_transaction_fpds), + **kwargs, + ) + + # Verify key fields in transaction_f[ab|pd]s table. 
Note that the transaction_ids should be 1 more than + # in those from _InitialRunWithPostgresLoader + query = f"SELECT {', '.join(self.compare_fields)} FROM int.{self.etl_level} ORDER BY {self.pk_field}" + delta_data = [row.asDict() for row in self.spark.sql(query).collect()] + + if len(self.expected_initial_transaction_fabs) > 0: + expected_transaction_fabs_fpds = deepcopy( + self.expected_initial_transaction_fabs + ) + else: + expected_transaction_fabs_fpds = deepcopy( + self.expected_initial_transaction_fpds + ) + for item in expected_transaction_fabs_fpds: + item["transaction_id"] += 1 + assert equal_datasets(expected_transaction_fabs_fpds, delta_data, "") + + def unexpected_paths_test_core( + self, + load_other_raw_tables, + expected_initial_transaction_id_lookup, + expected_initial_award_id_lookup, + ): + # 1. Call load_transactions_in_delta with etl-level of initial_run first, making sure to load + # raw.transaction_normalized along with the source tables, but don't copy the raw tables to int. + # Then immediately call load_transactions_in_delta with etl-level of transaction_f[ab|pd]s. + InitialRun.initial_run( + self.s3_data_bucket, + load_other_raw_tables=load_other_raw_tables, + initial_copy=False, + ) + call_command("load_transactions_in_delta", "--etl-level", self.etl_level) + + # Even without the call to load_transactions_in_delta with etl-level of transaction_id_lookup, the appropriate + # data will be populated in the transaction_id_lookup table via initial_run to allow the call to + # load_transactions_in_delta with etl-level of transaction_fabs to populate int.transaction_fabs correctly with + # the initial data. + kwargs = { + "expected_last_load_transaction_id_lookup": _INITIAL_SOURCE_TABLE_LOAD_DATETIME, + "expected_last_load_award_id_lookup": _INITIAL_SOURCE_TABLE_LOAD_DATETIME, + "expected_last_load_transaction_normalized": _BEGINNING_OF_TIME, + "expected_last_load_transaction_fabs": _BEGINNING_OF_TIME, + "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, + } + kwargs[f"expected_last_load_{self.etl_level}"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) + InitialRun.verify( + self.spark, + expected_initial_transaction_id_lookup, + expected_initial_award_id_lookup, + 0, + len(self.expected_initial_transaction_fabs), + len(self.expected_initial_transaction_fpds), + **kwargs, + ) + + # Verify key fields in transaction_fabs table. + query = f"SELECT {', '.join(self.compare_fields)} FROM int.{self.etl_level} ORDER BY {self.pk_field}" + delta_data = [row.asDict() for row in self.spark.sql(query).collect()] + if len(self.expected_initial_transaction_fabs) > 0: + assert equal_datasets( + self.expected_initial_transaction_fabs, delta_data, "" + ) + else: + assert equal_datasets( + self.expected_initial_transaction_fpds, delta_data, "" + ) + + # 2. Test inserting, updating, and deleting without calling load_transactions_in_delta with etl-level + # of transaction_id_lookup before calling load_transactions_in_delta with etl-level of transaction_f[ab|pd]s. + + # Since changes to the source tables will go to the Postgres table first, use model baker to add new rows to + # Postgres table, and then push the updated table to Delta. 
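# Illustrative aside, not part of this patch: a hypothetical condensation of the
# stage-in-Postgres-then-mirror-to-Delta pattern that the code directly below
# follows. Every callable is one this module already imports; only the
# _stage_new_source_row wrapper is invented.
def _stage_new_source_row(baker_table, usas_table, broker_table, s3_bucket, **baker_kwargs):
    staged_at = datetime.now(timezone.utc)
    baker.make(baker_table, **baker_kwargs)                # 1. add the new row to Postgres
    update_last_load_date(broker_table, staged_at)         # 2. record the broker load time
    load_delta_table_from_postgres(usas_table, s3_bucket)  # 3. mirror the table into raw Delta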
+ last_load_datetime = datetime.now(timezone.utc) + insert_update_datetime = last_load_datetime + timedelta(minutes=-15) + self.baker_kwargs.update( + { + "action_date": insert_update_datetime.isoformat(), + "created_at": insert_update_datetime, + "updated_at": insert_update_datetime, + } + ) + baker.make(self.baker_table, **self.baker_kwargs) + update_last_load_date(self.broker_source_table_name, last_load_datetime) + load_delta_table_from_postgres(self.usas_source_table_name, self.s3_data_bucket) + + self.spark.sql( + f""" + UPDATE raw.{self.usas_source_table_name} + SET updated_at = '{insert_update_datetime}' + WHERE {self.pk_field} = 4 OR {self.pk_field} = 5 + """ + ) + + self.spark.sql( + f""" + DELETE FROM raw.{self.usas_source_table_name} + WHERE {self.pk_field} = 2 OR {self.pk_field} = 3 + """ + ) + + call_command("load_transactions_in_delta", "--etl-level", self.etl_level) + + # Verify the transaction and award id lookup tables. Without a call to load_transactions_in_delta with an + # --etl-level of transaction_id_lookup or award_id_lookup, they should be the same as during the initial run. + InitialRun.verify( + self.spark, + expected_initial_transaction_id_lookup, + expected_initial_award_id_lookup, + 0, + len(self.expected_initial_transaction_fabs), + len(self.expected_initial_transaction_fpds), + **kwargs, + ) + + # Verify key fields in transaction_f[ab|pd]s table + query = f"SELECT {', '.join(self.compare_fields)} FROM int.{self.etl_level} ORDER BY {self.pk_field}" + delta_data = [row.asDict() for row in self.spark.sql(query).collect()] + + # With no call to load_transactions_in_delta with etl-level of transaction_id_lookup, the above call to + # load_transactions_in_delta with etl-level of transaction_f[ab|pd]s *should* pick up the *updates* in the + # published f[ab|pd]s table because those transactions already exist in the transaction_id_lookup table. + # However, this call should *NOT* pick up the inserts or deletes, since those transactions will not + # have changed in the transaction_id_lookup table. + if len(self.expected_initial_transaction_fabs) > 0: + expected_transaction_fabs_fpds = deepcopy( + self.expected_initial_transaction_fabs + ) + else: + expected_transaction_fabs_fpds = deepcopy( + self.expected_initial_transaction_fpds + ) + expected_transaction_fabs_fpds[-2]["updated_at"] = insert_update_datetime + expected_transaction_fabs_fpds[-1]["updated_at"] = insert_update_datetime + assert equal_datasets(expected_transaction_fabs_fpds, delta_data, "") + + def unexpected_paths_no_pg_loader_test_core(self): + self.unexpected_paths_test_core( + [ + _TableLoadInfo( + self.spark, + "transaction_normalized", + InitialRunNoPostgresLoader.initial_transaction_normalized, + ) + ], + InitialRunNoPostgresLoader.expected_initial_transaction_id_lookup, + InitialRunNoPostgresLoader.expected_initial_award_id_lookup, + ) + + def happy_paths_test_core( + self, + load_other_raw_tables, + expected_initial_transaction_id_lookup, + expected_initial_award_id_lookup, + expected_transaction_id_lookup_pops, + expected_transaction_id_lookup_append, + expected_transaction_fabs_fpds_append, + ): + # 1, Test calling load_transactions_in_delta with etl-level of transaction_f[ab|pd]s after calling with + # etl-levels of initial_run and transaction_id_lookup. 
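# Illustrative aside, not part of this patch: a hypothetical sketch of the
# etl-level ordering this happy path exercises. transaction_id_lookup must run
# before the dependent transaction level so the new surrogate ids exist for the
# transaction_f[ab|pd]s load to join against; only the wrapper name is invented,
# the call_command invocations are the ones used throughout this module.
def _run_transaction_etl(etl_level: str) -> None:
    call_command("load_transactions_in_delta", "--etl-level", "transaction_id_lookup")
    call_command("load_transactions_in_delta", "--etl-level", etl_level)  # e.g. "transaction_fabs"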
+ InitialRun.initial_run( + self.s3_data_bucket, load_other_raw_tables=load_other_raw_tables + ) + call_command( + "load_transactions_in_delta", "--etl-level", "transaction_id_lookup" + ) + call_command("load_transactions_in_delta", "--etl-level", self.etl_level) + + # Verify the tables. The transaction and award id lookup tables should be the same as during the initial run. + # The transaction_normalized and transaction_f[ab|pd]s tables should have been copied from raw to int. + kwargs = { + "expected_last_load_transaction_id_lookup": _INITIAL_SOURCE_TABLE_LOAD_DATETIME, + "expected_last_load_award_id_lookup": _INITIAL_SOURCE_TABLE_LOAD_DATETIME, + "expected_last_load_transaction_normalized": _INITIAL_SOURCE_TABLE_LOAD_DATETIME, + "expected_last_load_transaction_fabs": _BEGINNING_OF_TIME, + "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, + } + kwargs[f"expected_last_load_{self.etl_level}"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) + InitialRun.verify( + self.spark, + expected_initial_transaction_id_lookup, + expected_initial_award_id_lookup, + len(expected_initial_transaction_id_lookup), + len(self.expected_initial_transaction_fabs), + len(self.expected_initial_transaction_fpds), + **kwargs, + ) + + # Verify key fields in transaction_fabs table + transaction_fabs_fpds_query = f""" + SELECT {', '.join(self.compare_fields)} + FROM int.{self.etl_level} + ORDER BY {self.pk_field} + """ + delta_data = [ + row.asDict() + for row in self.spark.sql(transaction_fabs_fpds_query).collect() + ] + if len(self.expected_initial_transaction_fabs) > 0: + assert equal_datasets( + self.expected_initial_transaction_fabs, delta_data, "" + ) + else: + assert equal_datasets( + self.expected_initial_transaction_fpds, delta_data, "" + ) + + # 2. Test inserting, updating, and deleting records followed by calling load_transactions_in_delta with + # etl-levels of transaction_id_lookup and then transaction_f[ab|pd]s. + + # Since changes to the source tables will go to the Postgres table first, use model baker to add new rows to + # Postgres table, and then push the updated table to Delta. 
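# Illustrative aside, not part of this patch: the last-load bookkeeping used just
# below is a simple name -> datetime store. update_last_load_date() advances it
# (as done here for the broker source table) and can equally rewind it, as the
# _BEGINNING_OF_TIME reset earlier in this module shows. A hypothetical rewind
# helper built only from the imported functions:
def _rewind_etl_levels(etl_levels: list[str]) -> None:
    for etl_level in etl_levels:
        # The next load for this level will reprocess from the beginning of time.
        update_last_load_date(etl_level, _BEGINNING_OF_TIME)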
+ last_load_datetime = datetime.now(timezone.utc) + insert_update_datetime = last_load_datetime + timedelta(minutes=-15) + self.baker_kwargs.update( + { + "action_date": insert_update_datetime.isoformat(), + "created_at": insert_update_datetime, + "updated_at": insert_update_datetime, + } + ) + baker.make(self.baker_table, **self.baker_kwargs) + update_last_load_date(self.broker_source_table_name, last_load_datetime) + load_delta_table_from_postgres(self.usas_source_table_name, self.s3_data_bucket) + + self.spark.sql( + f""" + UPDATE raw.{self.usas_source_table_name} + SET updated_at = '{insert_update_datetime}' + WHERE {self.pk_field} = 4 OR {self.pk_field} = 5 + """ + ) + + self.spark.sql( + f""" + DELETE FROM raw.{self.usas_source_table_name} + WHERE {self.pk_field} = 2 OR {self.pk_field} = 3 + """ + ) + + self.spark.sql( + f""" + UPDATE raw.{self.usas_source_table_name} + SET place_of_perform_country_c = 'UNITED STATES' + WHERE {self.pk_field} = 4 OR {self.pk_field} = 5 + """ + ) + + self.spark.sql( + f""" + UPDATE raw.{self.usas_source_table_name} + SET legal_entity_country_code = 'UNITED STATES' + WHERE {self.pk_field} = 4 OR {self.pk_field} = 5 + """ + ) + + self.spark.sql( + f""" + UPDATE raw.{self.usas_source_table_name} + SET place_of_perform_country_n = 'USA' + WHERE {self.pk_field} = 4 OR {self.pk_field} = 5 + """ + ) + + self.spark.sql( + f""" + UPDATE raw.{self.usas_source_table_name} + SET legal_entity_country_name = 'USA' + WHERE {self.pk_field} = 4 OR {self.pk_field} = 5 + """ + ) + + # Need to load changes into the transaction_id_lookup table. + call_command( + "load_transactions_in_delta", "--etl-level", "transaction_id_lookup" + ) + call_command("load_transactions_in_delta", "--etl-level", self.etl_level) + + # Verify transaction_id_lookup table + query = "SELECT * FROM int.transaction_id_lookup ORDER BY transaction_id" + delta_data = [row.asDict() for row in self.spark.sql(query).collect()] + + expected_transaction_id_lookup = deepcopy( + expected_initial_transaction_id_lookup + ) + for pop_index in expected_transaction_id_lookup_pops: + expected_transaction_id_lookup.pop(pop_index) + expected_transaction_id_lookup_append.update( + { + "transaction_id": self.new_transaction_id, + } + ) + expected_transaction_id_lookup.append(expected_transaction_id_lookup_append) + assert equal_datasets(expected_transaction_id_lookup, delta_data, "") + + # Verify country code scalar transformation + query = f""" + SELECT DISTINCT legal_entity_country_code, place_of_perform_country_c + FROM int.{self.etl_level} + WHERE {self.pk_field} = 4 OR {self.pk_field} = 5 + """ + delta_data = [row.asDict() for row in self.spark.sql(query).collect()] + assert len(delta_data) == 1 + assert delta_data[0]["legal_entity_country_code"] == "USA" + assert delta_data[0]["place_of_perform_country_c"] == "USA" + + # Verify country name scalar transformation + query = f""" + SELECT DISTINCT legal_entity_country_name, place_of_perform_country_n + FROM int.{self.etl_level} + WHERE {self.pk_field} = 4 OR {self.pk_field} = 5 + """ + delta_data = [row.asDict() for row in self.spark.sql(query).collect()] + assert len(delta_data) == 1 + assert delta_data[0]["legal_entity_country_name"] == "UNITED STATES" + assert delta_data[0]["place_of_perform_country_n"] == "UNITED STATES" + + # Verify key fields in transaction_f[ab|pd]s table + delta_data = [ + row.asDict() + for row in self.spark.sql(transaction_fabs_fpds_query).collect() + ] + + if len(self.expected_initial_transaction_fabs) > 0: + 
expected_transaction_fabs_fpds = deepcopy( + self.expected_initial_transaction_fabs + ) + else: + expected_transaction_fabs_fpds = deepcopy( + self.expected_initial_transaction_fpds + ) + expected_transaction_fabs_fpds.pop(1) + expected_transaction_fabs_fpds.pop(1) + expected_transaction_fabs_fpds[-2]["updated_at"] = insert_update_datetime + expected_transaction_fabs_fpds[-1]["updated_at"] = insert_update_datetime + expected_transaction_fabs_fpds_append.update( + { + "transaction_id": self.new_transaction_id, + "action_date": insert_update_datetime.date().isoformat(), + "created_at": insert_update_datetime, + "updated_at": insert_update_datetime, + } + ) + expected_transaction_fabs_fpds.append(expected_transaction_fabs_fpds_append) + assert equal_datasets(expected_transaction_fabs_fpds, delta_data, "") + + # Verify that the last_load_dates of the transaction_id_lookup table and the table whose etl_level has been + # called did NOT change, since only one of the broker source tables' last load date was changed. + assert ( + get_last_load_date("transaction_id_lookup") + == _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) + assert get_last_load_date(self.etl_level) == _INITIAL_SOURCE_TABLE_LOAD_DATETIME + + def happy_paths_no_pg_loader_test_core( + self, + initial_transaction_fabs_fpds, + expected_transaction_id_lookup_pops, + expected_transaction_id_lookup_append, + expected_transaction_fabs_fpds_append, + ): + self.happy_paths_test_core( + ( + _TableLoadInfo( + self.spark, + "transaction_normalized", + InitialRunNoPostgresLoader.initial_transaction_normalized, + ), + _TableLoadInfo( + self.spark, + self.etl_level, + initial_transaction_fabs_fpds, + ), + _TableLoadInfo( + self.spark, "awards", InitialRunNoPostgresLoader.initial_awards + ), + ), + InitialRunNoPostgresLoader.expected_initial_transaction_id_lookup, + InitialRunNoPostgresLoader.expected_initial_award_id_lookup, + expected_transaction_id_lookup_pops, + expected_transaction_id_lookup_append, + expected_transaction_fabs_fpds_append, + ) + + +class TestTransactionFabs: + + etl_level = "transaction_fabs" + pk_field = "published_fabs_id" + usas_source_table_name = "published_fabs" + broker_source_table_name = "source_assistance_transaction" + baker_table = "transactions.SourceAssistanceTransaction" + compare_fields = _InitialRunWithPostgresLoader.expected_initial_transaction_fabs[ + 0 + ].keys() + new_afa_generated_unique = "award_assist_0004_trans_0001" + new_unique_award_key = "award_assist_0004" + baker_kwargs = { + "published_fabs_id": _TransactionFabsFpdsCore.new_transaction_fabs_fpds_id, + "afa_generated_unique": new_afa_generated_unique, + "is_active": True, + "unique_award_key": new_unique_award_key, + } + expected_transaction_id_lookup_append = { + "is_fpds": False, + "transaction_unique_id": new_afa_generated_unique.upper(), + } + expected_transaction_fabs_fpds_append = { + "afa_generated_unique": new_afa_generated_unique.upper(), + "is_active": True, + "published_fabs_id": _TransactionFabsFpdsCore.new_transaction_fabs_fpds_id, + "unique_award_key": new_unique_award_key.upper(), + } + + def _generate_transaction_fabs_fpds_core( + self, spark, s3_data_bucket, expected_initial_transaction_fabs + ): + return _TransactionFabsFpdsCore( + spark, + s3_data_bucket, + self.etl_level, + self.pk_field, + self.compare_fields, + self.usas_source_table_name, + self.broker_source_table_name, + self.baker_table, + deepcopy(self.baker_kwargs), + expected_initial_transaction_fabs, + [], + ) + + @mark.django_db(transaction=True) + def 
test_unexpected_paths_source_tables_only( + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, + ): + transaction_fabs_fpds_core = self._generate_transaction_fabs_fpds_core( + spark, + s3_unittest_data_bucket, + _InitialRunWithPostgresLoader.expected_initial_transaction_fabs, + ) + transaction_fabs_fpds_core.unexpected_paths_source_tables_only_test_core() + + @mark.django_db(transaction=True) + def test_unexpected_paths_no_pg_loader( + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, + ): + transaction_fabs_fpds_core = self._generate_transaction_fabs_fpds_core( + spark, + s3_unittest_data_bucket, + InitialRunNoPostgresLoader.initial_transaction_fabs, + ) + transaction_fabs_fpds_core.unexpected_paths_no_pg_loader_test_core() + + @mark.django_db(transaction=True) + def test_happy_paths_no_pg_loader( + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, + ): + transaction_fabs_fpds_core = self._generate_transaction_fabs_fpds_core( + spark, + s3_unittest_data_bucket, + InitialRunNoPostgresLoader.initial_transaction_fabs, + ) + transaction_fabs_fpds_core.happy_paths_no_pg_loader_test_core( + InitialRunNoPostgresLoader.initial_transaction_fabs, + (2, 3), + self.expected_transaction_id_lookup_append, + self.expected_transaction_fabs_fpds_append, + ) + + +class TestTransactionFpds: + + etl_level = "transaction_fpds" + pk_field = "detached_award_procurement_id" + usas_source_table_name = "detached_award_procurement" + broker_source_table_name = "source_procurement_transaction" + baker_table = "transactions.SourceProcurementTransaction" + compare_fields = _InitialRunWithPostgresLoader.expected_initial_transaction_fpds[ + 0 + ].keys() + new_detached_award_proc_unique = "award_procure_0004_trans_0001" + new_unique_award_key = "award_procure_0004" + baker_kwargs = { + "detached_award_procurement_id": _TransactionFabsFpdsCore.new_transaction_fabs_fpds_id, + "detached_award_proc_unique": new_detached_award_proc_unique, + "unique_award_key": new_unique_award_key, + } + expected_transaction_id_lookup_append = { + "is_fpds": True, + "transaction_unique_id": new_detached_award_proc_unique.upper(), + } + expected_transaction_fabs_fpds_append = { + "detached_award_proc_unique": new_detached_award_proc_unique.upper(), + "detached_award_procurement_id": _TransactionFabsFpdsCore.new_transaction_fabs_fpds_id, + "unique_award_key": new_unique_award_key.upper(), + } + + def _generate_transaction_fabs_fpds_core( + self, spark, s3_data_bucket, expected_initial_transaction_fpds + ): + return _TransactionFabsFpdsCore( + spark, + s3_data_bucket, + self.etl_level, + self.pk_field, + self.compare_fields, + self.usas_source_table_name, + self.broker_source_table_name, + self.baker_table, + deepcopy(self.baker_kwargs), + [], + expected_initial_transaction_fpds, + ) + + @mark.django_db(transaction=True) + def test_unexpected_paths_source_tables_only( + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, + ): + transaction_fabs_fpds_core = self._generate_transaction_fabs_fpds_core( + spark, + s3_unittest_data_bucket, + _InitialRunWithPostgresLoader.expected_initial_transaction_fpds, + ) + transaction_fabs_fpds_core.unexpected_paths_source_tables_only_test_core() + + @mark.django_db(transaction=True) + def test_unexpected_paths_no_pg_loader( + self, + spark, + s3_unittest_data_bucket, + 
hive_unittest_metastore_db, + _populate_initial_source_tables_pg, + ): + transaction_fabs_fpds_core = self._generate_transaction_fabs_fpds_core( + spark, + s3_unittest_data_bucket, + InitialRunNoPostgresLoader.initial_transaction_fpds, + ) + transaction_fabs_fpds_core.unexpected_paths_no_pg_loader_test_core() + + @mark.django_db(transaction=True) + def test_happy_paths_no_pg_loader( + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, + ): + transaction_fabs_fpds_core = self._generate_transaction_fabs_fpds_core( + spark, + s3_unittest_data_bucket, + InitialRunNoPostgresLoader.initial_transaction_fpds, + ) + transaction_fabs_fpds_core.happy_paths_no_pg_loader_test_core( + InitialRunNoPostgresLoader.initial_transaction_fpds, + (3, 4), + self.expected_transaction_id_lookup_append, + self.expected_transaction_fabs_fpds_append, + ) diff --git a/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_lookups.py b/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_lookups.py new file mode 100644 index 0000000000..b623129860 --- /dev/null +++ b/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_lookups.py @@ -0,0 +1,1882 @@ +"""Automated Unit Tests for the loading of transaction and award tables in Delta Lake. + +NOTE: Uses Pytest Fixtures from immediate parent conftest.py: usaspending_api/etl/tests/conftest.py +""" + +import re +from copy import deepcopy +from dataclasses import dataclass +from datetime import datetime, timedelta, timezone +from typing import Any, Dict, Optional, Sequence +from unittest.mock import patch + +import dateutil +import pyspark +from django.core.management import call_command +from django.db import connection +from model_bakery import baker +from pyspark.sql import SparkSession +from pytest import mark, raises + +from usaspending_api.broker.helpers.last_load_date import ( + get_last_load_date, + update_last_load_date, +) +from usaspending_api.common.helpers.spark_helpers import load_dict_to_delta_table +from usaspending_api.config import CONFIG +from usaspending_api.etl.management.commands.load_table_to_delta import TABLE_SPEC +from usaspending_api.etl.tests.integration.test_load_to_from_delta import ( + equal_datasets, + load_delta_table_from_postgres, +) +from usaspending_api.transactions.delta_models.transaction_fabs import ( + TRANSACTION_FABS_COLUMNS, +) +from usaspending_api.transactions.delta_models.transaction_fpds import ( + TRANSACTION_FPDS_COLUMNS, +) +from usaspending_api.transactions.delta_models.transaction_normalized import ( + TRANSACTION_NORMALIZED_COLUMNS, +) + +_BEGINNING_OF_TIME = datetime(1970, 1, 1, tzinfo=timezone.utc) +_INITIAL_DATETIME = datetime(2022, 10, 31, tzinfo=timezone.utc) +_INITIAL_SOURCE_TABLE_LOAD_DATETIME = _INITIAL_DATETIME + timedelta(hours=12) +_INITIAL_ASSISTS = [ + { + "published_fabs_id": 1, + "afa_generated_unique": "award_assist_0001_trans_0001", + "action_date": _INITIAL_DATETIME.isoformat(), + "created_at": _INITIAL_DATETIME, + "updated_at": _INITIAL_DATETIME, + "is_active": True, + "unique_award_key": "award_assist_0001", + }, + { + "published_fabs_id": 2, + "afa_generated_unique": "award_assist_0002_trans_0001", + "action_date": _INITIAL_DATETIME.isoformat(), + "created_at": _INITIAL_DATETIME, + "updated_at": _INITIAL_DATETIME, + "is_active": True, + "unique_award_key": "award_assist_0002", + }, + { + "published_fabs_id": 3, + "afa_generated_unique": "award_assist_0002_trans_0002", + # Deliberately formatting 
this action_date somewhat unusually. + "action_date": _INITIAL_DATETIME.strftime("%Y%m%d"), + "created_at": _INITIAL_DATETIME, + "updated_at": _INITIAL_DATETIME, + "is_active": True, + "unique_award_key": "award_assist_0002", + }, + { + "published_fabs_id": 4, + "afa_generated_unique": "award_assist_0003_trans_0001", + # Deliberately formatting this action_date somewhat unusually. + "action_date": _INITIAL_DATETIME.strftime("%Y%m%d"), + "created_at": _INITIAL_DATETIME, + "updated_at": _INITIAL_DATETIME, + "is_active": True, + "unique_award_key": "award_assist_0003", + }, + { + "published_fabs_id": 5, + "afa_generated_unique": "award_assist_0003_trans_0002", + "action_date": _INITIAL_DATETIME.isoformat(), + "created_at": _INITIAL_DATETIME, + "updated_at": _INITIAL_DATETIME, + "is_active": True, + "unique_award_key": "award_assist_0003", + }, +] +_INITIAL_PROCURES = [ + { + "detached_award_procurement_id": 1, + "detached_award_proc_unique": "award_procure_0001_trans_0001", + "action_date": _INITIAL_DATETIME.isoformat(), + "created_at": _INITIAL_DATETIME, + "updated_at": _INITIAL_DATETIME, + "unique_award_key": "award_procure_0001", + }, + { + "detached_award_procurement_id": 2, + "detached_award_proc_unique": "award_procure_0002_trans_0001", + "action_date": _INITIAL_DATETIME.isoformat(), + "created_at": _INITIAL_DATETIME, + "updated_at": _INITIAL_DATETIME, + "unique_award_key": "award_procure_0002", + }, + { + "detached_award_procurement_id": 3, + "detached_award_proc_unique": "award_procure_0002_trans_0002", + # Deliberately formatting this action_date somewhat unusually. + "action_date": _INITIAL_DATETIME.strftime("%Y%m%d"), + "created_at": _INITIAL_DATETIME, + "updated_at": _INITIAL_DATETIME, + "unique_award_key": "award_procure_0002", + }, + { + "detached_award_procurement_id": 4, + "detached_award_proc_unique": "award_procure_0003_trans_0001", + # Deliberately formatting this action_date somewhat unusually. 
+ "action_date": _INITIAL_DATETIME.strftime("%Y%m%d"), + "created_at": _INITIAL_DATETIME, + "updated_at": _INITIAL_DATETIME, + "unique_award_key": "award_procure_0003", + }, + { + "detached_award_procurement_id": 5, + "detached_award_proc_unique": "award_procure_0003_trans_0002", + "action_date": _INITIAL_DATETIME.isoformat(), + "created_at": _INITIAL_DATETIME, + "updated_at": _INITIAL_DATETIME, + "unique_award_key": "award_procure_0003", + }, +] +_NEW_ASSIST = { + "published_fabs_id": 6, + "afa_generated_unique": "award_assist_0004_trans_0001", + "is_active": True, + "unique_award_key": "award_assist_0004", +} +_NEW_PROCURE = { + "detached_award_procurement_id": 6, + "detached_award_proc_unique": "award_procure_0004_trans_0001", + "unique_award_key": "award_procure_0004", +} + + +@dataclass +class _TableLoadInfo: + spark: SparkSession + table_name: str + data: Sequence[Dict[str, Any]] + overwrite: Optional[bool] = False + + +def _load_tables_to_delta( + s3_data_bucket, load_source_tables=True, load_other_raw_tables=None +): + if load_source_tables: + load_delta_table_from_postgres("published_fabs", s3_data_bucket) + load_delta_table_from_postgres("detached_award_procurement", s3_data_bucket) + + if load_other_raw_tables: + for item in load_other_raw_tables: + if isinstance(item, _TableLoadInfo): + load_dict_to_delta_table( + item.spark, + s3_data_bucket, + "raw", + item.table_name, + item.data, + item.overwrite, + ) + else: + load_delta_table_from_postgres(item, s3_data_bucket) + + +class TestInitialRun: + @staticmethod + def initial_run( + s3_data_bucket, + load_source_tables=True, + load_other_raw_tables=None, + initial_copy=True, + ): + _load_tables_to_delta(s3_data_bucket, load_source_tables, load_other_raw_tables) + call_params = [ + "load_transactions_in_delta", + "--etl-level", + "initial_run", + "--spark-s3-bucket", + s3_data_bucket, + ] + if not initial_copy: + call_params.append("--no-initial-copy") + call_command(*call_params) + + @staticmethod + def verify_transaction_ids( + spark, expected_transaction_id_lookup, expected_last_load=None + ): + # Verify transaction_id_lookup table + query = "SELECT * FROM int.transaction_id_lookup ORDER BY transaction_id" + delta_data = [row.asDict() for row in spark.sql(query).collect()] + assert equal_datasets(expected_transaction_id_lookup, delta_data, "") + + # Verify max transaction id + with connection.cursor() as cursor: + cursor.execute("SELECT nextval('transaction_id_seq')") + # Since all calls to setval() set the is_called flag to false, nextval() returns the actual maximum id + max_transaction_id = cursor.fetchone()[0] + if expected_transaction_id_lookup: + assert max_transaction_id == max( + [ + transaction["transaction_id"] + for transaction in expected_transaction_id_lookup + ] + ) + else: + assert max_transaction_id == 1 + + # Since this test just called nextval(), need to reset the sequence with the is_called flag set to false + # so that the next call to nextval() will return the same value. 
+        with connection.cursor() as cursor:
+            cursor.execute(
+                f"SELECT setval('transaction_id_seq', {max_transaction_id}, false)"
+            )
+
+    @staticmethod
+    def verify_award_ids(spark, expected_award_id_lookup, expected_last_load=None):
+        # Verify award_id_lookup table
+        query = (
+            "SELECT * FROM int.award_id_lookup ORDER BY award_id, transaction_unique_id"
+        )
+        delta_data = [row.asDict() for row in spark.sql(query).collect()]
+        assert equal_datasets(expected_award_id_lookup, delta_data, "")
+
+        # Verify max award id
+        with connection.cursor() as cursor:
+            cursor.execute("SELECT nextval('award_id_seq')")
+            # Since all calls to setval() set the is_called flag to false, nextval() returns the actual maximum id
+            max_award_id = cursor.fetchone()[0]
+        if expected_award_id_lookup:
+            assert max_award_id == max(
+                [award["award_id"] for award in expected_award_id_lookup]
+            )
+        else:
+            assert max_award_id == 1
+
+        # Since this test just called nextval(), need to reset the sequence with the is_called flag set to false
+        # so that the next call to nextval() will return the same value.
+        with connection.cursor() as cursor:
+            cursor.execute(f"SELECT setval('award_id_seq', {max_award_id}, false)")
+
+    @staticmethod
+    def verify_lookup_info(
+        spark,
+        expected_transaction_id_lookup,
+        expected_award_id_lookup,
+        expected_last_load_transaction_id_lookup=None,
+        expected_last_load_award_id_lookup=None,
+    ):
+        TestInitialRun.verify_transaction_ids(
+            spark,
+            expected_transaction_id_lookup,
+            expected_last_load_transaction_id_lookup,
+        )
+        TestInitialRun.verify_award_ids(
+            spark, expected_award_id_lookup, expected_last_load_award_id_lookup
+        )
+
+    @staticmethod
+    def verify_raw_vs_int_tables(spark, table_name, col_names):
+        # Make sure the raw and int versions of the given table match
+        result = spark.sql(
+            f"""
+            SELECT {', '.join(col_names)} FROM int.{table_name}
+            MINUS
+            SELECT {', '.join(col_names)} FROM raw.{table_name}
+            """
+        ).collect()
+        assert len(result) == 0
+
+        result = spark.sql(
+            f"""
+            SELECT {', '.join(col_names)} FROM raw.{table_name}
+            MINUS
+            SELECT {', '.join(col_names)} FROM int.{table_name}
+            """
+        ).collect()
+        assert len(result) == 0
+
+    @staticmethod
+    def verify(
+        spark,
+        expected_transaction_id_lookup,
+        expected_award_id_lookup,
+        expected_normalized_count=0,
+        expected_fabs_count=0,
+        expected_fpds_count=0,
+        expected_last_load_transaction_id_lookup=None,
+        expected_last_load_award_id_lookup=None,
+        expected_last_load_transaction_normalized=None,
+        expected_last_load_transaction_fabs=None,
+        expected_last_load_transaction_fpds=None,
+    ):
+        TestInitialRun.verify_lookup_info(
+            spark,
+            expected_transaction_id_lookup,
+            expected_award_id_lookup,
+            expected_last_load_transaction_id_lookup,
+            expected_last_load_award_id_lookup,
+        )
+
+        # int.award_ids_delete_modified should exist, but be empty
+        actual_count = spark.sql(
+            "SELECT COUNT(*) AS count from int.award_ids_delete_modified"
+        ).collect()[0]["count"]
+        assert actual_count == 0
+
+        # Make sure int.transaction_[normalized,fabs,fpds] tables have been created and have the expected sizes.
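+        # Aside: verify_raw_vs_int_tables() above expresses "the two tables are identical" as
+        # an empty symmetric difference: (int MINUS raw) and (raw MINUS int) must both come
+        # back empty. MINUS is Spark SQL's synonym for EXCEPT. The same idea in miniature
+        # (table names illustrative):
+        #
+        #     def tables_match(spark, left: str, right: str) -> bool:
+        #         a = spark.sql(f"SELECT * FROM {left} MINUS SELECT * FROM {right}")
+        #         b = spark.sql(f"SELECT * FROM {right} MINUS SELECT * FROM {left}")
+        #         return a.count() == 0 and b.count() == 0  # equal as sets of rows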
+ for table_name, expected_count, _expected_last_load, col_names in zip( + (f"transaction_{t}" for t in ("normalized", "fabs", "fpds")), + (expected_normalized_count, expected_fabs_count, expected_fpds_count), + ( + expected_last_load_transaction_normalized, + expected_last_load_transaction_fabs, + expected_last_load_transaction_fpds, + ), + ( + list(TRANSACTION_NORMALIZED_COLUMNS), + TRANSACTION_FABS_COLUMNS, + TRANSACTION_FPDS_COLUMNS, + ), + strict=False, + ): + actual_count = spark.sql( + f"SELECT COUNT(*) AS count from int.{table_name}" + ).collect()[0]["count"] + assert actual_count == expected_count + + if expected_count > 0: + # Only verify raw vs int tables if raw table exists + try: + spark.sql(f"SELECT 1 FROM raw.{table_name}") + except pyspark.sql.utils.AnalysisException as e: + if re.match( + rf"^\[TABLE_OR_VIEW_NOT_FOUND\] The table or view `raw`\.`{table_name}` cannot be found\..*$", + str(e), + re.MULTILINE, + ): + pass + else: + raise e + else: + TestInitialRun.verify_raw_vs_int_tables( + spark, table_name, col_names + ) + + @mark.django_db(transaction=True) + def test_edge_cases_using_only_source_tables( + self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db + ): + # Setup some source tables without data, this test does not require these tables to be populated + raw_db = "raw" + spark.sql(f"create database if not exists {raw_db};") + spark.sql(f"use {raw_db};") + spark.sql( + TABLE_SPEC["published_fabs"].delta_table_create_sql.format( + DESTINATION_TABLE="published_fabs", + DESTINATION_DATABASE=raw_db, + SPARK_S3_BUCKET=s3_unittest_data_bucket, + DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, + ) + ) + spark.sql( + TABLE_SPEC["detached_award_procurement"].delta_table_create_sql.format( + DESTINATION_TABLE="detached_award_procurement", + DESTINATION_DATABASE=raw_db, + SPARK_S3_BUCKET=s3_unittest_data_bucket, + DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, + ) + ) + call_command( + "load_transactions_in_delta", + "--etl-level", + "initial_run", + "--spark-s3-bucket", + s3_unittest_data_bucket, + "--no-initial-copy", + ) + kwargs = { + "expected_last_load_transaction_id_lookup": _BEGINNING_OF_TIME, + "expected_last_load_award_id_lookup": _BEGINNING_OF_TIME, + "expected_last_load_transaction_normalized": _BEGINNING_OF_TIME, + "expected_last_load_transaction_fabs": _BEGINNING_OF_TIME, + "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, + } + TestInitialRun.verify(spark, [], [], **kwargs) + + +# Even though all the tests that use the Postgres loader have been removed, these variables are still +# needed for some tests. 
+class _InitialRunWithPostgresLoader: + expected_initial_transaction_id_lookup = [ + { + "transaction_id": id, + "is_fpds": False, + "transaction_unique_id": _INITIAL_ASSISTS[id - 1][ + "afa_generated_unique" + ].upper(), + } + for id in range(1, len(_INITIAL_ASSISTS) + 1) + ] + [ + { + "transaction_id": id, + "is_fpds": True, + "transaction_unique_id": _INITIAL_PROCURES[id - 6][ + "detached_award_proc_unique" + ].upper(), + } + for id in range( + len(_INITIAL_ASSISTS) + 1, + len(_INITIAL_ASSISTS) + len(_INITIAL_PROCURES) + 1, + ) + ] + + expected_initial_award_id_lookup = [ + { + "award_id": int(assist["unique_award_key"].split("_")[-1]), + "is_fpds": False, + "transaction_unique_id": assist["afa_generated_unique"].upper(), + "generated_unique_award_id": assist["unique_award_key"].upper(), + } + for assist in _INITIAL_ASSISTS + ] + [ + { + "award_id": ( + int(procure["unique_award_key"].split("_")[-1]) + + max( + [ + int(assist["unique_award_key"].split("_")[-1]) + for assist in _INITIAL_ASSISTS + ] + ) + ), + "is_fpds": True, + "transaction_unique_id": procure["detached_award_proc_unique"].upper(), + "generated_unique_award_id": procure["unique_award_key"].upper(), + } + for procure in _INITIAL_PROCURES + ] + + expected_initial_transaction_fabs = [ + { + **assist, + "action_date": dateutil.parser.parse(assist["action_date"]) + .date() + .isoformat(), + "afa_generated_unique": assist["afa_generated_unique"].upper(), + "transaction_id": assist["published_fabs_id"], + "unique_award_key": assist["unique_award_key"].upper(), + } + for assist in _INITIAL_ASSISTS + ] + + expected_initial_transaction_fpds = [ + { + **procure, + "action_date": dateutil.parser.parse(procure["action_date"]) + .date() + .isoformat(), + "detached_award_proc_unique": procure["detached_award_proc_unique"].upper(), + "transaction_id": procure["detached_award_procurement_id"] + + len(_INITIAL_ASSISTS), + "unique_award_key": procure["unique_award_key"].upper(), + } + for procure in _INITIAL_PROCURES + ] + + +class TestInitialRunNoPostgresLoader: + expected_initial_transaction_id_lookup = [ + { + "transaction_id": 1, + "is_fpds": False, + "transaction_unique_id": _INITIAL_ASSISTS[0][ + "afa_generated_unique" + ].upper(), + }, + { + "transaction_id": 2, + "is_fpds": True, + "transaction_unique_id": _INITIAL_PROCURES[0][ + "detached_award_proc_unique" + ].upper(), + }, + { + "transaction_id": 3, + "is_fpds": False, + "transaction_unique_id": _INITIAL_ASSISTS[1][ + "afa_generated_unique" + ].upper(), + }, + { + "transaction_id": 4, + "is_fpds": True, + "transaction_unique_id": _INITIAL_PROCURES[1][ + "detached_award_proc_unique" + ].upper(), + }, + { + "transaction_id": 5, + "is_fpds": False, + "transaction_unique_id": _INITIAL_ASSISTS[2][ + "afa_generated_unique" + ].upper(), + }, + { + "transaction_id": 6, + "is_fpds": True, + "transaction_unique_id": _INITIAL_PROCURES[2][ + "detached_award_proc_unique" + ].upper(), + }, + { + "transaction_id": 7, + "is_fpds": False, + "transaction_unique_id": _INITIAL_ASSISTS[3][ + "afa_generated_unique" + ].upper(), + }, + { + "transaction_id": 8, + "is_fpds": False, + "transaction_unique_id": _INITIAL_ASSISTS[4][ + "afa_generated_unique" + ].upper(), + }, + { + "transaction_id": 9, + "is_fpds": True, + "transaction_unique_id": _INITIAL_PROCURES[3][ + "detached_award_proc_unique" + ].upper(), + }, + { + "transaction_id": 10, + "is_fpds": True, + "transaction_unique_id": _INITIAL_PROCURES[4][ + "detached_award_proc_unique" + ].upper(), + }, + ] + + expected_initial_award_id_lookup = [ + { 
+ "award_id": 1, + "is_fpds": False, + "transaction_unique_id": _INITIAL_ASSISTS[0][ + "afa_generated_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_ASSISTS[0][ + "unique_award_key" + ].upper(), + }, + { + "award_id": 2, + "is_fpds": False, + "transaction_unique_id": _INITIAL_ASSISTS[1][ + "afa_generated_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_ASSISTS[1][ + "unique_award_key" + ].upper(), + }, + { + "award_id": 2, + "is_fpds": False, + "transaction_unique_id": _INITIAL_ASSISTS[2][ + "afa_generated_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_ASSISTS[2][ + "unique_award_key" + ].upper(), + }, + { + "award_id": 3, + "is_fpds": True, + "transaction_unique_id": _INITIAL_PROCURES[0][ + "detached_award_proc_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_PROCURES[0][ + "unique_award_key" + ].upper(), + }, + { + "award_id": 4, + "is_fpds": True, + "transaction_unique_id": _INITIAL_PROCURES[1][ + "detached_award_proc_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_PROCURES[1][ + "unique_award_key" + ].upper(), + }, + { + "award_id": 4, + "is_fpds": True, + "transaction_unique_id": _INITIAL_PROCURES[2][ + "detached_award_proc_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_PROCURES[2][ + "unique_award_key" + ].upper(), + }, + { + "award_id": 5, + "is_fpds": False, + "transaction_unique_id": _INITIAL_ASSISTS[3][ + "afa_generated_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_ASSISTS[3][ + "unique_award_key" + ].upper(), + }, + { + "award_id": 5, + "is_fpds": False, + "transaction_unique_id": _INITIAL_ASSISTS[4][ + "afa_generated_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_ASSISTS[4][ + "unique_award_key" + ].upper(), + }, + { + "award_id": 6, + "is_fpds": True, + "transaction_unique_id": _INITIAL_PROCURES[3][ + "detached_award_proc_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_PROCURES[3][ + "unique_award_key" + ].upper(), + }, + { + "award_id": 6, + "is_fpds": True, + "transaction_unique_id": _INITIAL_PROCURES[4][ + "detached_award_proc_unique" + ].upper(), + "generated_unique_award_id": _INITIAL_PROCURES[4][ + "unique_award_key" + ].upper(), + }, + ] + + initial_award_trans_norm_update_create_date = _INITIAL_DATETIME + timedelta(days=1) + + initial_awards = [ + { + "id": 1, + "update_date": initial_award_trans_norm_update_create_date, + "generated_unique_award_id": _INITIAL_ASSISTS[0][ + "unique_award_key" + ].upper(), + "is_fpds": False, + "transaction_unique_id": _INITIAL_ASSISTS[0][ + "afa_generated_unique" + ].upper(), + "subaward_count": 0, + }, + { + "id": 2, + "update_date": initial_award_trans_norm_update_create_date, + "generated_unique_award_id": _INITIAL_ASSISTS[1][ + "unique_award_key" + ].upper(), + "is_fpds": False, + "transaction_unique_id": _INITIAL_ASSISTS[1][ + "afa_generated_unique" + ].upper(), + "subaward_count": 0, + }, + { + "id": 3, + "update_date": initial_award_trans_norm_update_create_date, + "generated_unique_award_id": _INITIAL_PROCURES[0][ + "unique_award_key" + ].upper(), + "is_fpds": True, + "transaction_unique_id": _INITIAL_PROCURES[0][ + "detached_award_proc_unique" + ].upper(), + "subaward_count": 0, + }, + { + "id": 4, + "update_date": initial_award_trans_norm_update_create_date, + "generated_unique_award_id": _INITIAL_PROCURES[1][ + "unique_award_key" + ].upper(), + "is_fpds": True, + "transaction_unique_id": _INITIAL_PROCURES[1][ + "detached_award_proc_unique" + ].upper(), + "subaward_count": 0, + }, + { + "id": 5, + 
"update_date": initial_award_trans_norm_update_create_date, + "generated_unique_award_id": _INITIAL_ASSISTS[3][ + "unique_award_key" + ].upper(), + "is_fpds": False, + "transaction_unique_id": _INITIAL_ASSISTS[3][ + "afa_generated_unique" + ].upper(), + "subaward_count": 0, + }, + { + "id": 6, + "update_date": initial_award_trans_norm_update_create_date, + "generated_unique_award_id": _INITIAL_PROCURES[3][ + "unique_award_key" + ].upper(), + "is_fpds": True, + "transaction_unique_id": _INITIAL_PROCURES[3][ + "detached_award_proc_unique" + ].upper(), + "subaward_count": 0, + }, + ] + + initial_transaction_normalized = [ + { + "id": 1, + "award_id": 1, + "business_categories": [], + "action_date": dateutil.parser.parse( + _INITIAL_ASSISTS[0]["action_date"] + ).date(), + "create_date": initial_award_trans_norm_update_create_date, + "transaction_unique_id": _INITIAL_ASSISTS[0][ + "afa_generated_unique" + ].upper(), + "update_date": initial_award_trans_norm_update_create_date, + "is_fpds": False, + "unique_award_key": _INITIAL_ASSISTS[0]["unique_award_key"].upper(), + }, + { + "id": 2, + "award_id": 3, + "business_categories": [], + "action_date": dateutil.parser.parse( + _INITIAL_PROCURES[0]["action_date"] + ).date(), + "create_date": initial_award_trans_norm_update_create_date, + "transaction_unique_id": _INITIAL_PROCURES[0][ + "detached_award_proc_unique" + ].upper(), + "update_date": initial_award_trans_norm_update_create_date, + "is_fpds": True, + "unique_award_key": _INITIAL_PROCURES[0]["unique_award_key"].upper(), + }, + { + "id": 3, + "award_id": 2, + "business_categories": [], + "action_date": dateutil.parser.parse( + _INITIAL_ASSISTS[1]["action_date"] + ).date(), + "create_date": initial_award_trans_norm_update_create_date, + "transaction_unique_id": _INITIAL_ASSISTS[1][ + "afa_generated_unique" + ].upper(), + "update_date": initial_award_trans_norm_update_create_date, + "is_fpds": False, + "unique_award_key": _INITIAL_ASSISTS[1]["unique_award_key"].upper(), + }, + { + "id": 4, + "award_id": 4, + "business_categories": [], + "action_date": dateutil.parser.parse( + _INITIAL_PROCURES[1]["action_date"] + ).date(), + "create_date": initial_award_trans_norm_update_create_date, + "transaction_unique_id": _INITIAL_PROCURES[1][ + "detached_award_proc_unique" + ].upper(), + "update_date": initial_award_trans_norm_update_create_date, + "is_fpds": True, + "unique_award_key": _INITIAL_PROCURES[1]["unique_award_key"].upper(), + }, + { + "id": 5, + "award_id": 2, + "business_categories": [], + "action_date": dateutil.parser.parse( + _INITIAL_ASSISTS[2]["action_date"] + ).date(), + "create_date": initial_award_trans_norm_update_create_date, + "transaction_unique_id": _INITIAL_ASSISTS[2][ + "afa_generated_unique" + ].upper(), + "update_date": initial_award_trans_norm_update_create_date, + "is_fpds": False, + "unique_award_key": _INITIAL_ASSISTS[2]["unique_award_key"].upper(), + }, + { + "id": 6, + "award_id": 4, + "business_categories": [], + "action_date": dateutil.parser.parse( + _INITIAL_PROCURES[2]["action_date"] + ).date(), + "create_date": initial_award_trans_norm_update_create_date, + "transaction_unique_id": _INITIAL_PROCURES[2][ + "detached_award_proc_unique" + ].upper(), + "update_date": initial_award_trans_norm_update_create_date, + "is_fpds": True, + "unique_award_key": _INITIAL_PROCURES[2]["unique_award_key"].upper(), + }, + { + "id": 7, + "award_id": 5, + "business_categories": [], + "action_date": dateutil.parser.parse( + _INITIAL_ASSISTS[3]["action_date"] + ).date(), + "create_date": 
initial_award_trans_norm_update_create_date, + "transaction_unique_id": _INITIAL_ASSISTS[3][ + "afa_generated_unique" + ].upper(), + "update_date": initial_award_trans_norm_update_create_date, + "is_fpds": False, + "unique_award_key": _INITIAL_ASSISTS[3]["unique_award_key"].upper(), + }, + { + "id": 8, + "award_id": 5, + "business_categories": [], + "action_date": dateutil.parser.parse( + _INITIAL_ASSISTS[4]["action_date"] + ).date(), + "create_date": initial_award_trans_norm_update_create_date, + "transaction_unique_id": _INITIAL_ASSISTS[4][ + "afa_generated_unique" + ].upper(), + "update_date": initial_award_trans_norm_update_create_date, + "is_fpds": False, + "unique_award_key": _INITIAL_ASSISTS[4]["unique_award_key"].upper(), + }, + { + "id": 9, + "award_id": 6, + "business_categories": [], + "action_date": dateutil.parser.parse( + _INITIAL_PROCURES[3]["action_date"] + ).date(), + "create_date": initial_award_trans_norm_update_create_date, + "transaction_unique_id": _INITIAL_PROCURES[3][ + "detached_award_proc_unique" + ].upper(), + "update_date": initial_award_trans_norm_update_create_date, + "is_fpds": True, + "unique_award_key": _INITIAL_PROCURES[3]["unique_award_key"].upper(), + }, + { + "id": 10, + "award_id": 6, + "business_categories": [], + "action_date": dateutil.parser.parse( + _INITIAL_PROCURES[3]["action_date"] + ).date(), + "create_date": initial_award_trans_norm_update_create_date, + "transaction_unique_id": _INITIAL_PROCURES[4][ + "detached_award_proc_unique" + ].upper(), + "update_date": initial_award_trans_norm_update_create_date, + "is_fpds": True, + "unique_award_key": _INITIAL_PROCURES[4]["unique_award_key"].upper(), + }, + ] + + initial_transaction_fabs = [ + { + **assist, + "action_date": dateutil.parser.parse(assist["action_date"]) + .date() + .isoformat(), + "afa_generated_unique": assist["afa_generated_unique"].upper(), + "transaction_id": (assist["published_fabs_id"] - 1) * 2 + 1, + "unique_award_key": assist["unique_award_key"].upper(), + } + for assist in _INITIAL_ASSISTS[:4] + ] + [ + { + **_INITIAL_ASSISTS[4], + "action_date": dateutil.parser.parse(_INITIAL_ASSISTS[4]["action_date"]) + .date() + .isoformat(), + "afa_generated_unique": _INITIAL_ASSISTS[4]["afa_generated_unique"].upper(), + "transaction_id": 8, + "unique_award_key": _INITIAL_ASSISTS[4]["unique_award_key"].upper(), + } + ] + + initial_transaction_fpds = [ + { + **procure, + "action_date": dateutil.parser.parse(procure["action_date"]) + .date() + .isoformat(), + "detached_award_proc_unique": procure["detached_award_proc_unique"].upper(), + "transaction_id": procure["detached_award_procurement_id"] * 2, + "unique_award_key": procure["unique_award_key"].upper(), + } + for procure in _INITIAL_PROCURES[:3] + ] + [ + { + **_INITIAL_PROCURES[3], + "action_date": dateutil.parser.parse(_INITIAL_PROCURES[3]["action_date"]) + .date() + .isoformat(), + "detached_award_proc_unique": _INITIAL_PROCURES[3][ + "detached_award_proc_unique" + ].upper(), + "transaction_id": 9, + "unique_award_key": _INITIAL_PROCURES[3]["unique_award_key"].upper(), + }, + { + **_INITIAL_PROCURES[4], + "action_date": dateutil.parser.parse(_INITIAL_PROCURES[4]["action_date"]) + .date() + .isoformat(), + "detached_award_proc_unique": _INITIAL_PROCURES[4][ + "detached_award_proc_unique" + ].upper(), + "transaction_id": 10, + "unique_award_key": _INITIAL_PROCURES[4]["unique_award_key"].upper(), + }, + ] + + # This test will only load the source tables from postgres, and NOT use the Postgres transaction loader + # to populate any other 
Delta tables, so can only test for NULLs originating in Delta. + @mark.django_db(transaction=True) + @patch( + "usaspending_api.etl.management.commands.load_transactions_in_delta.Command._insert_orphaned_transactions" + ) + def test_nulls_in_trans_norm_unique_award_key_from_delta( + self, + orphaned_txns_patch, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, + ): + raw_db = "raw" + spark.sql(f"create database if not exists {raw_db};") + spark.sql(f"use {raw_db};") + spark.sql( + TABLE_SPEC["published_fabs"].delta_table_create_sql.format( + DESTINATION_TABLE="published_fabs", + DESTINATION_DATABASE=raw_db, + SPARK_S3_BUCKET=s3_unittest_data_bucket, + DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, + ) + ) + spark.sql( + TABLE_SPEC["detached_award_procurement"].delta_table_create_sql.format( + DESTINATION_TABLE="detached_award_procurement", + DESTINATION_DATABASE=raw_db, + SPARK_S3_BUCKET=s3_unittest_data_bucket, + DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, + ) + ) + spark.sql( + TABLE_SPEC["transaction_normalized"].delta_table_create_sql.format( + DESTINATION_TABLE="transaction_normalized", + DESTINATION_DATABASE=raw_db, + SPARK_S3_BUCKET=s3_unittest_data_bucket, + DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, + ) + ) + spark.sql( + """ + INSERT INTO raw.transaction_normalized + VALUES('2022-10-31' + , NULL + , NULL + , 5 + , NULL + , ARRAY() + , NULL + , '2022-11-01T00:00:00+00:00' + , NULL + , NULL + , NULL + , NULL + , NULL + , NULL + , 5 + , NULL + , TRUE + , NULL + , NULL + , NULL + , NULL + , NULL + , NULL + , 'AWARD_ASSIST_0002_TRANS_0002' + , NULL + , NULL + , NULL + , '2022-11-01T00:00:00+00:00' + , NULL + ) + """ + ) + + with raises( + ValueError, + match="Found 1 NULL in 'unique_award_key' in table raw.transaction_normalized!", + ): + call_command( + "load_transactions_in_delta", + "--etl-level", + "initial_run", + "--spark-s3-bucket", + s3_unittest_data_bucket, + ) + + spark.sql( + """ + INSERT INTO raw.transaction_normalized + VALUES('2022-10-31' + , NULL + , NULL + , 6 + , NULL + , ARRAY() + , NULL + , '2022-11-01T00:00:00+00:00' + , NULL + , NULL + , NULL + , NULL + , NULL + , NULL + , 6 + , NULL + , TRUE + , NULL + , NULL + , NULL + , NULL + , NULL + , NULL + , 'AWARD_PROCURE_0002_TRANS_0002' + , NULL + , NULL + , NULL + , '2022-11-01T00:00:00+00:00' + , NULL + ) + """ + ) + + with raises( + ValueError, + match="Found 2 NULLs in 'unique_award_key' in table raw.transaction_normalized!", + ): + call_command( + "load_transactions_in_delta", + "--etl-level", + "initial_run", + "--spark-s3-bucket", + s3_unittest_data_bucket, + ) + + @mark.django_db(transaction=True) + def test_happy_path_scenarios( + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, + ): + # Since we're not using the Postgres transaction loader, load raw.transaction_normalized and raw.awards + # from expected data when making initial run + load_other_raw_tables = [ + _TableLoadInfo( + spark, "transaction_normalized", self.initial_transaction_normalized + ), + _TableLoadInfo(spark, "awards", self.initial_awards), + ] + # Setup some source tables with data, without loading these Delta Tables from Postgres + # for efficiency reasons. 
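+        # Aside: the ValueError asserted in the previous test ("Found N NULL(s) in
+        # 'unique_award_key' ...") implies a guard that counts NULL keys before copying data.
+        # A sketch of such a guard (the command's actual internals may differ):
+        #
+        #     def assert_no_null_keys(spark, table: str = "raw.transaction_normalized") -> None:
+        #         nulls = spark.sql(
+        #             f"SELECT COUNT(*) AS c FROM {table} WHERE unique_award_key IS NULL"
+        #         ).collect()[0]["c"]
+        #         if nulls:
+        #             s = "" if nulls == 1 else "s"
+        #             raise ValueError(f"Found {nulls} NULL{s} in 'unique_award_key' in table {table}!")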
+ raw_db = "raw" + spark.sql(f"create database if not exists {raw_db};") + spark.sql(f"use {raw_db};") + spark.sql( + TABLE_SPEC["published_fabs"].delta_table_create_sql.format( + DESTINATION_TABLE="published_fabs", + DESTINATION_DATABASE=raw_db, + SPARK_S3_BUCKET=s3_unittest_data_bucket, + DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, + ) + ) + spark.sql( + TABLE_SPEC["detached_award_procurement"].delta_table_create_sql.format( + DESTINATION_TABLE="detached_award_procurement", + DESTINATION_DATABASE=raw_db, + SPARK_S3_BUCKET=s3_unittest_data_bucket, + DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, + ) + ) + load_dict_to_delta_table( + spark, + s3_unittest_data_bucket, + "raw", + "detached_award_procurement", + _INITIAL_PROCURES, + True, + ) + load_dict_to_delta_table( + spark, + s3_unittest_data_bucket, + "raw", + "published_fabs", + _INITIAL_ASSISTS, + True, + ) + TestInitialRun.initial_run( + s3_unittest_data_bucket, + load_source_tables=False, + load_other_raw_tables=load_other_raw_tables, + initial_copy=False, + ) + kwargs = { + "expected_last_load_transaction_id_lookup": _INITIAL_SOURCE_TABLE_LOAD_DATETIME, + "expected_last_load_award_id_lookup": _INITIAL_SOURCE_TABLE_LOAD_DATETIME, + "expected_last_load_transaction_normalized": _BEGINNING_OF_TIME, + "expected_last_load_transaction_fabs": _BEGINNING_OF_TIME, + "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, + } + TestInitialRun.verify( + spark, + self.expected_initial_transaction_id_lookup, + self.expected_initial_award_id_lookup, + **kwargs, + ) + + # 2. Call initial_run with initial-copy, and have all raw tables populated + + # Since we're not using the Postgres transaction loader, load raw.transaction_normalized and raw.awards + # from expected data when making initial run + load_other_raw_tables = [ + _TableLoadInfo(spark, "transaction_fabs", self.initial_transaction_fabs), + _TableLoadInfo(spark, "transaction_fpds", self.initial_transaction_fpds), + ] + # Don't call Postgres loader or re-load the source tables, though. + TestInitialRun.initial_run( + s3_unittest_data_bucket, False, load_other_raw_tables + ) + kwargs["expected_last_load_transaction_normalized"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) + kwargs["expected_last_load_transaction_fabs"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) + kwargs["expected_last_load_transaction_fpds"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) + TestInitialRun.verify( + spark, + self.expected_initial_transaction_id_lookup, + self.expected_initial_award_id_lookup, + len(self.initial_transaction_normalized), + len(self.initial_transaction_fabs), + len(self.initial_transaction_fpds), + **kwargs, + ) + + +class TestTransactionIdLookup: + @mark.django_db(transaction=True) + def test_unexpected_paths( + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, + ): + # Setup some source tables with data, without loading these Delta Tables from Postgres + # for efficiency reasons. 
+ raw_db = "raw" + spark.sql(f"create database if not exists {raw_db};") + spark.sql(f"use {raw_db};") + spark.sql( + TABLE_SPEC["published_fabs"].delta_table_create_sql.format( + DESTINATION_TABLE="published_fabs", + DESTINATION_DATABASE=raw_db, + SPARK_S3_BUCKET=s3_unittest_data_bucket, + DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, + ) + ) + spark.sql( + TABLE_SPEC["detached_award_procurement"].delta_table_create_sql.format( + DESTINATION_TABLE="detached_award_procurement", + DESTINATION_DATABASE=raw_db, + SPARK_S3_BUCKET=s3_unittest_data_bucket, + DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, + ) + ) + load_dict_to_delta_table( + spark, + s3_unittest_data_bucket, + "raw", + "detached_award_procurement", + _INITIAL_PROCURES, + True, + ) + load_dict_to_delta_table( + spark, + s3_unittest_data_bucket, + "raw", + "published_fabs", + _INITIAL_ASSISTS, + True, + ) + + # 1. Test calling load_transactions_in_delta with the etl-level set to the proper sequencing of + # initial_run, then transaction_id_lookup. However, call initial_run with blank raw.transaction_normalized + # and raw.awards tables. + + # First, create blank raw.transaction_normalized and raw.awards tables + spark.sql( + TABLE_SPEC["transaction_normalized"].delta_table_create_sql.format( + DESTINATION_TABLE="transaction_normalized", + DESTINATION_DATABASE=raw_db, + SPARK_S3_BUCKET=s3_unittest_data_bucket, + DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, + ) + ) + spark.sql( + TABLE_SPEC["awards"].delta_table_create_sql.format( + DESTINATION_TABLE="awards", + DESTINATION_DATABASE=raw_db, + SPARK_S3_BUCKET=s3_unittest_data_bucket, + DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, + ) + ) + + # Then, call load_transactions_in_delta with etl-level of initial_run and verify. + # Don't reload the source tables, and don't do initial copy of transaction tables, though. + TestInitialRun.initial_run( + s3_unittest_data_bucket, load_source_tables=False, initial_copy=False + ) + kwargs = { + "expected_last_load_transaction_id_lookup": _BEGINNING_OF_TIME, + "expected_last_load_award_id_lookup": _BEGINNING_OF_TIME, + "expected_last_load_transaction_normalized": _BEGINNING_OF_TIME, + "expected_last_load_transaction_fabs": _BEGINNING_OF_TIME, + "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, + } + TestInitialRun.verify(spark, [], [], **kwargs) + + # Then, call load_transactions_in_delta with etl-level of transaction_id_lookup. + call_command( + "load_transactions_in_delta", "--etl-level", "transaction_id_lookup" + ) + + # The expected transaction_id_lookup table should be the same as in _InitialRunWithPostgresLoader, + # but all of the transaction ids should be 1 larger than expected there. + expected_transaction_id_lookup = deepcopy( + _InitialRunWithPostgresLoader.expected_initial_transaction_id_lookup + ) + for item in expected_transaction_id_lookup: + item["transaction_id"] += 1 + # Also, the last load date for the transaction_id_lookup table should be updated to the date of the + # initial loads. + kwargs["expected_last_load_transaction_id_lookup"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) + TestInitialRun.verify(spark, expected_transaction_id_lookup, [], **kwargs) + + @staticmethod + def _happy_path_test_core( + spark, + s3_data_bucket, + load_other_raw_tables, + expected_initial_transaction_id_lookup, + expected_initial_award_id_lookup, + expected_transaction_id_lookup_pops, + ): + # First, setup some source tables with data, without loading these Delta Tables from Postgres + # for efficiency reasons. 
+ raw_db = "raw" + spark.sql(f"create database if not exists {raw_db};") + spark.sql(f"use {raw_db};") + spark.sql( + TABLE_SPEC["published_fabs"].delta_table_create_sql.format( + DESTINATION_TABLE="published_fabs", + DESTINATION_DATABASE=raw_db, + SPARK_S3_BUCKET=s3_data_bucket, + DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, + ) + ) + spark.sql( + TABLE_SPEC["detached_award_procurement"].delta_table_create_sql.format( + DESTINATION_TABLE="detached_award_procurement", + DESTINATION_DATABASE=raw_db, + SPARK_S3_BUCKET=s3_data_bucket, + DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, + ) + ) + load_dict_to_delta_table( + spark, + s3_data_bucket, + "raw", + "detached_award_procurement", + _INITIAL_PROCURES, + True, + ) + load_dict_to_delta_table( + spark, + s3_data_bucket, + "raw", + "published_fabs", + _INITIAL_ASSISTS, + True, + ) + # Trigger initial run of load transactions in delta. This step is required as it creates various data sources. + TestInitialRun.initial_run( + s3_data_bucket, + load_source_tables=False, + load_other_raw_tables=load_other_raw_tables, + initial_copy=False, + ) + + # 1. Test deleting the transaction(s) with the last transaction ID(s) from the appropriate raw table, + # followed by a call to load_transaction_in_delta with etl-level of transaction_id_lookup + # 2. Test for a single inserted transaction, and another call to load_transaction_in_delta with etl-level of + # transaction_id_lookup. + + spark.sql( + """ + DELETE FROM raw.detached_award_procurement + WHERE detached_award_procurement_id = 4 OR detached_award_procurement_id = 5 + """ + ) + call_command( + "load_transactions_in_delta", "--etl-level", "transaction_id_lookup" + ) + + # Verify transaction_id_lookup table + query = "SELECT * FROM int.transaction_id_lookup ORDER BY transaction_id" + delta_data = [row.asDict() for row in spark.sql(query).collect()] + + expected_transaction_id_lookup = deepcopy( + expected_initial_transaction_id_lookup + ) + expected_transaction_id_lookup.pop() + expected_transaction_id_lookup.pop() + assert equal_datasets(expected_transaction_id_lookup, delta_data, "") + + # Also, make sure transaction_id_seq hasn't gone backwards + with connection.cursor() as cursor: + cursor.execute("SELECT nextval('transaction_id_seq')") + # Since all calls to setval() set the is_called flag to false, nextval() returns the actual maximum id + max_transaction_id = cursor.fetchone()[0] + assert max_transaction_id == (len(_INITIAL_ASSISTS) + len(_INITIAL_PROCURES)) + + # Since this test just called nextval(), need to reset the sequence with the is_called flag set to false + # so that the next call to nextval() will return the same value as previously. + with connection.cursor() as cursor: + cursor.execute( + f"SELECT setval('transaction_id_seq', {max_transaction_id}, false)" + ) + + # 3. Test for a single inserted transaction, and another call to load_transaction_in_delta with etl-level of + # transaction_id_lookup. + + # Since changes to the source tables will go to the Postgres table first, use model baker to add new rows to + # Postgres table, and then push the updated table to Delta. 
+ last_assist_load_datetime = datetime.now(timezone.utc) + insert_datetime = last_assist_load_datetime + timedelta(minutes=-15) + assist = deepcopy(_NEW_ASSIST) + assist.update( + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } + ) + baker.make("transactions.SourceAssistanceTransaction", **assist) + update_last_load_date( + "source_assistance_transaction", last_assist_load_datetime + ) + load_delta_table_from_postgres("published_fabs", s3_data_bucket) + call_command( + "load_transactions_in_delta", "--etl-level", "transaction_id_lookup" + ) + + # Verify transaction_id_lookup table + query = "SELECT * FROM int.transaction_id_lookup ORDER BY transaction_id" + delta_data = [row.asDict() for row in spark.sql(query).collect()] + + expected_transaction_id_lookup = deepcopy( + expected_initial_transaction_id_lookup + ) + expected_transaction_id_lookup.pop() + expected_transaction_id_lookup.pop() + + expected_transaction_id_lookup.append( + { + "transaction_id": 11, + "is_fpds": False, + "transaction_unique_id": _NEW_ASSIST["afa_generated_unique"].upper(), + } + ) + + # Verify the data has been loaded and changed correctly + # Although the last load date for the source_assistance_transaction was updated above, the code in + # load_transactions_in_delta takes the minimum last load date of that table and of the + # source_procurement_transaction table, which has not been updated since the initial load of both tables. + kwargs = { + "expected_last_load_transaction_id_lookup": _INITIAL_SOURCE_TABLE_LOAD_DATETIME, + "expected_last_load_award_id_lookup": _INITIAL_SOURCE_TABLE_LOAD_DATETIME, + "expected_last_load_transaction_normalized": _BEGINNING_OF_TIME, + "expected_last_load_transaction_fabs": _BEGINNING_OF_TIME, + "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, + } + TestInitialRun.verify( + spark, + expected_transaction_id_lookup, + expected_initial_award_id_lookup, + **kwargs, + ) + + # Also, make sure transaction_id_seq hasn't gone backwards + with connection.cursor() as cursor: + cursor.execute("SELECT nextval('transaction_id_seq')") + # Since all calls to setval() set the is_called flag to false, nextval() returns the actual maximum id + max_transaction_id = cursor.fetchone()[0] + assert max_transaction_id == ( + len(_INITIAL_ASSISTS) + len(_INITIAL_PROCURES) + 1 + ) # Add one for the insert + + # Since this test just called nextval(), need to reset the sequence with the is_called flag set to false + # so that the next call to nextval() will return the same value as previously. + with connection.cursor() as cursor: + cursor.execute( + f"SELECT setval('transaction_id_seq', {max_transaction_id}, false)" + ) + + # 3. Make inserts to and deletes from the raw tables, call load_transaction_in_delta with etl-level of + # transaction_id_lookup, and test that the results are as expected. 
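+        # Aside: the behavior verified in this step -- raw-table deletes disappearing from
+        # int.transaction_id_lookup -- is essentially an anti-join between the lookup table
+        # and its source. One way to state the invariant directly in Spark SQL (illustrative;
+        # the ETL command's implementation may differ):
+        #
+        #     orphans = spark.sql("""
+        #         SELECT l.transaction_id
+        #         FROM int.transaction_id_lookup AS l
+        #         LEFT ANTI JOIN raw.published_fabs AS f
+        #             ON l.transaction_unique_id = UPPER(f.afa_generated_unique)
+        #         WHERE l.is_fpds = FALSE
+        #     """)
+        #     assert orphans.count() == 0  # no lookup rows left behind for deleted assists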
+ last_procure_load_datetime = datetime.now(timezone.utc) + insert_datetime = last_procure_load_datetime + timedelta(minutes=-15) + procure = deepcopy(_NEW_PROCURE) + procure.update( + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } + ) + baker.make("transactions.SourceProcurementTransaction", **procure) + update_last_load_date( + "source_procurement_transaction", last_procure_load_datetime + ) + load_delta_table_from_postgres("detached_award_procurement", s3_data_bucket) + + spark.sql( + """ + DELETE FROM raw.published_fabs + WHERE published_fabs_id = 2 OR published_fabs_id = 3 + """ + ) + spark.sql( + """ + DELETE FROM raw.detached_award_procurement + WHERE detached_award_procurement_id = 1 + """ + ) + + call_command( + "load_transactions_in_delta", "--etl-level", "transaction_id_lookup" + ) + + # Verify transaction_id_lookup table + query = "SELECT * FROM int.transaction_id_lookup ORDER BY transaction_id" + delta_data = [row.asDict() for row in spark.sql(query).collect()] + + for pop in expected_transaction_id_lookup_pops: + expected_transaction_id_lookup.pop(pop) + expected_transaction_id_lookup.append( + { + "transaction_id": 12, + "is_fpds": True, + "transaction_unique_id": _NEW_PROCURE[ + "detached_award_proc_unique" + ].upper(), + } + ) + assert equal_datasets(expected_transaction_id_lookup, delta_data, "") + + assert get_last_load_date("transaction_id_lookup") == last_assist_load_datetime + + @mark.django_db(transaction=True) + def test_happy_path_scenarios_no_pg_loader( + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, + ): + # Since we're not using the Postgres transaction loader, load raw.transaction_normalized and raw.awards + # from expected data when making initial run + load_other_raw_tables = [ + _TableLoadInfo( + spark, + "transaction_normalized", + TestInitialRunNoPostgresLoader.initial_transaction_normalized, + ), + _TableLoadInfo( + spark, "awards", TestInitialRunNoPostgresLoader.initial_awards + ), + ] + + self._happy_path_test_core( + spark, + s3_unittest_data_bucket, + load_other_raw_tables, + TestInitialRunNoPostgresLoader.expected_initial_transaction_id_lookup, + TestInitialRunNoPostgresLoader.expected_initial_award_id_lookup, + (1, 1, 2), + ) + + +class TestAwardIdLookup: + @mark.django_db(transaction=True) + def test_unexpected_paths( + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, + ): + # First, setup some source tables with data, without loading these Delta Tables from Postgres + # for efficiency reasons. 
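+        # Aside: on the *_pops tuples threaded through these tests (e.g. the (1, 1, 2) passed
+        # a few lines above): list.pop(i) removes by position and shifts later elements left,
+        # so popping the same index twice walks through adjacent entries. In miniature:
+        #
+        #     items = ["a", "b", "c", "d", "e"]
+        #     for i in (1, 1, 2):
+        #         items.pop(i)
+        #     assert items == ["a", "d"]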
+ raw_db = "raw" + spark.sql(f"create database if not exists {raw_db};") + spark.sql(f"use {raw_db};") + spark.sql( + TABLE_SPEC["published_fabs"].delta_table_create_sql.format( + DESTINATION_TABLE="published_fabs", + DESTINATION_DATABASE=raw_db, + SPARK_S3_BUCKET=s3_unittest_data_bucket, + DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, + ) + ) + spark.sql( + TABLE_SPEC["detached_award_procurement"].delta_table_create_sql.format( + DESTINATION_TABLE="detached_award_procurement", + DESTINATION_DATABASE=raw_db, + SPARK_S3_BUCKET=s3_unittest_data_bucket, + DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, + ) + ) + load_dict_to_delta_table( + spark, + s3_unittest_data_bucket, + "raw", + "detached_award_procurement", + _INITIAL_PROCURES, + True, + ) + load_dict_to_delta_table( + spark, + s3_unittest_data_bucket, + "raw", + "published_fabs", + _INITIAL_ASSISTS, + True, + ) + + # 1. Test calling load_transactions_in_delta with the etl-level set to the proper sequencing of + # initial_run, then award_id_lookup. However, call initial_run with blank raw.transaction_normalized + # and raw.awards tables. + + # First, create blank raw.transaction_normalized and raw.awards tables + spark.sql( + TABLE_SPEC["transaction_normalized"].delta_table_create_sql.format( + DESTINATION_TABLE="transaction_normalized", + DESTINATION_DATABASE=raw_db, + SPARK_S3_BUCKET=s3_unittest_data_bucket, + DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, + ) + ) + spark.sql( + TABLE_SPEC["awards"].delta_table_create_sql.format( + DESTINATION_TABLE="awards", + DESTINATION_DATABASE=raw_db, + SPARK_S3_BUCKET=s3_unittest_data_bucket, + DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, + ) + ) + + # Then, call load_transactions_in_delta with etl-level of initial_run and verify. + # Don't reload the source tables, and don't do initial copy of transaction tables, though. + TestInitialRun.initial_run( + s3_unittest_data_bucket, load_source_tables=False, initial_copy=False + ) + kwargs = { + "expected_last_load_transaction_id_lookup": _BEGINNING_OF_TIME, + "expected_last_load_award_id_lookup": _BEGINNING_OF_TIME, + "expected_last_load_transaction_normalized": _BEGINNING_OF_TIME, + "expected_last_load_transaction_fabs": _BEGINNING_OF_TIME, + "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, + } + TestInitialRun.verify(spark, [], [], **kwargs) + + # Then, call load_transactions_in_delta with etl-level of award_id_lookup. + call_command("load_transactions_in_delta", "--etl-level", "award_id_lookup") + + # The expected award_id_lookup table should be the same as in TestInitialRunWithPostgresLoader, + # but all of the award ids should be 1 larger than expected there. + expected_award_id_lookup = deepcopy( + _InitialRunWithPostgresLoader.expected_initial_award_id_lookup + ) + for item in expected_award_id_lookup: + item["award_id"] += 1 + # Also, the last load date for the award_id_lookup table should be updated to the date of the initial loads. + kwargs["expected_last_load_award_id_lookup"] = ( + _INITIAL_SOURCE_TABLE_LOAD_DATETIME + ) + TestInitialRun.verify(spark, [], expected_award_id_lookup, **kwargs) + + @staticmethod + def _happy_path_test_core( + spark, + s3_data_bucket, + load_other_raw_tables, + expected_initial_transaction_id_lookup, + expected_initial_award_id_lookup, + expected_award_id_lookup_pops, + partially_deleted_award_id, + ): + # First, setup some source tables with data, without loading these Delta Tables from Postgres + # for efficiency reasons. 
+ raw_db = "raw" + spark.sql(f"create database if not exists {raw_db};") + spark.sql(f"use {raw_db};") + spark.sql( + TABLE_SPEC["published_fabs"].delta_table_create_sql.format( + DESTINATION_TABLE="published_fabs", + DESTINATION_DATABASE=raw_db, + SPARK_S3_BUCKET=s3_data_bucket, + DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, + ) + ) + spark.sql( + TABLE_SPEC["detached_award_procurement"].delta_table_create_sql.format( + DESTINATION_TABLE="detached_award_procurement", + DESTINATION_DATABASE=raw_db, + SPARK_S3_BUCKET=s3_data_bucket, + DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, + ) + ) + load_dict_to_delta_table( + spark, + s3_data_bucket, + "raw", + "detached_award_procurement", + _INITIAL_PROCURES, + True, + ) + load_dict_to_delta_table( + spark, + s3_data_bucket, + "raw", + "published_fabs", + _INITIAL_ASSISTS, + True, + ) + # Trigger initial run of load transactions in delta. This step is required as it creates various data sources. + TestInitialRun.initial_run( + s3_data_bucket, + load_source_tables=False, + load_other_raw_tables=load_other_raw_tables, + initial_copy=False, + ) + + # 1. Test deleting the transactions with the last award ID from the appropriate raw table, + # followed by a call to load_transaction_in_delta with etl-level of award_id_lookup + # 2. Test for a single inserted transaction, and another call to load_transaction_in_delta with etl-level of + # award_id_lookup. + + spark.sql( + """ + DELETE FROM raw.detached_award_procurement + WHERE detached_award_procurement_id = 4 OR detached_award_procurement_id = 5 + """ + ) + + # Can't use spark.sql to just insert rows with only values for desired columns (need to specify values for + # all of them), so using model baker to add new rows to Postgres table, and then pushing new table to Delta. + last_assist_load_datetime = datetime.now(timezone.utc) + insert_datetime = last_assist_load_datetime + timedelta(minutes=-15) + assist = deepcopy(_NEW_ASSIST) + assist.update( + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } + ) + baker.make("transactions.SourceAssistanceTransaction", **assist) + update_last_load_date( + "source_assistance_transaction", last_assist_load_datetime + ) + load_delta_table_from_postgres("published_fabs", s3_data_bucket) + call_command("load_transactions_in_delta", "--etl-level", "award_id_lookup") + + # Verify award_id_lookup table + query = ( + "SELECT * FROM int.award_id_lookup ORDER BY award_id, transaction_unique_id" + ) + delta_data = [row.asDict() for row in spark.sql(query).collect()] + + expected_award_id_lookup = deepcopy(expected_initial_award_id_lookup) + expected_award_id_lookup.pop() + expected_award_id_lookup.pop() + + expected_award_id_lookup.append( + { + "award_id": 7, + "is_fpds": False, + "transaction_unique_id": _NEW_ASSIST["afa_generated_unique"].upper(), + "generated_unique_award_id": _NEW_ASSIST["unique_award_key"].upper(), + } + ) + + # Verify the data has been loaded and changed correctly + # Although the last load date for the source_assistance_transaction was updated above, the code in + # load_transactions_in_delta takes the minimum last load date of that table and of the + # source_procurement_transaction table, which has not been updated since the initial load of both tables. 
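The Postgres detour above exists because, as the comment says, a plain `INSERT INTO ... VALUES` issued through `spark.sql` needs a value for every column of these wide source tables. Condensed, the pattern is: bake a row into Postgres (bakery fills the unspecified columns), record the load, then mirror the whole table into Delta. The names below come from this test module rather than being self-contained, and the 15-minute backdating mirrors what the test does, placing the row's timestamps just before the recorded load date:

```python
from copy import deepcopy
from datetime import datetime, timedelta, timezone

from model_bakery import baker

last_load = datetime.now(timezone.utc)
backdated = last_load - timedelta(minutes=15)

row = deepcopy(_NEW_ASSIST)  # module-level template dict defined earlier in this file
row.update({"action_date": backdated.isoformat(), "created_at": backdated, "updated_at": backdated})

baker.make("transactions.SourceAssistanceTransaction", **row)      # insert into Postgres
update_last_load_date("source_assistance_transaction", last_load)  # record the load
load_delta_table_from_postgres("published_fabs", s3_data_bucket)   # push the table to Delta
```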
+ kwargs = { + "expected_last_load_transaction_id_lookup": _INITIAL_SOURCE_TABLE_LOAD_DATETIME, + "expected_last_load_award_id_lookup": _INITIAL_SOURCE_TABLE_LOAD_DATETIME, + "expected_last_load_transaction_normalized": _BEGINNING_OF_TIME, + "expected_last_load_transaction_fabs": _BEGINNING_OF_TIME, + "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, + } + TestInitialRun.verify( + spark, + expected_initial_transaction_id_lookup, + expected_award_id_lookup, + **kwargs, + ) + + # Make sure award_id_seq hasn't gone backwards + with connection.cursor() as cursor: + cursor.execute("SELECT nextval('award_id_seq')") + # Since all calls to setval() set the is_called flag to false, nextval() returns the actual maximum id + max_award_id = cursor.fetchone()[0] + assert ( + max_award_id + == max( + [award["id"] for award in TestInitialRunNoPostgresLoader.initial_awards] + ) + + 1 + ) # Add one for the insert + + # Since this test just called nextval(), need to reset the sequence with the is_called flag set to false + # so that the next call to nextval() will return the same value as previously. + with connection.cursor() as cursor: + cursor.execute(f"SELECT setval('award_id_seq', {max_award_id}, false)") + + # 3. Make inserts to and deletes from the raw tables, call load_transaction_in_delta with etl-level of + # award_id_lookup, and test that the results are as expected, and that int.award_ids_delete_modified has + # tracked the appropriate delete. + last_procure_load_datetime = datetime.now(timezone.utc) + insert_datetime = last_procure_load_datetime + timedelta(minutes=-15) + procure = deepcopy(_NEW_PROCURE) + procure.update( + { + "action_date": insert_datetime.isoformat(), + "created_at": insert_datetime, + "updated_at": insert_datetime, + } + ) + baker.make("transactions.SourceProcurementTransaction", **procure) + update_last_load_date( + "source_procurement_transaction", last_procure_load_datetime + ) + load_delta_table_from_postgres("detached_award_procurement", s3_data_bucket) + + spark.sql( + """ + DELETE FROM raw.published_fabs + WHERE published_fabs_id = 2 + """ + ) + spark.sql( + """ + DELETE FROM raw.detached_award_procurement + WHERE detached_award_procurement_id = 1 + """ + ) + + call_command("load_transactions_in_delta", "--etl-level", "award_id_lookup") + + # Verify award_id_lookup table + query = ( + "SELECT * FROM int.award_id_lookup ORDER BY award_id, transaction_unique_id" + ) + delta_data = [row.asDict() for row in spark.sql(query).collect()] + + for pop in expected_award_id_lookup_pops: + expected_award_id_lookup.pop(pop) + expected_award_id_lookup.append( + { + "award_id": 8, + "is_fpds": True, + "transaction_unique_id": _NEW_PROCURE[ + "detached_award_proc_unique" + ].upper(), + "generated_unique_award_id": _NEW_PROCURE["unique_award_key"].upper(), + } + ) + assert equal_datasets(expected_award_id_lookup, delta_data, "") + + assert get_last_load_date("award_id_lookup") == last_assist_load_datetime + + # Verify award_ids_delete_modified table + query = "SELECT * FROM int.award_ids_delete_modified ORDER BY award_id" + delta_data = [row.asDict() for row in spark.sql(query).collect()] + assert equal_datasets( + [{"award_id": partially_deleted_award_id}], delta_data, "" + ) + + @mark.django_db(transaction=True) + def test_happy_path_scenarios_no_pg_loader( + self, + spark, + s3_unittest_data_bucket, + hive_unittest_metastore_db, + _populate_initial_source_tables_pg, + ): + # Since we're not using the Postgres transaction loader, load raw.transaction_normalized and 
raw.awards + # from expected data when making initial run + load_other_raw_tables = [ + _TableLoadInfo( + spark, + "transaction_normalized", + TestInitialRunNoPostgresLoader.initial_transaction_normalized, + ), + _TableLoadInfo( + spark, "awards", TestInitialRunNoPostgresLoader.initial_awards + ), + ] + + self._happy_path_test_core( + spark, + s3_unittest_data_bucket, + load_other_raw_tables, + TestInitialRunNoPostgresLoader.expected_initial_transaction_id_lookup, + TestInitialRunNoPostgresLoader.expected_initial_award_id_lookup, + (3, 1), + 2, + ) diff --git a/usaspending_api/etl/tests/integration/test_spark_app.py b/usaspending_api/etl/tests/integration/test_spark_app.py index 09387d99f1..6e137f9475 100644 --- a/usaspending_api/etl/tests/integration/test_spark_app.py +++ b/usaspending_api/etl/tests/integration/test_spark_app.py @@ -8,22 +8,27 @@ import random import sys import uuid -from datetime import date +from datetime import datetime from unittest.mock import MagicMock, call import boto3 from django.conf import settings from model_bakery import baker from pyspark.context import SparkContext -from pyspark.sql import SparkSession, Row +from pyspark.sql import Row, SparkSession from pytest import fixture, mark + from usaspending_api.awards.models import TransactionFABS, TransactionFPDS +from usaspending_api.common.etl.spark import ( + _BROKER_REF_TABLES, + _USAS_RDS_REF_TABLES, + create_ref_temp_views, +) from usaspending_api.common.helpers.spark_helpers import ( - get_jdbc_url_from_pg_uri, - get_jdbc_connection_properties, get_broker_jdbc_url, + get_jdbc_connection_properties, + get_jdbc_url_from_pg_uri, ) -from usaspending_api.common.etl.spark import _USAS_RDS_REF_TABLES, _BROKER_REF_TABLES, create_ref_temp_views from usaspending_api.common.helpers.sql_helpers import get_database_dsn_string from usaspending_api.config import CONFIG @@ -37,10 +42,17 @@ def test_jvm_sparksession(spark: SparkSession): sc = SparkContext._active_spark_context assert sc._jvm assert sc._jvm.SparkSession - assert not sc._jvm.SparkSession.getDefaultSession().get().sparkContext().isStopped() + assert ( + not sc._jvm.SparkSession.getDefaultSession() + .get() + .sparkContext() + .isStopped() + ) -def test_hive_metastore_db(spark: SparkSession, s3_unittest_data_bucket, hive_unittest_metastore_db): +def test_hive_metastore_db( + spark: SparkSession, s3_unittest_data_bucket, hive_unittest_metastore_db +): """Ensure that schemas and tables created are tracked in the hive metastore_db""" test_schema = "my_delta_test_schema" test_table = "my_delta_test_table" @@ -65,7 +77,9 @@ def test_hive_metastore_db(spark: SparkSession, s3_unittest_data_bucket, hive_un assert tables_in_test_schema[0]["tableName"] == test_table -def test_tmp_hive_metastore_db_empty_on_test_start(spark: SparkSession, hive_unittest_metastore_db): +def test_tmp_hive_metastore_db_empty_on_test_start( + spark: SparkSession, hive_unittest_metastore_db +): """Test that when using the spark test fixture, the metastore_db is configured to live in a tmp directory, so that schemas and tables created while under-test only live or are known for the duration of a SINGLE test, not a test SESSION. 
And test that the metastore used for unit tests is empty on each test run (except for the @@ -102,18 +116,55 @@ def test_spark_app_run_local_master(spark: SparkSession): def test_spark_write_csv_app_run(spark: SparkSession, s3_unittest_data_bucket): """More involved integration test that requires MinIO to be up as an s3 alternative.""" data = [ - {"first_col": "row 1", "id": str(uuid.uuid4()), "color": "blue", "numeric_val": random.randint(-100, 100)}, - {"first_col": "row 2", "id": str(uuid.uuid4()), "color": "green", "numeric_val": random.randint(-100, 100)}, - {"first_col": "row 3", "id": str(uuid.uuid4()), "color": "pink", "numeric_val": random.randint(-100, 100)}, - {"first_col": "row 4", "id": str(uuid.uuid4()), "color": "yellow", "numeric_val": random.randint(-100, 100)}, - {"first_col": "row 5", "id": str(uuid.uuid4()), "color": "red", "numeric_val": random.randint(-100, 100)}, - {"first_col": "row 6", "id": str(uuid.uuid4()), "color": "orange", "numeric_val": random.randint(-100, 100)}, - {"first_col": "row 7", "id": str(uuid.uuid4()), "color": "magenta", "numeric_val": random.randint(-100, 100)}, + { + "first_col": "row 1", + "id": str(uuid.uuid4()), + "color": "blue", + "numeric_val": random.randint(-100, 100), + }, + { + "first_col": "row 2", + "id": str(uuid.uuid4()), + "color": "green", + "numeric_val": random.randint(-100, 100), + }, + { + "first_col": "row 3", + "id": str(uuid.uuid4()), + "color": "pink", + "numeric_val": random.randint(-100, 100), + }, + { + "first_col": "row 4", + "id": str(uuid.uuid4()), + "color": "yellow", + "numeric_val": random.randint(-100, 100), + }, + { + "first_col": "row 5", + "id": str(uuid.uuid4()), + "color": "red", + "numeric_val": random.randint(-100, 100), + }, + { + "first_col": "row 6", + "id": str(uuid.uuid4()), + "color": "orange", + "numeric_val": random.randint(-100, 100), + }, + { + "first_col": "row 7", + "id": str(uuid.uuid4()), + "color": "magenta", + "numeric_val": random.randint(-100, 100), + }, ] df = spark.createDataFrame([Row(**data_row) for data_row in data]) # NOTE! NOTE! NOTE! 
MinIO locally does not support a TRAILING SLASH after object (folder) name - df.write.option("header", True).csv(f"s3a://{s3_unittest_data_bucket}" f"/{CONFIG.DELTA_LAKE_S3_PATH}/write_to_s3") + df.write.option("header", True).csv( + f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/write_to_s3" + ) # Verify there are *.csv part files in the chosen bucket s3_client = boto3.client( @@ -138,7 +189,7 @@ def _transaction_and_award_test_data(db): award=awd1, modification_number="1", awarding_agency_id=agency1.id, - last_modified_date=date(2012, 3, 1), + last_modified_date=datetime(2012, 3, 1), business_funds_indicator="a", record_type=1, total_funding_amount=1000.00, @@ -153,7 +204,7 @@ def _transaction_and_award_test_data(db): award=awd2, modification_number="1", awarding_agency_id=agency1.id, - last_modified_date=date(2012, 4, 1), + last_modified_date=datetime(2012, 4, 1), is_fpds=True, piid="abc", base_and_all_options_value=1000, @@ -161,7 +212,9 @@ def _transaction_and_award_test_data(db): assert TransactionFPDS.objects.all().count() == 1 -@mark.django_db(transaction=True) # must commit Django data for Spark to be able to read it +@mark.django_db( + transaction=True +) # must commit Django data for Spark to be able to read it def test_spark_write_to_s3_delta_from_db( _transaction_and_award_test_data, spark: SparkSession, @@ -174,18 +227,24 @@ def test_spark_write_to_s3_delta_from_db( pg_uri = get_database_dsn_string() jdbc_url = get_jdbc_url_from_pg_uri(pg_uri) if not jdbc_url.startswith("jdbc:postgresql://"): - raise ValueError("JDBC URL given is not in postgres JDBC URL format (e.g. jdbc:postgresql://...") + raise ValueError( + "JDBC URL given is not in postgres JDBC URL format (e.g. jdbc:postgresql://..." + ) schema_name = delta_lake_unittest_schema # ==== transaction_normalized ==== table_name = "vw_transaction_normalized" logger.info(f"Reading db records for {table_name} from connection: {jdbc_url}") - df = spark.read.jdbc(url=jdbc_url, table=table_name, properties=get_jdbc_connection_properties()) + df = spark.read.jdbc( + url=jdbc_url, table=table_name, properties=get_jdbc_connection_properties() + ) # NOTE! NOTE! NOTE! MinIO locally does not support a TRAILING SLASH after object (folder) name path = f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/{table_name}" - logger.info(f"Loading {df.count()} rows from DB to Delta table named {schema_name}.{table_name} at path {path}") + logger.info( + f"Loading {df.count()} rows from DB to Delta table named {schema_name}.{table_name} at path {path}" + ) # Create table in the metastore using DataFrame's schema and write data to the table df.write.saveAsTable( @@ -198,11 +257,15 @@ def test_spark_write_to_s3_delta_from_db( # ==== transaction_fabs ==== table_name = "vw_transaction_fabs" logger.info(f"Reading db records for {table_name} from connection: {jdbc_url}") - df = spark.read.jdbc(url=jdbc_url, table=table_name, properties=get_jdbc_connection_properties()) + df = spark.read.jdbc( + url=jdbc_url, table=table_name, properties=get_jdbc_connection_properties() + ) # NOTE! NOTE! NOTE! 
MinIO locally does not support a TRAILING SLASH after object (folder) name path = f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/{table_name}" - logger.info(f"Loading {df.count()} rows from DB to Delta table named {schema_name}.{table_name} at path {path}") + logger.info( + f"Loading {df.count()} rows from DB to Delta table named {schema_name}.{table_name} at path {path}" + ) # Create table in the metastore using DataFrame's schema and write data to the table df.write.saveAsTable( @@ -215,11 +278,15 @@ def test_spark_write_to_s3_delta_from_db( # ==== transaction_fpds ==== table_name = "vw_transaction_fpds" logger.info(f"Reading db records for {table_name} from connection: {jdbc_url}") - df = spark.read.jdbc(url=jdbc_url, table=table_name, properties=get_jdbc_connection_properties()) + df = spark.read.jdbc( + url=jdbc_url, table=table_name, properties=get_jdbc_connection_properties() + ) # NOTE! NOTE! NOTE! MinIO locally does not support a TRAILING SLASH after object (folder) name path = f"s3a://{s3_unittest_data_bucket}/{CONFIG.DELTA_LAKE_S3_PATH}/{table_name}" - logger.info(f"Loading {df.count()} rows from DB to Delta table named {schema_name}.{table_name} at path {path}") + logger.info( + f"Loading {df.count()} rows from DB to Delta table named {schema_name}.{table_name} at path {path}" + ) # Create table in the metastore using DataFrame's schema and write data to the table df.write.saveAsTable( @@ -238,7 +305,7 @@ def test_spark_write_to_s3_delta_from_db( # Now assert that we're still by-default using the unittest schema, by way of using that pytest fixture. # i.e. don't tell it what schema to look at - tables = spark.sql(f"show tables").collect() + tables = spark.sql("show tables").collect() assert len(tables) == 3 table_names = [t.tableName for t in tables] assert "vw_transaction_normalized" in table_names @@ -246,7 +313,9 @@ def test_spark_write_to_s3_delta_from_db( assert "vw_transaction_fpds" in table_names # Assert rows are present - assert spark.sql("select count(*) from vw_transaction_normalized").collect()[0][0] == 2 + assert ( + spark.sql("select count(*) from vw_transaction_normalized").collect()[0][0] == 2 + ) assert spark.sql("select count(*) from vw_transaction_fabs").collect()[0][0] == 1 assert spark.sql("select count(*) from vw_transaction_fpds").collect()[0][0] == 1 @@ -268,7 +337,9 @@ def test_create_ref_temp_views(spark: SparkSession): # verify the data in the temp view matches the dummy data for rds_ref_table in _USAS_RDS_REF_TABLES: - spark_count = spark.sql(f"select count(*) from global_temp.{rds_ref_table._meta.db_table}").collect()[0][0] + spark_count = spark.sql( + f"select count(*) from global_temp.{rds_ref_table._meta.db_table}" + ).collect()[0][0] assert rds_ref_table.objects.count() == spark_count # Setup for testing the Broker table(s) diff --git a/usaspending_api/etl/tests/unit/test_spark.py b/usaspending_api/etl/tests/unit/test_spark.py deleted file mode 100644 index 3d1c6cf0a3..0000000000 --- a/usaspending_api/etl/tests/unit/test_spark.py +++ /dev/null @@ -1,17 +0,0 @@ -from usaspending_api.etl.management.commands.load_query_to_delta import TABLE_SPEC as LOAD_QUERY_TABLE_SPEC -from usaspending_api.etl.management.commands.load_table_to_delta import TABLE_SPEC as LOAD_TABLE_TABLE_SPEC - - -def test_table_spec_consistency(): - table_spec_config_groups = { - "LOAD_QUERY_TABLE_SPEC": LOAD_QUERY_TABLE_SPEC, - "LOAD_TABLE_TABLE_SPEC": LOAD_TABLE_TABLE_SPEC, - } - for table_spec_group_name, table_spec_config_group in 
table_spec_config_groups.items(): - unioned_table_spec_keys = set() - for table_name, config in table_spec_config_group.items(): - unioned_table_spec_keys = unioned_table_spec_keys.union(set(list(config.keys()))) - for table_name, config in table_spec_config_group.items(): - diff = unioned_table_spec_keys - set(list(config.keys())) - if diff: - raise Exception(f"{table_name} is missing the following {table_spec_group_name} values: {diff}") diff --git a/usaspending_api/etl/transaction_delta_loaders/loaders.py b/usaspending_api/etl/transaction_delta_loaders/loaders.py index dc04dd0795..d19978876d 100644 --- a/usaspending_api/etl/transaction_delta_loaders/loaders.py +++ b/usaspending_api/etl/transaction_delta_loaders/loaders.py @@ -1,27 +1,34 @@ -import copy import logging from abc import ABC from datetime import datetime, timezone +from time import perf_counter from typing import Callable, Literal from delta import DeltaTable -from pyspark.sql import functions as sf, SparkSession, Window +from pyspark.sql import Column, DataFrame, functions as sf, SparkSession, Window +from pyspark.sql.types import ArrayType, StringType from usaspending_api.broker.helpers.build_business_categories_boolean_dict import fpds_boolean_columns +from usaspending_api.broker.helpers.get_business_categories import ( + get_business_categories_fabs, + get_business_categories_fpds, +) from usaspending_api.broker.helpers.last_load_date import ( get_earliest_load_date, + get_last_load_date, update_last_load_date, ) from usaspending_api.common.data_classes import TransactionColumn from usaspending_api.common.etl.spark import create_ref_temp_views - +from usaspending_api.etl.transaction_delta_loaders.utils import parse_date_column from usaspending_api.transactions.delta_models.transaction_fabs import ( FABS_TO_NORMALIZED_COLUMN_INFO, TRANSACTION_FABS_COLUMN_INFO, ) + from usaspending_api.transactions.delta_models.transaction_fpds import ( DAP_TO_NORMALIZED_COLUMN_INFO, TRANSACTION_FPDS_COLUMN_INFO, @@ -37,130 +44,134 @@ class AbstractDeltaTransactionLoader(ABC): id_col: str source_table: str col_info = list[TransactionColumn] - - def __init__(self, spark, etl_level: Literal["fabs", "fpds", "normalized"], spark_s3_bucket: str) -> None: + last_etl_load_date: datetime + + def __init__( + self, + spark, + etl_level: Literal["fabs", "fpds", "normalized"], + alt_last_load_date: str | None, + spark_s3_bucket: str, + ) -> None: self.etl_level = etl_level + if alt_last_load_date is not None: + self.last_etl_load_date = datetime.strptime(alt_last_load_date, "%Y-%m-%d %H:%M:%S") + else: + self.last_etl_load_date = get_last_load_date(f"transaction_{self.etl_level}") self.spark_s3_bucket: spark_s3_bucket self.spark = spark def load_transactions(self) -> None: + logger.info(f"LOADING TRANSACTIONS -- level: {self.etl_level}, last load date: {self.last_etl_load_date}") if not self.spark._jsparkSession.catalog().tableExists(f"int.transaction_{self.etl_level}"): raise Exception(f"Table: int.transaction_{self.etl_level} does not exist.") logger.info(f"Running UPSERT SQL for transaction_{self.etl_level} ETL") - self.spark.sql(self.transaction_merge_into_sql()) + self.transaction_merge() next_last_load = get_earliest_load_date( ("source_procurement_transaction", "source_assistance_transaction"), datetime.utcfromtimestamp(0) ) update_last_load_date(f"transaction_{self.etl_level}", next_last_load) - def build_date_format_sql(self, col: TransactionColumn, is_casted_to_date: bool = True) -> str: - # Each of these regexps allows for an optional 
timestamp portion, separated from the date by some character,
-        # and the timestamp allows for an optional UTC offset. In any case, the timestamp is ignored, though.
-        regexp_mmddYYYY = r"(\\d{2})(?<sep>[-/])(\\d{2})(\\k<sep>)(\\d{4})(.\\d{2}:\\d{2}:\\d{2}([+-]\\d{2}:\\d{2})?)?"
-        regexp_YYYYmmdd = r"(\\d{4})(?<sep>[-/]?)(\\d{2})(\\k<sep>)(\\d{2})(.\\d{2}:\\d{2}:\\d{2}([+-]\\d{2}:\\d{2})?)?"
-
-        mmddYYYY_fmt = f"""
-            (regexp_extract({self.source_table}.{col.source}, '{regexp_mmddYYYY}', 5)
-            || '-' ||
-            regexp_extract({self.source_table}.{col.source}, '{regexp_mmddYYYY}', 1)
-            || '-' ||
-            regexp_extract({self.source_table}.{col.source}, '{regexp_mmddYYYY}', 3))
-        """
-        YYYYmmdd_fmt = f"""
-            (regexp_extract({self.source_table}.{col.source}, '{regexp_YYYYmmdd}', 1)
-            || '-' ||
-            regexp_extract({self.source_table}.{col.source}, '{regexp_YYYYmmdd}', 3)
-            || '-' ||
-            regexp_extract({self.source_table}.{col.source}, '{regexp_YYYYmmdd}', 5))
-        """
-
-        if is_casted_to_date:
-            mmddYYYY_fmt = f"""CAST({mmddYYYY_fmt}
-            AS DATE)
-            """
-            YYYYmmdd_fmt = f"""CAST({YYYYmmdd_fmt}
-            AS DATE)
-            """
-
-        sql_snippet = f"""
-            CASE WHEN regexp({self.source_table}.{col.source}, '{regexp_mmddYYYY}')
-                 THEN {mmddYYYY_fmt}
-                 ELSE {YYYYmmdd_fmt}
-            END
-        """
-
-        return sql_snippet
-
-    def handle_column(self, col: TransactionColumn, is_result_aliased=True) -> str:
+    def handle_column(self, col: TransactionColumn, is_result_aliased=True) -> Column:
         if col.handling == "cast":
-            retval = f"CAST({self.source_table}.{col.source} AS {col.delta_type})"
+            retval = sf.col(f"{self.source_table}.{col.source}").cast(col.delta_type)
         elif col.handling == "literal":
             # Use col.source directly as the value
-            retval = f"{col.source}"
+            retval = sf.lit(col.source).cast(col.delta_type)
         elif col.handling == "parse_string_datetime_to_date":
             # These are string fields that actually hold DATES/TIMESTAMPS and need to be cast as dates.
             # However, they may not be properly parsed when calling CAST(... AS DATE).
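The comment above is the whole motivation for the regex-based date handling: with Spark's default `spark.sql.ansi.enabled=false`, `CAST(... AS DATE)` silently returns NULL for non-ISO layouts such as `MM/DD/YYYY`. A quick illustration using the `parse_date_column` helper added in `utils.py` later in this diff (assumes a live SparkSession):

```python
from pyspark.sql import SparkSession, functions as sf

from usaspending_api.etl.transaction_delta_loaders.utils import parse_date_column

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("03/01/2012",), ("2012-03-01",)], ["action_date"])

# Plain CAST: the MM/DD/YYYY row silently becomes NULL.
df.select(sf.col("action_date").cast("date").alias("cast_only")).show()

# The regex-based parser normalizes both layouts before casting.
df.select(parse_date_column("action_date").alias("parsed")).show()
```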
- retval = self.build_date_format_sql(col, is_casted_to_date=True) + retval = parse_date_column(col.source, table=self.source_table, is_casted_to_date=True) elif col.handling == "string_datetime_remove_timestamp": # These are string fields that actually hold DATES/TIMESTAMPS, but need the non-DATE part discarded, # even though they remain as strings - retval = self.build_date_format_sql(col, is_casted_to_date=False) + retval = parse_date_column(col.source, table=self.source_table, is_casted_to_date=False) elif col.delta_type.upper() == "STRING": # Capitalize and remove leading & trailing whitespace from all string values - retval = f"ucase(trim({self.source_table}.{col.source}))" + retval = sf.upper(sf.trim(sf.col(f"{self.source_table}.{col.source}"))) elif col.delta_type.upper() == "BOOLEAN" and not col.handling == "leave_null": # Unless specified, convert any nulls to false for boolean columns - retval = f"COALESCE({self.source_table}.{col.source}, FALSE)" + retval = sf.coalesce(sf.col(f"{self.source_table}.{col.source}"), sf.lit(False)) else: - retval = f"{self.source_table}.{col.source}" + retval = sf.col(f"{self.source_table}.{col.source}") # Handle scalar transformations if the column requires it if col.scalar_transformation is not None: - retval = col.scalar_transformation.format(input=retval) + retval = col.scalar_transformation(retval) - retval = f"{retval}{' AS ' + col.dest_name if is_result_aliased else ''}" + retval = retval.alias(col.dest_name) if is_result_aliased else retval return retval @property - def select_columns(self) -> list[str]: - return ["CAST(NULL AS LONG) AS transaction_id"] + [ + def select_columns(self) -> list[Column]: + return [sf.lit(None).cast("LONG").alias("transaction_id")] + [ self.handle_column(col) for col in self.col_info if col.dest_name != "transaction_id" ] - def source_subquery_sql(self) -> str: - select_columns_str = ",\n ".join(self.select_columns) - sql = f""" - SELECT - {select_columns_str} - FROM {self.source_table} - """ - return sql - - def transaction_merge_into_sql(self) -> str: - silver_table_cols = ", ".join([col.dest_name for col in self.col_info if col.dest_name != "transaction_id"]) - sql = f""" - MERGE INTO int.transaction_{self.etl_level} AS silver_table - USING ( - {self.source_subquery_sql()} - ) AS source_subquery - ON - silver_table.{self.id_col} = source_subquery.{self.id_col} - AND silver_table.hash = source_subquery.hash - WHEN NOT MATCHED - THEN INSERT - ({silver_table_cols}) - VALUES ({silver_table_cols}) - WHEN NOT MATCHED BY SOURCE - THEN DELETE - """ - - return sql + def to_insert_df(self) -> DataFrame: + df = ( + self.spark.read.format("delta") + .option("readChangeFeed", "true") + .option("startingVersion", 0) + .table(self.source_table) + .filter( + sf.col("_change_type").isin(["insert", "update_postimage"]) + & (sf.col("_commit_timestamp") > self.last_etl_load_date) + ) + .select(self.select_columns) + ) + return df + + def to_delete_df(self, id_col) -> DataFrame: + version_window = Window.partitionBy(id_col, "hash", "_commit_version") + df = ( + self.spark.read.format("delta") + .option("readChangeFeed", "true") + .option("startingVersion", 0) + .table(self.source_table) + .withColumn("has_insert", sf.max(sf.col("_change_type") == "insert").over(version_window)) + .filter( + (sf.col("_change_type") == sf.lit("delete")) + & (sf.col("_commit_timestamp") > self.last_etl_load_date) + & ~sf.col("has_insert") + ) + .select(id_col, "hash", "action_year", "action_month") + ) + return df + + def transaction_merge(self) -> 
None: + source = self.to_insert_df().alias("s") + target = DeltaTable.forName(self.spark, f"int.transaction_{self.etl_level}").alias("t") + id_condition = f"t.{self.id_col} == s.{self.id_col}" + hash_condition = "t.hash == s.hash" + partition_pruning_conditions = "t.action_year == s.action_year AND t.action_month == s.action_month" + ( + target.merge(source, " AND ".join([id_condition, hash_condition, partition_pruning_conditions])) + .whenNotMatchedInsert( + values={ + col.dest_name: sf.col(f"s.{col.dest_name}") + for col in self.col_info + if col.dest_name != "transaction_id" + }, + ) + .execute() + ) + ( + target.merge( + self.to_delete_df(self.id_col).alias("s"), + " AND ".join([id_condition, hash_condition, partition_pruning_conditions]), + ) + .whenMatchedDelete() + .execute() + ) class FPDSDeltaTransactionLoader(AbstractDeltaTransactionLoader): - def __init__(self, spark: SparkSession, spark_s3_bucket: str) -> None: - super().__init__(spark=spark, etl_level="fpds", spark_s3_bucket=spark_s3_bucket) + def __init__(self, spark: SparkSession, alt_last_load_date: str | None, spark_s3_bucket: str) -> None: + super().__init__( + spark=spark, etl_level="fpds", alt_last_load_date=alt_last_load_date, spark_s3_bucket=spark_s3_bucket + ) self.id_col = "detached_award_proc_unique" self.source_table = "raw.detached_award_procurement" self.col_info = TRANSACTION_FPDS_COLUMN_INFO @@ -168,8 +179,10 @@ def __init__(self, spark: SparkSession, spark_s3_bucket: str) -> None: class FABSDeltaTransactionLoader(AbstractDeltaTransactionLoader): - def __init__(self, spark: SparkSession, spark_s3_bucket: str) -> None: - super().__init__(spark=spark, etl_level="fabs", spark_s3_bucket=spark_s3_bucket) + def __init__(self, spark: SparkSession, alt_last_load_date: str | None, spark_s3_bucket: str) -> None: + super().__init__( + spark=spark, etl_level="fabs", alt_last_load_date=alt_last_load_date, spark_s3_bucket=spark_s3_bucket + ) self.id_col = "afa_generated_unique" self.source_table = "raw.published_fabs" self.col_info = TRANSACTION_FABS_COLUMN_INFO @@ -179,41 +192,61 @@ class NormalizedMixin: spark: SparkSession handle_column: Callable + to_delete_df: Callable source_table: str + id_col: str + source_id_col: str etl_level: str + last_etl_load_date: datetime select_columns: list[str] to_normalized_col_info: list[TransactionColumn] normalization_type: Literal["fabs", "fpds"] - prepare_spark: Callable - def source_subquery_sql(self) -> str: - additional_joins = f""" - LEFT OUTER JOIN global_temp.subtier_agency AS funding_subtier_agency ON ( - funding_subtier_agency.subtier_code = {self.source_table}.funding_sub_tier_agency_co + def to_insert_df(self) -> DataFrame: + funding_subtier_agency = self.spark.table("global_temp.subtier_agency").alias("funding_subtier_agency") + funding_agency = self.spark.table("global_temp.agency").alias("funding_agency") + awarding_subtier_agency = ( + self.spark.table("global_temp.subtier_agency") + .withColumn("awarding_subtier_agency_id", sf.col("subtier_agency_id")) + .alias("awarding_subtier_agency") + ) + awarding_agency = self.spark.table("global_temp.agency").alias("awarding_agency") + df = ( + self.spark.read.format("delta") + .option("readChangeFeed", "true") + .option("startingVersion", 0) + .table(self.source_table) + .filter( + sf.col("_change_type").isin(["insert", "update_postimage"]) + & (sf.col("_commit_timestamp") > self.last_etl_load_date) + ) + ) + result = ( + df.join( + funding_subtier_agency, + funding_subtier_agency.subtier_code == df.funding_sub_tier_agency_co, 
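+                # These DataFrame joins replace the removed "LEFT OUTER JOIN
+                # global_temp.subtier_agency/agency" SQL below; the global_temp
+                # reference views are created by create_ref_temp_views(self.spark).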
+ how="leftouter", ) - LEFT OUTER JOIN global_temp.agency AS funding_agency ON ( - funding_agency.subtier_agency_id = funding_subtier_agency.subtier_agency_id + .join( + funding_agency, + funding_agency.subtier_agency_id == funding_subtier_agency.subtier_agency_id, + how="leftouter", ) - LEFT OUTER JOIN global_temp.subtier_agency AS awarding_subtier_agency ON ( - awarding_subtier_agency.subtier_code = {self.source_table}.awarding_sub_tier_agency_c + .join( + awarding_subtier_agency, + awarding_subtier_agency.subtier_code == df.awarding_sub_tier_agency_c, + how="leftouter", ) - LEFT OUTER JOIN global_temp.agency AS awarding_agency ON ( - awarding_agency.subtier_agency_id = awarding_subtier_agency.subtier_agency_id + .join( + awarding_agency, + awarding_agency.subtier_agency_id == awarding_subtier_agency.awarding_subtier_agency_id, + how="leftouter", ) - """ - - # Since the select columns may have complicated logic, put them on separate lines for debugging. - # However, strings inside {} expressions in f-strings can't contain backslashes, so will join them first - # before inserting into overall sql statement. - select_columns_str = ",\n ".join(self.select_columns) - return f""" - SELECT - {select_columns_str} - FROM {self.source_table} - {additional_joins} - """ - - def transaction_merge_into_sql(self) -> str: + .select(self.select_columns) + ) + return result + + def transaction_merge(self) -> None: create_ref_temp_views(self.spark) load_datetime = datetime.now(timezone.utc) special_columns = ["create_date", "update_date"] @@ -227,33 +260,35 @@ def transaction_merge_into_sql(self) -> str: set_cols.append(f"""int.transaction_normalized.update_date = '{load_datetime.isoformat(" ")}'""") # Move create_date and update_date to the end of the list of column names for ease of handling # during record insert - insert_col_name_list = [ - col_name for col_name in TRANSACTION_NORMALIZED_COLUMNS if col_name not in special_columns - ] - insert_col_name_list.extend(special_columns) - insert_col_names = ", ".join([col_name for col_name in insert_col_name_list]) + insert_col_names = [col_name for col_name in TRANSACTION_NORMALIZED_COLUMNS if col_name not in special_columns] + insert_col_names.extend(special_columns) # On insert, all values except for create_date and update_date will come from the subquery - insert_value_list = insert_col_name_list[:-2] - insert_value_list.extend([f"""'{load_datetime.isoformat(" ")}'"""] * 2) - insert_values = ", ".join([value for value in insert_value_list]) - - sql = f""" - MERGE INTO int.transaction_normalized - USING ( - {self.source_subquery_sql()} - ) AS source_subquery - ON transaction_normalized.transaction_unique_id = source_subquery.transaction_unique_id - AND transaction_normalized.hash = source_subquery.hash - WHEN NOT MATCHED - THEN INSERT - ({insert_col_names}) - VALUES ({insert_values}) - WHEN NOT MATCHED BY SOURCE AND {'NOT' if self.normalization_type== 'fabs' else ''} transaction_normalized.is_fpds - THEN DELETE - """ - - return sql + insert_values = [sf.col(col) for col in insert_col_names[:-2]] + insert_values.extend([sf.lit(f"{load_datetime.isoformat(sep=' ')}")] * 2) + + target = DeltaTable.forName(self.spark, "int.transaction_normalized").alias("t") + id_condition = "t.transaction_unique_id = s.transaction_unique_id" + hash_condition = "t.hash == s.hash" + type_partition_condition = f"{'NOT' if self.normalization_type == 'fabs' else ''} t.is_fpds" + partition_pruning_conditions = "t.action_year == s.action_year AND t.action_month == s.action_month" + 
(
+            target.merge(
+                self.to_insert_df().alias("s"),
+                " AND ".join([id_condition, hash_condition, type_partition_condition, partition_pruning_conditions]),
+            )
+            .whenNotMatchedInsert(values=dict(zip(insert_col_names, insert_values)))
+            .execute()
+        )
+        delete_id_condition = f"t.transaction_unique_id = s.{self.source_id_col}"
+        (
+            target.merge(
+                self.to_delete_df(self.source_id_col).alias("s"),
+                " AND ".join([delete_id_condition, hash_condition, partition_pruning_conditions]),
+            )
+            .whenMatchedDelete()
+            .execute()
+        )
 
     def populate_transaction_normalized_ids(self) -> None:
         target = DeltaTable.forName(self.spark, "int.transaction_normalized").alias("t")
@@ -310,23 +345,40 @@ def populate_award_ids(self) -> None:
         w = Window.orderBy(needs_ids.unique_award_key)
         with_ids = needs_ids.withColumn("award_id", (max_id + sf.row_number().over(w)).cast("LONG")).alias("s")
         (
-            target.merge(with_ids, f"t.unique_award_key = s.unique_award_key")
+            target.merge(with_ids, "t.unique_award_key = s.unique_award_key")
             .whenMatchedUpdate(set={"t.award_id": "s.award_id"})
             .execute()
         )
 
     def load_transactions(self) -> None:
+        start = perf_counter()
+        logger.info("Loading transactions...")
         super().load_transactions()
+        s1 = perf_counter()
+        logger.info(f"Loading transactions took {s1 - start:.2f} seconds.")
+        logger.info("Populating award ids...")
         self.populate_award_ids()
+        s2 = perf_counter()
+        logger.info(f"Populating awards took {s2 - s1:.2f} seconds.")
+        logger.info("Populating transaction normalized ids...")
         self.populate_transaction_normalized_ids()
+        s3 = perf_counter()
+        logger.info(f"Populating normalized ids took {s3 - s2:.2f} seconds.")
+        logger.info("Linking transactions to normalized...")
         self.link_transactions_to_normalized()
+        s4 = perf_counter()
+        logger.info(f"Linking took {s4 - s3:.2f} seconds.")
+        logger.info(f"Total time {s4 - start:.2f} seconds.")
 
 
 class FABSNormalizedDeltaTransactionLoader(NormalizedMixin, AbstractDeltaTransactionLoader):
-    def __init__(self, spark: SparkSession, spark_s3_bucket: str) -> None:
-        super().__init__(spark=spark, etl_level="normalized", spark_s3_bucket=spark_s3_bucket)
+    def __init__(self, spark: SparkSession, alt_last_load_date: str | None, spark_s3_bucket: str) -> None:
+        super().__init__(
+            spark=spark, etl_level="normalized", alt_last_load_date=alt_last_load_date, spark_s3_bucket=spark_s3_bucket
+        )
         self.id_col = "transaction_unique_id"
+        self.source_id_col = "afa_generated_unique"
         self.source_table = "raw.published_fabs"
         self.to_normalized_col_info = FABS_TO_NORMALIZED_COLUMN_INFO
         self.normalization_type = "fabs"
@@ -336,32 +388,26 @@ def select_columns(self) -> list[str]:
         action_date_col = next(
             filter(lambda c: c.dest_name == "action_date" and c.source == "action_date", FABS_TO_NORMALIZED_COLUMN_INFO)
         )
-        parse_action_date_sql_snippet = self.handle_column(action_date_col, is_result_aliased=False)
+        parse_action_date_snippet = self.handle_column(action_date_col, is_result_aliased=False)
         select_cols = [
-            "CAST(NULL AS LONG) AS id",
-            "CAST(NULL AS LONG) AS award_id",
-            "awarding_agency.id AS awarding_agency_id",
-            f"""CASE WHEN month({parse_action_date_sql_snippet}) > 9
-                THEN year({parse_action_date_sql_snippet}) + 1
-                ELSE year({parse_action_date_sql_snippet})
-                END AS fiscal_year""",
-            "funding_agency.id AS funding_agency_id",
+            sf.lit(None).cast("LONG").alias("id"),
+            sf.lit(None).cast("LONG").alias("award_id"),
+            sf.col("awarding_agency.id").alias("awarding_agency_id"),
+            sf.when(sf.month(parse_action_date_snippet) > sf.lit(9),
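+                # Federal fiscal year: action dates with month > 9 (Oct-Dec) roll into the next year.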
sf.year(parse_action_date_snippet) + sf.lit(1)) + .otherwise(sf.year(parse_action_date_snippet)) + .alias("fiscal_year"), + sf.col("funding_agency.id").alias("funding_agency_id"), ] - select_cols.extend( - [ - # business_categories - f"get_business_categories_fabs({self.source_table}.business_types) AS business_categories", - # funding_amount - # In theory, this should be equal to - # CAST(COALESCE({bronze_table_name}.federal_action_obligation, 0) - # + COALESCE({bronze_table_name}.non_federal_funding_amount, 0) - # AS NUMERIC(23, 2)) - # However, for some historical records, this isn't true. - f""" - CAST({self.source_table}.total_funding_amount AS NUMERIC(23, 2)) AS funding_amount - """, - ] + get_business_categories_fabs_udf = sf.udf( + lambda x: get_business_categories_fabs(x), + ArrayType(StringType()), ) + select_cols = select_cols + [ + get_business_categories_fabs_udf(sf.col(f"{self.source_table}.business_types")).alias( + "business_categories" + ), + sf.expr(f"CAST({self.source_table}.total_funding_amount AS NUMERIC(23, 2)) AS funding_amount"), + ] for col in FABS_TO_NORMALIZED_COLUMN_INFO: select_cols.append(self.handle_column(col)) @@ -370,9 +416,12 @@ def select_columns(self) -> list[str]: class FPDSNormalizedDeltaTransactionLoader(NormalizedMixin, AbstractDeltaTransactionLoader): - def __init__(self, spark, spark_s3_bucket: str) -> None: - super().__init__(spark=spark, etl_level="normalized", spark_s3_bucket=spark_s3_bucket) + def __init__(self, spark, alt_last_load_date: str | None, spark_s3_bucket: str) -> None: + super().__init__( + spark=spark, etl_level="normalized", alt_last_load_date=alt_last_load_date, spark_s3_bucket=spark_s3_bucket + ) self.id_col = "transaction_unique_id" + self.source_id_col = "detached_award_proc_unique" self.source_table = "raw.detached_award_procurement" self.to_normalized_col_info = DAP_TO_NORMALIZED_COLUMN_INFO self.normalization_type = "fpds" @@ -382,27 +431,28 @@ def select_columns(self) -> list[str]: action_date_col = next( filter(lambda c: c.dest_name == "action_date" and c.source == "action_date", DAP_TO_NORMALIZED_COLUMN_INFO) ) - parse_action_date_sql_snippet = self.handle_column(action_date_col, is_result_aliased=False) + parse_action_date_snippet = self.handle_column(action_date_col, is_result_aliased=False) select_cols = [ - "CAST(NULL AS LONG) AS id", - "CAST(NULL AS LONG) AS award_id", - "awarding_agency.id AS awarding_agency_id", - f"""CASE WHEN month({parse_action_date_sql_snippet}) > 9 - THEN year({parse_action_date_sql_snippet}) + 1 - ELSE year({parse_action_date_sql_snippet}) - END AS fiscal_year""", - "funding_agency.id AS funding_agency_id", + sf.lit(None).cast("LONG").alias("id"), + sf.lit(None).cast("LONG").alias("award_id"), + sf.col("awarding_agency.id").alias("awarding_agency_id"), + sf.when(sf.month(parse_action_date_snippet) > sf.lit(9), sf.year(parse_action_date_snippet) + sf.lit(1)) + .otherwise(sf.year(parse_action_date_snippet)) + .alias("fiscal_year"), + sf.col("funding_agency.id").alias("funding_agency_id"), + ] + fpds_business_category_columns = [ + sf.col(col) for col in fpds_boolean_columns + ["contracting_officers_deter", "domestic_or_foreign_entity"] ] - fpds_business_category_columns = copy.copy(fpds_boolean_columns) - # Add a couple of non-boolean columns that are needed in the business category logic - fpds_business_category_columns.extend(["contracting_officers_deter", "domestic_or_foreign_entity"]) - named_struct_text = ", ".join([f"'{col}', {self.source_table}.{col}" for col in 
fpds_business_category_columns])
+        get_business_categories_fpds_udf = sf.udf(lambda x: get_business_categories_fpds(x), ArrayType(StringType()))
         select_cols.extend(
             [
-                # business_categories
-                f"get_business_categories_fpds(named_struct({named_struct_text})) AS business_categories",
+                get_business_categories_fpds_udf(sf.struct(*fpds_business_category_columns)).alias(
+                    "business_categories"
+                ),
                 # type
-                f"""
+                sf.expr(
+                    f"""
                 CASE WHEN {self.source_table}.pulled_from <> 'IDV' THEN {self.source_table}.contract_award_type
                      WHEN {self.source_table}.idv_type = 'B' AND {self.source_table}.type_of_idc IS NOT NULL THEN 'IDV_B_' || {self.source_table}.type_of_idc
@@ -419,9 +469,11 @@ def select_columns(self) -> list[str]:
                      THEN 'IDV_B_C'
                      ELSE 'IDV_' || {self.source_table}.idv_type
                 END AS type
-            """,
+            """
+                ),
                 # type_description
-                f"""
+                sf.expr(
+                    f"""
                 CASE WHEN {self.source_table}.pulled_from <> 'IDV' THEN {self.source_table}.contract_award_type_desc
                      WHEN {self.source_table}.idv_type = 'B'
@@ -432,7 +484,8 @@ def select_columns(self) -> list[str]:
                      THEN 'INDEFINITE DELIVERY CONTRACT'
                      ELSE {self.source_table}.idv_type_description
                 END AS type_description
-            """,
+            """
+                ),
             ]
         )
         for col in DAP_TO_NORMALIZED_COLUMN_INFO:
diff --git a/usaspending_api/etl/transaction_delta_loaders/utils.py b/usaspending_api/etl/transaction_delta_loaders/utils.py
new file mode 100644
index 0000000000..fbc12ab88c
--- /dev/null
+++ b/usaspending_api/etl/transaction_delta_loaders/utils.py
@@ -0,0 +1,27 @@
+from pyspark.sql import Column, functions as sf
+
+
+def parse_date_column(column: str, table: str | None = None, is_casted_to_date: bool = True) -> Column:
+    column_ref = sf.col((f"{table}." if table else "") + column)
+    regexp_mmddYYYY = r"(\d{2})(?<sep>[-/])(\d{2})(\k<sep>)(\d{4})(.\d{2}:\d{2}:\d{2}([+-]\d{2}:\d{2})?)?"
+    regexp_YYYYmmdd = r"(\d{4})(?<sep>[-/]?)(\d{2})(\k<sep>)(\d{2})(.\d{2}:\d{2}:\d{2}([+-]\d{2}:\d{2})?)?"
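+    # Note: these are Java-regex named groups (Spark's regexp functions use the
+    # Java flavor); \k<sep> forces the second separator to match whatever
+    # (?<sep>...) captured, so a mixed string like '03-01/2012' does not match
+    # regexp_mmddYYYY. The optional trailing groups accept a timestamp portion,
+    # which is matched but never extracted.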
+    mmddYYYY_fmt = sf.concat(
+        sf.regexp_extract(column_ref, regexp_mmddYYYY, 5),
+        sf.lit("-"),
+        sf.regexp_extract(column_ref, regexp_mmddYYYY, 1),
+        sf.lit("-"),
+        sf.regexp_extract(column_ref, regexp_mmddYYYY, 3),
+    )
+    YYYYmmdd_fmt = sf.concat(
+        sf.regexp_extract(column_ref, regexp_YYYYmmdd, 1),
+        sf.lit("-"),
+        sf.regexp_extract(column_ref, regexp_YYYYmmdd, 3),
+        sf.lit("-"),
+        sf.regexp_extract(column_ref, regexp_YYYYmmdd, 5),
+    )
+    if is_casted_to_date:
+        mmddYYYY_fmt = mmddYYYY_fmt.cast("date")
+        YYYYmmdd_fmt = YYYYmmdd_fmt.cast("date")
+    return sf.when(sf.regexp_extract(column_ref, regexp_mmddYYYY, 0) != sf.lit(""), mmddYYYY_fmt).otherwise(
+        YYYYmmdd_fmt
+    )
diff --git a/usaspending_api/idvs/tests/integration/test_awards_idv_v2.py b/usaspending_api/idvs/tests/integration/test_awards_idv_v2.py
index d97ca5b2d6..a014d44515 100644
--- a/usaspending_api/idvs/tests/integration/test_awards_idv_v2.py
+++ b/usaspending_api/idvs/tests/integration/test_awards_idv_v2.py
@@ -1,24 +1,45 @@
 import json
-import pytest
+
+import pytest
 from model_bakery import baker
 from rest_framework import status
 
-from usaspending_api.references.models import ToptierAgency, SubtierAgency
+from usaspending_api.references.models import SubtierAgency, ToptierAgency
 
 
 @pytest.fixture
 def awards_and_transactions(db):
-    subag = {"pk": 1, "name": "agency name", "abbreviation": "some other stuff"}
-    baker.make("references.SubtierAgency", subtier_code="def", **subag, _fill_optional=True)
-    baker.make("references.ToptierAgency", toptier_code="abc", **subag, _fill_optional=True)
+    subag = {"pk": 1, "name": "agency name", "abbreviation": "some other stuff"}
+    baker.make(
+        "references.SubtierAgency", subtier_code="def", **subag, _fill_optional=True
+    )
+    baker.make(
+        "references.ToptierAgency", toptier_code="abc", **subag, _fill_optional=True
+    )
 
-    duns = {"awardee_or_recipient_uniqu": "123", "uei": "ABC", "legal_business_name": "Sams Club"}
-    parent_recipient_lookup = {"duns": "123", "uei": "ABC", "recipient_hash": "cfd3f3f5-2162-7679-9f6b-429cecaa3e1e"}
-    recipient_lookup = {"duns": "456", "uei": "DEF", "recipient_hash": "66545a8d-bf37-3eda-cce5-29c6170c9aab"}
-    parent_recipient_profile = {"recipient_hash": "cfd3f3f5-2162-7679-9f6b-429cecaa3e1e", "recipient_level": "P"}
-    recipient_profile = {"recipient_hash": "66545a8d-bf37-3eda-cce5-29c6170c9aab", "recipient_level": "C"}
+    duns = {
+        "awardee_or_recipient_uniqu": "123",
+        "uei": "ABC",
+        "legal_business_name": "Sams Club",
+    }
+    parent_recipient_lookup = {
+        "duns": "123",
+        "uei": "ABC",
+        "recipient_hash": "cfd3f3f5-2162-7679-9f6b-429cecaa3e1e",
+    }
+    recipient_lookup = {
+        "duns": "456",
+        "uei": "DEF",
+        "recipient_hash": "66545a8d-bf37-3eda-cce5-29c6170c9aab",
+    }
+    parent_recipient_profile = {
+        "recipient_hash": "cfd3f3f5-2162-7679-9f6b-429cecaa3e1e",
+        "recipient_level": "P",
+    }
+    recipient_profile = {
+        "recipient_hash": "66545a8d-bf37-3eda-cce5-29c6170c9aab",
+        "recipient_level": "C",
+    }
     baker.make("references.Cfda", program_number=1234)
     baker.make("recipient.DUNS", **duns)
     baker.make("recipient.RecipientLookup", **parent_recipient_lookup)
@@ -26,14 +47,32 @@ def awards_and_transactions(db):
     baker.make("recipient.RecipientProfile", **parent_recipient_profile)
     baker.make("recipient.RecipientProfile", **recipient_profile)
 
-    ag = {"pk": 1, "toptier_agency": ToptierAgency.objects.get(pk=1), "subtier_agency": SubtierAgency.objects.get(pk=1)}
+    ag = {
+        "pk": 1,
+        "toptier_agency": ToptierAgency.objects.get(pk=1),
+        "subtier_agency": SubtierAgency.objects.get(pk=1),
+    }
     baker.make("references.Agency", **ag, _fill_optional=True)
 
-
baker.make("references.PSC", code="4730", description="HOSE, PIPE, TUBE, LUBRICATION, AND RAILING FITTINGS") - baker.make("references.PSC", code="47", description="PIPE, TUBING, HOSE, AND FITTINGS") + baker.make( + "references.PSC", + code="4730", + description="HOSE, PIPE, TUBE, LUBRICATION, AND RAILING FITTINGS", + ) + baker.make( + "references.PSC", code="47", description="PIPE, TUBING, HOSE, AND FITTINGS" + ) - baker.make("references.NAICS", code="333911", description="PUMP AND PUMPING EQUIPMENT MANUFACTURING") - baker.make("references.NAICS", code="3339", description="Other General Purpose Machinery Manufacturing") + baker.make( + "references.NAICS", + code="333911", + description="PUMP AND PUMPING EQUIPMENT MANUFACTURING", + ) + baker.make( + "references.NAICS", + code="3339", + description="Other General Purpose Machinery Manufacturing", + ) baker.make("references.NAICS", code="33", description="Manufacturing") award_1_model = { @@ -102,7 +141,13 @@ def awards_and_transactions(db): baker.make("search.AwardSearch", **award_2_model) baker.make("search.AwardSearch", **award_3_model) - asst_data = {"is_fpds": False, "transaction_id": 1, "award_id": 1, "cfda_number": 1234, "cfda_title": "farms"} + asst_data = { + "is_fpds": False, + "transaction_id": 1, + "award_id": 1, + "cfda_number": 1234, + "cfda_title": "farms", + } baker.make("search.TransactionSearch", **asst_data) latest_transaction_contract_data = { @@ -150,7 +195,7 @@ def awards_and_transactions(db): "is_fpds": True, "labor_standards": None, "labor_standards_descrip": "NO", - "last_modified_date": "2018-08-24", + "last_modified_date": "2018-08-24 00:00:00+00", "legal_entity_address_line1": "123 main st", "legal_entity_address_line2": None, "legal_entity_address_line3": None, @@ -270,7 +315,7 @@ def awards_and_transactions(db): "is_fpds": True, "labor_standards": None, "labor_standards_descrip": "NO", - "last_modified_date": "2018-08-24", + "last_modified_date": "2018-08-24 00:00:00+00", "legal_entity_address_line1": "123 main st", "legal_entity_address_line2": None, "legal_entity_address_line3": None, @@ -340,7 +385,10 @@ def awards_and_transactions(db): "funding_office_name": "funding_office", } baker.make("search.TransactionSearch", **latest_transaction_contract_data) - baker.make("search.TransactionSearch", **latest_transaction_contract_data_without_recipient_name_or_id) + baker.make( + "search.TransactionSearch", + **latest_transaction_contract_data_without_recipient_name_or_id, + ) @pytest.mark.django_db @@ -353,7 +401,10 @@ def test_no_data_idv_award_endpoint(client): @pytest.mark.django_db def test_award_endpoint_different_ids(client, awards_and_transactions): - resp = client.get("/api/v2/awards/CONT_AWD_03VD_9700_SPM30012D3486_9700/", content_type="application/json") + resp = client.get( + "/api/v2/awards/CONT_AWD_03VD_9700_SPM30012D3486_9700/", + content_type="application/json", + ) assert resp.status_code == status.HTTP_200_OK assert json.loads(resp.content.decode("utf-8")) == expected_response_idv @@ -366,7 +417,10 @@ def test_award_endpoint_different_ids(client, awards_and_transactions): def test_award_endpoint_for_null_recipient_information(client, awards_and_transactions): resp = client.get("/api/v2/awards/3/", content_type="application/json") assert resp.status_code == status.HTTP_200_OK - assert json.loads(resp.content.decode("utf-8")).get("recipient") == recipient_without_id_and_name + assert ( + json.loads(resp.content.decode("utf-8")).get("recipient") + == recipient_without_id_and_name + ) 
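A side note on the `_fill_optional=True` flag used throughout this fixture: by default model_bakery leaves nullable/blank fields empty, and the flag tells it to generate values for them instead, which keeps serializers from tripping over unexpected NULLs. A minimal illustration (model name reused from the fixture above; run under a Django test database):

```python
from model_bakery import baker

# Explicit kwargs are honored; every other optional field gets a generated
# value instead of being left NULL.
subtier = baker.make("references.SubtierAgency", subtier_code="def", _fill_optional=True)
assert subtier.subtier_code == "def"
assert subtier.name is not None  # filled rather than left empty
```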
expected_response_idv = { @@ -387,15 +441,33 @@ def test_award_endpoint_for_null_recipient_information(client, awards_and_transa "awarding_agency": { "id": 1, "has_agency_page": False, - "toptier_agency": {"name": "agency name", "abbreviation": "some other stuff", "code": "abc", "slug": None}, - "subtier_agency": {"name": "agency name", "abbreviation": "some other stuff", "code": "def"}, + "toptier_agency": { + "name": "agency name", + "abbreviation": "some other stuff", + "code": "abc", + "slug": None, + }, + "subtier_agency": { + "name": "agency name", + "abbreviation": "some other stuff", + "code": "def", + }, "office_agency_name": "awarding_office", }, "funding_agency": { "id": 1, "has_agency_page": False, - "toptier_agency": {"name": "agency name", "abbreviation": "some other stuff", "code": "abc", "slug": None}, - "subtier_agency": {"name": "agency name", "abbreviation": "some other stuff", "code": "def"}, + "toptier_agency": { + "name": "agency name", + "abbreviation": "some other stuff", + "code": "abc", + "slug": None, + }, + "subtier_agency": { + "name": "agency name", + "abbreviation": "some other stuff", + "code": "def", + }, "office_agency_name": "funding_office", }, "recipient": { @@ -531,14 +603,26 @@ def test_award_endpoint_for_null_recipient_information(client, awards_and_transa "date_signed": "2004-03-02", "naics_hierarchy": { "toptier_code": {"description": "Manufacturing", "code": "33"}, - "midtier_code": {"description": "Other General Purpose Machinery Manufacturing", "code": "3339"}, - "base_code": {"description": "PUMP AND PUMPING EQUIPMENT MANUFACTURING", "code": "333911"}, + "midtier_code": { + "description": "Other General Purpose Machinery Manufacturing", + "code": "3339", + }, + "base_code": { + "description": "PUMP AND PUMPING EQUIPMENT MANUFACTURING", + "code": "333911", + }, }, "psc_hierarchy": { "toptier_code": {}, - "midtier_code": {"description": "PIPE, TUBING, HOSE, AND FITTINGS", "code": "47"}, + "midtier_code": { + "description": "PIPE, TUBING, HOSE, AND FITTINGS", + "code": "47", + }, "subtier_code": {}, - "base_code": {"description": "HOSE, PIPE, TUBE, LUBRICATION, AND RAILING FITTINGS", "code": "4730"}, + "base_code": { + "description": "HOSE, PIPE, TUBE, LUBRICATION, AND RAILING FITTINGS", + "code": "4730", + }, }, "account_obligations_by_defc": [], "account_outlays_by_defc": [], diff --git a/usaspending_api/search/delta_models/award_search.py b/usaspending_api/search/delta_models/award_search.py index 756bfe839e..8c417ffbed 100644 --- a/usaspending_api/search/delta_models/award_search.py +++ b/usaspending_api/search/delta_models/award_search.py @@ -1,14 +1,30 @@ from usaspending_api.awards.v2.lookups.lookups import award_type_mapping AWARD_SEARCH_COLUMNS = { - "treasury_account_identifiers": {"delta": "ARRAY", "postgres": "INTEGER[]", "gold": False}, - "award_id": {"delta": "LONG NOT NULL", "postgres": "BIGINT NOT NULL", "gold": False}, + "treasury_account_identifiers": { + "delta": "ARRAY", + "postgres": "INTEGER[]", + "gold": False, + }, + "award_id": { + "delta": "LONG NOT NULL", + "postgres": "BIGINT NOT NULL", + "gold": False, + }, "data_source": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "transaction_unique_id": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "latest_transaction_id": {"delta": "LONG", "postgres": "BIGINT", "gold": True}, "earliest_transaction_id": {"delta": "LONG", "postgres": "BIGINT", "gold": True}, - "latest_transaction_search_id": {"delta": "LONG", "postgres": "BIGINT", "gold": True}, - 
"earliest_transaction_search_id": {"delta": "LONG", "postgres": "BIGINT", "gold": True}, + "latest_transaction_search_id": { + "delta": "LONG", + "postgres": "BIGINT", + "gold": True, + }, + "earliest_transaction_search_id": { + "delta": "LONG", + "postgres": "BIGINT", + "gold": True, + }, "category": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "type_raw": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "type_description_raw": {"delta": "STRING", "postgres": "TEXT", "gold": False}, @@ -16,7 +32,11 @@ "type_description": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "is_fpds": {"delta": "boolean", "postgres": "boolean", "gold": True}, "generated_unique_award_id": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "generated_unique_award_id_legacy": {"delta": "STRING", "postgres": "TEXT", "gold": False}, + "generated_unique_award_id_legacy": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, "display_award_id": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "update_date": {"delta": "TIMESTAMP", "postgres": "TIMESTAMP", "gold": False}, "certified_date": {"delta": "DATE", "postgres": "DATE", "gold": True}, @@ -25,81 +45,297 @@ "fain": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "uri": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "parent_award_piid": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "award_amount": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": False}, - "total_obligation": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": False}, + "award_amount": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": False, + }, + "total_obligation": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": False, + }, "description": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "total_obl_bin": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "total_subsidy_cost": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": False}, - "total_loan_value": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": False}, - "total_funding_amount": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23,2)", "gold": True}, - "total_indirect_federal_sharing": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": True}, - "base_and_all_options_value": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": True}, - "base_exercised_options_val": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": True}, - "non_federal_funding_amount": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": True}, + "total_subsidy_cost": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": False, + }, + "total_loan_value": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": False, + }, + "total_funding_amount": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23,2)", + "gold": True, + }, + "total_indirect_federal_sharing": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": True, + }, + "base_and_all_options_value": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": True, + }, + "base_exercised_options_val": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": True, + }, + "non_federal_funding_amount": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": True, + }, "recipient_hash": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "recipient_levels": 
{"delta": "ARRAY", "postgres": "TEXT[]", "gold": False}, "recipient_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "raw_recipient_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "recipient_unique_id": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "parent_recipient_unique_id": {"delta": "STRING", "postgres": "TEXT", "gold": False}, + "parent_recipient_unique_id": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, "recipient_uei": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "parent_uei": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "parent_recipient_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "business_categories": {"delta": "ARRAY", "postgres": "TEXT[]", "gold": False}, - "total_subaward_amount": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": True}, + "business_categories": { + "delta": "ARRAY", + "postgres": "TEXT[]", + "gold": False, + }, + "total_subaward_amount": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": True, + }, "subaward_count": {"delta": "INTEGER", "postgres": "INTEGER", "gold": True}, "action_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, "fiscal_year": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "last_modified_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, - "period_of_performance_start_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, - "period_of_performance_current_end_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, + "last_modified_date": { + "delta": "TIMESTAMP", + "postgres": "TIMESTAMP", + "gold": False, + }, + "period_of_performance_start_date": { + "delta": "DATE", + "postgres": "DATE", + "gold": False, + }, + "period_of_performance_current_end_date": { + "delta": "DATE", + "postgres": "DATE", + "gold": False, + }, "date_signed": {"delta": "DATE", "postgres": "DATE", "gold": False}, "ordering_period_end_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, - "original_loan_subsidy_cost": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": False}, - "face_value_loan_guarantee": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": False}, + "original_loan_subsidy_cost": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": False, + }, + "face_value_loan_guarantee": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": False, + }, "awarding_agency_id": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, "funding_agency_id": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "awarding_toptier_agency_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "funding_toptier_agency_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "awarding_subtier_agency_name": {"delta": " STRING", "postgres": " STRING", "gold": False}, - "funding_subtier_agency_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "awarding_toptier_agency_name_raw": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "funding_toptier_agency_name_raw": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "awarding_subtier_agency_name_raw": {"delta": " STRING", "postgres": " STRING", "gold": False}, - "funding_subtier_agency_name_raw": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "awarding_toptier_agency_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "funding_toptier_agency_code": {"delta": "STRING", "postgres": "TEXT", "gold": 
False}, - "awarding_subtier_agency_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "funding_subtier_agency_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "awarding_toptier_agency_code_raw": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "funding_toptier_agency_code_raw": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "awarding_subtier_agency_code_raw": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "funding_subtier_agency_code_raw": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "funding_toptier_agency_id": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "funding_subtier_agency_id": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, + "awarding_toptier_agency_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "funding_toptier_agency_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "awarding_subtier_agency_name": { + "delta": " STRING", + "postgres": " STRING", + "gold": False, + }, + "funding_subtier_agency_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "awarding_toptier_agency_name_raw": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "funding_toptier_agency_name_raw": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "awarding_subtier_agency_name_raw": { + "delta": " STRING", + "postgres": " STRING", + "gold": False, + }, + "funding_subtier_agency_name_raw": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "awarding_toptier_agency_code": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "funding_toptier_agency_code": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "awarding_subtier_agency_code": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "funding_subtier_agency_code": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "awarding_toptier_agency_code_raw": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "funding_toptier_agency_code_raw": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "awarding_subtier_agency_code_raw": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "funding_subtier_agency_code_raw": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "funding_toptier_agency_id": { + "delta": "INTEGER", + "postgres": "INTEGER", + "gold": False, + }, + "funding_subtier_agency_id": { + "delta": "INTEGER", + "postgres": "INTEGER", + "gold": False, + }, "fpds_agency_id": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "fpds_parent_agency_id": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "recipient_location_country_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_country_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_state_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_county_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_county_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_congressional_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_congressional_code_current": {"delta": "STRING", "postgres": "TEXT", "gold": True}, + "recipient_location_country_code": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_country_name": 
{ + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_state_code": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_county_code": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_county_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_congressional_code": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_congressional_code_current": { + "delta": "STRING", + "postgres": "TEXT", + "gold": True, + }, "recipient_location_zip5": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_city_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_state_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_state_fips": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_state_population": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "recipient_location_county_population": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "recipient_location_congressional_population": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "recipient_location_county_fips": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_address_line1": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_address_line2": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_address_line3": {"delta": "STRING", "postgres": "TEXT", "gold": False}, + "recipient_location_city_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_state_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_state_fips": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_state_population": { + "delta": "INTEGER", + "postgres": "INTEGER", + "gold": False, + }, + "recipient_location_county_population": { + "delta": "INTEGER", + "postgres": "INTEGER", + "gold": False, + }, + "recipient_location_congressional_population": { + "delta": "INTEGER", + "postgres": "INTEGER", + "gold": False, + }, + "recipient_location_county_fips": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_address_line1": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_address_line2": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_address_line3": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, "recipient_location_zip4": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_foreign_postal_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_foreign_province": {"delta": "STRING", "postgres": "TEXT", "gold": False}, + "recipient_location_foreign_postal_code": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_foreign_province": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, "pop_country_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "pop_country_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "pop_state_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, @@ -108,13 +344,21 @@ "pop_city_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "pop_zip5": {"delta": 
"STRING", "postgres": "TEXT", "gold": False}, "pop_congressional_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "pop_congressional_code_current": {"delta": "STRING", "postgres": "TEXT", "gold": True}, + "pop_congressional_code_current": { + "delta": "STRING", + "postgres": "TEXT", + "gold": True, + }, "pop_city_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "pop_state_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "pop_state_fips": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "pop_state_population": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, "pop_county_population": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "pop_congressional_population": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, + "pop_congressional_population": { + "delta": "INTEGER", + "postgres": "INTEGER", + "gold": False, + }, "pop_county_fips": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "pop_zip4": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "cfda_program_title": {"delta": "STRING", "postgres": "TEXT", "gold": False}, @@ -125,34 +369,82 @@ "extent_competed": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "type_set_aside": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "product_or_service_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "product_or_service_description": {"delta": "STRING", "postgres": "TEXT", "gold": False}, + "product_or_service_description": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, "naics_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "naics_description": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "tas_paths": {"delta": "ARRAY", "postgres": "TEXT[]", "gold": False}, "tas_components": {"delta": "ARRAY", "postgres": "TEXT[]", "gold": False}, "federal_accounts": {"delta": "STRING", "postgres": "JSONB", "gold": False}, - "disaster_emergency_fund_codes": {"delta": "ARRAY", "postgres": "TEXT[]", "gold": False}, + "disaster_emergency_fund_codes": { + "delta": "ARRAY", + "postgres": "TEXT[]", + "gold": False, + }, "spending_by_defc": {"delta": "STRING", "postgres": "JSONB", "gold": False}, - "total_covid_outlay": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": False}, - "total_covid_obligation": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": False}, + "total_covid_outlay": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": False, + }, + "total_covid_obligation": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": False, + }, "officer_1_amount": { "delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": True, }, "officer_1_name": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "officer_2_amount": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": True}, + "officer_2_amount": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": True, + }, "officer_2_name": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "officer_3_amount": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": True}, + "officer_3_amount": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": True, + }, "officer_3_name": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "officer_4_amount": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": True}, + "officer_4_amount": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": 
True, + }, "officer_4_name": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "officer_5_amount": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": True}, + "officer_5_amount": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": True, + }, "officer_5_name": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "total_iija_outlay": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": True}, - "total_iija_obligation": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": True}, - "total_outlays": {"delta": "NUMERIC(23, 2)", "postgres": "NUMERIC(23, 2)", "gold": False}, - "generated_pragmatic_obligation": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": False}, + "total_iija_outlay": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": True, + }, + "total_iija_obligation": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": True, + }, + "total_outlays": { + "delta": "NUMERIC(23, 2)", + "postgres": "NUMERIC(23, 2)", + "gold": False, + }, + "generated_pragmatic_obligation": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": False, + }, "program_activities": {"delta": "STRING", "postgres": "JSONB", "gold": False}, "transaction_count": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, } @@ -163,8 +455,12 @@ **{k: v["delta"] for k, v in AWARD_SEARCH_COLUMNS.items()}, **DELTA_ONLY_COLUMNS, } -AWARD_SEARCH_POSTGRES_COLUMNS = {k: v["postgres"] for k, v in AWARD_SEARCH_COLUMNS.items() if not v["gold"]} -AWARD_SEARCH_POSTGRES_GOLD_COLUMNS = {k: v["gold"] for k, v in AWARD_SEARCH_COLUMNS.items()} +AWARD_SEARCH_POSTGRES_COLUMNS = { + k: v["postgres"] for k, v in AWARD_SEARCH_COLUMNS.items() if not v["gold"] +} +AWARD_SEARCH_POSTGRES_GOLD_COLUMNS = { + k: v["gold"] for k, v in AWARD_SEARCH_COLUMNS.items() +} ALL_AWARD_TYPES = list(award_type_mapping.keys()) diff --git a/usaspending_api/search/delta_models/dataframes/transaction_search.py b/usaspending_api/search/delta_models/dataframes/transaction_search.py index c290183f47..b59e60a305 100644 --- a/usaspending_api/search/delta_models/dataframes/transaction_search.py +++ b/usaspending_api/search/delta_models/dataframes/transaction_search.py @@ -1,5 +1,6 @@ from delta.tables import DeltaTable -from pyspark.sql import DataFrame, SparkSession, functions as sf, Column +from pyspark.sql import Column, DataFrame, SparkSession +from pyspark.sql import functions as sf from pyspark.sql.types import ( DecimalType, StringType, @@ -10,20 +11,23 @@ from usaspending_api.recipient.v2.lookups import SPECIAL_CASES from usaspending_api.search.delta_models.dataframes.abstract_search import ( AbstractSearch, - hash_col, extract_numbers_as_string, + hash_col, ) ALL_AWARD_TYPES = list(award_type_mapping.keys()) class TransactionSearch(AbstractSearch): - @property def recipient_hash_and_levels(self) -> DataFrame: return ( self.recipient_profile.groupBy("recipient_hash", "uei") - .agg(sf.sort_array(sf.collect_set("recipient_level")).alias("recipient_levels")) + .agg( + sf.sort_array(sf.collect_set("recipient_level")).alias( + "recipient_levels" + ) + ) .select( sf.col("recipient_hash").alias("recipient_level_hash"), sf.col("recipient_levels"), @@ -35,23 +39,31 @@ def fed_and_tres_acct(self) -> DataFrame: return ( self.faba.join( self.treasury_appropriation_account, - self.treasury_appropriation_account.treasury_account_identifier == self.faba.treasury_account_id, + 
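Reviewer note: the derived dictionaries above make the role of the "gold" flag concrete: gold columns are carried in the Delta table but excluded from the Postgres projection. The same comprehension run on a two-entry example:

columns = {
    "award_id": {"delta": "LONG NOT NULL", "postgres": "BIGINT NOT NULL", "gold": False},
    "data_source": {"delta": "STRING", "postgres": "TEXT", "gold": True},
}
postgres_columns = {k: v["postgres"] for k, v in columns.items() if not v["gold"]}
assert postgres_columns == {"award_id": "BIGINT NOT NULL"}  # gold-only column dropped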
self.treasury_appropriation_account.treasury_account_identifier + == self.faba.treasury_account_id, "inner", ) .join( self.federal_account, - self.federal_account.id == self.treasury_appropriation_account.federal_account_id, + self.federal_account.id + == self.treasury_appropriation_account.federal_account_id, "inner", ) .join( self.awarding_toptier_agency, - self.federal_account.parent_toptier_agency_id == self.awarding_toptier_agency.toptier_agency_id, + self.federal_account.parent_toptier_agency_id + == self.awarding_toptier_agency.toptier_agency_id, "inner", ) - .join(self.ref_program_activity, self.faba.program_activity_id == self.ref_program_activity.id, "left") + .join( + self.ref_program_activity, + self.faba.program_activity_id == self.ref_program_activity.id, + "left", + ) .join( self.program_activity_park, - self.faba.program_activity_reporting_key == self.program_activity_park.code, + self.faba.program_activity_reporting_key + == self.program_activity_park.code, "left", ) .filter(self.faba["award_id"].isNotNull()) @@ -74,60 +86,73 @@ def key_cols(self) -> list[Column]: def date_cols(self) -> list[Column]: return [ sf.to_date(self.transaction_normalized.action_date).alias("action_date"), - sf.add_months(sf.to_date(self.transaction_normalized.action_date), 3).alias("fiscal_action_date"), - sf.to_date(self.transaction_normalized.last_modified_date).alias("last_modified_date"), + sf.add_months(sf.to_date(self.transaction_normalized.action_date), 3).alias( + "fiscal_action_date" + ), + self.transaction_normalized.last_modified_date, self.transaction_normalized.fiscal_year, self.awards.certified_date.alias("award_certified_date"), - sf.year(sf.add_months(sf.to_date(self.awards.certified_date), 3)).alias("award_fiscal_year"), + sf.year(sf.add_months(sf.to_date(self.awards.certified_date), 3)).alias( + "award_fiscal_year" + ), self.transaction_normalized.create_date.cast(TimestampType()), self.transaction_normalized.update_date.cast(TimestampType()), self.awards.update_date.cast(TimestampType()).alias("award_update_date"), sf.to_date(self.awards.date_signed).alias("award_date_signed"), - sf.greatest(sf.to_timestamp(self.transaction_normalized.update_date), self.awards.update_date).alias( - "etl_update_date" - ), - sf.to_date(self.transaction_normalized.period_of_performance_start_date).alias( - "period_of_performance_start_date" - ), - sf.to_date(self.transaction_normalized.period_of_performance_current_end_date).alias( - "period_of_performance_current_end_date" - ), + sf.greatest( + sf.to_timestamp(self.transaction_normalized.update_date), + self.awards.update_date, + ).alias("etl_update_date"), + sf.to_date( + self.transaction_normalized.period_of_performance_start_date + ).alias("period_of_performance_start_date"), + sf.to_date( + self.transaction_normalized.period_of_performance_current_end_date + ).alias("period_of_performance_current_end_date"), sf.coalesce( - sf.to_date(self.transaction_fabs.created_at), - sf.to_date(self.transaction_fpds.initial_report_date), + self.transaction_fabs.created_at, + sf.to_timestamp(self.transaction_fpds.initial_report_date), ).alias("initial_report_date"), ] @property def agency_cols(self) -> list[Column]: return [ - sf.coalesce(self.transaction_fabs.awarding_agency_code, self.transaction_fpds.awarding_agency_code).alias( - "awarding_agency_code" - ), + sf.coalesce( + self.transaction_fabs.awarding_agency_code, + self.transaction_fpds.awarding_agency_code, + ).alias("awarding_agency_code"), 
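Reviewer note: amid the mechanical reformatting, date_cols above carries one of this PR's real behavior changes: last_modified_date is no longer wrapped in to_date(), and initial_report_date now keeps the raw FABS created_at timestamp, converting the FPDS initial_report_date with to_timestamp() instead of to_date(). Both columns therefore retain time-of-day, matching the TIMESTAMP and DateTimeField changes later in this diff. A toy sketch of the difference (not project tables):

from pyspark.sql import SparkSession
from pyspark.sql import functions as sf

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("2024-05-01 13:45:00",)], ["last_modified"])

# Old behavior: to_date() truncates away the time-of-day component.
df.select(sf.to_date("last_modified").alias("last_modified_date")).show()   # 2024-05-01
# New behavior: keep the full timestamp.
df.select(sf.to_timestamp("last_modified").alias("last_modified_date")).show()  # 2024-05-01 13:45:00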
self.awarding_toptier_agency.awarding_toptier_agency_name, - sf.coalesce(self.transaction_fabs.awarding_agency_name, self.transaction_fpds.awarding_agency_name).alias( - "awarding_toptier_agency_name_raw" - ), - sf.coalesce(self.transaction_fabs.funding_agency_code, self.transaction_fpds.funding_agency_code).alias( - "funding_agency_code" - ), + sf.coalesce( + self.transaction_fabs.awarding_agency_name, + self.transaction_fpds.awarding_agency_name, + ).alias("awarding_toptier_agency_name_raw"), + sf.coalesce( + self.transaction_fabs.funding_agency_code, + self.transaction_fpds.funding_agency_code, + ).alias("funding_agency_code"), self.funding_toptier_agency.funding_toptier_agency_name, - sf.coalesce(self.transaction_fabs.funding_agency_name, self.transaction_fpds.funding_agency_name).alias( - "funding_toptier_agency_name_raw" - ), sf.coalesce( - self.transaction_fabs.awarding_sub_tier_agency_c, self.transaction_fpds.awarding_sub_tier_agency_c + self.transaction_fabs.funding_agency_name, + self.transaction_fpds.funding_agency_name, + ).alias("funding_toptier_agency_name_raw"), + sf.coalesce( + self.transaction_fabs.awarding_sub_tier_agency_c, + self.transaction_fpds.awarding_sub_tier_agency_c, ).alias("awarding_sub_tier_agency_c"), self.awarding_subtier_agency.awarding_subtier_agency_name, sf.coalesce( - self.transaction_fabs.awarding_sub_tier_agency_n, self.transaction_fpds.awarding_sub_tier_agency_n + self.transaction_fabs.awarding_sub_tier_agency_n, + self.transaction_fpds.awarding_sub_tier_agency_n, ).alias("awarding_subtier_agency_name_raw"), sf.coalesce( - self.transaction_fabs.funding_sub_tier_agency_co, self.transaction_fpds.funding_sub_tier_agency_co + self.transaction_fabs.funding_sub_tier_agency_co, + self.transaction_fpds.funding_sub_tier_agency_co, ).alias("funding_sub_tier_agency_co"), self.funding_subtier_agency.funding_subtier_agency_name, sf.coalesce( - self.transaction_fabs.funding_sub_tier_agency_na, self.transaction_fpds.funding_sub_tier_agency_na + self.transaction_fabs.funding_sub_tier_agency_na, + self.transaction_fpds.funding_sub_tier_agency_na, ).alias("funding_subtier_agency_name_raw"), self.awarding_agency_id.awarding_toptier_agency_id, self.funding_agency_id.funding_toptier_agency_id, @@ -137,17 +162,19 @@ def agency_cols(self) -> list[Column]: self.funding_toptier_agency.funding_toptier_agency_abbreviation, self.awarding_subtier_agency.awarding_subtier_agency_abbreviation, self.funding_subtier_agency.funding_subtier_agency_abbreviation, - sf.coalesce(self.transaction_fabs.awarding_office_code, self.transaction_fpds.awarding_office_code).alias( - "awarding_office_code" - ), + sf.coalesce( + self.transaction_fabs.awarding_office_code, + self.transaction_fpds.awarding_office_code, + ).alias("awarding_office_code"), sf.coalesce( self.awarding_office.awarding_office_name, self.transaction_fabs.awarding_office_name, self.transaction_fpds.awarding_office_name, ).alias("awarding_office_name"), - sf.coalesce(self.transaction_fabs.funding_office_code, self.transaction_fpds.funding_office_code).alias( - "funding_office_code" - ), + sf.coalesce( + self.transaction_fabs.funding_office_code, + self.transaction_fpds.funding_office_code, + ).alias("funding_office_code"), sf.coalesce( self.funding_office.funding_office_name, self.transaction_fabs.funding_office_name, @@ -187,7 +214,8 @@ def amounts_cols(self) -> list[Column]: return [ sf.coalesce( sf.when( - self.transaction_normalized["type"].isin(["07", "08"]), self.awards.total_subsidy_cost + 
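Reviewer note: the sf.coalesce(fabs_..., fpds_...) pairs that dominate this class exist because every transaction originates from exactly one of FABS (financial assistance) or FPDS (procurement), so one side of each pair is null and coalesce() keeps whichever is populated. A self-contained sketch of the pattern with toy data:

from pyspark.sql import SparkSession
from pyspark.sql import functions as sf

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("012", None), (None, "097")],  # exactly one source column populated per row
    ["fabs_agency_code", "fpds_agency_code"],
)
df.select(
    # First non-null value wins, mirroring the FABS/FPDS pairs above.
    sf.coalesce("fabs_agency_code", "fpds_agency_code").alias("awarding_agency_code")
).show()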
self.transaction_normalized["type"].isin(["07", "08"]), + self.awards.total_subsidy_cost, ).otherwise(self.awards.total_obligation), sf.lit(0), ) @@ -202,16 +230,24 @@ def amounts_cols(self) -> list[Column]: ) .cast(DecimalType(23, 2)) .alias("generated_pragmatic_obligation"), - sf.coalesce(self.transaction_normalized.federal_action_obligation, sf.lit(0)) + sf.coalesce( + self.transaction_normalized.federal_action_obligation, sf.lit(0) + ) .cast(DecimalType(23, 2)) .alias("federal_action_obligation"), - sf.coalesce(self.transaction_normalized.original_loan_subsidy_cost, sf.lit(0)) + sf.coalesce( + self.transaction_normalized.original_loan_subsidy_cost, sf.lit(0) + ) .cast(DecimalType(23, 2)) .alias("original_loan_subsidy_cost"), - sf.coalesce(self.transaction_normalized.face_value_loan_guarantee, sf.lit(0)) + sf.coalesce( + self.transaction_normalized.face_value_loan_guarantee, sf.lit(0) + ) .cast(DecimalType(23, 2)) .alias("face_value_loan_guarantee"), - self.transaction_normalized.indirect_federal_sharing.cast(DecimalType(23, 2)), + self.transaction_normalized.indirect_federal_sharing.cast( + DecimalType(23, 2) + ), self.transaction_normalized.funding_amount, sf.coalesce(self.transaction_fabs.total_funding_amount, sf.lit("0")) .cast(DecimalType(23, 2)) @@ -224,11 +260,15 @@ def generated_parent_recipient_hash(self) -> Column: return hash_col( sf.when( sf.coalesce( - self.transaction_fpds.ultimate_parent_uei, self.transaction_fabs.ultimate_parent_uei + self.transaction_fpds.ultimate_parent_uei, + self.transaction_fabs.ultimate_parent_uei, ).isNotNull(), sf.concat( sf.lit("uei-"), - sf.coalesce(self.transaction_fpds.ultimate_parent_uei, self.transaction_fabs.ultimate_parent_uei), + sf.coalesce( + self.transaction_fpds.ultimate_parent_uei, + self.transaction_fabs.ultimate_parent_uei, + ), ), ) .when( @@ -264,11 +304,15 @@ def recipient_cols(self) -> list[Column]: hash_col( sf.when( sf.coalesce( - self.transaction_fpds.awardee_or_recipient_uei, self.transaction_fabs.uei + self.transaction_fpds.awardee_or_recipient_uei, + self.transaction_fabs.uei, ).isNotNull(), sf.concat( sf.lit("uei-"), - sf.coalesce(self.transaction_fpds.awardee_or_recipient_uei, self.transaction_fabs.uei), + sf.coalesce( + self.transaction_fpds.awardee_or_recipient_uei, + self.transaction_fabs.uei, + ), ), ) .when( @@ -297,11 +341,13 @@ def recipient_cols(self) -> list[Column]: ), ).alias("recipient_hash"), sf.col("recipient_levels"), - sf.coalesce(self.transaction_fpds.awardee_or_recipient_uei, self.transaction_fabs.uei).alias( - "recipient_uei" - ), sf.coalesce( - self.transaction_fpds.awardee_or_recipient_legal, self.transaction_fabs.awardee_or_recipient_legal + self.transaction_fpds.awardee_or_recipient_uei, + self.transaction_fabs.uei, + ).alias("recipient_uei"), + sf.coalesce( + self.transaction_fpds.awardee_or_recipient_legal, + self.transaction_fabs.awardee_or_recipient_legal, ).alias("recipient_name_raw"), sf.upper( sf.coalesce( @@ -311,18 +357,24 @@ def recipient_cols(self) -> list[Column]: ) ).alias("recipient_name"), sf.coalesce( - self.transaction_fpds.awardee_or_recipient_uniqu, self.transaction_fabs.awardee_or_recipient_uniqu + self.transaction_fpds.awardee_or_recipient_uniqu, + self.transaction_fabs.awardee_or_recipient_uniqu, ).alias("recipient_unique_id"), self.parent_recipient.parent_recipient_hash, - sf.coalesce(self.transaction_fpds.ultimate_parent_uei, self.transaction_fabs.ultimate_parent_uei).alias( - "parent_uei" - ), sf.coalesce( - self.transaction_fpds.ultimate_parent_legal_enti, 
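Reviewer note: context for the sf.when(...).otherwise(...) above: award types "07" and "08" are loans, whose headline amount is the subsidy cost rather than the obligation; everything else falls back to total_obligation, with a final coalesce to 0. The same rule as a plain-Python sketch (field names mirror the columns above):

from decimal import Decimal

LOAN_TYPES = {"07", "08"}  # direct and guaranteed/insured loans

def award_amount(award_type: str, total_subsidy_cost, total_obligation) -> Decimal:
    # Loans surface their subsidy cost; all other awards surface their obligation.
    amount = total_subsidy_cost if award_type in LOAN_TYPES else total_obligation
    return Decimal(amount) if amount is not None else Decimal(0)

assert award_amount("07", "123.45", None) == Decimal("123.45")
assert award_amount("A", None, "10") == Decimal("10")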
self.transaction_fabs.ultimate_parent_legal_enti + self.transaction_fpds.ultimate_parent_uei, + self.transaction_fabs.ultimate_parent_uei, + ).alias("parent_uei"), + sf.coalesce( + self.transaction_fpds.ultimate_parent_legal_enti, + self.transaction_fabs.ultimate_parent_legal_enti, ).alias("parent_recipient_name_raw"), - sf.upper(self.parent_recipient.parent_recipient_name).alias("parent_recipient_name"), + sf.upper(self.parent_recipient.parent_recipient_name).alias( + "parent_recipient_name" + ), sf.coalesce( - self.transaction_fpds.ultimate_parent_unique_ide, self.transaction_fabs.ultimate_parent_unique_ide + self.transaction_fpds.ultimate_parent_unique_ide, + self.transaction_fabs.ultimate_parent_unique_ide, ).alias("parent_recipient_unique_id"), ] @@ -330,57 +382,72 @@ def recipient_cols(self) -> list[Column]: def recipient_location_cols(self) -> list[Column]: return [ sf.coalesce( - self.transaction_fpds.legal_entity_country_code, self.transaction_fabs.legal_entity_country_code + self.transaction_fpds.legal_entity_country_code, + self.transaction_fabs.legal_entity_country_code, ).alias("recipient_location_country_code"), sf.coalesce( - self.transaction_fpds.legal_entity_country_name, self.transaction_fabs.legal_entity_country_name + self.transaction_fpds.legal_entity_country_name, + self.transaction_fabs.legal_entity_country_name, ).alias("recipient_location_country_name"), sf.coalesce( - self.transaction_fpds.legal_entity_state_code, self.transaction_fabs.legal_entity_state_code + self.transaction_fpds.legal_entity_state_code, + self.transaction_fabs.legal_entity_state_code, ).alias("recipient_location_state_code"), sf.coalesce( - self.transaction_fpds.legal_entity_state_descrip, self.transaction_fabs.legal_entity_state_name + self.transaction_fpds.legal_entity_state_descrip, + self.transaction_fabs.legal_entity_state_name, ).alias("recipient_location_state_name"), sf.col("recipient_location_state_fips"), self.rl_state_population.recipient_location_state_population, extract_numbers_as_string( sf.coalesce( - self.transaction_fpds.legal_entity_county_code, self.transaction_fabs.legal_entity_county_code + self.transaction_fpds.legal_entity_county_code, + self.transaction_fabs.legal_entity_county_code, ), 3, ).alias("recipient_location_county_code"), sf.coalesce( - self.transaction_fpds.legal_entity_county_name, self.transaction_fabs.legal_entity_county_name + self.transaction_fpds.legal_entity_county_name, + self.transaction_fabs.legal_entity_county_name, ).alias("recipient_location_county_name"), self.rl_county_population.recipient_location_county_population, extract_numbers_as_string( sf.coalesce( - self.transaction_fpds.legal_entity_congressional, self.transaction_fabs.legal_entity_congressional + self.transaction_fpds.legal_entity_congressional, + self.transaction_fabs.legal_entity_congressional, ) ).alias("recipient_location_congressional_code"), self.rl_district_population.recipient_location_congressional_population, self.current_cd.recipient_location_congressional_code_current.alias( "recipient_location_congressional_code_current" ), - sf.coalesce(self.transaction_fpds.legal_entity_zip5, self.transaction_fabs.legal_entity_zip5).alias( - "recipient_location_zip5" - ), + sf.coalesce( + self.transaction_fpds.legal_entity_zip5, + self.transaction_fabs.legal_entity_zip5, + ).alias("recipient_location_zip5"), self.transaction_fpds.legal_entity_zip4, sf.coalesce( - self.transaction_fpds.legal_entity_zip_last4, self.transaction_fabs.legal_entity_zip_last4 + 
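Reviewer note: hash_col() is imported from abstract_search and its body is not part of this diff; what the diff does show is that recipient keys are hashes of a seed string such as "uei-<UEI>", falling back to a "duns-..." seed. Purely as a reading aid, a hypothetical plain-Python analogue under an assumed MD5-to-UUID scheme; treat the exact algorithm as an assumption, not a spec:

import hashlib
import uuid

def recipient_hash(uei: str | None, duns: str | None) -> uuid.UUID | None:
    # Assumed scheme only: the real hash_col() helper is not shown here
    # and may normalize or hash the seed differently.
    if uei:
        seed = f"uei-{uei}"
    elif duns:
        seed = f"duns-{duns}"
    else:
        return None
    return uuid.UUID(hashlib.md5(seed.upper().encode()).hexdigest())

print(recipient_hash("FU54MQYGVCB7", None))  # deterministic, UUID-shaped key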
self.transaction_fpds.legal_entity_zip_last4, + self.transaction_fabs.legal_entity_zip_last4, ).alias("legal_entity_zip_last4"), self.transaction_fabs.legal_entity_city_code, sf.rtrim( - sf.coalesce(self.transaction_fpds.legal_entity_city_name, self.transaction_fabs.legal_entity_city_name) + sf.coalesce( + self.transaction_fpds.legal_entity_city_name, + self.transaction_fabs.legal_entity_city_name, + ) ).alias("recipient_location_city_name"), sf.coalesce( - self.transaction_fpds.legal_entity_address_line1, self.transaction_fabs.legal_entity_address_line1 + self.transaction_fpds.legal_entity_address_line1, + self.transaction_fabs.legal_entity_address_line1, ).alias("legal_entity_address_line1"), sf.coalesce( - self.transaction_fpds.legal_entity_address_line2, self.transaction_fabs.legal_entity_address_line2 + self.transaction_fpds.legal_entity_address_line2, + self.transaction_fabs.legal_entity_address_line2, ).alias("legal_entity_address_line2"), sf.coalesce( - self.transaction_fpds.legal_entity_address_line3, self.transaction_fabs.legal_entity_address_line3 + self.transaction_fpds.legal_entity_address_line3, + self.transaction_fabs.legal_entity_address_line3, ).alias("legal_entity_address_line3"), self.transaction_fabs.legal_entity_foreign_city, self.transaction_fabs.legal_entity_foreign_descr, @@ -389,7 +456,8 @@ def recipient_location_cols(self) -> list[Column]: sf.concat( sf.col("recipient_location_state_fips"), sf.coalesce( - self.transaction_fpds.legal_entity_county_code, self.transaction_fabs.legal_entity_county_code + self.transaction_fpds.legal_entity_county_code, + self.transaction_fabs.legal_entity_county_code, ), ).alias("recipient_location_county_fips"), ] @@ -400,55 +468,67 @@ def place_of_performance_cols(self) -> list[Column]: self.transaction_fabs.place_of_performance_code, self.transaction_fabs.place_of_performance_scope, sf.coalesce( - self.transaction_fpds.place_of_perform_country_c, self.transaction_fabs.place_of_perform_country_c + self.transaction_fpds.place_of_perform_country_c, + self.transaction_fabs.place_of_perform_country_c, ).alias("pop_country_code"), sf.coalesce( - self.transaction_fpds.place_of_perf_country_desc, self.transaction_fabs.place_of_perform_country_n + self.transaction_fpds.place_of_perf_country_desc, + self.transaction_fabs.place_of_perform_country_n, ).alias("pop_country_name"), sf.coalesce( - self.transaction_fpds.place_of_performance_state, self.transaction_fabs.place_of_perfor_state_code + self.transaction_fpds.place_of_performance_state, + self.transaction_fabs.place_of_perfor_state_code, ).alias("pop_state_code"), sf.coalesce( - self.transaction_fpds.place_of_perfor_state_desc, self.transaction_fabs.place_of_perform_state_nam + self.transaction_fpds.place_of_perfor_state_desc, + self.transaction_fabs.place_of_perform_state_nam, ).alias("pop_state_name"), sf.col("pop_state_fips"), self.pop_state_population.pop_state_population, extract_numbers_as_string( sf.coalesce( - self.transaction_fpds.place_of_perform_county_co, self.transaction_fabs.place_of_perform_county_co + self.transaction_fpds.place_of_perform_county_co, + self.transaction_fabs.place_of_perform_county_co, ), 3, ).alias("pop_county_code"), sf.coalesce( - self.transaction_fpds.place_of_perform_county_na, self.transaction_fabs.place_of_perform_county_na + self.transaction_fpds.place_of_perform_county_na, + self.transaction_fabs.place_of_perform_county_na, ).alias("pop_county_name"), self.pop_county_population.pop_county_population, extract_numbers_as_string( sf.coalesce( - 
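Reviewer note: extract_numbers_as_string() is likewise imported rather than defined here; its call sites (raw county codes padded to width 3, congressional district codes at the default width) suggest a digits-only, zero-padded normalization. A hypothetical pure-Python equivalent, with the default width assumed, offered only to make the call sites readable:

import re

def extract_numbers_as_string(value: str | None, width: int = 2) -> str | None:
    # Hypothetical stand-in for the Spark helper: strip non-digits from a raw
    # code like "TX12" or "5", then left-pad with zeros to a fixed width.
    if value is None:
        return None
    digits = re.sub(r"\D", "", value)
    return digits.zfill(width) if digits else None

assert extract_numbers_as_string("5", 3) == "005"  # county code
assert extract_numbers_as_string("TX12") == "12"   # district code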
self.transaction_fpds.place_of_performance_congr, self.transaction_fabs.place_of_performance_congr + self.transaction_fpds.place_of_performance_congr, + self.transaction_fabs.place_of_performance_congr, ) ).alias("pop_congressional_code"), self.pop_district_population.pop_congressional_population, self.current_cd.pop_congressional_code_current, sf.coalesce( - self.transaction_fpds.place_of_performance_zip5, self.transaction_fabs.place_of_performance_zip5 + self.transaction_fpds.place_of_performance_zip5, + self.transaction_fabs.place_of_performance_zip5, ).alias("pop_zip5"), sf.coalesce( - self.transaction_fpds.place_of_performance_zip4a, self.transaction_fabs.place_of_performance_zip4a + self.transaction_fpds.place_of_performance_zip4a, + self.transaction_fabs.place_of_performance_zip4a, ).alias("place_of_performance_zip4a"), sf.coalesce( - self.transaction_fpds.place_of_perform_zip_last4, self.transaction_fabs.place_of_perform_zip_last4 + self.transaction_fpds.place_of_perform_zip_last4, + self.transaction_fabs.place_of_perform_zip_last4, ).alias("place_of_perform_zip_last4"), sf.rtrim( sf.coalesce( - self.transaction_fpds.place_of_perform_city_name, self.transaction_fabs.place_of_performance_city + self.transaction_fpds.place_of_perform_city_name, + self.transaction_fabs.place_of_performance_city, ) ).alias("pop_city_name"), self.transaction_fabs.place_of_performance_forei, sf.concat( sf.col("pop_state_fips"), sf.coalesce( - self.transaction_fpds.place_of_perform_county_co, self.transaction_fabs.place_of_perform_county_co + self.transaction_fpds.place_of_perform_county_co, + self.transaction_fabs.place_of_perform_county_co, ), ).alias("pop_county_fips"), ] @@ -466,36 +546,46 @@ def accounts_cols(self) -> list[Column]: @property def officer_amounts_cols(self) -> list[Column]: return [ - sf.coalesce(self.transaction_fabs.officer_1_name, self.transaction_fpds.officer_1_name).alias( - "officer_1_name" - ), - sf.coalesce(self.transaction_fabs.officer_1_amount, self.transaction_fpds.officer_1_amount).alias( - "officer_1_amount" - ), - sf.coalesce(self.transaction_fabs.officer_2_name, self.transaction_fpds.officer_2_name).alias( - "officer_2_name" - ), - sf.coalesce(self.transaction_fabs.officer_2_amount, self.transaction_fpds.officer_2_amount).alias( - "officer_2_amount" - ), - sf.coalesce(self.transaction_fabs.officer_3_name, self.transaction_fpds.officer_3_name).alias( - "officer_3_name" - ), - sf.coalesce(self.transaction_fabs.officer_3_amount, self.transaction_fpds.officer_3_amount).alias( - "officer_3_amount" - ), - sf.coalesce(self.transaction_fabs.officer_4_name, self.transaction_fpds.officer_4_name).alias( - "officer_4_name" - ), - sf.coalesce(self.transaction_fabs.officer_4_amount, self.transaction_fpds.officer_4_amount).alias( - "officer_4_amount" - ), - sf.coalesce(self.transaction_fabs.officer_5_name, self.transaction_fpds.officer_5_name).alias( - "officer_5_name" - ), - sf.coalesce(self.transaction_fabs.officer_5_amount, self.transaction_fpds.officer_5_amount).alias( - "officer_5_amount" - ), + sf.coalesce( + self.transaction_fabs.officer_1_name, + self.transaction_fpds.officer_1_name, + ).alias("officer_1_name"), + sf.coalesce( + self.transaction_fabs.officer_1_amount, + self.transaction_fpds.officer_1_amount, + ).alias("officer_1_amount"), + sf.coalesce( + self.transaction_fabs.officer_2_name, + self.transaction_fpds.officer_2_name, + ).alias("officer_2_name"), + sf.coalesce( + self.transaction_fabs.officer_2_amount, + self.transaction_fpds.officer_2_amount, + 
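Reviewer note: the two sf.concat(...) columns ending in _county_fips follow standard FIPS composition, which the reformatting makes easier to see: a 2-digit state FIPS prepended to a 3-digit county code yields the 5-digit county FIPS. Worked example:

state_fips, county_code = "48", "201"   # Texas, Harris County
county_fips = state_fips + county_code  # -> "48201", the 5-digit county FIPS
assert len(county_fips) == 5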
).alias("officer_2_amount"), + sf.coalesce( + self.transaction_fabs.officer_3_name, + self.transaction_fpds.officer_3_name, + ).alias("officer_3_name"), + sf.coalesce( + self.transaction_fabs.officer_3_amount, + self.transaction_fpds.officer_3_amount, + ).alias("officer_3_amount"), + sf.coalesce( + self.transaction_fabs.officer_4_name, + self.transaction_fpds.officer_4_name, + ).alias("officer_4_name"), + sf.coalesce( + self.transaction_fabs.officer_4_amount, + self.transaction_fpds.officer_4_amount, + ).alias("officer_4_amount"), + sf.coalesce( + self.transaction_fabs.officer_5_name, + self.transaction_fpds.officer_5_name, + ).alias("officer_5_name"), + sf.coalesce( + self.transaction_fabs.officer_5_amount, + self.transaction_fpds.officer_5_amount, + ).alias("officer_5_amount"), ] @property @@ -676,7 +766,9 @@ def fpds_cols(self) -> list[Column]: self.transaction_fpds.price_evaluation_adjustmen, self.transaction_fpds.private_university_or_coll, self.transaction_fpds.product_or_service_code, - self.transaction_fpds.product_or_service_co_desc.alias("product_or_service_description"), + self.transaction_fpds.product_or_service_co_desc.alias( + "product_or_service_description" + ), self.transaction_fpds.program_acronym, self.transaction_fpds.program_system_or_equ_desc, self.transaction_fpds.program_system_or_equipmen, @@ -765,7 +857,8 @@ def dataframe(self) -> DataFrame: ) .join( self.references_cfda, - self.transaction_fabs.cfda_number == self.references_cfda.program_number, + self.transaction_fabs.cfda_number + == self.references_cfda.program_number, "leftouter", ) .join( @@ -773,26 +866,36 @@ def dataframe(self) -> DataFrame: self.recipient_lookup.recipient_hash == self.generated_recipient_hash, "leftouter", ) - .join(self.awards, self.transaction_normalized.award_id == self.awards.id, "leftouter") + .join( + self.awards, + self.transaction_normalized.award_id == self.awards.id, + "leftouter", + ) .join( self.awarding_agency, - self.transaction_normalized.awarding_agency_id == self.awarding_agency.id, + self.transaction_normalized.awarding_agency_id + == self.awarding_agency.id, "leftouter", ) .join( self.awarding_toptier_agency, - self.awarding_agency.toptier_agency_id == self.awarding_toptier_agency.toptier_agency_id, + self.awarding_agency.toptier_agency_id + == self.awarding_toptier_agency.toptier_agency_id, "leftouter", ) .join( self.awarding_subtier_agency, - self.awarding_agency.subtier_agency_id == self.awarding_subtier_agency.subtier_agency_id, + self.awarding_agency.subtier_agency_id + == self.awarding_subtier_agency.subtier_agency_id, "leftouter", ) .join( self.awarding_agency_id, ( - (self.awarding_agency_id.toptier_agency_id == self.awarding_toptier_agency.toptier_agency_id) + ( + self.awarding_agency_id.toptier_agency_id + == self.awarding_toptier_agency.toptier_agency_id + ) & self.awarding_agency_id.toptier_flag ), "leftouter", @@ -804,23 +907,29 @@ def dataframe(self) -> DataFrame: ) .join( self.funding_toptier_agency, - self.funding_agency.funding_toptier_agency_id == self.funding_toptier_agency.toptier_agency_id, + self.funding_agency.funding_toptier_agency_id + == self.funding_toptier_agency.toptier_agency_id, "leftouter", ) .join( self.funding_subtier_agency, - self.funding_agency.funding_subtier_agency_id == self.funding_subtier_agency.subtier_agency_id, + self.funding_agency.funding_subtier_agency_id + == self.funding_subtier_agency.subtier_agency_id, "leftouter", ) .join( self.funding_agency_id, - (self.funding_agency_id.toptier_agency_id == 
self.funding_toptier_agency.funding_toptier_agency_id)
+                (
+                    self.funding_agency_id.toptier_agency_id
+                    == self.funding_toptier_agency.funding_toptier_agency_id
+                )
                 & (self.funding_agency_id.row_num == 1),
                 "leftouter",
             )
             .join(
                 self.parent_recipient,
-                self.parent_recipient.parent_recipient_hash == self.generated_parent_recipient_hash,
+                self.parent_recipient.parent_recipient_hash
+                == self.generated_parent_recipient_hash,
                 "leftouter",
             )
             .join(
@@ -833,18 +942,26 @@ def dataframe(self) -> DataFrame:
         df_with_location = self.join_location_data(df)
         return (
             df_with_location.join(
-                self.current_cd, self.transaction_normalized.id == self.current_cd.transaction_id, "leftouter"
+                self.current_cd,
+                self.transaction_normalized.id == self.current_cd.transaction_id,
+                "leftouter",
             )
             .join(
                 self.awarding_office,
                 self.awarding_office.office_code
-                == sf.coalesce(self.transaction_fabs.awarding_office_code, self.transaction_fpds.awarding_office_code),
+                == sf.coalesce(
+                    self.transaction_fabs.awarding_office_code,
+                    self.transaction_fpds.awarding_office_code,
+                ),
                 "leftouter",
             )
             .join(
                 self.funding_office,
                 self.funding_office.office_code
-                == sf.coalesce(self.transaction_fabs.funding_office_code, self.transaction_fpds.funding_office_code),
+                == sf.coalesce(
+                    self.transaction_fabs.funding_office_code,
+                    self.transaction_fpds.funding_office_code,
+                ),
                 "leftouter",
             )
             .join(
@@ -870,7 +987,9 @@ def dataframe(self) -> DataFrame:
     )


-def load_transaction_search(spark: SparkSession, destination_database: str, destination_table_name: str) -> None:
+def load_transaction_search(
+    spark: SparkSession, destination_database: str, destination_table_name: str
+) -> None:
     df = TransactionSearch(spark).dataframe
     df.write.saveAsTable(
         f"{destination_database}.{destination_table_name}",
@@ -882,10 +1001,15 @@
 def load_transaction_search_incremental(
     spark: SparkSession, destination_database: str, destination_table_name: str
 ) -> None:
-    target = DeltaTable.forName(spark, f"{destination_database}.{destination_table_name}").alias("t")
+    target = DeltaTable.forName(
+        spark, f"{destination_database}.{destination_table_name}"
+    ).alias("t")
     source = TransactionSearch(spark).dataframe.alias("s")
     (
-        target.merge(source, "s.transaction_id = t.transaction_id and s.merge_hash_key = t.merge_hash_key")
+        target.merge(
+            source,
+            "s.transaction_id = t.transaction_id and s.merge_hash_key = t.merge_hash_key",
+        )
         .whenNotMatchedInsertAll()
         .whenNotMatchedBySourceDelete()
         .execute()
diff --git a/usaspending_api/search/migrations/0059_alter_transactionsearch_initial_report_date_and_more.py b/usaspending_api/search/migrations/0059_alter_transactionsearch_initial_report_date_and_more.py
new file mode 100644
index 0000000000..c023c9fca9
--- /dev/null
+++ b/usaspending_api/search/migrations/0059_alter_transactionsearch_initial_report_date_and_more.py
@@ -0,0 +1,26 @@
+# Generated by Django 4.2.23 on 2026-01-02 16:42
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+    atomic = False
+    dependencies = [
+        ("search", "0058_add_transaction_count_field"),
+    ]
+
+    operations = [
+        # These views must be dropped first; otherwise the ALTER COLUMN in 0060 fails with the error: cannot alter type of a column used by a view or rule
+        migrations.RunSQL(
+            sql="""
+            DROP VIEW IF EXISTS
+                vw_awards,
+                vw_transaction_fabs,
+                vw_transaction_normalized,
+                vw_transaction_fpds,
+                transaction_delta_view
+            CASCADE;
+            """,
+            reverse_sql=migrations.RunSQL.noop,
+        ),
+    ]
diff --git
a/usaspending_api/search/migrations/0060_alter_initial_report_date_andmore.py b/usaspending_api/search/migrations/0060_alter_initial_report_date_andmore.py new file mode 100644 index 0000000000..4badf0d9ff --- /dev/null +++ b/usaspending_api/search/migrations/0060_alter_initial_report_date_andmore.py @@ -0,0 +1,57 @@ +# Generated by Django 4.2.23 on 2026-01-02 16:42 + +from django.db import migrations, models + +from usaspending_api.awards.models.award import vw_awards_sql +from usaspending_api.awards.models.transaction_fabs import vw_transaction_fabs_sql +from usaspending_api.awards.models.transaction_fpds import vw_transaction_fpds_sql +from usaspending_api.awards.models.transaction_normalized import ( + vw_transaction_normalized_sql, +) + +transaction_delta_view_file = ( + "usaspending_api/database_scripts/etl/transaction_delta_view.sql" +) +with open(transaction_delta_view_file, "r") as f: + transaction_delta_view = f.read() + + +class Migration(migrations.Migration): + atomic = False + dependencies = [ + ("search", "0059_alter_transactionsearch_initial_report_date_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="transactionsearch", + name="initial_report_date", + field=models.DateTimeField(null=True), + ), + migrations.AlterField( + model_name="transactionsearch", + name="last_modified_date", + field=models.DateTimeField(null=True), + ), + migrations.AlterField( + model_name="awardsearch", + name="last_modified_date", + field=models.DateTimeField(blank=True, null=True), + ), + migrations.RunSQL( + sql=f""" + {vw_awards_sql} + {vw_transaction_normalized_sql} + {vw_transaction_fpds_sql} + {vw_transaction_fabs_sql} + {transaction_delta_view} + """, + reverse_sql="""DROP VIEW IF EXISTS + vw_awards, + vw_transaction_fabs, + vw_transaction_normalized, + vw_transaction_fpds, + transaction_delta_view + """, + ), + ] diff --git a/usaspending_api/search/models/award_search.py b/usaspending_api/search/models/award_search.py index c915ee234c..5a6d8ddbc8 100644 --- a/usaspending_api/search/models/award_search.py +++ b/usaspending_api/search/models/award_search.py @@ -9,8 +9,12 @@ class AwardSearch(models.Model): - treasury_account_identifiers = ArrayField(models.IntegerField(), default=list, null=True) - award = models.OneToOneField(Award, on_delete=models.DO_NOTHING, primary_key=True, related_name="%(class)s") + treasury_account_identifiers = ArrayField( + models.IntegerField(), default=list, null=True + ) + award = models.OneToOneField( + Award, on_delete=models.DO_NOTHING, primary_key=True, related_name="%(class)s" + ) category = models.TextField(null=True, db_index=True) type_raw = models.TextField(null=True, db_index=True) type_description_raw = models.TextField(null=True) @@ -18,19 +22,31 @@ class AwardSearch(models.Model): type_description = models.TextField(null=True) generated_unique_award_id = models.TextField(null=False, unique=True) generated_unique_award_id_legacy = models.TextField( - null=True, unique=True, help_text="Legacy generated unique award ID built using subtier awarding agency code" + null=True, + unique=True, + help_text="Legacy generated unique award ID built using subtier awarding agency code", ) display_award_id = models.TextField(null=True) update_date = models.DateTimeField(auto_now=True, null=True) piid = models.TextField(null=True, db_index=True) fain = models.TextField(null=True, db_index=True) uri = models.TextField(null=True, db_index=True) - award_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) - 
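Reviewer note: on load_transaction_search_incremental above: the merge condition matches on transaction_id plus merge_hash_key and defines no whenMatched clause, so unchanged rows are left alone, new or changed rows are inserted (a changed row's hash no longer matches), and rows absent from the source, including the stale versions of changed rows, are deleted. This reading assumes merge_hash_key is a content hash of the row. A stripped-down sketch of the same shape, with placeholder table and column names:

from delta.tables import DeltaTable

def merge_snapshot(spark, source_df, table_name: str) -> None:
    # Insert/delete-only Delta merge: no whenMatchedUpdate means rows with the
    # same id and the same content hash are deliberately left untouched.
    target = DeltaTable.forName(spark, table_name).alias("t")
    (
        target.merge(source_df.alias("s"), "s.id = t.id AND s.row_hash = t.row_hash")
        .whenNotMatchedInsertAll()       # new rows, plus changed rows (hash differs)
        .whenNotMatchedBySourceDelete()  # vanished rows, plus stale versions of changed rows
        .execute()
    )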
total_obligation = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True, db_index=True) - total_outlays = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True, db_index=True) + award_amount = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) + total_obligation = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True, db_index=True + ) + total_outlays = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True, db_index=True + ) description = models.TextField(null=True) - total_subsidy_cost = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) - total_loan_value = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) + total_subsidy_cost = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) + total_loan_value = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) total_obl_bin = models.TextField(null=True) recipient_hash = models.UUIDField(null=True) @@ -45,15 +61,19 @@ class AwardSearch(models.Model): action_date = models.DateField(null=True) fiscal_year = models.IntegerField(null=True) - last_modified_date = models.DateField(blank=True, null=True) + last_modified_date = models.DateTimeField(blank=True, null=True) period_of_performance_start_date = models.DateField(null=True, db_index=True) period_of_performance_current_end_date = models.DateField(null=True, db_index=True) date_signed = models.DateField(null=True) ordering_period_end_date = models.DateField(null=True) - original_loan_subsidy_cost = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) - face_value_loan_guarantee = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) + original_loan_subsidy_cost = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) + face_value_loan_guarantee = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) awarding_agency_id = models.IntegerField(null=True, db_index=True) funding_agency_id = models.IntegerField(null=True, db_index=True) @@ -137,34 +157,64 @@ class AwardSearch(models.Model): tas_paths = ArrayField(models.TextField(), default=list, null=True) tas_components = ArrayField(models.TextField(), default=list, null=True) - disaster_emergency_fund_codes = ArrayField(models.TextField(), default=list, null=True) + disaster_emergency_fund_codes = ArrayField( + models.TextField(), default=list, null=True + ) spending_by_defc = models.JSONField(null=True) - total_covid_outlay = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) - total_covid_obligation = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) - total_iija_outlay = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) - total_iija_obligation = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) - officer_1_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True) + total_covid_outlay = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) + total_covid_obligation = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) + total_iija_outlay = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) + total_iija_obligation = models.DecimalField( + max_digits=23, decimal_places=2, blank=True, null=True + ) + 
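Reviewer note: on the pair of migrations above, Postgres will not change the type of a column that a view or rule still references, so 0059 drops the five dependent views (irreversibly; its reverse_sql is a noop) and 0060 runs the AlterField operations and then recreates each view from its canonical SQL. A sketch of the failure mode the 0059 comment quotes, against an assumed throwaway view; the DETAIL text may vary by Postgres version:

# psql sketch with assumed objects, not taken from the project:
#   CREATE VIEW vw_demo AS SELECT last_modified_date FROM transaction_search;
#   ALTER TABLE transaction_search
#       ALTER COLUMN last_modified_date TYPE timestamp;
#   ERROR:  cannot alter type of a column used by a view or rule
#   DETAIL: rule _RETURN on view vw_demo depends on column "last_modified_date"
# Hence the ordering: DROP VIEW ... CASCADE (0059) -> AlterField (0060) -> re-CREATE views.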
officer_1_amount = models.DecimalField(
+        max_digits=23, decimal_places=2, blank=True, null=True
+    )
     officer_1_name = models.TextField(null=True)
-    officer_2_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True)
+    officer_2_amount = models.DecimalField(
+        max_digits=23, decimal_places=2, blank=True, null=True
+    )
     officer_2_name = models.TextField(null=True)
-    officer_3_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True)
+    officer_3_amount = models.DecimalField(
+        max_digits=23, decimal_places=2, blank=True, null=True
+    )
     officer_3_name = models.TextField(null=True)
-    officer_4_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True)
+    officer_4_amount = models.DecimalField(
+        max_digits=23, decimal_places=2, blank=True, null=True
+    )
     officer_4_name = models.TextField(null=True)
-    officer_5_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True)
+    officer_5_amount = models.DecimalField(
+        max_digits=23, decimal_places=2, blank=True, null=True
+    )
     officer_5_name = models.TextField(null=True)
     is_fpds = models.BooleanField(default=False)
     fpds_agency_id = models.TextField(null=True)
     fpds_parent_agency_id = models.TextField(null=True)
-    base_and_all_options_value = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True)
-    non_federal_funding_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True)
-    total_subaward_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True)
+    base_and_all_options_value = models.DecimalField(
+        max_digits=23, decimal_places=2, blank=True, null=True
+    )
+    non_federal_funding_amount = models.DecimalField(
+        max_digits=23, decimal_places=2, blank=True, null=True
+    )
+    total_subaward_amount = models.DecimalField(
+        max_digits=23, decimal_places=2, blank=True, null=True
+    )
     subaward_count = models.IntegerField(null=True)
-    base_exercised_options_val = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True)
+    base_exercised_options_val = models.DecimalField(
+        max_digits=23, decimal_places=2, blank=True, null=True
+    )
     parent_award_piid = models.TextField(null=True, db_index=True)
     certified_date = models.DateField(blank=True, null=True)
     create_date = models.DateTimeField(null=True, auto_now_add=True)
-    total_funding_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True)
+    total_funding_amount = models.DecimalField(
+        max_digits=23, decimal_places=2, blank=True, null=True
+    )
     latest_transaction = models.ForeignKey(
         "awards.TransactionNormalized",
         on_delete=models.DO_NOTHING,
@@ -199,11 +249,15 @@ class AwardSearch(models.Model):
         "award",
         db_constraint=False,
     )
-    total_indirect_federal_sharing = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True)
+    total_indirect_federal_sharing = models.DecimalField(
+        max_digits=23, decimal_places=2, blank=True, null=True
+    )
     transaction_unique_id = models.TextField(null=True)
     raw_recipient_name = models.TextField(null=True)
     data_source = models.TextField(null=True)
-    generated_pragmatic_obligation = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True)
+    generated_pragmatic_obligation = models.DecimalField(
+        max_digits=23, decimal_places=2, blank=True, null=True
+    )
     program_activities = models.JSONField(null=True)
     transaction_count = models.IntegerField(null=True)
@@ -219,12 +273,15 @@ class Meta:
         ]
         indexes = [
             models.Index(
-                fields=["recipient_hash"], name="as_idx_recipient_hash", condition=Q(action_date__gte="2007-10-01")
+                fields=["recipient_hash"],
+                name="as_idx_recipient_hash",
+                condition=Q(action_date__gte="2007-10-01"),
             ),
             models.Index(
                 fields=["recipient_unique_id"],
                 name="as_idx_recipient_unique_id",
-                condition=Q(recipient_unique_id__isnull=False) & Q(action_date__gte="2007-10-01"),
+                condition=Q(recipient_unique_id__isnull=False)
+                & Q(action_date__gte="2007-10-01"),
             ),
             models.Index(
                 F("action_date").desc(nulls_last=True),
@@ -258,8 +315,12 @@ class Meta:
                 condition=Q(action_date__lt="2007-10-01"),
             ),
             models.Index(Upper("piid"), name="as_idx_piid_upper"),
-            models.Index(Upper("parent_award_piid"), name="as_idx_parent_award_piid_upper"),
+            models.Index(
+                Upper("parent_award_piid"), name="as_idx_parent_award_piid_upper"
+            ),
             models.Index(Upper("fain"), name="as_idx_fain_upper"),
             models.Index(Upper("uri"), name="as_idx_uri_upper"),
-            models.Index(F("update_date").desc(nulls_last=True), name="as_idx_update_date_desc"),
+            models.Index(
+                F("update_date").desc(nulls_last=True), name="as_idx_update_date_desc"
+            ),
         ]
diff --git a/usaspending_api/search/models/transaction_search.py b/usaspending_api/search/models/transaction_search.py
index 7b97920e76..7b9c230e15 100644
--- a/usaspending_api/search/models/transaction_search.py
+++ b/usaspending_api/search/models/transaction_search.py
@@ -18,8 +18,12 @@ class TransactionSearch(models.Model):
     # Also, this table has been physically partitioned by partition key: is_fpds. We can no longer have a UNIQUE key
     # or UNIQUE INDEX on transaction_id (the primary_key) anymore, it must include the partition key. So setting
     # primary_key=False and adding a UniqueConstraint (is_fpds, transaction)
-    transaction = models.OneToOneField("awards.TransactionNormalized", on_delete=models.DO_NOTHING, primary_key=True)
-    award = models.ForeignKey("search.AwardSearch", on_delete=models.DO_NOTHING, null=True)
+    transaction = models.OneToOneField(
+        "awards.TransactionNormalized", on_delete=models.DO_NOTHING, primary_key=True
+    )
+    award = models.ForeignKey(
+        "search.AwardSearch", on_delete=models.DO_NOTHING, null=True
+    )
     transaction_unique_id = models.TextField(blank=False, null=False, default="NONE")
     usaspending_unique_transaction_id = models.TextField(null=True)
     modification_number = models.TextField(null=True)
@@ -28,7 +32,7 @@ class TransactionSearch(models.Model):
     # Dates
     action_date = models.DateField(null=True)
     fiscal_action_date = models.DateField(null=True)
-    last_modified_date = models.DateField(null=True)
+    last_modified_date = models.DateTimeField(null=True)
     fiscal_year = models.IntegerField(null=True)
     award_certified_date = models.DateField(null=True)
     award_fiscal_year = models.IntegerField(null=True)
@@ -39,7 +43,7 @@ class TransactionSearch(models.Model):
     etl_update_date = models.DateTimeField(null=True)
     period_of_performance_start_date = models.DateField(null=True)
     period_of_performance_current_end_date = models.DateField(null=True)
-    initial_report_date = models.DateField(null=True)
+    initial_report_date = models.DateTimeField(null=True)
 
     # Agencies
     awarding_agency_code = models.TextField(null=True)
@@ -80,15 +84,31 @@ class TransactionSearch(models.Model):
     business_categories = ArrayField(models.TextField(), null=True)
 
     # Amounts
-    award_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True)
-    generated_pragmatic_obligation = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True)
-    federal_action_obligation = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True)
-    original_loan_subsidy_cost = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True)
-    face_value_loan_guarantee = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True)
+    award_amount = models.DecimalField(
+        max_digits=23, decimal_places=2, blank=True, null=True
+    )
+    generated_pragmatic_obligation = models.DecimalField(
+        max_digits=23, decimal_places=2, blank=True, null=True
+    )
+    federal_action_obligation = models.DecimalField(
+        max_digits=23, decimal_places=2, blank=True, null=True
+    )
+    original_loan_subsidy_cost = models.DecimalField(
+        max_digits=23, decimal_places=2, blank=True, null=True
+    )
+    face_value_loan_guarantee = models.DecimalField(
+        max_digits=23, decimal_places=2, blank=True, null=True
+    )
     indirect_federal_sharing = NumericField(blank=True, null=True)
-    funding_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True)
-    total_funding_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True)
-    non_federal_funding_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True)
+    funding_amount = models.DecimalField(
+        max_digits=23, decimal_places=2, blank=True, null=True
+    )
+    total_funding_amount = models.DecimalField(
+        max_digits=23, decimal_places=2, blank=True, null=True
+    )
+    non_federal_funding_amount = models.DecimalField(
+        max_digits=23, decimal_places=2, blank=True, null=True
+    )
 
     # Recipient
     recipient_hash = models.UUIDField(null=True)
@@ -161,15 +181,25 @@ class TransactionSearch(models.Model):
 
     # Officer Amounts
     officer_1_name = models.TextField(null=True)
-    officer_1_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True)
+    officer_1_amount = models.DecimalField(
+        max_digits=23, decimal_places=2, blank=True, null=True
+    )
     officer_2_name = models.TextField(null=True)
-    officer_2_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True)
+    officer_2_amount = models.DecimalField(
+        max_digits=23, decimal_places=2, blank=True, null=True
+    )
     officer_3_name = models.TextField(null=True)
-    officer_3_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True)
+    officer_3_amount = models.DecimalField(
+        max_digits=23, decimal_places=2, blank=True, null=True
+    )
     officer_4_name = models.TextField(null=True)
-    officer_4_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True)
+    officer_4_amount = models.DecimalField(
+        max_digits=23, decimal_places=2, blank=True, null=True
+    )
     officer_5_name = models.TextField(null=True)
-    officer_5_amount = models.DecimalField(max_digits=23, decimal_places=2, blank=True, null=True)
+    officer_5_amount = models.DecimalField(
+        max_digits=23, decimal_places=2, blank=True, null=True
+    )
 
     # Exclusively FABS
     published_fabs_id = models.IntegerField(blank=True, null=True)
@@ -417,7 +447,11 @@ class TransactionSearch(models.Model):
 
     class Meta:
         db_table = "transaction_search"
-        constraints = [models.UniqueConstraint(fields=["is_fpds", "transaction"], name="ts_idx_is_fpds_transaction_id")]
+        constraints = [
+            models.UniqueConstraint(
+                fields=["is_fpds", "transaction"], name="ts_idx_is_fpds_transaction_id"
+            )
+        ]
         indexes = [
             models.Index(fields=["transaction"], name="ts_idx_transaction_id"),
             models.Index(fields=["generated_unique_award_id"], name="ts_idx_award_key"),
@@ -431,26 +465,50 @@ class Meta:
                 name="ts_idx_fpds_key_pre2008",
                 condition=Q(action_date__lt="2007-10-01"),
             ),
-            models.Index(fields=["piid"], name="ts_idx_piid_pre2008", condition=Q(action_date__lt="2007-10-01")),
+            models.Index(
+                fields=["piid"],
+                name="ts_idx_piid_pre2008",
+                condition=Q(action_date__lt="2007-10-01"),
+            ),
             models.Index(
                 fields=["parent_award_id"],
                 name="ts_idx_parent_award_id_pre2008",
                 condition=Q(action_date__lt="2007-10-01"),
             ),
-            models.Index(fields=["fain"], name="ts_idx_fain_pre2008", condition=Q(action_date__lt="2007-10-01")),
-            models.Index(fields=["uri"], name="ts_idx_uri_pre2008", condition=Q(action_date__lt="2007-10-01")),
+            models.Index(
+                fields=["fain"],
+                name="ts_idx_fain_pre2008",
+                condition=Q(action_date__lt="2007-10-01"),
+            ),
+            models.Index(
+                fields=["uri"],
+                name="ts_idx_uri_pre2008",
+                condition=Q(action_date__lt="2007-10-01"),
+            ),
             models.Index(fields=["is_fpds"], name="ts_idx_is_fpds"),
             models.Index(
-                fields=["-action_date"], name="ts_idx_action_date", condition=Q(action_date__gte="2007-10-01")
+                fields=["-action_date"],
+                name="ts_idx_action_date",
+                condition=Q(action_date__gte="2007-10-01"),
             ),
-            models.Index(fields=["-last_modified_date"], name="ts_idx_last_modified_date"),
             models.Index(
-                fields=["-fiscal_year"], name="ts_idx_fiscal_year", condition=Q(action_date__gte="2007-10-01")
+                fields=["-last_modified_date"], name="ts_idx_last_modified_date"
             ),
             models.Index(
-                fields=["type"], name="ts_idx_type", condition=Q(type__isnull=False) & Q(action_date__gte="2007-10-01")
+                fields=["-fiscal_year"],
+                name="ts_idx_fiscal_year",
+                condition=Q(action_date__gte="2007-10-01"),
+            ),
+            models.Index(
+                fields=["type"],
+                name="ts_idx_type",
+                condition=Q(type__isnull=False) & Q(action_date__gte="2007-10-01"),
+            ),
+            models.Index(
+                fields=["award"],
+                name="ts_idx_award_id",
+                condition=Q(action_date__gte="2007-10-01"),
             ),
-            models.Index(fields=["award"], name="ts_idx_award_id", condition=Q(action_date__gte="2007-10-01")),
             models.Index(
                 fields=["pop_zip5"],
                 name="ts_idx_pop_zip5",
@@ -459,12 +517,14 @@ class Meta:
             models.Index(
                 fields=["recipient_unique_id"],
                 name="ts_idx_recipient_unique_id",
-                condition=Q(recipient_unique_id__isnull=False) & Q(action_date__gte="2007-10-01"),
+                condition=Q(recipient_unique_id__isnull=False)
+                & Q(action_date__gte="2007-10-01"),
             ),
             models.Index(
                 fields=["parent_recipient_unique_id"],
                 name="ts_idx_parent_recipient_unique",
-                condition=Q(parent_recipient_unique_id__isnull=False) & Q(action_date__gte="2007-10-01"),
+                condition=Q(parent_recipient_unique_id__isnull=False)
+                & Q(action_date__gte="2007-10-01"),
             ),
             models.Index(
                 fields=["pop_state_code", "action_date"],
@@ -474,10 +534,14 @@ class Meta:
                 & Q(action_date__gte="2007-10-01"),
             ),
             models.Index(
-                fields=["recipient_hash"], name="ts_idx_recipient_hash", condition=Q(action_date__gte="2007-10-01")
+                fields=["recipient_hash"],
+                name="ts_idx_recipient_hash",
+                condition=Q(action_date__gte="2007-10-01"),
             ),
             models.Index(
-                fields=["action_date"], name="ts_idx_action_date_pre2008", condition=Q(action_date__lt="2007-10-01")
+                fields=["action_date"],
+                name="ts_idx_action_date_pre2008",
+                condition=Q(action_date__lt="2007-10-01"),
             ),
             models.Index(fields=["etl_update_date"], name="ts_idx_etl_update_date"),
             models.Index(
@@ -485,12 +549,20 @@ class Meta:
                 name="ts_idx_tocp_pre2008",
                 condition=Q(action_date__lt="2007-10-01"),
             ),
-            models.Index(fields=["naics_code"], name="ts_idx_naics_pre2008", condition=Q(action_date__lt="2007-10-01")),
             models.Index(
-                fields=["extent_competed"], name="ts_idx_ext_com_pre2008", condition=Q(action_date__lt="2007-10-01")
+                fields=["naics_code"],
+                name="ts_idx_naics_pre2008",
+                condition=Q(action_date__lt="2007-10-01"),
+            ),
+            models.Index(
+                fields=["extent_competed"],
+                name="ts_idx_ext_com_pre2008",
+                condition=Q(action_date__lt="2007-10-01"),
             ),
             models.Index(
-                fields=["product_or_service_code"], name="ts_idx_psc_pre2008", condition=Q(action_date__lt="2007-10-01")
+                fields=["product_or_service_code"],
+                name="ts_idx_psc_pre2008",
+                condition=Q(action_date__lt="2007-10-01"),
             ),
             models.Index(
                 fields=["type_set_aside"],
@@ -498,8 +570,12 @@ class Meta:
                 condition=Q(action_date__lt="2007-10-01"),
            ),
             models.Index(
-                fields=["cfda_number"], name="ts_idx_cfda_aside_pre2008", condition=Q(action_date__lt="2007-10-01")
+                fields=["cfda_number"],
+                name="ts_idx_cfda_aside_pre2008",
+                condition=Q(action_date__lt="2007-10-01"),
+            ),
+            models.Index(
+                fields=["awarding_agency_id"], name="ts_idx_awarding_agency_id"
             ),
-            models.Index(fields=["awarding_agency_id"], name="ts_idx_awarding_agency_id"),
             models.Index(fields=["funding_agency_id"], name="ts_idx_funding_agency_id"),
         ]
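
The model comment above relies on a PostgreSQL rule worth spelling out: on a partitioned table, any UNIQUE constraint or unique index must include every partition-key column, which is why uniqueness here has to be declared on (is_fpds, transaction) rather than on the transaction id alone. A minimal sketch of that pattern, using a hypothetical model that is not part of this changeset:

    # Hypothetical Django model mirroring the constraint pattern used above.
    from django.db import models


    class PartitionedDemo(models.Model):
        transaction_id = models.BigIntegerField()
        is_fpds = models.BooleanField(default=False)  # partition key

        class Meta:
            constraints = [
                # Works on a table partitioned by is_fpds because the partition
                # key is part of the constraint; UNIQUE (transaction_id) alone
                # would be rejected by PostgreSQL on such a table.
                models.UniqueConstraint(
                    fields=["is_fpds", "transaction_id"],
                    name="demo_uq_is_fpds_transaction_id",
                )
            ]
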
diff --git a/usaspending_api/search/tests/data/spending_by_award_test_data.py b/usaspending_api/search/tests/data/spending_by_award_test_data.py
index f651a2fced..5f49bf0453 100644
--- a/usaspending_api/search/tests/data/spending_by_award_test_data.py
+++ b/usaspending_api/search/tests/data/spending_by_award_test_data.py
@@ -280,7 +280,9 @@ def spending_by_award_test_data():
         pop_zip4="9040",
         cfda_number="64.114",
         cfda_program_title="VETERANS HOUSING GUARANTEED AND INSURED LOANS",
-        cfdas=['{"cfda_number": "64.114", "cfda_program_title": "VETERANS HOUSING GUARANTEED AND INSURED LOANS"}'],
+        cfdas=[
+            '{"cfda_number": "64.114", "cfda_program_title": "VETERANS HOUSING GUARANTEED AND INSURED LOANS"}'
+        ],
     )
 
     baker.make(
@@ -320,7 +322,9 @@ def spending_by_award_test_data():
         pop_zip4="9040",
         cfda_number="64.114",
         cfda_program_title="VETERANS HOUSING GUARANTEED AND INSURED LOANS",
-        cfdas=['{"cfda_number": "64.114", "cfda_program_title": "VETERANS HOUSING GUARANTEED AND INSURED LOANS"}'],
+        cfdas=[
+            '{"cfda_number": "64.114", "cfda_program_title": "VETERANS HOUSING GUARANTEED AND INSURED LOANS"}'
+        ],
     )
 
     award_6 = baker.make(
@@ -360,7 +364,9 @@ def spending_by_award_test_data():
         pop_zip4="9040",
         cfda_number="64.114",
         cfda_program_title="VETERANS HOUSING GUARANTEED AND INSURED LOANS",
-        cfdas=['{"cfda_number": "64.114", "cfda_program_title": "VETERANS HOUSING GUARANTEED AND INSURED LOANS"}'],
+        cfdas=[
+            '{"cfda_number": "64.114", "cfda_program_title": "VETERANS HOUSING GUARANTEED AND INSURED LOANS"}'
+        ],
     )
 
     baker.make(
@@ -669,7 +675,9 @@ def spending_by_award_test_data():
         pop_zip4="9040",
         cfda_number="64.114",
         cfda_program_title="VETERANS HOUSING GUARANTEED AND INSURED LOANS",
-        cfdas=['{"cfda_number": "64.114", "cfda_program_title": "VETERANS HOUSING GUARANTEED AND INSURED LOANS"}'],
+        cfdas=[
+            '{"cfda_number": "64.114", "cfda_program_title": "VETERANS HOUSING GUARANTEED AND INSURED LOANS"}'
+        ],
     )
 
     baker.make(
@@ -723,14 +731,34 @@ def spending_by_award_test_data():
         product_or_service_description="PSC description 1",
     )
 
+    baker.make(
+        "search.AwardSearch",
+        award_id=2026,
+        type="F003",
+        category="loan",
+        date_signed="2019-01-01",
+        action_date="2019-01-01",
+        fain="fain2026",
+        display_award_id="award2026",
+        generated_unique_award_id="ASST_NEW_TYPES_2026",
+    )
+
     # Toptier Agency
     ta1 = baker.make(
-        "references.ToptierAgency", abbreviation="TA1", name="TOPTIER AGENCY 1", toptier_code="ABC", _fill_optional=True
+        "references.ToptierAgency",
+        abbreviation="TA1",
+        name="TOPTIER AGENCY 1",
+        toptier_code="ABC",
+        _fill_optional=True,
     )
 
     # Federal Account
     baker.make(
-        "accounts.FederalAccount", id=1, parent_toptier_agency=ta1, agency_identifier="1", main_account_code="0001"
+        "accounts.FederalAccount",
+        id=1,
+        parent_toptier_agency=ta1,
+        agency_identifier="1",
+        main_account_code="0001",
     )
 
     # TAS
@@ -750,7 +778,10 @@ def spending_by_award_test_data():
         earliest_public_law_enactment_date="2020-03-06",
     )
     defc_q = baker.make(
-        "references.DisasterEmergencyFundCode", code="Q", group_name=None, earliest_public_law_enactment_date=None
+        "references.DisasterEmergencyFundCode",
+        code="Q",
+        group_name=None,
+        earliest_public_law_enactment_date=None,
     )
 
     # Submissions
@@ -832,16 +863,38 @@ def spending_by_award_test_data():
     )
 
     # Subtier Agency
-    subtier_agency_1 = {"pk": 1, "abbreviation": "SA1", "name": "SUBTIER AGENCY 1", "subtier_code": "DEF"}
-    subtier_agency_2 = {"pk": 2, "abbreviation": "SA2", "name": "SUBTIER AGENCY 2", "subtier_code": "1000"}
+    subtier_agency_1 = {
+        "pk": 1,
+        "abbreviation": "SA1",
+        "name": "SUBTIER AGENCY 1",
+        "subtier_code": "DEF",
+    }
+    subtier_agency_2 = {
+        "pk": 2,
+        "abbreviation": "SA2",
+        "name": "SUBTIER AGENCY 2",
+        "subtier_code": "1000",
+    }
     baker.make("references.SubtierAgency", **subtier_agency_1, _fill_optional=True)
     baker.make("references.SubtierAgency", **subtier_agency_2, _fill_optional=True)
 
     # Agency
-    baker.make("references.Agency", pk=1, toptier_agency=ta1, subtier_agency_id=1, _fill_optional=True)
+    baker.make(
+        "references.Agency",
+        pk=1,
+        toptier_agency=ta1,
+        subtier_agency_id=1,
+        _fill_optional=True,
+    )
 
-    baker.make("search.TransactionSearch", transaction_id=1, award=award_1, action_date="2020-04-01", is_fpds=True)
+    baker.make(
+        "search.TransactionSearch",
+        transaction_id=1,
+        award=award_1,
+        action_date="2020-04-01",
+        is_fpds=True,
+    )
     baker.make(
         "search.TransactionSearch",
         transaction_id=2,
@@ -879,8 +932,20 @@ def spending_by_award_test_data():
         recipient_location_county_code="012",
         naics_code="112244",
     )
-    baker.make("search.TransactionSearch", transaction_id=4, award=award_3, action_date="2017-01-01", is_fpds=True)
-    baker.make("search.TransactionSearch", transaction_id=5, award=award_3, action_date="2018-01-01", is_fpds=True)
+    baker.make(
+        "search.TransactionSearch",
+        transaction_id=4,
+        award=award_3,
+        action_date="2017-01-01",
+        is_fpds=True,
+    )
+    baker.make(
+        "search.TransactionSearch",
+        transaction_id=5,
+        award=award_3,
+        action_date="2018-01-01",
+        is_fpds=True,
+    )
     baker.make(
         "search.TransactionSearch",
         transaction_id=6,
@@ -898,7 +963,13 @@ def spending_by_award_test_data():
         cfda_number="10.331",
         recipient_unique_id="duns_1001",
     )
-    baker.make("search.TransactionSearch", transaction_id=8, award=award_5, action_date="2019-10-1", is_fpds=True)
+    baker.make(
+        "search.TransactionSearch",
+        transaction_id=8,
+        award=award_5,
+        action_date="2019-10-1",
+        is_fpds=True,
+    )
 
     baker.make(
         "search.SubawardSearch",
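
The fixture changes above are almost entirely line-wrapping around model_bakery's baker.make calls. For readers unfamiliar with the library, a generic sketch of what those calls do (the model label and field values here are illustrative):

    from model_bakery import baker

    # baker.make("app_label.ModelName", ...) creates and saves an instance,
    # generating values for any fields not passed explicitly;
    # _fill_optional=True also populates nullable/blank fields.
    award = baker.make(
        "search.AwardSearch",
        award_id=1,          # explicit values override generated ones
        type="A",
        _fill_optional=True,
    )
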
diff --git a/usaspending_api/search/tests/integration/test_spending_by_award.py b/usaspending_api/search/tests/integration/test_spending_by_award.py
index 1eafd1580f..0663d12589 100644
--- a/usaspending_api/search/tests/integration/test_spending_by_award.py
+++ b/usaspending_api/search/tests/integration/test_spending_by_award.py
@@ -7,20 +7,39 @@
 from usaspending_api.awards.v2.lookups.lookups import all_award_types_mappings
 from usaspending_api.common.helpers.generic_helper import get_generic_filters_message
-from usaspending_api.search.tests.data.search_filters_test_data import legacy_filters, non_legacy_filters
+from usaspending_api.search.tests.data.search_filters_test_data import (
+    legacy_filters,
+    non_legacy_filters,
+)
 from usaspending_api.search.tests.data.utilities import setup_elasticsearch_test
 
 
 @pytest.fixture
 def award_data_fixture(db):
-    baker.make("search.TransactionSearch", transaction_id=210210210, action_date="2013-09-17")
-    baker.make("search.TransactionSearch", transaction_id=321032103, action_date="2013-09-17")
-    baker.make("search.TransactionSearch", transaction_id=432104321, action_date="2013-09-17")
-    baker.make("search.TransactionSearch", transaction_id=543210543, action_date="2013-09-17")
-    baker.make("search.TransactionSearch", transaction_id=654321065, action_date="2013-09-17")
-    baker.make("search.TransactionSearch", transaction_id=765432107, action_date="2013-09-17")
-    baker.make("search.TransactionSearch", transaction_id=876543210, action_date="2013-09-17")
-    baker.make("search.TransactionSearch", transaction_id=987654321, action_date="2013-09-17")
+    baker.make(
+        "search.TransactionSearch", transaction_id=210210210, action_date="2013-09-17"
+    )
+    baker.make(
+        "search.TransactionSearch", transaction_id=321032103, action_date="2013-09-17"
+    )
+    baker.make(
+        "search.TransactionSearch", transaction_id=432104321, action_date="2013-09-17"
+    )
+    baker.make(
+        "search.TransactionSearch", transaction_id=543210543, action_date="2013-09-17"
+    )
+    baker.make(
+        "search.TransactionSearch", transaction_id=654321065, action_date="2013-09-17"
+    )
+    baker.make(
+        "search.TransactionSearch", transaction_id=765432107, action_date="2013-09-17"
+    )
+    baker.make(
+        "search.TransactionSearch", transaction_id=876543210, action_date="2013-09-17"
+    )
+    baker.make(
+        "search.TransactionSearch", transaction_id=987654321, action_date="2013-09-17"
+    )
 
     ref_program_activity1 = baker.make(
         "references.RefProgramActivity",
@@ -35,9 +54,19 @@ def award_data_fixture(db):
         program_activity_name="PROGRAM_ACTIVITY_2",
     )
 
-    baker.make("references.DisasterEmergencyFundCode", code="L", group_name="covid_19", public_law="LAW", title="title")
     baker.make(
-        "references.DisasterEmergencyFundCode", code="Z", group_name="infrastructure", public_law="LAW", title="title"
+        "references.DisasterEmergencyFundCode",
+        code="L",
+        group_name="covid_19",
+        public_law="LAW",
+        title="title",
+    )
+    baker.make(
+        "references.DisasterEmergencyFundCode",
+        code="Z",
+        group_name="infrastructure",
+        public_law="LAW",
+        title="title",
     )
 
     award1 = baker.make(
@@ -286,7 +315,10 @@ def test_spending_by_award_subaward_success(
     assert resp.status_code == status.HTTP_200_OK
 
     # Testing contents of what is returned
-    spending_level_filter_list = [{"spending_level": "subawards"}, {"spending_level": "subawards"}]
+    spending_level_filter_list = [
+        {"spending_level": "subawards"},
+        {"spending_level": "subawards"},
+    ]
 
     for spending_level_filter in spending_level_filter_list:
         resp = client.post(
@@ -353,14 +385,21 @@ def test_spending_by_award_subaward_success(
 
 @pytest.mark.django_db
-def test_spending_by_award_legacy_filters(client, monkeypatch, elasticsearch_award_index):
+def test_spending_by_award_legacy_filters(
+    client, monkeypatch, elasticsearch_award_index
+):
     setup_elasticsearch_test(monkeypatch, elasticsearch_award_index)
 
     resp = client.post(
         "/api/v2/search/spending_by_award",
         content_type="application/json",
         data=json.dumps(
-            {"spending_level": "awards", "fields": ["Award ID"], "sort": "Award ID", "filters": legacy_filters()}
+            {
+                "spending_level": "awards",
+                "fields": ["Award ID"],
+                "sort": "Award ID",
+                "filters": legacy_filters(),
+            }
         ),
     )
     assert resp.status_code == status.HTTP_200_OK
@@ -368,9 +407,20 @@ def test_spending_by_award_legacy_filters(client, monkeypatch, elasticsearch_awa
 
 @pytest.mark.django_db
 def test_no_intersection(client, monkeypatch, elasticsearch_award_index):
-
-    baker.make("search.AwardSearch", award_id=1, type="A", latest_transaction_id=1, action_date="2020-10-10")
-    baker.make("search.TransactionSearch", transaction_id=1, action_date="2010-10-01", award_id=1, is_fpds=True)
+    baker.make(
+        "search.AwardSearch",
+        award_id=1,
+        type="A",
+        latest_transaction_id=1,
+        action_date="2020-10-10",
+    )
+    baker.make(
+        "search.TransactionSearch",
+        transaction_id=1,
+        action_date="2010-10-01",
+        award_id=1,
+        is_fpds=True,
+    )
 
     setup_elasticsearch_test(monkeypatch, elasticsearch_award_index)
 
@@ -381,19 +431,34 @@ def test_no_intersection(client, monkeypatch, elasticsearch_award_index):
         "filters": {"award_type_codes": ["A", "B", "C", "D"]},
     }
 
-    resp = client.post("/api/v2/search/spending_by_award", content_type="application/json", data=json.dumps(request))
+    resp = client.post(
+        "/api/v2/search/spending_by_award",
+        content_type="application/json",
+        data=json.dumps(request),
+    )
     assert resp.status_code == status.HTTP_200_OK
     assert len(resp.data["results"]) == 1
 
     request["filters"]["award_type_codes"].append("no intersection")
-    resp = client.post("/api/v2/search/spending_by_award", content_type="application/json", data=json.dumps(request))
+    resp = client.post(
+        "/api/v2/search/spending_by_award",
+        content_type="application/json",
+        data=json.dumps(request),
+    )
     assert resp.status_code == status.HTTP_200_OK
     assert len(resp.data["results"]) == 0, "Results returned, there should be 0"
 
 
 @pytest.fixture
 def awards_over_different_date_ranges():
-    award_category_list = ["contracts", "direct_payments", "grants", "idvs", "loans", "other_financial_assistance"]
+    award_category_list = [
+        "contracts",
+        "direct_payments",
+        "grants",
+        "idvs",
+        "loans",
+        "other_financial_assistance",
+    ]
 
     # The date ranges for the different awards are set up to cover possible intersection points with the
     # different date ranges being searched. The comments on each line specify where the date ranges are
@@ -402,23 +467,68 @@ def awards_over_different_date_ranges():
     # - {"start_date": "2017-02-01", "end_date": "2017-11-30"}
     date_range_list = [
         # Intersect only one of the date ranges searched for
-        {"date_signed": datetime(2014, 1, 1), "action_date": datetime(2014, 5, 1)},  # Before both
-        {"date_signed": datetime(2014, 3, 1), "action_date": datetime(2015, 4, 15)},  # Beginning of first
-        {"date_signed": datetime(2015, 2, 1), "action_date": datetime(2015, 7, 1)},  # Middle of first
+        {
+            "date_signed": datetime(2014, 1, 1),
+            "action_date": datetime(2014, 5, 1),
+        },  # Before both
+        {
+            "date_signed": datetime(2014, 3, 1),
+            "action_date": datetime(2015, 4, 15),
+        },  # Beginning of first
+        {
+            "date_signed": datetime(2015, 2, 1),
+            "action_date": datetime(2015, 7, 1),
+        },  # Middle of first
         {"date_signed": datetime(2015, 2, 1), "action_date": datetime(2015, 4, 17)},
-        {"date_signed": datetime(2014, 12, 1), "action_date": datetime(2016, 1, 1)},  # All of first
-        {"date_signed": datetime(2015, 11, 1), "action_date": datetime(2016, 3, 1)},  # End of first
-        {"date_signed": datetime(2016, 2, 23), "action_date": datetime(2016, 7, 19)},  # Between both
-        {"date_signed": datetime(2016, 11, 26), "action_date": datetime(2017, 3, 1)},  # Beginning of second
-        {"date_signed": datetime(2017, 5, 1), "action_date": datetime(2017, 7, 1)},  # Middle of second
-        {"date_signed": datetime(2017, 1, 1), "action_date": datetime(2017, 12, 1)},  # All of second
-        {"date_signed": datetime(2017, 9, 1), "action_date": datetime(2017, 12, 17)},  # End of second
-        {"date_signed": datetime(2018, 2, 1), "action_date": datetime(2018, 7, 1)},  # After both
+        {
+            "date_signed": datetime(2014, 12, 1),
+            "action_date": datetime(2016, 1, 1),
+        },  # All of first
+        {
+            "date_signed": datetime(2015, 11, 1),
+            "action_date": datetime(2016, 3, 1),
+        },  # End of first
+        {
+            "date_signed": datetime(2016, 2, 23),
+            "action_date": datetime(2016, 7, 19),
+        },  # Between both
+        {
+            "date_signed": datetime(2016, 11, 26),
+            "action_date": datetime(2017, 3, 1),
+        },  # Beginning of second
+        {
+            "date_signed": datetime(2017, 5, 1),
+            "action_date": datetime(2017, 7, 1),
+        },  # Middle of second
+        {
+            "date_signed": datetime(2017, 1, 1),
+            "action_date": datetime(2017, 12, 1),
+        },  # All of second
+        {
+            "date_signed": datetime(2017, 9, 1),
+            "action_date": datetime(2017, 12, 17),
+        },  # End of second
+        {
+            "date_signed": datetime(2018, 2, 1),
+            "action_date": datetime(2018, 7, 1),
+        },  # After both
         # Intersect both date ranges searched for
-        {"date_signed": datetime(2014, 12, 1), "action_date": datetime(2017, 12, 5)},  # Completely both
-        {"date_signed": datetime(2015, 7, 1), "action_date": datetime(2017, 5, 1)},  # Partially both
-        {"date_signed": datetime(2014, 10, 3), "action_date": datetime(2017, 4, 8)},  # All first; partial second
-        {"date_signed": datetime(2015, 8, 1), "action_date": datetime(2018, 1, 2)},  # Partial first; all second
+        {
+            "date_signed": datetime(2014, 12, 1),
+            "action_date": datetime(2017, 12, 5),
+        },  # Completely both
+        {
+            "date_signed": datetime(2015, 7, 1),
+            "action_date": datetime(2017, 5, 1),
+        },  # Partially both
+        {
+            "date_signed": datetime(2014, 10, 3),
+            "action_date": datetime(2017, 4, 8),
+        },  # All first; partial second
+        {
+            "date_signed": datetime(2015, 8, 1),
+            "action_date": datetime(2018, 1, 2),
+        },  # Partial first; all second
     ]
 
     award_id = 0
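
The comment in the fixture above describes ordinary interval-overlap coverage: each award's date_signed..action_date range is placed before, at the edge of, inside, around, or after the ranges being searched. The overlap test itself is the standard one; a small sketch (my own illustration, not code from this changeset):

    from datetime import date


    def ranges_intersect(a_start: date, a_end: date, b_start: date, b_end: date) -> bool:
        # Two closed date ranges overlap exactly when each starts before the other ends.
        return a_start <= b_end and b_start <= a_end


    # "All of second": an award signed 2017-01-01 with action date 2017-12-01
    # fully covers the searched range 2017-02-01..2017-11-30.
    assert ranges_intersect(
        date(2017, 1, 1), date(2017, 12, 1), date(2017, 2, 1), date(2017, 11, 30)
    )
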
"/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_with_contracts) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(request_with_contracts), ) assert resp.status_code == status.HTTP_200_OK assert len(resp.data["results"]) == 9 @@ -497,7 +609,9 @@ def test_date_range_search_with_one_range( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_with_grants) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(request_with_grants), ) assert resp.status_code == status.HTTP_200_OK assert len(resp.data["results"]) == 8 @@ -516,11 +630,15 @@ def test_date_range_search_with_one_range( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_for_one_award) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(request_for_one_award), ) assert resp.status_code == status.HTTP_200_OK assert len(resp.data["results"]) == 1 - assert resp.data["results"] == [{"Award ID": "abcdefg1", "internal_id": 1, "generated_internal_id": "AWARD_1"}] + assert resp.data["results"] == [ + {"Award ID": "abcdefg1", "internal_id": 1, "generated_internal_id": "AWARD_1"} + ] # Test with no award showing request_for_no_awards = { @@ -536,7 +654,9 @@ def test_date_range_search_with_one_range( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_for_no_awards) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(request_for_no_awards), ) assert resp.status_code == status.HTTP_200_OK assert len(resp.data["results"]) == 0 @@ -568,7 +688,9 @@ def test_date_range_search_with_two_ranges( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_with_contracts) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(request_with_contracts), ) assert resp.status_code == status.HTTP_200_OK assert len(resp.data["results"]) == 13 @@ -590,7 +712,9 @@ def test_date_range_search_with_two_ranges( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_with_grants) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(request_with_grants), ) assert resp.status_code == status.HTTP_200_OK assert len(resp.data["results"]) == 13 @@ -612,7 +736,9 @@ def test_date_range_search_with_two_ranges( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_for_two_awards) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(request_for_two_awards), ) assert resp.status_code == status.HTTP_200_OK assert len(resp.data["results"]) == 2 @@ -638,13 +764,17 @@ def test_date_range_search_with_two_ranges( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_for_no_awards) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(request_for_no_awards), ) assert resp.status_code == status.HTTP_200_OK @pytest.mark.django_db -def test_date_range_with_date_signed(client, monkeypatch, elasticsearch_award_index, awards_over_different_date_ranges): +def test_date_range_with_date_signed( + client, 
monkeypatch, elasticsearch_award_index, awards_over_different_date_ranges +): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) contract_type_list = all_award_types_mappings["contracts"] @@ -657,14 +787,20 @@ def test_date_range_with_date_signed(client, monkeypatch, elasticsearch_award_in "page": 1, "filters": { "time_period": [ - {"start_date": "2015-01-01", "end_date": "2015-12-31", "date_type": "date_signed"}, + { + "start_date": "2015-01-01", + "end_date": "2015-12-31", + "date_type": "date_signed", + }, ], "award_type_codes": contract_type_list, }, } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_for_2015) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(request_for_2015), ) assert resp.status_code == status.HTTP_200_OK assert len(resp.data["results"]) == 5 @@ -677,21 +813,29 @@ def test_date_range_with_date_signed(client, monkeypatch, elasticsearch_award_in "page": 1, "filters": { "time_period": [ - {"start_date": "2016-01-01", "end_date": "2016-12-31", "date_type": "date_signed"}, + { + "start_date": "2016-01-01", + "end_date": "2016-12-31", + "date_type": "date_signed", + }, ], "award_type_codes": contract_type_list, }, } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_for_2016) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(request_for_2016), ) assert resp.status_code == status.HTTP_200_OK assert len(resp.data["results"]) == 2 @pytest.mark.django_db -def test_messages_not_nested(client, monkeypatch, elasticsearch_award_index, awards_over_different_date_ranges): +def test_messages_not_nested( + client, monkeypatch, elasticsearch_award_index, awards_over_different_date_ranges +): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) contract_type_list = all_award_types_mappings["contracts"] @@ -704,7 +848,11 @@ def test_messages_not_nested(client, monkeypatch, elasticsearch_award_index, awa "page": 1, "filters": { "time_period": [ - {"start_date": "2015-01-01", "end_date": "2015-12-31", "date_type": "date_signed"}, + { + "start_date": "2015-01-01", + "end_date": "2015-12-31", + "date_type": "date_signed", + }, ], "award_type_codes": contract_type_list, "not_a_real_filter": "abc", @@ -712,7 +860,9 @@ def test_messages_not_nested(client, monkeypatch, elasticsearch_award_index, awa } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_for_2015) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(request_for_2015), ) resp_json = resp.json() @@ -749,7 +899,9 @@ def test_success_with_all_filters(client, monkeypatch, elasticsearch_award_index @pytest.mark.django_db -def test_inclusive_naics_code(client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index): +def test_inclusive_naics_code( + client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index +): """ Verify use of built query_string boolean logic for NAICS code inclusions/exclusions executes as expected on ES """ @@ -763,7 +915,9 @@ def test_inclusive_naics_code(client, monkeypatch, spending_by_award_test_data, "filters": { "award_type_codes": ["A", "B", "C", "D"], "naics_codes": {"require": ["1122"]}, - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + 
], }, "fields": ["Award ID"], "page": 1, @@ -779,7 +933,9 @@ def test_inclusive_naics_code(client, monkeypatch, spending_by_award_test_data, @pytest.mark.django_db -def test_exclusive_naics_code(client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index): +def test_exclusive_naics_code( + client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index +): """ Verify use of built query_string boolean logic for NAICS code inclusions/exclusions executes as expected on ES """ @@ -793,7 +949,9 @@ def test_exclusive_naics_code(client, monkeypatch, spending_by_award_test_data, "filters": { "award_type_codes": ["A", "B", "C", "D"], "naics_codes": {"require": ["999990"]}, - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -809,7 +967,9 @@ def test_exclusive_naics_code(client, monkeypatch, spending_by_award_test_data, @pytest.mark.django_db -def test_mixed_naics_codes(client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index): +def test_mixed_naics_codes( + client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index +): """ Verify use of built query_string boolean logic for NAICS code inclusions/exclusions executes as expected on ES """ @@ -848,8 +1008,13 @@ def test_mixed_naics_codes(client, monkeypatch, spending_by_award_test_data, ela { "filters": { "award_type_codes": ["A", "B", "C", "D"], - "naics_codes": {"require": ["112233", "222233"], "exclude": ["112233"]}, - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "naics_codes": { + "require": ["112233", "222233"], + "exclude": ["112233"], + }, + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -860,15 +1025,27 @@ def test_mixed_naics_codes(client, monkeypatch, spending_by_award_test_data, ela } ), ) - expected_result = [{"internal_id": 5, "Award ID": None, "generated_internal_id": "ASST_NON_TESTING_5"}] + expected_result = [ + { + "internal_id": 5, + "Award ID": None, + "generated_internal_id": "ASST_NON_TESTING_5", + } + ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert resp.json().get("results") == expected_result, "Keyword filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "Keyword filter does not match expected result" + ) @pytest.mark.django_db def test_correct_response_for_each_filter( - client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index, elasticsearch_subaward_index + client, + monkeypatch, + spending_by_award_test_data, + elasticsearch_award_index, + elasticsearch_subaward_index, ): """ Verify the content of the response when using different filters. 
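
The NAICS tests above exercise "require"/"exclude" prefix filters that the API compiles into Elasticsearch query_string boolean logic. As a rough illustration of that kind of translation (my own sketch, not the project's actual implementation), required prefixes become ORed wildcard terms and excluded prefixes are negated:

    def naics_query_string(require: list[str], exclude: list[str]) -> str:
        # Hypothetical helper: build a query_string clause from NAICS prefixes.
        required = " OR ".join(f"{code}*" for code in require)
        excluded = " OR ".join(f"{code}*" for code in exclude)
        query = f"({required})" if required else "*"
        if excluded:
            query += f" AND NOT ({excluded})"
        return query


    # naics_query_string(["1122"], ["112244"]) -> "(1122*) AND NOT (112244*)"
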
@@ -881,6 +1058,7 @@ def test_correct_response_for_each_filter(
         _test_correct_response_for_keywords,
         _test_correct_response_for_time_period,
         _test_correct_response_for_award_type_codes,
+        _test_correct_response_for_award_type_codes_loans,
         _test_correct_response_for_agencies,
         _test_correct_response_for_tas_components,
         _test_correct_response_for_pop_location,
@@ -924,12 +1102,22 @@ def _test_correct_response_for_keywords(client):
         ),
     )
     expected_result = [
-        {"internal_id": 2, "Award ID": "abc222", "generated_internal_id": "CONT_AWD_TESTING_2"},
-        {"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"},
+        {
+            "internal_id": 2,
+            "Award ID": "abc222",
+            "generated_internal_id": "CONT_AWD_TESTING_2",
+        },
+        {
+            "internal_id": 1,
+            "Award ID": "abc111",
+            "generated_internal_id": "CONT_AWD_TESTING_1",
+        },
     ]
     assert resp.status_code == status.HTTP_200_OK
     assert len(resp.json().get("results")) == 2
-    assert resp.json().get("results") == expected_result, "Keyword filter does not match expected result"
+    assert resp.json().get("results") == expected_result, (
+        "Keyword filter does not match expected result"
+    )
 
 
 def _test_correct_response_for_time_period(client):
@@ -940,7 +1128,9 @@ def _test_correct_response_for_time_period(client):
             {
                 "filters": {
                     "award_type_codes": ["A"],
-                    "time_period": [{"start_date": "2014-01-01", "end_date": "2008-12-31"}],
+                    "time_period": [
+                        {"start_date": "2014-01-01", "end_date": "2008-12-31"}
+                    ],
                 },
                 "fields": ["Award ID"],
                 "page": 1,
@@ -951,10 +1141,18 @@ def _test_correct_response_for_time_period(client):
             }
         ),
     )
-    expected_result = [{"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}]
+    expected_result = [
+        {
+            "internal_id": 1,
+            "Award ID": "abc111",
+            "generated_internal_id": "CONT_AWD_TESTING_1",
+        }
+    ]
     assert resp.status_code == status.HTTP_200_OK
     assert len(resp.json().get("results")) == 1
-    assert resp.json().get("results") == expected_result, "Time Period filter does not match expected result"
+    assert resp.json().get("results") == expected_result, (
+        "Time Period filter does not match expected result"
+    )
 
 
 def _test_correct_response_for_award_type_codes(client):
@@ -965,7 +1163,9 @@ def _test_correct_response_for_award_type_codes(client):
             {
                 "filters": {
                     "award_type_codes": ["A", "B", "C", "D"],
-                    "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}],
+                    "time_period": [
+                        {"start_date": "2007-10-01", "end_date": "2020-09-30"}
+                    ],
                 },
                 "fields": ["Award ID"],
                 "page": 1,
@@ -977,17 +1177,82 @@ def _test_correct_response_for_award_type_codes(client):
         ),
     )
     expected_result = [
-        {"internal_id": 999, "Award ID": "award999", "generated_internal_id": "ASST_NON_TESTING_999"},
-        {"internal_id": 998, "Award ID": "award998", "generated_internal_id": "ASST_NON_TESTING_998"},
-        {"internal_id": 997, "Award ID": "award997", "generated_internal_id": "ASST_NON_TESTING_997"},
-        {"internal_id": 5, "Award ID": "abcdef123", "generated_internal_id": "CONT_AWD_TESTING_5"},
-        {"internal_id": 3, "Award ID": "abc333", "generated_internal_id": "CONT_AWD_TESTING_3"},
-        {"internal_id": 2, "Award ID": "abc222", "generated_internal_id": "CONT_AWD_TESTING_2"},
-        {"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"},
+        {
+            "internal_id": 999,
+            "Award ID": "award999",
+            "generated_internal_id": "ASST_NON_TESTING_999",
+        },
+        {
+            "internal_id": 998,
+            "Award ID": "award998",
+            "generated_internal_id": "ASST_NON_TESTING_998",
+        },
+        {
+            "internal_id": 997,
+            "Award ID": "award997",
+            "generated_internal_id": "ASST_NON_TESTING_997",
+        },
+        {
+            "internal_id": 5,
+            "Award ID": "abcdef123",
+            "generated_internal_id": "CONT_AWD_TESTING_5",
+        },
+        {
+            "internal_id": 3,
+            "Award ID": "abc333",
+            "generated_internal_id": "CONT_AWD_TESTING_3",
+        },
+        {
+            "internal_id": 2,
+            "Award ID": "abc222",
+            "generated_internal_id": "CONT_AWD_TESTING_2",
+        },
+        {
+            "internal_id": 1,
+            "Award ID": "abc111",
+            "generated_internal_id": "CONT_AWD_TESTING_1",
+        },
     ]
     assert resp.status_code == status.HTTP_200_OK
     assert len(resp.json().get("results")) == 7
-    assert resp.json().get("results") == expected_result, "Award Type Codes filter does not match expected result"
+    assert resp.json().get("results") == expected_result, (
+        "Award Type Codes filter does not match expected result"
+    )
+
+
+def _test_correct_response_for_award_type_codes_loans(client):
+    resp = client.post(
+        "/api/v2/search/spending_by_award",
+        content_type="application/json",
+        data=json.dumps(
+            {
+                "filters": {
+                    "award_type_codes": ["F003"],
+                    "time_period": [
+                        {"start_date": "2007-10-01", "end_date": "2020-09-30"}
+                    ],
+                },
+                "fields": ["Award ID"],
+                "page": 1,
+                "limit": 60,
+                "sort": "Award ID",
+                "order": "desc",
+                "spending_level": "awards",
+            }
+        ),
+    )
+    expected_result = [
+        {
+            "internal_id": 2026,
+            "Award ID": "award2026",
+            "generated_internal_id": "ASST_NEW_TYPES_2026",
+        }
+    ]
+    assert resp.status_code == status.HTTP_200_OK
+    assert len(resp.json().get("results")) == 1
+    assert resp.json().get("results") == expected_result, (
+        "Award Type Codes filter does not match expected result"
+    )
 
 
 def _test_correct_response_for_agencies(client):
@@ -999,10 +1264,20 @@ def _test_correct_response_for_agencies(client):
                 "filters": {
                     "award_type_codes": ["A", "B", "C", "D"],
                     "agencies": [
-                        {"type": "awarding", "tier": "toptier", "name": "TOPTIER AGENCY 1"},
-                        {"type": "awarding", "tier": "subtier", "name": "SUBTIER AGENCY 1"},
+                        {
+                            "type": "awarding",
+                            "tier": "toptier",
+                            "name": "TOPTIER AGENCY 1",
+                        },
+                        {
+                            "type": "awarding",
+                            "tier": "subtier",
+                            "name": "SUBTIER AGENCY 1",
+                        },
+                    ],
+                    "time_period": [
+                        {"start_date": "2007-10-01", "end_date": "2020-09-30"}
                     ],
-                    "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}],
                 },
                 "fields": ["Award ID"],
                 "page": 1,
@@ -1013,10 +1288,18 @@ def _test_correct_response_for_agencies(client):
             }
         ),
     )
-    expected_result = [{"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}]
+    expected_result = [
+        {
+            "internal_id": 1,
+            "Award ID": "abc111",
+            "generated_internal_id": "CONT_AWD_TESTING_1",
+        }
+    ]
     assert resp.status_code == status.HTTP_200_OK
     assert len(resp.json().get("results")) == 1
-    assert resp.json().get("results") == expected_result, "Agency filter does not match expected result"
+    assert resp.json().get("results") == expected_result, (
+        "Agency filter does not match expected result"
+    )
 
 
 def _test_correct_response_for_tas_components(client):
@@ -1028,7 +1311,9 @@ def _test_correct_response_for_tas_components(client):
                 "filters": {
                     "award_type_codes": ["A", "B", "C", "D"],
                     "tas_codes": [{"aid": "097", "main": "4930"}],
-                    "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}],
+                    "time_period": [
+                        {"start_date": "2007-10-01", "end_date": "2020-09-30"}
+                    ],
                 },
                 "fields": ["Award ID"],
                 "page": 1,
@@ -1040,12 +1325,22 @@ def _test_correct_response_for_tas_components(client):
         ),
     )
     expected_result = [
-        {"internal_id": 5, "Award ID": "abcdef123", "generated_internal_id": "CONT_AWD_TESTING_5"},
-        {"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"},
+        {
+            "internal_id": 5,
+            "Award ID": "abcdef123",
+            "generated_internal_id": "CONT_AWD_TESTING_5",
+        },
+        {
+            "internal_id": 1,
+            "Award ID": "abc111",
+            "generated_internal_id": "CONT_AWD_TESTING_1",
+        },
     ]
     assert resp.status_code == status.HTTP_200_OK
     assert len(resp.json().get("results")) == 2
-    assert resp.json().get("results") == expected_result, "TAS Codes filter does not match expected result"
+    assert resp.json().get("results") == expected_result, (
+        "TAS Codes filter does not match expected result"
+    )
 
 
 def _test_correct_response_for_pop_location(client):
@@ -1056,8 +1351,12 @@ def _test_correct_response_for_pop_location(client):
                 "filters": {
                     "award_type_codes": ["A", "B", "C", "D"],
-                    "place_of_performance_locations": [{"country": "USA", "state": "VA", "county": "014"}],
-                    "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}],
+                    "place_of_performance_locations": [
+                        {"country": "USA", "state": "VA", "county": "014"}
+                    ],
+                    "time_period": [
+                        {"start_date": "2007-10-01", "end_date": "2020-09-30"}
+                    ],
                 },
                 "fields": ["Award ID"],
                 "page": 1,
@@ -1068,10 +1367,18 @@ def _test_correct_response_for_pop_location(client):
             }
         ),
     )
-    expected_result = [{"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}]
+    expected_result = [
+        {
+            "internal_id": 1,
+            "Award ID": "abc111",
+            "generated_internal_id": "CONT_AWD_TESTING_1",
+        }
+    ]
     assert resp.status_code == status.HTTP_200_OK
     assert len(resp.json().get("results")) == 1
-    assert resp.json().get("results") == expected_result, "Place of Performance filter does not match expected result"
+    assert resp.json().get("results") == expected_result, (
+        "Place of Performance filter does not match expected result"
+    )
 
 
 def _test_correct_response_for_recipient_location(client):
@@ -1086,7 +1393,9 @@ def _test_correct_response_for_recipient_location(client):
                         {"country": "USA", "state": "VA", "county": "012"},
                         {"country": "USA", "state": "VA", "city": "Arlington"},
                     ],
-                    "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}],
+                    "time_period": [
+                        {"start_date": "2007-10-01", "end_date": "2020-09-30"}
+                    ],
                 },
                 "fields": ["Award ID"],
                 "page": 1,
@@ -1098,12 +1407,22 @@ def _test_correct_response_for_recipient_location(client):
         ),
     )
     expected_result = [
-        {"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"},
-        {"internal_id": 2, "Award ID": "abc222", "generated_internal_id": "CONT_AWD_TESTING_2"},
+        {
+            "internal_id": 1,
+            "Award ID": "abc111",
+            "generated_internal_id": "CONT_AWD_TESTING_1",
+        },
+        {
+            "internal_id": 2,
+            "Award ID": "abc222",
+            "generated_internal_id": "CONT_AWD_TESTING_2",
+        },
     ]
     assert resp.status_code == status.HTTP_200_OK
     assert len(resp.json().get("results")) == 2
-    assert resp.json().get("results") == expected_result, "Recipient Location filter does not match expected result"
+    assert resp.json().get("results") == expected_result, (
+        "Recipient Location filter does not match expected result"
+    )
 
 
 def _test_correct_response_for_recipient_search_text(client):
@@ -1115,7 +1434,9 @@ def _test_correct_response_for_recipient_search_text(client):
                 "filters": {
                     "award_type_codes": ["02", "03", "04", "05"],
                     "recipient_search_text": ["recipient_name_for_award_1001"],
-                    "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}],
+                    "time_period": [
+                        {"start_date": "2007-10-01", "end_date": "2020-09-30"}
+                    ],
                 },
                 "fields": ["Award ID"],
                 "page": 1,
@@ -1126,10 +1447,18 @@ def _test_correct_response_for_recipient_search_text(client):
             }
         ),
     )
-    expected_result = [{"internal_id": 4, "Award ID": "abc444", "generated_internal_id": "ASST_NON_TESTING_4"}]
+    expected_result = [
+        {
+            "internal_id": 4,
+            "Award ID": "abc444",
+            "generated_internal_id": "ASST_NON_TESTING_4",
+        }
+    ]
     assert resp.status_code == status.HTTP_200_OK
     assert len(resp.json().get("results")) == 1
-    assert resp.json().get("results") == expected_result, "Recipient Search Text filter does not match expected result"
+    assert resp.json().get("results") == expected_result, (
+        "Recipient Search Text filter does not match expected result"
+    )
 
     # Test the results when searching for a recipient name that ends with a period
     # A search for `ACME INC` should include ACME INC, ACME INC. and ACME INC.XYZ
@@ -1141,7 +1470,9 @@ def _test_correct_response_for_recipient_search_text(client):
                 "filters": {
                     "award_type_codes": ["A", "B", "C", "D"],
                     "recipient_search_text": ["ACME INC"],
-                    "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}],
+                    "time_period": [
+                        {"start_date": "2007-10-01", "end_date": "2020-09-30"}
+                    ],
                 },
                 "fields": ["Award ID", "Recipient Name"],
                 "page": 1,
@@ -1175,7 +1506,9 @@ def _test_correct_response_for_recipient_search_text(client):
 
     assert resp.status_code == status.HTTP_200_OK
     assert len(resp.json().get("results")) == len(expected_result)
-    assert resp.json().get("results") == expected_result, "Recipient Search Text filter does not match expected result"
+    assert resp.json().get("results") == expected_result, (
+        "Recipient Search Text filter does not match expected result"
+    )
 
     # A search for `ACME INC.` should include ACME INC. and ACME INC.XYZ but not ACME INC
     resp = client.post(
@@ -1186,7 +1519,9 @@ def _test_correct_response_for_recipient_search_text(client):
                 "filters": {
                     "award_type_codes": ["A", "B", "C", "D"],
                     "recipient_search_text": ["ACME INC."],
-                    "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}],
+                    "time_period": [
+                        {"start_date": "2007-10-01", "end_date": "2020-09-30"}
+                    ],
                 },
                 "fields": ["Award ID", "Recipient Name"],
                 "page": 1,
@@ -1214,7 +1549,9 @@ def _test_correct_response_for_recipient_search_text(client):
 
     assert resp.status_code == status.HTTP_200_OK
     assert len(resp.json().get("results")) == len(expected_result)
-    assert resp.json().get("results") == expected_result, "Recipient Search Text filter does not match expected result"
+    assert resp.json().get("results") == expected_result, (
+        "Recipient Search Text filter does not match expected result"
+    )
 
 
 def _test_correct_response_for_recipient_type_names(client):
@@ -1225,8 +1562,13 @@ def _test_correct_response_for_recipient_type_names(client):
                 "filters": {
                     "award_type_codes": ["A", "B", "C", "D"],
-                    "recipient_type_names": ["business_category_1_3", "business_category_2_8"],
-                    "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}],
+                    "recipient_type_names": [
+                        "business_category_1_3",
+                        "business_category_2_8",
+                    ],
+                    "time_period": [
+                        {"start_date": "2007-10-01", "end_date": "2020-09-30"}
+                    ],
                 },
                 "fields": ["Award ID"],
                 "page": 1,
@@ -1238,12 +1580,22 @@ def _test_correct_response_for_recipient_type_names(client):
         ),
     )
     expected_result = [
-        {"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"},
-        {"internal_id": 3, "Award ID": "abc333", "generated_internal_id": "CONT_AWD_TESTING_3"},
+        {
+            "internal_id": 1,
+            "Award ID": "abc111",
"generated_internal_id": "CONT_AWD_TESTING_1", + }, + { + "internal_id": 3, + "Award ID": "abc333", + "generated_internal_id": "CONT_AWD_TESTING_3", + }, ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 2 - assert resp.json().get("results") == expected_result, "Recipient Type Names filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "Recipient Type Names filter does not match expected result" + ) def _test_correct_response_for_award_amounts(client): @@ -1254,8 +1606,13 @@ def _test_correct_response_for_award_amounts(client): { "filters": { "award_type_codes": ["A", "B", "C", "D"], - "award_amounts": [{"upper_bound": 1000000}, {"lower_bound": 9013, "upper_bound": 9017}], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "award_amounts": [ + {"upper_bound": 1000000}, + {"lower_bound": 9013, "upper_bound": 9017}, + ], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1267,13 +1624,27 @@ def _test_correct_response_for_award_amounts(client): ), ) expected_result = [ - {"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}, - {"internal_id": 2, "Award ID": "abc222", "generated_internal_id": "CONT_AWD_TESTING_2"}, - {"internal_id": 5, "Award ID": "abcdef123", "generated_internal_id": "CONT_AWD_TESTING_5"}, + { + "internal_id": 1, + "Award ID": "abc111", + "generated_internal_id": "CONT_AWD_TESTING_1", + }, + { + "internal_id": 2, + "Award ID": "abc222", + "generated_internal_id": "CONT_AWD_TESTING_2", + }, + { + "internal_id": 5, + "Award ID": "abcdef123", + "generated_internal_id": "CONT_AWD_TESTING_5", + }, ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 3 - assert resp.json().get("results") == expected_result, "Award Amounts filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "Award Amounts filter does not match expected result" + ) def _test_correct_response_for_cfda_program(client): @@ -1285,7 +1656,9 @@ def _test_correct_response_for_cfda_program(client): "filters": { "award_type_codes": ["02", "03", "04", "05"], "program_numbers": ["10.331"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1296,10 +1669,18 @@ def _test_correct_response_for_cfda_program(client): } ), ) - expected_result = [{"internal_id": 4, "Award ID": "abc444", "generated_internal_id": "ASST_NON_TESTING_4"}] + expected_result = [ + { + "internal_id": 4, + "Award ID": "abc444", + "generated_internal_id": "ASST_NON_TESTING_4", + } + ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert resp.json().get("results") == expected_result, "CFDA Program filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "CFDA Program filter does not match expected result" + ) def _test_correct_response_for_cfda_program_subawards(client): @@ -1311,7 +1692,9 @@ def _test_correct_response_for_cfda_program_subawards(client): "filters": { "award_type_codes": ["02", "03", "04", "05"], "program_numbers": ["10.331"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Sub-Award ID"], "page": 1, 
@@ -1332,7 +1715,9 @@ def _test_correct_response_for_cfda_program_subawards(client): ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert resp.json().get("results") == expected_result, "CFDA Program filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "CFDA Program filter does not match expected result" + ) def _test_correct_response_for_naics_codes(client): @@ -1344,7 +1729,9 @@ def _test_correct_response_for_naics_codes(client): "filters": { "award_type_codes": ["A", "B", "C", "D"], "naics_codes": {"require": ["1122"], "exclude": ["112244"]}, - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1355,10 +1742,18 @@ def _test_correct_response_for_naics_codes(client): } ), ) - expected_result = [{"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}] + expected_result = [ + { + "internal_id": 1, + "Award ID": "abc111", + "generated_internal_id": "CONT_AWD_TESTING_1", + } + ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert resp.json().get("results") == expected_result, "NAICS Code filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "NAICS Code filter does not match expected result" + ) def _test_correct_response_for_naics_codes_subawards(client): @@ -1370,7 +1765,9 @@ def _test_correct_response_for_naics_codes_subawards(client): "filters": { "award_type_codes": ["A", "B", "C", "D"], "naics_codes": {"require": ["112233", "112244"]}, - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Sub-Award ID"], "page": 1, @@ -1397,7 +1794,9 @@ def _test_correct_response_for_naics_codes_subawards(client): ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 2 - assert resp.json().get("results") == expected_result, "NAICS Code filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "NAICS Code filter does not match expected result" + ) def _test_correct_response_for_psc_code_list(client): @@ -1409,7 +1808,9 @@ def _test_correct_response_for_psc_code_list(client): "filters": { "award_type_codes": ["A", "B", "C", "D"], "psc_codes": ["PSC1"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1420,10 +1821,18 @@ def _test_correct_response_for_psc_code_list(client): } ), ) - expected_result = [{"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}] + expected_result = [ + { + "internal_id": 1, + "Award ID": "abc111", + "generated_internal_id": "CONT_AWD_TESTING_1", + } + ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert resp.json().get("results") == expected_result, "PSC Code filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "PSC Code filter does not match expected result" + ) def _test_correct_response_for_psc_code_object(client): @@ -1438,7 +1847,9 @@ def _test_correct_response_for_psc_code_object(client): "require": [["Service", "P", "PSC", "PSC1"]], "exclude": [["Service", 
"P", "PSC", "PSC0"]], }, - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1449,10 +1860,18 @@ def _test_correct_response_for_psc_code_object(client): } ), ) - expected_result = [{"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}] + expected_result = [ + { + "internal_id": 1, + "Award ID": "abc111", + "generated_internal_id": "CONT_AWD_TESTING_1", + } + ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert resp.json().get("results") == expected_result, "PSC Code filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "PSC Code filter does not match expected result" + ) def _test_correct_response_for_psc_code_list_subawards(client): @@ -1465,7 +1884,9 @@ def _test_correct_response_for_psc_code_list_subawards(client): "filters": { "award_type_codes": ["A", "B", "C", "D"], "psc_codes": ["PSC2"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Sub-Award ID"], "page": 1, @@ -1486,7 +1907,9 @@ def _test_correct_response_for_psc_code_list_subawards(client): ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert resp.json().get("results") == expected_result, "PSC Code filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "PSC Code filter does not match expected result" + ) def _test_correct_response_for_psc_code_object_subawards(client): @@ -1502,7 +1925,9 @@ def _test_correct_response_for_psc_code_object_subawards(client): "require": [["Service", "P", "PSC", "PSC2"]], "exclude": [["Service", "P", "PSC", "PSC0"]], }, - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Sub-Award ID"], "page": 1, @@ -1523,7 +1948,9 @@ def _test_correct_response_for_psc_code_object_subawards(client): ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert resp.json().get("results") == expected_result, "PSC Code filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "PSC Code filter does not match expected result" + ) def _test_more_sophisticated_eclipsed_psc_code_1(client): @@ -1538,7 +1965,9 @@ def _test_more_sophisticated_eclipsed_psc_code_1(client): "require": [["Service"], ["Service", "P", "PSC"]], "exclude": [["Service", "P"], ["Service", "P", "PSC", "PSC1"]], }, - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1565,7 +1994,9 @@ def _test_more_sophisticated_eclipsed_psc_code_2(client): "require": [["Service", "P"], ["Service", "P", "PSC", "PSC1"]], "exclude": [["Service"], ["Service", "P", "PSC"]], }, - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1589,7 +2020,9 @@ def _test_correct_response_for_contract_pricing_type_codes(client): "filters": { "award_type_codes": ["A", "B", "C", "D"], "contract_pricing_type_codes": 
["contract_pricing_test"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1600,12 +2033,18 @@ def _test_correct_response_for_contract_pricing_type_codes(client): } ), ) - expected_result = [{"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}] + expected_result = [ + { + "internal_id": 1, + "Award ID": "abc111", + "generated_internal_id": "CONT_AWD_TESTING_1", + } + ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert ( - resp.json().get("results") == expected_result - ), "Contract Pricing Type Codes filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "Contract Pricing Type Codes filter does not match expected result" + ) def _test_correct_response_for_set_aside_type_codes(client): @@ -1617,7 +2056,9 @@ def _test_correct_response_for_set_aside_type_codes(client): "filters": { "award_type_codes": ["A", "B", "C", "D"], "set_aside_type_codes": ["type_set_aside_test"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1628,10 +2069,18 @@ def _test_correct_response_for_set_aside_type_codes(client): } ), ) - expected_result = [{"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}] + expected_result = [ + { + "internal_id": 1, + "Award ID": "abc111", + "generated_internal_id": "CONT_AWD_TESTING_1", + } + ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert resp.json().get("results") == expected_result, "Set Aside Type Codes filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "Set Aside Type Codes filter does not match expected result" + ) def _test_correct_response_for_set_extent_competed_type_codes(client): @@ -1643,7 +2092,9 @@ def _test_correct_response_for_set_extent_competed_type_codes(client): "filters": { "award_type_codes": ["A", "B", "C", "D"], "extent_competed_type_codes": ["extent_competed_test"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1654,12 +2105,18 @@ def _test_correct_response_for_set_extent_competed_type_codes(client): } ), ) - expected_result = [{"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}] + expected_result = [ + { + "internal_id": 1, + "Award ID": "abc111", + "generated_internal_id": "CONT_AWD_TESTING_1", + } + ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 1 - assert ( - resp.json().get("results") == expected_result - ), "Extent Competed Type Codes filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "Extent Competed Type Codes filter does not match expected result" + ) def _test_correct_response_for_recipient_id(client): @@ -1671,7 +2128,9 @@ def _test_correct_response_for_recipient_id(client): "filters": { "award_type_codes": ["02", "03", "04", "05"], "recipient_id": "51c7c0ad-a793-de3f-72ba-be5c2895a9ca", - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": 
"2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1682,10 +2141,16 @@ def _test_correct_response_for_recipient_id(client): } ), ) - expected_result = {"internal_id": 4, "Award ID": "abc444", "generated_internal_id": "ASST_NON_TESTING_4"} + expected_result = { + "internal_id": 4, + "Award ID": "abc444", + "generated_internal_id": "ASST_NON_TESTING_4", + } assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 7 - assert resp.json().get("results")[-1] == expected_result, "Recipient ID filter does not match expected result" + assert resp.json().get("results")[-1] == expected_result, ( + "Recipient ID filter does not match expected result" + ) def _test_correct_response_for_def_codes(client): @@ -1697,7 +2162,9 @@ def _test_correct_response_for_def_codes(client): "filters": { "award_type_codes": ["A", "B", "C", "D"], "def_codes": ["L", "Q"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1709,12 +2176,22 @@ def _test_correct_response_for_def_codes(client): ), ) expected_result = [ - {"internal_id": 5, "Award ID": "abcdef123", "generated_internal_id": "CONT_AWD_TESTING_5"}, - {"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}, + { + "internal_id": 5, + "Award ID": "abcdef123", + "generated_internal_id": "CONT_AWD_TESTING_5", + }, + { + "internal_id": 1, + "Award ID": "abc111", + "generated_internal_id": "CONT_AWD_TESTING_1", + }, ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 2 - assert resp.json().get("results") == expected_result, "DEFC filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "DEFC filter does not match expected result" + ) resp = client.post( "/api/v2/search/spending_by_award", @@ -1724,7 +2201,9 @@ def _test_correct_response_for_def_codes(client): "filters": { "award_type_codes": ["A", "B", "C", "D"], "def_codes": ["J"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Award ID"], "page": 1, @@ -1738,7 +2217,9 @@ def _test_correct_response_for_def_codes(client): expected_result = [] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 0 - assert resp.json().get("results") == expected_result, "DEFC filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "DEFC filter does not match expected result" + ) def _test_correct_response_for_def_codes_subaward(client): @@ -1750,7 +2231,9 @@ def _test_correct_response_for_def_codes_subaward(client): "filters": { "award_type_codes": ["A", "B", "C", "D"], "def_codes": ["L"], - "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}], + "time_period": [ + {"start_date": "2007-10-01", "end_date": "2020-09-30"} + ], }, "fields": ["Sub-Award ID"], "page": 1, @@ -1783,7 +2266,9 @@ def _test_correct_response_for_def_codes_subaward(client): ] assert resp.status_code == status.HTTP_200_OK assert len(resp.json().get("results")) == 3 - assert resp.json().get("results") == expected_result, "DEFC subaward filter does not match expected result" + assert resp.json().get("results") == expected_result, ( + "DEFC subaward filter does not match expected result" + ) resp = client.post( "/api/v2/search/spending_by_award", @@ -1793,7 
                 "filters": {
                     "award_type_codes": ["A", "B", "C", "D"],
                     "def_codes": ["J"],
-                    "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}],
+                    "time_period": [
+                        {"start_date": "2007-10-01", "end_date": "2020-09-30"}
+                    ],
                 },
                 "fields": ["Sub-Award ID"],
                 "page": 1,
@@ -1807,7 +2294,9 @@ def _test_correct_response_for_def_codes_subaward(client):
     expected_result = []
     assert resp.status_code == status.HTTP_200_OK
     assert len(resp.json().get("results")) == 0
-    assert resp.json().get("results") == expected_result, "DEFC subaward filter does not match expected result"
+    assert resp.json().get("results") == expected_result, (
+        "DEFC subaward filter does not match expected result"
+    )


 @pytest.mark.django_db
@@ -1815,7 +2304,11 @@ def test_failure_with_invalid_filters(client, monkeypatch, elasticsearch_award_i
     setup_elasticsearch_test(monkeypatch, elasticsearch_award_index)

     # Fails with no request data
-    resp = client.post("/api/v2/search/spending_by_award", content_type="application/json", data=json.dumps({}))
+    resp = client.post(
+        "/api/v2/search/spending_by_award",
+        content_type="application/json",
+        data=json.dumps({}),
+    )
     assert resp.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY
     assert resp.json().get("detail") == "Missing value: 'fields' is a required field"

@@ -1823,10 +2316,21 @@ def test_failure_with_invalid_filters(client, monkeypatch, elasticsearch_award_i
     resp = client.post(
         "/api/v2/search/spending_by_award",
         content_type="application/json",
-        data=json.dumps({"fields": [], "filters": {}, "page": 1, "limit": 60, "spending_level": "awards"}),
+        data=json.dumps(
+            {
+                "fields": [],
+                "filters": {},
+                "page": 1,
+                "limit": 60,
+                "spending_level": "awards",
+            }
+        ),
     )
     assert resp.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY
-    assert resp.json().get("detail") == "Missing value: 'filters|award_type_codes' is a required field"
+    assert (
+        resp.json().get("detail")
+        == "Missing value: 'filters|award_type_codes' is a required field"
+    )

     # fails with empty field
     resp = client.post(
@@ -1836,7 +2340,9 @@ def test_failure_with_invalid_filters(client, monkeypatch, elasticsearch_award_i
             {
                 "fields": [],
                 "filters": {
-                    "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}],
+                    "time_period": [
+                        {"start_date": "2007-10-01", "end_date": "2020-09-30"}
+                    ],
                     "award_type_codes": ["A", "B", "C", "D"],
                 },
                 "page": 1,
@@ -1846,11 +2352,15 @@ def test_failure_with_invalid_filters(client, monkeypatch, elasticsearch_award_i
         ),
     )
     assert resp.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY
-    assert resp.json().get("detail") == "Field 'fields' value '[]' is below min '1' items"
+    assert (
+        resp.json().get("detail") == "Field 'fields' value '[]' is below min '1' items"
+    )


 @pytest.mark.django_db
-def test_search_after(client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index):
+def test_search_after(
+    client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index
+):
     setup_elasticsearch_test(monkeypatch, elasticsearch_award_index)

     resp = client.post(
@@ -1871,20 +2381,48 @@ def test_search_after(client, monkeypatch, spending_by_award_test_data, elastics
         ),
     )
     expected_result = [
-        {"internal_id": 2, "Award ID": "abc222", "generated_internal_id": "CONT_AWD_TESTING_2"},
-        {"internal_id": 3, "Award ID": "abc333", "generated_internal_id": "CONT_AWD_TESTING_3"},
-        {"internal_id": 5, "Award ID": "abcdef123", "generated_internal_id": "CONT_AWD_TESTING_5"},
-        {"internal_id": 997, "Award ID": "award997", "generated_internal_id": "ASST_NON_TESTING_997"},
-        {"internal_id": 998, "Award ID": "award998", "generated_internal_id": "ASST_NON_TESTING_998"},
-        {"internal_id": 999, "Award ID": "award999", "generated_internal_id": "ASST_NON_TESTING_999"},
+        {
+            "internal_id": 2,
+            "Award ID": "abc222",
+            "generated_internal_id": "CONT_AWD_TESTING_2",
+        },
+        {
+            "internal_id": 3,
+            "Award ID": "abc333",
+            "generated_internal_id": "CONT_AWD_TESTING_3",
+        },
+        {
+            "internal_id": 5,
+            "Award ID": "abcdef123",
+            "generated_internal_id": "CONT_AWD_TESTING_5",
+        },
+        {
+            "internal_id": 997,
+            "Award ID": "award997",
+            "generated_internal_id": "ASST_NON_TESTING_997",
+        },
+        {
+            "internal_id": 998,
+            "Award ID": "award998",
+            "generated_internal_id": "ASST_NON_TESTING_998",
+        },
+        {
+            "internal_id": 999,
+            "Award ID": "award999",
+            "generated_internal_id": "ASST_NON_TESTING_999",
+        },
     ]
     assert resp.status_code == status.HTTP_200_OK
     assert len(resp.json().get("results")) == len(expected_result)
-    assert resp.json().get("results") == expected_result, "Award Type Code filter does not match expected result"
+    assert resp.json().get("results") == expected_result, (
+        "Award Type Code filter does not match expected result"
+    )


 @pytest.mark.django_db
-def test_no_0_covid_amounts(client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index):
+def test_no_0_covid_amounts(
+    client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index
+):
     setup_elasticsearch_test(monkeypatch, elasticsearch_award_index)

     resp = client.post(
@@ -1895,7 +2433,9 @@ def test_no_0_covid_amounts(client, monkeypatch, spending_by_award_test_data, el
                 "filters": {
                     "award_type_codes": ["A", "B", "C", "D"],
                     "def_codes": ["L"],
-                    "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}],
+                    "time_period": [
+                        {"start_date": "2007-10-01", "end_date": "2020-09-30"}
+                    ],
                 },
                 "fields": ["Award ID"],
                 "page": 1,
@@ -1906,14 +2446,24 @@ def test_no_0_covid_amounts(client, monkeypatch, spending_by_award_test_data, el
             }
         ),
     )
-    expected_result = [{"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}]
+    expected_result = [
+        {
+            "internal_id": 1,
+            "Award ID": "abc111",
+            "generated_internal_id": "CONT_AWD_TESTING_1",
+        }
+    ]
     assert resp.status_code == status.HTTP_200_OK
     assert len(resp.json().get("results")) == 1
-    assert resp.json().get("results") == expected_result, "DEFC filter does not match expected result"
+    assert resp.json().get("results") == expected_result, (
+        "DEFC filter does not match expected result"
+    )


 @pytest.mark.django_db
-def test_uei_keyword_filter(client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index):
+def test_uei_keyword_filter(
+    client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index
+):
     setup_elasticsearch_test(monkeypatch, elasticsearch_award_index)

     resp = client.post(
@@ -1924,7 +2474,9 @@ def test_uei_keyword_filter(client, monkeypatch, spending_by_award_test_data, el
                 "filters": {
                     "award_type_codes": ["A", "B", "C", "D"],
                     "keywords": ["testuei"],
-                    "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}],
+                    "time_period": [
+                        {"start_date": "2007-10-01", "end_date": "2020-09-30"}
+                    ],
                 },
                 "fields": ["Award ID"],
                 "page": 1,
@@ -1935,14 +2487,24 @@ def test_uei_keyword_filter(client, monkeypatch, spending_by_award_test_data, el
             }
         ),
     )
-    expected_result = [{"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}]
+    expected_result = [
+        {
+            "internal_id": 1,
+            "Award ID": "abc111",
+            "generated_internal_id": "CONT_AWD_TESTING_1",
+        }
+    ]
     assert resp.status_code == status.HTTP_200_OK
     assert len(resp.json().get("results")) == 1
-    assert resp.json().get("results") == expected_result, "UEI filter does not match expected result"
+    assert resp.json().get("results") == expected_result, (
+        "UEI filter does not match expected result"
+    )


 @pytest.mark.django_db
-def test_parent_uei_keyword_filter(client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index):
+def test_parent_uei_keyword_filter(
+    client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index
+):
     setup_elasticsearch_test(monkeypatch, elasticsearch_award_index)

     resp = client.post(
@@ -1953,7 +2515,9 @@ def test_parent_uei_keyword_filter(client, monkeypatch, spending_by_award_test_d
                 "filters": {
                     "award_type_codes": ["A", "B", "C", "D"],
                     "keywords": ["test_parent_uei"],
-                    "time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}],
+                    "time_period": [
+                        {"start_date": "2007-10-01", "end_date": "2020-09-30"}
+                    ],
                 },
                 "fields": ["Award ID"],
                 "page": 1,
@@ -1964,15 +2528,27 @@ def test_parent_uei_keyword_filter(client, monkeypatch, spending_by_award_test_d
             }
         ),
     )
-    expected_result = [{"internal_id": 1, "Award ID": "abc111", "generated_internal_id": "CONT_AWD_TESTING_1"}]
+    expected_result = [
+        {
+            "internal_id": 1,
+            "Award ID": "abc111",
+            "generated_internal_id": "CONT_AWD_TESTING_1",
+        }
+    ]
     assert resp.status_code == status.HTTP_200_OK
     assert len(resp.json().get("results")) == 1
-    assert resp.json().get("results") == expected_result, "UEI filter does not match expected result"
+    assert resp.json().get("results") == expected_result, (
+        "UEI filter does not match expected result"
+    )


 @pytest.mark.django_db
 def test_uei_recipient_filter_subaward(
-    client, monkeypatch, spending_by_award_test_data, elasticsearch_award_index, elasticsearch_subaward_index
+    client,
+    monkeypatch,
+    spending_by_award_test_data,
+    elasticsearch_award_index,
+    elasticsearch_subaward_index,
 ):
     setup_elasticsearch_test(monkeypatch, elasticsearch_award_index)
     setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index)
@@ -1983,7 +2559,9 @@ def test_uei_recipient_filter_subaward(
         data=json.dumps(
             {
                 "filters": {
-                    "time_period": [{"start_date": "2007-10-01", "end_date": "2022-09-30"}],
+                    "time_period": [
+                        {"start_date": "2007-10-01", "end_date": "2022-09-30"}
+                    ],
                     "award_type_codes": [
                         "A",
                         "B",
@@ -2019,12 +2597,18 @@ def test_uei_recipient_filter_subaward(
     ]
     assert resp.status_code == status.HTTP_200_OK
     assert len(resp.json().get("results")) == 1
-    assert resp.json().get("results") == expected_result, "UEI Recipient subaward filter does not match expected result"
+    assert resp.json().get("results") == expected_result, (
+        "UEI Recipient subaward filter does not match expected result"
+    )


 @pytest.mark.django_db
 def test_date_range_with_new_awards_only(
-    client, monkeypatch, elasticsearch_award_index, awards_over_different_date_ranges, elasticsearch_subaward_index
+    client,
+    monkeypatch,
+    elasticsearch_award_index,
+    awards_over_different_date_ranges,
+    elasticsearch_subaward_index,
 ):
     setup_elasticsearch_test(monkeypatch, elasticsearch_award_index)
     setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index)
@@ -2039,14 +2623,20 @@ def test_date_range_with_new_awards_only(
         "page": 1,
         "filters": {
             "time_period": [
-                {"start_date": "2015-01-01", "end_date": "2015-12-31", "date_type": "new_awards_only"},
+                {
+                    "start_date": "2015-01-01",
+                    "end_date": "2015-12-31",
+                    "date_type": "new_awards_only",
+                },
             ],
             "award_type_codes": contract_type_list,
         },
     }

     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_for_2015)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(request_for_2015),
     )
     assert resp.status_code == status.HTTP_200_OK
     assert len(resp.data["results"]) == 5
@@ -2059,25 +2649,35 @@ def test_date_range_with_new_awards_only(
         "page": 1,
         "filters": {
             "time_period": [
-                {"start_date": "2015-01-01", "end_date": "2015-12-31", "date_type": "new_awards_only"},
+                {
+                    "start_date": "2015-01-01",
+                    "end_date": "2015-12-31",
+                    "date_type": "new_awards_only",
+                },
             ],
             "award_type_codes": contract_type_list,
         },
     }

     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_for_2015)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(request_for_2015),
     )
     assert resp.status_code == status.HTTP_400_BAD_REQUEST
     assert (
         resp.json().get("detail")
-        == "Field 'filters|time_period' is outside valid values ['action_date', 'last_modified_date', 'date_signed', 'sub_action_date']"
+        == "Field 'filters|time_period' is outside valid values ['action_date', 'last_modified_date', 'date_signed', 'sub_action_date']"  # noqa: E501
     )


 @pytest.mark.django_db
 def test_spending_by_award_program_activity_subawards(
-    client, monkeypatch, elasticsearch_award_index, spending_by_award_test_data, elasticsearch_subaward_index
+    client,
+    monkeypatch,
+    elasticsearch_award_index,
+    spending_by_award_test_data,
+    elasticsearch_subaward_index,
 ):
     setup_elasticsearch_test(monkeypatch, elasticsearch_award_index)
     setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index)
@@ -2102,11 +2702,15 @@ def test_spending_by_award_program_activity_subawards(
         }
     ]
     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
-    assert expected_response == resp.json().get("results"), "Unexpected or missing content!"
+    assert expected_response == resp.json().get("results"), (
+        "Unexpected or missing content!"
+    )

     test_payload = {
         "spending_level": "subawards",
@@ -2127,11 +2731,15 @@ def test_spending_by_award_program_activity_subawards(
         }
     ]
     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
    )

     assert resp.status_code == status.HTTP_200_OK
-    assert expected_response == resp.json().get("results"), "Unexpected or missing content!"
+    assert expected_response == resp.json().get("results"), (
+        "Unexpected or missing content!"
+    )

     test_payload = {
         "spending_level": "subawards",
@@ -2145,15 +2753,21 @@ def test_spending_by_award_program_activity_subawards(
     }
     expected_response = []
     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
-    assert expected_response == resp.json().get("results"), "Unexpected or missing content!"
+    assert expected_response == resp.json().get("results"), (
+        "Unexpected or missing content!"
+    )


 @pytest.mark.django_db
-def test_spending_by_award_program_activity(client, monkeypatch, elasticsearch_award_index, award_data_fixture):
+def test_spending_by_award_program_activity(
+    client, monkeypatch, elasticsearch_award_index, award_data_fixture
+):
     setup_elasticsearch_test(monkeypatch, elasticsearch_award_index)

     # Program Activites filter test
@@ -2175,11 +2789,15 @@ def test_spending_by_award_program_activity(client, monkeypatch, elasticsearch_a
         }
     ]
     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
-    assert expected_response == resp.json().get("results"), "Unexpected or missing content!"
+    assert expected_response == resp.json().get("results"), (
+        "Unexpected or missing content!"
+    )

     test_payload = {
         "spending_level": "awards",
@@ -2193,11 +2811,15 @@ def test_spending_by_award_program_activity(client, monkeypatch, elasticsearch_a
     }
     expected_response = []
     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
-    assert expected_response == resp.json().get("results"), "Unexpected or missing content!"
+    assert expected_response == resp.json().get("results"), (
+        "Unexpected or missing content!"
+    )

     test_payload = {
         "spending_level": "awards",
@@ -2217,11 +2839,15 @@ def test_spending_by_award_program_activity(client, monkeypatch, elasticsearch_a
         }
     ]
     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
-    assert expected_response == resp.json().get("results"), "Unexpected or missing content!"
+    assert expected_response == resp.json().get("results"), (
+        "Unexpected or missing content!"
+    )

     test_payload = {
         "spending_level": "awards",
@@ -2241,11 +2867,15 @@ def test_spending_by_award_program_activity(client, monkeypatch, elasticsearch_a
         }
     ]
     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
-    assert expected_response == resp.json().get("results"), "Unexpected or missing content!"
+    assert expected_response == resp.json().get("results"), (
+        "Unexpected or missing content!"
+    )


 @pytest.mark.django_db
@@ -2271,10 +2901,16 @@ def test_spending_by_award_subawards_award_id_filter(
             "prime_award_generated_internal_id": "ASST_NON_DECF0000058_8900",
         }
     ]
-    resp = client.post("/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(payload))
+    resp = client.post(
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(payload),
+    )

     assert resp.status_code == status.HTTP_200_OK
-    assert expected_response == resp.json().get("results"), "Unexpected or missing content!"
+    assert expected_response == resp.json().get("results"), (
+        "Unexpected or missing content!"
+    )

     # Test finding a Subaward by it's `award_piid_fain`
     payload = {
@@ -2293,15 +2929,25 @@ def test_spending_by_award_subawards_award_id_filter(
             "prime_award_generated_internal_id": "ASST_NON_DECF0000058_8900",
         }
     ]
-    resp = client.post("/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(payload))
+    resp = client.post(
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(payload),
+    )

     assert resp.status_code == status.HTTP_200_OK
-    assert expected_response == resp.json().get("results"), "Unexpected or missing content!"
+    assert expected_response == resp.json().get("results"), (
+        "Unexpected or missing content!"
+    )


 @pytest.mark.django_db
 def test_spending_by_award_unique_id_award(
-    client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data
+    client,
+    monkeypatch,
+    elasticsearch_award_index,
+    elasticsearch_subaward_index,
+    spending_by_award_test_data,
 ):
     setup_elasticsearch_test(monkeypatch, elasticsearch_award_index)
     setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index)
@@ -2323,11 +2969,15 @@ def test_spending_by_award_unique_id_award(
         },
     ]
     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
-    assert expected_response == resp.json().get("results"), "Unexpected or missing content!"
+    assert expected_response == resp.json().get("results"), (
+        "Unexpected or missing content!"
+    )

     # Test with an undefined award_unique_id
     test_payload = {
@@ -2340,16 +2990,24 @@ def test_spending_by_award_unique_id_award(
     }
     expected_response = []
     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
    )

     assert resp.status_code == status.HTTP_200_OK
-    assert expected_response == resp.json().get("results"), "Unexpected or missing content!"
+    assert expected_response == resp.json().get("results"), (
+        "Unexpected or missing content!"
+    )


 @pytest.mark.django_db
 def test_spending_by_award_unique_id_subaward(
-    client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data
+    client,
+    monkeypatch,
+    elasticsearch_award_index,
+    elasticsearch_subaward_index,
+    spending_by_award_test_data,
 ):
     setup_elasticsearch_test(monkeypatch, elasticsearch_award_index)
     setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index)
@@ -2378,11 +3036,15 @@ def test_spending_by_award_unique_id_subaward(
         },
     ]
     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
-    assert expected_response == resp.json().get("results"), "Unexpected or missing content!"
+    assert expected_response == resp.json().get("results"), (
+        "Unexpected or missing content!"
+    )

     # Test with a single subaward
     test_payload = {
@@ -2402,11 +3064,15 @@ def test_spending_by_award_unique_id_subaward(
         },
     ]
     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
-    assert expected_response == resp.json().get("results"), "Unexpected or missing content!"
+    assert expected_response == resp.json().get("results"), (
+        "Unexpected or missing content!"
+    )

     # Test with no subawards
     test_payload = {
@@ -2419,15 +3085,23 @@ def test_spending_by_award_unique_id_subaward(
     }
     expected_response = []
     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
-    assert expected_response == resp.json().get("results"), "Unexpected or missing content!"
+    assert expected_response == resp.json().get("results"), (
+        "Unexpected or missing content!"
+    )


 def test_spending_by_award_description_specificity(
-    client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data
+    client,
+    monkeypatch,
+    elasticsearch_award_index,
+    elasticsearch_subaward_index,
+    spending_by_award_test_data,
 ):
     setup_elasticsearch_test(monkeypatch, elasticsearch_award_index)
     setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index)
@@ -2436,7 +3110,10 @@ def test_spending_by_award_description_specificity(
     test_payload = {
         "spending_level": "awards",
         "fields": ["Award ID"],
-        "filters": {"award_type_codes": ["A", "B", "C", "D"], "description": "the test"},
+        "filters": {
+            "award_type_codes": ["A", "B", "C", "D"],
+            "description": "the test",
+        },
     }
     expected_response = [
         {
@@ -2446,17 +3123,24 @@ def test_spending_by_award_description_specificity(
         },
     ]
     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
-    assert expected_response == resp.json().get("results"), "Unexpected or missing content!"
+    assert expected_response == resp.json().get("results"), (
+        "Unexpected or missing content!"
+    )

     # get subaward with description "the test test test" and not "the description for test"
     test_payload = {
         "spending_level": "subawards",
         "fields": ["Sub-Award ID"],
-        "filters": {"award_type_codes": ["A", "B", "C", "D"], "description": "the test"},
+        "filters": {
+            "award_type_codes": ["A", "B", "C", "D"],
+            "description": "the test",
+        },
     }
     expected_response = [
         {
@@ -2467,29 +3151,44 @@ def test_spending_by_award_description_specificity(
         },
     ]
     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
-    assert expected_response == resp.json().get("results"), "Unexpected or missing content!"
+    assert expected_response == resp.json().get("results"), (
+        "Unexpected or missing content!"
+    )

     # ensure only queries for text in the correct order
     test_payload = {
         "spending_level": "subawards",
         "fields": ["Sub-Award ID"],
-        "filters": {"award_type_codes": ["A", "B", "C", "D"], "description": "test the"},
+        "filters": {
+            "award_type_codes": ["A", "B", "C", "D"],
+            "description": "test the",
+        },
     }
     expected_response = []
     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
-    assert expected_response == resp.json().get("results"), "Unexpected or missing content!"
+    assert expected_response == resp.json().get("results"), (
+        "Unexpected or missing content!"
+    )


 def test_spending_by_award_keyword_specificity(
-    client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data
+    client,
+    monkeypatch,
+    elasticsearch_award_index,
+    elasticsearch_subaward_index,
+    spending_by_award_test_data,
 ):
     setup_elasticsearch_test(monkeypatch, elasticsearch_award_index)
     setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index)
@@ -2508,11 +3207,15 @@ def test_spending_by_award_keyword_specificity(
         },
     ]
     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
-    assert expected_response == resp.json().get("results"), "Unexpected or missing content!"
+    assert expected_response == resp.json().get("results"), (
+        "Unexpected or missing content!"
+    )

     # get subaward with product_or_service_description "the test test test" and not
     # "the description for test"
@@ -2530,11 +3233,15 @@ def test_spending_by_award_keyword_specificity(
         },
     ]
     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
-    assert expected_response == resp.json().get("results"), "Unexpected or missing content!"
+    assert expected_response == resp.json().get("results"), (
+        "Unexpected or missing content!"
+    )

     # ensure only queries for text in the correct order
     test_payload = {
@@ -2544,15 +3251,23 @@ def test_spending_by_award_keyword_specificity(
     }
     expected_response = []
     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
-    assert expected_response == resp.json().get("results"), "Unexpected or missing content!"
+    assert expected_response == resp.json().get("results"), (
+        "Unexpected or missing content!"
+    )


 def test_spending_by_award_new_subcontract_fields(
-    client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data
+    client,
+    monkeypatch,
+    elasticsearch_award_index,
+    elasticsearch_subaward_index,
+    spending_by_award_test_data,
 ):
     setup_elasticsearch_test(monkeypatch, elasticsearch_award_index)
     setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index)
@@ -2614,15 +3329,23 @@ def test_spending_by_award_new_subcontract_fields(
         },
     ]
     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
-    assert expected_response == resp.json().get("results"), "Unexpected or missing content!"
+    assert expected_response == resp.json().get("results"), (
+        "Unexpected or missing content!"
+    )


 def test_spending_by_award_new_subgrant_fields(
-    client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data
+    client,
+    monkeypatch,
+    elasticsearch_award_index,
+    elasticsearch_subaward_index,
+    spending_by_award_test_data,
 ):
     setup_elasticsearch_test(monkeypatch, elasticsearch_award_index)
     setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index)
@@ -2677,27 +3400,45 @@ def test_spending_by_award_new_subgrant_fields(
                 "zip5": "55455",
             },
             "Prime Award Recipient UEI": "uei 1",
-            "Assistance Listing": {"cfda_number": "1.234", "cfda_program_title": "test cfda"},
+            "Assistance Listing": {
+                "cfda_number": "1.234",
+                "cfda_program_title": "test cfda",
+            },
             "sub_award_recipient_id": "EXAM-PLE-ID-P",
         },
     ]
     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
-    assert expected_response == resp.json().get("results"), "Unexpected or missing content!"
+    assert expected_response == resp.json().get("results"), (
+        "Unexpected or missing content!"
+    )


 def test_spending_by_award_new_contract_fields(
-    client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data
+    client,
+    monkeypatch,
+    elasticsearch_award_index,
+    elasticsearch_subaward_index,
+    spending_by_award_test_data,
 ):
     setup_elasticsearch_test(monkeypatch, elasticsearch_award_index)

     # get award with naics_description "the test test test" and not "the description for test"
     test_payload = {
         "spending_level": "awards",
-        "fields": ["Award ID", "Recipient UEI", "Recipient Location", "Primary Place of Performance", "NAICS", "PSC"],
+        "fields": [
+            "Award ID",
+            "Recipient UEI",
+            "Recipient Location",
+            "Primary Place of Performance",
+            "NAICS",
+            "PSC",
+        ],
         "filters": {"award_type_codes": ["A", "B", "C", "D"], "keyword": "the test"},
     }
     expected_response = [
@@ -2740,15 +3481,23 @@ def test_spending_by_award_new_contract_fields(
         },
     ]
     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
-    assert expected_response == resp.json().get("results"), "Unexpected or missing content!"
+    assert expected_response == resp.json().get("results"), (
+        "Unexpected or missing content!"
+    )


 def test_spending_by_award_new_assistance_fields(
-    client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data
+    client,
+    monkeypatch,
+    elasticsearch_award_index,
+    elasticsearch_subaward_index,
+    spending_by_award_test_data,
 ):
     setup_elasticsearch_test(monkeypatch, elasticsearch_award_index)

@@ -2801,7 +3550,10 @@ def test_spending_by_award_new_assistance_fields(
                 "zip5": "55455",
             },
             "Assistance Listings": [
-                {"cfda_number": "64.114", "cfda_program_title": "VETERANS HOUSING GUARANTEED AND INSURED LOANS"}
+                {
+                    "cfda_number": "64.114",
+                    "cfda_program_title": "VETERANS HOUSING GUARANTEED AND INSURED LOANS",
+                }
             ],
             "primary_assistance_listing": {
                 "cfda_number": "64.114",
@@ -2810,17 +3562,24 @@ def test_spending_by_award_new_assistance_fields(
         },
     ]
     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
-    assert expected_response == resp.json().get("results"), "Unexpected or missing content!"
+    assert expected_response == resp.json().get("results"), (
+        "Unexpected or missing content!"
+    )


 def test_spending_by_award_sort_recipient_location(
-    client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data
+    client,
+    monkeypatch,
+    elasticsearch_award_index,
+    elasticsearch_subaward_index,
+    spending_by_award_test_data,
 ):
-
     setup_elasticsearch_test(monkeypatch, elasticsearch_award_index)

     test_payload = {
@@ -2961,7 +3720,9 @@ def test_spending_by_award_sort_recipient_location(
     }

     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
@@ -2987,7 +3748,9 @@ def test_spending_by_award_sort_recipient_location(
     }

     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
@@ -3003,9 +3766,12 @@ def test_spending_by_award_sort_recipient_location(


 def test_spending_by_primary_place_of_performance(
-    client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data
+    client,
+    monkeypatch,
+    elasticsearch_award_index,
+    elasticsearch_subaward_index,
+    spending_by_award_test_data,
 ):
-
     setup_elasticsearch_test(monkeypatch, elasticsearch_award_index)

     test_payload = {
@@ -3098,7 +3864,9 @@ def test_spending_by_primary_place_of_performance(
     }

     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
@@ -3113,9 +3881,12 @@ def test_spending_by_primary_place_of_performance(


 def test_spending_by_award_sort_naics(
-    client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data
+    client,
+    monkeypatch,
+    elasticsearch_award_index,
+    elasticsearch_subaward_index,
+    spending_by_award_test_data,
 ):
-
     setup_elasticsearch_test(monkeypatch, elasticsearch_award_index)

     test_payload = {
@@ -3130,7 +3901,9 @@ def test_spending_by_award_sort_naics(
     }

     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     naics_1 = {"code": "123456", "description": "1"}
@@ -3158,7 +3931,9 @@ def test_spending_by_award_sort_naics(
     }

     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
@@ -3170,9 +3945,12 @@ def test_spending_by_award_sort_naics(


 def test_spending_by_award_sort_psc(
-    client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data
+    client,
+    monkeypatch,
+    elasticsearch_award_index,
+    elasticsearch_subaward_index,
+    spending_by_award_test_data,
 ):
-
     setup_elasticsearch_test(monkeypatch, elasticsearch_award_index)

     test_payload = {
@@ -3187,7 +3965,9 @@ def test_spending_by_award_sort_psc(
     }

     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     psc1 = {"code": "PSC1", "description": "PSC description 1"}
@@ -3215,7 +3995,9 @@ def test_spending_by_award_sort_psc(
     }

     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
@@ -3227,9 +4009,12 @@ def test_spending_by_award_sort_psc(


 def test_spending_by_award_assistance_listings(
-    client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data
+    client,
+    monkeypatch,
+    elasticsearch_award_index,
+    elasticsearch_subaward_index,
+    spending_by_award_test_data,
 ):
-
     setup_elasticsearch_test(monkeypatch, elasticsearch_award_index)

     test_payload = {
@@ -3244,7 +4029,9 @@ def test_spending_by_award_assistance_listings(
     }

     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assisance_listing1 = [{"cfda_number": "12", "cfda_program_title": "program1"}]
@@ -3272,7 +4059,9 @@ def test_spending_by_award_assistance_listings(
     }

     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
@@ -3284,7 +4073,11 @@ def test_spending_by_award_assistance_listings(


 def test_spending_by_award_sort_sub_recipient_locations(
-    client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data
+    client,
+    monkeypatch,
+    elasticsearch_award_index,
+    elasticsearch_subaward_index,
+    spending_by_award_test_data,
 ):
     setup_elasticsearch_test(monkeypatch, elasticsearch_award_index)
     setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index)
@@ -3300,7 +4093,9 @@ def test_spending_by_award_sort_sub_recipient_locations(
     }

     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
@@ -3309,7 +4104,10 @@ def test_spending_by_award_sort_sub_recipient_locations(
     assert results[0]["Sub-Recipient Location"]["city_name"] == "ARLINGTON"
     assert results[0]["Sub-Recipient Location"]["address_line1"] == "1 Memorial Drive"
     assert results[1]["Sub-Recipient Location"]["city_name"] == "ARLINGTON"
-    assert results[1]["Sub-Recipient Location"]["address_line1"] == "600 CALIFORNIA STREET FL 18"
+    assert (
+        results[1]["Sub-Recipient Location"]["address_line1"]
+        == "600 CALIFORNIA STREET FL 18"
+    )
     assert results[2]["Sub-Recipient Location"]["city_name"] == "SAN FRANCISCO"
     assert results[3]["Sub-Recipient Location"]["state_code"] == "CA"
     assert results[3]["Sub-Recipient Location"]["city_name"] is None
@@ -3332,7 +4130,9 @@ def test_spending_by_award_sort_sub_recipient_locations(
     }

     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
@@ -3340,7 +4140,10 @@ def test_spending_by_award_sort_sub_recipient_locations(
     assert len(results) == 7
     assert results[0]["Sub-Recipient Location"]["city_name"] == "SAN FRANCISCO"
     assert results[1]["Sub-Recipient Location"]["city_name"] == "ARLINGTON"
-    assert results[1]["Sub-Recipient Location"]["address_line1"] == "600 CALIFORNIA STREET FL 18"
+    assert (
+        results[1]["Sub-Recipient Location"]["address_line1"]
+        == "600 CALIFORNIA STREET FL 18"
+    )
     assert results[2]["Sub-Recipient Location"]["city_name"] == "ARLINGTON"
     assert results[2]["Sub-Recipient Location"]["address_line1"] == "1 Memorial Drive"
     assert results[3]["Sub-Recipient Location"]["state_code"] == "NE"
@@ -3350,7 +4153,11 @@ def test_spending_by_award_sort_sub_recipient_locations(


 def test_spending_by_award_sort_sub_pop_location(
-    client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data
+    client,
+    monkeypatch,
+    elasticsearch_award_index,
+    elasticsearch_subaward_index,
+    spending_by_award_test_data,
 ):
     setup_elasticsearch_test(monkeypatch, elasticsearch_award_index)
     setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index)
@@ -3367,23 +4174,37 @@ def test_spending_by_award_sort_sub_pop_location(
     }

     resp = client.post(
-        "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload)
+        "/api/v2/search/spending_by_award/",
+        content_type="application/json",
+        data=json.dumps(test_payload),
     )

     assert resp.status_code == status.HTTP_200_OK
     results = resp.json().get("results")
     assert len(results) == 7
-    assert results[0]["Sub-Award Primary Place of Performance"]["city_name"] == "ARLINGTON"
-    assert results[1]["Sub-Award Primary Place of Performance"]["city_name"] == "ARLINGTON"
-    assert results[2]["Sub-Award Primary Place of Performance"]["city_name"] == "LOS ANGELES"
+    assert (
+        results[0]["Sub-Award Primary Place of Performance"]["city_name"] == "ARLINGTON"
+    )
+    assert (
+        results[1]["Sub-Award Primary Place of Performance"]["city_name"] == "ARLINGTON"
+    )
+    assert (
+        results[2]["Sub-Award Primary Place of Performance"]["city_name"]
+        == "LOS ANGELES"
"LOS ANGELES" + ) assert results[3]["Sub-Award Primary Place of Performance"]["city_name"] is None assert results[3]["Sub-Award Primary Place of Performance"]["state_code"] == "IL" assert results[4]["Sub-Award Primary Place of Performance"]["city_name"] is None assert results[4]["Sub-Award Primary Place of Performance"]["state_code"] == "VA" assert results[5]["Sub-Award Primary Place of Performance"]["state_code"] is None - assert results[5]["Sub-Award Primary Place of Performance"]["country_name"] == "LAOS" + assert ( + results[5]["Sub-Award Primary Place of Performance"]["country_name"] == "LAOS" + ) assert results[6]["Sub-Award Primary Place of Performance"]["state_code"] is None - assert results[6]["Sub-Award Primary Place of Performance"]["country_name"] == "UNITED STATES" + assert ( + results[6]["Sub-Award Primary Place of Performance"]["country_name"] + == "UNITED STATES" + ) test_payload = { "spending_level": "subawards", @@ -3397,23 +4218,41 @@ def test_spending_by_award_sort_sub_pop_location( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK results = resp.json().get("results") assert len(results) == 7 - assert results[0]["Sub-Award Primary Place of Performance"]["city_name"] == "LOS ANGELES" - assert results[1]["Sub-Award Primary Place of Performance"]["city_name"] == "ARLINGTON" - assert results[2]["Sub-Award Primary Place of Performance"]["city_name"] == "ARLINGTON" + assert ( + results[0]["Sub-Award Primary Place of Performance"]["city_name"] + == "LOS ANGELES" + ) + assert ( + results[1]["Sub-Award Primary Place of Performance"]["city_name"] == "ARLINGTON" + ) + assert ( + results[2]["Sub-Award Primary Place of Performance"]["city_name"] == "ARLINGTON" + ) assert results[3]["Sub-Award Primary Place of Performance"]["state_code"] == "VA" assert results[4]["Sub-Award Primary Place of Performance"]["state_code"] == "IL" - assert results[5]["Sub-Award Primary Place of Performance"]["country_name"] == "UNITED STATES" - assert results[6]["Sub-Award Primary Place of Performance"]["country_name"] == "LAOS" + assert ( + results[5]["Sub-Award Primary Place of Performance"]["country_name"] + == "UNITED STATES" + ) + assert ( + results[6]["Sub-Award Primary Place of Performance"]["country_name"] == "LAOS" + ) def test_spending_by_award_sort_sub_assistance_listing( - client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data + client, + monkeypatch, + elasticsearch_award_index, + elasticsearch_subaward_index, + spending_by_award_test_data, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index) @@ -3430,7 +4269,9 @@ def test_spending_by_award_sort_sub_assistance_listing( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3457,7 +4298,9 @@ def test_spending_by_award_sort_sub_assistance_listing( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + 
data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3472,7 +4315,11 @@ def test_spending_by_award_sort_sub_assistance_listing( def test_spending_by_award_sort_sub_naics( - client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data + client, + monkeypatch, + elasticsearch_award_index, + elasticsearch_subaward_index, + spending_by_award_test_data, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index) @@ -3489,7 +4336,9 @@ def test_spending_by_award_sort_sub_naics( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3514,7 +4363,9 @@ def test_spending_by_award_sort_sub_naics( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3529,7 +4380,11 @@ def test_spending_by_award_sort_sub_naics( def test_spending_by_award_sort_sub_psc( - client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data + client, + monkeypatch, + elasticsearch_award_index, + elasticsearch_subaward_index, + spending_by_award_test_data, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index) @@ -3546,7 +4401,9 @@ def test_spending_by_award_sort_sub_psc( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3571,7 +4428,9 @@ def test_spending_by_award_sort_sub_psc( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3586,9 +4445,12 @@ def test_spending_by_award_sort_sub_psc( def test_spending_by_subaward_new_sort_fields( - client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data + client, + monkeypatch, + elasticsearch_award_index, + elasticsearch_subaward_index, + spending_by_award_test_data, ): - setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index) @@ -3604,7 +4466,9 @@ def test_spending_by_subaward_new_sort_fields( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3625,7 +4489,9 @@ def test_spending_by_subaward_new_sort_fields( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == 
status.HTTP_200_OK @@ -3646,7 +4512,9 @@ def test_spending_by_subaward_new_sort_fields( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3667,7 +4535,9 @@ def test_spending_by_subaward_new_sort_fields( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3688,7 +4558,9 @@ def test_spending_by_subaward_new_sort_fields( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3709,7 +4581,9 @@ def test_spending_by_subaward_new_sort_fields( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3730,7 +4604,9 @@ def test_spending_by_subaward_new_sort_fields( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3741,7 +4617,9 @@ def test_spending_by_subaward_new_sort_fields( @pytest.mark.django_db -def test_covid_and_iija_values(client, monkeypatch, elasticsearch_award_index, award_data_fixture): +def test_covid_and_iija_values( + client, monkeypatch, elasticsearch_award_index, award_data_fixture +): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) request_body = { "spending_level": "awards", @@ -3763,7 +4641,9 @@ def test_covid_and_iija_values(client, monkeypatch, elasticsearch_award_index, a } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_body) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(request_body), ) expected_result = [ { @@ -3799,7 +4679,9 @@ def test_covid_and_iija_values(client, monkeypatch, elasticsearch_award_index, a } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(request_body) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(request_body), ) expected_result = [ { @@ -3833,7 +4715,9 @@ def test_spending_by_subaward_place_of_perf_zip_filter( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3850,13 +4734,18 @@ def test_spending_by_subaward_recipient_location_zip_filter( test_payload = { "spending_level": "subawards", "fields": ["Sub-Award ID"], - "filters": {"award_type_codes": ["07", "08"], "recipient_locations": [{"country": "USA", "zip": "12345"}]}, + "filters": { + "award_type_codes": 
["07", "08"], + "recipient_locations": [{"country": "USA", "zip": "12345"}], + }, "sort": "Sub-Award ID", "order": "desc", } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK @@ -3866,7 +4755,11 @@ def test_spending_by_subaward_recipient_location_zip_filter( def test_spending_by_award_sort_contract_award_type( - client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data + client, + monkeypatch, + elasticsearch_award_index, + elasticsearch_subaward_index, + spending_by_award_test_data, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) test_payload = { @@ -3881,13 +4774,19 @@ def test_spending_by_award_sort_contract_award_type( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK def test_spending_by_award_sort_recipient_uei( - client, monkeypatch, elasticsearch_award_index, elasticsearch_subaward_index, spending_by_award_test_data + client, + monkeypatch, + elasticsearch_award_index, + elasticsearch_subaward_index, + spending_by_award_test_data, ): setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) test_payload = { @@ -3902,6 +4801,8 @@ def test_spending_by_award_sort_recipient_uei( } resp = client.post( - "/api/v2/search/spending_by_award/", content_type="application/json", data=json.dumps(test_payload) + "/api/v2/search/spending_by_award/", + content_type="application/json", + data=json.dumps(test_payload), ) assert resp.status_code == status.HTTP_200_OK diff --git a/usaspending_api/settings.py b/usaspending_api/settings.py index 52a231f915..59a4733d6c 100644 --- a/usaspending_api/settings.py +++ b/usaspending_api/settings.py @@ -91,6 +91,15 @@ BROKER_AGENCY_BUCKET_NAME = "" UNLINKED_AWARDS_DOWNLOAD_REDIRECT_DIR = "unlinked_awards_downloads" +# AWS parameter store key names +EMR_DOWNLOAD_APP_PARAM_NAME = "" +if not EMR_DOWNLOAD_APP_PARAM_NAME: + EMR_DOWNLOAD_APP_PARAM_NAME = os.environ.get("EMR_DOWNLOAD_APP_PARAM_NAME") + +EMR_DOWNLOAD_ROLE_PARAM_NAME = "" +if not EMR_DOWNLOAD_ROLE_PARAM_NAME: + EMR_DOWNLOAD_ROLE_PARAM_NAME = os.environ.get("EMR_DOWNLOAD_ROLE_PARAM_NAME") + # This list contains any abnormal characters in agency names # This list is important to track which characters we need to replace in # the agency name before the name can be used in a file name diff --git a/usaspending_api/tests/conftest_spark.py b/usaspending_api/tests/conftest_spark.py index 587c877b63..60e2a87a9e 100644 --- a/usaspending_api/tests/conftest_spark.py +++ b/usaspending_api/tests/conftest_spark.py @@ -11,6 +11,7 @@ from django.db import connections from model_bakery import baker from psycopg2.extensions import AsIs + from usaspending_api import settings from usaspending_api.common.etl.spark import create_ref_temp_views from usaspending_api.common.helpers.spark_helpers import ( @@ -21,7 +22,10 @@ from usaspending_api.common.spark.configs import LOCAL_BASIC_EXTRA_CONF from usaspending_api.config import CONFIG from usaspending_api.etl.award_helpers import update_awards -from usaspending_api.etl.management.commands.create_delta_table import LOAD_QUERY_TABLE_SPEC, LOAD_TABLE_TABLE_SPEC 
+from usaspending_api.etl.management.commands.create_delta_table import ( + LOAD_QUERY_TABLE_SPEC, + LOAD_TABLE_TABLE_SPEC, +) if TYPE_CHECKING: from pyspark.sql import SparkSession @@ -59,7 +63,7 @@ def s3_unittest_data_bucket_setup_and_teardown(worker_id: str) -> str: unittest_data_bucket = "unittest-data-{}".format(worker_prefix + str(uuid.uuid4())) logging.warning( - f"Attempting to create unit test data bucket {unittest_data_bucket } " + f"Attempting to create unit test data bucket {unittest_data_bucket} " f"at: http://{CONFIG.AWS_S3_ENDPOINT} using CONFIG.AWS_ACCESS_KEY and CONFIG.AWS_SECRET_KEY" ) s3_client = boto3.client( @@ -140,11 +144,13 @@ def spark(tmp_path_factory) -> Generator["SparkSession", None, None]: # So as not to have interfering schemas and tables in the metastore_db from individual test run to run, # another test-scoped fixture should be created, pulling this in, and blowing away all schemas and tables as part # of each run - spark_sql_warehouse_dir = str(tmp_path_factory.mktemp(basename="spark-warehouse", numbered=False)) + spark_sql_warehouse_dir = str( + tmp_path_factory.mktemp(basename="spark-warehouse", numbered=False) + ) extra_conf = { **LOCAL_BASIC_EXTRA_CONF, "spark.sql.warehouse.dir": spark_sql_warehouse_dir, - "spark.hadoop.javax.jdo.option.ConnectionURL": f"jdbc:derby:;databaseName={spark_sql_warehouse_dir}/metastore_db;create=true", + "spark.hadoop.javax.jdo.option.ConnectionURL": f"jdbc:derby:;databaseName={spark_sql_warehouse_dir}/metastore_db;create=true", # noqa: E501 } spark = configure_spark_session( app_name="Unit Test Session", @@ -224,16 +230,36 @@ def populate_broker_data(broker_server_dblink_setup): USAspending test DB and broker test DB """ broker_data = { - "sam_recipient": json.loads(Path("usaspending_api/recipient/tests/data/broker_sam_recipient.json").read_text()), - "subaward": json.loads(Path("usaspending_api/awards/tests/data/subaward.json").read_text()), + "sam_recipient": json.loads( + Path( + "usaspending_api/recipient/tests/data/broker_sam_recipient.json" + ).read_text() + ), + "subaward": json.loads( + Path("usaspending_api/awards/tests/data/subaward.json").read_text() + ), "cd_state_grouped": json.loads( - Path("usaspending_api/transactions/tests/data/cd_state_grouped.json").read_text() + Path( + "usaspending_api/transactions/tests/data/cd_state_grouped.json" + ).read_text() + ), + "zips": json.loads( + Path("usaspending_api/transactions/tests/data/zips.json").read_text() + ), + "cd_zips_grouped": json.loads( + Path( + "usaspending_api/transactions/tests/data/cd_zips_grouped.json" + ).read_text() + ), + "cd_city_grouped": json.loads( + Path( + "usaspending_api/transactions/tests/data/cd_city_grouped.json" + ).read_text() ), - "zips": json.loads(Path("usaspending_api/transactions/tests/data/zips.json").read_text()), - "cd_zips_grouped": json.loads(Path("usaspending_api/transactions/tests/data/cd_zips_grouped.json").read_text()), - "cd_city_grouped": json.loads(Path("usaspending_api/transactions/tests/data/cd_city_grouped.json").read_text()), "cd_county_grouped": json.loads( - Path("usaspending_api/transactions/tests/data/cd_county_grouped.json").read_text() + Path( + "usaspending_api/transactions/tests/data/cd_county_grouped.json" + ).read_text() ), } insert_statement = "INSERT INTO %(table_name)s (%(columns)s) VALUES %(values)s" @@ -244,7 +270,11 @@ def populate_broker_data(broker_server_dblink_setup): values = [str(tuple(r.values())).replace("None", "null") for r in rows] sql_string = cursor.mogrify( insert_statement, - 
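The spark fixture reflow above also shows the isolation technique it depends on: the SQL warehouse and the embedded Derby metastore both live in a session-scoped tmp directory, so schemas and tables from one test session can never leak into another. A minimal standalone sketch of that setup, assuming a local pyspark install (paths and app name illustrative):

import tempfile

from pyspark.sql import SparkSession

warehouse_dir = tempfile.mkdtemp(prefix="spark-warehouse-")
spark = (
    SparkSession.builder.appName("Unit Test Session")
    # every test session gets its own warehouse directory ...
    .config("spark.sql.warehouse.dir", warehouse_dir)
    # ... and its own embedded Derby metastore database inside it
    .config(
        "spark.hadoop.javax.jdo.option.ConnectionURL",
        f"jdbc:derby:;databaseName={warehouse_dir}/metastore_db;create=true",
    )
    .getOrCreate()
)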
{"table_name": AsIs(table_name), "columns": AsIs(",".join(columns)), "values": AsIs(",".join(values))}, + { + "table_name": AsIs(table_name), + "columns": AsIs(",".join(columns)), + "values": AsIs(",".join(values)), + }, ) cursor.execute(sql_string) yield @@ -330,10 +360,16 @@ def _build_usas_data_for_spark(): # Create agency data funding_toptier_agency = baker.make( - "references.ToptierAgency", name="TEST AGENCY 1", abbreviation="TA1", _fill_optional=True + "references.ToptierAgency", + name="TEST AGENCY 1", + abbreviation="TA1", + _fill_optional=True, ) funding_subtier_agency = baker.make( - "references.SubtierAgency", name="TEST SUBTIER 1", abbreviation="SA1", _fill_optional=True + "references.SubtierAgency", + name="TEST SUBTIER 1", + abbreviation="SA1", + _fill_optional=True, ) funding_agency = baker.make( "references.Agency", @@ -343,8 +379,18 @@ def _build_usas_data_for_spark(): _fill_optional=True, ) - toptier = baker.make("references.ToptierAgency", name="toptier", abbreviation="tt", _fill_optional=True) - subtier = baker.make("references.SubtierAgency", name="subtier", abbreviation="st", _fill_optional=True) + toptier = baker.make( + "references.ToptierAgency", + name="toptier", + abbreviation="tt", + _fill_optional=True, + ) + subtier = baker.make( + "references.SubtierAgency", + name="subtier", + abbreviation="st", + _fill_optional=True, + ) agency = baker.make( "references.Agency", toptier_agency=toptier, @@ -355,10 +401,17 @@ def _build_usas_data_for_spark(): ) awarding_toptier_agency = baker.make( - "references.ToptierAgency", name="TEST AGENCY 2", abbreviation="TA2", _fill_optional=True + "references.ToptierAgency", + name="TEST AGENCY 2", + abbreviation="TA2", + _fill_optional=True, ) awarding_subtier_agency = baker.make( - "references.SubtierAgency", name="TEST SUBTIER 2", abbreviation="SA2", subtier_code="789", _fill_optional=True + "references.SubtierAgency", + name="TEST SUBTIER 2", + abbreviation="SA2", + subtier_code="789", + _fill_optional=True, ) awarding_agency = baker.make( "references.Agency", @@ -379,14 +432,57 @@ def _build_usas_data_for_spark(): county_name="County Name", _fill_optional=True, ) - baker.make("references.RefCountryCode", country_code="USA", country_name="UNITED STATES", _fill_optional=True) - baker.make("recipient.StateData", code="VA", name="Virginia", fips="51", _fill_optional=True) - baker.make("references.PopCounty", state_code="51", county_number="000", latest_population=1, _fill_optional=True) - baker.make("references.PopCounty", state_code="51", county_number="001", latest_population=1, _fill_optional=True) - baker.make("references.PopCongressionalDistrict", state_code="51", latest_population=1, congressional_district="01") - defc_l = baker.make("references.DisasterEmergencyFundCode", code="L", group_name="covid_19", _fill_optional=True) - defc_m = baker.make("references.DisasterEmergencyFundCode", code="M", group_name="covid_19", _fill_optional=True) - defc_q = baker.make("references.DisasterEmergencyFundCode", code="Q", group_name=None, _fill_optional=True) + baker.make( + "references.RefCountryCode", + country_code="USA", + country_name="UNITED STATES", + _fill_optional=True, + ) + baker.make( + "recipient.StateData", + code="VA", + name="Virginia", + fips="51", + _fill_optional=True, + ) + baker.make( + "references.PopCounty", + state_code="51", + county_number="000", + latest_population=1, + _fill_optional=True, + ) + baker.make( + "references.PopCounty", + state_code="51", + county_number="001", + latest_population=1, + 
_fill_optional=True, + ) + baker.make( + "references.PopCongressionalDistrict", + state_code="51", + latest_population=1, + congressional_district="01", + ) + defc_l = baker.make( + "references.DisasterEmergencyFundCode", + code="L", + group_name="covid_19", + _fill_optional=True, + ) + defc_m = baker.make( + "references.DisasterEmergencyFundCode", + code="M", + group_name="covid_19", + _fill_optional=True, + ) + defc_q = baker.make( + "references.DisasterEmergencyFundCode", + code="Q", + group_name=None, + _fill_optional=True, + ) rpa_1 = baker.make( "references.RefProgramActivity", id=1, @@ -408,7 +504,9 @@ def _build_usas_data_for_spark(): # Create account data federal_account = baker.make( - "accounts.FederalAccount", parent_toptier_agency=funding_toptier_agency, _fill_optional=True + "accounts.FederalAccount", + parent_toptier_agency=funding_toptier_agency, + _fill_optional=True, ) tas = baker.make( "accounts.TreasuryAppropriationAccount", @@ -502,10 +600,16 @@ def _build_usas_data_for_spark(): recipient_location_congressional_population=1, pop_congressional_population=1, tas_paths=[ - f"agency={funding_toptier_agency.toptier_code}faaid={federal_account.agency_identifier}famain={federal_account.main_account_code}aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" + f"agency={funding_toptier_agency.toptier_code}faaid={federal_account.agency_identifier}" + f"famain={federal_account.main_account_code}aid={tas.agency_id}main={tas.main_account_code}" + f"ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}" + f"bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}" + f"a={tas.availability_type_code}" ], tas_components=[ - f"aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" + f"aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}" + f"sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}" + f"epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" ], disaster_emergency_fund_codes=["L", "M"], total_covid_outlay=2.0, @@ -702,10 +806,16 @@ def _build_usas_data_for_spark(): recipient_location_state_population=1, pop_state_population=1, tas_paths=[ - f"agency={funding_toptier_agency.toptier_code}faaid={federal_account.agency_identifier}famain={federal_account.main_account_code}aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" + f"agency={funding_toptier_agency.toptier_code}faaid={federal_account.agency_identifier}" + f"famain={federal_account.main_account_code}aid={tas.agency_id}main={tas.main_account_code}" + f"ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}" + f"bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}" + f"a={tas.availability_type_code}" ], tas_components=[ - f"aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or 
''}sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" + f"aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}" + f"sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}" + f"epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" ], disaster_emergency_fund_codes=["Q"], spending_by_defc=[{"defc": "Q", "outlay": 1.00, "obligation": 1.00}], @@ -719,7 +829,9 @@ def _build_usas_data_for_spark(): recipient_location_county_fips=None, pop_county_fips=None, generated_pragmatic_obligation=0.00, - program_activities=[{"name": "TRAINING AND RECRUITING", "code": "0003", "type": "PAC/PAN"}], + program_activities=[ + {"name": "TRAINING AND RECRUITING", "code": "0003", "type": "PAC/PAN"} + ], federal_accounts=[ { "id": federal_account.id, @@ -747,7 +859,7 @@ def _build_usas_data_for_spark(): total_obligation=0.00, total_subsidy_cost=0.00, total_obl_bin="<1M", - last_modified_date="2020-01-01", + last_modified_date="2020-01-01 00:00:00", update_date="2020-01-01", awarding_agency_id=32, funding_agency_id=32, @@ -840,7 +952,7 @@ def _build_usas_data_for_spark(): funding_subtier_agency_name_raw="TEST SUBTIER 1", awarding_toptier_agency_id=awarding_agency.id, funding_toptier_agency_id=funding_agency.id, - last_modified_date="2020-01-01", + last_modified_date="2020-01-01 00:00:00", federal_action_obligation=0, cfda_number="12.456", cfda_id=cfda.id, @@ -889,10 +1001,16 @@ def _build_usas_data_for_spark(): non_federal_funding_amount=0.00, treasury_account_identifiers=[tas.treasury_account_identifier], tas_paths=[ - f"agency={funding_toptier_agency.toptier_code}faaid={federal_account.agency_identifier}famain={federal_account.main_account_code}aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" + f"agency={funding_toptier_agency.toptier_code}faaid={federal_account.agency_identifier}" + f"famain={federal_account.main_account_code}aid={tas.agency_id}main={tas.main_account_code}" + f"ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}" + f"bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}" + f"a={tas.availability_type_code}" ], tas_components=[ - f"aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" + f"aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}" + f"sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}" + f"epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" ], federal_accounts=[ { @@ -948,7 +1066,7 @@ def _build_usas_data_for_spark(): funding_subtier_agency_abbreviation=funding_subtier_agency.abbreviation, awarding_toptier_agency_id=awarding_agency.id, funding_toptier_agency_id=funding_agency.id, - last_modified_date="2020-01-01", + last_modified_date="2020-01-01 00:00:00", federal_action_obligation=0, published_fabs_id=2, cfda_number="12.456", @@ -998,10 +1116,16 @@ def _build_usas_data_for_spark(): non_federal_funding_amount=0.00, 
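The tas_paths/tas_components reflows in the surrounding hunks rely on Python's implicit concatenation of adjacent f-string literals, so the multi-line form must stay byte-identical to the old one-liners. A quick self-check of that property (field values are made up):

aid, main, sub = "012", "3500", "000"
one_liner = f"aid={aid}main={main}sub={sub}"
split = (
    f"aid={aid}main={main}"
    # adjacent literals are joined with no separator, so the split point is free
    f"sub={sub}"
)
assert one_liner == split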
treasury_account_identifiers=[tas.treasury_account_identifier], tas_paths=[ - f"agency={funding_toptier_agency.toptier_code}faaid={federal_account.agency_identifier}famain={federal_account.main_account_code}aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" + f"agency={funding_toptier_agency.toptier_code}faaid={federal_account.agency_identifier}" + f"famain={federal_account.main_account_code}aid={tas.agency_id}main={tas.main_account_code}" + f"ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}" + f"bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}" + f"a={tas.availability_type_code}" ], tas_components=[ - f"aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" + f"aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}" + f"sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}" + f"epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" ], federal_accounts=[ { @@ -1057,7 +1181,7 @@ def _build_usas_data_for_spark(): funding_subtier_agency_name_raw="TEST SUBTIER 1", awarding_toptier_agency_id=awarding_agency.id, funding_toptier_agency_id=funding_agency.id, - last_modified_date="2020-01-01", + last_modified_date="2020-01-01 00:00:00", federal_action_obligation=0, cfda_number="12.456", cfda_id=cfda.id, @@ -1151,7 +1275,7 @@ def _build_usas_data_for_spark(): funding_toptier_agency_abbreviation=funding_toptier_agency.abbreviation, awarding_subtier_agency_abbreviation=awarding_subtier_agency.abbreviation, funding_subtier_agency_abbreviation=funding_subtier_agency.abbreviation, - last_modified_date="2020-01-01", + last_modified_date="2020-01-01 00:00:00", federal_action_obligation=0, naics_code="123456", product_or_service_code="12", @@ -1191,10 +1315,16 @@ def _build_usas_data_for_spark(): total_funding_amount=0.00, treasury_account_identifiers=[tas.treasury_account_identifier], tas_paths=[ - f"agency={funding_toptier_agency.toptier_code}faaid={federal_account.agency_identifier}famain={federal_account.main_account_code}aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" + f"agency={funding_toptier_agency.toptier_code}faaid={federal_account.agency_identifier}" + f"famain={federal_account.main_account_code}aid={tas.agency_id}main={tas.main_account_code}" + f"ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}" + f"bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}" + f"a={tas.availability_type_code}" ], tas_components=[ - f"aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" + f"aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}" + 
f"sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}" + f"epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" ], federal_accounts=[ { @@ -1206,7 +1336,9 @@ def _build_usas_data_for_spark(): disaster_emergency_fund_codes=["Q"], recipient_location_county_fips=None, pop_county_fips=None, - program_activities=[{"code": "0003", "name": "TRAINING AND RECRUITING", "type": "PAC/PAN"}], + program_activities=[ + {"code": "0003", "name": "TRAINING AND RECRUITING", "type": "PAC/PAN"} + ], ) pap1 = baker.make("references.ProgramActivityPark", code="1000", name="PAP name") @@ -1249,7 +1381,7 @@ def _build_usas_data_for_spark(): funding_toptier_agency_abbreviation=funding_toptier_agency.abbreviation, awarding_subtier_agency_abbreviation=awarding_subtier_agency.abbreviation, funding_subtier_agency_abbreviation=funding_subtier_agency.abbreviation, - last_modified_date="2020-01-01", + last_modified_date="2020-01-01 00:00:00", federal_action_obligation=0, naics_code="123456", product_or_service_code="12", @@ -1289,10 +1421,16 @@ def _build_usas_data_for_spark(): total_funding_amount=0.00, treasury_account_identifiers=[tas.treasury_account_identifier], tas_paths=[ - f"agency={funding_toptier_agency.toptier_code}faaid={federal_account.agency_identifier}famain={federal_account.main_account_code}aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" + f"agency={funding_toptier_agency.toptier_code}faaid={federal_account.agency_identifier}" + f"famain={federal_account.main_account_code}aid={tas.agency_id}main={tas.main_account_code}" + f"ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}" + f"bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}" + f"a={tas.availability_type_code}" ], tas_components=[ - f"aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" + f"aid={tas.agency_id}main={tas.main_account_code}ata={tas.allocation_transfer_agency_id or ''}" + f"sub={tas.sub_account_code}bpoa={tas.beginning_period_of_availability or ''}" + f"epoa={tas.ending_period_of_availability or ''}a={tas.availability_type_code}" ], federal_accounts=[ { @@ -1304,7 +1442,9 @@ def _build_usas_data_for_spark(): disaster_emergency_fund_codes=["Q"], recipient_location_county_fips=None, pop_county_fips=None, - program_activities=[{"code": "0003", "name": "TRAINING AND RECRUITING", "type": "PAC/PAN"}], + program_activities=[ + {"code": "0003", "name": "TRAINING AND RECRUITING", "type": "PAC/PAN"} + ], ) baker.make( "search.TransactionSearch", @@ -1339,7 +1479,7 @@ def _build_usas_data_for_spark(): funding_subtier_agency_abbreviation=subtier.abbreviation, awarding_toptier_agency_id=agency.id, funding_toptier_agency_id=agency.id, - last_modified_date="2020-01-01", + last_modified_date="2020-01-01 00:00:00", award_update_date=cont_award2.update_date, generated_pragmatic_obligation=0.00, original_loan_subsidy_cost=0.00, @@ -1415,7 +1555,9 @@ def _build_usas_data_for_spark(): _fill_optional=True, ) - dabs = baker.make("submissions.DABSSubmissionWindowSchedule", submission_reveal_date="2020-05-01") + dabs = baker.make( + 
"submissions.DABSSubmissionWindowSchedule", submission_reveal_date="2020-05-01" + ) sa = baker.make( "submissions.SubmissionAttributes", reporting_period_start="2020-04-02", @@ -1487,21 +1629,28 @@ def populate_usas_data(db): @pytest.fixture -def populate_usas_data_and_recipients_from_broker(db, populate_usas_data, populate_broker_data): +def populate_usas_data_and_recipients_from_broker( + db, populate_usas_data, populate_broker_data +): with connections[settings.DEFAULT_DB_ALIAS].cursor() as cursor: - restock_duns_sql = open("usaspending_api/broker/management/sql/restock_duns.sql", "r").read() + restock_duns_sql = open( + "usaspending_api/broker/management/sql/restock_duns.sql", "r" + ).read() restock_duns_sql = restock_duns_sql.replace("VACUUM ANALYZE int.duns;", "") cursor.execute(restock_duns_sql) call_command("update_recipient_lookup") with connections[settings.DEFAULT_DB_ALIAS].cursor() as cursor: restock_recipient_profile_sql = open( - "usaspending_api/recipient/management/sql/restock_recipient_profile.sql", "r" + "usaspending_api/recipient/management/sql/restock_recipient_profile.sql", + "r", ).read() cursor.execute(restock_recipient_profile_sql) yield -def create_all_delta_tables(spark: "SparkSession", s3_bucket: str, tables_to_load: list): +def create_all_delta_tables( + spark: "SparkSession", s3_bucket: str, tables_to_load: list +): load_query_tables = [val for val in tables_to_load if val in LOAD_QUERY_TABLE_SPEC] load_table_tables = [val for val in tables_to_load if val in LOAD_TABLE_TABLE_SPEC] for dest_table in load_table_tables + load_query_tables: @@ -1519,10 +1668,16 @@ def create_all_delta_tables(spark: "SparkSession", s3_bucket: str, tables_to_loa f"--spark-s3-bucket={s3_bucket}", ) else: - call_command("create_delta_table", f"--destination-table={dest_table}", f"--spark-s3-bucket={s3_bucket}") + call_command( + "create_delta_table", + f"--destination-table={dest_table}", + f"--spark-s3-bucket={s3_bucket}", + ) -def create_and_load_all_delta_tables(spark: "SparkSession", s3_bucket: str, tables_to_load: list): +def create_and_load_all_delta_tables( + spark: "SparkSession", s3_bucket: str, tables_to_load: list +): create_all_delta_tables(spark, s3_bucket, tables_to_load) load_query_tables = [val for val in tables_to_load if val in LOAD_QUERY_TABLE_SPEC] diff --git a/usaspending_api/transactions/delta_models/detached_award_procurement.py b/usaspending_api/transactions/delta_models/detached_award_procurement.py index b2bc834687..facbd2682a 100644 --- a/usaspending_api/transactions/delta_models/detached_award_procurement.py +++ b/usaspending_api/transactions/delta_models/detached_award_procurement.py @@ -93,7 +93,10 @@ "fair_opportunity_limited_s": {"delta": "STRING", "postgres": "TEXT"}, "fed_biz_opps": {"delta": "STRING", "postgres": "TEXT"}, "fed_biz_opps_description": {"delta": "STRING", "postgres": "TEXT"}, - "federal_action_obligation": {"delta": "NUMERIC(38, 18)", "postgres": "NUMERIC(38,18"}, + "federal_action_obligation": { + "delta": "NUMERIC(38, 18)", + "postgres": "NUMERIC(38,18", + }, "federal_agency": {"delta": "BOOLEAN", "postgres": "BOOLEAN"}, "federally_funded_research": {"delta": "BOOLEAN", "postgres": "BOOLEAN"}, "for_profit_organization": {"delta": "BOOLEAN", "postgres": "BOOLEAN"}, @@ -311,6 +314,8 @@ } DELTA_ONLY_COLUMNS = { "hash": "LONG", + "action_year": "INTEGER", + "action_month": "INTEGER", } DETACHED_AWARD_PROCUREMENT_DELTA_COLUMNS = { **{k: v["delta"] for k, v in DETACHED_AWARD_PROCUREMENT_COLUMNS.items()}, @@ -323,5 +328,7 @@ {", 
".join([f"{key} {val}" for key, val in DETACHED_AWARD_PROCUREMENT_DELTA_COLUMNS.items()])} ) USING DELTA + PARTITIONED BY (action_year, action_month) LOCATION 's3a://{{SPARK_S3_BUCKET}}/{{DELTA_LAKE_S3_PATH}}/{{DESTINATION_DATABASE}}/{{DESTINATION_TABLE}}' + TBLPROPERTIES (delta.enableChangeDataFeed = true) """ diff --git a/usaspending_api/transactions/delta_models/published_fabs.py b/usaspending_api/transactions/delta_models/published_fabs.py index 21ac24bd4d..2b06b2e88d 100644 --- a/usaspending_api/transactions/delta_models/published_fabs.py +++ b/usaspending_api/transactions/delta_models/published_fabs.py @@ -26,9 +26,15 @@ "correction_delete_ind_desc": {"delta": "STRING", "postgres": "TEXT"}, "correction_delete_indicatr": {"delta": "STRING", "postgres": "TEXT"}, "created_at": {"delta": "TIMESTAMP", "postgres": "TIMESTAMP"}, - "face_value_loan_guarantee": {"delta": "NUMERIC(38,18)", "postgres": "NUMERIC(38,18"}, + "face_value_loan_guarantee": { + "delta": "NUMERIC(38,18)", + "postgres": "NUMERIC(38,18", + }, "fain": {"delta": "STRING", "postgres": "TEXT"}, - "federal_action_obligation": {"delta": "NUMERIC(38,18)", "postgres": "NUMERIC(38,18"}, + "federal_action_obligation": { + "delta": "NUMERIC(38,18)", + "postgres": "NUMERIC(38,18", + }, "fiscal_year_and_quarter_co": {"delta": "STRING", "postgres": "TEXT"}, "funding_agency_code": {"delta": "STRING", "postgres": "TEXT"}, "funding_agency_name": {"delta": "STRING", "postgres": "TEXT"}, @@ -48,7 +54,10 @@ "high_comp_officer4_full_na": {"delta": "STRING", "postgres": "TEXT"}, "high_comp_officer5_amount": {"delta": "STRING", "postgres": "TEXT"}, "high_comp_officer5_full_na": {"delta": "STRING", "postgres": "TEXT"}, - "indirect_federal_sharing": {"delta": "NUMERIC(38,18)", "postgres": "NUMERIC(38,18)"}, + "indirect_federal_sharing": { + "delta": "NUMERIC(38,18)", + "postgres": "NUMERIC(38,18)", + }, "is_active": {"delta": "BOOLEAN", "postgres": "BOOLEAN"}, "is_historical": {"delta": "BOOLEAN", "postgres": "BOOLEAN"}, "legal_entity_address_line1": {"delta": "STRING", "postgres": "TEXT"}, @@ -70,8 +79,14 @@ "legal_entity_zip5": {"delta": "STRING", "postgres": "TEXT"}, "legal_entity_zip_last4": {"delta": "STRING", "postgres": "TEXT"}, "modified_at": {"delta": "TIMESTAMP", "postgres": "TIMESTAMP"}, - "non_federal_funding_amount": {"delta": "NUMERIC(38,18)", "postgres": "NUMERIC(38,18)"}, - "original_loan_subsidy_cost": {"delta": "NUMERIC(38,18)", "postgres": "NUMERIC(38,18)"}, + "non_federal_funding_amount": { + "delta": "NUMERIC(38,18)", + "postgres": "NUMERIC(38,18)", + }, + "original_loan_subsidy_cost": { + "delta": "NUMERIC(38,18)", + "postgres": "NUMERIC(38,18)", + }, "period_of_performance_curr": {"delta": "STRING", "postgres": "TEXT"}, "period_of_performance_star": {"delta": "STRING", "postgres": "TEXT"}, "place_of_perfor_state_code": {"delta": "STRING", "postgres": "TEXT"}, @@ -103,6 +118,8 @@ } DELTA_ONLY_COLUMNS = { "hash": "LONG", + "action_year": "INTEGER", + "action_month": "INTEGER", } PUBLISHED_FABS_DELTA_COLUMNS = { **{k: v["delta"] for k, v in PUBLISHED_FABS_COLUMNS.items()}, @@ -115,5 +132,7 @@ {", ".join([f'{key} {val}' for key, val in PUBLISHED_FABS_DELTA_COLUMNS.items()])} ) USING DELTA + PARTITIONED BY (action_year, action_month) LOCATION 's3a://{{SPARK_S3_BUCKET}}/{{DELTA_LAKE_S3_PATH}}/{{DESTINATION_DATABASE}}/{{DESTINATION_TABLE}}' + TBLPROPERTIES (delta.enableChangeDataFeed = true) """ diff --git a/usaspending_api/transactions/delta_models/transaction_fabs.py 
b/usaspending_api/transactions/delta_models/transaction_fabs.py index d6cc32481c..c674d628ae 100644 --- a/usaspending_api/transactions/delta_models/transaction_fabs.py +++ b/usaspending_api/transactions/delta_models/transaction_fabs.py @@ -1,3 +1,5 @@ +from pyspark.sql import functions as sf + from usaspending_api.common.data_classes import TransactionColumn TRANSACTION_FABS_COLUMN_INFO = [ @@ -51,20 +53,21 @@ "legal_entity_country_code", "legal_entity_country_code", "STRING", - scalar_transformation="CASE {input} \ - WHEN 'UNITED STATES' THEN 'USA' \ - ELSE {input} \ - END", + scalar_transformation=lambda col: sf.when(col == sf.lit("UNITED STATES"), sf.lit("USA")).otherwise(col), ), TransactionColumn( "legal_entity_country_name", "legal_entity_country_name", "STRING", - scalar_transformation="CASE \ - WHEN {input} = 'USA' THEN 'UNITED STATES' \ - WHEN COALESCE({input}, '') = '' AND legal_entity_country_code = 'UNITED STATES' THEN 'UNITED STATES' \ - ELSE {input} \ - END", + scalar_transformation=lambda col: ( + sf.when(col == sf.lit("USA"), sf.lit("UNITED STATES")) + .when( + (sf.coalesce(col, sf.lit("")) == sf.lit("")) + & (sf.col("legal_entity_country_code") == sf.lit("UNITED STATES")), + sf.lit("UNITED STATES"), + ) + .otherwise(col) + ), ), TransactionColumn("legal_entity_county_code", "legal_entity_county_code", "STRING"), TransactionColumn("legal_entity_county_name", "legal_entity_county_name", "STRING"), @@ -96,20 +99,21 @@ "place_of_perform_country_c", "place_of_perform_country_c", "STRING", - scalar_transformation="CASE {input} \ - WHEN 'UNITED STATES' THEN 'USA' \ - ELSE {input} \ - END", + scalar_transformation=lambda col: sf.when(col == sf.lit("UNITED STATES"), sf.lit("USA")).otherwise(col), ), TransactionColumn( "place_of_perform_country_n", "place_of_perform_country_n", "STRING", - scalar_transformation="CASE \ - WHEN {input} = 'USA' THEN 'UNITED STATES' \ - WHEN COALESCE({input}, '') = '' AND place_of_perform_country_c = 'UNITED STATES' THEN 'UNITED STATES' \ - ELSE {input} \ - END", + scalar_transformation=lambda col: ( + sf.when(col == sf.lit("USA"), sf.lit("UNITED STATES")) + .when( + (sf.coalesce(col, sf.lit("")) == sf.lit("")) + & (sf.col("place_of_perform_country_c") == sf.lit("UNITED STATES")), + sf.lit("UNITED STATES"), + ) + .otherwise(col) + ), ), TransactionColumn("place_of_perform_county_co", "place_of_perform_county_co", "STRING"), TransactionColumn("place_of_perform_county_na", "place_of_perform_county_na", "STRING"), @@ -137,6 +141,8 @@ TransactionColumn("updated_at", "updated_at", "TIMESTAMP"), TransactionColumn("uri", "uri", "STRING"), TransactionColumn("hash", "hash", "LONG"), + TransactionColumn("action_year", "action_year", "INTEGER"), + TransactionColumn("action_month", "action_month", "INTEGER"), ] TRANSACTION_FABS_COLUMNS = [col.dest_name for col in TRANSACTION_FABS_COLUMN_INFO] @@ -159,7 +165,9 @@ {", ".join([f'{col.dest_name} {col.delta_type}' for col in TRANSACTION_FABS_COLUMN_INFO])} ) USING DELTA + PARTITIONED BY (action_year, action_month) LOCATION 's3a://{{SPARK_S3_BUCKET}}/{{DELTA_LAKE_S3_PATH}}/{{DESTINATION_DATABASE}}/{{DESTINATION_TABLE}}' + TBLPROPERTIES (delta.enableChangeDataFeed = true) """ # Mapping from raw.published_fabs to int.transaction_normalized columns, where a simple mapping exists @@ -168,12 +176,12 @@ TransactionColumn("action_date", "action_date", "DATE", "parse_string_datetime_to_date"), TransactionColumn("action_type", "action_type", "STRING"), TransactionColumn("action_type_description", "action_type_description", 
"STRING"), - TransactionColumn("certified_date", "NULL", "DATE", "literal"), + TransactionColumn("certified_date", None, "DATE", "literal"), TransactionColumn("description", "award_description", "STRING"), TransactionColumn("face_value_loan_guarantee", "face_value_loan_guarantee", "NUMERIC(23,2)"), TransactionColumn("federal_action_obligation", "federal_action_obligation", "NUMERIC(23,2)"), TransactionColumn("indirect_federal_sharing", "indirect_federal_sharing", "NUMERIC(23, 2)", "cast"), - TransactionColumn("is_fpds", "FALSE", "BOOLEAN", "literal"), + TransactionColumn("is_fpds", False, "BOOLEAN", "literal"), TransactionColumn("last_modified_date", "modified_at", "DATE", "cast"), TransactionColumn("modification_number", "award_modification_amendme", "STRING"), TransactionColumn("non_federal_funding_amount", "non_federal_funding_amount", "NUMERIC(23,2)"), @@ -189,6 +197,8 @@ TransactionColumn("type", "assistance_type", "STRING"), TransactionColumn("type_description", "assistance_type_desc", "STRING"), TransactionColumn("unique_award_key", "unique_award_key", "STRING"), - TransactionColumn("usaspending_unique_transaction_id", "NULL", "STRING", "literal"), + TransactionColumn("usaspending_unique_transaction_id", None, "STRING", "literal"), TransactionColumn("hash", "hash", "LONG"), + TransactionColumn("action_year", "action_year", "INTEGER"), + TransactionColumn("action_month", "action_month", "INTEGER"), ] diff --git a/usaspending_api/transactions/delta_models/transaction_fpds.py b/usaspending_api/transactions/delta_models/transaction_fpds.py index bba0ab70ba..e6063f9792 100644 --- a/usaspending_api/transactions/delta_models/transaction_fpds.py +++ b/usaspending_api/transactions/delta_models/transaction_fpds.py @@ -1,3 +1,5 @@ +from pyspark.sql import functions as sf + from usaspending_api.common.data_classes import TransactionColumn TRANSACTION_FPDS_COLUMN_INFO = [ @@ -124,7 +126,12 @@ TransactionColumn("information_technology_com", "information_technology_com", "STRING"), TransactionColumn("inherently_government_desc", "inherently_government_desc", "STRING"), TransactionColumn("inherently_government_func", "inherently_government_func", "STRING"), - TransactionColumn("initial_report_date", "initial_report_date", "STRING", "string_datetime_remove_timestamp"), + TransactionColumn( + "initial_report_date", + "initial_report_date", + "STRING", + "string_datetime_remove_timestamp", + ), TransactionColumn("inter_municipal_local_gove", "inter_municipal_local_gove", "BOOLEAN"), TransactionColumn("interagency_contract_desc", "interagency_contract_desc", "STRING"), TransactionColumn("interagency_contracting_au", "interagency_contracting_au", "STRING"), @@ -145,20 +152,21 @@ "legal_entity_country_code", "legal_entity_country_code", "STRING", - scalar_transformation="CASE {input} \ - WHEN 'UNITED STATES' THEN 'USA' \ - ELSE {input} \ - END", + scalar_transformation=lambda col: sf.when(col == "UNITED STATES", "USA").otherwise(col), ), TransactionColumn( "legal_entity_country_name", "legal_entity_country_name", "STRING", - scalar_transformation="CASE \ - WHEN {input} = 'USA' THEN 'UNITED STATES' \ - WHEN COALESCE({input}, '') = '' AND legal_entity_country_code = 'UNITED STATES' THEN 'UNITED STATES' \ - ELSE {input} \ - END", + scalar_transformation=lambda col: ( + sf.when(col == "USA", sf.lit("UNITED STATES")) + .when( + (sf.coalesce(col, sf.lit("")) == sf.lit("")) + & (sf.col("legal_entity_country_code") == sf.lit("UNITED STATES")), + sf.lit("UNITED STATES"), + ) + .otherwise(col) + ), ), 
TransactionColumn("legal_entity_county_code", "legal_entity_county_code", "STRING"), TransactionColumn("legal_entity_county_name", "legal_entity_county_name", "STRING"), @@ -204,7 +212,10 @@ TransactionColumn("officer_5_amount", "high_comp_officer5_amount", "NUMERIC(23,2)", "cast"), TransactionColumn("officer_5_name", "high_comp_officer5_full_na", "STRING"), TransactionColumn( - "ordering_period_end_date", "ordering_period_end_date", "STRING", "string_datetime_remove_timestamp" + "ordering_period_end_date", + "ordering_period_end_date", + "STRING", + "string_datetime_remove_timestamp", ), TransactionColumn("organizational_type", "organizational_type", "STRING"), TransactionColumn("other_minority_owned_busin", "other_minority_owned_busin", "BOOLEAN"), @@ -229,20 +240,21 @@ "place_of_perform_country_c", "place_of_perform_country_c", "STRING", - scalar_transformation="CASE {input} \ - WHEN 'UNITED STATES' THEN 'USA' \ - ELSE {input} \ - END", + scalar_transformation=lambda col: sf.when(col == sf.lit("UNITED STATES"), sf.lit("USA")).otherwise(col), ), TransactionColumn( "place_of_perform_country_n", "place_of_perform_country_n", "STRING", - scalar_transformation="CASE \ - WHEN {input} = 'USA' THEN 'UNITED STATES' \ - WHEN COALESCE({input}, '') = '' AND place_of_perform_country_c = 'UNITED STATES' THEN 'UNITED STATES' \ - ELSE {input} \ - END", + scalar_transformation=lambda col: ( + sf.when(col == sf.lit("USA"), sf.lit("UNITED STATES")) + .when( + (sf.coalesce(col, sf.lit("")) == sf.lit("")) + & (sf.col("place_of_perform_country_c") == sf.lit("UNITED STATES")), + sf.lit("UNITED STATES"), + ) + .otherwise(col) + ), ), TransactionColumn("place_of_perform_county_co", "place_of_perform_county_co", "STRING"), TransactionColumn("place_of_perform_county_na", "place_of_perform_county_na", "STRING"), @@ -279,7 +291,7 @@ TransactionColumn("referenced_mult_or_single", "referenced_mult_or_single", "STRING"), # The referenced_multi_or_single field does not appear in the django model and may have been created inadvertently # in the Delta model previously. Since it is always NULL, it is a candidate for elimination. 
- TransactionColumn("referenced_multi_or_single", "NULL", "STRING", "literal"), + TransactionColumn("referenced_multi_or_single", None, "STRING", "literal"), TransactionColumn("research", "research", "STRING"), TransactionColumn("research_description", "research_description", "STRING"), TransactionColumn("sam_exception", "sam_exception", "STRING"), @@ -345,6 +357,8 @@ TransactionColumn("woman_owned_business", "woman_owned_business", "BOOLEAN"), TransactionColumn("women_owned_small_business", "women_owned_small_business", "BOOLEAN"), TransactionColumn("hash", "hash", "LONG"), + TransactionColumn("action_year", "action_year", "INTEGER"), + TransactionColumn("action_month", "action_month", "INTEGER"), ] TRANSACTION_FPDS_COLUMNS = [col.dest_name for col in TRANSACTION_FPDS_COLUMN_INFO] @@ -378,10 +392,12 @@ transaction_fpds_sql_string = rf""" CREATE OR REPLACE TABLE {{DESTINATION_TABLE}} ( - {", ".join([f'{col.dest_name} {col.delta_type}' for col in TRANSACTION_FPDS_COLUMN_INFO])} + {", ".join([f"{col.dest_name} {col.delta_type}" for col in TRANSACTION_FPDS_COLUMN_INFO])} ) USING DELTA + PARTITIONED BY (action_year, action_month) LOCATION 's3a://{{SPARK_S3_BUCKET}}/{{DELTA_LAKE_S3_PATH}}/{{DESTINATION_DATABASE}}/{{DESTINATION_TABLE}}' + TBLPROPERTIES (delta.enableChangeDataFeed = true) """ # Mapping from raw.detached_award_procurement to int.transaction_normalized columns, where a simple mapping exists @@ -389,27 +405,35 @@ TransactionColumn("action_date", "action_date", "DATE", "parse_string_datetime_to_date"), TransactionColumn("action_type", "action_type", "STRING"), TransactionColumn("action_type_description", "action_type_description", "STRING"), - TransactionColumn("certified_date", "NULL", "DATE", "literal"), + TransactionColumn("certified_date", None, "DATE", "literal"), TransactionColumn("description", "award_description", "STRING"), - TransactionColumn("face_value_loan_guarantee", "NULL", "NUMERIC(23, 2)", "literal"), + TransactionColumn("face_value_loan_guarantee", None, "NUMERIC(23, 2)", "literal"), TransactionColumn("federal_action_obligation", "federal_action_obligation", "NUMERIC(23,2)"), TransactionColumn("funding_amount", "NULL", "NUMERIC(23, 2)", "literal"), - TransactionColumn("indirect_federal_sharing", "NULL", "NUMERIC(23, 2)", "literal"), - TransactionColumn("is_fpds", "TRUE", "BOOLEAN", "literal"), - TransactionColumn("last_modified_date", "last_modified", "DATE", "cast"), + TransactionColumn("indirect_federal_sharing", None, "NUMERIC(23, 2)", "literal"), + TransactionColumn("is_fpds", True, "BOOLEAN", "literal"), + TransactionColumn("last_modified_date", "last_modified", "TIMESTAMP", "cast"), TransactionColumn("modification_number", "award_modification_amendme", "STRING"), - TransactionColumn("non_federal_funding_amount", "NULL", "NUMERIC(23, 2)", "literal"), - TransactionColumn("original_loan_subsidy_cost", "NULL", "NUMERIC(23, 2)", "literal"), + TransactionColumn("non_federal_funding_amount", None, "NUMERIC(23, 2)", "literal"), + TransactionColumn("original_loan_subsidy_cost", None, "NUMERIC(23, 2)", "literal"), # All period_of_performance_* fields seen as: YYYY-MM-DD 00:00:00, so cast works # BUT it's still just a string and could morph, so defensively smart-date-parsing the string TransactionColumn( - "period_of_performance_current_end_date", "period_of_performance_curr", "DATE", "parse_string_datetime_to_date" + "period_of_performance_current_end_date", + "period_of_performance_curr", + "DATE", + "parse_string_datetime_to_date", ), TransactionColumn( - 
"period_of_performance_start_date", "period_of_performance_star", "DATE", "parse_string_datetime_to_date" + "period_of_performance_start_date", + "period_of_performance_star", + "DATE", + "parse_string_datetime_to_date", ), TransactionColumn("transaction_unique_id", "detached_award_proc_unique", "STRING"), TransactionColumn("unique_award_key", "unique_award_key", "STRING"), - TransactionColumn("usaspending_unique_transaction_id", "NULL", "STRING", "literal"), + TransactionColumn("usaspending_unique_transaction_id", None, "STRING", "literal"), TransactionColumn("hash", "hash", "LONG"), + TransactionColumn("action_year", "action_year", "INTEGER"), + TransactionColumn("action_month", "action_month", "INTEGER"), ] diff --git a/usaspending_api/transactions/delta_models/transaction_normalized.py b/usaspending_api/transactions/delta_models/transaction_normalized.py index c9c073002c..c685f467f2 100644 --- a/usaspending_api/transactions/delta_models/transaction_normalized.py +++ b/usaspending_api/transactions/delta_models/transaction_normalized.py @@ -16,7 +16,7 @@ "id": "LONG", "indirect_federal_sharing": "NUMERIC(23, 2)", "is_fpds": "BOOLEAN NOT NULL", - "last_modified_date": "DATE", + "last_modified_date": "TIMESTAMP", "modification_number": "STRING", "non_federal_funding_amount": "NUMERIC(23, 2)", "original_loan_subsidy_cost": "NUMERIC(23, 2)", @@ -29,12 +29,16 @@ "update_date": "TIMESTAMP", "usaspending_unique_transaction_id": "STRING", "hash": "LONG", + "action_year": "INTEGER", + "action_month": "INTEGER", } transaction_normalized_sql_string = rf""" CREATE OR REPLACE TABLE {{DESTINATION_TABLE}} ( - {", ".join([f'{key} {val}' for key, val in TRANSACTION_NORMALIZED_COLUMNS.items()])} + {", ".join([f"{key} {val}" for key, val in TRANSACTION_NORMALIZED_COLUMNS.items()])} ) USING DELTA + PARTITIONED BY (is_fpds, action_year, action_month) LOCATION 's3a://{{SPARK_S3_BUCKET}}/{{DELTA_LAKE_S3_PATH}}/{{DESTINATION_DATABASE}}/{{DESTINATION_TABLE}}' + TBLPROPERTIES (delta.enableChangeDataFeed = true) """ diff --git a/usaspending_api/transactions/delta_models/transaction_search.py b/usaspending_api/transactions/delta_models/transaction_search.py index b4960aa142..9050257695 100644 --- a/usaspending_api/transactions/delta_models/transaction_search.py +++ b/usaspending_api/transactions/delta_models/transaction_search.py @@ -2,17 +2,37 @@ TRANSACTION_SEARCH_COLUMNS = { # Keys - "transaction_id": {"delta": "LONG NOT NULL", "postgres": "BIGINT NOT NULL", "gold": False}, - "award_id": {"delta": "LONG NOT NULL", "postgres": "BIGINT NOT NULL", "gold": False}, + "transaction_id": { + "delta": "LONG NOT NULL", + "postgres": "BIGINT NOT NULL", + "gold": False, + }, + "award_id": { + "delta": "LONG NOT NULL", + "postgres": "BIGINT NOT NULL", + "gold": False, + }, # while transaction_unique_id is gold, it can't be NULL - "transaction_unique_id": {"delta": "STRING NOT NULL", "postgres": "TEXT NOT NULL", "gold": False}, - "usaspending_unique_transaction_id": {"delta": "STRING", "postgres": "TEXT", "gold": True}, + "transaction_unique_id": { + "delta": "STRING NOT NULL", + "postgres": "TEXT NOT NULL", + "gold": False, + }, + "usaspending_unique_transaction_id": { + "delta": "STRING", + "postgres": "TEXT", + "gold": True, + }, "modification_number": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "generated_unique_award_id": {"delta": "STRING", "postgres": "TEXT", "gold": False}, # Dates "action_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, "fiscal_action_date": {"delta": "DATE", "postgres": 
"DATE", "gold": False}, - "last_modified_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, + "last_modified_date": { + "delta": "TIMESTAMP", + "postgres": "TIMESTAMP", + "gold": False, + }, "fiscal_year": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, "award_certified_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, "award_fiscal_year": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, @@ -21,37 +41,117 @@ "award_update_date": {"delta": "TIMESTAMP", "postgres": "TIMESTAMP", "gold": False}, "award_date_signed": {"delta": "DATE", "postgres": "DATE", "gold": False}, "etl_update_date": {"delta": "TIMESTAMP", "postgres": "TIMESTAMP", "gold": False}, - "period_of_performance_start_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, - "period_of_performance_current_end_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, - "initial_report_date": {"delta": "DATE", "postgres": "DATE", "gold": False}, + "period_of_performance_start_date": { + "delta": "DATE", + "postgres": "DATE", + "gold": False, + }, + "period_of_performance_current_end_date": { + "delta": "DATE", + "postgres": "DATE", + "gold": False, + }, + "initial_report_date": { + "delta": "TIMESTAMP", + "postgres": "TIMESTAMP", + "gold": False, + }, # Agencies "awarding_agency_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "awarding_toptier_agency_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "awarding_toptier_agency_name_raw": {"delta": "STRING", "postgres": "TEXT", "gold": True}, + "awarding_toptier_agency_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "awarding_toptier_agency_name_raw": { + "delta": "STRING", + "postgres": "TEXT", + "gold": True, + }, "funding_agency_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "funding_toptier_agency_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "funding_toptier_agency_name_raw": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "awarding_sub_tier_agency_c": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "awarding_subtier_agency_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "awarding_subtier_agency_name_raw": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "funding_sub_tier_agency_co": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "funding_subtier_agency_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "funding_subtier_agency_name_raw": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "awarding_toptier_agency_id": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "funding_toptier_agency_id": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, + "funding_toptier_agency_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "funding_toptier_agency_name_raw": { + "delta": "STRING", + "postgres": "TEXT", + "gold": True, + }, + "awarding_sub_tier_agency_c": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "awarding_subtier_agency_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "awarding_subtier_agency_name_raw": { + "delta": "STRING", + "postgres": "TEXT", + "gold": True, + }, + "funding_sub_tier_agency_co": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "funding_subtier_agency_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "funding_subtier_agency_name_raw": { + "delta": "STRING", + "postgres": "TEXT", + "gold": True, + }, + 
"awarding_toptier_agency_id": { + "delta": "INTEGER", + "postgres": "INTEGER", + "gold": False, + }, + "funding_toptier_agency_id": { + "delta": "INTEGER", + "postgres": "INTEGER", + "gold": False, + }, "awarding_agency_id": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, "funding_agency_id": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "awarding_toptier_agency_abbreviation": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "funding_toptier_agency_abbreviation": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "awarding_subtier_agency_abbreviation": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "funding_subtier_agency_abbreviation": {"delta": "STRING", "postgres": "TEXT", "gold": False}, + "awarding_toptier_agency_abbreviation": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "funding_toptier_agency_abbreviation": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "awarding_subtier_agency_abbreviation": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "funding_subtier_agency_abbreviation": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, "awarding_office_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "awarding_office_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "funding_office_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "funding_office_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, # Typing # while is_fpds is gold, it also can't be NULL - "is_fpds": {"delta": "BOOLEAN NOT NULL", "postgres": "BOOLEAN NOT NULL", "gold": False}, + "is_fpds": { + "delta": "BOOLEAN NOT NULL", + "postgres": "BOOLEAN NOT NULL", + "gold": False, + }, "type_raw": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "type_description_raw": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "type": {"delta": "STRING", "postgres": "TEXT", "gold": False}, @@ -60,17 +160,57 @@ "action_type_description": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "award_category": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "transaction_description": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "business_categories": {"delta": "ARRAY", "postgres": "TEXT[]", "gold": False}, + "business_categories": { + "delta": "ARRAY", + "postgres": "TEXT[]", + "gold": False, + }, # Amounts - "award_amount": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": False}, - "generated_pragmatic_obligation": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": False}, - "federal_action_obligation": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": False}, - "original_loan_subsidy_cost": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": False}, - "face_value_loan_guarantee": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": False}, - "indirect_federal_sharing": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": True}, - "funding_amount": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": True}, - "total_funding_amount": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": True}, - "non_federal_funding_amount": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": True}, + "award_amount": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": False, + }, + "generated_pragmatic_obligation": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": False, + }, + 
"federal_action_obligation": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": False, + }, + "original_loan_subsidy_cost": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": False, + }, + "face_value_loan_guarantee": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": False, + }, + "indirect_federal_sharing": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": True, + }, + "funding_amount": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": True, + }, + "total_funding_amount": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": True, + }, + "non_federal_funding_amount": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": True, + }, # Recipient "recipient_hash": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "recipient_levels": {"delta": "ARRAY", "postgres": "TEXT[]", "gold": False}, @@ -82,25 +222,81 @@ "parent_uei": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "parent_recipient_name_raw": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "parent_recipient_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "parent_recipient_unique_id": {"delta": "STRING", "postgres": "TEXT", "gold": False}, + "parent_recipient_unique_id": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, # Recipient Location - "recipient_location_country_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_country_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_state_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_state_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_state_fips": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_state_population": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "recipient_location_county_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_county_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_county_population": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "recipient_location_congressional_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "recipient_location_congressional_population": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "recipient_location_congressional_code_current": {"delta": "STRING", "postgres": "TEXT", "gold": True}, + "recipient_location_country_code": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_country_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_state_code": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_state_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_state_fips": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_state_population": { + "delta": "INTEGER", + "postgres": "INTEGER", + "gold": False, + }, + "recipient_location_county_code": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_county_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_county_population": { + "delta": "INTEGER", + "postgres": "INTEGER", + "gold": False, + }, + "recipient_location_congressional_code": { 
+ "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, + "recipient_location_congressional_population": { + "delta": "INTEGER", + "postgres": "INTEGER", + "gold": False, + }, + "recipient_location_congressional_code_current": { + "delta": "STRING", + "postgres": "TEXT", + "gold": True, + }, "recipient_location_zip5": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "legal_entity_zip4": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "legal_entity_zip_last4": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "legal_entity_city_code": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "recipient_location_city_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, + "recipient_location_city_name": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, "legal_entity_address_line1": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "legal_entity_address_line2": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "legal_entity_address_line3": {"delta": "STRING", "postgres": "TEXT", "gold": True}, @@ -108,7 +304,11 @@ "legal_entity_foreign_descr": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "legal_entity_foreign_posta": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "legal_entity_foreign_provi": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "recipient_location_county_fips": {"delta": "STRING", "postgres": "TEXT", "gold": False}, + "recipient_location_county_fips": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, # Place of Performance "place_of_performance_code": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "place_of_performance_scope": {"delta": "STRING", "postgres": "TEXT", "gold": True}, @@ -122,8 +322,16 @@ "pop_county_name": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "pop_county_population": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, "pop_congressional_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "pop_congressional_population": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "pop_congressional_code_current": {"delta": "STRING", "postgres": "TEXT", "gold": True}, + "pop_congressional_population": { + "delta": "INTEGER", + "postgres": "INTEGER", + "gold": False, + }, + "pop_congressional_code_current": { + "delta": "STRING", + "postgres": "TEXT", + "gold": True, + }, "pop_zip5": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "place_of_performance_zip4a": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "place_of_perform_zip_last4": {"delta": "STRING", "postgres": "TEXT", "gold": True}, @@ -131,22 +339,50 @@ "place_of_performance_forei": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "pop_county_fips": {"delta": "STRING", "postgres": "TEXT", "gold": False}, # Accounts - "treasury_account_identifiers": {"delta": "ARRAY", "postgres": "TEXT[]", "gold": False}, + "treasury_account_identifiers": { + "delta": "ARRAY", + "postgres": "TEXT[]", + "gold": False, + }, "tas_paths": {"delta": "ARRAY", "postgres": "TEXT[]", "gold": False}, "tas_components": {"delta": "ARRAY", "postgres": "TEXT[]", "gold": False}, "federal_accounts": {"delta": "STRING", "postgres": "JSONB", "gold": False}, - "disaster_emergency_fund_codes": {"delta": "ARRAY", "postgres": "TEXT[]", "gold": False}, + "disaster_emergency_fund_codes": { + "delta": "ARRAY", + "postgres": "TEXT[]", + "gold": False, + }, # Officer Amounts "officer_1_name": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "officer_1_amount": {"delta": 
"NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": True}, + "officer_1_amount": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": True, + }, "officer_2_name": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "officer_2_amount": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": True}, + "officer_2_amount": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": True, + }, "officer_3_name": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "officer_3_amount": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": True}, + "officer_3_amount": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": True, + }, "officer_4_name": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "officer_4_amount": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": True}, + "officer_4_amount": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": True, + }, "officer_5_name": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "officer_5_amount": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": True}, + "officer_5_amount": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + "gold": True, + }, # Exclusively FABS "published_fabs_id": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, "afa_generated_unique": {"delta": "STRING", "postgres": "TEXT", "gold": False}, @@ -167,23 +403,67 @@ "sai_number": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "uri": {"delta": "STRING", "postgres": "TEXT", "gold": False}, # Exclusively FPDS - "detached_award_procurement_id": {"delta": "INTEGER", "postgres": "INTEGER", "gold": False}, - "detached_award_proc_unique": {"delta": "STRING", "postgres": "TEXT", "gold": False}, + "detached_award_procurement_id": { + "delta": "INTEGER", + "postgres": "INTEGER", + "gold": False, + }, + "detached_award_proc_unique": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, "a_76_fair_act_action": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "a_76_fair_act_action_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "agency_id": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "airport_authority": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "alaskan_native_owned_corpo": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "alaskan_native_servicing_i": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "american_indian_owned_busi": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "asian_pacific_american_own": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "alaskan_native_owned_corpo": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "alaskan_native_servicing_i": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "american_indian_owned_busi": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "asian_pacific_american_own": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "base_and_all_options_value": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "base_exercised_options_val": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "black_american_owned_busin": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "c1862_land_grant_college": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "c1890_land_grant_college": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "c1994_land_grant_college": {"delta": 
"BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "c8a_program_participant": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "black_american_owned_busin": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "c1862_land_grant_college": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "c1890_land_grant_college": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "c1994_land_grant_college": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "c8a_program_participant": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "cage_code": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "city_local_government": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, "clinger_cohen_act_planning": {"delta": "STRING", "postgres": "TEXT", "gold": True}, @@ -192,8 +472,16 @@ "commercial_item_acquisitio": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "commercial_item_test_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "commercial_item_test_progr": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "community_developed_corpor": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "community_development_corp": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "community_developed_corpor": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "community_development_corp": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "consolidated_contract": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "consolidated_contract_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "construction_wage_rat_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, @@ -209,8 +497,16 @@ "contracting_officers_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "contracting_officers_deter": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "contracts": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "corporate_entity_not_tax_e": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "corporate_entity_tax_exemp": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "corporate_entity_not_tax_e": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "corporate_entity_tax_exemp": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "cost_accounting_stand_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "cost_accounting_standards": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "cost_or_pricing_data": {"delta": "STRING", "postgres": "TEXT", "gold": True}, @@ -218,17 +514,37 @@ "council_of_governments": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, "country_of_product_or_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "country_of_product_or_serv": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "county_local_government": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "county_local_government": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "current_total_value_award": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "dod_claimant_prog_cod_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "dod_claimant_program_code": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "domestic_or_foreign_e_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "domestic_or_foreign_entity": {"delta": "STRING", "postgres": "TEXT", 
"gold": True}, "domestic_shelter": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "dot_certified_disadvantage": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "economically_disadvantaged": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "educational_institution": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "emerging_small_business": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "dot_certified_disadvantage": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "economically_disadvantaged": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "educational_institution": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "emerging_small_business": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "epa_designated_produc_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "epa_designated_product": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "evaluated_preference": {"delta": "STRING", "postgres": "TEXT", "gold": True}, @@ -240,40 +556,100 @@ "fed_biz_opps": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "fed_biz_opps_description": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "federal_agency": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "federally_funded_research": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "for_profit_organization": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "federally_funded_research": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "for_profit_organization": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "foreign_funding": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "foreign_funding_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "foreign_government": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "foreign_owned_and_located": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "foreign_owned_and_located": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "foundation": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, "government_furnished_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "government_furnished_prope": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "grants": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "hispanic_american_owned_bu": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "hispanic_servicing_institu": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "historically_black_college": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "historically_underutilized": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "hispanic_american_owned_bu": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "hispanic_servicing_institu": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "historically_black_college": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "historically_underutilized": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "hospital_flag": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "housing_authorities_public": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "housing_authorities_public": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "idv_type": {"delta": "STRING", "postgres": 
"TEXT", "gold": True}, "idv_type_description": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "indian_tribe_federally_rec": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "indian_tribe_federally_rec": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "information_technolog_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "information_technology_com": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "inherently_government_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "inherently_government_func": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "inter_municipal_local_gove": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "inter_municipal_local_gove": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "interagency_contract_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "interagency_contracting_au": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "international_organization": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "international_organization": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "interstate_entity": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "joint_venture_economically": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "joint_venture_women_owned": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "joint_venture_economically": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "joint_venture_women_owned": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "labor_standards": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "labor_standards_descrip": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "labor_surplus_area_firm": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "limited_liability_corporat": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "labor_surplus_area_firm": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "limited_liability_corporat": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "local_area_set_aside": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "local_area_set_aside_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "local_government_owned": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, @@ -282,31 +658,63 @@ "materials_supplies_article": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "materials_supplies_descrip": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "minority_institution": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "minority_owned_business": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "minority_owned_business": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "multi_year_contract": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "multi_year_contract_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "multiple_or_single_aw_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "multiple_or_single_award_i": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "municipality_local_governm": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "municipality_local_governm": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "naics_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "naics_description": {"delta": "STRING", "postgres": "TEXT", 
"gold": True}, "national_interest_action": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "national_interest_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "native_american_owned_busi": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "native_hawaiian_owned_busi": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "native_hawaiian_servicing": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "native_american_owned_busi": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "native_hawaiian_owned_busi": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "native_hawaiian_servicing": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "nonprofit_organization": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, "number_of_actions": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "number_of_offers_received": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "ordering_period_end_date": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "organizational_type": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "other_minority_owned_busin": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "other_not_for_profit_organ": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "other_minority_owned_busin": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "other_not_for_profit_organ": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "other_statutory_authority": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "other_than_full_and_o_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "other_than_full_and_open_c": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "parent_award_id": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "partnership_or_limited_lia": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "partnership_or_limited_lia": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "performance_based_se_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "performance_based_service": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "period_of_perf_potential_e": {"delta": "STRING", "postgres": "TEXT", "gold": True}, @@ -317,16 +725,28 @@ "port_authority": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, "potential_total_value_awar": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "price_evaluation_adjustmen": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "private_university_or_coll": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "private_university_or_coll": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "product_or_service_code": {"delta": "STRING", "postgres": "TEXT", "gold": False}, - "product_or_service_description": {"delta": "STRING", "postgres": "TEXT", "gold": False}, + "product_or_service_description": { + "delta": "STRING", + "postgres": "TEXT", + "gold": False, + }, "program_acronym": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "program_system_or_equ_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "program_system_or_equipmen": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "pulled_from": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "purchase_card_as_paym_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "purchase_card_as_payment_m": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - 
"receives_contracts_and_gra": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "receives_contracts_and_gra": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "recovered_materials_s_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "recovered_materials_sustai": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "referenced_idv_agency_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, @@ -340,33 +760,85 @@ "research_description": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "sam_exception": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "sam_exception_description": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "sba_certified_8_a_joint_ve": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "school_district_local_gove": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "sba_certified_8_a_joint_ve": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "school_district_local_gove": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "school_of_forestry": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, "sea_transportation": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "sea_transportation_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "self_certified_small_disad": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "service_disabled_veteran_o": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "small_agricultural_coopera": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "small_business_competitive": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "small_disadvantaged_busine": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "self_certified_small_disad": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "service_disabled_veteran_o": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "small_agricultural_coopera": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "small_business_competitive": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "small_disadvantaged_busine": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "sole_proprietorship": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, "solicitation_date": {"delta": "DATE", "postgres": "DATE", "gold": True}, "solicitation_identifier": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "solicitation_procedur_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "solicitation_procedures": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "state_controlled_instituti": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "subchapter_s_corporation": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "subcontinent_asian_asian_i": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "state_controlled_instituti": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "subchapter_s_corporation": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, + "subcontinent_asian_asian_i": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "subcontracting_plan": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "subcontracting_plan_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "the_ability_one_program": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "the_ability_one_program": { 
+ "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "total_obligated_amount": {"delta": "STRING", "postgres": "TEXT", "gold": True}, - "township_local_government": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "township_local_government": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "transaction_number": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "transit_authority": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, "tribal_college": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "tribally_owned_business": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "tribally_owned_business": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "type_of_contract_pricing": {"delta": "STRING", "postgres": "TEXT", "gold": False}, "type_of_contract_pric_desc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, "type_of_idc": {"delta": "STRING", "postgres": "TEXT", "gold": True}, @@ -387,7 +859,11 @@ "veterinary_college": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, "veterinary_hospital": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, "woman_owned_business": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, - "women_owned_small_business": {"delta": "BOOLEAN", "postgres": "BOOLEAN", "gold": True}, + "women_owned_small_business": { + "delta": "BOOLEAN", + "postgres": "BOOLEAN", + "gold": True, + }, "program_activities": {"delta": "STRING", "postgres": "JSONB", "gold": False}, } DELTA_ONLY_COLUMNS = { @@ -397,14 +873,18 @@ **{k: v["delta"] for k, v in TRANSACTION_SEARCH_COLUMNS.items()}, **DELTA_ONLY_COLUMNS, } -TRANSACTION_SEARCH_POSTGRES_COLUMNS = {k: v["postgres"] for k, v in TRANSACTION_SEARCH_COLUMNS.items() if not v["gold"]} -TRANSACTION_SEARCH_POSTGRES_GOLD_COLUMNS = {k: v["postgres"] for k, v in TRANSACTION_SEARCH_COLUMNS.items()} +TRANSACTION_SEARCH_POSTGRES_COLUMNS = { + k: v["postgres"] for k, v in TRANSACTION_SEARCH_COLUMNS.items() if not v["gold"] +} +TRANSACTION_SEARCH_POSTGRES_GOLD_COLUMNS = { + k: v["postgres"] for k, v in TRANSACTION_SEARCH_COLUMNS.items() +} ALL_AWARD_TYPES = list(award_type_mapping.keys()) transaction_search_create_sql_string = rf""" CREATE OR REPLACE TABLE {{DESTINATION_TABLE}} ( - {", ".join([f'{key} {val}' for key, val in TRANSACTION_SEARCH_DELTA_COLUMNS.items()])} + {", ".join([f"{key} {val}" for key, val in TRANSACTION_SEARCH_DELTA_COLUMNS.items()])} ) USING DELTA LOCATION 's3a://{{SPARK_S3_BUCKET}}/{{DELTA_LAKE_S3_PATH}}/{{DESTINATION_DATABASE}}/{{DESTINATION_TABLE}}' diff --git a/usaspending_api/views.py b/usaspending_api/views.py index f8b97629d1..f640df5b10 100644 --- a/usaspending_api/views.py +++ b/usaspending_api/views.py @@ -1,9 +1,9 @@ -from django.http import HttpResponse -from django.views import View import json +from django.http import HttpRequest, HttpResponse +from django.views import View + class StatusView(View): - def get(self, request, format=None): - response_object = {"status": "running"} - return HttpResponse(json.dumps(response_object)) + def get(self, request: HttpRequest) -> HttpResponse: + return HttpResponse(json.dumps({"status": "running"})) diff --git a/uv.lock b/uv.lock index a8ad9c7a03..e1b655e1f6 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.12'", @@ -100,40 +100,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/09/71/0f5e89fcafc2aae704a421c899df4d56622a364731751ba93a1794f1879e/awscli-1.34.33-py3-none-any.whl", hash = "sha256:4ef6e2b0b72e7d33c0c5ce3ae499f26eb1e814e35deb036b708cdc46cb39ef27", size = 4520041, upload-time = "2024-10-03T19:18:14.963Z" }, ] -[[package]] -name = "black" -version = "24.10.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "click" }, - { name = "mypy-extensions" }, - { name = "packaging" }, - { name = "pathspec" }, - { name = "platformdirs" }, - { name = "tomli", marker = "python_full_version < '3.11'" }, - { name = "typing-extensions", marker = "python_full_version < '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/d8/0d/cc2fb42b8c50d80143221515dd7e4766995bd07c56c9a3ed30baf080b6dc/black-24.10.0.tar.gz", hash = "sha256:846ea64c97afe3bc677b761787993be4991810ecc7a4a937816dd6bddedc4875", size = 645813, upload-time = "2024-10-07T19:20:50.361Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a3/f3/465c0eb5cddf7dbbfe1fecd9b875d1dcf51b88923cd2c1d7e9ab95c6336b/black-24.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e6668650ea4b685440857138e5fe40cde4d652633b1bdffc62933d0db4ed9812", size = 1623211, upload-time = "2024-10-07T19:26:12.43Z" }, - { url = "https://files.pythonhosted.org/packages/df/57/b6d2da7d200773fdfcc224ffb87052cf283cec4d7102fab450b4a05996d8/black-24.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1c536fcf674217e87b8cc3657b81809d3c085d7bf3ef262ead700da345bfa6ea", size = 1457139, upload-time = "2024-10-07T19:25:06.453Z" }, - { url = "https://files.pythonhosted.org/packages/6e/c5/9023b7673904a5188f9be81f5e129fff69f51f5515655fbd1d5a4e80a47b/black-24.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:649fff99a20bd06c6f727d2a27f401331dc0cc861fb69cde910fe95b01b5928f", size = 1753774, upload-time = "2024-10-07T19:23:58.47Z" }, - { url = "https://files.pythonhosted.org/packages/e1/32/df7f18bd0e724e0d9748829765455d6643ec847b3f87e77456fc99d0edab/black-24.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:fe4d6476887de70546212c99ac9bd803d90b42fc4767f058a0baa895013fbb3e", size = 1414209, upload-time = "2024-10-07T19:24:42.54Z" }, - { url = "https://files.pythonhosted.org/packages/c2/cc/7496bb63a9b06a954d3d0ac9fe7a73f3bf1cd92d7a58877c27f4ad1e9d41/black-24.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5a2221696a8224e335c28816a9d331a6c2ae15a2ee34ec857dcf3e45dbfa99ad", size = 1607468, upload-time = "2024-10-07T19:26:14.966Z" }, - { url = "https://files.pythonhosted.org/packages/2b/e3/69a738fb5ba18b5422f50b4f143544c664d7da40f09c13969b2fd52900e0/black-24.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f9da3333530dbcecc1be13e69c250ed8dfa67f43c4005fb537bb426e19200d50", size = 1437270, upload-time = "2024-10-07T19:25:24.291Z" }, - { url = "https://files.pythonhosted.org/packages/c9/9b/2db8045b45844665c720dcfe292fdaf2e49825810c0103e1191515fc101a/black-24.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4007b1393d902b48b36958a216c20c4482f601569d19ed1df294a496eb366392", size = 1737061, upload-time = "2024-10-07T19:23:52.18Z" }, - { url = "https://files.pythonhosted.org/packages/a3/95/17d4a09a5be5f8c65aa4a361444d95edc45def0de887810f508d3f65db7a/black-24.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:394d4ddc64782e51153eadcaaca95144ac4c35e27ef9b0a42e121ae7e57a9175", size = 1423293, upload-time = "2024-10-07T19:24:41.7Z" }, - { url = 
"https://files.pythonhosted.org/packages/90/04/bf74c71f592bcd761610bbf67e23e6a3cff824780761f536512437f1e655/black-24.10.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b5e39e0fae001df40f95bd8cc36b9165c5e2ea88900167bddf258bacef9bbdc3", size = 1644256, upload-time = "2024-10-07T19:27:53.355Z" }, - { url = "https://files.pythonhosted.org/packages/4c/ea/a77bab4cf1887f4b2e0bce5516ea0b3ff7d04ba96af21d65024629afedb6/black-24.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d37d422772111794b26757c5b55a3eade028aa3fde43121ab7b673d050949d65", size = 1448534, upload-time = "2024-10-07T19:26:44.953Z" }, - { url = "https://files.pythonhosted.org/packages/4e/3e/443ef8bc1fbda78e61f79157f303893f3fddf19ca3c8989b163eb3469a12/black-24.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:14b3502784f09ce2443830e3133dacf2c0110d45191ed470ecb04d0f5f6fcb0f", size = 1761892, upload-time = "2024-10-07T19:24:10.264Z" }, - { url = "https://files.pythonhosted.org/packages/52/93/eac95ff229049a6901bc84fec6908a5124b8a0b7c26ea766b3b8a5debd22/black-24.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:30d2c30dc5139211dda799758559d1b049f7f14c580c409d6ad925b74a4208a8", size = 1434796, upload-time = "2024-10-07T19:25:06.239Z" }, - { url = "https://files.pythonhosted.org/packages/d0/a0/a993f58d4ecfba035e61fca4e9f64a2ecae838fc9f33ab798c62173ed75c/black-24.10.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1cbacacb19e922a1d75ef2b6ccaefcd6e93a2c05ede32f06a21386a04cedb981", size = 1643986, upload-time = "2024-10-07T19:28:50.684Z" }, - { url = "https://files.pythonhosted.org/packages/37/d5/602d0ef5dfcace3fb4f79c436762f130abd9ee8d950fa2abdbf8bbc555e0/black-24.10.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1f93102e0c5bb3907451063e08b9876dbeac810e7da5a8bfb7aeb5a9ef89066b", size = 1448085, upload-time = "2024-10-07T19:28:12.093Z" }, - { url = "https://files.pythonhosted.org/packages/47/6d/a3a239e938960df1a662b93d6230d4f3e9b4a22982d060fc38c42f45a56b/black-24.10.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ddacb691cdcdf77b96f549cf9591701d8db36b2f19519373d60d31746068dbf2", size = 1760928, upload-time = "2024-10-07T19:24:15.233Z" }, - { url = "https://files.pythonhosted.org/packages/dd/cf/af018e13b0eddfb434df4d9cd1b2b7892bab119f7a20123e93f6910982e8/black-24.10.0-cp313-cp313-win_amd64.whl", hash = "sha256:680359d932801c76d2e9c9068d05c6b107f2584b2a5b88831c83962eb9984c1b", size = 1436875, upload-time = "2024-10-07T19:24:42.762Z" }, - { url = "https://files.pythonhosted.org/packages/8d/a7/4b27c50537ebca8bec139b872861f9d2bf501c5ec51fcf897cb924d9e264/black-24.10.0-py3-none-any.whl", hash = "sha256:3bb2b7a1f7b685f85b11fed1ef10f8a9148bceb49853e47a294a3dd963c1dd7d", size = 206898, upload-time = "2024-10-07T19:20:48.317Z" }, -] - [[package]] name = "boto3" version = "1.35.33" @@ -785,20 +751,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/41/ea/6e5568ef338ba918be8c8fccc0a717d824c13187fe5cb9e8ad8530d113d1/fiscalyear-0.4.0-py3-none-any.whl", hash = "sha256:8adb8022a76cc52974d059d176ec3f33b2d7a6c1f72ac356702bc70e1e5e4d92", size = 8417, upload-time = "2022-02-17T03:18:26.523Z" }, ] -[[package]] -name = "flake8" -version = "7.1.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "mccabe" }, - { name = "pycodestyle" }, - { name = "pyflakes" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/4e/34/64f8a43736d9862ced7dd0ea5c3ed99815b8ff4b826a4f3bfd3a1b0639b1/flake8-7.1.0.tar.gz", hash = "sha256:48a07b626b55236e0fb4784ee69a465fbf59d79eec1f5b4785c3d3bc57d17aa5", size = 48240, upload-time = "2024-06-15T21:37:07.633Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/dc/43/d5147aadaa52558e94e024811f2f9543b4bd7203b3a9659eeb5dff9c61b3/flake8-7.1.0-py2.py3-none-any.whl", hash = "sha256:2e416edcc62471a64cea09353f4e7bdba32aeb079b6e360554c659a122b1bc6a", size = 57569, upload-time = "2024-06-15T21:37:05.342Z" }, -] - [[package]] name = "google-auth" version = "2.40.3" @@ -1024,15 +976,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/38/04/37055b7013dfaaf66e3a9a51e46857cc9be151476a891b995fa70da7e139/marshmallow-3.21.1-py3-none-any.whl", hash = "sha256:f085493f79efb0644f270a9bf2892843142d80d7174bbbd2f3713f2a589dc633", size = 49362, upload-time = "2024-03-04T20:21:15.753Z" }, ] -[[package]] -name = "mccabe" -version = "0.7.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e7/ff/0ffefdcac38932a54d2b5eed4e0ba8a408f215002cd178ad1df0f2806ff8/mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325", size = 9658, upload-time = "2022-01-24T01:14:51.113Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/27/1a/1f68f9ba0c207934b35b86a8ca3aad8395a3d6dd7921c0686e23853ff5a9/mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e", size = 7350, upload-time = "2022-01-24T01:14:49.62Z" }, -] - [[package]] name = "mdurl" version = "0.1.2" @@ -1640,15 +1583,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ab/5f/b38085618b950b79d2d9164a711c52b10aefc0ae6833b96f626b7021b2ed/pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a", size = 13098436, upload-time = "2024-09-20T13:09:48.112Z" }, ] -[[package]] -name = "pathspec" -version = "0.12.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ca/bc/f35b8446f4531a7cb215605d100cd88b7ac6f44ab3fc94870c120ab3adbf/pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712", size = 51043, upload-time = "2023-12-10T22:30:45Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191, upload-time = "2023-12-10T22:30:43.14Z" }, -] - [[package]] name = "pip" version = "23.2.1" @@ -1775,15 +1709,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl", hash = "sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a", size = 181259, upload-time = "2025-03-28T02:41:19.028Z" }, ] -[[package]] -name = "pycodestyle" -version = "2.12.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/43/aa/210b2c9aedd8c1cbeea31a50e42050ad56187754b34eb214c46709445801/pycodestyle-2.12.1.tar.gz", hash = "sha256:6838eae08bbce4f6accd5d5572075c63626a15ee3e6f842df996bf62f6d73521", size = 39232, upload-time = "2024-08-04T20:26:54.576Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/3a/d8/a211b3f85e99a0daa2ddec96c949cac6824bd305b040571b82a03dd62636/pycodestyle-2.12.1-py2.py3-none-any.whl", hash = "sha256:46f0fb92069a7c28ab7bb558f05bfc0110dac69a0cd23c61ea0040283a9d78b3", size = 31284, upload-time = "2024-08-04T20:26:53.173Z" }, -] - [[package]] name = "pycparser" version = "2.22" @@ -1817,15 +1742,6 @@ dotenv = [ { name = "python-dotenv" }, ] -[[package]] -name = "pyflakes" -version = "3.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/57/f9/669d8c9c86613c9d568757c7f5824bd3197d7b1c6c27553bc5618a27cce2/pyflakes-3.2.0.tar.gz", hash = "sha256:1c61603ff154621fb2a9172037d84dca3500def8c8b630657d1701f026f8af3f", size = 63788, upload-time = "2024-01-05T00:28:47.703Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d4/d7/f1b7db88d8e4417c5d47adad627a93547f44bdc9028372dbd2313f34a855/pyflakes-3.2.0-py2.py3-none-any.whl", hash = "sha256:84b5be138a2dfbb40689ca07e2152deb896a65c3a3e24c251c5c62489568074a", size = 62725, upload-time = "2024-01-05T00:28:45.903Z" }, -] - [[package]] name = "pygments" version = "2.19.1" @@ -2088,6 +2004,32 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/93/0c0f002031f18b53af7a6166103c02b9c0667be528944137cc954ec921b3/rsa-4.7.2-py3-none-any.whl", hash = "sha256:78f9a9bf4e7be0c5ded4583326e7461e3a3c5aae24073648b4bdfa797d78c9d2", size = 34505, upload-time = "2021-02-24T10:55:03.55Z" }, ] +[[package]] +name = "ruff" +version = "0.14.14" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2e/06/f71e3a86b2df0dfa2d2f72195941cd09b44f87711cb7fa5193732cb9a5fc/ruff-0.14.14.tar.gz", hash = "sha256:2d0f819c9a90205f3a867dbbd0be083bee9912e170fd7d9704cc8ae45824896b", size = 4515732, upload-time = "2026-01-22T22:30:17.527Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d2/89/20a12e97bc6b9f9f68343952da08a8099c57237aef953a56b82711d55edd/ruff-0.14.14-py3-none-linux_armv6l.whl", hash = "sha256:7cfe36b56e8489dee8fbc777c61959f60ec0f1f11817e8f2415f429552846aed", size = 10467650, upload-time = "2026-01-22T22:30:08.578Z" }, + { url = "https://files.pythonhosted.org/packages/a3/b1/c5de3fd2d5a831fcae21beda5e3589c0ba67eec8202e992388e4b17a6040/ruff-0.14.14-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:6006a0082336e7920b9573ef8a7f52eec837add1265cc74e04ea8a4368cd704c", size = 10883245, upload-time = "2026-01-22T22:30:04.155Z" }, + { url = "https://files.pythonhosted.org/packages/b8/7c/3c1db59a10e7490f8f6f8559d1db8636cbb13dccebf18686f4e3c9d7c772/ruff-0.14.14-py3-none-macosx_11_0_arm64.whl", hash = "sha256:026c1d25996818f0bf498636686199d9bd0d9d6341c9c2c3b62e2a0198b758de", size = 10231273, upload-time = "2026-01-22T22:30:34.642Z" }, + { url = "https://files.pythonhosted.org/packages/a1/6e/5e0e0d9674be0f8581d1f5e0f0a04761203affce3232c1a1189d0e3b4dad/ruff-0.14.14-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f666445819d31210b71e0a6d1c01e24447a20b85458eea25a25fe8142210ae0e", size = 10585753, upload-time = "2026-01-22T22:30:31.781Z" }, + { url = "https://files.pythonhosted.org/packages/23/09/754ab09f46ff1884d422dc26d59ba18b4e5d355be147721bb2518aa2a014/ruff-0.14.14-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3c0f18b922c6d2ff9a5e6c3ee16259adc513ca775bcf82c67ebab7cbd9da5bc8", size = 10286052, upload-time = "2026-01-22T22:30:24.827Z" }, + { url = 
"https://files.pythonhosted.org/packages/c8/cc/e71f88dd2a12afb5f50733851729d6b571a7c3a35bfdb16c3035132675a0/ruff-0.14.14-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1629e67489c2dea43e8658c3dba659edbfd87361624b4040d1df04c9740ae906", size = 11043637, upload-time = "2026-01-22T22:30:13.239Z" }, + { url = "https://files.pythonhosted.org/packages/67/b2/397245026352494497dac935d7f00f1468c03a23a0c5db6ad8fc49ca3fb2/ruff-0.14.14-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:27493a2131ea0f899057d49d303e4292b2cae2bb57253c1ed1f256fbcd1da480", size = 12194761, upload-time = "2026-01-22T22:30:22.542Z" }, + { url = "https://files.pythonhosted.org/packages/5b/06/06ef271459f778323112c51b7587ce85230785cd64e91772034ddb88f200/ruff-0.14.14-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:01ff589aab3f5b539e35db38425da31a57521efd1e4ad1ae08fc34dbe30bd7df", size = 12005701, upload-time = "2026-01-22T22:30:20.499Z" }, + { url = "https://files.pythonhosted.org/packages/41/d6/99364514541cf811ccc5ac44362f88df66373e9fec1b9d1c4cc830593fe7/ruff-0.14.14-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1cc12d74eef0f29f51775f5b755913eb523546b88e2d733e1d701fe65144e89b", size = 11282455, upload-time = "2026-01-22T22:29:59.679Z" }, + { url = "https://files.pythonhosted.org/packages/ca/71/37daa46f89475f8582b7762ecd2722492df26421714a33e72ccc9a84d7a5/ruff-0.14.14-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb8481604b7a9e75eff53772496201690ce2687067e038b3cc31aaf16aa0b974", size = 11215882, upload-time = "2026-01-22T22:29:57.032Z" }, + { url = "https://files.pythonhosted.org/packages/2c/10/a31f86169ec91c0705e618443ee74ede0bdd94da0a57b28e72db68b2dbac/ruff-0.14.14-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:14649acb1cf7b5d2d283ebd2f58d56b75836ed8c6f329664fa91cdea19e76e66", size = 11180549, upload-time = "2026-01-22T22:30:27.175Z" }, + { url = "https://files.pythonhosted.org/packages/fd/1e/c723f20536b5163adf79bdd10c5f093414293cdf567eed9bdb7b83940f3f/ruff-0.14.14-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:e8058d2145566510790eab4e2fad186002e288dec5e0d343a92fe7b0bc1b3e13", size = 10543416, upload-time = "2026-01-22T22:30:01.964Z" }, + { url = "https://files.pythonhosted.org/packages/3e/34/8a84cea7e42c2d94ba5bde1d7a4fae164d6318f13f933d92da6d7c2041ff/ruff-0.14.14-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:e651e977a79e4c758eb807f0481d673a67ffe53cfa92209781dfa3a996cf8412", size = 10285491, upload-time = "2026-01-22T22:30:29.51Z" }, + { url = "https://files.pythonhosted.org/packages/55/ef/b7c5ea0be82518906c978e365e56a77f8de7678c8bb6651ccfbdc178c29f/ruff-0.14.14-py3-none-musllinux_1_2_i686.whl", hash = "sha256:cc8b22da8d9d6fdd844a68ae937e2a0adf9b16514e9a97cc60355e2d4b219fc3", size = 10733525, upload-time = "2026-01-22T22:30:06.499Z" }, + { url = "https://files.pythonhosted.org/packages/6a/5b/aaf1dfbcc53a2811f6cc0a1759de24e4b03e02ba8762daabd9b6bd8c59e3/ruff-0.14.14-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:16bc890fb4cc9781bb05beb5ab4cd51be9e7cb376bf1dd3580512b24eb3fda2b", size = 11315626, upload-time = "2026-01-22T22:30:36.848Z" }, + { url = "https://files.pythonhosted.org/packages/2c/aa/9f89c719c467dfaf8ad799b9bae0df494513fb21d31a6059cb5870e57e74/ruff-0.14.14-py3-none-win32.whl", hash = "sha256:b530c191970b143375b6a68e6f743800b2b786bbcf03a7965b06c4bf04568167", size = 10502442, upload-time = "2026-01-22T22:30:38.93Z" }, + { url = 
"https://files.pythonhosted.org/packages/87/44/90fa543014c45560cae1fffc63ea059fb3575ee6e1cb654562197e5d16fb/ruff-0.14.14-py3-none-win_amd64.whl", hash = "sha256:3dde1435e6b6fe5b66506c1dff67a421d0b7f6488d466f651c07f4cab3bf20fd", size = 11630486, upload-time = "2026-01-22T22:30:10.852Z" }, + { url = "https://files.pythonhosted.org/packages/9e/6a/40fee331a52339926a92e17ae748827270b288a35ef4a15c9c8f2ec54715/ruff-0.14.14-py3-none-win_arm64.whl", hash = "sha256:56e6981a98b13a32236a72a8da421d7839221fa308b223b9283312312e5ac76c", size = 10920448, upload-time = "2026-01-22T22:30:15.417Z" }, +] + [[package]] name = "s3transfer" version = "0.10.4" @@ -2300,11 +2242,9 @@ awscli = [ { name = "awscli" }, ] dev = [ - { name = "black" }, { name = "click" }, { name = "docker" }, { name = "dredd-hooks" }, - { name = "flake8" }, { name = "importlib-metadata" }, { name = "mock" }, { name = "model-bakery" }, @@ -2315,6 +2255,7 @@ dev = [ { name = "pytest-django" }, { name = "pytest-pretty" }, { name = "pytest-xdist" }, + { name = "ruff" }, ] server = [ { name = "django-redis" }, @@ -2335,7 +2276,6 @@ requires-dist = [ { name = "asyncpg", specifier = "==0.29.*" }, { name = "attrs", specifier = "==23.2.*" }, { name = "awscli", marker = "extra == 'awscli'", specifier = "==1.34.*" }, - { name = "black", marker = "extra == 'dev'", specifier = "==24.10.0" }, { name = "boto3", specifier = ">=1.34,<1.36" }, { name = "certifi", specifier = "==2024.7.4" }, { name = "click", marker = "extra == 'dev'", specifier = "==8.1.7" }, @@ -2362,7 +2302,6 @@ requires-dist = [ { name = "et-xmlfile", specifier = "==1.1.0" }, { name = "filelock", specifier = "==3.13.1" }, { name = "fiscalyear", specifier = "==0.4.0" }, - { name = "flake8", marker = "extra == 'dev'", specifier = "==7.1.0" }, { name = "importlib-metadata", marker = "extra == 'dev'", specifier = "==8.5.0" }, { name = "markdown", specifier = "==3.5.*" }, { name = "marshmallow", specifier = "==3.21.1" }, @@ -2420,6 +2359,7 @@ requires-dist = [ { name = "python-json-logger", specifier = "==2.0.7" }, { name = "requests", specifier = "==2.31.*" }, { name = "retrying", specifier = "==1.3.4" }, + { name = "ruff", marker = "extra == 'dev'", specifier = "==0.14.14" }, { name = "setuptools", marker = "extra == 'server'", specifier = ">=68.1.2" }, { name = "sqlparse", specifier = "==0.5.*" }, { name = "supervisor", marker = "extra == 'server'", specifier = "==4.1.0" },
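Note on the column-spec pattern reformatted throughout the transaction_search hunks above: each TRANSACTION_SEARCH_COLUMNS entry maps a column name to its Delta Lake type, its Postgres type, and a "gold" flag, and the derived dicts near the end of that file split the spec on that flag (the flag appears to mark columns carried only in the full "gold" Postgres table, since the non-gold dict filters them out). A minimal, self-contained sketch of that pattern follows; the toy COLUMNS entries are invented for illustration, and the sketch condenses the file's separate delta/postgres lookups into one dict, but the comprehension shapes mirror the diff:

    # Toy column specs; the real TRANSACTION_SEARCH_COLUMNS dict is far larger
    # (see the hunks above), but each value has the same three keys.
    COLUMNS = {
        "recipient_hash": {"delta": "STRING", "postgres": "TEXT", "gold": False},
        "officer_1_amount": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", "gold": True},
    }

    # Non-gold columns only, as in TRANSACTION_SEARCH_POSTGRES_COLUMNS.
    postgres_columns = {k: v["postgres"] for k, v in COLUMNS.items() if not v["gold"]}

    # Every column regardless of flag, as in TRANSACTION_SEARCH_POSTGRES_GOLD_COLUMNS.
    postgres_gold_columns = {k: v["postgres"] for k, v in COLUMNS.items()}

    # Delta DDL fragment, as in the join inside transaction_search_create_sql_string.
    ddl = ", ".join(f"{key} {val['delta']}" for key, val in COLUMNS.items())
    print(ddl)  # -> recipient_hash STRING, officer_1_amount NUMERIC(23,2)

With ruff pinned in the dev extra (per the uv.lock hunks above), the rules configured under [tool.ruff.lint] in pyproject.toml presumably drive both the pre-commit hook and local `ruff check` runs, so the same lint settings apply everywhere the tool is invoked.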