From a8adefe0df213c72f2c37e4d428aa348d2904fdf Mon Sep 17 00:00:00 2001
From: Alex Chen
Date: Wed, 11 Mar 2026 21:43:03 -0700
Subject: [PATCH] Handle cross-year dedupes with deletes in student academic records

---
Notes (not part of the commit message):
- Line breaks and indentation in this patch were reconstructed from a
  whitespace-collapsed copy. The hunk line counts match the hunk header and
  the diffstat, but leading whitespace on context lines is assumed (4-space
  steps) -- verify against the target file before applying.
- The From: header carries no e-mail address in this copy.
- NOTE(review): the `where not is_deleted` filter is only rendered when
  `is_incremental()` is false, so on incremental runs a deleted row from a
  later api_year can still win the cross-year dedupe -- confirm this is
  intended.

 .../stg_ef3__student_academic_records.sql     | 30 ++++++++++++++-----
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/models/staging/edfi_3/stage/stg_ef3__student_academic_records.sql b/models/staging/edfi_3/stage/stg_ef3__student_academic_records.sql
index c800c912..ef030f65 100644
--- a/models/staging/edfi_3/stage/stg_ef3__student_academic_records.sql
+++ b/models/staging/edfi_3/stage/stg_ef3__student_academic_records.sql
@@ -29,17 +29,33 @@ keyed as (
         {{ extract_extension(model_name=this.name, flatten=True) }}
     from base_academic_records
 ),
-deduped as (
+-- For x-year resources (those that do not include year in unique key), there's an edge case
+-- where a record we need for historic reporting could have been deleted in a later year. To avoid removing these,
+-- we need to first dedupe within year using last_modified_timestamp, then dedupe across years to get to a single record
+deduped_within_year as (
     {{
         dbt_utils.deduplicate(
             relation='keyed',
+            partition_by='k_student_academic_record, api_year',
+            order_by='last_modified_timestamp desc, pull_timestamp desc'
+        )
+    }}
+),
+-- .. then remove deletes as they shouldn't be used in x-year dedupe
+deduped_within_year_no_deletes as (
+    select * from deduped_within_year
+    {% if not is_incremental() %}
+    where not is_deleted
+    {% endif %}
+),
+-- .. and then dedupe across years to enforce the correct grain, keeping latest year that wasn't deleted
+deduped_across_years as (
+    {{
+        dbt_utils.deduplicate(
+            relation='deduped_within_year_no_deletes',
             partition_by='k_student_academic_record',
-            order_by='api_year desc, last_modified_timestamp desc, pull_timestamp desc'
+            order_by='api_year desc'
         )
     }}
 )
-select * from deduped
-{% if not is_incremental() %}
-where not is_deleted
-{% endif %}
-
+select * from deduped_across_years