diff --git a/models/staging/edfi_3/stage/stg_ef3__student_academic_records.sql b/models/staging/edfi_3/stage/stg_ef3__student_academic_records.sql
index c800c912..ef030f65 100644
--- a/models/staging/edfi_3/stage/stg_ef3__student_academic_records.sql
+++ b/models/staging/edfi_3/stage/stg_ef3__student_academic_records.sql
@@ -29,17 +29,33 @@ keyed as (
         {{ extract_extension(model_name=this.name, flatten=True) }}
     from base_academic_records
 ),
-deduped as (
+-- For x-year resources (those whose unique key does not include the year), there is an edge case
+-- where a record needed for historic reporting was deleted in a later year. To avoid dropping it,
+-- first dedupe within each api_year (latest last_modified_timestamp, then pull_timestamp), then dedupe across years.
+deduped_within_year as (
     {{
         dbt_utils.deduplicate(
             relation='keyed',
+            partition_by='k_student_academic_record, api_year',
+            order_by='last_modified_timestamp desc, pull_timestamp desc'
+        )
+    }}
+),
+-- .. then drop deleted rows so they cannot win the x-year dedupe. NOTE(review): this filter is skipped on incremental runs - confirm intended.
+deduped_within_year_no_deletes as (
+    select * from deduped_within_year
+    {% if not is_incremental() %}
+    where not is_deleted
+    {% endif %}
+),
+-- .. and finally dedupe across years to one row per k_student_academic_record, keeping the latest api_year that survived the step above.
+deduped_across_years as (
+    {{
+        dbt_utils.deduplicate(
+            relation='deduped_within_year_no_deletes',
             partition_by='k_student_academic_record',
-            order_by='api_year desc, last_modified_timestamp desc, pull_timestamp desc'
+            order_by='api_year desc'
         )
     }}
 )
-select * from deduped
-{% if not is_incremental() %}
-where not is_deleted
-{% endif %}
-
+select * from deduped_across_years