diff --git a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py index e1e07b1b7..1e1fcd483 100644 --- a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py @@ -197,21 +197,11 @@ def get_library_variables_metadata(self) -> DatasetInterface: and self.dataset_metadata.rdomain ): domain = "SUPPQUAL" - elif ( - not self.dataset_metadata.domain - and not self.dataset_metadata.rdomain - and "rel" in self.dataset_metadata.name.lower() - ): - if self.dataset_metadata.name.lower().startswith( - "ap" - ) and self.dataset_metadata.name.lower()[2:].startswith("rel"): - domain = self.dataset_metadata.name[2:] - else: - domain = self.dataset_metadata.name else: domain = self.dataset_metadata.domain variables: List[dict] = sdtm_utilities.get_variables_metadata_from_standard( - domain=domain, library_metadata=self.library_metadata + domain=self.dataset_metadata.unsplit_name, + library_metadata=self.library_metadata, ) variables_metadata: dict = self.library_metadata.variables_metadata.get( domain, {} diff --git a/cdisc_rules_engine/operations/base_operation.py b/cdisc_rules_engine/operations/base_operation.py index ed66cdb96..53dc15bcb 100644 --- a/cdisc_rules_engine/operations/base_operation.py +++ b/cdisc_rules_engine/operations/base_operation.py @@ -224,26 +224,9 @@ def _expand_operation_results_in_grouping(self, grouping_list): def _get_variables_metadata_from_standard(self) -> List[dict]: # TODO: Update to handle other standard types: adam, cdash, etc. - target_metadata = None - for ds in self.params.datasets: - if ds.unsplit_name == self.params.domain: - target_metadata = ds - break - if ( - target_metadata - and hasattr(target_metadata, "is_supp") - and target_metadata.is_supp - ): - domain_for_library = "SUPPQUAL" - elif target_metadata and "rel" in target_metadata.name.lower(): - if target_metadata.name.lower().startswith( - "ap" - ) and target_metadata.name.lower()[2:].startswith("rel"): - domain_for_library = target_metadata.name[2:] - else: - domain_for_library = target_metadata.name - else: - domain_for_library = self.params.domain + + # self.params.domain is unsplit_name + domain_for_library = self.params.domain return sdtm_utilities.get_variables_metadata_from_standard( domain_for_library, self.library_metadata, diff --git a/cdisc_rules_engine/operations/library_model_column_order.py b/cdisc_rules_engine/operations/library_model_column_order.py index 6620ae882..d60fd58ed 100644 --- a/cdisc_rules_engine/operations/library_model_column_order.py +++ b/cdisc_rules_engine/operations/library_model_column_order.py @@ -5,6 +5,7 @@ class LibraryModelColumnOrder(BaseOperation): def _execute_operation(self): """ Fetches column order for a given domain from the CDISC library. + Self.params.domain is SDTMDatasetMetadata.unsplit_name Returns it as a Series of lists like: 0 ["STUDYID", "DOMAIN", ...] 1 ["STUDYID", "DOMAIN", ...] diff --git a/cdisc_rules_engine/utilities/sdtm_utilities.py b/cdisc_rules_engine/utilities/sdtm_utilities.py index 97168e72f..20c8d9718 100644 --- a/cdisc_rules_engine/utilities/sdtm_utilities.py +++ b/cdisc_rules_engine/utilities/sdtm_utilities.py @@ -54,6 +54,23 @@ def get_tabulation_model_type_and_version(model_link: dict) -> Tuple: def get_variables_metadata_from_standard(domain, library_metadata): # noqa + add_AP = False + original_domain = domain + if ( + domain + and (domain.upper().startswith("SUPP") or domain.upper().startswith("SQ")) + and len(domain) > 2 + ): + if domain.upper().startswith("SQ"): + parent_domain = domain[2:] + if parent_domain.upper().startswith("AP"): + add_AP = True + domain = "SUPPQUAL" + elif domain and domain.upper().startswith("AP"): + domain = domain[2:] + original_domain = domain + add_AP = True + standard_details = library_metadata.standard_metadata model_details = library_metadata.model_metadata is_custom = domain not in standard_details.get("domains", {}) @@ -70,13 +87,22 @@ def get_variables_metadata_from_standard(domain, library_metadata): # noqa class_variables_metadata, timing_metadata, ) = get_allowed_class_variables(model_details, model_class_details) + if add_AP: + ap_class_details = get_class_metadata(model_details, "ASSOCIATED PERSONS") + ap_identifiers = ap_class_details.get("classVariables", []) + identifiers_metadata = [ + v + for v in identifiers_metadata + ap_identifiers + if v.get("name") != "USUBJID" + ] + identifiers_metadata.sort(key=lambda item: int(item["ordinal"])) model_variables = [] for var_list in [ identifiers_metadata, class_variables_metadata, timing_metadata, ]: - replace_variable_wildcards(var_list, domain, model_variables) + replace_variable_wildcards(var_list, original_domain, model_variables) # Custom domains only pull from model hierarchy if is_custom: variables_metadata = model_variables @@ -90,13 +116,21 @@ def get_variables_metadata_from_standard(domain, library_metadata): # noqa var["name"]: i for i, var in enumerate(variables_metadata) } for ig_var in ig_variables: - ig_var_name = ig_var["name"] + if "--" in ig_var["name"]: + ig_var_copy = copy.deepcopy(ig_var) + ig_var_copy["name"] = ig_var_copy["name"].replace( + "--", original_domain + ) + ig_var_to_use = ig_var_copy + else: + ig_var_to_use = ig_var + ig_var_name = ig_var_to_use["name"] if ig_var_name in model_vars_by_name: - variables_metadata[model_vars_by_name[ig_var_name]] = ig_var + variables_metadata[model_vars_by_name[ig_var_name]] = ig_var_to_use else: # if a variable exists in the IG but not in the model, # insert it at the end of the its section - ig_var_role = ig_var.get("role") + ig_var_role = ig_var_to_use.get("role") if ig_var_role == "Identifier": identifiers_length = len(identifiers_metadata) insertion_point = identifiers_length @@ -107,12 +141,28 @@ def get_variables_metadata_from_standard(domain, library_metadata): # noqa insertion_point = ( len(variables_metadata) - timing_metadata_length ) - variables_metadata.insert(insertion_point, ig_var) + variables_metadata.insert(insertion_point, ig_var_to_use) model_vars_by_name = { var["name"]: i for i, var in enumerate(variables_metadata) } else: - variables_metadata = ig_variables + if add_AP: + ap_class_details = get_class_metadata( + model_details, "ASSOCIATED PERSONS" + ) + ap_identifiers = ap_class_details.get("classVariables", []) + ig_variables = [ + v + for v in ig_variables + ap_identifiers + if v.get("name") != "USUBJID" + ] + ig_variables.sort(key=lambda item: int(item["ordinal"])) + variables_metadata = [] + replace_variable_wildcards( + ig_variables, original_domain, variables_metadata + ) + else: + variables_metadata = ig_variables return variables_metadata @@ -238,12 +288,22 @@ def get_variables_metadata_from_standard_model( # noqa classes outside of general observation, we check the model for their definition if they are not there, differ to the standard definition of the domain """ + add_AP = False + original_domain = domain if ( domain and (domain.upper().startswith("SUPP") or domain.upper().startswith("SQ")) and len(domain) > 2 ): + if domain.upper().startswith("SQ"): + parent_domain = domain[2:] + if parent_domain.upper().startswith("AP"): + add_AP = True domain = "SUPPQUAL" + elif domain and domain.upper().startswith("AP"): + domain = domain[2:] + original_domain = domain + add_AP = True standard_details = library_metadata.standard_metadata model_details = library_metadata.model_metadata @@ -258,38 +318,85 @@ def get_variables_metadata_from_standard_model( # noqa class_variables_metadata, timing_metadata, ) = get_allowed_class_variables(model_details, model_class_details) + if add_AP: + ap_class_details = get_class_metadata(model_details, "ASSOCIATED PERSONS") + ap_identifiers = ap_class_details.get("classVariables", []) + identifiers_metadata = identifiers_metadata + ap_identifiers + # Remove USUBJID from identifiers and re-sort + identifiers_metadata = [ + v for v in identifiers_metadata if v.get("name") != "USUBJID" + ] + identifiers_metadata.sort(key=lambda item: int(item["ordinal"])) variables_metadata = [] - if identifiers_metadata: - variables_metadata = identifiers_metadata - variables_metadata = variables_metadata + class_variables_metadata - if timing_metadata: - variables_metadata = variables_metadata + timing_metadata + for var_list in [ + identifiers_metadata, + class_variables_metadata, + timing_metadata, + ]: + replace_variable_wildcards(var_list, original_domain, variables_metadata) return variables_metadata else: - # First, try to get class metadata and check for classVariables i.e. AP class + # First, try to get class metadata and check for classVariables class_details = get_class_metadata(model_details, class_name) class_variables = class_details.get("classVariables", []) if class_variables: + if add_AP: + ap_class_details = get_class_metadata( + model_details, "ASSOCIATED PERSONS" + ) + ap_identifiers = ap_class_details.get("classVariables", []) + # Filter out USUBJID from AP identifiers only, then add to class_variables + filtered_ap_identifiers = [ + v for v in ap_identifiers if v.get("name") != "USUBJID" + ] + class_variables = class_variables + filtered_ap_identifiers class_variables.sort(key=lambda item: int(item["ordinal"])) - return class_variables + variables_metadata = [] + replace_variable_wildcards( + class_variables, original_domain, variables_metadata + ) + return variables_metadata else: # Second, check if domain exists in model datasets domain_details = get_model_domain_metadata(model_details, domain) if domain_details: dataset_variables = domain_details.get("datasetVariables", []) - if dataset_variables: - dataset_variables.sort(key=lambda item: int(item["ordinal"])) - return dataset_variables + dataset_variables.sort(key=lambda item: int(item["ordinal"])) + if add_AP: + ap_class_details = get_class_metadata( + model_details, "ASSOCIATED PERSONS" + ) + ap_identifiers = ap_class_details.get("classVariables", []) + dataset_variables = [ + v + for v in dataset_variables + ap_identifiers + if v.get("name") != "USUBJID" + ] + variables_metadata = [] + replace_variable_wildcards( + dataset_variables, original_domain, variables_metadata + ) + variables_metadata.sort(key=lambda item: int(item["ordinal"])) + return variables_metadata # Third, fall back to standard datasets - for cls in standard_details.get("classes", []): - for dataset in cls.get("datasets", []): - if dataset.get("name") == domain: - dataset_variables = dataset.get("datasetVariables", []) - if dataset_variables: - dataset_variables.sort( - key=lambda item: int(item["ordinal"]) - ) - return dataset_variables + if IG_domain_details: + dataset_variables = IG_domain_details.get("datasetVariables", []) + dataset_variables.sort(key=lambda item: int(item["ordinal"])) + if add_AP: + ap_class_details = get_class_metadata( + model_details, "ASSOCIATED PERSONS" + ) + ap_identifiers = ap_class_details.get("classVariables", []) + dataset_variables = [ + v + for v in dataset_variables + ap_identifiers + if v.get("name") != "USUBJID" + ] + variables_metadata = [] + replace_variable_wildcards( + dataset_variables, original_domain, variables_metadata + ) + return variables_metadata return None diff --git a/env.example b/env.example index b9849a804..486491bb9 100644 --- a/env.example +++ b/env.example @@ -1,4 +1,4 @@ CDISC_LIBRARY_API_KEY=your_api_key_here DATASET_SIZE_THRESHOLD=10485760 # max dataset size in bytes to force dask implementation -MAX_REPORT_ROWS = 10 # integer for maximum number of issues per excel sheet (plus headers) in result report +MAX_REPORT_ROWS = 10 # integer for maximum number of issues per excel sheet (plus headers) in result report. Defaults to 10000. MAX_ERRORS_PER_RULE = (10, True) # Tuple for maximum number of errors to report per rule during a validation run. Also has a per dataset flag described as second bool value in readme. example value diff --git a/tests/unit/test_utilities/test_sdtm_utils.py b/tests/unit/test_utilities/test_sdtm_utils.py new file mode 100644 index 000000000..eab41855c --- /dev/null +++ b/tests/unit/test_utilities/test_sdtm_utils.py @@ -0,0 +1,156 @@ +import pytest +from unittest.mock import Mock +from scripts.script_utils import ( + get_library_metadata_from_cache, +) + +from cdisc_rules_engine.utilities.sdtm_utilities import ( + get_variables_metadata_from_standard, + get_variables_metadata_from_standard_model, +) + + +@pytest.fixture +def library_metadata(): + args = Mock() + args.cache = "resources/cache" + args.standard = "sdtmig" + args.version = "3-4" + args.substandard = None + args.custom_standard = False + args.define_xml_path = None + args.controlled_terminology_package = [] + return get_library_metadata_from_cache(args) + + +@pytest.fixture +def mock_data_service(): + """Mock data service for tests that require it.""" + return Mock() + + +@pytest.fixture +def mock_datasets(): + """Mock datasets metadata for tests.""" + return [] + + +def test_standard_domain_ae(library_metadata): + variables = get_variables_metadata_from_standard("AE", library_metadata) + assert any(var["name"] == "STUDYID" for var in variables) + assert any(var["name"] == "AETERM" for var in variables) + assert any(var["name"] == "AESTDTC" for var in variables) + + +def test_standard_domain_dm(library_metadata): + variables = get_variables_metadata_from_standard("DM", library_metadata) + assert any(var["name"] == "USUBJID" for var in variables) + assert any(var["name"] == "AGE" for var in variables) + assert any(var["name"] == "SEX" for var in variables) + + +def test_findings_domain_lb(library_metadata): + variables = get_variables_metadata_from_standard("LB", library_metadata) + assert any(var["name"] == "STUDYID" for var in variables) + assert any(var["name"] == "USUBJID" for var in variables) + assert any(var["name"] == "LBTEST" for var in variables) + assert any(var["name"] == "LBORRES" for var in variables) + + +def test_supp_domain(library_metadata): + variables = get_variables_metadata_from_standard("SUPPAE", library_metadata) + assert any(var["name"] == "STUDYID" for var in variables) + assert any(var["name"] == "QNAM" for var in variables) + assert any(var["name"] == "QLABEL" for var in variables) + + +def test_sq_domain(library_metadata): + variables = get_variables_metadata_from_standard("SQAE", library_metadata) + assert any(var["name"] == "STUDYID" for var in variables) + assert any(var["name"] == "QNAM" for var in variables) + assert any(var["name"] == "QLABEL" for var in variables) + + +def test_ap_domain(library_metadata): + variables = get_variables_metadata_from_standard("APDM", library_metadata) + assert any(var["name"] == "APID" for var in variables) + assert not any(var["name"] == "USUBJID" for var in variables) + assert any(var["name"] == "RSUBJID" for var in variables) + assert any(var["name"] == "RACE" for var in variables) + assert any(var["name"] == "DMDY" for var in variables) + + +def test_sqap_domain(library_metadata): + variables = get_variables_metadata_from_standard("SQAP", library_metadata) + assert any(var["name"] == "APID" for var in variables) + assert not any(var["name"] == "USUBJID" for var in variables) + assert any(var["name"] == "RDOMAIN" for var in variables) + + +def test_findings_about_domain_fa(library_metadata): + """Test Findings About domain includes FINDINGS class variables.""" + variables = get_variables_metadata_from_standard("FA", library_metadata) + assert any(var["name"] == "FATEST" for var in variables) + assert any(var["name"] == "FAOBJ" for var in variables) + + +# Tests for get_variables_metadata_from_standard_model +def test_findings_domain_from_model(library_metadata, mock_data_service, mock_datasets): + mock_dataframe = Mock() + variables = get_variables_metadata_from_standard_model( + domain="LB", + dataframe=mock_dataframe, + datasets=mock_datasets, + dataset_path="/path/to/lb.xpt", + data_service=mock_data_service, + library_metadata=library_metadata, + ) + assert any(var["name"] == "STUDYID" for var in variables) + assert any(var["name"] == "LBTEST" for var in variables) + + +def test_supp_domain_from_model(library_metadata, mock_data_service, mock_datasets): + """Test retrieving variables for SUPP domain from model.""" + mock_dataframe = Mock() + variables = get_variables_metadata_from_standard_model( + domain="SUPPAE", + dataframe=mock_dataframe, + datasets=mock_datasets, + dataset_path="/path/to/suppae.xpt", + data_service=mock_data_service, + library_metadata=library_metadata, + ) + assert any(var["name"] == "RDOMAIN" for var in variables) + assert any(var["name"] == "IDVAR" for var in variables) + + +def test_sqap_domain_from_model(library_metadata, mock_data_service, mock_datasets): + """Test retrieving variables for SUPP domain from model.""" + mock_dataframe = Mock() + variables = get_variables_metadata_from_standard_model( + domain="SQAP", + dataframe=mock_dataframe, + datasets=mock_datasets, + dataset_path="/path/to/suppae.xpt", + data_service=mock_data_service, + library_metadata=library_metadata, + ) + assert any(var["name"] == "RDOMAIN" for var in variables) + assert any(var["name"] == "APID" for var in variables) + + +def test_ap_domain_from_model(library_metadata, mock_data_service, mock_datasets): + """Test AP domain excludes USUBJID and includes APID.""" + mock_dataframe = Mock() + variables = get_variables_metadata_from_standard_model( + domain="APDM", + dataframe=mock_dataframe, + datasets=mock_datasets, + dataset_path="/path/to/apdm.xpt", + data_service=mock_data_service, + library_metadata=library_metadata, + ) + assert not any(var["name"] == "USUBJID" for var in variables) + assert any(var["name"] == "APID" for var in variables) + assert any(var["name"] == "AGE" for var in variables) + assert any(var["name"] == "DMDY" for var in variables)