From 16641f4e11de73b416dfb9f4741579bf63238722 Mon Sep 17 00:00:00 2001 From: Gerry Campion Date: Mon, 8 Jun 2026 17:37:45 -0400 Subject: [PATCH 1/6] usdm running now --- .../services/csv_metadata_reader.py | 139 +++++++++--------- 1 file changed, 69 insertions(+), 70 deletions(-) diff --git a/cdisc_rules_engine/services/csv_metadata_reader.py b/cdisc_rules_engine/services/csv_metadata_reader.py index 16427e76c..778a2131d 100644 --- a/cdisc_rules_engine/services/csv_metadata_reader.py +++ b/cdisc_rules_engine/services/csv_metadata_reader.py @@ -19,6 +19,7 @@ def __init__( ): self.file_path = file_path self.file_name = file_name + self.dataset = Path(file_name).stem self.encoding = encoding self.variables_csv_path = ( Path(variables_csv_path) @@ -32,43 +33,76 @@ def __init__( ) def read(self) -> dict: - dataset_name = Path(self.file_name).stem.lower() + metadata = {} + metadata.update(self.__dataset_metadata()) + metadata.update( + { + "dataset_modification_date": datetime.fromtimestamp( + Path(self.file_path).stat().st_mtime + ).isoformat(), + "adam_info": { + "categorization_scheme": {}, + "w_indexes": {}, + "period": {}, + "selection_algorithm": {}, + }, + } + ) + metadata.update(self.__variable_metadata()) + metadata.update(self.__data_metadata()) + return metadata - if not self.variables_csv_path.exists(): - logger = logging.getLogger("validator") - logger.info("No variables file found for %s", dataset_name) - variables_meta = {} - else: - variables_meta = self.__get_variable_metadata( - dataset_name, self.variables_csv_path + def __dataset_metadata(self) -> dict: + logger = logging.getLogger("validator") + + if not self.datasets_csv_path.exists(): + logger.info("No datasets file found for %s", self.dataset) + return {} + + try: + datasets_df = pd.read_csv(self.datasets_csv_path, encoding=self.encoding) + except (UnicodeDecodeError, UnicodeError) as e: + logger.error( + f"\n Error reading CSV from: {self.file_path}" + f"\n Failed to decode with {self.encoding} 
encoding: {e}" + f"\n Please specify the correct encoding using the -e flag." ) + return {} + except Exception as e: + logger.error("Error reading CSV file %s. %s", self.file_path, e) + return {} - metadata = { - "dataset_name": dataset_name.upper(), - "dataset_modification_date": datetime.fromtimestamp( - Path(self.file_path).stat().st_mtime - ).isoformat(), - "adam_info": { - "categorization_scheme": {}, - "w_indexes": {}, - "period": {}, - "selection_algorithm": {}, - }, + if "Filename" not in datasets_df.columns: + return {} + + match = datasets_df[datasets_df["Filename"] == self.dataset] + + if match.empty or len(match) > 1: + return {} + + single_match = match.iloc[0] + + return { + "dataset_name": ( + single_match["Dataset Name"] + if "Dataset Name" in datasets_df.columns + else str(single_match["Filename"]).upper() + ), + "dataset_label": str(single_match["Label"]), } - metadata.update(variables_meta) - metadata.update(self.__data_meta()) - metadata.update(self.__dataset_label()) - return metadata - def __get_variable_metadata( - self, dataset_name: str, variables_file_path: Path + def __variable_metadata( + self, ) -> dict: logger = logging.getLogger("validator") + if not self.variables_csv_path.exists(): + logger.info("No variables file found for %s", self.dataset) + return {} try: - meta_df = pd.read_csv(variables_file_path, encoding=self.encoding) + meta_df = pd.read_csv(self.variables_csv_path, encoding=self.encoding) except (UnicodeDecodeError, UnicodeError) as e: logger.error( - f"Could not decode CSV file {variables_file_path} with {self.encoding} encoding: {e}. " + f"Could not decode CSV file {self.variables_csv_path} with {self.encoding} encoding: {e}. " f"Please specify the correct encoding using the -e flag." ) return {} @@ -76,15 +110,10 @@ def __get_variable_metadata( logger.error("Error reading CSV file %s. 
%s", self.file_path, e) return {} - meta_df["dataset"] = meta_df["dataset"].apply( - lambda x: Path(str(x)).stem.lower() - ) - - dataset_meta_df = meta_df[meta_df["dataset"] == dataset_name] + dataset_meta_df = meta_df[meta_df["dataset"] == self.dataset] if dataset_meta_df.empty: - logger = logging.getLogger("validator") - logger.info("No dataset metadata found for %s", dataset_name) + logger.info("No dataset metadata found for %s", self.dataset) return {} variable_names = dataset_meta_df["variable"].tolist() @@ -95,7 +124,11 @@ def __get_variable_metadata( zip(variable_names, dataset_meta_df["type"]) ) variable_name_to_size_map = { - var: (int(length) if pd.notna(length) else None) + var: ( + int(length) + if pd.notna(length) and (isinstance(length, int) or length.isdigit()) + else None + ) for var, length in zip(variable_names, dataset_meta_df["length"]) } return { @@ -108,41 +141,7 @@ def __get_variable_metadata( "number_of_variables": len(variable_names), } - def __dataset_label(self) -> dict: - logger = logging.getLogger("validator") - - if not self.datasets_csv_path.exists(): - return {} - - try: - datasets_df = pd.read_csv(self.datasets_csv_path, encoding=self.encoding) - except (UnicodeDecodeError, UnicodeError) as e: - logger.error( - f"\n Error reading CSV from: {self.file_path}" - f"\n Failed to decode with {self.encoding} encoding: {e}" - f"\n Please specify the correct encoding using the -e flag." - ) - return {} - except Exception as e: - logger.error("Error reading CSV file %s. 
%s", self.file_path, e) - return {} - - if "Filename" not in datasets_df.columns or "Label" not in datasets_df.columns: - return {} - - datasets_df["dataset"] = datasets_df["Filename"].apply( - lambda x: Path(str(x)).stem.lower() - ) - - current_dataset = Path(self.file_name).stem.lower() - match = datasets_df[datasets_df["dataset"] == current_dataset] - - if match.empty: - return {} - - return {"dataset_label": str(match.iloc[0]["Label"])} - - def __data_meta(self): + def __data_metadata(self): logger = logging.getLogger("validator") result = { "dataset_length": 0, From 7803794e051e3ddb7f2fd16fe61b1b099c0626f5 Mon Sep 17 00:00:00 2001 From: Gerry Campion Date: Tue, 9 Jun 2026 14:44:25 -0400 Subject: [PATCH 2/6] add json path and fix unit tests --- cdisc_rules_engine/services/csv_metadata_reader.py | 2 +- .../services/data_services/local_data_service.py | 2 ++ tests/unit/test_csv_reader.py | 12 ++++++------ 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/cdisc_rules_engine/services/csv_metadata_reader.py b/cdisc_rules_engine/services/csv_metadata_reader.py index 778a2131d..0a4723e35 100644 --- a/cdisc_rules_engine/services/csv_metadata_reader.py +++ b/cdisc_rules_engine/services/csv_metadata_reader.py @@ -57,7 +57,7 @@ def __dataset_metadata(self) -> dict: if not self.datasets_csv_path.exists(): logger.info("No datasets file found for %s", self.dataset) - return {} + return {"dataset_name": self.dataset} try: datasets_df = pd.read_csv(self.datasets_csv_path, encoding=self.encoding) diff --git a/cdisc_rules_engine/services/data_services/local_data_service.py b/cdisc_rules_engine/services/data_services/local_data_service.py index 1893a0520..4ab367ff3 100644 --- a/cdisc_rules_engine/services/data_services/local_data_service.py +++ b/cdisc_rules_engine/services/data_services/local_data_service.py @@ -144,6 +144,8 @@ def get_dataset(self, dataset_name: str, **params) -> DatasetInterface: basename(full_path).split(".")[1].upper() ) df = 
reader.from_file(full_path) + # Build a simulated json pointer for the case where we are simulating json data. + df["_path"] = [f"/{dataset_name}/{i}" for i in range(len(df))] return df @cached_dataset(DatasetTypes.VARIABLES_METADATA.value) diff --git a/tests/unit/test_csv_reader.py b/tests/unit/test_csv_reader.py index 5cb7f6cdc..ee626e3ce 100644 --- a/tests/unit/test_csv_reader.py +++ b/tests/unit/test_csv_reader.py @@ -179,9 +179,9 @@ def test_duplicate_paths_removed(self, tmp_path): VARIABLES_CSV = textwrap.dedent( """\ dataset,variable,label,type,length - patients.csv,id,Patient ID,integer,10 - patients.csv,name,Patient Name,string,50 - patients.csv,age,Patient Age,integer,3 + patients,id,Patient ID,integer,10 + patients,name,Patient Name,string,50 + patients,age,Patient Age,integer,3 """ ) @@ -197,7 +197,7 @@ def test_duplicate_paths_removed(self, tmp_path): DATASETS_CSV = textwrap.dedent( """\ Filename,Label - patients.csv,Patient Dataset + patients,Patient Dataset """ ) @@ -306,8 +306,8 @@ def test_variable_name_to_size_map_with_nan_length(self): variables_with_nan = textwrap.dedent( """\ dataset,variable,label,type,length - patients.csv,id,Patient ID,integer, - """ + patients,id,Patient ID,integer, + """ ) _write(self._variables_path(), variables_with_nan) reader = DatasetCSVMetadataReader(str(self.data_path), "patients.csv") From e26cd7c2b0ef0c54c51b24115b6518b379f8a27f Mon Sep 17 00:00:00 2001 From: Gerry Campion Date: Wed, 10 Jun 2026 14:12:01 -0400 Subject: [PATCH 3/6] only add path for usdm --- .../services/data_services/local_data_service.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cdisc_rules_engine/services/data_services/local_data_service.py b/cdisc_rules_engine/services/data_services/local_data_service.py index 4ab367ff3..c9878b0e7 100644 --- a/cdisc_rules_engine/services/data_services/local_data_service.py +++ b/cdisc_rules_engine/services/data_services/local_data_service.py @@ -145,7 +145,8 @@ def get_dataset(self, 
dataset_name: str, **params) -> DatasetInterface: ) df = reader.from_file(full_path) # Build a simulated json pointer for the case where we are simulating json data. - df["_path"] = [f"/{dataset_name}/{i}" for i in range(len(df))] + if self.standard == "usdm": + df["_path"] = [f"/{dataset_name}/{i}" for i in range(len(df))] return df @cached_dataset(DatasetTypes.VARIABLES_METADATA.value) From d5c1efc9a0b31902b77b162f85f50de305c76212 Mon Sep 17 00:00:00 2001 From: Gerry Campion Date: Fri, 12 Jun 2026 13:38:39 -0400 Subject: [PATCH 4/6] handle empty ints as floats --- cdisc_rules_engine/services/csv_metadata_reader.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cdisc_rules_engine/services/csv_metadata_reader.py b/cdisc_rules_engine/services/csv_metadata_reader.py index 0a4723e35..88078c17d 100644 --- a/cdisc_rules_engine/services/csv_metadata_reader.py +++ b/cdisc_rules_engine/services/csv_metadata_reader.py @@ -126,7 +126,12 @@ def __variable_metadata( variable_name_to_size_map = { var: ( int(length) - if pd.notna(length) and (isinstance(length, int) or length.isdigit()) + if pd.notna(length) + and ( + # Because NaN is a float, pandas forces an array of integers with any missing values to become floating point + isinstance(length, int | float) + or (isinstance(length, str) and length.isdigit()) + ) else None ) for var, length in zip(variable_names, dataset_meta_df["length"]) From 7afc45bb3cc319368cff992e360071e40783904c Mon Sep 17 00:00:00 2001 From: Gerry Campion Date: Tue, 16 Jun 2026 14:33:11 -0400 Subject: [PATCH 5/6] fix extra csv extension in tests --- tests/unit/test_csv_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_csv_reader.py b/tests/unit/test_csv_reader.py index 7f8b021da..b30d98539 100644 --- a/tests/unit/test_csv_reader.py +++ b/tests/unit/test_csv_reader.py @@ -308,7 +308,7 @@ def test_variable_name_to_size_map_with_nan_length(self): def 
test_dataset_name_lookup_is_case_insensitive(self): """File name with mixed case should still match _variables.csv entry.""" - variables_upper = VARIABLES_CSV.replace("patients.csv", "PATIENTS.CSV") + variables_upper = VARIABLES_CSV.replace("patients", "PATIENTS") _write(self._variables_path(), variables_upper) reader = DatasetCSVMetadataReader(str(self.data_path), "PATIENTS.CSV") result = reader.read() From 9b9371ea13b7e38cbe5e2b907395aca23f96186c Mon Sep 17 00:00:00 2001 From: Gerry Campion Date: Wed, 17 Jun 2026 21:48:04 -0400 Subject: [PATCH 6/6] Fix datasetbuilder caching --- .../dataset_builders/base_dataset_builder.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py index 9dfe32ff1..53edaa726 100644 --- a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py @@ -47,12 +47,8 @@ def __init__( self.library_metadata = library_metadata self.dataset_implementation = self.data_service.dataset_implementation if isinstance(dataset_metadata, SDTMDatasetMetadata): - self.domain = ( - f"SUPP{dataset_metadata.rdomain}" - if dataset_metadata.rdomain - else dataset_metadata.domain - ) - self.dataset_name = dataset_metadata.name + # This is created to support the get_dataset cached decorator + self.domain = dataset_metadata.unsplit_name self.name = self.__class__.__name__ @abstractmethod