diff --git a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py index 9dfe32ff1..53edaa726 100644 --- a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py @@ -47,12 +47,8 @@ def __init__( self.library_metadata = library_metadata self.dataset_implementation = self.data_service.dataset_implementation if isinstance(dataset_metadata, SDTMDatasetMetadata): - self.domain = ( - f"SUPP{dataset_metadata.rdomain}" - if dataset_metadata.rdomain - else dataset_metadata.domain - ) - self.dataset_name = dataset_metadata.name + # This is created to support the get_dataset cached decorator + self.domain = dataset_metadata.unsplit_name self.name = self.__class__.__name__ @abstractmethod diff --git a/cdisc_rules_engine/services/csv_metadata_reader.py b/cdisc_rules_engine/services/csv_metadata_reader.py index 16427e76c..88078c17d 100644 --- a/cdisc_rules_engine/services/csv_metadata_reader.py +++ b/cdisc_rules_engine/services/csv_metadata_reader.py @@ -19,6 +19,7 @@ def __init__( ): self.file_path = file_path self.file_name = file_name + self.dataset = Path(file_name).stem self.encoding = encoding self.variables_csv_path = ( Path(variables_csv_path) @@ -32,43 +33,76 @@ def __init__( ) def read(self) -> dict: - dataset_name = Path(self.file_name).stem.lower() + metadata = {} + metadata.update(self.__dataset_metadata()) + metadata.update( + { + "dataset_modification_date": datetime.fromtimestamp( + Path(self.file_path).stat().st_mtime + ).isoformat(), + "adam_info": { + "categorization_scheme": {}, + "w_indexes": {}, + "period": {}, + "selection_algorithm": {}, + }, + } + ) + metadata.update(self.__variable_metadata()) + metadata.update(self.__data_metadata()) + return metadata - if not self.variables_csv_path.exists(): - logger = logging.getLogger("validator") - logger.info("No variables file found for %s", dataset_name) - variables_meta = {} - else: - variables_meta = self.__get_variable_metadata( - dataset_name, self.variables_csv_path + def __dataset_metadata(self) -> dict: + logger = logging.getLogger("validator") + + if not self.datasets_csv_path.exists(): + logger.info("No datasets file found for %s", self.dataset) + return {"dataset_name": self.dataset} + + try: + datasets_df = pd.read_csv(self.datasets_csv_path, encoding=self.encoding) + except (UnicodeDecodeError, UnicodeError) as e: + logger.error( + f"\n Error reading CSV from: {self.file_path}" + f"\n Failed to decode with {self.encoding} encoding: {e}" + f"\n Please specify the correct encoding using the -e flag." ) + return {} + except Exception as e: + logger.error("Error reading CSV file %s. %s", self.file_path, e) + return {} + + if "Filename" not in datasets_df.columns: + return {} + + match = datasets_df[datasets_df["Filename"] == self.dataset] + + if match.empty or len(match) > 1: + return {} + + single_match = match.iloc[0] - metadata = { - "dataset_name": dataset_name.upper(), - "dataset_modification_date": datetime.fromtimestamp( - Path(self.file_path).stat().st_mtime - ).isoformat(), - "adam_info": { - "categorization_scheme": {}, - "w_indexes": {}, - "period": {}, - "selection_algorithm": {}, - }, + return { + "dataset_name": ( + single_match["Dataset Name"] + if "Dataset Name" in datasets_df.columns + else str(single_match["Filename"]).upper() + ), + "dataset_label": str(single_match["Label"]), } - metadata.update(variables_meta) - metadata.update(self.__data_meta()) - metadata.update(self.__dataset_label()) - return metadata - def __get_variable_metadata( - self, dataset_name: str, variables_file_path: Path + def __variable_metadata( + self, ) -> dict: logger = logging.getLogger("validator") + if not self.variables_csv_path.exists(): + logger.info("No variables file found for %s", self.dataset) + return {} try: - meta_df = pd.read_csv(variables_file_path, encoding=self.encoding) + meta_df = pd.read_csv(self.variables_csv_path, encoding=self.encoding) except (UnicodeDecodeError, UnicodeError) as e: logger.error( - f"Could not decode CSV file {variables_file_path} with {self.encoding} encoding: {e}. " + f"Could not decode CSV file {self.variables_csv_path} with {self.encoding} encoding: {e}. " f"Please specify the correct encoding using the -e flag." ) return {} @@ -76,15 +110,10 @@ def __get_variable_metadata( logger.error("Error reading CSV file %s. %s", self.file_path, e) return {} - meta_df["dataset"] = meta_df["dataset"].apply( - lambda x: Path(str(x)).stem.lower() - ) - - dataset_meta_df = meta_df[meta_df["dataset"] == dataset_name] + dataset_meta_df = meta_df[meta_df["dataset"] == self.dataset] if dataset_meta_df.empty: - logger = logging.getLogger("validator") - logger.info("No dataset metadata found for %s", dataset_name) + logger.info("No dataset metadata found for %s", self.dataset) return {} variable_names = dataset_meta_df["variable"].tolist() @@ -95,7 +124,16 @@ def __get_variable_metadata( zip(variable_names, dataset_meta_df["type"]) ) variable_name_to_size_map = { - var: (int(length) if pd.notna(length) else None) + var: ( + int(length) + if pd.notna(length) + and ( + # Because NaN is a float, pandas forces an array of integers with any missing values to become floating point + isinstance(length, int | float) + or (isinstance(length, str) and length.isdigit()) + ) + else None + ) for var, length in zip(variable_names, dataset_meta_df["length"]) } return { @@ -108,41 +146,7 @@ def __get_variable_metadata( "number_of_variables": len(variable_names), } - def __dataset_label(self) -> dict: - logger = logging.getLogger("validator") - - if not self.datasets_csv_path.exists(): - return {} - - try: - datasets_df = pd.read_csv(self.datasets_csv_path, encoding=self.encoding) - except (UnicodeDecodeError, UnicodeError) as e: - logger.error( - f"\n Error reading CSV from: {self.file_path}" - f"\n Failed to decode with {self.encoding} encoding: {e}" - f"\n Please specify the correct encoding using the -e flag." - ) - return {} - except Exception as e: - logger.error("Error reading CSV file %s. %s", self.file_path, e) - return {} - - if "Filename" not in datasets_df.columns or "Label" not in datasets_df.columns: - return {} - - datasets_df["dataset"] = datasets_df["Filename"].apply( - lambda x: Path(str(x)).stem.lower() - ) - - current_dataset = Path(self.file_name).stem.lower() - match = datasets_df[datasets_df["dataset"] == current_dataset] - - if match.empty: - return {} - - return {"dataset_label": str(match.iloc[0]["Label"])} - - def __data_meta(self): + def __data_metadata(self): logger = logging.getLogger("validator") result = { "dataset_length": 0, diff --git a/cdisc_rules_engine/services/data_services/local_data_service.py b/cdisc_rules_engine/services/data_services/local_data_service.py index 1893a0520..c9878b0e7 100644 --- a/cdisc_rules_engine/services/data_services/local_data_service.py +++ b/cdisc_rules_engine/services/data_services/local_data_service.py @@ -144,6 +144,9 @@ def get_dataset(self, dataset_name: str, **params) -> DatasetInterface: basename(full_path).split(".")[1].upper() ) df = reader.from_file(full_path) + # Build a simulated json pointer for the case where we are simulating json data. + if self.standard == "usdm": + df["_path"] = [f"/{dataset_name}/{i}" for i in range(len(df))] return df @cached_dataset(DatasetTypes.VARIABLES_METADATA.value) diff --git a/tests/unit/test_csv_reader.py b/tests/unit/test_csv_reader.py index 8fe68111a..b30d98539 100644 --- a/tests/unit/test_csv_reader.py +++ b/tests/unit/test_csv_reader.py @@ -178,9 +178,9 @@ def test_duplicate_paths_removed(self, tmp_path): VARIABLES_CSV = textwrap.dedent("""\ dataset,variable,label,type,length - patients.csv,id,Patient ID,integer,10 - patients.csv,name,Patient Name,string,50 - patients.csv,age,Patient Age,integer,3 + patients,id,Patient ID,integer,10 + patients,name,Patient Name,string,50 + patients,age,Patient Age,integer,3 """) DATA_CSV = textwrap.dedent("""\ @@ -192,7 +192,7 @@ def test_duplicate_paths_removed(self, tmp_path): DATASETS_CSV = textwrap.dedent("""\ Filename,Label - patients.csv,Patient Dataset + patients,Patient Dataset """) @@ -299,7 +299,7 @@ def test_variable_name_to_size_map_with_values(self): def test_variable_name_to_size_map_with_nan_length(self): variables_with_nan = textwrap.dedent("""\ dataset,variable,label,type,length - patients.csv,id,Patient ID,integer, + patients,id,Patient ID,integer, """) _write(self._variables_path(), variables_with_nan) reader = DatasetCSVMetadataReader(str(self.data_path), "patients.csv") @@ -308,7 +308,7 @@ def test_variable_name_to_size_map_with_nan_length(self): def test_dataset_name_lookup_is_case_insensitive(self): """File name with mixed case should still match _variables.csv entry.""" - variables_upper = VARIABLES_CSV.replace("patients.csv", "PATIENTS.CSV") + variables_upper = VARIABLES_CSV.replace("patients", "PATIENTS") _write(self._variables_path(), variables_upper) reader = DatasetCSVMetadataReader(str(self.data_path), "PATIENTS.CSV") result = reader.read()