Skip to content
Merged
8 changes: 2 additions & 6 deletions cdisc_rules_engine/dataset_builders/base_dataset_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,8 @@ def __init__(
self.library_metadata = library_metadata
self.dataset_implementation = self.data_service.dataset_implementation
if isinstance(dataset_metadata, SDTMDatasetMetadata):
self.domain = (
f"SUPP{dataset_metadata.rdomain}"
if dataset_metadata.rdomain
else dataset_metadata.domain
)
self.dataset_name = dataset_metadata.name
# This is created to support the get_dataset cached decorator
self.domain = dataset_metadata.unsplit_name
self.name = self.__class__.__name__

@abstractmethod
Expand Down
144 changes: 74 additions & 70 deletions cdisc_rules_engine/services/csv_metadata_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def __init__(
):
self.file_path = file_path
self.file_name = file_name
self.dataset = Path(file_name).stem
self.encoding = encoding
self.variables_csv_path = (
Path(variables_csv_path)
Expand All @@ -32,59 +33,87 @@ def __init__(
)

def read(self) -> dict:
dataset_name = Path(self.file_name).stem.lower()
Comment thread
gerrycampion marked this conversation as resolved.
metadata = {}
metadata.update(self.__dataset_metadata())
metadata.update(
{
"dataset_modification_date": datetime.fromtimestamp(
Path(self.file_path).stat().st_mtime
).isoformat(),
"adam_info": {
"categorization_scheme": {},
"w_indexes": {},
"period": {},
"selection_algorithm": {},
},
}
)
metadata.update(self.__variable_metadata())
metadata.update(self.__data_metadata())
return metadata

if not self.variables_csv_path.exists():
logger = logging.getLogger("validator")
logger.info("No variables file found for %s", dataset_name)
variables_meta = {}
else:
variables_meta = self.__get_variable_metadata(
dataset_name, self.variables_csv_path
def __dataset_metadata(self) -> dict:
logger = logging.getLogger("validator")

if not self.datasets_csv_path.exists():
logger.info("No datasets file found for %s", self.dataset)
return {"dataset_name": self.dataset}

try:
datasets_df = pd.read_csv(self.datasets_csv_path, encoding=self.encoding)
except (UnicodeDecodeError, UnicodeError) as e:
logger.error(
f"\n Error reading CSV from: {self.file_path}"
f"\n Failed to decode with {self.encoding} encoding: {e}"
f"\n Please specify the correct encoding using the -e flag."
)
return {}
except Exception as e:
logger.error("Error reading CSV file %s. %s", self.file_path, e)
return {}

if "Filename" not in datasets_df.columns:
return {}

match = datasets_df[datasets_df["Filename"] == self.dataset]

if match.empty or len(match) > 1:
return {}

single_match = match.iloc[0]

metadata = {
"dataset_name": dataset_name.upper(),
"dataset_modification_date": datetime.fromtimestamp(
Path(self.file_path).stat().st_mtime
).isoformat(),
"adam_info": {
"categorization_scheme": {},
"w_indexes": {},
"period": {},
"selection_algorithm": {},
},
return {
"dataset_name": (
single_match["Dataset Name"]
if "Dataset Name" in datasets_df.columns
else str(single_match["Filename"]).upper()
Comment thread
gerrycampion marked this conversation as resolved.
),
"dataset_label": str(single_match["Label"]),
}
metadata.update(variables_meta)
metadata.update(self.__data_meta())
metadata.update(self.__dataset_label())
return metadata

def __get_variable_metadata(
self, dataset_name: str, variables_file_path: Path
def __variable_metadata(
self,
) -> dict:
logger = logging.getLogger("validator")
if not self.variables_csv_path.exists():
logger.info("No variables file found for %s", self.dataset)
return {}
try:
meta_df = pd.read_csv(variables_file_path, encoding=self.encoding)
meta_df = pd.read_csv(self.variables_csv_path, encoding=self.encoding)
except (UnicodeDecodeError, UnicodeError) as e:
logger.error(
f"Could not decode CSV file {variables_file_path} with {self.encoding} encoding: {e}. "
f"Could not decode CSV file {self.variables_csv_path} with {self.encoding} encoding: {e}. "
f"Please specify the correct encoding using the -e flag."
)
return {}
except Exception as e:
logger.error("Error reading CSV file %s. %s", self.file_path, e)
return {}

meta_df["dataset"] = meta_df["dataset"].apply(
lambda x: Path(str(x)).stem.lower()
)

dataset_meta_df = meta_df[meta_df["dataset"] == dataset_name]
dataset_meta_df = meta_df[meta_df["dataset"] == self.dataset]

if dataset_meta_df.empty:
logger = logging.getLogger("validator")
logger.info("No dataset metadata found for %s", dataset_name)
logger.info("No dataset metadata found for %s", self.dataset)
return {}

variable_names = dataset_meta_df["variable"].tolist()
Expand All @@ -95,7 +124,16 @@ def __get_variable_metadata(
zip(variable_names, dataset_meta_df["type"])
)
variable_name_to_size_map = {
var: (int(length) if pd.notna(length) else None)
var: (
int(length)
if pd.notna(length)
and (
# Because NaN is a float, pandas forces an array of integers with any missing values to become floating point
isinstance(length, int | float)
or (isinstance(length, str) and length.isdigit())
)
else None
)
for var, length in zip(variable_names, dataset_meta_df["length"])
}
return {
Expand All @@ -108,41 +146,7 @@ def __get_variable_metadata(
"number_of_variables": len(variable_names),
}

def __dataset_label(self) -> dict:
logger = logging.getLogger("validator")

if not self.datasets_csv_path.exists():
return {}

try:
datasets_df = pd.read_csv(self.datasets_csv_path, encoding=self.encoding)
except (UnicodeDecodeError, UnicodeError) as e:
logger.error(
f"\n Error reading CSV from: {self.file_path}"
f"\n Failed to decode with {self.encoding} encoding: {e}"
f"\n Please specify the correct encoding using the -e flag."
)
return {}
except Exception as e:
logger.error("Error reading CSV file %s. %s", self.file_path, e)
return {}

if "Filename" not in datasets_df.columns or "Label" not in datasets_df.columns:
return {}

datasets_df["dataset"] = datasets_df["Filename"].apply(
lambda x: Path(str(x)).stem.lower()
)

current_dataset = Path(self.file_name).stem.lower()
match = datasets_df[datasets_df["dataset"] == current_dataset]

if match.empty:
return {}

return {"dataset_label": str(match.iloc[0]["Label"])}

def __data_meta(self):
def __data_metadata(self):
logger = logging.getLogger("validator")
result = {
"dataset_length": 0,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,9 @@ def get_dataset(self, dataset_name: str, **params) -> DatasetInterface:
basename(full_path).split(".")[1].upper()
)
df = reader.from_file(full_path)
# Build a simulated json pointer for the case where we are simulating json data.
if self.standard == "usdm":

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this and the Excel data service both relying on string literals for "usdm" is a bit fragile: I ran a validation using "USDM" in capitals versus "usdm" in lowercase and got different results for the same data.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was actually a bug in the recently added caching. I've resolved the bug, and you should now see consistent results. Let me know if you still have other concerns about the capitalization.

df["_path"] = [f"/{dataset_name}/{i}" for i in range(len(df))]
return df

@cached_dataset(DatasetTypes.VARIABLES_METADATA.value)
Expand Down
12 changes: 6 additions & 6 deletions tests/unit/test_csv_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,9 +178,9 @@ def test_duplicate_paths_removed(self, tmp_path):

VARIABLES_CSV = textwrap.dedent("""\
dataset,variable,label,type,length
patients.csv,id,Patient ID,integer,10
patients.csv,name,Patient Name,string,50
patients.csv,age,Patient Age,integer,3
patients,id,Patient ID,integer,10
patients,name,Patient Name,string,50
patients,age,Patient Age,integer,3
""")

DATA_CSV = textwrap.dedent("""\
Expand All @@ -192,7 +192,7 @@ def test_duplicate_paths_removed(self, tmp_path):

DATASETS_CSV = textwrap.dedent("""\
Filename,Label
patients.csv,Patient Dataset
patients,Patient Dataset
""")


Expand Down Expand Up @@ -299,7 +299,7 @@ def test_variable_name_to_size_map_with_values(self):
def test_variable_name_to_size_map_with_nan_length(self):
variables_with_nan = textwrap.dedent("""\
dataset,variable,label,type,length
patients.csv,id,Patient ID,integer,
patients,id,Patient ID,integer,
""")
_write(self._variables_path(), variables_with_nan)
reader = DatasetCSVMetadataReader(str(self.data_path), "patients.csv")
Expand All @@ -308,7 +308,7 @@ def test_variable_name_to_size_map_with_nan_length(self):

def test_dataset_name_lookup_is_case_insensitive(self):
"""File name with mixed case should still match _variables.csv entry."""
variables_upper = VARIABLES_CSV.replace("patients.csv", "PATIENTS.CSV")
variables_upper = VARIABLES_CSV.replace("patients", "PATIENTS")
_write(self._variables_path(), variables_upper)
reader = DatasetCSVMetadataReader(str(self.data_path), "PATIENTS.CSV")
result = reader.read()
Expand Down
Loading