From 10581fcd6679b7210e7780fa83ba755831153a29 Mon Sep 17 00:00:00 2001 From: Joshua Fortriede Date: Fri, 9 May 2025 09:28:10 -0400 Subject: [PATCH 1/2] Don't flag missing sequence files if drs_uri is present Check if the drs_uri key is in the file_descriptor of the sequence file. If yes, it is assumed that the FASTQ file will be located externally and will eventually get a DRS URI. As such, do not mark the data file as missing. --- hca/staging_area_validator.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/hca/staging_area_validator.py b/hca/staging_area_validator.py index eee0a0c..c0a7a01 100644 --- a/hca/staging_area_validator.py +++ b/hca/staging_area_validator.py @@ -272,6 +272,14 @@ def validate_descriptors_file(self, blob: gcs.Blob) -> None: if metadata_file := self.metadata_files.get(metadata_id): metadata_file["crc32c"] = file_json["crc32c"] metadata_versions = metadata_file["metadata_versions"] + + # Sequence file data_files might not be present if they are managed access. + # File Descriptor v2.1.0 allows for the drs_uri to be a string or null. + # In both of these cases, we set found_data_file to True + if metadata_file["entity_type"] == "sequence_file": + if "drs_uri" in file_json: + metadata_file["found_data_file"] = True + assert ( descriptor_version in metadata_versions ), f"Corresponding metadata version for descriptor version {descriptor_version} not found" From 46bc96ed713bfc1c3a1ec10acca59f2848eba516 Mon Sep 17 00:00:00 2001 From: Joshua Fortriede Date: Fri, 9 May 2025 09:29:21 -0400 Subject: [PATCH 2/2] Finished "TODO: remove unused `metadata_type`" Changed to take only the last path segment of the blob name.
--- hca/staging_area_validator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hca/staging_area_validator.py b/hca/staging_area_validator.py index c0a7a01..27aca89 100644 --- a/hca/staging_area_validator.py +++ b/hca/staging_area_validator.py @@ -258,8 +258,7 @@ def validate_file_description(self, file_description: str) -> None: def validate_descriptors_file(self, blob: gcs.Blob) -> None: # Expected syntax: descriptors/{metadata_type}/{metadata_id}_{version}.json - # TODO: remove unused `metadata_type` - metadata_type, descriptor_file = blob.name.split("/")[-2:] + descriptor_file = blob.name.split("/")[-1] assert descriptor_file.count("_") == 1 assert descriptor_file.endswith(".json")