Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Encoding: UTF-8
Roxygen: list(markdown = TRUE)
Depends: R (>= 4.1.0)
Imports:
cli,
curl,
dplyr,
glue,
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

export(add_dataset_samples)
export(create_dataset)
export(download_dataset)
export(download_project)
export(download_sample)
export(get_auth)
Expand All @@ -16,6 +17,7 @@ export(scpca_projects)
export(set_dataset_email)
export(start_dataset_processing)
export(view_terms)
export(wait_and_download_dataset)
import(httr2)
importFrom(dplyr,.data)
importFrom(stats,setNames)
2 changes: 2 additions & 0 deletions R/datasets.R
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,8 @@ get_dataset_status <- function(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKE
detail <- get_dataset_detail(dataset, auth_token)
if (isTRUE(detail$is_failed)) {
"failed"
} else if (isTRUE(detail$is_expired)) {
"expired"
} else if (isTRUE(detail$is_succeeded)) {
"succeeded"
} else if (isTRUE(detail$is_processing) || isTRUE(detail$is_started)) {
Expand Down
272 changes: 266 additions & 6 deletions R/downloads.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#' @param destination the `destination` argument to validate
#'
#' @noRd
check_destination_is_auth <- function(destination) {
warn_destination_is_auth <- function(destination) {
if (is_uuid(destination)) {
warning(
"`destination` looks like an authorization token (a UUID), not a directory path.",
Expand Down Expand Up @@ -83,7 +83,7 @@ download_sample <- function(
quiet = FALSE,
auth_token = Sys.getenv("SCPCA_AUTH_TOKEN")
) {
check_destination_is_auth(destination)
warn_destination_is_auth(destination)
auth_token <- resolve_auth_token(auth_token)
stopifnot(
"quiet must be a logical value" = is.logical(quiet) && length(quiet) == 1
Expand Down Expand Up @@ -156,6 +156,8 @@ download_sample <- function(
#' If FALSE, existing files will be returned.
#' Default is FALSE.
#' @param quiet Whether to suppress download progress messages. Default is FALSE.
#' @param unzip Whether to unzip the downloaded file. Default is TRUE. When FALSE,
#' the zip file is saved directly to `destination` and its path is returned.
#' @param auth_token An authorization token from [get_auth()]. Defaults to the
#' `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically.
#'
Expand Down Expand Up @@ -188,9 +190,10 @@ download_project <- function(
overwrite = FALSE,
redownload = FALSE,
quiet = FALSE,
unzip = TRUE,
auth_token = Sys.getenv("SCPCA_AUTH_TOKEN")
) {
check_destination_is_auth(destination)
warn_destination_is_auth(destination)
auth_token <- resolve_auth_token(auth_token)
stopifnot(
"Invalid project_id." = grepl("^SCPCP\\d{6}$", project_id),
Expand Down Expand Up @@ -285,7 +288,14 @@ download_project <- function(

detail <- get_ccdl_dataset_detail(dataset$id, auth_token)
download_url <- setNames(detail$download_url, detail$download_filename)
file_paths <- download_and_extract_file(download_url, destination, overwrite, redownload, quiet)
file_paths <- download_and_extract_file(
url = download_url,
parent_dir = destination,
overwrite = overwrite,
redownload = redownload,
quiet = quiet,
unzip = unzip
)
invisible(file_paths)
}

Expand All @@ -297,12 +307,34 @@ download_project <- function(
#' @param redownload Whether to re-download if files from the same url already exist
#' (if FALSE, existing files will be returned)
#' @param quiet Whether to suppress progress messages
#' @param unzip Whether to unzip the downloaded file. Default is TRUE. When FALSE,
#' the zip file is saved directly to `parent_dir` and its path is returned.
#'
#' @returns A character vector of extracted file paths
#' @returns A character vector of extracted file paths, or the zip file path when
#' `unzip = FALSE`.
#'
#' @keywords internal
download_and_extract_file <- function(url, parent_dir, overwrite, redownload, quiet) {
download_and_extract_file <- function(url, parent_dir, overwrite, redownload, quiet, unzip = TRUE) {
download_filename <- if (!is.null(names(url))) names(url) else parse_download_file(url)

if (!unzip) {
zip_path <- file.path(parent_dir, download_filename)
if (file.exists(zip_path) && !overwrite) {
message(glue::glue(
"File {zip_path} already exists; skipping download.",
"\nUse 'overwrite = TRUE' to replace the existing file."
))
return(zip_path)
}
req <- httr2::request(unname(url))
if (!quiet) {
message(glue::glue("Downloading {download_filename}..."))
req <- httr2::req_progress(req, type = "down")
}
req |> req_perform(path = zip_path)
return(zip_path)
}

destination_dir <- file.path(parent_dir, stringr::str_remove(download_filename, "\\.zip$"))

# exit if directory already exists
Expand Down Expand Up @@ -366,3 +398,231 @@ parse_download_file <- function(scpca_url) {
stringr::str_extract("SCPC[^\\s]+\\.zip") |>
unname()
}


#' Download a custom dataset's files from the ScPCA Portal
#'
#' Downloads and extracts the files for a custom dataset that has finished
#' processing. The dataset must have a status of "succeeded"; use
#' [get_dataset_status()] to check before calling this function, or use
#' [wait_and_download_dataset()] to wait for processing to complete and then
#' download in a single call.
#'
#' The downloaded files are saved in a subdirectory of `destination`, named
#' from the dataset's download filename (which includes the dataset ID, format,
#' and date).
#'
#' @param dataset the dataset UUID string, or a list with an `$id` element,
#' such as the return value of [create_dataset()].
#' @param destination The path to the directory where the unzipped file directory
#' should be saved. Default is "scpca_data".
#' @param overwrite Whether to overwrite files in existing directories if they
#' already exist. Note that files in existing directories that do not have the
#' same name as one of the downloaded files will not be deleted. Default is FALSE.
#' @param redownload Whether to re-download if files from the same dataset already
#' exist. If FALSE, existing files will be returned. Default is FALSE.
#' @param quiet Whether to suppress download progress messages. Default is FALSE.
#' @param unzip Whether to unzip the downloaded file. Default is TRUE. When FALSE,
#' the zip file is saved directly to `destination` and its path is returned.
#' @param auth_token an authorization token from [get_auth()]. Defaults to the
#' `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically.
#'
#' @importFrom stats setNames
#'
#' @returns a vector of file paths for the downloaded files (invisibly)
#'
#' @import httr2
#' @export
#'
#' @examples
#' \dontrun{
#' # Create a dataset, start processing, then download once complete
#' ds <- create_dataset(samples = c("SCPCS000001", "SCPCS000002"))
#' start_dataset_processing(ds, email = "user@example.com")
#'
#' # Check status then download when ready
#' get_dataset_status(ds)
#' download_dataset(ds, destination = "scpca_data")
#'
#' # Or use wait_and_download_dataset() to do all of this in one call
#' wait_and_download_dataset(ds, start = TRUE, email = "user@example.com")
#' }
download_dataset <- function(
dataset,
destination = "scpca_data",
overwrite = FALSE,
redownload = FALSE,
quiet = FALSE,
unzip = TRUE,
auth_token = Sys.getenv("SCPCA_AUTH_TOKEN")
) {
warn_destination_is_auth(destination)
auth_token <- resolve_auth_token(auth_token)
stopifnot(
"unzip must be a logical value" = is.logical(unzip) && length(unzip) == 1,
"overwrite must be a logical value" = is.logical(overwrite) && length(overwrite) == 1,
"redownload must be a logical value" = is.logical(redownload) && length(redownload) == 1,
"quiet must be a logical value" = is.logical(quiet) && length(quiet) == 1
)
dataset_id <- resolve_dataset_id(dataset)
detail <- get_dataset_detail(dataset_id, auth_token)

if (isTRUE(detail$is_expired)) {
stop(
glue::glue(
"Dataset `{dataset_id}` has expired and is no longer available for download.",
" Use `wait_and_download_dataset()` to regenerate it."
),
call. = FALSE
)
}

if (!isTRUE(detail$is_succeeded)) {
status <- if (isTRUE(detail$is_failed)) {
"failed"
} else if (isTRUE(detail$is_processing) || isTRUE(detail$is_started)) {
"processing"
} else {
"pending"
}
stop(
glue::glue(
"Dataset `{dataset_id}` is not ready for download (status: {status}).",
" Use `wait_and_download_dataset()` to wait for processing to complete."
),
call. = FALSE
)
}

if (!dir.exists(destination)) {
dir.create(destination, recursive = TRUE)
}

download_url <- setNames(detail$download_url, detail$download_filename)

file_paths <- download_and_extract_file(
url = download_url,
parent_dir = destination,
overwrite = overwrite,
redownload = redownload,
quiet = quiet,
unzip = unzip
)
invisible(file_paths)
}


#' @rdname download_dataset
#' @export
#'
#' @param email optional email address for the download notification. Only used
#' when `start = TRUE`. Passed to [start_dataset_processing()].
#' @param poll_interval Number of minutes to wait between status checks.
#' Default is 0.5 (30 seconds).
#' @param timeout Maximum number of minutes to wait for processing to complete.
#' Use `Inf` to wait indefinitely. Default is 60 (1 hour).
wait_and_download_dataset <- function(
dataset,
destination = "scpca_data",
email = NULL,
overwrite = FALSE,
redownload = FALSE,
poll_interval = 0.5,
timeout = 60,
quiet = FALSE,
unzip = TRUE,
auth_token = Sys.getenv("SCPCA_AUTH_TOKEN")
) {
warn_destination_is_auth(destination)
auth_token <- resolve_auth_token(auth_token)
stopifnot(
"poll_interval must be a single non-negative number of minutes" = is.numeric(poll_interval) &&
length(poll_interval) == 1 &&
poll_interval >= 0,
"timeout must be a single positive number or Inf" = is.numeric(timeout) &&
length(timeout) == 1 &&
timeout >= 0,
"quiet must be a logical value" = is.logical(quiet) && length(quiet) == 1
)
dataset_id <- resolve_dataset_id(dataset)

if (get_dataset_status(dataset_id, auth_token = auth_token) %in% c("pending", "expired")) {
start_dataset_processing(dataset_id, email = email, auth_token = auth_token)
}

start_time <- Sys.time()
status <- get_dataset_status(dataset_id, auth_token = auth_token)

if (!quiet) {
cli::cli_progress_bar(
format = "{cli::pb_spin} Waiting for dataset {dataset_id} [{status}] {cli::pb_elapsed}",
clear = FALSE
)
}

repeat {
if (status == "succeeded") {
break
}
if (status == "failed") {
if (!quiet) {
cli::cli_progress_done()
}
stop(glue::glue("Dataset `{dataset_id}` processing failed."), call. = FALSE)
}
if (status == "expired") {
if (!quiet) {
cli::cli_progress_done()
}
stop(
glue::glue(
"Dataset `{dataset_id}` unexpectedly expired during processing.",
" Please report this as a bug."
),
call. = FALSE
)
}

elapsed <- as.numeric(difftime(Sys.time(), start_time, units = "mins"))
if (is.finite(timeout) && elapsed >= timeout) {
if (!quiet) {
cli::cli_progress_done()
}
stop(
glue::glue(
"Timed out after {round(elapsed, 1)} minutes waiting for dataset `{dataset_id}`.",
" Use `timeout = Inf` to wait indefinitely."
),
call. = FALSE
)
}

if (!quiet) {
# keep the progress spinner updating every half second until the next poll
next_loop <- Sys.time() + poll_interval * 60
while (Sys.time() < next_loop) {
cli::cli_progress_update(force = TRUE)
Sys.sleep(0.5)
}
} else {
Sys.sleep(poll_interval * 60)
}

status <- get_dataset_status(dataset_id, auth_token = auth_token)
if (!quiet) cli::cli_progress_update(force = TRUE)
}

if (!quiet) {
cli::cli_progress_done()
}

download_dataset(
dataset_id,
destination = destination,
unzip = unzip,
overwrite = overwrite,
redownload = redownload,
quiet = quiet,
auth_token = auth_token
)
}
1 change: 1 addition & 0 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,4 @@ reference:
- set_dataset_email
- start_dataset_processing
- get_dataset_status
- download_dataset
15 changes: 13 additions & 2 deletions man/download_and_extract_file.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading