From d49ea970114841d25e48fbc5ceda080cf1a4ff8c Mon Sep 17 00:00:00 2001 From: Davis Vaughan Date: Thu, 23 Apr 2026 08:27:00 -0400 Subject: [PATCH] First pass at base package cache --- crates/ark/src/lsp/diagnostics.rs | 4 + crates/oak_package/src/package_description.rs | 39 +++ crates/oak_sources/src/base.rs | 138 +++++++++ crates/oak_sources/src/hash.rs | 9 + crates/oak_sources/src/installed_package.rs | 98 +++++++ crates/oak_sources/src/lib.rs | 264 +++++++++++------- 6 files changed, 455 insertions(+), 97 deletions(-) create mode 100644 crates/oak_sources/src/base.rs create mode 100644 crates/oak_sources/src/hash.rs create mode 100644 crates/oak_sources/src/installed_package.rs diff --git a/crates/ark/src/lsp/diagnostics.rs b/crates/ark/src/lsp/diagnostics.rs index 4d1eaa39a..27c504a20 100644 --- a/crates/ark/src/lsp/diagnostics.rs +++ b/crates/ark/src/lsp/diagnostics.rs @@ -1647,6 +1647,7 @@ foo version: "1.0.0".to_string(), depends: vec![], repository: None, + priority: None, fields: Dcf::new(), }; let package = Package::from_parts(PathBuf::from("/mock/path"), description, namespace); @@ -1744,6 +1745,7 @@ foo version: "1.0.0".to_string(), depends: vec![], repository: None, + priority: None, fields: Dcf::new(), }; let package1 = @@ -1760,6 +1762,7 @@ foo version: "1.0.0".to_string(), depends: vec![], repository: None, + priority: None, fields: Dcf::new(), }; let package2 = @@ -1818,6 +1821,7 @@ foo version: "1.0.0".to_string(), depends: vec![], repository: None, + priority: None, fields: Dcf::new(), }; let package = Package::from_parts(PathBuf::from("/mock/path"), description, namespace); diff --git a/crates/oak_package/src/package_description.rs b/crates/oak_package/src/package_description.rs index 2e408c74a..519a6dae0 100644 --- a/crates/oak_package/src/package_description.rs +++ b/crates/oak_package/src/package_description.rs @@ -40,6 +40,8 @@ pub struct Description { pub repository: Option, + pub priority: Option, + /// Raw DCF fields pub fields: Dcf, } @@ -49,6 +51,12 @@ pub enum Repository { CRAN, } +#[derive(Clone, Debug, PartialEq)] +pub enum Priority { + Base, + Recommended, +} + impl Description { /// Parse a DESCRIPTION file in DCF format pub fn parse(contents: &str) -> anyhow::Result { @@ -84,11 +92,22 @@ impl Description { None }); + let priority = fields.get("Priority").and_then(|priority| { + if priority == "base" { + return Some(Priority::Base); + } + if priority == "recommended" { + return Some(Priority::Recommended); + } + None + }); + Ok(Description { name, version, depends, repository, + priority, fields, }) } @@ -214,6 +233,26 @@ Repository: CRAN"#; assert_eq!(parsed.repository, Some(Repository::CRAN)); } + #[test] + fn parses_description_with_priority() { + let desc = r#"Package: utils +Version: 4.5.0 +Priority: base"#; + let parsed = Description::parse(desc).unwrap(); + assert_eq!(parsed.priority, Some(Priority::Base)); + + let desc = r#"Package: MASS +Version: 7.3-65 +Priority: recommended"#; + let parsed = Description::parse(desc).unwrap(); + assert_eq!(parsed.priority, Some(Priority::Recommended)); + + let desc = r#"Package: mypkg +Version: 1.0.0"#; + let parsed = Description::parse(desc).unwrap(); + assert!(parsed.priority.is_none()); + } + #[test] fn parses_description_with_unknown_repository() { let desc = r#"Package: mypackage diff --git a/crates/oak_sources/src/base.rs b/crates/oak_sources/src/base.rs new file mode 100644 index 000000000..3ae3eb958 --- /dev/null +++ b/crates/oak_sources/src/base.rs @@ -0,0 +1,138 @@ +use std::io::Cursor; +use std::io::Read; + +use flate2::read::GzDecoder; +use oak_fs::file_lock::FileLock; + +use crate::download::Outcome; + +/// Names of the R base packages, i.e. everything that ships with R and carries +/// `Priority: base` in its DESCRIPTION. +pub(crate) const BASE_PACKAGES: &[&str] = &[ + "base", + "compiler", + "datasets", + "graphics", + "grDevices", + "grid", + "methods", + "parallel", + "splines", + "stats", + "stats4", + "tcltk", + "tools", + "utils", +]; + +/// Download the R source tarball for R {version} from CRAN's archive. +/// +/// Base R packages (e.g. `base`, `utils`, `stats`) are not distributed at the standard +/// `src/contrib/` location on CRAN. Instead, we must retrieve them from the base R +/// sources themselves, which lives at `src/base/R-{major}/R-{version}.tar.gz`. Each +/// package is located inside that tarball at `src/library/{package}/`. +/// +/// Returns `Ok(None)` if the tarball is not on CRAN (e.g. a development R version), which +/// we treat as "source unavailable" rather than an error. +pub(crate) fn download(version: &str) -> anyhow::Result>> { + let major = version + .split('.') + .next() + .ok_or_else(|| anyhow::anyhow!("Invalid R version for base source download: {version}"))?; + + let mirrors = ["https://cran.r-project.org", "https://cran.rstudio.com"]; + let suffix = format!("src/base/R-{major}/R-{version}.tar.gz"); + + match crate::download::download_with_mirrors(&suffix, &mirrors)? { + Outcome::Success(response) => { + let mut bytes = Vec::new(); + response.into_body().into_reader().read_to_end(&mut bytes)?; + Ok(Some(bytes)) + }, + Outcome::NotFound => Ok(None), + } +} + +/// Extract a single base package's R files from the R source tarball bytes. +/// +/// Writes `R-{version}/src/library/{package}/R/*.R` entries into an `R/` folder inside +/// the directory `destination_lock` lives in. Files are marked read only to match the +/// rest of the cache. +pub(crate) fn extract( + package: &str, + version: &str, + bytes: &[u8], + destination_lock: &FileLock, +) -> anyhow::Result<()> { + let destination = destination_lock.parent().join("R"); + std::fs::create_dir(&destination)?; + + let cursor = Cursor::new(bytes); + let gz = GzDecoder::new(cursor); + let mut archive = tar::Archive::new(gz); + + let prefix = format!("R-{version}/src/library/{package}/R/"); + + for entry in archive.entries()? { + let mut entry = entry?; + let path = entry.path()?; + + let Some(relative) = path.strip_prefix(&prefix).ok() else { + continue; + }; + + if relative + .extension() + .is_none_or(|ext| ext != "R" && ext != "r") + { + continue; + } + + let absolute = destination.join(relative); + + // Some base packages (e.g. `utils`) have platform-specific subdirs under `R/` + // like `R/windows/` and `R/unix/` (their `Makefile` handles them at install + // time). Create parents if one is required so `unpack()` can write nested files. + if let Some(parent) = relative.parent().filter(|p| !p.as_os_str().is_empty()) { + std::fs::create_dir_all(destination.join(parent))?; + } + + entry.unpack(&absolute)?; + crate::fs::set_readonly(&absolute)?; + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use oak_fs::file_lock::Filesystem; + use tempfile::TempDir; + + use crate::base::download; + use crate::base::extract; + + /// Requires internet access and downloads a large tarball of the R sources + #[ignore = "Downloads a 40mb tarball"] + #[test] + fn test_base_download_and_extract() { + let bytes = download("4.5.0").unwrap().expect("R 4.5.0 source to exist"); + + let destination_tempdir = TempDir::new().unwrap(); + let destination = Filesystem::new(destination_tempdir.path().to_path_buf()); + let destination_lock = destination.open_rw_exclusive_create(".lock").unwrap(); + + extract("utils", "4.5.0", &bytes, &destination_lock).unwrap(); + + // Spot check: `utils` has a well-known `help.R` file + let help = destination_lock.parent().join("R").join("help.R"); + assert!(help.exists()); + assert!(help.metadata().unwrap().permissions().readonly()); + } + + #[test] + fn test_base_download_unknown_version_returns_none() { + let bytes = download("0.0.0").unwrap(); + assert!(bytes.is_none()); + } +} diff --git a/crates/oak_sources/src/hash.rs b/crates/oak_sources/src/hash.rs new file mode 100644 index 000000000..308805479 --- /dev/null +++ b/crates/oak_sources/src/hash.rs @@ -0,0 +1,9 @@ +use sha2::Digest; +use sha2::Sha256; + +/// Retain 8 ASCII characters for each hash fragment +pub(crate) fn hash(contents: &str) -> String { + let mut hash = hex::encode(Sha256::digest(contents)); + hash.truncate(8); + hash +} diff --git a/crates/oak_sources/src/installed_package.rs b/crates/oak_sources/src/installed_package.rs new file mode 100644 index 000000000..e7bc06902 --- /dev/null +++ b/crates/oak_sources/src/installed_package.rs @@ -0,0 +1,98 @@ +use std::fs::read_to_string; +use std::path::Path; +use std::path::PathBuf; + +use oak_package::package_description::Description; + +pub(crate) struct InstalledPackage { + key: String, + name: String, + library_path: PathBuf, + description: Description, + description_hash: String, +} + +impl InstalledPackage { + pub(crate) fn find(package: &str, library_paths: &[PathBuf]) -> anyhow::Result> { + let mut library_path = None; + + for library_path_candidate in library_paths { + if library_path_candidate.join(package).exists() { + library_path = Some(library_path_candidate); + break; + } + } + + let Some(library_path) = library_path else { + // Not installed + return Ok(None); + }; + + let package_path = library_path.join(package); + + let description_path = package_path.join("DESCRIPTION"); + let description_contents = read_to_string(&description_path)?; + let description = Description::parse(&description_contents)?; + + let library_path_hash = crate::hash::hash(library_path.to_string_lossy().as_ref()); + let description_hash = crate::hash::hash(&description_contents); + + // Flat key unique enough to handle: + // - The same R package across multiple libpaths + // - Reinstalling a dev R package without changing the version (0.1.0.9000) + let key = format!( + "{name}_{version}_libpath-{library_path_hash}_description-{description_hash}", + name = package, + version = &description.version, + library_path_hash = &library_path_hash, + description_hash = &description_hash + ); + + Ok(Some(Self { + key, + name: package.to_string(), + library_path: library_path.clone(), + description, + description_hash, + })) + } + + pub(crate) fn name(&self) -> &str { + &self.name + } + + pub(crate) fn version(&self) -> &str { + &self.description().version + } + + pub(crate) fn description(&self) -> &Description { + &self.description + } + + // Flat key unique enough to handle: + // - The same R package across multiple libpaths + // - Reinstalling a dev R package without changing the version (0.1.0.9000) + pub(crate) fn key(&self) -> &str { + &self.key + } + + pub(crate) fn library_path(&self) -> &Path { + self.library_path.as_path() + } + + pub(crate) fn package_path(&self) -> PathBuf { + self.library_path.join(&self.name) + } + + pub(crate) fn description_path(&self) -> PathBuf { + self.package_path().join("DESCRIPTION") + } + + pub(crate) fn namespace_path(&self) -> PathBuf { + self.package_path().join("NAMESPACE") + } + + pub(crate) fn description_hash(&self) -> &str { + &self.description_hash + } +} diff --git a/crates/oak_sources/src/lib.rs b/crates/oak_sources/src/lib.rs index 3c1e14ab9..71ca22e6b 100644 --- a/crates/oak_sources/src/lib.rs +++ b/crates/oak_sources/src/lib.rs @@ -1,11 +1,12 @@ +mod base; mod cran; mod download; mod fs; +mod hash; +mod installed_package; mod srcref; use std::collections::HashSet; -use std::fs::read_to_string; -use std::path::Path; use std::path::PathBuf; use std::sync::RwLock; @@ -14,12 +15,12 @@ use chrono::TimeDelta; use chrono::Utc; use oak_fs::file_lock; use oak_fs::file_lock::FileLock; -use oak_package::package_description::Description; +use oak_package::package_description::Priority; use oak_package::package_description::Repository; use serde::Deserialize; use serde::Serialize; -use sha2::Digest; -use sha2::Sha256; + +use crate::installed_package::InstalledPackage; /// Name of the root lock file and the per-key lock file. const LOCK_FILENAME: &str = ".lock"; @@ -187,40 +188,13 @@ impl PackageCache { } fn get_result(&self, package: &str) -> anyhow::Result> { - // Find install path of the package - let mut libpath = None; - for r_libpath in &self.r_libpaths { - if r_libpath.join(package).exists() { - libpath = Some(r_libpath); - break; - } - } - let Some(libpath) = libpath else { - // Not even installed. We don't record this package in `source_unavailable` in - // case the user installs it later in the session. + let Some(package) = InstalledPackage::find(package, &self.r_libpaths)? else { + // Not even installed return Ok(None); }; - let package_path = libpath.join(package); - let namespace_path = package_path.join("NAMESPACE"); - let description_path = package_path.join("DESCRIPTION"); - - let description_contents = read_to_string(&description_path)?; - let description = Description::parse(&description_contents)?; - - let version = description.version.as_str(); - - let libpath_hash = hash(libpath.to_string_lossy().as_ref()); - let description_hash = hash(&description_contents); - - // Flat key unique enough to handle: - // - The same R package across multiple libpaths - // - Reinstalling a dev R package without changing the version (0.1.0.9000) - let key = - format!("{package}_{version}_libpath-{libpath_hash}_description-{description_hash}"); - // Read path: completion sentinel present, already exists on disk - let destination = self.cache_root_lock.parent().join(&key); + let destination = self.cache_root_lock.parent().join(package.key()); if destination.join(METADATA_FILENAME).exists() { return Ok(Some(destination)); } @@ -230,92 +204,166 @@ impl PackageCache { if self .source_unavailable .read() - .is_ok_and(|set| set.contains(&key)) + .is_ok_and(|set| set.contains(package.key())) { return Ok(None); } - // Write path: take per-key exclusive lock - let destination = self.cache_root.join(&key); + // Write path + let result = if matches!(package.description().priority, Some(Priority::Base)) { + // R version to download is the same as the base package version + self.try_populate_base(&package.description().version) + } else { + self.try_populate(&package) + }; + + match result { + Ok(true) => Ok(Some(destination)), + Ok(false) => { + // Unavailable for some reason, maybe package isn't on CRAN. + // Never try and generate sources again this session. + self.source_unavailable + .write() + .ok() + .map(|mut set| set.insert(package.key().to_string())); + Ok(None) + }, + Err(err) => { + // Errored for some reason during source generation, maybe a download failed. + // Never try and generate sources again this session. + log::error!( + "Failed to cache {name} {version}: {err:?}", + name = package.name(), + version = package.version() + ); + self.source_unavailable + .write() + .ok() + .map(|mut set| set.insert(package.key().to_string())); + Ok(None) + }, + } + } + + fn try_populate_base(&self, version: &str) -> anyhow::Result { + // Download the R sources in their entirety + let Some(bytes) = crate::base::download(version)? else { + log::trace!("No R source tarball on CRAN for version {version}"); + return Ok(false); + }; + + // Populate all base packages from the download + for package in crate::base::BASE_PACKAGES { + let Some(package) = InstalledPackage::find(package, &self.r_libpaths)? else { + // It would be very odd to not find a base package + return Ok(false); + }; + self.try_populate_base_package(&package, version, &bytes)?; + } + + Ok(true) + } + + fn try_populate_base_package( + &self, + package: &InstalledPackage, + version: &str, + bytes: &[u8], + ) -> anyhow::Result<()> { + // Take per-key exclusive lock + let destination = self.cache_root.join(package.key()); destination.create_dir()?; let destination_lock = destination.open_rw_exclusive_create(LOCK_FILENAME)?; - // Re-check: another writer may have populated the key while we waited for an - // exclusive lock + // Another writer may have populated the key while we waited for an exclusive lock if destination_lock.parent().join(METADATA_FILENAME).exists() { - return Ok(Some(destination_lock.parent().to_path_buf())); + return Ok(()); } // Wipe any partial content from a prior writer that may have crashed before // writing `.metadata`. destination_lock.remove_siblings()?; - if self.try_populate( - package, - version, - libpath, - &namespace_path, - &description_path, - &description, - &description_hash, - &destination_lock, - )? { - Ok(Some(destination_lock.parent().to_path_buf())) + crate::base::extract(package.name(), version, bytes, &destination_lock)?; + + crate::fs::copy_as_readonly( + package.description_path(), + destination_lock.parent().join("DESCRIPTION"), + )?; + + // The `base` package itself has no NAMESPACE, for now we generate an empty + // NAMESPACE, but eventually we will want to fully populate it with a + // pseudo-NAMESPACE. + if package.name() == "base" { + std::fs::write(destination_lock.parent().join("NAMESPACE"), "")?; + crate::fs::set_readonly(destination_lock.parent().join("NAMESPACE"))?; } else { - // Never try source generation for this key again - self.source_unavailable - .write() - .ok() - .map(|mut set| set.insert(key)); - Ok(None) + crate::fs::copy_as_readonly( + package.namespace_path(), + destination_lock.parent().join("NAMESPACE"), + )?; } + + // Last! `.metadata` is the completion sentinel. + self.write_metadata(package, &destination_lock)?; + + Ok(()) } /// Writes `DESCRIPTION`, `NAMESPACE`, and `R/` to the cache entry, if possible - fn try_populate( - &self, - package: &str, - version: &str, - libpath: &Path, - namespace_path: &Path, - description_path: &Path, - description: &Description, - description_hash: &str, - destination_lock: &FileLock, - ) -> anyhow::Result { - if !self.write_r_files(package, version, description, destination_lock)? { + fn try_populate(&self, package: &InstalledPackage) -> anyhow::Result { + // Take per-key exclusive lock + let destination = self.cache_root.join(package.key()); + destination.create_dir()?; + let destination_lock = destination.open_rw_exclusive_create(LOCK_FILENAME)?; + + // Another writer may have populated the key while we waited for an exclusive lock + if destination_lock.parent().join(METADATA_FILENAME).exists() { + return Ok(true); + } + + // Wipe any partial content from a prior writer that may have crashed before + // writing `.metadata`. + destination_lock.remove_siblings()?; + + if !self.write_r_files(package, &destination_lock)? { return Ok(false); } crate::fs::copy_as_readonly( - description_path, + package.description_path(), destination_lock.parent().join("DESCRIPTION"), )?; - crate::fs::copy_as_readonly(namespace_path, destination_lock.parent().join("NAMESPACE"))?; + crate::fs::copy_as_readonly( + package.namespace_path(), + destination_lock.parent().join("NAMESPACE"), + )?; // Last! Only write `.metadata` if all other writes succeed. It is our completion sentinal. - self.write_metadata(package, libpath, description_hash, destination_lock)?; + self.write_metadata(package, &destination_lock)?; Ok(true) } fn write_r_files( &self, - package: &str, - version: &str, - description: &Description, + package: &InstalledPackage, destination_lock: &FileLock, ) -> anyhow::Result { // Try caching from srcref match crate::srcref::cache_srcref( - package, - version, + package.name(), + &package.description().version, destination_lock, &self.r, &self.r_libpaths, ) { Ok(true) => { - log::trace!("Cached {package} {version} from srcrefs."); + log::trace!( + "Cached {name} {version} from srcrefs.", + name = package.name(), + version = package.version() + ); return Ok(true); }, Ok(false) => { @@ -323,15 +371,23 @@ impl PackageCache { }, Err(err) => { // Fall through with log - log::warn!("Failed to cache {package} {version} from srcrefs: {err:?}"); + log::warn!( + "Failed to cache {name} {version} from srcrefs: {err:?}", + name = package.name(), + version = package.version() + ); }, } // Try caching from CRAN - if matches!(description.repository, Some(Repository::CRAN)) { - match crate::cran::cache_cran(package, version, destination_lock) { + if matches!(package.description().repository, Some(Repository::CRAN)) { + match crate::cran::cache_cran(package.name(), package.version(), destination_lock) { Ok(true) => { - log::trace!("Cached {package} {version} from CRAN download."); + log::trace!( + "Cached {name} {version} from CRAN download.", + name = package.name(), + version = package.version() + ); return Ok(true); }, Ok(false) => { @@ -339,7 +395,11 @@ impl PackageCache { }, Err(err) => { // Fall through with log - log::warn!("Failed to cache {package} {version} from CRAN download: {err:?}"); + log::warn!( + "Failed to cache {name} {version} from CRAN download: {err:?}", + name = package.name(), + version = package.version() + ); }, } } @@ -362,15 +422,13 @@ impl PackageCache { /// hold a shared lock). fn write_metadata( &self, - package: &str, - libpath: &Path, - description_hash: &str, + package: &InstalledPackage, destination_lock: &FileLock, ) -> anyhow::Result<()> { let metadata = Metadata { - package: package.to_string(), - libpath: libpath.to_path_buf(), - description_hash: description_hash.to_string(), + package: package.name().to_string(), + libpath: package.library_path().to_path_buf(), + description_hash: package.description_hash().to_string(), generated_at: Utc::now(), }; let contents = serde_json::to_vec_pretty(&metadata)?; @@ -447,7 +505,7 @@ impl PackageCache { continue; }; - if hash(&description_contents) != metadata.description_hash { + if crate::hash::hash(&description_contents) != metadata.description_hash { log::trace!("Cleaning {} due to changed DESCRIPTION", path.display()); crate::fs::remove_dir_all_or_warn(&path); continue; @@ -458,9 +516,21 @@ impl PackageCache { } } -/// Retain 8 ASCII characters for each hash fragment -fn hash(contents: &str) -> String { - let mut hash = hex::encode(Sha256::digest(contents)); - hash.truncate(8); - hash -} +// // For local testing +// #[cfg(test)] +// mod tests { +// use std::path::PathBuf; +// +// use crate::PackageCache; +// +// #[test] +// fn testit() { +// let r_script_path = PathBuf::from("/usr/local/bin/Rscript"); +// let r_libpaths = vec![ +// PathBuf::from("/Users/davis/Library/R/arm64/4.5/library"), +// PathBuf::from("/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library"), +// ]; +// let cache = PackageCache::new(r_script_path, r_libpaths).unwrap(); +// cache.get("utils"); +// } +// }