-
Notifications
You must be signed in to change notification settings - Fork 3
perf: optimize technology detection with content caching and pre-discovery #421
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -80,22 +80,22 @@ struct RepoMetadata { | |
| paths: HashSet<PathBuf>, | ||
| /// Set of relative paths that are directories. | ||
| dirs: HashSet<PathBuf>, | ||
| /// Immediate subdirectories of the project root (depth 1), cached for fast Gradle scanning. | ||
| root_dirs: Vec<PathBuf>, | ||
| /// Map of file extension (e.g., ".rs") to the first relative path found with it. | ||
| /// Used to quickly evaluate file_extensions rules. | ||
| extensions: HashMap<String, PathBuf>, | ||
| /// Relative paths to subdirectories that appear to be standalone projects (issue #409). | ||
| nested_projects: Vec<PathBuf>, | ||
| /// Relevant Gradle build files discovered during the walk. | ||
| gradle_files: Vec<PathBuf>, | ||
| } | ||
|
|
||
| impl RepoMetadata { | ||
| fn collect(project_root: &Path) -> Self { | ||
| let mut paths = HashSet::new(); | ||
| let mut dirs = HashSet::new(); | ||
| let mut root_dirs = Vec::new(); | ||
| let mut extensions = HashMap::new(); | ||
| let mut nested_projects = BTreeSet::new(); | ||
| let mut gradle_files = Vec::new(); | ||
|
|
||
| for entry in WalkDir::new(project_root) | ||
| .max_depth(MAX_DISCOVER_DEPTH) | ||
|
|
@@ -116,15 +116,31 @@ impl RepoMetadata { | |
| let relative_buf = relative.to_path_buf(); | ||
| paths.insert(relative_buf.clone()); | ||
|
|
||
| if entry.file_type().is_dir() && entry.depth() == 1 { | ||
| root_dirs.push(relative_buf.clone()); | ||
| } | ||
| if entry.file_type().is_dir() { | ||
| dirs.insert(relative_buf.clone()); | ||
| } | ||
|
|
||
| if entry.file_type().is_file() { | ||
| let file_name = entry.file_name().to_str().unwrap_or(""); | ||
| let depth = entry.depth(); | ||
|
|
||
| // Gradle file discovery for scan_gradle_layout | ||
| if depth == 1 { | ||
| if matches!( | ||
| file_name, | ||
| "build.gradle.kts" | ||
| | "build.gradle" | ||
| | "settings.gradle.kts" | ||
| | "settings.gradle" | ||
| ) { | ||
| gradle_files.push(relative_buf.clone()); | ||
| } | ||
| } else if depth == 2 | ||
| && ((relative.starts_with("gradle") && file_name == "libs.versions.toml") | ||
| || matches!(file_name, "build.gradle.kts" | "build.gradle")) | ||
| { | ||
| gradle_files.push(relative_buf.clone()); | ||
| } | ||
|
|
||
| // Integrated Nested Project Discovery (issue #409) | ||
| if PROJECT_MANIFEST_FILES.contains(&file_name) | ||
|
|
@@ -153,9 +169,9 @@ impl RepoMetadata { | |
| Self { | ||
| paths, | ||
| dirs, | ||
| root_dirs, | ||
| extensions, | ||
| nested_projects: nested_projects.into_iter().collect(), | ||
| gradle_files, | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -208,7 +224,7 @@ struct CompiledDetectionRules { | |
| } | ||
|
|
||
| struct CompiledConfigFileContentRules { | ||
| files: Option<Vec<String>>, | ||
| files: Option<Vec<PathBuf>>, | ||
| patterns: Vec<Regex>, | ||
| scan_gradle_layout: bool, | ||
| } | ||
|
|
@@ -283,7 +299,10 @@ impl CatalogDrivenDetector { | |
| .collect::<Result<Vec<_>>>()?; | ||
|
|
||
| Ok::<_, anyhow::Error>(CompiledConfigFileContentRules { | ||
| files: content_rules.files.clone(), | ||
| files: content_rules | ||
| .files | ||
| .as_ref() | ||
| .map(|files| files.iter().map(PathBuf::from).collect()), | ||
| patterns, | ||
| scan_gradle_layout: content_rules.scan_gradle_layout.unwrap_or(false), | ||
| }) | ||
|
|
@@ -318,11 +337,19 @@ impl RepoDetector for CatalogDrivenDetector { | |
|
|
||
| let mut detections = Vec::new(); | ||
|
|
||
| // Optimization: Cache file content to avoid redundant I/O for shared config files (e.g. setup.py). | ||
| let mut content_cache = HashMap::new(); | ||
|
|
||
| // Phase 1: Evaluate root project | ||
| for (tech_id, compiled) in &self.rules { | ||
| if let Some(detection) = | ||
| evaluate_rules(project_root, tech_id, compiled, &all_packages, &metadata) | ||
| { | ||
| if let Some(detection) = evaluate_rules( | ||
| project_root, | ||
| tech_id, | ||
| compiled, | ||
| &all_packages, | ||
| &metadata, | ||
| &mut content_cache, | ||
| ) { | ||
| detections.push(detection); | ||
| } | ||
| } | ||
|
|
@@ -332,6 +359,9 @@ impl RepoDetector for CatalogDrivenDetector { | |
| let nested_dir = project_root.join(rel_nested_dir); | ||
| let nested_meta = RepoMetadata::collect(&nested_dir); | ||
| let nested_pkgs = collect_package_names(&nested_dir, &nested_meta); | ||
| // Clear content cache between projects because config files like setup.py | ||
| // will have different content in different sub-projects. | ||
| content_cache.clear(); | ||
|
|
||
| let offset = Some(rel_nested_dir.to_path_buf()); | ||
|
|
||
|
|
@@ -341,9 +371,14 @@ impl RepoDetector for CatalogDrivenDetector { | |
| continue; | ||
| } | ||
|
|
||
| if let Some(detection) = | ||
| evaluate_rules(&nested_dir, tech_id, compiled, &nested_pkgs, &nested_meta) | ||
| { | ||
| if let Some(detection) = evaluate_rules( | ||
| &nested_dir, | ||
| tech_id, | ||
| compiled, | ||
| &nested_pkgs, | ||
| &nested_meta, | ||
| &mut content_cache, | ||
| ) { | ||
| // Adjust paths: detections are relative to nested_dir, need to prepend offset | ||
| let adjusted = TechnologyDetection { | ||
| technology: detection.technology, | ||
|
|
@@ -388,6 +423,7 @@ fn evaluate_rules( | |
| rules: &CompiledDetectionRules, | ||
| all_packages: &BTreeSet<String>, | ||
| metadata: &RepoMetadata, | ||
| content_cache: &mut HashMap<PathBuf, String>, | ||
| ) -> Option<TechnologyDetection> { | ||
| // Check packages (exact match) | ||
| if let Some(packages) = &rules.packages { | ||
|
|
@@ -439,10 +475,22 @@ fn evaluate_rules( | |
| if let Some(content_rules) = &rules.config_file_content { | ||
| let files_to_scan = gather_content_scan_files(project_root, content_rules, metadata); | ||
| for file_path in &files_to_scan { | ||
| let absolute = project_root.join(file_path); | ||
| if let Ok(content) = fs::read_to_string(&absolute) { | ||
| // Optimization: Use content cache to avoid redundant I/O for shared config files. | ||
| let content = if let Some(cached) = content_cache.get(file_path) { | ||
| Some(cached) | ||
| } else { | ||
| let absolute = project_root.join(file_path); | ||
| if let Ok(content) = fs::read_to_string(absolute) { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Semgrep identified a blocking 🔴 issue in your code: File path constructed from untrusted configuration rules allows path traversal attack, enabling attackers to read arbitrary files outside the intended directory. More details about thisThe code reads file content from a path constructed by joining Dataflow graphflowchart LR
classDef invis fill:white, stroke: none
classDef default fill:#e7f5ff, color:#1c7fd6, stroke: none
subgraph File0["<b>src/skills/detect.rs</b>"]
direction LR
%% Source
subgraph Source
direction LR
v0["<a href=https://github.com/dallay/agentsync/blob/564017c1055a4a806d0d9bf8e92d60869fe8220a/src/skills/detect.rs#L335 target=_blank style='text-decoration:none; color:#1c7fd6'>[Line: 335] project_root</a>"]
end
%% Intermediate
subgraph Traces0[Traces]
direction TB
v2["<a href=https://github.com/dallay/agentsync/blob/564017c1055a4a806d0d9bf8e92d60869fe8220a/src/skills/detect.rs#L335 target=_blank style='text-decoration:none; color:#1c7fd6'>[Line: 335] metadata</a>"]
v3["<a href=https://github.com/dallay/agentsync/blob/564017c1055a4a806d0d9bf8e92d60869fe8220a/src/skills/detect.rs#L358 target=_blank style='text-decoration:none; color:#1c7fd6'>[Line: 358] &</a>"]
v4["<a href=https://github.com/dallay/agentsync/blob/564017c1055a4a806d0d9bf8e92d60869fe8220a/src/skills/detect.rs#L358 target=_blank style='text-decoration:none; color:#1c7fd6'>[Line: 358] in</a>"]
v5["<a href=https://github.com/dallay/agentsync/blob/564017c1055a4a806d0d9bf8e92d60869fe8220a/src/skills/detect.rs#L358 target=_blank style='text-decoration:none; color:#1c7fd6'>[Line: 358] rel_nested_dir</a>"]
v6["<a href=https://github.com/dallay/agentsync/blob/564017c1055a4a806d0d9bf8e92d60869fe8220a/src/skills/detect.rs#L359 target=_blank style='text-decoration:none; color:#1c7fd6'>[Line: 359] nested_dir</a>"]
v7["<a href=https://github.com/dallay/agentsync/blob/564017c1055a4a806d0d9bf8e92d60869fe8220a/src/skills/detect.rs#L375 target=_blank style='text-decoration:none; color:#1c7fd6'>[Line: 375] &</a>"]
v8["<a href=https://github.com/dallay/agentsync/blob/564017c1055a4a806d0d9bf8e92d60869fe8220a/src/skills/detect.rs#L374 target=_blank style='text-decoration:none; color:#1c7fd6'>[Line: 374] evaluate_rules</a>"]
v9["<a href=https://github.com/dallay/agentsync/blob/564017c1055a4a806d0d9bf8e92d60869fe8220a/src/skills/detect.rs#L421 target=_blank style='text-decoration:none; color:#1c7fd6'>[Line: 421] project_root</a>"]
v10["<a href=https://github.com/dallay/agentsync/blob/564017c1055a4a806d0d9bf8e92d60869fe8220a/src/skills/detect.rs#L482 target=_blank style='text-decoration:none; color:#1c7fd6'>[Line: 482] absolute</a>"]
end
v2 --> v3
v3 --> v4
v4 --> v5
v5 --> v6
v6 --> v7
v7 --> v8
v8 --> v9
v9 --> v10
%% Sink
subgraph Sink
direction LR
v1["<a href=https://github.com/dallay/agentsync/blob/564017c1055a4a806d0d9bf8e92d60869fe8220a/src/skills/detect.rs#L483 target=_blank style='text-decoration:none; color:#1c7fd6'>[Line: 483] absolute</a>"]
end
end
%% Class Assignment
Source:::invis
Sink:::invis
Traces0:::invis
File0:::invis
%% Connections
Source --> Traces0
Traces0 --> Sink
To resolve this comment: ✨ Commit fix suggestion
Canonicalizing the path with 💬 Ignore this findingReply with Semgrep commands to ignore this finding.
Alternatively, triage in Semgrep AppSec Platform to ignore the finding created by tainted-path. You can view more details about this finding in the Semgrep AppSec Platform. |
||
| content_cache.insert(file_path.clone(), content); | ||
| content_cache.get(file_path) | ||
| } else { | ||
| None | ||
| } | ||
| }; | ||
|
|
||
| if let Some(content) = content { | ||
| for pattern in &content_rules.patterns { | ||
| if pattern.is_match(&content) { | ||
| if pattern.is_match(content) { | ||
| let display = file_path.display().to_string(); | ||
| return Some(make_detection( | ||
| tech_id, | ||
|
|
@@ -501,39 +549,19 @@ fn gather_content_scan_files( | |
| let mut files = Vec::new(); | ||
|
|
||
| if rules.scan_gradle_layout { | ||
| // Root-level Gradle files | ||
| for name in &[ | ||
| "build.gradle.kts", | ||
| "build.gradle", | ||
| "settings.gradle.kts", | ||
| "settings.gradle", | ||
| "gradle/libs.versions.toml", | ||
| ] { | ||
| let path = PathBuf::from(name); | ||
| if metadata.paths.contains(&path) { | ||
| files.push(path); | ||
| } | ||
| } | ||
|
|
||
| // Optimization: Use pre-calculated root_dirs from metadata to avoid | ||
| // re-filtering the entire directory set on every tech rule evaluation. | ||
| for dir in &metadata.root_dirs { | ||
| for build_file in &["build.gradle.kts", "build.gradle"] { | ||
| let path = dir.join(build_file); | ||
| if metadata.paths.contains(&path) { | ||
| files.push(path); | ||
| } | ||
| } | ||
| } | ||
| // Optimization: Use pre-discovered Gradle files from metadata to avoid | ||
| // dynamic path construction and repeated lookups in the path set. | ||
| files.extend(metadata.gradle_files.iter().cloned()); | ||
| } | ||
|
|
||
| if let Some(explicit_files) = &rules.files { | ||
| for file in explicit_files { | ||
| let path = PathBuf::from(file); | ||
| if (metadata.paths.contains(&path) || project_root.join(&path).exists()) | ||
| && !files.contains(&path) | ||
| for path in explicit_files { | ||
| // Optimization: check metadata.paths first (O(1)), only fallback to fs for deep matches. | ||
| // Avoid duplicate entries if explicit files overlap with Gradle files. | ||
| if (metadata.paths.contains(path) || project_root.join(path).exists()) | ||
| && !files.contains(path) | ||
| { | ||
| files.push(path); | ||
| files.push(path.clone()); | ||
| } | ||
| } | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add a blank line after the heading to satisfy Markdown linting.
Line 156 is missing the required blank line below the heading (
MD022), which can keep doc-lint warnings noisy in CI.Suggested fix
## 2026-05-24 - Content Caching and Gradle Discovery for Tech Detection + **Learning:** Even with single-pass metadata, technology detection was performing redundant I/O and allocations by re-reading shared configuration files (e.g., `setup.py`) and re-calculating Gradle layout paths for every matching rule. This created an \(O(T \times F)\) overhead in I/O and heap allocations.📝 Committable suggestion
🧰 Tools
🪛 markdownlint-cli2 (0.22.1)
[warning] 156-156: Headings should be surrounded by blank lines
Expected: 1; Actual: 0; Below
(MD022, blanks-around-headings)
🤖 Prompt for AI Agents