Skip to content
51 changes: 48 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -218,13 +218,57 @@ $ autorepro scan --json
"detected": [],
"languages": {}
}

# Enhanced scanning with depth control
$ autorepro scan --depth 0
Detected: python
- python -> pyproject.toml

$ autorepro scan --depth 2
Detected: node, python
- node -> package.json
- python -> pyproject.toml

# Filtering with ignore patterns
$ autorepro scan --depth 2 --ignore 'node_modules/**' --ignore 'dist/**'
Detected: python
- python -> pyproject.toml

# Respecting .gitignore rules
$ autorepro scan --respect-gitignore
Detected: python
- python -> pyproject.toml

# JSON with file samples
$ autorepro scan --json --show 3
{
"detected": ["python"],
"languages": {
"python": {
"score": 4,
"reasons": [...],
"files_sample": ["./pyproject.toml", "./main.py", "./utils.py"]
}
}
}
```

**Status:** `scan` is implemented with a weighted scoring system and dual output formats (text/JSON).
**Status:** `scan` is implemented with a weighted scoring system, dual output formats (text/JSON), and enhanced hierarchical scanning capabilities.

**Scan Options:**
- `--json`: Output in JSON format with scores and detailed reasons
- `--show-scores`: Add score lines to text output (ignored with `--json`)
- `--depth N`: Maximum depth to scan (0 for root only, default: unlimited)
- `--ignore PATTERN`: Ignore files/directories matching pattern (repeatable)
- `--respect-gitignore`: Respect .gitignore rules when scanning
- `--show N`: Number of sample files per language to include in JSON output (default: 5)

**Enhanced Scanning Features:**
- **Hierarchical scanning**: Control scan depth with `--depth` parameter
- **Pattern-based filtering**: Use `--ignore` to exclude files/directories by glob patterns
- **Gitignore integration**: `--respect-gitignore` honors .gitignore rules including negation patterns (`!pattern`)
- **File sampling**: JSON output includes `files_sample` array with up to N sample files per language
- **Stable ordering**: Sample files are sorted deterministically for consistent results

**Weighted Scoring System:**
- **Lock files (weight 4)**: `pnpm-lock.yaml`, `yarn.lock`, `npm-shrinkwrap.json`, `package-lock.json`, `go.sum`, `Cargo.lock`
Expand All @@ -233,9 +277,10 @@ $ autorepro scan --json
- **Source files (weight 1)**: `*.py`, `*.go`, `*.rs`, `*.java`, `*.cs`, `*.js`, `*.ts`, etc.

**Scan Behavior:**
- **Root-only**: Scans only the current directory (non-recursive)
- **Deterministic ordering**: Languages and reasons are sorted alphabetically
- **Configurable depth**: `--depth 0` scans the root only, `--depth N` scans N levels deep, and omitting the flag scans to unlimited depth
- **Deterministic ordering**: Languages, reasons, and file samples are sorted alphabetically
- **Score accumulation**: Multiple indicators for the same language add their weights together
- **Filtering integration**: Ignored files don't contribute to detection scores or language presence
- **Exit code 0**: Always succeeds, even with no detections

**Supported Languages:**
Expand Down
100 changes: 83 additions & 17 deletions autorepro/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,27 @@ def _setup_scan_parser(subparsers) -> argparse.ArgumentParser:
action="store_true",
help="Show scores in text output (only effective without --json)",
)
scan_parser.add_argument(
"--depth",
type=int,
help="Maximum depth to scan (0 for root only, default: unlimited)",
)
scan_parser.add_argument(
"--ignore",
action="append",
default=[],
help="Ignore files/directories matching pattern (can be specified multiple times)",
)
scan_parser.add_argument(
"--respect-gitignore",
action="store_true",
help="Respect .gitignore rules when scanning",
)
scan_parser.add_argument(
"--show",
type=int,
help="Number of sample files per language to include in JSON output (default: 5)",
)
scan_parser.add_argument(
"-q",
"--quiet",
Expand Down Expand Up @@ -489,12 +510,28 @@ def create_parser() -> argparse.ArgumentParser:
@time_execution(log_threshold=0.5)
@handle_errors({}, default_return=1, log_errors=True)
@log_operation("language detection scan")
def cmd_scan(json_output: bool = False, show_scores: bool = False) -> int:
def cmd_scan( # noqa: PLR0913
json_output: bool = False,
show_scores: bool = False,
depth: int | None = None,
ignore_patterns: list[str] | None = None,
respect_gitignore: bool = False,
show_files_sample: int | None = None,
) -> int:
"""Handle the scan command."""
if ignore_patterns is None:
ignore_patterns = []

if json_output:
# Use new weighted evidence collection for JSON output
try:
evidence = collect_evidence(Path("."))
evidence = collect_evidence(
Path("."),
depth=depth,
ignore_patterns=ignore_patterns,
respect_gitignore=respect_gitignore,
show_files_sample=show_files_sample,
)
detected_languages = sorted(evidence.keys())
except (OSError, PermissionError):
# Handle I/O errors gracefully for JSON output - return empty results
Expand All @@ -516,31 +553,48 @@ def cmd_scan(json_output: bool = False, show_scores: bool = False) -> int:
print(json.dumps(json_result, indent=2))
return 0
else:
# Use legacy text output
detected = detect_languages(".")
# Use enhanced evidence collection for text output too
try:
evidence = collect_evidence(
Path("."),
depth=depth,
ignore_patterns=ignore_patterns,
respect_gitignore=respect_gitignore,
)
except (OSError, PermissionError):
print("No known languages detected.")
return 0

if not detected:
if not evidence:
print("No known languages detected.")
return 0

# Extract language names for header
languages = [lang for lang, _ in detected]
# Extract language names for header (sorted)
languages = sorted(evidence.keys())
print(f"Detected: {', '.join(languages)}")

# Print details for each language
for lang, reasons in detected:
reasons_str = ", ".join(reasons)
for lang in languages:
lang_data = evidence[lang]
reasons = lang_data.get("reasons", [])

# Extract unique patterns for display (with type check)
if isinstance(reasons, list):
patterns = list(
dict.fromkeys(
reason["pattern"]
for reason in reasons
if isinstance(reason, dict)
)
)
reasons_str = ", ".join(patterns)
else:
reasons_str = "unknown"
print(f"- {lang} -> {reasons_str}")

# Add score if --show-scores is enabled
if show_scores:
try:
evidence = collect_evidence(Path("."))
if lang in evidence:
print(f" Score: {evidence[lang]['score']}")
except (OSError, PermissionError):
# Skip scores if evidence collection fails
pass
print(f" Score: {lang_data['score']}")

return 0

Expand Down Expand Up @@ -1911,9 +1965,21 @@ def _dispatch_scan_command(args) -> int:
# Load settings and apply plugins before any rule usage
settings = _get_project_settings(args)
_apply_plugins_env(settings)

# Determine show_files_sample value
show_value = getattr(args, "show", None)
json_output = getattr(args, "json", False)
show_files_sample = (
show_value if show_value is not None else (5 if json_output else None)
)

return cmd_scan(
json_output=getattr(args, "json", False),
json_output=json_output,
show_scores=getattr(args, "show_scores", False),
depth=getattr(args, "depth", None),
ignore_patterns=getattr(args, "ignore", []),
respect_gitignore=getattr(args, "respect_gitignore", False),
show_files_sample=show_files_sample,
)


Expand Down
Loading
Loading