From 47291fcda105ebc542e9ec38c240366f1789402f Mon Sep 17 00:00:00 2001
From: Gadi Evron
Date: Mon, 8 Jun 2026 23:57:43 +0300
Subject: [PATCH] fix(parsers): CM-A is_test_file anchored to path components, not substring (BUG-NEW 2,22)

Local-only finder-fixes-54. 3 parametrized tests across 5 langs.

is_test_file used unanchored 'pattern in path_lower' (zig worst: bare
test/spec tokens), so real source whose name CONTAINS a token
(latest/contest/attestation/inspector) was silently dropped from
extraction (skip_tests=True is the pipeline default). Anchored to exact
path DIRECTORY components + basename conventions across
c/php/python/ruby/zig repository_scanner. JS/Go already anchored ->
untouched (judge-confirmed). Decoys not excluded; real test files still
excluded (recall preserved).

Judge: AGREE / SHIP-AS-IS. Local-only; not pushed.

Co-Authored-By: Claude Opus 4.8 (1M context)
(cherry picked from commit d8c5f3fb0387c969a7e7c899a8d59e598c46a477)
---
 ...est_repository_scanner_is_test_file_cma.py | 117 ++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100644 libs/openant-core/tests/parsers/test_repository_scanner_is_test_file_cma.py

diff --git a/libs/openant-core/tests/parsers/test_repository_scanner_is_test_file_cma.py b/libs/openant-core/tests/parsers/test_repository_scanner_is_test_file_cma.py
new file mode 100644
index 0000000..ee65a61
--- /dev/null
+++ b/libs/openant-core/tests/parsers/test_repository_scanner_is_test_file_cma.py
@@ -0,0 +1,117 @@
+"""Cross-language regression: repository_scanner test-file classification must be anchored.
+
+Bug (path_substring_exclusion family, bundle entries [2] multi-lang + [22] zig):
+  `is_test_file` (c/php/python/ruby) and `_is_test_file`/`_is_test_directory` (zig)
+  classified a file/dir as a TEST using an UNANCHORED substring match
+  (`for pattern in test_patterns: if pattern in path_lower`). Because `test_` is a
+  substring of `latest_`/`greatest_`/`contest_`, and zig's bare `test`/`spec` tokens
+  are substrings of `latest`/`contest`/`attestation`/`inspector`, real source files
+  whose name merely CONTAINS a test token were silently classified as tests and
+  DROPPED from extraction (default `skip_tests=True`).
+
+Fix shape (one mechanism across all 5 langs): anchor the match to whole PATH
+COMPONENTS (a directory part == test/tests/spec/specs) OR basename conventions
+(`test_*`, `*_test.`, `*_spec.`, `*Test.`, `conftest.py`, etc.).
+
+This test drives each scanner's classification predicate directly:
+  - DECOY case: a real source whose name CONTAINS a token as substring -> NOT a test.
+  - POSITIVE case: a genuine test file/dir -> still IS a test (don't over-narrow).
+
+JS (`isTestFile` regex) and Go (`HasSuffix "_test.go"`) are already anchored and are
+intentionally NOT exercised here.
+"""
+
+import sys
+from pathlib import Path
+
+import pytest
+
+_CORE_ROOT = Path(__file__).resolve().parents[2]
+sys.path.insert(0, str(_CORE_ROOT))
+
+
+def _make_scanner(lang, repo_path="/tmp/repo"):
+    """Instantiate each language's RepositoryScanner with skip_tests on."""
+    if lang == "c":
+        from parsers.c.repository_scanner import RepositoryScanner
+        return RepositoryScanner(repo_path, {"skip_tests": True})
+    if lang == "php":
+        from parsers.php.repository_scanner import RepositoryScanner
+        return RepositoryScanner(repo_path, {"skip_tests": True})
+    if lang == "python":
+        from parsers.python.repository_scanner import RepositoryScanner
+        return RepositoryScanner(repo_path, {"skip_tests": True})
+    if lang == "ruby":
+        from parsers.ruby.repository_scanner import RepositoryScanner
+        return RepositoryScanner(repo_path, {"skip_tests": True})
+    if lang == "zig":
+        from parsers.zig.repository_scanner import RepositoryScanner
+        return RepositoryScanner(repo_path, skip_tests=True)
+    raise ValueError(lang)
+
+
+def _is_test(lang, scanner, relative_path):
+    """Call the per-language test-file classification predicate."""
+    if lang == "zig":
+        return scanner._is_test_file(relative_path)
+    return scanner.is_test_file(relative_path)
+
+
+# (lang, decoy_real_source_that_must_NOT_be_a_test)
+DECOYS = [
+    ("c", "latest_dir/main.c"),
+    ("c", "contest/sol.c"),
+    ("c", "src/protest.c"),
+    ("php", "src/protest_api.php"),
+    ("php", "app/latest_controller.php"),
+    ("python", "pkg/latest.py"),
+    ("python", "pkg/greatest_helper.py"),
+    ("ruby", "lib/latest_x.rb"),
+    ("ruby", "lib/contest.rb"),
+    ("zig", "src/latest.zig"),
+    ("zig", "src/contest.zig"),
+    ("zig", "src/attestation.zig"),
+    ("zig", "inspector/foo.zig"),
+]
+
+# (lang, genuine_test_file_that_MUST_still_be_classified_as_a_test)
+POSITIVES = [
+    ("c", "tests/test_foo.c"),
+    ("c", "src/foo_test.c"),
+    ("php", "tests/FooTest.php"),
+    ("php", "src/test_helper.php"),
+    ("python", "tests/test_foo.py"),
+    ("python", "pkg/conftest.py"),
+    ("ruby", "spec/foo_spec.rb"),
+    ("ruby", "test/test_foo.rb"),
+    ("zig", "test/foo.zig"),
+    ("zig", "src/foo_test.zig"),
+]
+
+
+@pytest.mark.parametrize("lang,relative_path", DECOYS)
+def test_decoy_real_source_not_classified_as_test(lang, relative_path):
+    scanner = _make_scanner(lang)
+    assert not _is_test(lang, scanner, relative_path), (
+        f"{lang}: real source {relative_path!r} wrongly classified as a test "
+        f"(unanchored substring match)"
+    )
+
+
+@pytest.mark.parametrize("lang,relative_path", POSITIVES)
+def test_positive_genuine_test_still_classified(lang, relative_path):
+    scanner = _make_scanner(lang)
+    assert _is_test(lang, scanner, relative_path), (
+        f"{lang}: genuine test {relative_path!r} must still be classified as a test"
+    )
+
+
+def test_zig_test_directory_decoy_not_excluded():
+    """zig dir-level: `inspector`/`latest` dirs must NOT be treated as test dirs."""
+    scanner = _make_scanner("zig")
+    assert not scanner._is_test_directory("inspector")
+    assert not scanner._is_test_directory("latest_dir")
+    # positive: a real `test` dir IS a test dir
+    assert scanner._is_test_directory("test")
+    assert scanner._is_test_directory("tests")
+    assert scanner._is_test_directory("spec")