From 18f40d0ddb8475c2abc49c6c722ba9d64cbf3a44 Mon Sep 17 00:00:00 2001 From: Abhinav Pradeep Date: Fri, 5 Dec 2025 19:47:11 +1000 Subject: [PATCH 01/20] feat: add support to use inferred build tools and to extract tool-specific build dependency information (#1256) Adds support to use inferred build tools and to extract tool-specific build dependency information for buildspec generation. Signed-off-by: Abhinav Pradeep --- .../common_spec/pypi_spec.py | 44 +++++++- .../dockerfile/pypi_dockerfile_output.py | 15 ++- .../test_pypi_dockerfile_output.ambr | 2 +- .../compare_dockerfile_buildspec.py | 106 ++++++++++++++++++ .../expected_default.buildspec | 4 +- .../expected_dockerfile.buildspec | 50 +++++++++ .../cases/pypi_cachetools/test.yaml | 14 +++ .../expected_dockerfile.buildspec | 50 +++++++++ .../cases/pypi_markdown-it-py/test.yaml | 14 +++ .../pypi_toga/expected_default.buildspec | 4 +- .../pypi_toga/expected_dockerfile.buildspec | 50 +++++++++ tests/integration/cases/pypi_toga/test.yaml | 14 +++ tests/integration/run.py | 1 + 13 files changed, 361 insertions(+), 7 deletions(-) create mode 100644 tests/build_spec_generator/dockerfile/compare_dockerfile_buildspec.py create mode 100644 tests/integration/cases/pypi_cachetools/expected_dockerfile.buildspec create mode 100644 tests/integration/cases/pypi_markdown-it-py/expected_dockerfile.buildspec create mode 100644 tests/integration/cases/pypi_toga/expected_dockerfile.buildspec diff --git a/src/macaron/build_spec_generator/common_spec/pypi_spec.py b/src/macaron/build_spec_generator/common_spec/pypi_spec.py index 0f7d78824..bb90ba6a1 100644 --- a/src/macaron/build_spec_generator/common_spec/pypi_spec.py +++ b/src/macaron/build_spec_generator/common_spec/pypi_spec.py @@ -6,6 +6,7 @@ import logging import os import re +from typing import Any import tomli from packageurl import PackageURL @@ -67,15 +68,17 @@ def get_default_build_commands( match build_tool_name: case "pip": - default_build_commands.append("python -m build".split()) + default_build_commands.append("python -m build --wheel -n".split()) case "poetry": default_build_commands.append("poetry build".split()) case "flit": + # We might also want to deal with existence flit.ini, we can do so via + # "python -m flit.tomlify" default_build_commands.append("flit build".split()) case "hatch": default_build_commands.append("hatch build".split()) case "conda": - default_build_commands.append("conda build".split()) + default_build_commands.append('echo("Not supported")'.split()) case _: pass @@ -156,6 +159,7 @@ def resolve_fields(self, purl: PackageURL) -> None: try: with pypi_package_json.sourcecode(): try: + # Get the build time requirements from ["build-system", "requires"] pyproject_content = pypi_package_json.get_sourcecode_file_contents("pyproject.toml") content = tomli.loads(pyproject_content.decode("utf-8")) requires = json_extract(content, ["build-system", "requires"], list) @@ -164,10 +168,10 @@ def resolve_fields(self, purl: PackageURL) -> None: backend = json_extract(content, ["build-system", "build-backend"], str) if backend: build_backends_set.add(backend.replace(" ", "")) - python_version_constraint = json_extract(content, ["project", "requires-python"], str) if python_version_constraint: python_version_set.add(python_version_constraint.replace(" ", "")) + self.apply_tool_specific_inferences(build_requires_set, python_version_set, content) logger.debug( "After analyzing pyproject.toml from the sdist: build-requires: %s, build_backend: %s", build_requires_set, @@ -239,6 +243,40 @@ 
def resolve_fields(self, purl: PackageURL) -> None: self.data["build_commands"] = patched_build_commands + def apply_tool_specific_inferences( + self, build_requires_set: set[str], python_version_set: set[str], pyproject_contents: dict[str, Any] + ) -> None: + """ + Based on build tools inferred, look into the pyproject.toml for related additional dependencies. + + Parameters + ---------- + build_requires_set: set[str] + Set of build requirements to populate. + python_version_set: set[str] + Set of compatible interpreter versions to populate. + pyproject_contents: dict[str, Any] + Parsed contents of the pyproject.toml file. + """ + # If we have hatch as a build_tool, we will examine [tool.hatch.build.hooks.*] to + # look for any additional build dependencies declared there. + if "hatch" in self.data["build_tools"]: + # Look for [tool.hatch.build.hooks.*] + hatch_build_hooks = json_extract(pyproject_contents, ["tool", "hatch", "build", "hooks"], dict) + if hatch_build_hooks: + for _, section in hatch_build_hooks.items(): + dependencies = section.get("dependencies") + if dependencies: + build_requires_set.update(elem.replace(" ", "") for elem in dependencies) + # If we have flit as a build_tool, we will check if the legacy header [tool.flit.metadata] exists, + # and if so, check to see if we can use its "requires-python". + if "flit" in self.data["build_tools"]: + flit_python_version_constraint = json_extract( + pyproject_contents, ["tool", "flit", "metadata", "requires-python"], str + ) + if flit_python_version_constraint: + python_version_set.add(flit_python_version_constraint.replace(" ", "")) + def read_directory(self, wheel_path: str, purl: PackageURL) -> tuple[str, str]: """ Read in the WHEEL and METADATA file from the .dist_info directory. diff --git a/src/macaron/build_spec_generator/dockerfile/pypi_dockerfile_output.py b/src/macaron/build_spec_generator/dockerfile/pypi_dockerfile_output.py index fd41f063c..ef2360a5c 100644 --- a/src/macaron/build_spec_generator/dockerfile/pypi_dockerfile_output.py +++ b/src/macaron/build_spec_generator/dockerfile/pypi_dockerfile_output.py @@ -38,6 +38,17 @@ def gen_dockerfile(buildspec: BaseBuildSpecDict) -> str: logger.debug("Could not derive a specific interpreter version.") raise GenerateBuildSpecError("Could not derive specific interpreter version.") backend_install_commands: str = " && ".join(build_backend_commands(buildspec)) + build_tool_install: str = "" + if ( + buildspec["build_tools"][0] != "pip" + and buildspec["build_tools"][0] != "conda" + and buildspec["build_tools"][0] != "flit" + ): + build_tool_install = f"pip install {buildspec['build_tools'][0]} && " + elif buildspec["build_tools"][0] == "flit": + build_tool_install = ( + f"pip install {buildspec['build_tools'][0]} && if test -f \"flit.ini\"; then python -m flit.tomlify; fi && " + ) dockerfile_content = f""" #syntax=docker/dockerfile:1.10 FROM oraclelinux:9 @@ -87,7 +98,7 @@ def gen_dockerfile(buildspec: BaseBuildSpecDict) -> str: EOF # Run the build - RUN /deps/bin/python -m build --wheel -n + RUN {"source /deps/bin/activate && " + build_tool_install + " ".join(x for x in buildspec["build_commands"][0])} """ return dedent(dockerfile_content) @@ -148,4 +159,6 @@ def build_backend_commands(buildspec: BaseBuildSpecDict) -> list[str]: commands: list[str] = [] for backend, version_constraint in buildspec["build_requires"].items(): commands.append(f'/deps/bin/pip install "{backend}{version_constraint}"') + # For a stable order on the install commands + commands.sort() return commands 
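A minimal sketch of the pyproject.toml data that the new apply_tool_specific_inferences method consumes, assuming a hypothetical hatch project; the hook name ("custom") and the dependency below are illustrative, not taken from the patch:

    # Hypothetical parsed pyproject.toml contents for a hatch-based project.
    pyproject_contents = {
        "build-system": {"requires": ["hatchling"], "build-backend": "hatchling.build"},
        "tool": {"hatch": {"build": {"hooks": {"custom": {"dependencies": ["setuptools-scm >= 8"]}}}}},
    }
    build_requires_set: set[str] = set()
    # With "hatch" among the inferred build tools, each hook's declared dependencies are
    # added with whitespace stripped, so build_requires_set becomes {"setuptools-scm>=8"}.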
diff --git a/tests/build_spec_generator/dockerfile/__snapshots__/test_pypi_dockerfile_output.ambr b/tests/build_spec_generator/dockerfile/__snapshots__/test_pypi_dockerfile_output.ambr
index a39631a05..696ee6f8d 100644
--- a/tests/build_spec_generator/dockerfile/__snapshots__/test_pypi_dockerfile_output.ambr
+++ b/tests/build_spec_generator/dockerfile/__snapshots__/test_pypi_dockerfile_output.ambr
@@ -50,7 +50,7 @@
   EOF
 
   # Run the build
-  RUN /deps/bin/python -m build --wheel -n
+  RUN source /deps/bin/activate && python -m build
 
   '''
 # ---
diff --git a/tests/build_spec_generator/dockerfile/compare_dockerfile_buildspec.py b/tests/build_spec_generator/dockerfile/compare_dockerfile_buildspec.py
new file mode 100644
index 000000000..8c181d8d5
--- /dev/null
+++ b/tests/build_spec_generator/dockerfile/compare_dockerfile_buildspec.py
@@ -0,0 +1,106 @@
+# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+"""Script to compare a generated dockerfile buildspec."""
+
+import argparse
+import logging
+from collections.abc import Callable
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+logging.basicConfig(format="[%(filename)s:%(lineno)s %(tag)s] %(message)s")
+
+
+def log_with_tag(tag: str) -> Callable[[str], None]:
+    """Generate a log function that prints the name of the file and a tag at the beginning of each line."""
+
+    def log_fn(msg: str) -> None:
+        logger.info(msg, extra={"tag": tag})
+
+    return log_fn
+
+
+log_info = log_with_tag("INFO")
+log_err = log_with_tag("ERROR")
+log_passed = log_with_tag("PASSED")
+log_failed = log_with_tag("FAILED")
+
+
+def log_diff(result: str, expected: str) -> None:
+    """Pretty-print the diff of two strings."""
+    output = [
+        *("---- Result ----", result),
+        *("---- Expected ----", expected),
+        "-----------------",
+    ]
+    log_info("\n".join(output))
+
+
+def main() -> int:
+    """Compare a Macaron-generated dockerfile buildspec.
+
+    Returns
+    -------
+    int
+        0 if the generated dockerfile matches the expected output, or non-zero otherwise.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("result_dockerfile", help="the result dockerfile buildspec")
+    parser.add_argument("expected_dockerfile_buildspec", help="the expected dockerfile buildspec")
+    args = parser.parse_args()
+
+    # Load both files
+    with open(args.result_dockerfile, encoding="utf-8") as file:
+        buildspec = normalize(file.read())
+
+    with open(args.expected_dockerfile_buildspec, encoding="utf-8") as file:
+        expected_buildspec = normalize(file.read())
+
+    log_info(
+        f"Comparing the dockerfile buildspec {args.result_dockerfile} with the expected "
+        + f"output dockerfile {args.expected_dockerfile_buildspec}"
+    )
+
+    # Compare the files
+    return compare(buildspec, expected_buildspec)
+
+
+def normalize(contents: str) -> list[str]:
+    """Convert a string of file contents to a list of its non-empty lines."""
+    return [line.strip() for line in contents.splitlines() if line.strip()]
+
+
+def compare(buildspec: list[str], expected_buildspec: list[str]) -> int:
+    """Compare the lines in the two files directly.
+
+    Return early when a difference is found. If the lengths mismatch
+    but the first safe_index_max lines are the same, print
+    the missing/extra lines.
+
+    Returns
+    -------
+    int
+        0 if the generated dockerfile matches the expected output, or non-zero otherwise.
+ """ + safe_index_max = min(len(buildspec), len(expected_buildspec)) + for index in range(safe_index_max): + if buildspec[index] != expected_buildspec[index]: + # Log error + log_err("Mismatch found:") + # Log diff + log_diff(buildspec[index], expected_buildspec[index]) + return 1 + if safe_index_max < len(expected_buildspec): + log_err("Mismatch found: result is missing trailing lines") + log_diff("", "\n".join(expected_buildspec[safe_index_max:])) + return 1 + if safe_index_max < len(buildspec): + log_err("Mismatch found: result has extra trailing lines") + log_diff("\n".join(buildspec[safe_index_max:]), "") + return 1 + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/integration/cases/pypi_cachetools/expected_default.buildspec b/tests/integration/cases/pypi_cachetools/expected_default.buildspec index 5af209e96..0b5d8acfa 100644 --- a/tests/integration/cases/pypi_cachetools/expected_default.buildspec +++ b/tests/integration/cases/pypi_cachetools/expected_default.buildspec @@ -19,7 +19,9 @@ [ "python", "-m", - "build" + "build", + "--wheel", + "-n" ] ], "build_requires": { diff --git a/tests/integration/cases/pypi_cachetools/expected_dockerfile.buildspec b/tests/integration/cases/pypi_cachetools/expected_dockerfile.buildspec new file mode 100644 index 000000000..749757f91 --- /dev/null +++ b/tests/integration/cases/pypi_cachetools/expected_dockerfile.buildspec @@ -0,0 +1,50 @@ + +#syntax=docker/dockerfile:1.10 +FROM oraclelinux:9 + +# Install core tools +RUN dnf -y install which wget tar git + +# Install compiler and make +RUN dnf -y install gcc make + +# Download and unzip interpreter +RUN <=3.4" + /deps/bin/pip install build +EOF + +# Run the build +RUN source /deps/bin/activate && pip install flit && if test -f "flit.ini"; then python -m flit.tomlify; fi && flit build diff --git a/tests/integration/cases/pypi_markdown-it-py/test.yaml b/tests/integration/cases/pypi_markdown-it-py/test.yaml index a57b7d2cf..d3b0b365a 100644 --- a/tests/integration/cases/pypi_markdown-it-py/test.yaml +++ b/tests/integration/cases/pypi_markdown-it-py/test.yaml @@ -27,3 +27,17 @@ steps: kind: default_build_spec result: output/buildspec/pypi/markdown-it-py/macaron.buildspec expected: expected_default.buildspec +- name: Generate the buildspec + kind: gen-build-spec + options: + command_args: + - -purl + - pkg:pypi/markdown-it-py@4.0.0 + - --output-format + - dockerfile +- name: Compare Dockerfile + kind: compare + options: + kind: dockerfile_build_spec + result: output/buildspec/pypi/markdown-it-py/dockerfile.buildspec + expected: expected_dockerfile.buildspec diff --git a/tests/integration/cases/pypi_toga/expected_default.buildspec b/tests/integration/cases/pypi_toga/expected_default.buildspec index ffb146e81..819113207 100644 --- a/tests/integration/cases/pypi_toga/expected_default.buildspec +++ b/tests/integration/cases/pypi_toga/expected_default.buildspec @@ -19,7 +19,9 @@ [ "python", "-m", - "build" + "build", + "--wheel", + "-n" ] ], "build_requires": { diff --git a/tests/integration/cases/pypi_toga/expected_dockerfile.buildspec b/tests/integration/cases/pypi_toga/expected_dockerfile.buildspec new file mode 100644 index 000000000..47e1e012a --- /dev/null +++ b/tests/integration/cases/pypi_toga/expected_dockerfile.buildspec @@ -0,0 +1,50 @@ + +#syntax=docker/dockerfile:1.10 +FROM oraclelinux:9 + +# Install core tools +RUN dnf -y install which wget tar git + +# Install compiler and make +RUN dnf -y install gcc make + +# Download and unzip interpreter +RUN < None: 
"find_source": ["tests", "find_source", "compare_source_reports.py"], "rc_build_spec": ["tests", "build_spec_generator", "reproducible_central", "compare_rc_build_spec.py"], "default_build_spec": ["tests", "build_spec_generator", "common_spec", "compare_default_buildspec.py"], + "dockerfile_build_spec": ["tests", "build_spec_generator", "dockerfile", "compare_dockerfile_buildspec.py"], } VALIDATE_SCHEMA_SCRIPTS: dict[str, Sequence[str]] = { From d6627dfad917bcfffec92e38eda0e42903e739bc Mon Sep 17 00:00:00 2001 From: Nicholas Allen Date: Thu, 11 Dec 2025 13:51:44 +1000 Subject: [PATCH 02/20] feat: add new dataflow analysis, replacing existing analysis for GitHub Actions (#1229) Signed-off-by: Nicholas Allen --- docs/source/conf.py | 6 +- ...acaron.build_spec_generator.dockerfile.rst | 26 + .../apidoc/macaron.build_spec_generator.rst | 1 + ...acaron.code_analyzer.dataflow_analysis.rst | 98 + .../apidoc/macaron.code_analyzer.rst | 13 +- .../apidoc/macaron.parsers.rst | 8 + .../apidoc/macaron.repo_finder.rst | 8 + .../macaron.slsa_analyzer.build_tool.rst | 32 + ...lsa_analyzer.ci_service.github_actions.rst | 8 - golang/cmd/bashexprparser/bashexprparser.go | 59 + golang/cmd/bashparser/bashparser.go | 7 +- golang/internal/bashparser/bashparser.go | 63 +- pyproject.toml | 2 + src/macaron/code_analyzer/call_graph.py | 104 - .../dataflow_analysis/__init__.py | 2 + .../dataflow_analysis/analysis.py | 469 ++++ .../code_analyzer/dataflow_analysis/bash.py | 1891 +++++++++++++++++ .../dataflow_analysis/cmd_parser.py | 88 + .../code_analyzer/dataflow_analysis/core.py | 695 ++++++ .../dataflow_analysis/evaluation.py | 772 +++++++ .../code_analyzer/dataflow_analysis/facts.py | 702 ++++++ .../code_analyzer/dataflow_analysis/github.py | 1314 ++++++++++++ .../dataflow_analysis/github_expr.py | 141 ++ .../code_analyzer/dataflow_analysis/models.py | 679 ++++++ .../dataflow_analysis/printing.py | 681 ++++++ .../run_analysis_standalone.py | 46 + src/macaron/parsers/bashparser.py | 243 +-- src/macaron/parsers/bashparser_model.py | 848 ++++++++ .../build_tool/base_build_tool.py | 10 +- .../checks/build_as_code_check.py | 202 +- .../checks/build_script_check.py | 5 +- .../checks/build_service_check.py | 5 +- .../github_actions_vulnerability_check.py | 87 +- .../checks/trusted_builder_l3_check.py | 73 +- .../ci_service/base_ci_service.py | 65 +- .../slsa_analyzer/ci_service/circleci.py | 11 +- .../ci_service/github_actions/analyzer.py | 801 ------- .../github_actions/github_actions_ci.py | 120 +- .../slsa_analyzer/ci_service/gitlab_ci.py | 12 +- .../slsa_analyzer/ci_service/jenkins.py | 148 +- .../slsa_analyzer/ci_service/travis.py | 12 +- src/macaron/slsa_analyzer/specs/ci_spec.py | 6 +- tests/conftest.py | 43 +- tests/parsers/bashparser/test_bashparser.py | 40 +- tests/provenance/test_provenance_finder.py | 6 +- tests/slsa_analyzer/build_tool/test_conda.py | 5 +- tests/slsa_analyzer/build_tool/test_docker.py | 7 +- tests/slsa_analyzer/build_tool/test_flit.py | 5 +- tests/slsa_analyzer/build_tool/test_go.py | 5 +- tests/slsa_analyzer/build_tool/test_gradle.py | 7 +- tests/slsa_analyzer/build_tool/test_hatch.py | 5 +- tests/slsa_analyzer/build_tool/test_maven.py | 7 +- tests/slsa_analyzer/build_tool/test_npm.py | 7 +- tests/slsa_analyzer/build_tool/test_pip.py | 5 +- tests/slsa_analyzer/build_tool/test_poetry.py | 7 +- tests/slsa_analyzer/build_tool/test_yarn.py | 7 +- .../checks/test_build_as_code_check.py | 29 +- .../checks/test_build_service_check.py | 4 +- ...test_github_actions_vulnerability_check.py | 11 +- 
.../test_provenance_l3_content_check.py | 4 +- .../checks/test_trusted_builder_l3_check.py | 25 +- .../ci_service/test_github_actions.py | 63 +- tests/slsa_analyzer/test_analyze_context.py | 4 +- 63 files changed, 9044 insertions(+), 1815 deletions(-) create mode 100644 docs/source/pages/developers_guide/apidoc/macaron.build_spec_generator.dockerfile.rst create mode 100644 docs/source/pages/developers_guide/apidoc/macaron.code_analyzer.dataflow_analysis.rst create mode 100644 golang/cmd/bashexprparser/bashexprparser.go delete mode 100644 src/macaron/code_analyzer/call_graph.py create mode 100644 src/macaron/code_analyzer/dataflow_analysis/__init__.py create mode 100644 src/macaron/code_analyzer/dataflow_analysis/analysis.py create mode 100644 src/macaron/code_analyzer/dataflow_analysis/bash.py create mode 100644 src/macaron/code_analyzer/dataflow_analysis/cmd_parser.py create mode 100644 src/macaron/code_analyzer/dataflow_analysis/core.py create mode 100644 src/macaron/code_analyzer/dataflow_analysis/evaluation.py create mode 100644 src/macaron/code_analyzer/dataflow_analysis/facts.py create mode 100644 src/macaron/code_analyzer/dataflow_analysis/github.py create mode 100644 src/macaron/code_analyzer/dataflow_analysis/github_expr.py create mode 100644 src/macaron/code_analyzer/dataflow_analysis/models.py create mode 100644 src/macaron/code_analyzer/dataflow_analysis/printing.py create mode 100644 src/macaron/code_analyzer/dataflow_analysis/run_analysis_standalone.py create mode 100644 src/macaron/parsers/bashparser_model.py delete mode 100644 src/macaron/slsa_analyzer/ci_service/github_actions/analyzer.py diff --git a/docs/source/conf.py b/docs/source/conf.py index 6bad46788..31a241743 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. # Configuration file for the Sphinx documentation builder. @@ -76,6 +76,10 @@ "", ] +suppress_warnings = [ + "sphinx_autodoc_typehints.forward_reference", # Sphinx has issues with resolving forward references. +] + # We add the docstrings for class constructors in the `__init__` methods. def skip(app, what, name, obj, would_skip, options): diff --git a/docs/source/pages/developers_guide/apidoc/macaron.build_spec_generator.dockerfile.rst b/docs/source/pages/developers_guide/apidoc/macaron.build_spec_generator.dockerfile.rst new file mode 100644 index 000000000..fd655837c --- /dev/null +++ b/docs/source/pages/developers_guide/apidoc/macaron.build_spec_generator.dockerfile.rst @@ -0,0 +1,26 @@ +macaron.build\_spec\_generator.dockerfile package +================================================= + +.. automodule:: macaron.build_spec_generator.dockerfile + :members: + :show-inheritance: + :undoc-members: + +Submodules +---------- + +macaron.build\_spec\_generator.dockerfile.dockerfile\_output module +------------------------------------------------------------------- + +.. automodule:: macaron.build_spec_generator.dockerfile.dockerfile_output + :members: + :show-inheritance: + :undoc-members: + +macaron.build\_spec\_generator.dockerfile.pypi\_dockerfile\_output module +------------------------------------------------------------------------- + +.. 
automodule:: macaron.build_spec_generator.dockerfile.pypi_dockerfile_output + :members: + :show-inheritance: + :undoc-members: diff --git a/docs/source/pages/developers_guide/apidoc/macaron.build_spec_generator.rst b/docs/source/pages/developers_guide/apidoc/macaron.build_spec_generator.rst index 5bc830015..679e381d8 100644 --- a/docs/source/pages/developers_guide/apidoc/macaron.build_spec_generator.rst +++ b/docs/source/pages/developers_guide/apidoc/macaron.build_spec_generator.rst @@ -14,6 +14,7 @@ Subpackages macaron.build_spec_generator.cli_command_parser macaron.build_spec_generator.common_spec + macaron.build_spec_generator.dockerfile macaron.build_spec_generator.reproducible_central Submodules diff --git a/docs/source/pages/developers_guide/apidoc/macaron.code_analyzer.dataflow_analysis.rst b/docs/source/pages/developers_guide/apidoc/macaron.code_analyzer.dataflow_analysis.rst new file mode 100644 index 000000000..343287f28 --- /dev/null +++ b/docs/source/pages/developers_guide/apidoc/macaron.code_analyzer.dataflow_analysis.rst @@ -0,0 +1,98 @@ +macaron.code\_analyzer.dataflow\_analysis package +================================================= + +.. automodule:: macaron.code_analyzer.dataflow_analysis + :members: + :show-inheritance: + :undoc-members: + +Submodules +---------- + +macaron.code\_analyzer.dataflow\_analysis.analysis module +--------------------------------------------------------- + +.. automodule:: macaron.code_analyzer.dataflow_analysis.analysis + :members: + :show-inheritance: + :undoc-members: + +macaron.code\_analyzer.dataflow\_analysis.bash module +----------------------------------------------------- + +.. automodule:: macaron.code_analyzer.dataflow_analysis.bash + :members: + :show-inheritance: + :undoc-members: + +macaron.code\_analyzer.dataflow\_analysis.cmd\_parser module +------------------------------------------------------------ + +.. automodule:: macaron.code_analyzer.dataflow_analysis.cmd_parser + :members: + :show-inheritance: + :undoc-members: + +macaron.code\_analyzer.dataflow\_analysis.core module +----------------------------------------------------- + +.. automodule:: macaron.code_analyzer.dataflow_analysis.core + :members: + :show-inheritance: + :undoc-members: + +macaron.code\_analyzer.dataflow\_analysis.evaluation module +----------------------------------------------------------- + +.. automodule:: macaron.code_analyzer.dataflow_analysis.evaluation + :members: + :show-inheritance: + :undoc-members: + +macaron.code\_analyzer.dataflow\_analysis.facts module +------------------------------------------------------ + +.. automodule:: macaron.code_analyzer.dataflow_analysis.facts + :members: + :show-inheritance: + :undoc-members: + +macaron.code\_analyzer.dataflow\_analysis.github module +------------------------------------------------------- + +.. automodule:: macaron.code_analyzer.dataflow_analysis.github + :members: + :show-inheritance: + :undoc-members: + +macaron.code\_analyzer.dataflow\_analysis.github\_expr module +------------------------------------------------------------- + +.. automodule:: macaron.code_analyzer.dataflow_analysis.github_expr + :members: + :show-inheritance: + :undoc-members: + +macaron.code\_analyzer.dataflow\_analysis.models module +------------------------------------------------------- + +.. 
automodule:: macaron.code_analyzer.dataflow_analysis.models + :members: + :show-inheritance: + :undoc-members: + +macaron.code\_analyzer.dataflow\_analysis.printing module +--------------------------------------------------------- + +.. automodule:: macaron.code_analyzer.dataflow_analysis.printing + :members: + :show-inheritance: + :undoc-members: + +macaron.code\_analyzer.dataflow\_analysis.run\_analysis\_standalone module +-------------------------------------------------------------------------- + +.. automodule:: macaron.code_analyzer.dataflow_analysis.run_analysis_standalone + :members: + :show-inheritance: + :undoc-members: diff --git a/docs/source/pages/developers_guide/apidoc/macaron.code_analyzer.rst b/docs/source/pages/developers_guide/apidoc/macaron.code_analyzer.rst index 6216f77e6..b46c0eac7 100644 --- a/docs/source/pages/developers_guide/apidoc/macaron.code_analyzer.rst +++ b/docs/source/pages/developers_guide/apidoc/macaron.code_analyzer.rst @@ -6,13 +6,10 @@ macaron.code\_analyzer package :show-inheritance: :undoc-members: -Submodules ----------- +Subpackages +----------- -macaron.code\_analyzer.call\_graph module ------------------------------------------ +.. toctree:: + :maxdepth: 1 -.. automodule:: macaron.code_analyzer.call_graph - :members: - :show-inheritance: - :undoc-members: + macaron.code_analyzer.dataflow_analysis diff --git a/docs/source/pages/developers_guide/apidoc/macaron.parsers.rst b/docs/source/pages/developers_guide/apidoc/macaron.parsers.rst index 63ad1a5e9..3dad1ee97 100644 --- a/docs/source/pages/developers_guide/apidoc/macaron.parsers.rst +++ b/docs/source/pages/developers_guide/apidoc/macaron.parsers.rst @@ -33,6 +33,14 @@ macaron.parsers.bashparser module :show-inheritance: :undoc-members: +macaron.parsers.bashparser\_model module +---------------------------------------- + +.. automodule:: macaron.parsers.bashparser_model + :members: + :show-inheritance: + :undoc-members: + macaron.parsers.github\_workflow\_model module ---------------------------------------------- diff --git a/docs/source/pages/developers_guide/apidoc/macaron.repo_finder.rst b/docs/source/pages/developers_guide/apidoc/macaron.repo_finder.rst index 04a654b94..dcf5b333b 100644 --- a/docs/source/pages/developers_guide/apidoc/macaron.repo_finder.rst +++ b/docs/source/pages/developers_guide/apidoc/macaron.repo_finder.rst @@ -57,6 +57,14 @@ macaron.repo\_finder.repo\_finder\_java module :show-inheritance: :undoc-members: +macaron.repo\_finder.repo\_finder\_npm module +--------------------------------------------- + +.. automodule:: macaron.repo_finder.repo_finder_npm + :members: + :show-inheritance: + :undoc-members: + macaron.repo\_finder.repo\_finder\_pypi module ---------------------------------------------- diff --git a/docs/source/pages/developers_guide/apidoc/macaron.slsa_analyzer.build_tool.rst b/docs/source/pages/developers_guide/apidoc/macaron.slsa_analyzer.build_tool.rst index a44611b57..e7c3e6552 100644 --- a/docs/source/pages/developers_guide/apidoc/macaron.slsa_analyzer.build_tool.rst +++ b/docs/source/pages/developers_guide/apidoc/macaron.slsa_analyzer.build_tool.rst @@ -17,6 +17,14 @@ macaron.slsa\_analyzer.build\_tool.base\_build\_tool module :show-inheritance: :undoc-members: +macaron.slsa\_analyzer.build\_tool.conda module +----------------------------------------------- + +.. 
automodule:: macaron.slsa_analyzer.build_tool.conda + :members: + :show-inheritance: + :undoc-members: + macaron.slsa\_analyzer.build\_tool.docker module ------------------------------------------------ @@ -25,6 +33,14 @@ macaron.slsa\_analyzer.build\_tool.docker module :show-inheritance: :undoc-members: +macaron.slsa\_analyzer.build\_tool.flit module +---------------------------------------------- + +.. automodule:: macaron.slsa_analyzer.build_tool.flit + :members: + :show-inheritance: + :undoc-members: + macaron.slsa\_analyzer.build\_tool.go module -------------------------------------------- @@ -41,6 +57,14 @@ macaron.slsa\_analyzer.build\_tool.gradle module :show-inheritance: :undoc-members: +macaron.slsa\_analyzer.build\_tool.hatch module +----------------------------------------------- + +.. automodule:: macaron.slsa_analyzer.build_tool.hatch + :members: + :show-inheritance: + :undoc-members: + macaron.slsa\_analyzer.build\_tool.language module -------------------------------------------------- @@ -81,6 +105,14 @@ macaron.slsa\_analyzer.build\_tool.poetry module :show-inheritance: :undoc-members: +macaron.slsa\_analyzer.build\_tool.pyproject module +--------------------------------------------------- + +.. automodule:: macaron.slsa_analyzer.build_tool.pyproject + :members: + :show-inheritance: + :undoc-members: + macaron.slsa\_analyzer.build\_tool.yarn module ---------------------------------------------- diff --git a/docs/source/pages/developers_guide/apidoc/macaron.slsa_analyzer.ci_service.github_actions.rst b/docs/source/pages/developers_guide/apidoc/macaron.slsa_analyzer.ci_service.github_actions.rst index d745c347f..67b6da97f 100644 --- a/docs/source/pages/developers_guide/apidoc/macaron.slsa_analyzer.ci_service.github_actions.rst +++ b/docs/source/pages/developers_guide/apidoc/macaron.slsa_analyzer.ci_service.github_actions.rst @@ -9,14 +9,6 @@ macaron.slsa\_analyzer.ci\_service.github\_actions package Submodules ---------- -macaron.slsa\_analyzer.ci\_service.github\_actions.analyzer module ------------------------------------------------------------------- - -.. automodule:: macaron.slsa_analyzer.ci_service.github_actions.analyzer - :members: - :show-inheritance: - :undoc-members: - macaron.slsa\_analyzer.ci\_service.github\_actions.github\_actions\_ci module ----------------------------------------------------------------------------- diff --git a/golang/cmd/bashexprparser/bashexprparser.go b/golang/cmd/bashexprparser/bashexprparser.go new file mode 100644 index 000000000..3a55db7d2 --- /dev/null +++ b/golang/cmd/bashexprparser/bashexprparser.go @@ -0,0 +1,59 @@ +/* Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. */ +/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ + +package main + +import ( + "flag" + "fmt" + "os" + + "github.com/oracle/macaron/golang/internal/bashparser" + "github.com/oracle/macaron/golang/internal/filewriter" +) + +// Parse the bash expression and provide parsed objects in JSON format to stdout or a file. +// Params: +// +// -input : the bash expr content in string +// -output : the output file path to store the JSON content +// +// Return code: +// +// 0 - Parse successfully, return the JSON as string to stdout. If -output is set, store the json content to the file. +// If there is any errors storing to file, the result is still printed to stdout, but the errors are put to stderr instead. +// 1 - Error: Missing bash script or output file paths. 
+// 2 - Error: Could not parse the bash script file. Parse errors will be printed to stderr. +func main() { + input := flag.String("input", "", "The bash expr content to be parsed.") + out_path := flag.String("output", "", "The output file path to store the JSON content.") + flag.Parse() + + var json_content string + var parse_err error + if len(*input) <= 0 { + fmt.Fprintln(os.Stderr, "Missing bash expr input.") + flag.PrintDefaults() + os.Exit(1) + } else { + // Read the bash script from command line argument. + json_content, parse_err = bashparser.ParseExpr(*input) + } + + if parse_err != nil { + fmt.Fprintln(os.Stderr, parse_err.Error()) + os.Exit(2) + } + + fmt.Println(json_content) + + if len(*out_path) > 0 { + err := filewriter.StoreBytesToFile([]byte(json_content), *out_path) + if err != nil { + fmt.Fprintln(os.Stderr, err.Error()) + os.Exit(1) + } + } + + os.Exit(0) +} diff --git a/golang/cmd/bashparser/bashparser.go b/golang/cmd/bashparser/bashparser.go index ed598ea28..50cc6fec2 100644 --- a/golang/cmd/bashparser/bashparser.go +++ b/golang/cmd/bashparser/bashparser.go @@ -1,4 +1,4 @@ -/* Copyright (c) 2022 - 2023, Oracle and/or its affiliates. All rights reserved. */ +/* Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. */ /* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ package main @@ -29,13 +29,14 @@ func main() { file_path := flag.String("file", "", "The path of the bash script file.") input := flag.String("input", "", "The bash script content to be parsed. Input is prioritized over file option.") out_path := flag.String("output", "", "The output file path to store the JSON content.") + raw := flag.Bool("raw", false, "Return raw parse-tree") flag.Parse() var json_content string var parse_err error if len(*input) > 0 { // Read the bash script from command line argument. - json_content, parse_err = bashparser.ParseCommands(*input) + json_content, parse_err = bashparser.Parse(*input, *raw) } else if len(*file_path) <= 0 { fmt.Fprintln(os.Stderr, "Missing bash script input or file path.") flag.PrintDefaults() @@ -47,7 +48,7 @@ func main() { fmt.Fprintln(os.Stderr, read_err.Error()) os.Exit(1) } - json_content, parse_err = bashparser.ParseCommands(string(data)) + json_content, parse_err = bashparser.Parse(string(data), *raw) } if parse_err != nil { diff --git a/golang/internal/bashparser/bashparser.go b/golang/internal/bashparser/bashparser.go index a033e6f73..b88e43a6e 100644 --- a/golang/internal/bashparser/bashparser.go +++ b/golang/internal/bashparser/bashparser.go @@ -1,4 +1,4 @@ -/* Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved. */ +/* Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. */ /* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ // Package bashparser parses the bash scripts and provides parsed objects in JSON. @@ -11,6 +11,7 @@ import ( "strings" "mvdan.cc/sh/v3/syntax" + "mvdan.cc/sh/v3/syntax/typedjson" ) // CMDResult is used to export the bash command results in JSON. @@ -68,3 +69,63 @@ func ParseCommands(data string) (string, error) { return string(result_bytes), nil } + +func ParseRaw(data string) (string, error) { + // Replace GitHub Actions's expressions with ``$MACARON_UNKNOWN``` variable because the bash parser + // doesn't recognize such expressions. For example: ``${{ foo }}`` will be replaced by ``$MACARON_UNKNOWN``. 
+ // Note that we don't use greedy matching, so if we have `${{ ${{ foo }} }}`, it will not be replaced by + // `$MACARON_UNKNOWN`. + // See: https://docs.github.com/en/actions/learn-github-actions/expressions. + var re, reg_error = regexp.Compile(`\$\{\{.*?\}\}`) + if reg_error != nil { + return "", reg_error + } + + // We replace the GH Actions variables with "$MACARON_UNKNOWN". + data = string(re.ReplaceAll([]byte(data), []byte("$$MACARON_UNKNOWN"))) + data_str := strings.NewReader(data) + data_parsed, parse_err := syntax.NewParser().Parse(data_str, "") + if parse_err != nil { + return "", parse_err + } + + b := new(strings.Builder) + encode_err := typedjson.Encode(b, data_parsed) + if encode_err != nil { + return "", encode_err + } + + return b.String(), nil +} + +func Parse(data string, raw bool) (string, error) { + if raw { + return ParseRaw(data) + } else { + return ParseCommands(data) + } +} + +func ParseExpr(data string) (string, error) { + data_str := strings.NewReader(data) + result_str := "[" + first := true + for word_parsed, parse_err := range syntax.NewParser().WordsSeq(data_str) { + if parse_err != nil { + return "", parse_err + } + b := new(strings.Builder) + encode_err := typedjson.Encode(b, word_parsed) + if encode_err != nil { + return "", encode_err + } + if first { + result_str = result_str + b.String() + first = false + } else { + result_str = result_str + ", " + b.String() + } + } + result_str = result_str + "]" + return result_str, nil +} diff --git a/pyproject.toml b/pyproject.toml index 336e611af..65fd534dc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,8 @@ dependencies = [ "semgrep == 1.113.0", "email-validator >=2.2.0,<3.0.0", "rich >=13.5.3,<15.0.0", + "lark >= 1.3.0,<2.0.0", + "frozendict >= 2.4.6, <3.0.0", ] keywords = [] # https://pypi.org/classifiers/ diff --git a/src/macaron/code_analyzer/call_graph.py b/src/macaron/code_analyzer/call_graph.py deleted file mode 100644 index 1f3be3fac..000000000 --- a/src/macaron/code_analyzer/call_graph.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved. -# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. - -"""This module contains classes to generate build call graphs for the target repository.""" - -from collections import deque -from collections.abc import Iterable -from typing import Any, Generic, TypeVar - -Node = TypeVar("Node", bound="BaseNode") -# The documentation below for `TypeVar` is commented out due to a breaking -# change in Sphinx version (^=6.1.0). -# Reported at: https://github.com/oracle/macaron/issues/58. -# """This binds type ``Node`` to ``BaseNode`` and any of its subclasses. - -# Therefore, any node of type ``Node`` that is stored in the call graph -# container will be a subtype of ``BaseNode``. -# """ - - -class BaseNode(Generic[Node]): - """This is the generic class for call graph nodes.""" - - def __init__(self, caller: Node | None = None, node_id: str | None = None) -> None: - """Initialize instance. - - Parameters - ---------- - caller: Node | None - The caller node. - node_id: str | None - The unique identifier of a node in the callgraph. - """ - self.callee: list[Node] = [] - self.caller: Node | None = caller - # Each node can have a model that summarizes certain properties for static analysis. - # By default this model is set to None. 
- self.model: Any = None - self.node_id = node_id - - def add_callee(self, node: Node) -> None: - """Add a callee to the current node. - - Parameters - ---------- - node : Node - The callee node. - """ - self.callee.append(node) - - def has_callee(self) -> bool: - """Check if the current node has callees. - - Returns - ------- - bool - Return False if there are no callees, otherwise True. - """ - return bool(self.callee) - - -class CallGraph(Generic[Node]): - """This is the generic class for creating a call graph.""" - - def __init__(self, root: Node, repo_path: str) -> None: - """Initialize instance. - - Parameters - ---------- - root : Node - The root call graph node. - repo_path : str - The path to the repo. - """ - self.root = root - self.repo_path = repo_path - - def get_root(self) -> Node: - """Get the root node in the call graph. - - Returns - ------- - Node - The root node. - """ - return self.root - - def bfs(self) -> Iterable[Node]: - """Traverse the call graph in breadth first search order. - - Yields - ------ - Node - The traversed nodes. - """ - queue: deque[Node] = deque() - queue.extend(self.root.callee) - visited = [] - while queue: - node = queue.popleft() - if node not in visited: - queue.extend(node.callee) - visited.append(node) - yield node diff --git a/src/macaron/code_analyzer/dataflow_analysis/__init__.py b/src/macaron/code_analyzer/dataflow_analysis/__init__.py new file mode 100644 index 000000000..8e17a3508 --- /dev/null +++ b/src/macaron/code_analyzer/dataflow_analysis/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. diff --git a/src/macaron/code_analyzer/dataflow_analysis/analysis.py b/src/macaron/code_analyzer/dataflow_analysis/analysis.py new file mode 100644 index 000000000..6f7c3f35f --- /dev/null +++ b/src/macaron/code_analyzer/dataflow_analysis/analysis.py @@ -0,0 +1,469 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""Entry points to perform and use the dataflow analysis.""" + +from __future__ import annotations + +from collections.abc import Iterable + +from macaron.code_analyzer.dataflow_analysis import bash, core, evaluation, facts, github, printing +from macaron.errors import CallGraphError +from macaron.parsers import actionparser, github_workflow_model +from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool, BuildToolCommand + + +def analyse_github_workflow_file(workflow_path: str, repo_path: str | None, dump_debug: bool = False) -> core.Node: + """Perform dataflow analysis for GitHub Actions Workflow file. + + Parameters + ---------- + workflow_path: str + The path to workflow file. + repo_path: str | None + The path to the repo. + dump_debug: bool + Whether to output debug dot file (in the current working directory). + + Returns + ------- + core.Node + Graph representation of workflow and analysis results. + """ + workflow = actionparser.parse(workflow_path) + + analysis_context = core.OwningContextRef(core.AnalysisContext(repo_path)) + + core.reset_debug_sequence_number() + raw_workflow_node = github.RawGitHubActionsWorkflowNode.create(workflow, analysis_context, workflow_path) + core.increment_debug_sequence_number() + + raw_workflow_node.analyse() + + if dump_debug: + with open("analysis." 
+ workflow_path.replace("/", "_") + ".dot", "w", encoding="utf-8") as f:
+            printing.print_as_dot_graph(raw_workflow_node, f, include_properties=True, include_states=True)
+
+    return raw_workflow_node
+
+
+def analyse_github_workflow(
+    workflow: github_workflow_model.Workflow, workflow_source_path: str, repo_path: str | None, dump_debug: bool = False
+) -> core.Node:
+    """Perform dataflow analysis for GitHub Actions Workflow.
+
+    Parameters
+    ----------
+    workflow: github_workflow_model.Workflow
+        The workflow.
+    workflow_source_path: str
+        The source path for the workflow.
+    repo_path: str | None
+        The path to the repo.
+    dump_debug: bool
+        Whether to output debug dot file (in the current working directory).
+
+    Returns
+    -------
+    core.Node
+        Graph representation of workflow and analysis results.
+    """
+    analysis_context = core.OwningContextRef(core.AnalysisContext(repo_path))
+
+    core.reset_debug_sequence_number()
+    raw_workflow_node = github.RawGitHubActionsWorkflowNode.create(workflow, analysis_context, workflow_source_path)
+    core.increment_debug_sequence_number()
+
+    raw_workflow_node.analyse()
+
+    if dump_debug:
+        with open("analysis." + workflow_source_path.replace("/", "_") + ".dot", "w", encoding="utf-8") as f:
+            printing.print_as_dot_graph(raw_workflow_node, f, include_properties=True, include_states=True)
+
+    return raw_workflow_node
+
+
+def analyse_bash_script(
+    bash_content: str, source_path: str, repo_path: str | None, dump_debug: bool = False
+) -> core.Node:
+    """Perform dataflow analysis for Bash script.
+
+    Parameters
+    ----------
+    bash_content: str
+        The Bash script content.
+    source_path: str
+        The source path for the Bash script.
+    repo_path: str | None
+        The path to the repo.
+    dump_debug: bool
+        Whether to output debug dot file (in the current working directory).
+
+    Returns
+    -------
+    core.Node
+        Graph representation of Bash script and analysis results.
+    """
+    analysis_context = core.OwningContextRef(core.AnalysisContext(repo_path))
+    bash_context = core.OwningContextRef(bash.BashScriptContext.create_in_isolation(analysis_context, source_path))
+    core.reset_debug_sequence_number()
+    bash_node = bash.RawBashScriptNode(facts.StringLiteral(bash_content), bash_context)
+    core.increment_debug_sequence_number()
+
+    bash_node.analyse()
+
+    if dump_debug:
+        with open(
+            "analysis." + source_path.replace("/", "_") + "." + str(hash(bash_content)) + ".dot", "w", encoding="utf-8"
+        ) as f:
+            printing.print_as_dot_graph(bash_node, f, include_properties=True, include_states=True)
+
+    return bash_node
+
+
+# TODO generalise visitors
+class FindSecretsVisitor:
+    """Visitor to find references to GitHub secrets in analysis expressions."""
+
+    #: Scope in which secrets may be found.
+    workflow_var_scope: facts.Scope
+    #: Found secret variable names, populated by running the visitor.
+    secrets: set[str]
+
+    def __init__(self, workflow_var_scope: facts.Scope) -> None:
+        """Construct a visitor to find secrets.
+
+        Parameters
+        ----------
+        workflow_var_scope: facts.Scope
+            Scope in which secrets may be found.
+        """
+        self.workflow_var_scope = workflow_var_scope
+        self.secrets = set()
+
+    def visit_value(self, value: facts.Value) -> None:
+        """Search value expression for secrets."""
+        match value:
+            case facts.StringLiteral(_):
+                return
+            case facts.Read(loc):
+                self.visit_location(loc)
+                if evaluation.scope_matches(loc.scope, self.workflow_var_scope):
+                    match loc.loc:
+                        case facts.Variable(facts.StringLiteral(name)):
+                            if name.startswith("secrets."):
+                                self.secrets.add(name[len("secrets.") :])
+                return
+            case facts.ArbitraryNewData(_):
+                return
+            case facts.UnaryStringOp(_, operand):
+                self.visit_value(operand)
+                return
+            case facts.BinaryStringOp(_, operand1, operand2):
+                self.visit_value(operand1)
+                self.visit_value(operand2)
+                return
+            case facts.ParameterPlaceholderValue(name):
+                return
+            case facts.InstalledPackage(name, version, distribution, url):
+                self.visit_value(name)
+                self.visit_value(version)
+                self.visit_value(distribution)
+                self.visit_value(url)
+                return
+            case facts.Symbolic(sym_val):
+                self.visit_value(sym_val)
+                return
+        raise CallGraphError("unknown facts.Value type: " + value.__class__.__name__)
+
+    def visit_location(self, location: facts.Location) -> None:
+        """Search location expression for secrets."""
+        self.visit_location_specifier(location.loc)
+
+    def visit_location_specifier(self, location: facts.LocationSpecifier) -> None:
+        """Search location expression for secrets."""
+        match location:
+            case facts.Filesystem(path):
+                self.visit_value(path)
+                return
+            case facts.Variable(name):
+                self.visit_value(name)
+                return
+            case facts.Artifact(name, file):
+                self.visit_value(name)
+                self.visit_value(file)
+                return
+            case facts.FilesystemAnyUnderDir(path):
+                self.visit_value(path)
+                return
+            case facts.ArtifactAnyFilename(name):
+                self.visit_value(name)
+                return
+            case facts.ParameterPlaceholderLocation(name):
+                return
+            case facts.Console():
+                return
+            case facts.Installed(name):
+                self.visit_value(name)
+                return
+        raise CallGraphError("unknown location type: " + location.__class__.__name__)
+
+
+def get_reachable_secrets(bash_cmd_node: bash.BashSingleCommandNode) -> set[str]:
+    """Get GitHub secrets that are reachable at a bash command.
+
+    Parameters
+    ----------
+    bash_cmd_node: bash.BashSingleCommandNode
+        The target Bash command node.
+
+    Returns
+    -------
+    set[str]
+        The set of reachable secret variable names.
+    """
+    result: set[str] = set()
+    github_context = bash_cmd_node.context.ref.get_containing_github_context()
+    if github_context is None:
+        return result
+    env_scope = bash_cmd_node.context.ref.env.ref
+    workflow_var_scope = github_context.job_context.ref.workflow_context.ref.workflow_variables.ref
+
+    for loc, vals in bash_cmd_node.before_state.state.items():
+        if evaluation.scope_matches(env_scope, loc.scope):
+            for val in vals:
+                visitor = FindSecretsVisitor(workflow_var_scope)
+                visitor.visit_value(val)
+                result.update(visitor.secrets)
+
+    return result
+
+
+def get_containing_github_job(
+    node: core.Node, parents: dict[core.Node, core.Node]
+) -> github.GitHubActionsNormalJobNode | None:
+    """Return the GitHub job node containing the given node, if any.
+
+    Parameters
+    ----------
+    node: core.Node
+        The target node.
+    parents: dict[core.Node, core.Node]
+        The mapping of nodes to their parent nodes.
+
+    Returns
+    -------
+    github.GitHubActionsNormalJobNode | None
+        The containing job node, or None if there is no containing job.
+    """
+    caller_node: core.Node | None = parents.get(node)
+    while caller_node is not None:
+        match caller_node:
+            case github.GitHubActionsWorkflowNode():
+                break
+            case github.GitHubActionsNormalJobNode():
+                return caller_node
+
+        caller_node = parents.get(caller_node)
+
+    return None
+
+
+def get_containing_github_step(
+    node: core.Node, parents: dict[core.Node, core.Node]
+) -> github.GitHubActionsRunStepNode | None:
+    """Return the GitHub step node containing the given node, if any.
+
+    Parameters
+    ----------
+    node: core.Node
+        The target node.
+    parents: dict[core.Node, core.Node]
+        The mapping of nodes to their parent nodes.
+
+    Returns
+    -------
+    github.GitHubActionsRunStepNode | None
+        The containing step node, or None if there is no containing step.
+    """
+    caller_node: core.Node | None = parents.get(node)
+    while caller_node is not None:
+        match caller_node:
+            case github.GitHubActionsWorkflowNode():
+                break
+            case github.GitHubActionsNormalJobNode():
+                break
+            case github.GitHubActionsRunStepNode():
+                return caller_node
+
+        caller_node = parents.get(caller_node)
+
+    return None
+
+
+def get_containing_github_workflow(
+    node: core.Node, parents: dict[core.Node, core.Node]
+) -> github.GitHubActionsWorkflowNode | None:
+    """Return the GitHub workflow node containing the given node, if any.
+
+    Parameters
+    ----------
+    node: core.Node
+        The target node.
+    parents: dict[core.Node, core.Node]
+        The mapping of nodes to their parent nodes.
+
+    Returns
+    -------
+    github.GitHubActionsWorkflowNode | None
+        The containing workflow node, or None if there is no containing workflow.
+    """
+    caller_node: core.Node | None = parents.get(node)
+    while caller_node is not None:
+        match caller_node:
+            case github.GitHubActionsWorkflowNode():
+                return caller_node
+
+        caller_node = parents.get(caller_node)
+
+    return None
+
+
+def _get_build_tool_commands(nodes: core.NodeForest, build_tool: BaseBuildTool) -> Iterable[BuildToolCommand]:
+    """Traverse the callgraph and find all the reachable build tool commands."""
+    for root in nodes.root_nodes:
+        for node in core.traverse_bfs(root):
+            # We are just interested in nodes that have bash commands.
+            if isinstance(node, bash.BashSingleCommandNode):
+                # We collect useful contextual information for the called BashNode.
+                # The GitHub Actions workflow that triggers the path in the callgraph.
+                workflow_node = None
+                # The step in GitHub Actions job that triggers the path in the callgraph.
+                step_node = None
+
+                # Walk up the callgraph to find the relevant caller nodes.
+                # In GitHub Actions, a `GitHubWorkflowNode` may call several `GitHubJobNode`s
+                # and a `GitHubJobNode` may call several steps, which can be external `GitHubWorkflowNode`
+                # or inlined run nodes.
+                # TODO: revisit this implementation if analysis of external workflows is supported in
+                # the future, and decide if setting the caller workflow and job nodes to the nodes in the
+                # main triggering workflow is still expected.
+                workflow_node = get_containing_github_workflow(node, nodes.parents)
+                step_node = get_containing_github_step(node, nodes.parents)
+
+                # Find the bash commands that call the build tool.
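+                # Each part of the command may evaluate to several candidate strings; any part
+                # the analysis cannot resolve falls back to the $MACARON_UNKNOWN placeholder.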
+                resolved_cmds = evaluation.evaluate(node, node.cmd)
+                resolved_args = [evaluation.evaluate(node, arg) if arg is not None else None for arg in node.args]
+
+                # TODO combinations
+
+                cmd = [evaluation.get_single_resolved_str_with_default(resolved_cmds, "$MACARON_UNKNOWN")] + [
+                    (
+                        evaluation.get_single_resolved_str_with_default(resolved_arg, "$MACARON_UNKNOWN")
+                        if resolved_arg is not None
+                        else "$MACARON_UNKNOWN"
+                    )
+                    for resolved_arg in resolved_args
+                ]
+
+                if build_tool.is_build_command(cmd):
+                    evaluated_installed_languages = evaluation.evaluate(
+                        node,
+                        facts.Read(
+                            facts.Location(
+                                node.context.ref.filesystem.ref,
+                                facts.Installed(facts.StringLiteral(build_tool.language)),
+                            )
+                        ),
+                    )
+                    evaluated_installed_languages = evaluation.filter_symbolic_values(evaluated_installed_languages)
+
+                    lang_versions = []
+                    lang_distributions = []
+                    lang_urls = []
+
+                    for evaluated_installed_language in evaluated_installed_languages:
+                        if isinstance(evaluated_installed_language[0], facts.InstalledPackage):
+                            if isinstance(evaluated_installed_language[0].version, facts.StringLiteral):
+                                lang_version_str = evaluated_installed_language[0].version.literal
+                                if lang_version_str not in lang_versions:
+                                    lang_versions.append(lang_version_str)
+                            if isinstance(evaluated_installed_language[0].distribution, facts.StringLiteral):
+                                lang_distribution_str = evaluated_installed_language[0].distribution.literal
+                                if lang_distribution_str not in lang_distributions:
+                                    lang_distributions.append(lang_distribution_str)
+                            if isinstance(evaluated_installed_language[0].url, facts.StringLiteral):
+                                lang_url_str = evaluated_installed_language[0].url.literal
+                                if lang_url_str not in lang_urls:
+                                    lang_urls.append(lang_url_str)
+
+                    lang_url = lang_urls[0] if len(lang_urls) > 0 else ""
+
+                    lang_versions = sorted(lang_versions)
+                    lang_distributions = sorted(lang_distributions)
+                    lang_urls = sorted(lang_urls)
+
+                    yield BuildToolCommand(
+                        ci_path=(
+                            workflow_node.context.ref.source_filepath
+                            if workflow_node is not None
+                            else node.context.ref.source_filepath
+                        ),
+                        command=cmd,
+                        step_node=step_node,
+                        language=build_tool.language,
+                        language_versions=lang_versions,
+                        language_distributions=lang_distributions,
+                        language_url=lang_url,
+                        reachable_secrets=list(get_reachable_secrets(node)),
+                        events=get_ci_events_from_workflow(workflow_node.definition) if workflow_node else [],
+                    )
+
+
+def get_build_tool_commands(nodes: core.NodeForest, build_tool: BaseBuildTool) -> Iterable[BuildToolCommand]:
+    """Traverse the callgraph and find all the reachable build tool commands.
+
+    The build tool command objects are sorted by their string representation to
+    ensure deterministic behavior.
+
+    Parameters
+    ----------
+    nodes: core.NodeForest
+        The callgraph reachable from the CI workflows.
+    build_tool: BaseBuildTool
+        The corresponding build tool for which shell commands need to be detected.
+
+    Returns
+    -------
+    Iterable[BuildToolCommand]
+        The objects that contain the build command as well as useful contextual information.
+    """
+    return sorted(_get_build_tool_commands(nodes, build_tool), key=str)
+
+
+def get_ci_events_from_workflow(workflow: github_workflow_model.Workflow) -> list[str]:
+    """Get the CI events that trigger the GitHub Action workflow.
+
+    Parameters
+    ----------
+    workflow: github_workflow_model.Workflow
+        The target GitHub Action workflow.
+
+    Returns
+    -------
+    list[str]
+        The list of event names.
+ """ + result: list[str] = [] + on = workflow["on"] + if isinstance(on, str): + result.append(on) + elif isinstance(on, list): + for hook in on: + result.append(hook) + else: + for key in on: + result.append(key) + + return result diff --git a/src/macaron/code_analyzer/dataflow_analysis/bash.py b/src/macaron/code_analyzer/dataflow_analysis/bash.py new file mode 100644 index 000000000..f350448a5 --- /dev/null +++ b/src/macaron/code_analyzer/dataflow_analysis/bash.py @@ -0,0 +1,1891 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""Dataflow analysis implementation for analysing Bash shell scripts.""" + +from __future__ import annotations + +import json +import os.path +from collections import defaultdict +from collections.abc import Callable, Iterator +from dataclasses import dataclass +from itertools import product +from typing import cast + +from macaron import MACARON_PATH +from macaron.code_analyzer.dataflow_analysis import core, evaluation, facts, github, models, printing +from macaron.errors import CallGraphError, ParseError +from macaron.parsers import bashparser, bashparser_model + + +class BashExit(core.ExitType): + """Exit type for Bash exit statement.""" + + def __hash__(self) -> int: + return 37199 + + def __eq__(self, other: object) -> bool: + return isinstance(other, BashExit) + + +# Convenience instance of BashExit. +BASH_EXIT = BashExit() + + +class BashReturn(core.ExitType): + """Exit type for returning from a Bash function.""" + + def __hash__(self) -> int: + return 91193 + + def __eq__(self, other: object) -> bool: + return isinstance(other, BashReturn) + + +# Convenience instance of BashReturn. +BASH_RETURN = BashReturn() + + +@dataclass(frozen=True) +class BashScriptContext(core.Context): + """Context for a Bash script.""" + + #: Outer context, which may be a GitHub run step, another Bash script + #: that ran this script, or just the outermost analysis context if analysing + #: the script in isolation. + outer_context: ( + core.ContextRef[github.GitHubActionsStepContext] + | core.ContextRef[BashScriptContext] + | core.ContextRef[core.AnalysisContext] + ) + #: Scope for filesystem used by the script. + filesystem: core.ContextRef[facts.Scope] + #: Scope for env variables within the script. + env: core.ContextRef[facts.Scope] + #: Scope for defined functions within the script. + func_decls: core.ContextRef[facts.Scope] + #: Scope for the stdin attached to the Bash process. + stdin_scope: core.ContextRef[facts.Scope] + #: Location for the stdin attached to the Bash process. + stdin_loc: facts.LocationSpecifier + #: Scope for the stdout attached to the Bash process. + stdout_scope: core.ContextRef[facts.Scope] + #: Location for the stdout attached to the Bash process. + stdout_loc: facts.LocationSpecifier + #: Filepath for Bash script file. + source_filepath: str + + @staticmethod + def create_from_run_step( + context: core.ContextRef[github.GitHubActionsStepContext], source_filepath: str + ) -> BashScriptContext: + """Create a new Bash script context (for being called from a GitHub step) and its associated scopes. + + Reuses the filesystem and stdout scopes from the outer context, env scope inherits from the outer scope. + + Parameters + ---------- + context: core.ContextRef[github.GitHubActionsStepContext] + Outer step context. + source_filepath: str + Filepath of Bash script file. 
+
+ Returns
+ -------
+ BashScriptContext
+ The new Bash script context.
+ """
+ return BashScriptContext(
+ context.get_non_owned(),
+ context.ref.job_context.ref.filesystem.get_non_owned(),
+ core.OwningContextRef(facts.Scope("env", context.ref.env.ref)),
+ core.OwningContextRef(facts.Scope("func_decls")),
+ stdin_scope=core.OwningContextRef(facts.Scope("stdin")),
+ stdin_loc=facts.Console(),
+ stdout_scope=context.ref.job_context.ref.workflow_context.ref.console.get_non_owned(),
+ stdout_loc=facts.Console(),
+ source_filepath=source_filepath,
+ )
+
+ @staticmethod
+ def create_from_bash_script(context: core.ContextRef[BashScriptContext], source_filepath: str) -> BashScriptContext:
+ """Create a new Bash script context (for being called from another Bash script) and its associated scopes.
+
+ Reuses the filesystem, stdin, and stdout scopes from the outer context; the env scope inherits from the outer env scope.
+
+ Parameters
+ ----------
+ context: core.ContextRef[BashScriptContext]
+ Outer Bash script context.
+ source_filepath: str
+ Filepath of Bash script file.
+
+ Returns
+ -------
+ BashScriptContext
+ The new Bash script context.
+ """
+ return BashScriptContext(
+ context.get_non_owned(),
+ context.ref.filesystem.get_non_owned(),
+ core.OwningContextRef(facts.Scope("env", context.ref.env.ref)),
+ core.OwningContextRef(facts.Scope("func_decls")),
+ stdin_scope=context.ref.stdin_scope.get_non_owned(),
+ stdin_loc=facts.Console(),
+ stdout_scope=context.ref.stdout_scope.get_non_owned(),
+ stdout_loc=facts.Console(),
+ source_filepath=source_filepath,
+ )
+
+ @staticmethod
+ def create_in_isolation(context: core.ContextRef[core.AnalysisContext], source_filepath: str) -> BashScriptContext:
+ """Create a new Bash script context (for being analysed in isolation) and its associated scopes.
+
+ Parameters
+ ----------
+ context: core.ContextRef[core.AnalysisContext]
+ Outer analysis context.
+ source_filepath: str
+ Filepath of Bash script file.
+
+ Returns
+ -------
+ BashScriptContext
+ The new Bash script context.
+ """ + return BashScriptContext( + context.get_non_owned(), + core.OwningContextRef(facts.Scope("filesystem")), + core.OwningContextRef(facts.Scope("env")), + core.OwningContextRef(facts.Scope("func_decls")), + stdin_scope=core.OwningContextRef(facts.Scope("stdin")), + stdin_loc=facts.Console(), + stdout_scope=core.OwningContextRef(facts.Scope("stdout")), + stdout_loc=facts.Console(), + source_filepath=source_filepath, + ) + + def with_stdin( + self, stdin_scope: core.ContextRef[facts.Scope], stdin_loc: facts.LocationSpecifier + ) -> BashScriptContext: + """Return a modified bash script context with the given stdin.""" + return BashScriptContext( + self.outer_context, + self.filesystem, + self.env, + self.func_decls, + stdin_scope, + stdin_loc, + self.stdout_scope, + self.stdout_loc, + self.source_filepath, + ) + + def with_stdout( + self, stdout_scope: core.ContextRef[facts.Scope], stdout_loc: facts.LocationSpecifier + ) -> BashScriptContext: + """Return a modified bash script context with the given stdout.""" + return BashScriptContext( + self.outer_context, + self.filesystem, + self.env, + self.func_decls, + self.stdin_scope, + self.stdin_loc, + stdout_scope, + stdout_loc, + self.source_filepath, + ) + + def get_containing_github_context(self) -> github.GitHubActionsStepContext | None: + """Return the (possibly transitive) containing GitHub step context, if there is one.""" + outer_context = self.outer_context.ref + while isinstance(outer_context, BashScriptContext): + outer_context = outer_context.outer_context.ref + + if isinstance(outer_context, github.GitHubActionsStepContext): + return outer_context + return None + + def get_containing_analysis_context(self) -> core.AnalysisContext: + """Return the (possibly transitive) containing analysis context.""" + outer_context = self.outer_context.ref + while isinstance(outer_context, BashScriptContext): + outer_context = outer_context.outer_context.ref + + if isinstance(outer_context, github.GitHubActionsStepContext): + return outer_context.job_context.ref.workflow_context.ref.analysis_context.ref + + return outer_context + + def direct_refs(self) -> Iterator[core.ContextRef[core.Context] | core.ContextRef[facts.Scope]]: + """Yield the direct references of the context, either to scopes or to other contexts.""" + yield self.outer_context + yield self.filesystem + yield self.env + yield self.func_decls + yield self.stdin_scope + yield self.stdout_scope + + +class RawBashScriptNode(core.InterpretationNode): + """Interpretation node representing a Bash script (with the script as an unparsed string value). + + Defines how to resolve and parse the Bash script content and generate the analysis representation. + """ + + #: Value for Bash script content (as a string). + script: facts.Value + #: Bash script context. + context: core.ContextRef[BashScriptContext] + + def __init__(self, script: facts.Value, context: core.ContextRef[BashScriptContext]) -> None: + """Initialize Bash script node. + + Parameters + ---------- + script: facts.Value + Value for Bash script content (as a string). + context: core.ContextRef[BashScriptContext] + Bash script context. 
+ """ + super().__init__() + self.script = script + self.context = context + + def identify_interpretations(self, state: core.State) -> dict[core.InterpretationKey, Callable[[], core.Node]]: + """Interpret the Bash script to resolve and parse the Bash script content and generate the analysis representation.""" + if isinstance(self.script, facts.StringLiteral): + script_str = self.script.literal + + def build_bash_script() -> core.Node: + try: + parsed_bash = bashparser.parse_raw(script_str, MACARON_PATH) + return BashScriptNode.create(parsed_bash, self.context.get_non_owned()) + except ParseError: + return core.NoOpStatementNode() + + return {"default": build_bash_script} + + def build_noop() -> core.Node: + return core.NoOpStatementNode() + + return {"default": build_noop} + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table containing the scopes.""" + result: dict[str, set[tuple[str | None, str]]] = {} + + printing.add_context_owned_scopes_to_properties_table(result, self.context) + return result + + +class BashScriptNode(core.ControlFlowGraphNode): + """Control-flow-graph node representing a Bash script. + + Control flow structure consists of a sequence of Bash statements. + Note that this can model complex control flow with branching, loops, etc. + because those control flow constructs will be statement nodes with their + own control flow nested within. + + Control flow that the cuts across multiple levels, such as an exit statement + within a if statement branch that would cause the entire script to exit + early, are modelled using the alternate exits mechanism (i.e. exit statement + creates a BashExit exit state, in the enclosing control-flow constructs the + successor of the BashExit exit of a child node will be an early BashExit exit + of that construct, and so on up until this node, where there will be a early + normal exit, and so the caller of this script would then proceed as normal after + the script exits). + """ + + #: Parsed Bash script AST. + definition: bashparser_model.File + #: Statement nodes in execution order. + stmts: list[BashStatementNode] + #: Bash script context. + context: core.ContextRef[BashScriptContext] + #: Control flow graph. + _cfg: core.ControlFlowGraph + + def __init__( + self, + definition: bashparser_model.File, + stmts: list[BashStatementNode], + context: core.ContextRef[BashScriptContext], + ) -> None: + """Initialize Bash script node. + + Typically, construction should be done via the create function rather than using this constructor directly. + + Parameters + ---------- + definition: bashparser_model.File + Parsed Bash script AST. + stmts: list[BashStatementNode] + Statement nodes in execution order. + context: core.ContextRef[BashScriptContext] + Bash script context. 
+ """ + super().__init__() + self.definition = definition + self.stmts = stmts + self.context = context + + self._cfg = core.ControlFlowGraph.create_from_sequence(self.stmts) + + def children(self) -> Iterator[core.Node]: + """Yield the nodes in the sequence.""" + yield from self.stmts + + def get_entry(self) -> core.Node: + """Return the entry node, the first statement in the sequence.""" + return self._cfg.get_entry() + + def get_successors(self, node: core.Node, exit_type: core.ExitType) -> set[core.Node | core.ExitType]: + """Return the successor for a given node. + + Returns the next in the sequence or the exit in the case of the last node, or an + early exit in the case of a BashExit or BashReturn exit type. + """ + if isinstance(exit_type, (BashExit, BashReturn)): + return {core.DEFAULT_EXIT} + return self._cfg.get_successors(node, core.DEFAULT_EXIT) + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table containing the scopes.""" + result: dict[str, set[tuple[str | None, str]]] = {} + + printing.add_context_owned_scopes_to_properties_table(result, self.context) + return result + + @staticmethod + def create(script: bashparser_model.File, context: core.NonOwningContextRef[BashScriptContext]) -> BashScriptNode: + """Create Bash script node from Bash script AST. + + Parameters + ---------- + script: bashparser_model.File + Parsed Bash script AST. + context: core.NonOwningContextRef[BashScriptContext] + Bash script context. + """ + stmts = [BashStatementNode(stmt, context) for stmt in script["Stmts"]] + return BashScriptNode(script, stmts, context) + + +class BashBlockNode(core.ControlFlowGraphNode): + """Control-flow-graph node representing a Bash block. + + Control flow structure consists of a sequence of Bash statements. + """ + + #: Parsed block AST or list of statement ASTs. + definition: bashparser_model.Block | list[bashparser_model.Stmt] + #: Statement nodes in execution order. + stmts: list[BashStatementNode] + #: Bash script context. + context: core.ContextRef[BashScriptContext] + #: Control flow graph. + _cfg: core.ControlFlowGraph + + def __init__( + self, + definition: bashparser_model.Block | list[bashparser_model.Stmt], + stmts: list[BashStatementNode], + context: core.ContextRef[BashScriptContext], + ) -> None: + """Initialize Bash block node. + + Typically, construction should be done via the create function rather than using this constructor directly. + + Parameters + ---------- + definition: bashparser_model.Block | list[bashparser_model.Stmt] + Parsed block AST or list of statement ASTs. + stmts: list[BashStatementNode] + Statement nodes in execution order. + context: core.ContextRef[BashScriptContext] + Bash script context. 
+ """ + super().__init__() + self.definition = definition + self.stmts = stmts + self.context = context + + self._cfg = core.ControlFlowGraph.create_from_sequence(self.stmts) + + def children(self) -> Iterator[core.Node]: + """Yield the nodes in the sequence.""" + yield from self.stmts + + def get_entry(self) -> core.Node: + """Return the entry node, the first statement in the sequence.""" + return self._cfg.get_entry() + + def get_successors(self, node: core.Node, exit_type: core.ExitType) -> set[core.Node | core.ExitType]: + """Return the successor for a given node. + + Returns the next in the sequence or the exit in the case of the last node, or a + propagated early exit of the same type in the case of a BashExit or BashReturn exit type. + """ + if isinstance(exit_type, (BashExit, BashReturn)): + return {exit_type} + return self._cfg.get_successors(node, core.DEFAULT_EXIT) + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table containing the line number and scopes.""" + result: dict[str, set[tuple[str | None, str]]] = {} + if isinstance(self.definition, list): + if len(self.definition) > 0: + result["line num (in script)"] = {(None, str(self.definition[0]["Pos"]["Line"]))} + else: + result["line num (in script)"] = {(None, str(self.definition["Pos"]["Line"]))} + printing.add_context_owned_scopes_to_properties_table(result, self.context) + return result + + @staticmethod + def create( + script: bashparser_model.Block | list[bashparser_model.Stmt], + context: core.NonOwningContextRef[BashScriptContext], + ) -> BashBlockNode: + """Create Bash block node from block AST or list of statement ASTs. + + Parameters + ---------- + script: bashparser_model.Block | list[bashparser_model.Stmt] + Parsed block AST or list of statement ASTs. + context: core.NonOwningContextRef[BashScriptContext] + Bash script context. + """ + if isinstance(script, list): + stmts = [BashStatementNode(stmt, context) for stmt in script] + else: + stmts = [BashStatementNode(stmt, context) for stmt in script["Stmts"]] + return BashBlockNode(script, stmts, context) + + +class BashFuncCallNode(core.ControlFlowGraphNode): + """Control-flow-graph node representing a call to a Bash function. + + Control flow structure consists of a single block containing the function body. + """ + + #: The parsed AST of the callsite statement. + call_definition: bashparser_model.Stmt + #: The parsed AST of the function declaration. + func_definition: bashparser_model.FuncDecl + #: Node representing the function body. + block: BashBlockNode + #: Bash script context. + context: core.ContextRef[BashScriptContext] + + def __init__( + self, + call_definition: bashparser_model.Stmt, + func_definition: bashparser_model.FuncDecl, + block: BashBlockNode, + context: core.ContextRef[BashScriptContext], + ) -> None: + """Initialize Bash function call node. + + Parameters + ---------- + call_definition: bashparser_model.Stmt + The parsed AST of the callsite statement. + func_definition: bashparser_model.FuncDecl + The parsed AST of the function declaration. + block: BashBlockNode + Node representing the function body. + context: core.ContextRef[BashScriptContext] + Bash script context. 
+ """ + super().__init__() + self.call_definition = call_definition + self.func_definition = func_definition + self.block = block + self.context = context + + self._cfg = core.ControlFlowGraph.create_from_sequence([self.block]) + + def children(self) -> Iterator[core.Node]: + """Yield the function body block node.""" + yield self.block + + def get_entry(self) -> core.Node: + """Return the function body block node.""" + return self._cfg.get_entry() + + def get_successors(self, node: core.Node, exit_type: core.ExitType) -> set[core.Node | core.ExitType]: + """Return the successor for a given node. + + Returns the next node in the sequence or the exit in the case of the last node, or an + early exit in the case of a BashReturn exit type, or a propagated early BashExit exit + in the case of a BashExit exit type. + """ + if isinstance(exit_type, BashReturn): + return {core.DEFAULT_EXIT} + if isinstance(exit_type, BashExit): + return {exit_type} + return self._cfg.get_successors(node, core.DEFAULT_EXIT) + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table. + + Contains the line number of the callsite, the line number of the function declaration, and the scopes. + """ + result: dict[str, set[tuple[str | None, str]]] = {} + result["line num (in script)"] = {(None, str(self.call_definition["Pos"]["Line"]))} + result["callee decl line num (in script)"] = {(None, str(self.func_definition["Pos"]["Line"]))} + printing.add_context_owned_scopes_to_properties_table(result, self.context) + return result + + +def get_stdout_redirects(stmt: bashparser_model.Stmt, context: BashScriptContext) -> set[facts.Location]: + """Extract the stdout redirects specified on the statement as a set of location expressions.""" + redirs: set[facts.Location] = set() + for redir in stmt.get("Redirs", []): + if redir["Op"] in { + bashparser_model.RedirOperators.RdrOut.value, + bashparser_model.RedirOperators.RdrAll.value, + bashparser_model.RedirOperators.AppAll.value, + bashparser_model.RedirOperators.AppOut.value, + }: + if "Word" in redir: + redir_word = redir["Word"] + redir_val = convert_shell_word_to_value(redir_word, context) + if redir_val is not None: + redirs.add(facts.Location(context.filesystem.ref, facts.Filesystem(redir_val[0]))) + return redirs + + +class BashStatementNode(core.InterpretationNode): + """Interpretation node representing any kind of Bash statement. + + Defines how to interpret the different kinds of statements and generate the appropriate + analysis representation. + """ + + #: The parsed statement AST. + definition: bashparser_model.Stmt + #: Bash script context. 
+ context: core.ContextRef[BashScriptContext] + + def __init__(self, definition: bashparser_model.Stmt, context: core.ContextRef[BashScriptContext]) -> None: + """Initialize statement node.""" + super().__init__() + self.definition = definition + self.context = context + + def identify_interpretations(self, state: core.State) -> dict[core.InterpretationKey, Callable[[], core.Node]]: + """Interpret the different kinds of statements and generate the appropriate analysis representation.""" + cmd = self.definition["Cmd"] + if ( + bashparser_model.is_call_expr(cmd) + and len(cmd.get("Args", [])) == 0 + and "Assigns" in cmd + and len(cmd["Assigns"]) == 1 + ): + # Single variable assignment statement. + assign = cmd["Assigns"][0] + + def build_assign() -> core.Node: + rhs_content = ( + parse_content(assign["Value"]["Parts"], True) + if "Value" in assign + else [LiteralOrEnvVar(is_env_var=False, literal="")] + ) + if rhs_content is not None: + rhs_val = convert_shell_value_sequence_to_fact_value(rhs_content, self.context.ref) + return models.VarAssignNode( + kind=models.VarAssignKind.BASH_ENV_VAR, + var_scope=self.context.ref.env.ref, + var_name=facts.StringLiteral(assign["Name"]["Value"]), + value=rhs_val, + ) + return core.NoOpStatementNode() + + return {"default": build_assign} + if bashparser_model.is_call_expr(cmd) and "Args" in cmd and len(cmd["Args"]) > 0: + # Statement executing a command, generate node with command name expression and + # expressions for each argument value. + # In the case where a word may tokenize as multiple words depending on the value, + # attempt to resolve them and where they do resolve to something that tokenizes as + # multiple args, generate alternative interpretations with those expanded number of + # args, alongside interpretations where those words are a dynamic expression that is + # constrained to be a single word. 
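+ # For example (illustrative), given FLAGS="-a -b", the statement `tar $FLAGS archive`
+ # gets one interpretation where $FLAGS expands into the two arguments "-a" and "-b",
+ # and another where it remains a single dynamic argument constrained to one token.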
+ arg_vals = [convert_shell_word_to_value(arg, self.context.ref) for arg in cmd["Args"]] + multitoken_resolved_arg_vals: dict[ + int, list[tuple[list[bashparser_model.Word], evaluation.ReadBindings]] + ] = defaultdict(list) + + for index, arg_val_elem in enumerate(arg_vals): + if arg_val_elem is None: + continue + arg_val_elem_val, arg_quoted = arg_val_elem + if not arg_quoted: + resolved_arg_vals = evaluation.evaluate(self, arg_val_elem_val) + for resolved_arg_val, resolved_arg_val_bindings in resolved_arg_vals: + match resolved_arg_val: + case facts.StringLiteral(literal): + parsed_bash_expr = parse_bash_expr(literal) + if parsed_bash_expr is not None and len(parsed_bash_expr) > 1: + multitoken_resolved_arg_vals[index].append( + (parsed_bash_expr, resolved_arg_val_bindings) + ) + arg_indices_in_order: list[int] = [] + values_indices_in_order: list[list[int]] = [] + for index, vals in multitoken_resolved_arg_vals.items(): + arg_indices_in_order.append(index) + values_indices_in_order.append([index for index, _ in enumerate(vals)] + [-1]) + + # Cross product could become very expensive + values_product = list(product(*values_indices_in_order)) + + if len(values_product) == 0: + values_product = [()] + + result: dict[core.InterpretationKey, Callable[[], core.Node]] = {} + + for values_product_elem in values_product: + new_arg_vals: dict[int, list[facts.Value | None]] = {} + read_bindings_list: list[evaluation.ReadBindings] = [] + for arg_index, value_index in zip(arg_indices_in_order, values_product_elem): + if value_index != -1: + expanded_vals, bindings = multitoken_resolved_arg_vals[arg_index][value_index] + read_bindings_list.append(bindings) + converted = [ + convert_shell_word_to_value(expanded_val, self.context.ref) + for expanded_val in expanded_vals + ] + new_arg_vals[arg_index] = [x[0] if x is not None else None for x in converted] + else: + old_arg_val = arg_vals[arg_index] + new_arg_vals[arg_index] = [ + facts.SingleBashTokenConstraint(old_arg_val[0]) if old_arg_val is not None else None + ] + + combined_bindings = evaluation.ReadBindings.combine_bindings(read_bindings_list) + if combined_bindings is None: + continue + full_arg_list: list[facts.Value | None] = [] + + for index, arg_val in enumerate(arg_vals): + if index in new_arg_vals: + full_arg_list.extend(new_arg_vals[index]) + else: + full_arg_list.append(arg_val[0] if arg_val is not None else None) + + cmd_arg = full_arg_list[0] + # TODO subshells + if cmd_arg is not None: + cmd_arg_val = cmd_arg + + def build_single_cmd( # pylint: disable=dangerous-default-value + cmd_arg: facts.Value = cmd_arg_val, cmd_arg_list: list[facts.Value | None] = full_arg_list[1:] + ) -> core.Node: + stdout_redirs = get_stdout_redirects(self.definition, self.context.ref) + return BashSingleCommandNode( + self.definition, self.context.get_non_owned(), cmd_arg, cmd_arg_list, stdout_redirs + ) + + result[("cmd", values_product_elem, combined_bindings)] = build_single_cmd + return result + if bashparser_model.is_if_clause(cmd): + # If statement. + + def build_if() -> core.Node: + return BashIfClauseNode.create(cmd, self.context.get_non_owned()) + + return {"default": build_if} + + if bashparser_model.is_for_clause(cmd): + # For statement. 
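+ # Handles both `for x in ...; do ...; done` and C-style `for ((init; cond; post))`
+ # loops; see BashForClauseNode.create for how the latter is decomposed.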
+
+ def build_for() -> core.Node:
+ return BashForClauseNode.create(cmd, self.context.get_non_owned())
+
+ return {"default": build_for}
+ if bashparser_model.is_binary_cmd(cmd):
+ match cmd["Op"]:
+ case bashparser_model.BinCmdOperators.Pipe.value:
+
+ def build_pipe() -> core.Node:
+ return BashPipeNode.create(cmd, self.context.get_non_owned())
+
+ return {"default": build_pipe}
+ case bashparser_model.BinCmdOperators.PipeAll.value:
+ pass
+ case bashparser_model.BinCmdOperators.AndStmt.value:
+
+ def build_and() -> core.Node:
+ return BashAndNode.create(cmd, self.context.get_non_owned())
+
+ return {"default": build_and}
+ case bashparser_model.BinCmdOperators.OrStmt.value:
+
+ def build_or() -> core.Node:
+ return BashOrNode.create(cmd, self.context.get_non_owned())
+
+ return {"default": build_or}
+ raise CallGraphError("unknown binary operator: " + str(cmd["Op"]))
+ if bashparser_model.is_func_decl(cmd):
+ # Represent a Bash function decl as a store of the serialized function definition
+ # into a variable in the function decl scope.
+ func_decl_str = json.dumps(cmd)
+
+ def build_func_decl() -> core.Node:
+ return models.VarAssignNode(
+ kind=models.VarAssignKind.BASH_FUNC_DECL,
+ var_scope=self.context.ref.func_decls.ref,
+ var_name=facts.StringLiteral(cmd["Name"]["Value"]),
+ value=facts.StringLiteral(func_decl_str),
+ )
+
+ return {"default": build_func_decl}
+ if bashparser_model.is_block(cmd):
+
+ def build_block() -> core.Node:
+ return BashBlockNode.create(cmd, self.context.get_non_owned())
+
+ return {"default": build_block}
+
+ def build_noop() -> core.Node:
+ return core.NoOpStatementNode()
+
+ return {"default": build_noop}
+
+ def get_exit_state_transfer_filter(self) -> core.StateTransferFilter:
+ """Return state transfer filter to clear scopes owned by this node after this node exits."""
+ return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context))
+
+ def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]:
+ """Return a properties table containing the line number and scopes."""
+ result: dict[str, set[tuple[str | None, str]]] = {}
+ result["line num (in script)"] = {(None, str(self.definition["Pos"]["Line"]))}
+ printing.add_context_owned_scopes_to_properties_table(result, self.context)
+ return result
+
+
+class BashIfClauseNode(core.ControlFlowGraphNode):
+ """Control-flow-graph node representing a Bash if statement.
+
+ Control flow structure consists of executing the statements of the condition,
+ followed by a branch to execute either the then node or the else node (or if
+ there is no else node, exit immediately). The analysis is not path sensitive,
+ so both branches are always considered possible regardless of the condition.
+ """
+
+ #: Parsed if statement AST.
+ definition: bashparser_model.IfClause
+ #: Block node to execute the condition.
+ cond_stmts: BashBlockNode
+ #: Block node for the case where the condition is true.
+ then_stmts: BashBlockNode
+ #: Node for the case where the condition is false, if any
+ #: (will be another if node in the case of an elif).
+ else_stmts: BashBlockNode | BashIfClauseNode | None
+ #: Bash script context.
+ context: core.ContextRef[BashScriptContext]
+ #: Control flow graph.
+ _cfg: core.ControlFlowGraph + + def __init__( + self, + definition: bashparser_model.IfClause, + cond_stmts: BashBlockNode, + then_stmts: BashBlockNode, + else_stmts: BashBlockNode | BashIfClauseNode | None, + context: core.ContextRef[BashScriptContext], + ) -> None: + """Initialize Bash if statement node. + + Typically, construction should be done via the create function rather than using this constructor directly. + + Parameters + ---------- + definition: bashparser_model.IfClause + Parsed if statement AST. + cond_stmts: BashBlockNode + Block node to execute the condition. + then_stmts: BashBlockNode + Block node for the case where the condition is true. + else_stmts: BashBlockNode | BashIfClauseNode | None + Node for the case where the condition is false, if any + (will be another if node in the case of an elif). + context: core.ContextRef[BashScriptContext] + Bash script context. + """ + super().__init__() + self.definition = definition + self.cond_stmts = cond_stmts + self.then_stmts = then_stmts + self.else_stmts = else_stmts + self.context = context + + self._cfg = core.ControlFlowGraph(self.cond_stmts) + self._cfg.add_successor(self.cond_stmts, core.DEFAULT_EXIT, self.then_stmts) + self._cfg.add_successor(self.then_stmts, core.DEFAULT_EXIT, core.DEFAULT_EXIT) + if else_stmts is not None: + self._cfg.add_successor(self.cond_stmts, core.DEFAULT_EXIT, else_stmts) + self._cfg.add_successor(else_stmts, core.DEFAULT_EXIT, core.DEFAULT_EXIT) + else: + self._cfg.add_successor(self.cond_stmts, core.DEFAULT_EXIT, core.DEFAULT_EXIT) + + def children(self) -> Iterator[core.Node]: + """Yield the condition node, then node and (if present) else node.""" + yield self.cond_stmts + yield self.then_stmts + if self.else_stmts is not None: + yield self.else_stmts + + def get_entry(self) -> core.Node: + """Return the entry node (the condition node).""" + return self._cfg.get_entry() + + def get_successors(self, node: core.Node, exit_type: core.ExitType) -> set[core.Node | core.ExitType]: + """Return the successor for a given node. + + Returns a propagated early exit of the same type in the case of a BashExit or BashReturn exit type. + """ + if isinstance(exit_type, (BashExit, BashReturn)): + return {exit_type} + return self._cfg.get_successors(node, core.DEFAULT_EXIT) + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table containing the line number and scopes.""" + result: dict[str, set[tuple[str | None, str]]] = {} + result["line num (in script)"] = {(None, str(self.definition["Pos"]["Line"]))} + printing.add_context_owned_scopes_to_properties_table(result, self.context) + return result + + @staticmethod + def create( + if_stmt: bashparser_model.IfClause, context: core.NonOwningContextRef[BashScriptContext] + ) -> BashIfClauseNode: + """Create a Bash if statement node from if statement AST. + + Parameters + ---------- + if_stmt: bashparser_model.IfClause + Parsed if statement AST. + context: core.NonOwningContextRef[BashScriptContext] + Bash script context. 
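+
+ Note that an elif chain is represented by nesting: a non-terminal else clause
+ in the AST is itself an if clause and becomes a nested BashIfClauseNode.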
+ """ + cond_stmts = BashBlockNode.create(if_stmt["Cond"], context) + then_stmts = BashBlockNode.create(if_stmt["Then"], context) + else_clause = if_stmt.get("Else") + else_part: BashBlockNode | BashIfClauseNode | None = None + if else_clause is None: + else_part = None + elif bashparser_model.is_else_clause(else_clause): + else_part = BashBlockNode.create(else_clause["Then"], context) + else: + else_part = BashIfClauseNode.create(cast(bashparser_model.IfClause, else_clause), context) + return BashIfClauseNode( + definition=if_stmt, cond_stmts=cond_stmts, then_stmts=then_stmts, else_stmts=else_part, context=context + ) + + +class BashForClauseNode(core.ControlFlowGraphNode): + """Control-flow-graph node representing a Bash for statement. + + Control flow structure consists of executing the statements of the condition, + followed by a branch to execute or skip the loop body node . The analysis is + not path sensitive, so both branches are always considered possible regardless + of the condition. + + TODO: Currently doesn't actually model the loop back edge (need more testing to + be confident of analysis termination in the presence of loops). + """ + + #: Parsed for statement AST. + definition: bashparser_model.ForClause + #: Block node to execute the initializer. + init_stmts: BashBlockNode | None + #: Block node to execute the condition. + cond_stmts: BashBlockNode | None + #: Block node for the loop body. + body_stmts: BashBlockNode + #: Block node to execute the post. + post_stmts: BashBlockNode | None + #: Bash script context. + context: core.ContextRef[BashScriptContext] + #: Control flow graph. + _cfg: core.ControlFlowGraph + + def __init__( + self, + definition: bashparser_model.ForClause, + init_stmts: BashBlockNode | None, + cond_stmts: BashBlockNode | None, + body_stmts: BashBlockNode, + post_stmts: BashBlockNode | None, + context: core.ContextRef[BashScriptContext], + ) -> None: + """Initialize Bash for statement node. + + Typically, construction should be done via the create function rather than using this constructor directly. + + Parameters + ---------- + definition: bashparser_model.ForClause + Parsed if statement AST. + init_stmts: BashBlockNode | None + Block node to execute the initializer. + cond_stmts: BashBlockNode | None + Block node to execute the condition. + body_stmts: BashBlockNode + Block node for the body. + post_stmts: BashBlockNode | None + Block node to execute the post. + context: core.ContextRef[BashScriptContext] + Bash script context. + """ + super().__init__() + self.definition = definition + self.init_stmts = init_stmts + self.cond_stmts = cond_stmts + self.body_stmts = body_stmts + self.post_stmts = post_stmts + self.context = context + + self._cfg = core.ControlFlowGraph.create_from_sequence( + list(filter(core.node_is_not_none, [self.init_stmts, self.cond_stmts, self.body_stmts, self.post_stmts])) + ) + + def children(self) -> Iterator[core.Node]: + """Yield the initializer, condition, body and post nodes.""" + if self.init_stmts is not None: + yield self.init_stmts + if self.cond_stmts is not None: + yield self.cond_stmts + yield self.body_stmts + if self.post_stmts is not None: + yield self.post_stmts + + def get_entry(self) -> core.Node: + """Return the entry node.""" + return self._cfg.get_entry() + + def get_successors(self, node: core.Node, exit_type: core.ExitType) -> set[core.Node | core.ExitType]: + """Return the successor for a given node. 
+ + Returns a propagated early exit of the same type in the case of a BashExit or BashReturn exit type. + """ + if isinstance(exit_type, (BashExit, BashReturn)): + return {exit_type} + return self._cfg.get_successors(node, core.DEFAULT_EXIT) + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table containing the line number and scopes.""" + result: dict[str, set[tuple[str | None, str]]] = {} + result["line num (in script)"] = {(None, str(self.definition["Pos"]["Line"]))} + printing.add_context_owned_scopes_to_properties_table(result, self.context) + return result + + @staticmethod + def create( + for_stmt: bashparser_model.ForClause, context: core.NonOwningContextRef[BashScriptContext] + ) -> BashForClauseNode: + """Create a Bash for statement node from for statement AST. + + Parameters + ---------- + for_stmt: bashparser_model.ForClause + Parsed for statement AST. + context: core.NonOwningContextRef[BashScriptContext] + Bash script context. + """ + body_stmts = BashBlockNode.create(for_stmt["Do"], context) + + loop = for_stmt["Loop"] + if not bashparser_model.is_cstyle_loop(loop): + return BashForClauseNode( + definition=for_stmt, + init_stmts=None, + cond_stmts=None, + body_stmts=body_stmts, + post_stmts=None, + context=context, + ) + + init_stmts: BashBlockNode | None = None + if "Init" in loop: + init_arithm_cmd = bashparser_model.ArithmCmd( + Type="ArithmCmd", + Pos=bashparser_model.Pos(Offset=0, Line=0, Col=0), + End=bashparser_model.Pos(Offset=0, Line=0, Col=0), + Left=bashparser_model.Pos(Offset=0, Line=0, Col=0), + Right=bashparser_model.Pos(Offset=0, Line=0, Col=0), + X=loop["Init"], + ) + init_stmt = bashparser_model.Stmt( + Cmd=init_arithm_cmd, + Pos=bashparser_model.Pos(Offset=0, Line=0, Col=0), + End=bashparser_model.Pos(Offset=0, Line=0, Col=0), + Position=bashparser_model.Pos(Offset=0, Line=0, Col=0), + ) + init_stmts = BashBlockNode.create([init_stmt], context) + + cond_stmts: BashBlockNode | None = None + if "Cond" in loop: + cond_arithm_cmd = bashparser_model.ArithmCmd( + Type="ArithmCmd", + Pos=bashparser_model.Pos(Offset=0, Line=0, Col=0), + End=bashparser_model.Pos(Offset=0, Line=0, Col=0), + Left=bashparser_model.Pos(Offset=0, Line=0, Col=0), + Right=bashparser_model.Pos(Offset=0, Line=0, Col=0), + X=loop["Cond"], + ) + cond_stmt = bashparser_model.Stmt( + Cmd=cond_arithm_cmd, + Pos=bashparser_model.Pos(Offset=0, Line=0, Col=0), + End=bashparser_model.Pos(Offset=0, Line=0, Col=0), + Position=bashparser_model.Pos(Offset=0, Line=0, Col=0), + ) + cond_stmts = BashBlockNode.create([cond_stmt], context) + + post_stmts: BashBlockNode | None = None + if "Post" in loop: + post_arithm_cmd = bashparser_model.ArithmCmd( + Type="ArithmCmd", + Pos=bashparser_model.Pos(Offset=0, Line=0, Col=0), + End=bashparser_model.Pos(Offset=0, Line=0, Col=0), + Left=bashparser_model.Pos(Offset=0, Line=0, Col=0), + Right=bashparser_model.Pos(Offset=0, Line=0, Col=0), + X=loop["Post"], + ) + post_stmt = bashparser_model.Stmt( + Cmd=post_arithm_cmd, + Pos=bashparser_model.Pos(Offset=0, Line=0, Col=0), + End=bashparser_model.Pos(Offset=0, Line=0, Col=0), + Position=bashparser_model.Pos(Offset=0, Line=0, Col=0), + ) + post_stmts = BashBlockNode.create([post_stmt], context) + + return 
BashForClauseNode( + definition=for_stmt, + init_stmts=init_stmts, + cond_stmts=cond_stmts, + body_stmts=body_stmts, + post_stmts=post_stmts, + context=context, + ) + + +@dataclass(frozen=True) +class BashPipeContext(core.Context): + """Context for a Bash pipe operation. + + Introduces a scope and location to represent the pipe itself connecting the piped commands, + where output from the piped-from command is written prior to being read as input by the piped-to + command. + """ + + #: Outer Bash script context + bash_script_context: core.ContextRef[BashScriptContext] + #: Scope for pipe. + pipe_scope: core.ContextRef[facts.Scope] + #: Location for pipe. + pipe_loc: facts.LocationSpecifier + + @staticmethod + def create(context: core.ContextRef[BashScriptContext]) -> BashPipeContext: + """Create a new pipe context and its associated scope.""" + return BashPipeContext(context.get_non_owned(), core.OwningContextRef(facts.Scope("pipe")), facts.Console()) + + def direct_refs(self) -> Iterator[core.ContextRef[core.Context] | core.ContextRef[facts.Scope]]: + """Yield the direct references of the context, either to scopes or to other contexts.""" + yield self.bash_script_context + yield self.pipe_scope + + +class BashPipeNode(core.ControlFlowGraphNode): + """Control flow node representing a Bash pipe ("|") binary command. + + Control flow structure consists of executing the left-hand side, + followed by the right-hand side. + A pipe scope and location is introduced to model the piping of the + output from the first command to the input of the second command. + """ + + #: Parsed pipe binary command AST. + definition: bashparser_model.BinaryCmd + #: Left-hand side (first) command. + lhs: BashStatementNode + #: Right-hand side (second) command. + rhs: BashStatementNode + #: Pipe context. + context: core.ContextRef[BashPipeContext] + #: Control flow graph. + _cfg: core.ControlFlowGraph + + def __init__( + self, + definition: bashparser_model.BinaryCmd, + lhs: BashStatementNode, + rhs: BashStatementNode, + context: core.ContextRef[BashPipeContext], + ) -> None: + """Initialize Bash pipe node. + + Typically, construction should be done via the create function rather than using this constructor directly. + + Parameters + ---------- + definition: bashparser_model.BinaryCmd + Parsed pipe binary command AST. + lhs: BashStatementNode + Left-hand side (first) command. + rhs: BashStatementNode + Right-hand side (second) command. + context: core.ContextRef[BashPipeContext] + Pipe context. + """ + super().__init__() + self.definition = definition + self.lhs = lhs + self.rhs = rhs + self.context = context + + self._cfg = core.ControlFlowGraph(self.lhs) + self._cfg.add_successor(self.lhs, core.DEFAULT_EXIT, self.rhs) + self._cfg.add_successor(self.rhs, core.DEFAULT_EXIT, core.DEFAULT_EXIT) + + def children(self) -> Iterator[core.Node]: + """Yield the subcommands.""" + yield self.lhs + yield self.rhs + + def get_entry(self) -> core.Node: + """Return the entry node (the lhs node).""" + return self._cfg.get_entry() + + def get_successors(self, node: core.Node, exit_type: core.ExitType) -> set[core.Node | core.ExitType]: + """Return the successor for a given node. + + Returns a propagated early exit of the same type in the case of a BashExit or BashReturn exit type. 
+ """ + if isinstance(exit_type, (BashExit, BashReturn)): + return {exit_type} + return self._cfg.get_successors(node, core.DEFAULT_EXIT) + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table containing the line number and scopes.""" + result: dict[str, set[tuple[str | None, str]]] = {} + result["line num (in script)"] = {(None, str(self.definition["Pos"]["Line"]))} + printing.add_context_owned_scopes_to_properties_table(result, self.context) + return result + + @staticmethod + def create( + pipe_cmd: bashparser_model.BinaryCmd, context: core.NonOwningContextRef[BashScriptContext] + ) -> BashPipeNode: + """Create Bash pipe node from pipe binary command AST. + + Parameters + ---------- + pipe_cmd: bashparser_model.BinaryCmd + Parsed pipe binary command AST. + context: core.NonOwningContextRef[BashScriptContext] + Bash script context. + """ + pipe_context = core.OwningContextRef(BashPipeContext.create(context)) + piped_from_context = core.NonOwningContextRef( + context.ref.with_stdout(pipe_context.ref.pipe_scope.get_non_owned(), pipe_context.ref.pipe_loc) + ) + piped_to_context = core.NonOwningContextRef( + context.ref.with_stdin(pipe_context.ref.pipe_scope.get_non_owned(), pipe_context.ref.pipe_loc) + ) + lhs = BashStatementNode(pipe_cmd["X"], piped_from_context) + rhs = BashStatementNode(pipe_cmd["Y"], piped_to_context) + return BashPipeNode(definition=pipe_cmd, lhs=lhs, rhs=rhs, context=pipe_context) + + +class BashAndNode(core.ControlFlowGraphNode): + """Control flow node representing a Bash AND ("&&") binary command. + + Control flow structure consists of executing the left-hand side, + followed by the right-hand side. + + (TODO model short circuit?) + """ + + #: Parsed AND binary command AST. + definition: bashparser_model.BinaryCmd + #: Left-hand side (first) command. + lhs: BashStatementNode + #: Right-hand side (second) command. + rhs: BashStatementNode + #: Bash script context. + context: core.ContextRef[BashScriptContext] + #: Control flow graph. + _cfg: core.ControlFlowGraph + + def __init__( + self, + definition: bashparser_model.BinaryCmd, + lhs: BashStatementNode, + rhs: BashStatementNode, + context: core.ContextRef[BashScriptContext], + ) -> None: + """Initialize Bash and node. + + Typically, construction should be done via the create function rather than using this constructor directly. + + Parameters + ---------- + definition: bashparser_model.BinaryCmd + Parsed AND binary command AST. + lhs: BashStatementNode + Left-hand side (first) command. + rhs: BashStatementNode + Right-hand side (second) command. + context: core.ContextRef[BashScriptContext] + Bash script context. + """ + super().__init__() + self.definition = definition + self.lhs = lhs + self.rhs = rhs + self.context = context + + self._cfg = core.ControlFlowGraph.create_from_sequence([lhs, rhs]) + + def children(self) -> Iterator[core.Node]: + """Yield the subcommands.""" + yield self.lhs + yield self.rhs + + def get_entry(self) -> core.Node: + """Return the entry node (the lhs node).""" + return self._cfg.get_entry() + + def get_successors(self, node: core.Node, exit_type: core.ExitType) -> set[core.Node | core.ExitType]: + """Return the successor for a given node. 
+
+ Returns a propagated early exit of the same type in the case of a BashExit or BashReturn exit type.
+ """
+ if isinstance(exit_type, (BashExit, BashReturn)):
+ return {exit_type}
+ return self._cfg.get_successors(node, core.DEFAULT_EXIT)
+
+ def get_exit_state_transfer_filter(self) -> core.StateTransferFilter:
+ """Return state transfer filter to clear scopes owned by this node after this node exits."""
+ return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context))
+
+ def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]:
+ """Return a properties table containing the line number and scopes."""
+ result: dict[str, set[tuple[str | None, str]]] = {}
+ result["line num (in script)"] = {(None, str(self.definition["Pos"]["Line"]))}
+ printing.add_context_owned_scopes_to_properties_table(result, self.context)
+ return result
+
+ @staticmethod
+ def create(
+ and_cmd: bashparser_model.BinaryCmd, context: core.NonOwningContextRef[BashScriptContext]
+ ) -> BashAndNode:
+ """Create Bash AND node from AND binary command AST.
+
+ Parameters
+ ----------
+ and_cmd: bashparser_model.BinaryCmd
+ Parsed AND binary command AST.
+ context: core.NonOwningContextRef[BashScriptContext]
+ Bash script context.
+ """
+ lhs = BashStatementNode(and_cmd["X"], context)
+ rhs = BashStatementNode(and_cmd["Y"], context)
+ return BashAndNode(definition=and_cmd, lhs=lhs, rhs=rhs, context=context)
+
+
+class BashOrNode(core.ControlFlowGraphNode):
+ """Control flow node representing a Bash OR ("||") binary command.
+
+ Control flow structure consists of executing the left-hand side,
+ followed by the right-hand side.
+
+ (TODO model short circuit?)
+ """
+
+ #: Parsed OR binary command AST.
+ definition: bashparser_model.BinaryCmd
+ #: Left-hand side (first) command.
+ lhs: BashStatementNode
+ #: Right-hand side (second) command.
+ rhs: BashStatementNode
+ #: Bash script context.
+ context: core.ContextRef[BashScriptContext]
+ #: Control flow graph.
+ _cfg: core.ControlFlowGraph
+
+ def __init__(
+ self,
+ definition: bashparser_model.BinaryCmd,
+ lhs: BashStatementNode,
+ rhs: BashStatementNode,
+ context: core.ContextRef[BashScriptContext],
+ ) -> None:
+ """Initialize Bash OR node.
+
+ Typically, construction should be done via the create function rather than using this constructor directly.
+
+ Parameters
+ ----------
+ definition: bashparser_model.BinaryCmd
+ Parsed OR binary command AST.
+ lhs: BashStatementNode
+ Left-hand side (first) command.
+ rhs: BashStatementNode
+ Right-hand side (second) command.
+ context: core.ContextRef[BashScriptContext]
+ Bash script context.
+ """
+ super().__init__()
+ self.definition = definition
+ self.lhs = lhs
+ self.rhs = rhs
+ self.context = context
+
+ self._cfg = core.ControlFlowGraph.create_from_sequence([lhs, rhs])
+
+ def children(self) -> Iterator[core.Node]:
+ """Yield the subcommands."""
+ yield self.lhs
+ yield self.rhs
+
+ def get_entry(self) -> core.Node:
+ """Return the entry node (the lhs node)."""
+ return self._cfg.get_entry()
+
+ def get_successors(self, node: core.Node, exit_type: core.ExitType) -> set[core.Node | core.ExitType]:
+ """Return the successor for a given node.
+
+ Returns a propagated early exit of the same type in the case of a BashExit or BashReturn exit type.
+ """ + if isinstance(exit_type, (BashExit, BashReturn)): + return {exit_type} + return self._cfg.get_successors(node, core.DEFAULT_EXIT) + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table containing the line number and scopes.""" + result: dict[str, set[tuple[str | None, str]]] = {} + result["line num (in script)"] = {(None, str(self.definition["Pos"]["Line"]))} + printing.add_context_owned_scopes_to_properties_table(result, self.context) + return result + + @staticmethod + def create(or_cmd: bashparser_model.BinaryCmd, context: core.NonOwningContextRef[BashScriptContext]) -> BashOrNode: + """Create Bash OR node from OR binary command AST. + + Parameters + ---------- + and_cmd: bashparser_model.BinaryCmd + Parsed AND binary command AST. + context: core.NonOwningContextRef[BashScriptContext] + Bash script context. + """ + lhs = BashStatementNode(or_cmd["X"], context) + rhs = BashStatementNode(or_cmd["Y"], context) + return BashOrNode(definition=or_cmd, lhs=lhs, rhs=rhs, context=context) + + +class BashSingleCommandNode(core.InterpretationNode): + """Interpretation node representing a single Bash command. + + Defines how to interpret the semantics of the different supported commands that + may be invoked. + """ + + #: Parsed statement AST. + definition: bashparser_model.Stmt + #: Bash script context. + context: core.ContextRef[BashScriptContext] + #: Expression for command name. + cmd: facts.Value + #: Expressions for argument values (None if unrepresentable). + args: list[facts.Value | None] + #: Location expressions for where stdout is redirected to. + stdout_redirects: set[facts.Location] + + def __init__( + self, + definition: bashparser_model.Stmt, + context: core.ContextRef[BashScriptContext], + cmd: facts.Value, + args: list[facts.Value | None], + stdout_redirects: set[facts.Location], + ) -> None: + """Initialize Bash single command node. + + Parameters + ---------- + definition: bashparser_model.Stmt + Parsed statement AST. + context: core.ContextRef[BashScriptContext] + Bash script context. + cmd: facts.Value + Expression for command name. + args: list[facts.Value | None] + Expressions for argument values (None if unrepresentable). + stdout_redirects: set[facts.Location] + Location expressions for where stdout is redirected to. 
+ """ + super().__init__() + self.definition = definition + self.context = context + self.cmd = cmd + self.args = args + self.stdout_redirects = stdout_redirects + + def identify_interpretations(self, state: core.State) -> dict[core.InterpretationKey, Callable[[], core.Node]]: + """Interpret the semantics of the different supported commands that may be invoked.""" + eval_transformer = evaluation.EvaluationTransformer(state) + evaluated_writes = eval_transformer.transform_value(self.cmd) + result: dict[core.InterpretationKey, Callable[[], core.Node]] = {} + + for resolved_cmd, bindings in evaluated_writes: + match resolved_cmd: + case facts.StringLiteral("echo"): + # Echo command, may have two different interpretations: + # - The concrete semantics of writing to the location its stdout is directed to + # - If writing to the special GitHub output var file, the higher-level semantics + # of writing to the variable as specified in the echoed value. + if len(self.stdout_redirects) in {0, 1} and len(self.args) == 1: + first_arg = self.args[0] + stdout_redir = ( + next(iter(self.stdout_redirects)) + if len(self.stdout_redirects) == 1 + else facts.Location(self.context.ref.stdout_scope.ref, self.context.ref.stdout_loc) + ) + if first_arg is not None: + first_arg_val = first_arg + + def build_echo( + stdout_redir: facts.Location = stdout_redir, first_arg_val: facts.Value = first_arg_val + ) -> core.Node: + return models.BashEchoNode(stdout_redir, first_arg_val) + + github_context = self.context.ref.get_containing_github_context() + + if ( + self._is_github_output_loc(stdout_redir) + and github_context is not None + and github_context.output_var_prefix is not None + ): + output_var_prefix = github_context.output_var_prefix + job_variables_scope = github_context.job_context.ref.job_variables.ref + split = evaluation.parse_str_expr_split(first_arg, "=", maxsplit=1) + if len(split) == 2: + + def build_github_var_write( + job_variables_scope: facts.Scope = job_variables_scope, + output_var_prefix: str = output_var_prefix, + split: list[facts.Value] = split, + ) -> core.Node: + return models.VarAssignNode( + kind=models.VarAssignKind.GITHUB_JOB_VAR, + var_scope=job_variables_scope, + var_name=facts.BinaryStringOp.get_string_concat( + facts.StringLiteral(output_var_prefix), split[0] + ), + value=split[1], + ) + + result[("echo_github_var", bindings)] = build_github_var_write + + result[("echo", bindings)] = build_echo + case facts.StringLiteral("mvn"): + # Maven build command. + for arg in self.args: + match arg: + case facts.StringLiteral(arg_lit): + if arg_lit in {"package", "install", "deploy", "verify"}: + + def build_mvn_build() -> core.Node: + return models.MavenBuildModelNode( + filesystem_scope=self.context.ref.filesystem.ref + ) + + result[("mvn", bindings)] = build_mvn_build + case facts.StringLiteral("exit"): + # Exit command exits the script. + def build_exit_stmt() -> core.Node: + return BashExitNode() + + result[("exit", bindings)] = build_exit_stmt + case facts.StringLiteral("base64"): + # base64 command may encode or decode Base64 strings. 
+ + # TODO model other possibilities + if len(self.stdout_redirects) in {0, 1}: + stdout_redir = ( + next(iter(self.stdout_redirects)) + if len(self.stdout_redirects) == 1 + else facts.Location(self.context.ref.stdout_scope.ref, self.context.ref.stdout_loc) + ) + if len(self.args) == 0: + + def build_base64_encode(stdout_redir: facts.Location = stdout_redir) -> core.Node: + return models.Base64EncodeNode( + facts.Location(self.context.ref.stdin_scope.ref, self.context.ref.stdin_loc), + stdout_redir, + ) + + result[("base64_encode", bindings)] = build_base64_encode + elif len(self.args) == 1 and ( + self.args[0] == facts.StringLiteral("-d") or self.args[0] == facts.StringLiteral("--decode") + ): + + def build_base64_decode(stdout_redir: facts.Location = stdout_redir) -> core.Node: + return models.Base64DecodeNode( + facts.Location(self.context.ref.stdin_scope.ref, self.context.ref.stdin_loc), + stdout_redir, + ) + + result[("base64_decode", bindings)] = build_base64_decode + case facts.StringLiteral(cmd_name) if cmd_name.endswith(".sh"): + # Invoking another shell script. + + # TODO pass arguments + + repo_path = self.context.ref.get_containing_analysis_context().repo_path + if repo_path is not None: + # Check for path traversal patterns before analyzing a bash file. + # TODO working dir + bash_file_path = os.path.realpath(os.path.join(repo_path, "", cmd_name)) + if os.path.exists(bash_file_path) and bash_file_path.startswith(repo_path): + + def build_run_bash_script_file(bash_file_path: str = bash_file_path) -> core.Node: + bash_text = "" + with open(bash_file_path, encoding="utf-8") as bash_file: + bash_text = bash_file.read() + return RawBashScriptNode( + facts.StringLiteral(bash_text), + core.OwningContextRef( + BashScriptContext.create_from_bash_script(self.context, bash_file_path) + ), + ) + + result[("run_file_bash_script", bindings)] = build_run_bash_script_file + case facts.StringLiteral(cmd_name): + # If the command name is a defined shell function (as resolved from a read of the variable of that + # name in the function decl scope), then create a function call to the function definition stored + # in that variable. 
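+ # For example (illustrative), once `deploy() { ...; }` has been analysed, its
+ # serialized declaration is stored under "deploy" in the func_decls scope, so a
+ # later bare `deploy` statement resolves here to a call of that definition.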
+ + evaluated_func_decls = evaluation.evaluate( + self, + facts.Read( + facts.Location( + scope=self.context.ref.func_decls.ref, loc=facts.Variable(facts.StringLiteral(cmd_name)) + ) + ), + ) + for resolved_func, resolved_func_bindings in evaluated_func_decls: + if isinstance(resolved_func, facts.StringLiteral): + combined_func_bindings = evaluation.ReadBindings.combine_bindings( + [bindings, resolved_func_bindings] + ) + if combined_func_bindings is not None: + resolved_func_json = resolved_func.literal + + def build_func_call(func_json: str = resolved_func_json) -> core.Node: + func_decl = cast(bashparser_model.FuncDecl, json.loads(func_json)) + return BashFuncCallNode( + self.definition, + func_decl, + BashBlockNode.create([func_decl["Body"]], self.context.get_non_owned()), + self.context, + ) + + result[("function_call", combined_func_bindings)] = build_func_call + + def build_noop() -> core.Node: + return core.NoOpStatementNode() + + if not isinstance(self.cmd, facts.StringLiteral) or len(result) == 0: + result["default"] = build_noop + + return result + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table. + + Contains the line number, command expression, argument expressions, stdout redirect location expressions, and scopes. + """ + properties: dict[str, set[tuple[str | None, str]]] = {} + properties["line num (in script)"] = {(None, str(self.definition["Pos"]["Line"]))} + properties["cmd"] = {(None, self.cmd.to_datalog_fact_string())} + for index, arg in enumerate(self.args): + properties["arg" + str(index)] = { + (None, arg.to_datalog_fact_string()) if arg is not None else (None, "UNKNOWN") + } + properties["stdout_redirects"] = {(None, x.to_datalog_fact_string()) for x in self.stdout_redirects} + printing.add_context_owned_scopes_to_properties_table(properties, self.context) + return properties + + @staticmethod + def _is_github_output_loc(loc: facts.Location) -> bool: + """Return whether the location is the special GitHub output variable file.""" + match loc: + case facts.Location( + _, facts.Filesystem(facts.Read(facts.Location(_, facts.Variable(facts.StringLiteral("GITHUB_OUTPUT"))))) + ): + return True + return False + + +class BashExitNode(core.StatementNode): + """Statement node representing a Bash exit command. + + Always exits with the BashExit exit type (which causes the whole script to exit). + """ + + def apply_effects(self, before_state: core.State) -> dict[core.ExitType, core.State]: + """Apply the effects of the Bash exit. + + Returns a BashExit exit state that is otherwise the same as the before state. + """ + state = core.State() + core.transfer_state(before_state, state) + return {BASH_EXIT: state} + + +@dataclass(frozen=True) +class LiteralOrEnvVar: + """Represents either a literal or a read of an environment variable.""" + + #: Whether this represents an environment variable (or else a string literal). + is_env_var: bool + #: The environment variable name or string literal value. + literal: str + + +def is_simple_var_read(param_exp: bashparser_model.ParamExp) -> bool: + """Return whether expression is a simple env var read e.g. 
$ENV_VAR.""" + if param_exp.get("Excl", False) or param_exp.get("Length", False) or param_exp.get("Width", False): + return False + if ( + "Index" in param_exp + or "Slice" in param_exp + or "Repl" in param_exp + or "Names" in param_exp + or "Exp" in param_exp + ): + return False + return True + + +def parse_env_var_read_word_part(part: bashparser_model.WordPart, allow_dbl_quoted: bool) -> str | None: + """Parse word part as a read of an environment variable. + + If the given word part is a read of an env var (possibly enclosed in double quotes, if allowed), + return the name of the variable, otherwise None. + """ + if bashparser_model.is_dbl_quoted(part): + if not allow_dbl_quoted: + return None + if "Parts" not in part or len(part["Parts"]) == 0: + return "" + if len(part["Parts"]) == 1: + part = part["Parts"][0] + else: + return None + + if bashparser_model.is_param_exp(part): + if not is_simple_var_read(part): + return None + return part["Param"]["Value"] + + return None + + +def parse_env_var_read_word(word: bashparser_model.Word, allow_dbl_quoted: bool) -> str | None: + """Parse word as a read of an environment variable. + + If the given word is a read of an env var (possibly enclosed in double quotes, if allowed), + return the name of the variable, otherwise None. + """ + if len(word["Parts"]) == 1: + part = word["Parts"][0] + return parse_env_var_read_word_part(part, allow_dbl_quoted) + return None + + +def parse_content(parts: list[bashparser_model.WordPart], allow_dbl_quoted: bool) -> list[LiteralOrEnvVar] | None: + """Parse the given sequence of word parts. + + Return a representation as a sequence of string literal and env var reads, or else return None if not representable in this way. + + If allow_dbl_quoted is True, permit word parts to be double quoted expressions, the content of which will + be included in the sequence (if False, return None if the sequence contains double quoted expressions). 
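+
+    For example, the parts of the word "$PREFIX"/bin (with allow_dbl_quoted=True) would be represented
+    as [LiteralOrEnvVar(is_env_var=True, literal="PREFIX"), LiteralOrEnvVar(is_env_var=False, literal="/bin")].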
+    """
+    content: list[LiteralOrEnvVar] = []
+    for part in parts:
+        env_var = parse_env_var_read_word_part(part, allow_dbl_quoted)
+        if env_var is not None:
+            content.append(LiteralOrEnvVar(is_env_var=True, literal=env_var))
+        elif bashparser_model.is_lit(part):
+            content.append(LiteralOrEnvVar(is_env_var=False, literal=part["Value"]))
+        elif bashparser_model.is_dbl_quoted(part) and "Parts" in part:
+            subcontent = parse_content(part["Parts"], False)
+            if subcontent is None:
+                return None
+            content.extend(subcontent)
+        else:
+            return None
+    return content
+
+
+def convert_shell_value_sequence_to_fact_value(
+    content: list[LiteralOrEnvVar], context: BashScriptContext
+) -> facts.Value:
+    """Convert sequence of Bash values into a single concatenated expression."""
+    if len(content) == 0:
+        raise CallGraphError("sequence cannot be empty")
+
+    first_val = convert_shell_value_to_fact_value(content[0], context)
+    if len(content) == 1:
+        return first_val
+
+    rest_val = convert_shell_value_sequence_to_fact_value(content[1:], context)
+
+    return facts.BinaryStringOp(op=facts.BinaryStringOperator.STRING_CONCAT, operand1=first_val, operand2=rest_val)
+
+
+def convert_shell_value_to_fact_value(val: LiteralOrEnvVar, context: BashScriptContext) -> facts.Value:
+    """Convert a Bash literal or env var read into a value expression."""
+    if val.is_env_var:
+        return facts.Read(
+            loc=facts.Location(scope=context.env.ref, loc=facts.Variable(name=facts.StringLiteral(literal=val.literal)))
+        )
+    return facts.StringLiteral(literal=val.literal)
+
+
+def convert_shell_word_to_value(
+    word: bashparser_model.Word, context: BashScriptContext
+) -> tuple[facts.Value, bool] | None:
+    """Convert a Bash word into a value expression.
+
+    Return value expression alongside a bool indicating whether the value is
+    "quoted" (or else may require further expansion post-resolution if "unquoted").
+    """
+    dbl_quoted_parts = parse_dbl_quoted_string(word)
+    if dbl_quoted_parts is not None:
+        return convert_shell_value_sequence_to_fact_value(dbl_quoted_parts, context), True
+
+    sgl_quoted_str = parse_sgl_quoted_string(word)
+    if sgl_quoted_str is not None:
+        return facts.StringLiteral(sgl_quoted_str), True
+
+    singular_literal = parse_singular_literal(word)
+    if singular_literal is not None:
+        return facts.StringLiteral(literal=singular_literal), True
+
+    single_var = parse_env_var_read_word(word, False)
+    if single_var is not None:
+        return convert_shell_value_to_fact_value(LiteralOrEnvVar(True, single_var), context), False
+
+    return None
+
+
+def parse_dbl_quoted_string(word: bashparser_model.Word) -> list[LiteralOrEnvVar] | None:
+    """Parse double quoted string.
+
+    If the given word is a double quoted expression, return
+    a representation as a sequence of string literal and env var reads, or
+    else return None if it is not a double quoted expression or if it is
+    not representable in this way.
+    """
+    if len(word["Parts"]) == 1:
+        part = word["Parts"][0]
+        if bashparser_model.is_dbl_quoted(part) and "Parts" in part:
+            return parse_content(part["Parts"], False)
+
+    return None
+
+
+def parse_sgl_quoted_string(word: bashparser_model.Word) -> str | None:
+    """Parse single quoted string.
+
+    If the given word is a single quoted string, return the string
+    literal content, otherwise return None.
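+
+    For example, the word 'foo bar' yields the literal string "foo bar".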
+    """
+    if len(word["Parts"]) == 1:
+        part = word["Parts"][0]
+        if bashparser_model.is_sgl_quoted(part):
+            return part["Value"]
+
+    return None
+
+
+def parse_singular_literal(word: bashparser_model.Word) -> str | None:
+    """Parse singular literal word.
+
+    If the given word is a single literal, return the string
+    literal content, otherwise return None.
+    """
+    if len(word["Parts"]) == 1:
+        part = word["Parts"][0]
+        if bashparser_model.is_lit(part):
+            return part["Value"]
+
+    return None
+
+
+# Cache for Bash expression parsing.
+# note: not thread safe
+_bashparser_cache: dict[str, list[bashparser_model.Word] | None] = {}
+
+
+def parse_bash_expr(expr: str) -> list[bashparser_model.Word] | None:
+    """Parse bash expression.
+
+    Results are cached to avoid unnecessary invocations of the Bash parser
+    (since it requires spawning a separate process).
+    """
+    if expr in _bashparser_cache:
+        return _bashparser_cache[expr]
+    try:
+        parse_result = bashparser.parse_expr(expr, MACARON_PATH)
+        _bashparser_cache[expr] = parse_result
+        return parse_result
+    except ParseError:
+        # Cache parse failures as well, so repeated bad expressions do not re-spawn the parser.
+        _bashparser_cache[expr] = None
+        return None
diff --git a/src/macaron/code_analyzer/dataflow_analysis/cmd_parser.py b/src/macaron/code_analyzer/dataflow_analysis/cmd_parser.py
new file mode 100644
index 000000000..f6a074a90
--- /dev/null
+++ b/src/macaron/code_analyzer/dataflow_analysis/cmd_parser.py
@@ -0,0 +1,88 @@
+# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+"""This module contains parsers for command line interfaces for commands relevant to analysis."""
+
+from __future__ import annotations
+
+import argparse
+
+
+def parse_python_command_line(args: list[str]) -> argparse.Namespace:
+    """Parse python command line.
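+
+    Handles a subset of the CPython interpreter interface: a -m module, a -c command, or a script
+    file, with all remaining arguments collected into a synthesized subprocess_args attribute.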
+
+    Parameters
+    ----------
+    args: list[str]
+        The argument list passed to the python command.
+
+    Returns
+    -------
+    argparse.Namespace
+        The parsed python command arguments.
+    """
+    parser = argparse.ArgumentParser(add_help=False)
+    parser.add_argument("-B", action="store_true")
+    parser.add_argument("-b", action="count")
+    parser.add_argument("--check-hash-based-pycs")
+    parser.add_argument("-d", action="store_true")
+    parser.add_argument("-E", action="store_true")
+    parser.add_argument("-h", action="store_true")
+    parser.add_argument("-?", action="store_true", dest="h")
+    parser.add_argument("--help", action="store_true", dest="h")
+    parser.add_argument("--help-env", action="store_true")
+    parser.add_argument("--help-xoptions", action="store_true")
+    parser.add_argument("--help-all", action="store_true")
+    parser.add_argument("-i", action="store_true")
+    parser.add_argument("-I", action="store_true")
+    parser.add_argument("-O", action="count")
+    parser.add_argument("-P", action="store_true")
+    parser.add_argument("-q", action="store_true")
+    parser.add_argument("-s", action="store_true")
+    parser.add_argument("-S", action="store_true")
+    parser.add_argument("-u", action="store_true")
+    parser.add_argument("-v", action="count")
+    parser.add_argument("-V", action="count")
+    parser.add_argument("--version", action="count", dest="V")
+    parser.add_argument("-W", action="store")
+    parser.add_argument("-X", action="store")
+    parser.add_argument("-m", nargs=argparse.REMAINDER)
+    parser.add_argument("-c", nargs=argparse.REMAINDER)
+    parser.add_argument("file", nargs=argparse.REMAINDER)
+
+    parsed_args = parser.parse_args(args)
+
+    if parsed_args.m is not None:
+        parsed_args.subprocess_args = parsed_args.m[1:]
+        parsed_args.m = parsed_args.m[0]
+        parsed_args.file = None
+    elif parsed_args.c is not None:
+        parsed_args.subprocess_args = parsed_args.c[1:]
+        parsed_args.c = parsed_args.c[0]
+        parsed_args.file = None
+    else:
+        if len(parsed_args.file) > 0 and parsed_args.file[0] == "--":
+            parsed_args.file = parsed_args.file[1:]
+        if len(parsed_args.file) == 0:
+            parsed_args.subprocess_args = []
+            parsed_args.file = None
+        else:
+            parsed_args.subprocess_args = parsed_args.file[1:]
+            parsed_args.file = parsed_args.file[0]
+
+    return parsed_args
+
+
+def main() -> None:
+    """Test python command line parser."""
+    print(str(parse_python_command_line(["-B", "-m", "pip", "install", "-U", "cibuildwheel"])))  # noqa: T201
+    print(str(parse_python_command_line(["-B", "pip.py", "install", "-U", "cibuildwheel"])))  # noqa: T201
+    print(str(parse_python_command_line(["-B", "--", "--pip.py", "install", "-U", "cibuildwheel"])))  # noqa: T201
+    print(  # noqa: T201
+        str(parse_python_command_line(["-B", "-c", "import sys; print(sys.argv[1:])", "install", "-U", "cibuildwheel"]))
+    )
+    print(str(parse_python_command_line(["-B"])))  # noqa: T201
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/macaron/code_analyzer/dataflow_analysis/core.py b/src/macaron/code_analyzer/dataflow_analysis/core.py
new file mode 100644
index 000000000..5a33ef56a
--- /dev/null
+++ b/src/macaron/code_analyzer/dataflow_analysis/core.py
@@ -0,0 +1,695 @@
+# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
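An editorial aside: the first self-test in `main()` above exercises the `-m` branch. Below is a hedged sketch of the expected result; the attribute values are inferred from the parser definition, not from recorded output:

```python
from macaron.code_analyzer.dataflow_analysis.cmd_parser import parse_python_command_line

ns = parse_python_command_line(["-B", "-m", "pip", "install", "-U", "cibuildwheel"])
# The REMAINDER capture for -m is split into the module name and its arguments.
assert ns.B is True
assert ns.m == "pip"
assert ns.subprocess_args == ["install", "-U", "cibuildwheel"]
assert ns.file is None
```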
+
+"""Core dataflow analysis framework definitions and algorithm."""
+
+from __future__ import annotations
+
+import functools
+from abc import ABC, abstractmethod
+from collections import defaultdict
+from collections.abc import Callable, Iterator, Sequence
+from dataclasses import dataclass
+from queue import Queue
+from typing import Generic, Protocol, TypeGuard, TypeVar
+
+from macaron.code_analyzer.dataflow_analysis import facts
+from macaron.errors import CallGraphError
+
+# Debug sequence number used to provide ordering information in debug graph.
+# note: not thread safe
+DEBUG_SEQUENCE_NUMBER = 0
+
+
+def reset_debug_sequence_number() -> None:
+    """Reset debug sequence number."""
+    global DEBUG_SEQUENCE_NUMBER  # pylint: disable=global-statement
+    DEBUG_SEQUENCE_NUMBER = 0
+
+
+def get_debug_sequence_number() -> int:
+    """Get current debug sequence number value."""
+    return DEBUG_SEQUENCE_NUMBER
+
+
+def increment_debug_sequence_number() -> None:
+    """Increment debug sequence number."""
+    global DEBUG_SEQUENCE_NUMBER  # pylint: disable=global-statement
+    DEBUG_SEQUENCE_NUMBER += 1
+
+
+@dataclass(frozen=True)
+class StateDebugLabel:
+    """Label for state fact providing information useful for debugging.
+
+    Provides a record of analysis ordering and whether the fact was just copied
+    from another state rather than newly produced.
+    """
+
+    #: Sequence number at time when state fact was created.
+    sequence_number: int
+    #: Whether the state fact is just copied from another state rather than newly produced.
+    copied: bool
+
+
+class StateTransferFilter(ABC):
+    """Interface for state transfer filters, which filter out state facts by location."""
+
+    @abstractmethod
+    def should_transfer(self, loc: facts.Location) -> bool:
+        """Return whether facts with the given locations should be transferred or else filtered out."""
+
+
+class State:
+    """Representation of the abstract storage state at some program point.
+
+    Consists of a set of abstract locations, each associated with a set of possible values.
+    """
+
+    #: Mapping of locations to a set of possible values.
+    #: Values are annotated with a label containing info relevant for debugging
+    state: dict[facts.Location, dict[facts.Value, StateDebugLabel]]
+
+    def __init__(self) -> None:
+        """Construct an empty state."""
+        self.state = defaultdict(dict)
+
+
+class DefaultStateTransferFilter(StateTransferFilter):
+    """Default state transfer filter that includes all locations."""
+
+    def should_transfer(self, loc: facts.Location) -> bool:
+        """Transfer all locations."""
+        return True
+
+
+# Convenience instance of DefaultStateTransferFilter
+DEFAULT_STATE_TRANSFER_FILTER = DefaultStateTransferFilter()
+
+
+class ExcludedLocsStateTransferFilter(StateTransferFilter):
+    """State transfer filter that excludes any locations in the given set."""
+
+    #: Locations to exclude.
+    excluded_locs: set[facts.Location]
+
+    def __init__(self, excluded_locs: set[facts.Location]) -> None:
+        """Construct filter that excludes the given locations."""
+        self.excluded_locs = excluded_locs
+
+    def should_transfer(self, loc: facts.Location) -> bool:
+        """Return whether facts with the given locations should be transferred or else filtered out."""
+        return loc not in self.excluded_locs
+
+
+class ExcludedScopesStateTransferFilter(StateTransferFilter):
+    """State transfer filter that excludes any locations that are within the scopes in the given set."""
+
+    #: Scopes to exclude.
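+    #: Note that only a location's immediate scope is tested for membership, not its enclosing scopes.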
+ excluded_scopes: set[facts.Scope] + + def __init__(self, excluded_scopes: set[facts.Scope]) -> None: + """Construct filter that excludes the given scopes.""" + self.excluded_scopes = excluded_scopes + + def should_transfer(self, loc: facts.Location) -> bool: + """Return whether facts with the given locations should be transferred or else filtered out.""" + return loc.scope not in self.excluded_scopes + + +def transfer_state( + src_state: State, + dest_state: State, + transfer_filter: StateTransferFilter = DEFAULT_STATE_TRANSFER_FILTER, + debug_is_copy: bool = True, +) -> bool: + """Transfer/copy all facts in the src state to the dest state, except those excluded by the given filter. + + Parameters + ---------- + src_state: State + The state to transfer facts from. + dest_state: State + The state to modify by transferring facts to. + transfer_filter: StateTransferFilter + The filter to apply to the transferred facts (by default, transfer all). + debug_is_copy: bool + Whether the facts newly added to the dest state should be recorded as being copied or not (for debugging purposes). + + Returns + ------- + bool + Whether the dest state was modified. + """ + changed = False + for loc, vals in src_state.state.items(): + if not transfer_filter.should_transfer(loc): + continue + exit_vals = dest_state.state[loc] + for val, label in vals.items(): + if val not in exit_vals: + exit_vals[val] = StateDebugLabel(get_debug_sequence_number(), True if debug_is_copy else label.copied) + changed = True + return changed + + +class ExitType(ABC): + """Representation of an exit type, describing the manner in which the execution of a node may terminate.""" + + @abstractmethod + def __hash__(self) -> int: + pass + + @abstractmethod + def __eq__(self, other: object) -> bool: + pass + + +class DefaultExit(ExitType): + """Default, normal exit.""" + + def __hash__(self) -> int: + return 19391 + + def __eq__(self, other: object) -> bool: + return isinstance(other, DefaultExit) + + +# Convenience instance of DefaultExit. +DEFAULT_EXIT = DefaultExit() + + +class Node(ABC): + """Base class of all node types in dataflow analysis. + + Subclasses will represent the various program/semantic constructs, + and define how to analyse them. + """ + + #: Abstract state at the point before the execution of this node. + before_state: State + + #: Abstract state at the point after the execution of this node, for each possible distinct exit type. + exit_states: dict[ExitType, State] + + #: Sequence number at the point the node was created, recorded for debugging purposes. + created_debug_sequence_num: int + #: Log of begin/end sequence numbers each time this node was processed, recorded for debugging purposes. + processed_log: list[tuple[int, int]] + + def __init__(self) -> None: + """Initialize with empty states.""" + self.before_state = State() + self.exit_states = defaultdict(State) + self.created_debug_sequence_num = get_debug_sequence_number() + self.processed_log = [] + + @abstractmethod + def children(self) -> Iterator[Node]: + """Yield the child nodes of this node.""" + + @abstractmethod + def analyse(self) -> bool: + """Perform analysis of this node (and potentially any child nodes). + + Update the exit states with the analysis result. + Returns whether anything was modified. 
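+
+        Implementations must return True whenever any state or node structure has changed, so that
+        the enclosing fixpoint computation knows to keep iterating.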
+        """
+        raise NotImplementedError
+
+    def is_processed(self) -> bool:
+        """Return whether this node has been processed."""
+        return len(self.processed_log) > 0
+
+    def notify_processed(self, begin_seq_num: int, end_seq_num: int) -> None:
+        """Record that this node has been processed."""
+        self.processed_log.append((begin_seq_num, end_seq_num))
+
+    def get_exit_state_transfer_filter(self) -> StateTransferFilter:
+        """Return the state transfer filter applicable to the exit state of this node.
+
+        By default, nothing is excluded. Subclasses should override to provide appropriate filters
+        to avoid transferring state that will be irrelevant after the node exits.
+        """
+        return DEFAULT_STATE_TRANSFER_FILTER
+
+    def __hash__(self) -> int:
+        return id(self)
+
+    def __eq__(self, other: object) -> bool:
+        return self is other
+
+    def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]:
+        """Return a table of stringified properties, describing the details of this node, for debugging purposes.
+
+        The returned properties table is a mapping of name to value-set, which can be rendered via the functions
+        in the printing module.
+        """
+        return {}
+
+
+def node_is_not_none(node: Node | None) -> TypeGuard[Node]:
+    """Return whether the given node is not None."""
+    return node is not None
+
+
+def traverse_bfs(node: Node) -> Iterator[Node]:
+    """Traverse the node tree in a breadth-first manner, yielding the nodes (including this node) in traversal order."""
+    queue: Queue[Node] = Queue()
+    queue.put(node)
+    while not queue.empty():
+        next_node = queue.get()
+        yield next_node
+        for child in next_node.children():
+            queue.put(child)
+
+
+def build_parent_mapping(node: Node) -> dict[Node, Node]:
+    """Construct a mapping of nodes to their parent nodes."""
+    parents: dict[Node, Node] = {}
+
+    queue: Queue[Node] = Queue()
+    queue.put(node)
+    while not queue.empty():
+        next_node = queue.get()
+        for child in next_node.children():
+            parents[child] = next_node
+            queue.put(child)
+
+    return parents
+
+
+class NodeForest:
+    """A collection of independent root nodes (with no control-flow or relation between them)."""
+
+    #: Collection of root nodes.
+    root_nodes: list[Node]
+    #: Mapping of nodes to their parent nodes.
+    parents: dict[Node, Node]
+
+    def __init__(self, root_nodes: list[Node]) -> None:
+        """Construct a NodeForest for the given nodes, and build the parent mapping."""
+        self.root_nodes = root_nodes
+        self.parents = {}
+        for root_node in root_nodes:
+            root_node_parents = build_parent_mapping(root_node)
+            self.parents.update(root_node_parents)
+
+
+class ControlFlowGraph:
+    """Graph structure to represent control flow graphs."""
+
+    #: Entry node.
+    entry: Node
+    #: Graph of successor edges.
+    #: Each edge is from a particular exit of a particular node, either to a node or to an exit of the control flow itself.
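+    #: Represented as a mapping of source node -> exit type -> set of destinations, where an ExitType
+    #: destination denotes the corresponding exit of this graph as a whole.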
+ successors: dict[Node, dict[ExitType, set[Node | ExitType]]] + + def __init__(self, entry: Node) -> None: + """Construct an initially-empty control flow graph.""" + self.entry = entry + self.successors = defaultdict(lambda: defaultdict(set)) + + def get_entry(self) -> Node: + """Return the entry node.""" + return self.entry + + def add_successor(self, src: Node, exit_type: ExitType, dest: Node | ExitType) -> None: + """Add a successor edge to the control flow graph.""" + self.successors[src][exit_type].add(dest) + + def get_successors(self, node: Node, exit_type: ExitType) -> set[Node | ExitType]: + """Return the successors for a particular exit of a particular node.""" + return self.successors[node][exit_type] + + @staticmethod + def create_from_sequence(seq: Sequence[Node]) -> ControlFlowGraph: + """Construct a linear sequence of nodes.""" + if len(seq) == 0: + raise CallGraphError("cannot create control flow graph from empty sequence") + cfg = ControlFlowGraph(seq[0]) + prev_node = seq[0] + for node in seq[1:]: + cfg.add_successor(prev_node, DEFAULT_EXIT, node) + prev_node = node + + cfg.add_successor(prev_node, DEFAULT_EXIT, DEFAULT_EXIT) + + return cfg + + +class ControlFlowGraphNode(Node): + """Base class for nodes representing control-flow constructs. + + Defines the generic algorithm for analysing control flow graphs. + Subclasses will define the child nodes and concrete graph structure. + """ + + def _propagate_edges( + self, + worklist: set[Node], + src_state: State, + state_transfer_filter: StateTransferFilter, + successors: set[Node | ExitType], + ) -> bool: + changed = False + for successor in successors: + if isinstance(successor, Node): + transfer_changed = transfer_state(src_state, successor.before_state, state_transfer_filter) + changed = changed or transfer_changed + if transfer_changed or not successor.is_processed(): + worklist.add(successor) + elif isinstance(successor, ExitType): + changed = transfer_state(src_state, self.exit_states[successor], state_transfer_filter) or changed + return changed + + def analyse(self) -> bool: + """Perform analysis of this node. + + Performs analysis of the child nodes and propagates state from the exit state of an updated node to the before + state of its successor nodes, according to the control-flow-graph structure, then analyses the successor nodes, + and so on until a fixpoint is reached and no further updates may be made to any node states. + + Returns whether anything was modified. 
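+
+        This is a standard worklist algorithm: a node is (re)analysed whenever its before state
+        gains new facts, or when it has not yet been processed at all.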
+ """ + begin_seq_num = get_debug_sequence_number() + entry_node = self.get_entry() + if entry_node is None: + changed = transfer_state(self.before_state, self.exit_states[DEFAULT_EXIT]) + increment_debug_sequence_number() + return changed + + changed = transfer_state(self.before_state, entry_node.before_state) + increment_debug_sequence_number() + + worklist = {entry_node} + + while len(worklist) > 0: + next_node = worklist.pop() + next_changed = next_node.analyse() + changed = changed or next_changed + + next_state_transfer_filter = next_node.get_exit_state_transfer_filter() + + for exit_type, exit_state in next_node.exit_states.items(): + successors = self.get_successors(next_node, exit_type) + changed = self._propagate_edges(worklist, exit_state, next_state_transfer_filter, successors) or changed + + increment_debug_sequence_number() + + self.notify_processed(begin_seq_num, get_debug_sequence_number() - 1) + return changed + + @abstractmethod + def get_entry(self) -> Node | None: + """Return the entry node.""" + + @abstractmethod + def get_successors(self, node: Node, exit_type: ExitType) -> set[Node | ExitType]: + """Return the successors for a particular exit of a particular node.""" + + +class StatementNode(Node): + """Base class for nodes representing constructs with direct effects (and no child nodes). + + Subclasses will define the effects that apply when the node is executed. + """ + + def analyse(self) -> bool: + """Perform analysis of this node, by applying the effects to update the after state. + + Returns whether anything was modified. + """ + begin_seq_num = get_debug_sequence_number() + new_exit_states = self.apply_effects(self.before_state) + changed = False + for new_exit_type, new_exit_state in new_exit_states.items(): + changed = transfer_state(new_exit_state, self.exit_states[new_exit_type], debug_is_copy=False) or changed + + self.notify_processed(begin_seq_num, get_debug_sequence_number()) + increment_debug_sequence_number() + return changed + + def children(self) -> Iterator[Node]: + """Yield nothing, as statements have no child nodes.""" + yield from () + + @abstractmethod + def apply_effects(self, before_state: State) -> dict[ExitType, State]: + """Apply the effects of the statement, given the before state, returning the resulting exit state.""" + + +class NoOpStatementNode(StatementNode): + """Statement that has no effect.""" + + def apply_effects(self, before_state: State) -> dict[ExitType, State]: + """Apply the effects of the no-op, returning an exit state that is the same as the before state.""" + state = State() + transfer_state(before_state, state) + return {DEFAULT_EXIT: state} + + +class InterpretationKey(Protocol): + """Interpretation key used to identify interpretations that have been produced before. + + Must support hashing and equality comparison to allow use as a dict key. + """ + + @abstractmethod + def __hash__(self) -> int: + pass + + @abstractmethod + def __eq__(self, other: object, /) -> bool: + pass + + +class InterpretationNode(Node): + """Base class for nodes representing constructs requiring interpretation. + + Such constructs must be interpreted to produce possibly-multiple child nodes representing possible + interpretations of the semantics of the node. + + Analysing the interpretation node will apply the combined effects of all of the possible interpretations. + Subclasses will define how to identify the possible interpretations and generate the corresponding nodes. 
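+
+    Interpretations are keyed so that, as the before state grows across fixpoint iterations, only
+    interpretations that have not been seen before cause new child nodes to be constructed.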
+ """ + + #: The generated interpretations of this node, identified/deduplicated by some interpretation key. + interpretations: dict[InterpretationKey, Node] + + def __init__(self) -> None: + """Initialize node with no interpretations.""" + super().__init__() + self.interpretations = {} + + def children(self) -> Iterator[Node]: + """Yield each of the possible interpretations.""" + yield from self.interpretations.values() + + def update_interpretations(self) -> bool: + """Analyse the node to identify interpretations. + + Analysis is done in the context of the current before state, adding any + new interpretations generated to the interpretations dict. + """ + latest_interpretations = self.identify_interpretations(self.before_state) + new_interpretations = {x: y for (x, y) in latest_interpretations.items() if x not in self.interpretations} + for new_interpretation, build_node in new_interpretations.items(): + self.interpretations[new_interpretation] = build_node() + + return len(new_interpretations) != 0 + + @abstractmethod + def identify_interpretations(self, state: State) -> dict[InterpretationKey, Callable[[], Node]]: + """Analyse the node, in the context of the given before state, to identify interpretations. + + Returns, for each discovered interpretation, an identifying interpretation key that can be used + to determine if the interpretation has been produced previously, and a callable that generates + the node representing that interpretation (used to generate the node if the interpretation is new, + otherwise the previously-generated node will be reused). + """ + + def analyse(self) -> bool: + """Perform analysis of this node, by analysing each possible interpretation. + + Merges the exit states of each analysed interpretation to update the exit state of this node. + + Returns whether anything was modified. + """ + begin_seq_num = get_debug_sequence_number() + + interpretations_changed = self.update_interpretations() + + increment_debug_sequence_number() + + sub_nodes_changed = False + exit_changed = False + + key_transfer_changed: dict[InterpretationKey, bool] = {} + + for key, node in self.interpretations.items(): + transfer_changed = transfer_state(self.before_state, node.before_state) + key_transfer_changed[key] = transfer_changed + sub_nodes_changed = sub_nodes_changed or transfer_changed + + increment_debug_sequence_number() + + for key, node in self.interpretations.items(): + if key_transfer_changed[key] or not node.is_processed(): + analyse_changed = node.analyse() + sub_nodes_changed = sub_nodes_changed or analyse_changed + + for node in self.interpretations.values(): + for exit_type, exit_state in node.exit_states.items(): + if exit_type not in self.exit_states: + exit_changed = True + exit_changed = ( + transfer_state(exit_state, self.exit_states[exit_type], node.get_exit_state_transfer_filter()) + or exit_changed + ) + + self.notify_processed(begin_seq_num, get_debug_sequence_number()) + increment_debug_sequence_number() + + return interpretations_changed or sub_nodes_changed or exit_changed + + +R_co = TypeVar("R_co", covariant=True) + + +@dataclass(frozen=True) +class OwningContextRef(Generic[R_co]): + """A reference to a part of a node's context that "owns" it. + + Ownership is used to identify what scopes are tied to a particular node + such that they cease to exist or become irrelevant after the node exits, + and thus any values stored in locations within those scopes may be erased + from the state beyond that point to simplify the state. 
+ """ + + ref: R_co + + def get_non_owned(self) -> NonOwningContextRef[R_co]: + """Return a non owning reference to the same object.""" + return NonOwningContextRef(self.ref) + + +@dataclass(frozen=True) +class NonOwningContextRef(Generic[R_co]): + """A reference to a part of a node's context that does not "own" it. + + Ownership is used to identify what scopes are tied to a particular node + such that they cease to exist or become irrelevant after the node exits, + and thus any values stored in locations within those scopes may be erased + from the state beyond that point to simplify the state. + """ + + ref: R_co + + def get_non_owned(self) -> NonOwningContextRef[R_co]: + """Return a non-owning reference to the same object.""" + return self + + +# A context ref may be owning or non-owning. +ContextRef = OwningContextRef[R_co] | NonOwningContextRef[R_co] + + +class Context(ABC): + """Base class for node contexts. + + Represents the necessary context that influences the analysis of a node, + primarily that of identifying the concrete scopes that fill particular + roles in the node. + """ + + @abstractmethod + def direct_refs(self) -> Iterator[ContextRef[Context] | ContextRef[facts.Scope]]: + """Yield the direct references of the context, either to scopes or to other contexts.""" + + def owned_scopes(self) -> Iterator[OwningContextRef[facts.Scope]]: + """Yield the scopes that are owned by this context. + + Owned scopes are those that are directly referenced by owning references or scopes + that are indirectly referenced by owning references, through referenced contexts that + are referenced by owning references. + """ + for ref in self.direct_refs(): + if isinstance(ref, OwningContextRef): + if isinstance(ref.ref, Context): + yield from ref.ref.owned_scopes() + else: + yield ref + + +@dataclass(frozen=True) +class AnalysisContext(Context): + """Outermost context of the analysis. + + Records the path to the repo checkout, to allow the analysis access to files in the repo. + """ + + repo_path: str | None + + def direct_refs(self) -> Iterator[ContextRef[Context] | ContextRef[facts.Scope]]: + """No direct references, yields nothing.""" + yield from [] + + +class SimpleSequence(ControlFlowGraphNode): + """Control-flow-graph node representing the execution of a sequence of nodes.""" + + #: The sequence of nodes to execute. + seq: list[Node] + #: The control flow graph. + _cfg: ControlFlowGraph + + def __init__(self, seq: list[Node]) -> None: + """Construct control-flow-graph from sequence.""" + super().__init__() + self.seq = seq + self._cfg = ControlFlowGraph.create_from_sequence(seq) + + def children(self) -> Iterator[Node]: + """Yield the nodes in the sequence.""" + yield from self.seq + + def get_entry(self) -> Node: + """Return the entry node, the first in the sequence.""" + return self.seq[0] + + def get_successors(self, node: Node, exit_type: ExitType) -> set[Node | ExitType]: + """Return the successor for a given node (the next in the sequence or the exit in the case of the last node).""" + return self._cfg.get_successors(node, exit_type) + + +class SimpleAlternatives(InterpretationNode): + """Interpretation node representing a concrete set of alternative nodes.""" + + #: The alternatives. 
+ alts: list[Node] + + def __init__(self, alts: list[Node]) -> None: + """Initialize node.""" + super().__init__() + self.alts = alts + + def identify_interpretations(self, state: State) -> dict[InterpretationKey, Callable[[], Node]]: + """Return the interpretations of this node, that is, each of the alternatives.""" + + def get_alt(index: int) -> Node: + return self.alts[index] + + return {i: functools.partial(get_alt, i) for i in range(0, len(self.alts))} + + +def get_owned_scopes(context: ContextRef[Context]) -> set[facts.Scope]: + """Return the set of scopes owned via the given reference to a context. + + Returns empty if the given reference is non-owning. + """ + match context: + case OwningContextRef(ref): + return {scope.ref for scope in ref.owned_scopes()} + case NonOwningContextRef(ref): + return set() diff --git a/src/macaron/code_analyzer/dataflow_analysis/evaluation.py b/src/macaron/code_analyzer/dataflow_analysis/evaluation.py new file mode 100644 index 000000000..69d5a022c --- /dev/null +++ b/src/macaron/code_analyzer/dataflow_analysis/evaluation.py @@ -0,0 +1,772 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""Functions for evaluating and resolving dataflow analysis expressions.""" + +from __future__ import annotations + +import base64 +import os.path +from dataclasses import dataclass +from typing import TypeVar + +from frozendict import frozendict + +from macaron.code_analyzer.dataflow_analysis import bash, core, facts +from macaron.errors import CallGraphError + + +def evaluate(node: core.Node, value: facts.Value) -> set[tuple[facts.Value, ReadBindings]]: + """Evaluate the given value, at the point immediately prior to the execution of the given node. + + Parameters + ---------- + node: core.Node + The node at which to evaluate the value (i.e. in the context of the before state of the node). + value: facts.Value + The value expression to evaluate. + + Returns + ------- + set[tuple[facts.Value, ReadBindings]] + The set of possible resolved values for the value expression, each with a record of the + resolved value chosen for any read expressions. + """ + eval_transformer = EvaluationTransformer(node.before_state) + return eval_transformer.transform_value(value) + + +@dataclass(frozen=True) +class WriteStatement: + """Representation of a write to a given location of a given value.""" + + #: The location to write to. + location: facts.Location + #: The value to write. + value: facts.Value + + def perform_write(self, before_state: core.State) -> tuple[core.State, set[facts.Location]]: + """Return a state containing only the values stored by the write operation, in context of the before state. + + Also returns the set of locations within that state which should be considered to have been overwritten, + erasing any previous values. + """ + eval_transformer = EvaluationTransformer(before_state) + written_state = core.State() + evaluated_writes = eval_transformer.transform_write(self.location, self.value) + for loc, val, _ in evaluated_writes: + written_state.state[loc][val] = core.StateDebugLabel(core.get_debug_sequence_number(), False) + # Currently, never erases previous values. + return (written_state, set()) + + +@dataclass(frozen=True) +class StatementSet: + """Representation of a set of (simultaneous) write operations.""" + + #: The set of writes. 
+    stmts: set[WriteStatement]
+
+    def apply_effects(self, before_state: core.State) -> core.State:
+        """Apply the effect of the set of writes, returning the resulting state."""
+        final_state = core.State()
+        final_overwritten_locs: set[facts.Location] = set()
+        for stmt in self.stmts:
+            written_state, overwritten_locs = stmt.perform_write(before_state)
+            for loc in overwritten_locs:
+                final_overwritten_locs.add(loc)
+            core.transfer_state(written_state, final_state, debug_is_copy=False)
+
+        core.transfer_state(before_state, final_state, core.ExcludedLocsStateTransferFilter(final_overwritten_locs))
+        return final_state
+
+    @staticmethod
+    def union(*stmt_sets: StatementSet) -> StatementSet:
+        """Combine multiple write sets into one."""
+        stmts: set[WriteStatement] = set()
+        for stmt_set in stmt_sets:
+            for stmt in stmt_set.stmts:
+                stmts.add(stmt)
+        return StatementSet(stmts)
+
+
+class ParameterPlaceholderTransformer:
+    """Expression transformer which replaces parameter placeholders with their corresponding bound values."""
+
+    #: Whether to raise an exception if a parameter is found with no provided binding.
+    allow_unbound_params: bool
+    #: Bindings for value parameter placeholders, mapping parameter name to bound value expression.
+    value_parameter_binds: dict[str, facts.Value]
+    #: Bindings for location parameter placeholders, mapping parameter name to bound location expression.
+    location_parameter_binds: dict[str, facts.LocationSpecifier]
+    #: Bindings for scope parameter placeholders, mapping parameter name to bound scope.
+    scope_parameter_binds: dict[str, facts.Scope]
+
+    def __init__(
+        self,
+        allow_unbound_params: bool = True,
+        value_parameter_binds: dict[str, facts.Value] | None = None,
+        location_parameter_binds: dict[str, facts.LocationSpecifier] | None = None,
+        scope_parameter_binds: dict[str, facts.Scope] | None = None,
+    ) -> None:
+        """Initialize transformer with bindings.
+
+        Parameters
+        ----------
+        allow_unbound_params: bool
+            Whether to raise an exception if a parameter is found with no provided binding.
+        value_parameter_binds: dict[str, facts.Value] | None
+            Bindings for value parameter placeholders, mapping parameter name to bound value expression.
+        location_parameter_binds: dict[str, facts.LocationSpecifier] | None
+            Bindings for location parameter placeholders, mapping parameter name to bound location expression.
+        scope_parameter_binds: dict[str, facts.Scope] | None
+            Bindings for scope parameter placeholders, mapping parameter name to bound scope.
+        """
+        self.allow_unbound_params = allow_unbound_params
+        self.value_parameter_binds = value_parameter_binds or {}
+        self.location_parameter_binds = location_parameter_binds or {}
+        self.scope_parameter_binds = scope_parameter_binds or {}
+
+    def transform_value(self, value: facts.Value) -> facts.Value:
+        """Transform given value expression.
+
+        Returns a value expression with any parameter placeholders replaced with their bound values.
+ """ + match value: + case facts.StringLiteral(_): + return value + case facts.Read(loc): + new_loc = self.transform_location(loc) + if new_loc is loc: + return value + return facts.Read(new_loc) + case facts.ArbitraryNewData(_): + return value + case facts.UnaryStringOp(op, operand): + new_operand = self.transform_value(operand) + if new_operand is operand: + return value + return facts.UnaryStringOp(op, new_operand) + case facts.BinaryStringOp(op, operand1, operand2): + new_operand1 = self.transform_value(operand1) + new_operand2 = self.transform_value(operand2) + + if op == facts.BinaryStringOperator.STRING_CONCAT: + return facts.BinaryStringOp.get_string_concat(new_operand1, new_operand2) + + # if new_operand1 is operand1 and new_operand2 is operand2: + # return value + # return facts.BinaryStringOp(op, new_operand1, new_operand2) + case facts.ParameterPlaceholderValue(name): + if name in self.value_parameter_binds: + return self.value_parameter_binds[name] + if not self.allow_unbound_params: + raise CallGraphError("unbound value parameter: " + name) + return value + case facts.InstalledPackage(name, version, distribution, url): + new_name = self.transform_value(name) + new_version = self.transform_value(version) + new_distribution = self.transform_value(distribution) + new_url = self.transform_value(url) + if new_name is name and new_version is version and new_distribution is distribution and new_url is url: + return value + return facts.InstalledPackage(new_name, new_version, new_distribution, new_url) + case facts.SingleBashTokenConstraint(val): + new_val = self.transform_value(val) + if new_val is val: + return value + return facts.SingleBashTokenConstraint(new_val) + case facts.Symbolic(sym_val): + new_sym_val = self.transform_value(sym_val) + if new_sym_val is sym_val: + return value + return facts.Symbolic(new_sym_val) + raise CallGraphError("unknown facts.Value type: " + value.__class__.__name__) + + def transform_location(self, location: facts.Location) -> facts.Location: + """Transform given location expression. + + Returns a location expression with any parameter placeholders replaced with their bound values. + """ + new_scope = self.transform_scope(location.scope) + new_location_spec = self.transform_location_specifier(location.loc) + if new_scope is location.scope and new_location_spec is location.loc: + return location + return facts.Location(new_scope, new_location_spec) + + def transform_location_specifier(self, location: facts.LocationSpecifier) -> facts.LocationSpecifier: + """Transform given location specifier expression. + + Returns a location specifier expression with any parameter placeholders replaced with their bound values. 
+ """ + match location: + case facts.Filesystem(path): + new_path = self.transform_value(path) + if new_path is path: + return location + return facts.Filesystem(new_path) + case facts.Variable(name): + new_name = self.transform_value(name) + if new_name is name: + return location + return facts.Variable(new_name) + case facts.Artifact(name, file): + new_name = self.transform_value(name) + new_file = self.transform_value(file) + if new_name is name and new_file is file: + return location + return facts.Artifact(new_name, new_file) + case facts.FilesystemAnyUnderDir(path): + new_path = self.transform_value(path) + if new_path is path: + return location + return facts.FilesystemAnyUnderDir(new_path) + case facts.ArtifactAnyFilename(name): + new_name = self.transform_value(name) + if new_name is name: + return location + return facts.ArtifactAnyFilename(new_name) + case facts.ParameterPlaceholderLocation(name): + if name in self.location_parameter_binds: + return self.location_parameter_binds[name] + if not self.allow_unbound_params: + raise CallGraphError("unbound location parameter: " + name) + return location + case facts.Console(): + return location + case facts.Installed(name): + new_name = self.transform_value(name) + if new_name is name: + return location + return facts.Installed(new_name) + raise CallGraphError("unknown location type: " + location.__class__.__name__) + + def transform_scope(self, scope: facts.Scope) -> facts.Scope: + """Transform given scope. + + Returns a scope with any parameter placeholders replaced with their bound values. + """ + if isinstance(scope, facts.ParameterPlaceholderScope): + if scope.name in self.scope_parameter_binds: + return self.scope_parameter_binds[scope.name] + if not self.allow_unbound_params: + raise CallGraphError("unbound scope parameter: " + scope.name) + return scope + + def transform_statement(self, statement: WriteStatement) -> WriteStatement: + """Transform given write statement. + + Returns a write statement with any parameter placeholders replaced with their bound values. + """ + new_location = self.transform_location(statement.location) + new_value = self.transform_value(statement.value) + if new_location is statement.location and new_value is statement.value: + return statement + return WriteStatement(new_location, new_value) + + def transform_statement_set(self, statement_set: StatementSet) -> StatementSet: + """Transform given write statement set. + + Returns a write statement set with any parameter placeholders replaced with their bound values. + """ + changed = False + new_stmts: set[WriteStatement] = set() + for stmt in statement_set.stmts: + new_stmt = self.transform_statement(stmt) + if new_stmt is not stmt: + changed = True + new_stmts.add(new_stmt) + + if not changed: + return statement_set + return StatementSet(new_stmts) + + +T = TypeVar("T") + + +def is_singleton(s: set[T], e: T) -> bool: + """Return whether the given set contains only the single given element.""" + return len(s) == 1 and next(iter(s)) == e + + +def is_singleton_no_bindings(s: set[tuple[T, ReadBindings]], e: T) -> bool: + """Return whether the given set contains only the single given element with no read bindings.""" + return len(s) == 1 and next(iter(s)) == (e, READBINDINGS_EMPTY) + + +def scope_matches(read_scope: facts.Scope, stored_scope: facts.Scope) -> bool: + """Return whether the given read scope matches the given stored scope. + + Matching means that a read of the read scope may return values from the stored scope. 
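+
+    Concretely, the stored scope matches if it is the read scope itself or is reachable from it by
+    following the chain of outer scopes.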
+ """ + cur_scope: facts.Scope | None = read_scope + while cur_scope is not None: + if cur_scope == stored_scope: + return True + cur_scope = cur_scope.outer_scope + return False + + +def location_subsumes(loc: facts.LocationSpecifier, subloc: facts.LocationSpecifier) -> bool: + """Return whether the given location subsumes the given sub location. + + Subsumption means that a read of subloc may be considered to be a read of loc or some part thereof. + """ + if loc == subloc: + return True + + match loc, subloc: + case facts.Filesystem(facts.StringLiteral(loc_path_lit)), facts.Filesystem( + facts.StringLiteral(subloc_path_lit) + ): + # Ignore superficial differences in file path due to "./" relative paths. + if ( + not loc_path_lit.startswith("/") + and not subloc_path_lit.startswith("/") + and loc_path_lit.removeprefix("./") == subloc_path_lit.removeprefix("./") + ): + return True + case facts.FilesystemAnyUnderDir(facts.StringLiteral(dir_lit)), facts.Filesystem( + facts.StringLiteral(subloc_path_lit) + ): + # A file path under the same dir as a FilesystemAnyUnderDir is subsumed. + if subloc_path_lit.startswith(dir_lit.removesuffix("/") + "/"): + return True + return False + + +def get_values_for_subsumed_read( + read_loc: facts.LocationSpecifier, state_loc: facts.LocationSpecifier, state_vals: set[facts.Value] +) -> set[facts.Value]: + """Return the set of values stored in the state location, if relevant for the given read location.""" + match read_loc, state_loc: + case facts.ArtifactAnyFilename(read_artifact_name), facts.Artifact(state_artifact_name, state_artifact_file): + if read_artifact_name == state_artifact_name: + return {state_artifact_file} + + if location_subsumes(state_loc, read_loc): + return state_vals + + return set() + + +class ReadBindings: + """Set of bindings of read expressions to values bound as the result of those read expressions.""" + + #: Mapping of read expressions to bound values. 
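+    #: Stored as a frozendict so that the bindings are immutable and hashable (ReadBindings instances are used in sets).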
+ bindings: frozendict[facts.Read, facts.Value] + + def __init__(self, binds: frozendict[facts.Read, facts.Value] | None = None) -> None: + """Initialize with given bindings.""" + self.bindings = binds or frozendict() + + def __len__(self) -> int: + """Return the number of bindings in the set.""" + return len(self.bindings) + + def with_binding(self, read: facts.Read, value: facts.Value) -> ReadBindings | None: + """Return bindings with the given additional binding, or None if the bindings conflict.""" + if read in self.bindings: + if self.bindings[read] != value: + return None + return self + new_binds = self.bindings.set(read, value) + return ReadBindings(new_binds) + + def with_bindings(self, bindings: ReadBindings) -> ReadBindings | None: + """Return bindings with the given additional bindings, or None if the bindings conflict.""" + if len(bindings) == 0: + return self + if len(self) == 0: + return bindings + + for read, val in bindings.bindings.items(): + if read in self.bindings: + if self.bindings[read] != val: + return None + + combined_bindings = frozendict({**self.bindings, **bindings.bindings}) + return ReadBindings(combined_bindings) + + @staticmethod + def combine_bindings(bindings_list: list[ReadBindings]) -> ReadBindings | None: + """Return bindings combining all bindings in the given list, or None if the bindings conflict.""" + if len(bindings_list) == 0: + return READBINDINGS_EMPTY + + cur_binding: ReadBindings | None = bindings_list[0] + for bindings in bindings_list[1:]: + cur_binding = cur_binding.with_bindings(bindings) if cur_binding is not None else None + if cur_binding is None: + return None + return cur_binding + + def __hash__(self) -> int: + return hash(self.bindings) + + def __eq__(self, other: object) -> bool: + if isinstance(other, ReadBindings): + return self.bindings == other.bindings + return False + + def __repr__(self) -> str: + return str(self.bindings) + + +# Convenience instance of empty bindings. +READBINDINGS_EMPTY = ReadBindings() + + +class EvaluationTransformer: + """Expression transformer which evaluates the expression to produce a set of resolved values. + + The expression is evaluated in the context of a specified abstract storage state. + """ + + #: The state from which to resolve reads. + state: core.State + + def __init__(self, state: core.State) -> None: + """Initialize transformer with state from which to resolve reads.""" + self.state = state + + def transform_write( + self, location: facts.Location, value: facts.Value + ) -> set[tuple[facts.Location, facts.Value, ReadBindings]]: + """Transform a write location and value, returning the set of resolved values with the necessary bindings.""" + evaluated_locations = self.transform_location(location) + evaluated_values = self.transform_value(value) + result: set[tuple[facts.Location, facts.Value, ReadBindings]] = set() + for loc, loc_bindings in evaluated_locations: + for val, val_bindings in evaluated_values: + combined_bindings = loc_bindings.with_bindings(val_bindings) + if combined_bindings is not None: + result.add((loc, val, combined_bindings)) + return result + + def transform_value(self, value: facts.Value) -> set[tuple[facts.Value, ReadBindings]]: + """Transform a value expression, returning the set of resolved values with the necessary bindings.""" + match value: + case facts.StringLiteral(_): + return {(value, READBINDINGS_EMPTY)} + case facts.Read(loc): + # Read values from the state. 
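+                # A symbolic placeholder for the read is always included alongside any concrete matches,
+                # so that reads which cannot (yet) be resolved are preserved rather than silently lost.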
+ new_locs = self.transform_location(loc) + read_vals: set[tuple[facts.Value, ReadBindings]] = set() + for new_loc, new_loc_bindings in new_locs: + read_vals.add((facts.Symbolic(facts.Read(new_loc)), new_loc_bindings)) + + for state_loc, state_vals in self.state.state.items(): + if scope_matches(new_loc.scope, state_loc.scope): + for read_val in get_values_for_subsumed_read( + new_loc.loc, state_loc.loc, set(state_vals.keys()) + ): + combined_bindings = new_loc_bindings.with_binding(value, read_val) + if combined_bindings is not None: + read_vals.add((read_val, combined_bindings)) + return read_vals + case facts.ArbitraryNewData(_): + return {(value, READBINDINGS_EMPTY)} + case facts.UnaryStringOp(op, operand): + new_operands = self.transform_value(operand) + if op == facts.UnaryStringOperator.BASENAME: + # Concretely evaluate basename operator for string literal. + basename_result: set[tuple[facts.Value, ReadBindings]] = set() + for new_operand, new_operand_bindings in new_operands: + if isinstance(new_operand, facts.StringLiteral): + basename_result.add( + (facts.StringLiteral(os.path.basename(new_operand.literal)), new_operand_bindings) + ) + return basename_result + if op == facts.UnaryStringOperator.BASE64DECODE: + # Concretely evaluate base64 decode operator for string literal + base64_decode_result: set[tuple[facts.Value, ReadBindings]] = set() + for new_operand, new_operand_bindings in new_operands: + if isinstance(new_operand, facts.StringLiteral): + base64_decode_result.add( + ( + facts.StringLiteral(base64.b64decode(new_operand.literal).decode("utf-8")), + new_operand_bindings, + ) + ) + return base64_decode_result + return set() + case facts.BinaryStringOp(op, operand1, operand2): + new_operand1s = self.transform_value(operand1) + new_operand2s = self.transform_value(operand2) + if op == facts.BinaryStringOperator.STRING_CONCAT: + # Concretely evaluate string concatenation for concat of 2 string literals. + concat_result: set[tuple[facts.Value, ReadBindings]] = set() + for new_operand1, new_operand1_bindings in new_operand1s: + for new_operand2, new_operand2_bindings in new_operand2s: + if isinstance(new_operand1, facts.StringLiteral) and isinstance( + new_operand2, facts.StringLiteral + ): + combined_bindings = new_operand1_bindings.with_bindings(new_operand2_bindings) + if combined_bindings is not None: + # TODO Have some truncated symbolic representation for + # excessively long strings rather than just dropping them. + if len(new_operand1.literal) + len(new_operand2.literal) < 10000: + concat_result.add( + ( + facts.StringLiteral(new_operand1.literal + new_operand2.literal), + combined_bindings, + ) + ) + return concat_result + + # return set() + case facts.SingleBashTokenConstraint(operand): + # For single bash token constraint, to evaluate a string literal, the literal is parsed + # as a bash expression, and if that results in a single element, then the constraint + # is met and the unmodified literal is returned, if it parses as multiple elements, then + # no resolved values are produced for that literal. + # + # Otherwise returns the constrained expression as is, while simplifying redundant + # multiply-nested constraints. 
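+                # For example, the literal "foo" satisfies the constraint, while "foo bar" parses as two
+                # Bash tokens and is therefore dropped from the resolved set.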
+                #
+                new_operands = self.transform_value(operand)
+                constraint_result: set[tuple[facts.Value, ReadBindings]] = set()
+                for new_operand, new_operand_bindings in new_operands:
+                    match new_operand:
+                        case facts.StringLiteral(lit):
+                            parsed_bash_expr = bash.parse_bash_expr(lit)
+                            if parsed_bash_expr is not None and len(parsed_bash_expr) == 1:
+                                constraint_result.add((new_operand, new_operand_bindings))
+
+                        case facts.SingleBashTokenConstraint(suboperand):
+                            constraint_result.add((facts.SingleBashTokenConstraint(suboperand), new_operand_bindings))
+                        case _:
+                            constraint_result.add((facts.SingleBashTokenConstraint(new_operand), new_operand_bindings))
+                return constraint_result
+            case facts.ParameterPlaceholderValue(name):
+                return set()
+            case facts.InstalledPackage(name, version, distribution, url):
+                # Resolve parameters and return every combination.
+                new_names = self.transform_value(name)
+                new_versions = self.transform_value(version)
+                new_distributions = self.transform_value(distribution)
+                new_urls = self.transform_value(url)
+                if (
+                    is_singleton_no_bindings(new_names, name)
+                    and is_singleton_no_bindings(new_versions, version)
+                    and is_singleton_no_bindings(new_distributions, distribution)
+                    and is_singleton_no_bindings(new_urls, url)
+                ):
+                    return {(value, READBINDINGS_EMPTY)}
+                result: set[tuple[facts.Value, ReadBindings]] = set()
+                for new_name, new_name_bindings in new_names:
+                    for new_version, new_version_bindings in new_versions:
+                        version_combined_bindings = new_name_bindings.with_bindings(new_version_bindings)
+                        if version_combined_bindings is None:
+                            continue
+                        for new_distribution, new_distribution_bindings in new_distributions:
+                            distribution_combined_bindings = version_combined_bindings.with_bindings(
+                                new_distribution_bindings
+                            )
+                            if distribution_combined_bindings is None:
+                                continue
+                            for new_url, new_url_bindings in new_urls:
+                                url_combined_bindings = distribution_combined_bindings.with_bindings(new_url_bindings)
+                                if url_combined_bindings is not None:
+                                    result.add(
+                                        (
+                                            facts.InstalledPackage(new_name, new_version, new_distribution, new_url),
+                                            url_combined_bindings,
+                                        )
+                                    )
+                return result
+            case facts.Symbolic(_):
+                return {(value, READBINDINGS_EMPTY)}
+        raise CallGraphError("unknown facts.Value type: " + value.__class__.__name__)
+
+    def transform_location(self, location: facts.Location) -> set[tuple[facts.Location, ReadBindings]]:
+        """Transform a location expression, returning the set of resolved locations with the necessary bindings."""
+        new_location_specs = self.transform_location_specifier(location.loc)
+        if is_singleton_no_bindings(new_location_specs, location.loc):
+            return {(location, READBINDINGS_EMPTY)}
+        return {
+            (facts.Location(location.scope, new_location_spec), new_location_spec_bindings)
+            for new_location_spec, new_location_spec_bindings in new_location_specs
+        }
+
+    def transform_location_specifier(
+        self, location: facts.LocationSpecifier
+    ) -> set[tuple[facts.LocationSpecifier, ReadBindings]]:
+        """Transform a location specifier expression, returning the set of resolved specifiers with the necessary bindings."""
+        match location:
+            case facts.Filesystem(path):
+                new_paths = self.transform_value(path)
+                if is_singleton_no_bindings(new_paths, path):
+                    return {(location, READBINDINGS_EMPTY)}
+                return {(facts.Filesystem(new_path), new_path_bindings) for new_path, new_path_bindings in new_paths}
+            case facts.Variable(name):
+                new_names = self.transform_value(name)
+                if is_singleton_no_bindings(new_names, name):
+                    return {(location, READBINDINGS_EMPTY)}
+                return {(facts.Variable(new_name), new_name_bindings) for new_name, new_name_bindings in new_names}
+            case facts.Artifact(name, file):
+                new_names = self.transform_value(name)
+                new_files = self.transform_value(file)
+                if is_singleton_no_bindings(new_names, name) and is_singleton_no_bindings(new_files, file):
+                    return {(location, READBINDINGS_EMPTY)}
+                artifact_result: set[tuple[facts.LocationSpecifier, ReadBindings]] = set()
+                for new_name, new_name_bindings in new_names:
+                    for new_file, new_file_bindings in new_files:
+                        combined_bindings = new_name_bindings.with_bindings(new_file_bindings)
+                        if combined_bindings is not None:
+                            artifact_result.add((facts.Artifact(new_name, new_file), combined_bindings))
+                return artifact_result
+            case facts.FilesystemAnyUnderDir(path):
+                new_paths = self.transform_value(path)
+                if is_singleton_no_bindings(new_paths, path):
+                    return {(location, READBINDINGS_EMPTY)}
+                return {
+                    (facts.FilesystemAnyUnderDir(new_path), new_path_bindings)
+                    for new_path, new_path_bindings in new_paths
+                }
+            case facts.ArtifactAnyFilename(name):
+                new_names = self.transform_value(name)
+                if is_singleton_no_bindings(new_names, name):
+                    return {(location, READBINDINGS_EMPTY)}
+                return {
+                    (facts.ArtifactAnyFilename(new_name), new_name_bindings)
+                    for new_name, new_name_bindings in new_names
+                }
+            case facts.ParameterPlaceholderLocation(name):
+                return {(location, READBINDINGS_EMPTY)}
+            case facts.Console():
+                return {(location, READBINDINGS_EMPTY)}
+            case facts.Installed(name):
+                new_names = self.transform_value(name)
+                return {(facts.Installed(new_name), new_name_bindings) for new_name, new_name_bindings in new_names}
+        raise CallGraphError("unknown location type: " + location.__class__.__name__)
+
+
+# TODO generalise visitors
+class ContainsSymbolicVisitor:
+    """Visitor to determine whether a given expression contains any symbolic expressions."""
+
+    def visit_value(self, value: facts.Value) -> bool:
+        """Search value expression for symbolic expressions and return whether any were found."""
+        match value:
+            case facts.StringLiteral(_):
+                return False
+            case facts.Read(loc):
+                return self.visit_location(loc)
+            case facts.ArbitraryNewData(_):
+                return False
+            case facts.UnaryStringOp(_, operand):
+                return self.visit_value(operand)
+            case facts.BinaryStringOp(_, operand1, operand2):
+                return self.visit_value(operand1) or self.visit_value(operand2)
+            case facts.ParameterPlaceholderValue(name):
+                return False
+            case facts.InstalledPackage(name, version, distribution, url):
+                return (
+                    self.visit_value(name)
+                    or self.visit_value(version)
+                    or self.visit_value(distribution)
+                    or self.visit_value(url)
+                )
+            case facts.SingleBashTokenConstraint(operand):
+                return self.visit_value(operand)
+            case facts.Symbolic(_):
+                return True
+        raise CallGraphError("unknown facts.Value type: " + value.__class__.__name__)
+
+    def visit_location(self, location: facts.Location) -> bool:
+        """Search location expression for symbolic expressions and return whether any were found."""
+        return self.visit_location_specifier(location.loc)
+
+    def visit_location_specifier(self, location: facts.LocationSpecifier) -> bool:
+        """Search location specifier expression for symbolic expressions and return whether any were found."""
+        match location:
+            case facts.Filesystem(path):
+                return self.visit_value(path)
+            case facts.Variable(name):
+                return self.visit_value(name)
+            case facts.Artifact(name, file):
+                return self.visit_value(name) or self.visit_value(file)
+            case facts.FilesystemAnyUnderDir(path):
+                return self.visit_value(path)
+            case facts.ArtifactAnyFilename(name):
+                return self.visit_value(name)
+            case facts.ParameterPlaceholderLocation(name):
+                return False
+            case facts.Console():
+                return False
+            case facts.Installed(name):
+                return self.visit_value(name)
+        raise CallGraphError("unknown location type: " + location.__class__.__name__)
+
+
+def filter_symbolic_values(values: set[tuple[facts.Value, ReadBindings]]) -> set[tuple[facts.Value, ReadBindings]]:
+    """Filter out symbolic values.
+
+    Returns a set containing all elements from the given set that do not contain any symbolic expressions.
+    """
+    return {val for val in values if not ContainsSymbolicVisitor().visit_value(val[0])}
+
+
+def filter_symbolic_locations(
+    locs: set[tuple[facts.Location, ReadBindings]],
+) -> set[tuple[facts.Location, ReadBindings]]:
+    """Filter out symbolic locations.
+
+    Returns a set containing all elements from the given set that do not contain any symbolic expressions.
+    """
+    return {loc for loc in locs if not ContainsSymbolicVisitor().visit_location(loc[0])}
+
+
+def filter_symbolic_location_specifiers(
+    locs: set[tuple[facts.LocationSpecifier, ReadBindings]],
+) -> set[tuple[facts.LocationSpecifier, ReadBindings]]:
+    """Filter out symbolic location specifiers.
+
+    Returns a set containing all elements from the given set that do not contain any symbolic expressions.
+    """
+    return {loc for loc in locs if not ContainsSymbolicVisitor().visit_location_specifier(loc[0])}
+
+
+def get_single_resolved_str(resolved_values: set[tuple[facts.Value, ReadBindings]]) -> str | None:
+    """If the given set contains only a single string literal value, return that string, or else None."""
+    resolved_values = filter_symbolic_values(resolved_values)
+    if len(resolved_values) == 1:
+        val = next(iter(resolved_values))[0]
+        if isinstance(val, facts.StringLiteral):
+            return val.literal
+    return None
+
+
+def get_single_resolved_str_with_default(
+    resolved_values: set[tuple[facts.Value, ReadBindings]], default_value: str
+) -> str:
+    """If the given set contains only a single string literal value, return that string, else return default value."""
+    result = get_single_resolved_str(resolved_values)
+    if result is not None:
+        return result
+    return default_value
+
+
+def parse_str_expr_split(str_expr: facts.Value, delimiter_char: str, maxsplit: int = -1) -> list[facts.Value]:
+    """Split a string expression on the appearance of the delimiter char in literal parts of the expression."""
+    if len(delimiter_char) != 1:
+        raise CallGraphError("delimiter_char must be a single char")
+
+    match str_expr:
+        case facts.StringLiteral(literal):
+            split_str = literal.split(delimiter_char, maxsplit=maxsplit)
+            return [facts.StringLiteral(s) for s in split_str]
+        case facts.BinaryStringOp(facts.BinaryStringOperator.STRING_CONCAT, o1, o2):
+            split_lhs = parse_str_expr_split(o1, delimiter_char, maxsplit)
+            split_rhs = parse_str_expr_split(
+                o2, delimiter_char, -1 if maxsplit == -1 else maxsplit - (len(split_lhs) - 1)
+            )
+            if len(split_lhs) == 1 and len(split_rhs) == 1:
+                return [str_expr]
+            return (
+                split_lhs[:-1] + [facts.BinaryStringOp.get_string_concat(split_lhs[-1], split_rhs[0])] + split_rhs[1:]
+            )
+    return [str_expr]
diff --git a/src/macaron/code_analyzer/dataflow_analysis/facts.py b/src/macaron/code_analyzer/dataflow_analysis/facts.py
new file mode 100644
index 000000000..28d0f869d
--- /dev/null
+++ b/src/macaron/code_analyzer/dataflow_analysis/facts.py
@@ -0,0 +1,702 @@
+# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+"""Definitions of dataflow analysis representation for value expressions and abstract storage locations.
+
+Also includes an incomplete implementation of serialization/deserialization to a Souffle-datalog-compatible
+representation, a remnant of an earlier prototype that involved the datalog engine in the analysis. The
+serialization is retained because it produces a human-readable string representation that is useful for
+debugging, and it may be necessary in future to make these expressions available to the policy engine
+(which uses datalog). Deserialization is currently non-functional, primarily due to the inability to
+deserialize scope identity; it is left here for posterity as it may be revisited in future.
+"""
+
+from __future__ import annotations
+
+import abc
+from dataclasses import dataclass
+from enum import Enum, auto
+
+from macaron.errors import CallGraphError, ParseError
+
+
+class Value(abc.ABC):
+    """Base class for value expressions.
+
+    Subclasses should be comparable by structural equality.
+    """
+
+    @abc.abstractmethod
+    def to_datalog_fact_string(self) -> str:
+        """Return string representation of expression (in datalog serialized format)."""
+
+    def __str__(self) -> str:
+        return self.to_datalog_fact_string()
+
+    def __repr__(self) -> str:
+        return self.__str__()
+
+
+class LocationSpecifier(abc.ABC):
+    """Base class for location expressions.
+
+    Subclasses should be comparable by structural equality.
+    """
+
+    @abc.abstractmethod
+    def to_datalog_fact_string(self) -> str:
+        """Return string representation of expression (in datalog serialized format)."""
+
+    def __str__(self) -> str:
+        return self.to_datalog_fact_string()
+
+    def __repr__(self) -> str:
+        return self.__str__()
+
+
+# Sequence number to automatically give scopes unique names.
+# Note: not thread safe.
+SCOPE_SEQUENCE_NUMBER = 0
+
+
+class Scope:
+    """Representation of a scope in which a location may exist.
+
+    This allows for distinct locations with the same name/path/expression to exist separately in different namespaces.
+
+    A scope may have an outer scope, such that a read from a scope may return values from
+    the outer scope(s).
+
+    Unlike other expression classes, scopes are distinguished by object identity and not
+    structural equality (TODO now that scopes have names, maybe should revisit this since
+    it makes serialization/deserialization difficult).
+    """
+
+    #: Name for display purposes.
+    identifier: str
+    #: Outer scope, if any.
+    outer_scope: Scope | None
+
+    def __init__(self, name: str, outer_scope: Scope | None = None) -> None:
+        """Initialize scope.
+
+        Parameters
+        ----------
+        name: str
+            Name for display purposes (a sequence number will automatically be appended to make it unique).
+        outer_scope: Scope | None
+            Outer scope, if any.
+ """ + self.outer_scope = outer_scope + global SCOPE_SEQUENCE_NUMBER # pylint: disable=global-statement + self.identifier = str(SCOPE_SEQUENCE_NUMBER) + "_" + name + SCOPE_SEQUENCE_NUMBER += 1 + + def __hash__(self) -> int: + return id(self) + + def __eq__(self, other: object) -> bool: + return self is other + + def to_datalog_fact_string(self, include_outer_scope: bool = False) -> str: + """Return string representation of scope (in datalog serialized format).""" + return ( + "$Scope(" + + enquote_datalog_string_literal(self.identifier) + + ( + ", " + self.outer_scope.to_datalog_fact_string() + if include_outer_scope and self.outer_scope is not None + else "" + ) + + ")" + ) + + def __str__(self) -> str: + return self.to_datalog_fact_string() + + def __repr__(self) -> str: + return self.__str__() + + +class ParameterPlaceholderScope(Scope): + """Special scope placeholder to allow generic parameterized expressions. + + TODO This is not really a proper subclass of Scope, should revisit type relationship. + """ + + #: Parameter name. + name: str + + def __init__(self, name: str) -> None: # pylint: disable=super-init-not-called + """Initialize placeholder scope with given parameter name.""" + self.identifier = "param_" + name + self.name = name + + def __hash__(self) -> int: + return hash(self.name) + + def __eq__(self, other: object) -> bool: + return isinstance(other, ParameterPlaceholderScope) and other.name == self.name + + def to_datalog_fact_string(self, include_outer_scope: bool = False) -> str: + """Return string representation of scope (in datalog serialized format).""" + return "$ParameterPlaceholderScope(" + enquote_datalog_string_literal(self.name) + ")" + + def __str__(self) -> str: + return self.to_datalog_fact_string() + + def __repr__(self) -> str: + return self.__str__() + + +@dataclass(frozen=True, repr=False) +class Location: + """A location expression qualified with the scope it resides in.""" + + #: Scope the location resides in. + scope: Scope + #: Location expression. + loc: LocationSpecifier + + def to_datalog_fact_string(self) -> str: + """Return string representation of expression (in datalog serialized format).""" + return "[" + self.scope.to_datalog_fact_string() + ", " + self.loc.to_datalog_fact_string() + "]" + + def __str__(self) -> str: + return self.to_datalog_fact_string() + + def __repr__(self) -> str: + return self.__str__() + + +@dataclass(frozen=True, repr=False) +class StringLiteral(Value): + """Value expression representing a string literal.""" + + #: String literal. + literal: str + + def to_datalog_fact_string(self) -> str: + """Return string representation of expression (in datalog serialized format).""" + return "$StringLiteral(" + enquote_datalog_string_literal(self.literal) + ")" + + +@dataclass(frozen=True, repr=False) +class Read(Value): + """Value expression representing a read of the value stored at a location.""" + + #: Read value location. + loc: Location + + def to_datalog_fact_string(self) -> str: + """Return string representation of expression (in datalog serialized format).""" + return "$Read(" + self.loc.to_datalog_fact_string() + ")" + + +@dataclass(frozen=True, repr=False) +class ArbitraryNewData(Value): + """Value expression representing some arbitrary data.""" + + #: Name distiguishing the origin of the data. 
+    at: str
+
+    def to_datalog_fact_string(self) -> str:
+        """Return string representation of expression (in datalog serialized format)."""
+        return "$ArbitraryNewData(" + enquote_datalog_string_literal(self.at) + ")"
+
+
+@dataclass(frozen=True, repr=False)
+class InstalledPackage(Value):
+    """Value expression representing an installed package, with identifying metadata (name, version, etc.)."""
+
+    #: Package name.
+    name: Value
+    #: Package version.
+    version: Value
+    #: Package distribution.
+    distribution: Value
+    #: URL of the package.
+    url: Value
+
+    def to_datalog_fact_string(self) -> str:
+        """Return string representation of expression (in datalog serialized format)."""
+        return (
+            "$InstalledPackage("
+            + self.name.to_datalog_fact_string()
+            + ", "
+            + self.version.to_datalog_fact_string()
+            + ", "
+            + self.distribution.to_datalog_fact_string()
+            + ", "
+            + self.url.to_datalog_fact_string()
+            + ")"
+        )
+
+
+class UnaryStringOperator(Enum):
+    """Unary operators."""
+
+    BASENAME = auto()
+    BASE64_ENCODE = auto()
+    BASE64DECODE = auto()
+
+
+def un_op_to_datalog_fact_string(op: UnaryStringOperator) -> str:
+    """Return string representation of operator (in datalog serialized format)."""
+    if op == UnaryStringOperator.BASENAME:
+        return "$BaseName"
+    if op == UnaryStringOperator.BASE64_ENCODE:
+        return "$Base64Encode"
+    if op == UnaryStringOperator.BASE64DECODE:
+        return "$Base64Decode"
+    raise CallGraphError("unknown UnaryStringOperator")
+
+
+class BinaryStringOperator(Enum):
+    """Binary operators."""
+
+    STRING_CONCAT = auto()
+
+
+def bin_op_to_datalog_fact_string(op: BinaryStringOperator) -> str:
+    """Return string representation of operator (in datalog serialized format)."""
+    if op == BinaryStringOperator.STRING_CONCAT:
+        return "$StringConcat"
+    raise CallGraphError("unknown BinaryStringOperator")
+
+
+@dataclass(frozen=True, repr=False)
+class UnaryStringOp(Value):
+    """Value expression representing a unary operator."""
+
+    #: Operator.
+    op: UnaryStringOperator
+    #: Operand value.
+    operand: Value
+
+    def to_datalog_fact_string(self) -> str:
+        """Return string representation of expression (in datalog serialized format)."""
+        return (
+            "$UnaryStringOp("
+            + un_op_to_datalog_fact_string(self.op)
+            + ", "
+            + self.operand.to_datalog_fact_string()
+            + ")"
+        )
+
+
+@dataclass(frozen=True, repr=False)
+class BinaryStringOp(Value):
+    """Value expression representing a binary operator."""
+
+    #: Operator.
+    op: BinaryStringOperator
+    #: First operand value.
+    operand1: Value
+    #: Second operand value.
+    operand2: Value
+
+    def to_datalog_fact_string(self) -> str:
+        """Return string representation of expression (in datalog serialized format)."""
+        return (
+            "$BinaryStringOp("
+            + bin_op_to_datalog_fact_string(self.op)
+            + ", "
+            + self.operand1.to_datalog_fact_string()
+            + ", "
+            + self.operand2.to_datalog_fact_string()
+            + ")"
+        )
+
+    @staticmethod
+    def get_string_concat(operand1: Value, operand2: Value) -> Value:
+        """Construct a string concatenation operator.
+
+        Applies some simple constant-folding simplifications.
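+
+        For example (illustrative): get_string_concat(StringLiteral("a"), StringLiteral("b"))
+        yields StringLiteral("ab"), and get_string_concat(StringLiteral(""), x) yields x
+        unchanged; operands not covered by any folding rule are combined into an
+        unsimplified BinaryStringOp.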
+ """ + match operand1, operand2: + # "a" + "b" = "ab" + case StringLiteral(op1_lit), StringLiteral(op2_lit): + return StringLiteral(op1_lit + op2_lit) + # "" + x = x + case StringLiteral(""), _: + return operand2 + # x + "" = x + case _, StringLiteral(""): + return operand1 + # (x + "a") + "b" = x + "ab" + case BinaryStringOp(BinaryStringOperator.STRING_CONCAT, subop1, StringLiteral(subop2_lit)), StringLiteral( + op2_lit + ): + return BinaryStringOp(BinaryStringOperator.STRING_CONCAT, subop1, StringLiteral(subop2_lit + op2_lit)) + # "a" + ("b" + x) = "ab" + x + case StringLiteral(op1_lit), BinaryStringOp( + BinaryStringOperator.STRING_CONCAT, StringLiteral(subop1_lit), subop2 + ): + return BinaryStringOp(BinaryStringOperator.STRING_CONCAT, StringLiteral(op1_lit + subop1_lit), subop2) + + return BinaryStringOp(BinaryStringOperator.STRING_CONCAT, operand1, operand2) + + +@dataclass(frozen=True, repr=False) +class ParameterPlaceholderValue(Value): + """Special placeholder value to allow generic parameterized expressions.""" + + #: Parameter name. + name: str + + def to_datalog_fact_string(self) -> str: + """Return string representation of expression (in datalog serialized format).""" + return "$ParameterPlaceholderValue(" + enquote_datalog_string_literal(self.name) + ")" + + +@dataclass(frozen=True, repr=False) +class Symbolic(Value): + """Value expression representing a symbolic expression. + + Represents an expression that has been "frozen" in symbolic form rather than evaluated concretely. + """ + + #: Symbolic expression. + val: Value + + def to_datalog_fact_string(self) -> str: + """Return string representation of expression (in datalog serialized format).""" + return "$Symbolic(" + self.val.to_datalog_fact_string() + ")" + + +@dataclass(frozen=True, repr=False) +class SingleBashTokenConstraint(Value): + """Value expression representing a constraint that the underlying value does not parse as multiple Bash tokens.""" + + #: Constrained expression. + val: Value + + def to_datalog_fact_string(self) -> str: + """Return string representation of expression (in datalog serialized format).""" + return "$SingleBashTokenConstraint(" + self.val.to_datalog_fact_string() + ")" + + +@dataclass(frozen=True, repr=False) +class Filesystem(LocationSpecifier): + """Location expression representing a filesystem location at a particular file path.""" + + #: Filepath value. + path: Value + + def to_datalog_fact_string(self) -> str: + """Return string representation of expression (in datalog serialized format).""" + return "$Filesystem(" + self.path.to_datalog_fact_string() + ")" + + +@dataclass(frozen=True, repr=False) +class Variable(LocationSpecifier): + """Location expression representing a variable.""" + + #: Variable name. + name: Value + + def to_datalog_fact_string(self) -> str: + """Return string representation of expression (in datalog serialized format).""" + return "$Variable(" + self.name.to_datalog_fact_string() + ")" + + +@dataclass(frozen=True, repr=False) +class Artifact(LocationSpecifier): + """Location expression representing a file stored within some named artifact storage location.""" + + #: Artifact name. + name: Value + #: File name within artifact. 
+    file: Value
+
+    def to_datalog_fact_string(self) -> str:
+        """Return string representation of expression (in datalog serialized format)."""
+        return "$Artifact(" + self.name.to_datalog_fact_string() + ", " + self.file.to_datalog_fact_string() + ")"
+
+
+@dataclass(frozen=True, repr=False)
+class FilesystemAnyUnderDir(LocationSpecifier):
+    """Location expression representing any file under a particular directory."""
+
+    #: Directory file path.
+    path: Value
+
+    def to_datalog_fact_string(self) -> str:
+        """Return string representation of expression (in datalog serialized format)."""
+        return "$FilesystemAnyUnderDir(" + self.path.to_datalog_fact_string() + ")"
+
+
+@dataclass(frozen=True, repr=False)
+class ArtifactAnyFilename(LocationSpecifier):
+    """Location expression representing any file contained within a named artifact storage location."""
+
+    #: Artifact name.
+    name: Value
+
+    def to_datalog_fact_string(self) -> str:
+        """Return string representation of expression (in datalog serialized format)."""
+        return "$ArtifactAnyFilename(" + self.name.to_datalog_fact_string() + ")"
+
+
+@dataclass(frozen=True, repr=False)
+class ParameterPlaceholderLocation(LocationSpecifier):
+    """Special placeholder location expression to allow generic parameterized expressions."""
+
+    #: Parameter name.
+    name: str
+
+    def to_datalog_fact_string(self) -> str:
+        """Return string representation of expression (in datalog serialized format)."""
+        return "$ParameterPlaceholderLocation(" + enquote_datalog_string_literal(self.name) + ")"
+
+
+@dataclass(frozen=True, repr=False)
+class Console(LocationSpecifier):
+    """Location expression representing a console, pipe or other text stream."""
+
+    def to_datalog_fact_string(self) -> str:
+        """Return string representation of expression (in datalog serialized format)."""
+        return "$Console"
+
+
+@dataclass(frozen=True, repr=False)
+class Installed(LocationSpecifier):
+    """Location expression representing an installed package."""
+
+    #: Package name.
+    name: Value
+
+    def to_datalog_fact_string(self) -> str:
+        """Return string representation of expression (in datalog serialized format)."""
+        return "$Installed(" + self.name.to_datalog_fact_string() + ")"
+
+
+def enquote_datalog_string_literal(literal: str) -> str:
+    """Enquote a datalog string literal, with appropriate escaping."""
+    return '"' + literal.replace("\\", "\\\\").replace('"', '\\"') + '"'
+
+
+class FactParseError(Exception):
+    """Raised when an error occurs during fact parsing."""
+
+
+def consume_whitespace(text: str) -> str:
+    """Consume leading whitespace, returning the remainder of the text."""
+    text_end_idx = len(text)
+    space_end_idx = text_end_idx
+    idx = 0
+    while idx < text_end_idx:
+        if text[idx].isspace():
+            idx = idx + 1
+        else:
+            space_end_idx = idx
+            break
+    return text[space_end_idx:text_end_idx]
+
+
+def consume(text: str, token: str) -> str:
+    """Consume the leading token from the text.
+
+    Raises exception if text does not start with the token.
+    """
+    if text.startswith(token):
+        return text[len(token) :]
+    raise FactParseError(text)
+
+
+def parse_qualified_name(text: str) -> tuple[str, str]:
+    """Parse a qualified name, returning the name and the remainder of the text."""
+    text = consume_whitespace(text)
+    text_end_idx = len(text)
+    name_end_idx = text_end_idx
+    idx = 0
+    while idx < text_end_idx:
+        if text[idx].isalnum() or text[idx] == "_" or text[idx] == "?" or text[idx] == ".":
+            idx = idx + 1
+        else:
+            name_end_idx = idx
+            break
+    return text[0:name_end_idx], text[name_end_idx:text_end_idx]
+
+
+def parse_symbol(text: str) -> tuple[str, str]:
+    """Parse datalog-serialized string literal."""
+    text = consume(text, '"')
+    text_end_idx = len(text)
+    str_end_idx = text_end_idx
+    idx = 0
+    in_escape = False
+    char_list = []
+    while idx < text_end_idx:
+        if text[idx] == "\\":
+            if not in_escape:
+                in_escape = True
+            else:
+                char_list.append("\\")
+                in_escape = False
+        elif text[idx] == '"':
+            if not in_escape:
+                str_end_idx = idx
+                break
+            char_list.append('"')
+            in_escape = False
+        else:
+            char_list.append(text[idx])
+        idx = idx + 1
+
+    lit = "".join(char_list)
+    text = text[str_end_idx:]
+    text = consume(text, '"')
+    return lit, text
+
+
+def parse_location_specifier(text: str) -> tuple[LocationSpecifier, str]:
+    """Deserialize location specifier from string representation (in datalog serialized format)."""
+    text = consume(text, "$")
+    kind, text = parse_qualified_name(text)
+    match kind:
+        case "Filesystem":
+            text = consume(text, "(")
+            path_val, text = parse_value(text)
+            text = consume_whitespace(text)
+            text = consume(text, ")")
+            return Filesystem(path_val), text
+        case "Variable":
+            text = consume(text, "(")
+            name_val, text = parse_value(text)
+            text = consume_whitespace(text)
+            text = consume(text, ")")
+            return Variable(name_val), text
+        case "Artifact":
+            text = consume(text, "(")
+            name_val, text = parse_value(text)
+            text = consume(text, ",")
+            text = consume_whitespace(text)
+            file_val, text = parse_value(text)
+            text = consume(text, ")")
+            return Artifact(name_val, file_val), text
+        case "FilesystemAnyUnderDir":
+            text = consume(text, "(")
+            path_val, text = parse_value(text)
+            text = consume_whitespace(text)
+            text = consume(text, ")")
+            return FilesystemAnyUnderDir(path_val), text
+        case "ArtifactAnyFilename":
+            text = consume(text, "(")
+            name_val, text = parse_value(text)
+            text = consume_whitespace(text)
+            text = consume(text, ")")
+            return ArtifactAnyFilename(name_val), text
+        case "Console":
+            return Console(), text
+        case "Installed":
+            text = consume(text, "(")
+            name_val, text = parse_value(text)
+            text = consume_whitespace(text)
+            text = consume(text, ")")
+            return Installed(name_val), text
+
+    raise FactParseError()
+
+
+def parse_location(text: str) -> tuple[Location, str]:
+    """Deserialize location from string representation (in datalog serialized format).
+
+    Currently non-functional primarily due to the inability to deserialize scope identity.
+ """ + raise ParseError("cannot parse, need fix") + + +def parse_value(text: str) -> tuple[Value, str]: + """Deserialize value expression from string representation (in datalog serialized format).""" + text = consume(text, "$") + kind, text = parse_qualified_name(text) + match kind: + case "StringLiteral": + text = consume(text, "(") + lit, text = parse_symbol(text) + text = consume_whitespace(text) + text = consume(text, ")") + return StringLiteral(lit), text + case "Read": + text = consume(text, "(") + loc, text = parse_location(text) + text = consume_whitespace(text) + text = consume(text, ")") + return Read(loc), text + case "ArbitraryNewData": + text = consume(text, "(") + at, text = parse_symbol(text) + text = consume_whitespace(text) + text = consume(text, ")") + return ArbitraryNewData(at), text + case "UnaryStringOp": + text = consume(text, "(") + un_operator, text = parse_un_op(text) + text = consume(text, ",") + text = consume_whitespace(text) + operand_val, text = parse_value(text) + text = consume(text, ")") + return UnaryStringOp(un_operator, operand_val), text + case "BinaryStringOp": + text = consume(text, "(") + bin_operator, text = parse_bin_op(text) + text = consume(text, ",") + text = consume_whitespace(text) + operand1, text = parse_value(text) + text = consume(text, ",") + text = consume_whitespace(text) + operand2, text = parse_value(text) + text = consume(text, ")") + return BinaryStringOp(bin_operator, operand1, operand2), text + case "ParameterPlaceholderValue": + text = consume(text, "(") + name, text = parse_symbol(text) + text = consume_whitespace(text) + text = consume(text, ")") + return ParameterPlaceholderValue(name), text + case "SingleBashTokenConstraint": + text = consume(text, "(") + operand, text = parse_value(text) + text = consume(text, ")") + return SingleBashTokenConstraint(operand), text + case "InstalledPackage": + text = consume(text, "(") + name_val, text = parse_value(text) + text = consume(text, ",") + text = consume_whitespace(text) + version_val, text = parse_value(text) + text = consume(text, ",") + text = consume_whitespace(text) + distribution_val, text = parse_value(text) + text = consume(text, ",") + text = consume_whitespace(text) + url_val, text = parse_value(text) + text = consume(text, ")") + return InstalledPackage(name_val, version_val, distribution_val, url_val), text + raise FactParseError() + + +def parse_un_op(text: str) -> tuple[UnaryStringOperator, str]: + """Deserialize unary operator from string representation (in datalog serialized format).""" + text = consume(text, "$") + name, text = parse_qualified_name(text) + match name: + case "BaseName": + return UnaryStringOperator.BASENAME, text + case "Base64Encode": + return UnaryStringOperator.BASE64_ENCODE, text + case "Base64Decode": + return UnaryStringOperator.BASE64DECODE, text + raise FactParseError() + + +def parse_bin_op(text: str) -> tuple[BinaryStringOperator, str]: + """Deserialize binary operator from string representation (in datalog serialized format).""" + text = consume(text, "$") + name, text = parse_qualified_name(text) + match name: + case "StringConcat": + return BinaryStringOperator.STRING_CONCAT, text + raise FactParseError() diff --git a/src/macaron/code_analyzer/dataflow_analysis/github.py b/src/macaron/code_analyzer/dataflow_analysis/github.py new file mode 100644 index 000000000..6da30e745 --- /dev/null +++ b/src/macaron/code_analyzer/dataflow_analysis/github.py @@ -0,0 +1,1314 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+"""Dataflow analysis implementation for analysing GitHub Actions Workflow build pipelines."""
+
+from __future__ import annotations
+
+from collections import defaultdict
+from collections.abc import Callable, Iterator
+from dataclasses import dataclass
+from graphlib import TopologicalSorter
+
+from macaron.code_analyzer.dataflow_analysis import bash, core, evaluation, facts, github_expr, models, printing
+from macaron.errors import CallGraphError
+from macaron.parsers import github_workflow_model
+
+
+@dataclass(frozen=True)
+class GitHubActionsWorkflowContext(core.Context):
+    """Context for the top-level scope of a GitHub Actions Workflow."""
+
+    #: Outer analysis context.
+    analysis_context: core.ContextRef[core.AnalysisContext]
+    #: Scope for artifact storage within the pipeline execution (for upload/download artifact).
+    artifacts: core.ContextRef[facts.Scope]
+    #: Scope for artifacts published as GitHub releases by the pipeline.
+    releases: core.ContextRef[facts.Scope]
+    #: Scope for environment variables (env block at top-level of workflow).
+    env: core.ContextRef[facts.Scope]
+    #: Scope for variables within the workflow.
+    workflow_variables: core.ContextRef[facts.Scope]
+    #: Scope for console output.
+    console: core.ContextRef[facts.Scope]
+    #: Filepath of workflow file.
+    source_filepath: str
+
+    @staticmethod
+    def create(
+        analysis_context: core.ContextRef[core.AnalysisContext], source_filepath: str
+    ) -> GitHubActionsWorkflowContext:
+        """Create a new workflow context and its associated scopes.
+
+        Parameters
+        ----------
+        analysis_context: core.ContextRef[core.AnalysisContext]
+            Outer analysis context.
+        source_filepath: str
+            Filepath of workflow file.
+
+        Returns
+        -------
+        GitHubActionsWorkflowContext
+            The new workflow context.
+        """
+        return GitHubActionsWorkflowContext(
+            analysis_context=analysis_context.get_non_owned(),
+            artifacts=core.OwningContextRef(facts.Scope("artifacts")),
+            releases=core.OwningContextRef(facts.Scope("releases")),
+            env=core.OwningContextRef(facts.Scope("env")),
+            workflow_variables=core.OwningContextRef(facts.Scope("workflow_vars")),
+            console=core.OwningContextRef(facts.Scope("console")),
+            source_filepath=source_filepath,
+        )
+
+    def direct_refs(self) -> Iterator[core.ContextRef[core.Context] | core.ContextRef[facts.Scope]]:
+        """Yield the direct references of the context, either to scopes or to other contexts."""
+        yield self.analysis_context
+        yield self.artifacts
+        yield self.releases
+        yield self.env
+        yield self.workflow_variables
+        yield self.console
+
+
+@dataclass(frozen=True)
+class GitHubActionsJobContext(core.Context):
+    """Context for a job within a GitHub Actions Workflow."""
+
+    #: Outer workflow context.
+    workflow_context: core.ContextRef[GitHubActionsWorkflowContext]
+    #: Scope for filesystem used by the job and its steps.
+    filesystem: core.ContextRef[facts.Scope]
+    #: Scope for environment variables (env block at job level).
+    env: core.ContextRef[facts.Scope]
+    #: Scope for variables within the job (step output variables, etc.).
+    job_variables: core.ContextRef[facts.Scope]
+
+    @staticmethod
+    def create(workflow_context: core.ContextRef[GitHubActionsWorkflowContext]) -> GitHubActionsJobContext:
+        """Create a new job context and its associated scopes.
+
+        Env and job variables scopes inherit from outer context.
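+        A fresh filesystem scope is created for the job itself, reflecting that each job runs on
+        its own runner and therefore does not share a filesystem with other jobs.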
+
+        Parameters
+        ----------
+        workflow_context: core.ContextRef[GitHubActionsWorkflowContext]
+            Outer workflow context.
+
+        Returns
+        -------
+        GitHubActionsJobContext
+            The new job context.
+        """
+        return GitHubActionsJobContext(
+            workflow_context=workflow_context.get_non_owned(),
+            filesystem=core.OwningContextRef(facts.Scope("filesystem")),
+            env=core.OwningContextRef(facts.Scope("env", workflow_context.ref.env.ref)),
+            job_variables=core.OwningContextRef(facts.Scope("job_vars", workflow_context.ref.workflow_variables.ref)),
+        )
+
+    def direct_refs(self) -> Iterator[core.ContextRef[core.Context] | core.ContextRef[facts.Scope]]:
+        """Yield the direct references of the context, either to scopes or to other contexts."""
+        yield self.workflow_context
+        yield self.filesystem
+        yield self.env
+        yield self.job_variables
+
+
+@dataclass(frozen=True)
+class GitHubActionsStepContext(core.Context):
+    """Context for a step within a job within a GitHub Actions Workflow."""
+
+    #: Outer job context.
+    job_context: core.ContextRef[GitHubActionsJobContext]
+    #: Scope for environment variables (env block at step level)
+    env: core.ContextRef[facts.Scope]
+    #: Name prefix for step output variables (stored in the job variables)
+    #: belonging to this step (e.g. "steps.step_id.outputs.")
+    output_var_prefix: str | None
+
+    @staticmethod
+    def create(job_context: core.ContextRef[GitHubActionsJobContext], step_id: str | None) -> GitHubActionsStepContext:
+        """Create a new step context and its associated scopes.
+
+        Env scope inherits from outer context. Output var prefix is derived from step_id.
+
+        Parameters
+        ----------
+        job_context: core.ContextRef[GitHubActionsJobContext]
+            Outer job context.
+        step_id: str | None
+            Step id. If provided, used to derive the name prefix for step output variables.
+
+        Returns
+        -------
+        GitHubActionsStepContext
+            The new step context.
+        """
+        return GitHubActionsStepContext(
+            job_context=job_context.get_non_owned(),
+            env=core.OwningContextRef(facts.Scope("env", job_context.ref.env.ref)),
+            output_var_prefix=("steps." + step_id + ".outputs.") if step_id is not None else None,
+        )
+
+    def direct_refs(self) -> Iterator[core.ContextRef[core.Context] | core.ContextRef[facts.Scope]]:
+        """Yield the direct references of the context, either to scopes or to other contexts."""
+        yield self.job_context
+        yield self.env
+
+
+class RawGitHubActionsWorkflowNode(core.InterpretationNode):
+    """Interpretation node representing a GitHub Actions Workflow.
+
+    Defines how to interpret a parsed workflow and generate its analysis representation.
+    """
+
+    #: Parsed workflow AST.
+    definition: github_workflow_model.Workflow
+
+    #: Workflow context
+    context: core.ContextRef[GitHubActionsWorkflowContext]
+
+    def __init__(
+        self, definition: github_workflow_model.Workflow, context: core.ContextRef[GitHubActionsWorkflowContext]
+    ) -> None:
+        """Initialize node.
+
+        Typically, construction should be done via the create function rather than using this constructor directly.
+ """ + super().__init__() + self.definition = definition + self.context = context + + def identify_interpretations(self, state: core.State) -> dict[core.InterpretationKey, Callable[[], core.Node]]: + """Interpret the workflow AST to generate control flow representation.""" + + def build_workflow_node() -> core.Node: + return GitHubActionsWorkflowNode.create(self.definition, self.context.get_non_owned()) + + return {"default": build_workflow_node} + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table containing the workflow name and scopes.""" + result: dict[str, set[tuple[str | None, str]]] = {} + if "name" in self.definition: + result["workflow name"] = {(None, self.definition["name"])} + + printing.add_context_owned_scopes_to_properties_table(result, self.context) + + return result + + @staticmethod + def create( + workflow: github_workflow_model.Workflow, + analysis_context: core.ContextRef[core.AnalysisContext], + source_filepath: str, + ) -> RawGitHubActionsWorkflowNode: + """Create workflow node and its associated context. + + Parameters + ---------- + workflow: github_workflow_model.Workflow + Parsed workflow AST. + analysis_context: core.ContextRef[core.AnalysisContext] + Outer analysis context. + source_filepath: str + Filepath of workflow file. + + Returns + ------- + RawGitHubActionsWorkflowNode + The new workflow node. + """ + workflow_context = GitHubActionsWorkflowContext.create(analysis_context, source_filepath) + + return RawGitHubActionsWorkflowNode(workflow, core.OwningContextRef(workflow_context)) + + +class GitHubActionsWorkflowNode(core.ControlFlowGraphNode): + """Control-flow-graph node representing a GitHub Actions Workflow. + + Control flow structure executes each job in an arbitrary linear sequence + (by default a topological sort satsifying the job dependencies). If an env block exists, + it is applied beforehand. + """ + + #: Parsed workflow AST. + definition: github_workflow_model.Workflow + #: Workflow context. + context: core.ContextRef[GitHubActionsWorkflowContext] + #: Node to apply effects of env block, if any. + env_block: RawGitHubActionsEnvNode | None + #: Job nodes, identified by their job id. + jobs: dict[str, RawGitHubActionsJobNode] + #: List of job ids specifying job execution order. + order: list[str] + #: Control flow graph. + _cfg: core.ControlFlowGraph + + def __init__( + self, + definition: github_workflow_model.Workflow, + context: core.ContextRef[GitHubActionsWorkflowContext], + env_block: RawGitHubActionsEnvNode | None, + jobs: dict[str, RawGitHubActionsJobNode], + order: list[str], + ) -> None: + """Initialize workflow node. + + Typically, construction should be done via the create function rather than using this constructor directly. + + Parameters + ---------- + definition: github_workflow_model.Workflow + Parsed workflow AST. + context: core.ContextRef[GitHubActionsWorkflowContext] + Workflow context. + env_block: RawGitHubActionsEnvNode | None + Node to apply effects of env block, if any. + jobs: dict[str, RawGitHubActionsJobNode] + List of job ids specifying job execution order. + order: list[str] + List of job ids specifying job execution order. 
+ """ + super().__init__() + self.definition = definition + self.context = context + self.env_block = env_block + self.jobs = jobs + self.order = order + + self._cfg = core.ControlFlowGraph.create_from_sequence( + list(filter(core.node_is_not_none, [self.env_block] + [self.jobs[job_id] for job_id in self.order])) + ) + + def children(self) -> Iterator[core.Node]: + """Yield the child nodes of this node.""" + if self.env_block is not None: + yield self.env_block + for job_id in self.order: + yield self.jobs[job_id] + + def get_entry(self) -> core.Node: + """Return the entry node.""" + return self._cfg.get_entry() + + def get_successors(self, node: core.Node, exit_type: core.ExitType) -> set[core.Node | core.ExitType]: + """Return the successors for a particular exit of a particular node.""" + return self._cfg.get_successors(node, core.DEFAULT_EXIT) + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table containing the workflow name and scopes.""" + result: dict[str, set[tuple[str | None, str]]] = {} + if "name" in self.definition: + result["workflow name"] = {(None, self.definition["name"])} + + printing.add_context_owned_scopes_to_properties_table(result, self.context) + + return result + + @staticmethod + def create( + workflow: github_workflow_model.Workflow, context: core.NonOwningContextRef[GitHubActionsWorkflowContext] + ) -> GitHubActionsWorkflowNode: + """Create workflow node from workflow AST. + + Also creates a job node for each job, and performs a topological sort of the job dependency graph + to choose an arbitrary valid sequential execution order. + + Parameters + ---------- + workflow: github_workflow_model.Workflow + Parsed workflow AST. + context: core.NonOwningContextRef[GitHubActionsWorkflowContext] + Workflow context. + + Returns + ------- + GitHubActionsWorkflowNode + The new workflow node. + """ + jobs: dict[str, RawGitHubActionsJobNode] = {} + + for job_id, job in workflow["jobs"].items(): + job_node = RawGitHubActionsJobNode( + job, job_id, core.OwningContextRef(GitHubActionsJobContext.create(context)) + ) + jobs[job_id] = job_node + + dependency_graph: dict[str, list[str]] = {} + for job_id, job_node in jobs.items(): + edges: list[str] = [] + if "needs" in job_node.definition: + needs = job_node.definition["needs"] + if isinstance(needs, list): + for need in needs: + # TODO invalid needs id? + edges.append(need) + elif isinstance(needs, str): + edges.append(needs) + dependency_graph[job_id] = edges + + ts = TopologicalSorter(dependency_graph) + order = list(ts.static_order()) + + env_block = None + if "env" in workflow: + env_block = RawGitHubActionsEnvNode(workflow["env"], context) + + return GitHubActionsWorkflowNode(workflow, context, env_block, jobs, order) + + +class RawGitHubActionsJobNode(core.InterpretationNode): + """Interpretation node representing a GitHub Actions Job. + + Defines how to interpret the different kinds of jobs (normal jobs, reusable workflow call jobs), + and generate their analysis representation. + """ + + #: Parsed job AST. + definition: github_workflow_model.Job + #: Job id. + job_id: str + #: Job context. 
+    context: core.ContextRef[GitHubActionsJobContext]
+
+    def __init__(
+        self, definition: github_workflow_model.Job, job_id: str, context: core.ContextRef[GitHubActionsJobContext]
+    ) -> None:
+        """Initialize node."""
+        super().__init__()
+        self.definition = definition
+        self.job_id = job_id
+        self.context = context
+
+    def identify_interpretations(self, state: core.State) -> dict[core.InterpretationKey, Callable[[], core.Node]]:
+        """Interpret job AST to generate representation for either a normal job or a reusable workflow call job."""
+        if github_workflow_model.is_normal_job(self.definition):
+            normal_job_definition = self.definition
+
+            def build_normal_job() -> core.Node:
+                return GitHubActionsNormalJobNode.create(
+                    normal_job_definition, self.job_id, self.context.get_non_owned()
+                )
+
+            return {"default": build_normal_job}
+        if github_workflow_model.is_reusable_workflow_call_job(self.definition):
+            raw_with_params = self.definition.get("with", {})
+            call_def = self.definition
+            if isinstance(raw_with_params, dict):
+
+                def build_reusable_workflow_call_job() -> core.Node:
+                    uses_name, _, uses_version = call_def["uses"].rpartition("@")
+
+                    with_parameters: dict[str, facts.Value] = {}
+                    for key, val in raw_with_params.items():
+                        if isinstance(val, str):
+                            parsed_val = github_expr.extract_value_from_expr_string(
+                                val, self.context.ref.job_variables.ref
+                            )
+                            if parsed_val is not None:
+                                with_parameters[key] = parsed_val
+                        elif isinstance(val, bool):
+                            with_parameters[key] = facts.StringLiteral("true") if val else facts.StringLiteral("false")
+                        else:
+                            with_parameters[key] = facts.StringLiteral(str(val))
+
+                    return GitHubActionsReusableWorkflowCallNode(
+                        call_def,
+                        self.job_id,
+                        self.context.get_non_owned(),
+                        uses_name,
+                        uses_version if uses_version != "" else None,
+                        with_parameters,
+                    )
+
+                return {"default": build_reusable_workflow_call_job}
+
+            def build_noop() -> core.Node:
+                return core.NoOpStatementNode()
+
+            return {"default": build_noop}
+
+        raise CallGraphError("invalid job")
+
+    def get_exit_state_transfer_filter(self) -> core.StateTransferFilter:
+        """Return state transfer filter to clear scopes owned by this node after this node exits."""
+        return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context))
+
+    def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]:
+        """Return a properties table containing the job id and scopes."""
+        result: dict[str, set[tuple[str | None, str]]] = {}
+        result["job id"] = {(None, self.job_id)}
+
+        printing.add_context_owned_scopes_to_properties_table(result, self.context)
+
+        return result
+
+
+class GitHubActionsNormalJobNode(core.ControlFlowGraphNode):
+    """Control-flow-graph node representing a GitHub Actions Normal Job.
+
+    Control flow structure executes each step in the order defined by the job,
+    preceded by applying the effects of the matrix and env blocks if they exist
+    and succeeded by applying the effects of the output block if it exists.
+    (TODO generating output block not yet implemented).
+    """
+
+    #: Parsed job AST.
+    definition: github_workflow_model.NormalJob
+    #: Job id.
+    job_id: str
+    #: Node to apply effects of matrix block, if any.
+    matrix_block: RawGitHubActionsMatrixNode | None
+    #: Node to apply effects of env block, if any.
+    env_block: RawGitHubActionsEnvNode | None
+    #: Step nodes, in execution order.
+    steps: list[RawGitHubActionsStepNode]
+    #: Node to apply effects of output block, if any.
+    output_block: core.Node | None  # TODO More specific
+    #: Job context
+    context: core.ContextRef[GitHubActionsJobContext]
+    #: Control flow graph
+    _cfg: core.ControlFlowGraph
+
+    def __init__(
+        self,
+        definition: github_workflow_model.NormalJob,
+        job_id: str,
+        matrix_block: RawGitHubActionsMatrixNode | None,
+        env_block: RawGitHubActionsEnvNode | None,
+        steps: list[RawGitHubActionsStepNode],
+        output_block: core.Node | None,
+        context: core.ContextRef[GitHubActionsJobContext],
+    ) -> None:
+        """Initialize job node.
+
+        Typically, construction should be done via the create function rather than using this constructor directly.
+
+        Parameters
+        ----------
+        definition: github_workflow_model.NormalJob
+            Parsed job AST.
+        job_id: str
+            Job id.
+        matrix_block: RawGitHubActionsMatrixNode | None
+            Node to apply effects of matrix block, if any.
+        env_block: RawGitHubActionsEnvNode | None
+            Node to apply effects of env block, if any.
+        steps: list[RawGitHubActionsStepNode]
+            Step nodes, in execution order.
+        output_block: core.Node | None
+            Node to apply effects of output block, if any.
+        context: core.ContextRef[GitHubActionsJobContext]
+            Job context.
+        """
+        super().__init__()
+        self.definition = definition
+        self.job_id = job_id
+        self.matrix_block = matrix_block
+        self.env_block = env_block
+        self.steps = steps
+        self.output_block = output_block
+        self.context = context
+
+        self._cfg = core.ControlFlowGraph.create_from_sequence(
+            list(filter(core.node_is_not_none, [self.matrix_block, self.env_block] + self.steps + [self.output_block]))
+        )
+
+    def children(self) -> Iterator[core.Node]:
+        """Yield the child nodes of this node."""
+        if self.matrix_block is not None:
+            yield self.matrix_block
+        if self.env_block is not None:
+            yield self.env_block
+        yield from self.steps
+        if self.output_block is not None:
+            yield self.output_block
+
+    def get_entry(self) -> core.Node:
+        """Return the entry node."""
+        return self._cfg.get_entry()
+
+    def get_successors(self, node: core.Node, exit_type: core.ExitType) -> set[core.Node | core.ExitType]:
+        """Return the successors for a particular exit of a particular node."""
+        return self._cfg.get_successors(node, core.DEFAULT_EXIT)
+
+    def get_exit_state_transfer_filter(self) -> core.StateTransferFilter:
+        """Return state transfer filter to clear scopes owned by this node after this node exits."""
+        return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context))
+
+    def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]:
+        """Return a properties table containing the job id and scopes."""
+        result: dict[str, set[tuple[str | None, str]]] = {}
+        result["job id"] = {(None, self.job_id)}
+
+        printing.add_context_owned_scopes_to_properties_table(result, self.context)
+        return result
+
+    @staticmethod
+    def create(
+        job: github_workflow_model.NormalJob, job_id: str, context: core.NonOwningContextRef[GitHubActionsJobContext]
+    ) -> GitHubActionsNormalJobNode:
+        """Create normal job node from job AST. Also creates a step node for each step.
+
+        Parameters
+        ----------
+        job: github_workflow_model.NormalJob
+            Parsed job AST.
+        job_id: str
+            Job id.
+        context: core.NonOwningContextRef[GitHubActionsJobContext]
+            Job context.
+
+        Returns
+        -------
+        GitHubActionsNormalJobNode
+            The new job node.
+ """ + # TODO output block + + matrix_block = None + if "strategy" in job and "matrix" in job["strategy"]: + matrix_block = RawGitHubActionsMatrixNode(job["strategy"]["matrix"], context) + + env_block = None + if "env" in job: + env_block = RawGitHubActionsEnvNode(job["env"], context) + + steps = [ + RawGitHubActionsStepNode( + step, core.OwningContextRef(GitHubActionsStepContext.create(context, step.get("id"))) + ) + for step in job.get("steps", []) + ] + + return GitHubActionsNormalJobNode(job, job_id, matrix_block, env_block, steps, None, context) + + +class GitHubActionsReusableWorkflowCallNode(core.InterpretationNode): + """Interpretation node representing a GitHub Actions Reusable Workflow Call Job. + + Defines how to interpret the semantics of different supported reusable workflows that may + be invoked (TODO currently none are supported). + """ + + #: Parsed reusable workflow call AST. + definition: github_workflow_model.ReusableWorkflowCallJob + #: Job id. + job_id: str + #: Job context. + context: core.ContextRef[GitHubActionsJobContext] + + #: Name of the reusable workflow being invoked (without version component). + uses_name: str + #: Version of the reusable workflow being invoked (if specified). + uses_version: str | None + + #: Input parameters specified for reusable workflow. + with_parameters: dict[str, facts.Value] + + def __init__( + self, + definition: github_workflow_model.ReusableWorkflowCallJob, + job_id: str, + context: core.ContextRef[GitHubActionsJobContext], + uses_name: str, + uses_version: str | None, + with_parameters: dict[str, facts.Value], + ) -> None: + """Initialize reusable workflow call node. + + Parameters + ---------- + definition: github_workflow_model.ReusableWorkflowCallJob + Parsed reusable workflow call AST. + job_id: str + Job id. + context: core.ContextRef[GitHubActionsJobContext] + Job context. + uses_name: str + Name of the reusable workflow being invoked (without version component). + uses_version: str | None + Version of the reusable workflow being invoked (if specified). + with_parameters: dict[str, facts.Value] + Input parameters specified for reusable workflow. + """ + super().__init__() + self.definition = definition + self.job_id = job_id + self.context = context + self.uses_name = uses_name + self.uses_version = uses_version + self.with_parameters = with_parameters + + def identify_interpretations(self, state: core.State) -> dict[core.InterpretationKey, Callable[[], core.Node]]: + """Intepret the semantics of the different supported reusable workflows. + + (TODO currently none are supported). + """ + + def build_noop() -> core.Node: + return core.NoOpStatementNode() + + return {"default": build_noop} + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table. + + Contains the job id, reusable workflow name, and scopes. + """ + result: dict[str, set[tuple[str | None, str]]] = {} + result["job id"] = {(None, self.job_id)} + result["uses"] = {(None, self.definition["uses"])} + + printing.add_context_owned_scopes_to_properties_table(result, self.context) + + return result + + +class RawGitHubActionsStepNode(core.InterpretationNode): + """Interpretation node representing a GitHub Actions Step. 
+
+    Defines how to interpret the different kinds of steps (run steps, action steps),
+    and generate their analysis representation.
+    """
+
+    #: Parsed step AST.
+    definition: github_workflow_model.Step
+    #: Step context
+    context: core.ContextRef[GitHubActionsStepContext]
+
+    def __init__(
+        self, definition: github_workflow_model.Step, context: core.ContextRef[GitHubActionsStepContext]
+    ) -> None:
+        """Initialize node."""
+        super().__init__()
+        self.definition = definition
+        self.context = context
+
+    def identify_interpretations(self, state: core.State) -> dict[core.InterpretationKey, Callable[[], core.Node]]:
+        """Interpret step AST to generate representation depending on whether it is a run step or an action step."""
+        if github_workflow_model.is_action_step(self.definition):
+            action_step_definition = self.definition
+
+            def build_action_step() -> core.Node:
+                return RawGitHubActionsActionStepNode(action_step_definition, self.context.get_non_owned())
+
+            return {"default": build_action_step}
+        if github_workflow_model.is_run_step(self.definition):
+            run_step_definition = self.definition
+
+            def build_run_step() -> core.Node:
+                return GitHubActionsRunStepNode.create(run_step_definition, self.context.get_non_owned())
+
+            return {"default": build_run_step}
+        raise CallGraphError("invalid step")
+
+    def get_exit_state_transfer_filter(self) -> core.StateTransferFilter:
+        """Return state transfer filter to clear scopes owned by this node after this node exits."""
+        return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context))
+
+    def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]:
+        """Return a properties table.
+
+        Contains the step id, name, action name (if action step), and scopes.
+        """
+        result: dict[str, set[tuple[str | None, str]]] = {}
+        if "id" in self.definition:
+            result["step id"] = {(None, self.definition["id"])}
+        elif "name" in self.definition:
+            result["step name"] = {(None, self.definition["name"])}
+        if github_workflow_model.is_action_step(self.definition):
+            result["step uses"] = {(None, self.definition["uses"])}
+
+        printing.add_context_owned_scopes_to_properties_table(result, self.context)
+
+        return result
+
+
+class RawGitHubActionsActionStepNode(core.InterpretationNode):
+    """Interpretation node representing a GitHub Actions Action Step.
+
+    Defines how to extract the name, version and parameters used to invoke the action,
+    and generate a node with those details resolved for further interpretation.
+    """
+
+    #: Parsed step AST.
+    definition: github_workflow_model.ActionStep
+    #: Step context.
+    context: core.ContextRef[GitHubActionsStepContext]
+
+    def __init__(
+        self, definition: github_workflow_model.ActionStep, context: core.ContextRef[GitHubActionsStepContext]
+    ) -> None:
+        """Initialize node."""
+        super().__init__()
+        self.definition = definition
+        self.context = context
+
+    def identify_interpretations(self, state: core.State) -> dict[core.InterpretationKey, Callable[[], core.Node]]:
+        """Interpret action step AST to extract the name, version and parameters."""
+        raw_with_params = self.definition.get("with", {})
+        if isinstance(raw_with_params, dict):
+
+            def build_action() -> core.Node:
+                uses_name, _, uses_version = self.definition["uses"].rpartition("@")
+
+                with_parameters: dict[str, facts.Value] = {}
+                for key, val in raw_with_params.items():
+                    if isinstance(val, str):
+                        parsed_val = github_expr.extract_value_from_expr_string(
+                            val, self.context.ref.job_context.ref.job_variables.ref
+                        )
+                        if parsed_val is not None:
+                            with_parameters[key] = parsed_val
+                    elif isinstance(val, bool):
+                        with_parameters[key] = facts.StringLiteral("true") if val else facts.StringLiteral("false")
+                    else:
+                        with_parameters[key] = facts.StringLiteral(str(val))
+
+                return GitHubActionsActionStepNode(
+                    self.definition,
+                    self.context.get_non_owned(),
+                    uses_name,
+                    uses_version if uses_version != "" else None,
+                    with_parameters,
+                )
+
+            return {"default": build_action}
+
+        def build_noop() -> core.Node:
+            return core.NoOpStatementNode()
+
+        return {"default": build_noop}
+
+    def get_exit_state_transfer_filter(self) -> core.StateTransferFilter:
+        """Return state transfer filter to clear scopes owned by this node after this node exits."""
+        return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context))
+
+    def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]:
+        """Return a properties table containing the step id, name, action name, and scopes."""
+        result: dict[str, set[tuple[str | None, str]]] = {}
+        if "id" in self.definition:
+            result["step id"] = {(None, self.definition["id"])}
+        elif "name" in self.definition:
+            result["step name"] = {(None, self.definition["name"])}
+        result["step uses"] = {(None, self.definition["uses"])}
+
+        printing.add_context_owned_scopes_to_properties_table(result, self.context)
+
+        return result
+
+
+class GitHubActionsActionStepNode(core.InterpretationNode):
+    """Interpretation node representing a GitHub Actions Action Step.
+
+    Defines how to interpret the semantics of different supported actions that may
+    be invoked.
+    """
+
+    #: Parsed step AST.
+    definition: github_workflow_model.ActionStep
+    #: Step context.
+    context: core.ContextRef[GitHubActionsStepContext]
+
+    #: Name of the action being invoked (without version component).
+    uses_name: str
+    #: Version of the action being invoked (if specified).
+    uses_version: str | None
+
+    #: Input parameters specified for action.
+    with_parameters: dict[str, facts.Value]
+
+    def __init__(
+        self,
+        definition: github_workflow_model.ActionStep,
+        context: core.ContextRef[GitHubActionsStepContext],
+        uses_name: str,
+        uses_version: str | None,
+        with_parameters: dict[str, facts.Value],
+    ) -> None:
+        """Initialize action step node.
+
+        Parameters
+        ----------
+        definition: github_workflow_model.ActionStep
+            Parsed step AST.
+        context: core.ContextRef[GitHubActionsStepContext]
+            Step context.
+        uses_name: str
+            Name of the action being invoked (without version component).
+        uses_version: str | None
+            Version of the action being invoked (if specified).
+ with_parameters: dict[str, facts.Value] + Input parameters specified for action. + """ + super().__init__() + self.definition = definition + self.context = context + self.uses_name = uses_name + self.uses_version = uses_version + self.with_parameters = with_parameters + + def identify_interpretations(self, state: core.State) -> dict[core.InterpretationKey, Callable[[], core.Node]]: + """Interpret the semantics of the different supported actions.""" + match self.uses_name: + case "actions/checkout": + + def build_checkout() -> core.Node: + return models.GitHubActionsGitCheckoutModelNode() + + return {"default": build_checkout} + case "actions/setup-java": + # Installs Java toolchain + def build_setup_java() -> core.Node: + return models.InstallPackageNode( + install_scope=self.context.ref.job_context.ref.filesystem.ref, + name=facts.StringLiteral("java"), + version=self.with_parameters.get("java-version", facts.StringLiteral("")), + distribution=self.with_parameters.get("distribution", facts.StringLiteral("")), + url=facts.StringLiteral("https://github.com/actions/setup-java"), + ) + + return {"default": build_setup_java} + case "graalvm/setup-graalvm": + # Installs Java toolchain + def build_setup_graalvm() -> core.Node: + return models.InstallPackageNode( + install_scope=self.context.ref.job_context.ref.filesystem.ref, + name=facts.StringLiteral("java"), + version=self.with_parameters.get("java-version", facts.StringLiteral("")), + distribution=self.with_parameters.get("distribution", facts.StringLiteral("graalvm")), + url=facts.StringLiteral("https://github.com/graalvm/setup-graalvm"), + ) + + return {"default": build_setup_graalvm} + + case "oracle-actions/setup-java": + # Installs Java toolchain + def build_setup_oracle_java() -> core.Node: + return models.InstallPackageNode( + install_scope=self.context.ref.job_context.ref.filesystem.ref, + name=facts.StringLiteral("java"), + version=self.with_parameters.get("release", facts.StringLiteral("")), + distribution=self.with_parameters.get("website", facts.StringLiteral("oracle.com")), + url=facts.StringLiteral("https://github.com/oracle-actions/setup-java"), + ) + + return {"default": build_setup_oracle_java} + case "actions/setup-python": + # Installs Python toolchain + def build_setup_python() -> core.Node: + return models.InstallPackageNode( + install_scope=self.context.ref.job_context.ref.filesystem.ref, + name=facts.StringLiteral("python"), + version=self.with_parameters.get("python-version", facts.StringLiteral("")), + distribution=facts.StringLiteral(""), + url=facts.StringLiteral(""), + ) + + return {"default": build_setup_python} + case "actions/upload-artifact": + # Uploads artifact to pipeline artifact storage.
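+ # Note: the "path" input may contain several newline-separated paths; each non-empty path is modelled below as its own artifact upload.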
+ if "name" in self.with_parameters and "path" in self.with_parameters: + split = evaluation.parse_str_expr_split(self.with_parameters["path"], "\n") + if len(split) == 1: + + def build_upload_artifact() -> core.Node: + return models.GitHubActionsUploadArtifactModelNode( + artifacts_scope=self.context.ref.job_context.ref.workflow_context.ref.artifacts.ref, + artifact_name=self.with_parameters["name"], + artifact_file=facts.UnaryStringOp(facts.UnaryStringOperator.BASENAME, split[0]), + filesystem_scope=self.context.ref.job_context.ref.filesystem.ref, + path=split[0], + ) + + return {"default": build_upload_artifact} + + def build_multiple_upload_artifact() -> core.Node: + seq: list[core.Node] = [ + models.GitHubActionsUploadArtifactModelNode( + artifacts_scope=self.context.ref.job_context.ref.workflow_context.ref.artifacts.ref, + artifact_name=self.with_parameters["name"], + artifact_file=facts.UnaryStringOp(facts.UnaryStringOperator.BASENAME, path), + filesystem_scope=self.context.ref.job_context.ref.filesystem.ref, + path=path, + ) + for path in [x for x in split if x != facts.StringLiteral("")] + ] + if len(seq) == 0: + return core.NoOpStatementNode() + return core.SimpleSequence(seq) + + return {"default": build_multiple_upload_artifact} + + case "actions/download-artifact": + # Downloads artifact from pipeline artifact storage. + if "name" in self.with_parameters: + + def build_download_artifact() -> core.Node: + return models.GitHubActionsDownloadArtifactModelNode( + artifacts_scope=self.context.ref.job_context.ref.workflow_context.ref.artifacts.ref, + artifact_name=self.with_parameters["name"], + filesystem_scope=self.context.ref.job_context.ref.filesystem.ref, + ) + + return {"default": build_download_artifact} + case "softprops/action-gh-release": + # Creates a GitHub release. 
+ if "files" in self.with_parameters: + split = evaluation.parse_str_expr_split(self.with_parameters["files"], "\n") + if len(split) == 1: + + def build_upload_release() -> core.Node: + return models.GitHubActionsReleaseModelNode( + artifacts_scope=self.context.ref.job_context.ref.workflow_context.ref.releases.ref, + artifact_name=facts.StringLiteral(str(id(self))), + artifact_file=facts.UnaryStringOp(facts.UnaryStringOperator.BASENAME, split[0]), + filesystem_scope=self.context.ref.job_context.ref.filesystem.ref, + path=split[0], + ) + + return {"default": build_upload_release} + + def build_multiple_upload_release() -> core.Node: + return core.SimpleSequence( + [ + models.GitHubActionsReleaseModelNode( + artifacts_scope=self.context.ref.job_context.ref.workflow_context.ref.releases.ref, + artifact_name=facts.StringLiteral(str(id(self))), + artifact_file=facts.UnaryStringOp(facts.UnaryStringOperator.BASENAME, path), + filesystem_scope=self.context.ref.job_context.ref.filesystem.ref, + path=path, + ) + for path in [x for x in split if x != facts.StringLiteral("")] + ] + ) + + return {"default": build_multiple_upload_release} + + def build_noop() -> core.Node: + return core.NoOpStatementNode() + + return {"default": build_noop} + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table containing the step id, name, action name, with parameters, and scopes.""" + result: dict[str, set[tuple[str | None, str]]] = {} + if "id" in self.definition: + result["step id"] = {(None, self.definition["id"])} + elif "name" in self.definition: + result["step_name"] = {(None, self.definition["name"])} + result["step uses"] = {(None, self.definition["uses"])} + + for key, val in self.with_parameters.items(): + result["with(" + key + ")"] = {(None, val.to_datalog_fact_string())} + + printing.add_context_owned_scopes_to_properties_table(result, self.context) + + return result + + +class GitHubActionsRunStepNode(core.ControlFlowGraphNode): + """Control-flow-graph node representing a GitHub Actions Run Step. + + Control flow structure executes the shell script defined by the step. + If an env block exists, it is applied beforehand. + """ + + #: Parsed step AST. + definition: github_workflow_model.RunStep + #: Node to apply effects of env block, if any. + env_block: RawGitHubActionsEnvNode | None + #: Shell script to be run. + shell_block: bash.RawBashScriptNode + #: Step context. + context: core.ContextRef[GitHubActionsStepContext] + #: Control flow graph + _cfg: core.ControlFlowGraph + + def __init__( + self, + definition: github_workflow_model.RunStep, + env_block: RawGitHubActionsEnvNode | None, + shell_block: bash.RawBashScriptNode, + context: core.ContextRef[GitHubActionsStepContext], + ) -> None: + """Initialize run step node. + + Typically, construction should be done via the create function rather than using this constructor directly. + + Parameters + ---------- + definition: github_workflow_model.RunStep + Parsed step AST. + env_block: RawGitHubActionsEnvNode | None + Node to apply effects of env block, if any. + shell_block: bash.RawBashScriptNode + Shell script to be run. + context: core.ContextRef[GitHubActionsStepContext] + Step context. 
+ """ + super().__init__() + self.definition = definition + self.env_block = env_block + self.shell_block = shell_block + self.context = context + + self._cfg = core.ControlFlowGraph.create_from_sequence( + list(filter(core.node_is_not_none, [self.env_block, self.shell_block])) + ) + + def children(self) -> Iterator[core.Node]: + """Yield the child nodes of this node.""" + if self.env_block is not None: + yield self.env_block + yield self.shell_block + + def get_entry(self) -> core.Node: + """Return the entry node.""" + return self._cfg.get_entry() + + def get_successors(self, node: core.Node, exit_type: core.ExitType) -> set[core.Node | core.ExitType]: + """Return the successors for a particular exit of a particular node.""" + return self._cfg.get_successors(node, core.DEFAULT_EXIT) + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table containing the step id, name, and scopes.""" + result: dict[str, set[tuple[str | None, str]]] = {} + if "id" in self.definition: + result["step id"] = {(None, self.definition["id"])} + elif "name" in self.definition: + result["step name"] = {(None, self.definition["name"])} + + printing.add_context_owned_scopes_to_properties_table(result, self.context) + return result + + @staticmethod + def create( + run_step: github_workflow_model.RunStep, context: core.NonOwningContextRef[GitHubActionsStepContext] + ) -> GitHubActionsRunStepNode: + """Create run step node from step AST. + + Parameters + ---------- + run_step: github_workflow_model.RunStep + Parsed step AST. + context: core.NonOwningContextRef[GitHubActionsStepContext] + Step context. + + Returns + ------- + GitHubActionsRunStepNode + The new run step node. + """ + env_block = None + if "env" in run_step: + env_block = RawGitHubActionsEnvNode(run_step["env"], context) + script_node = bash.RawBashScriptNode( + facts.StringLiteral(run_step["run"]), + core.OwningContextRef(bash.BashScriptContext.create_from_run_step(context, "")), + ) + return GitHubActionsRunStepNode(run_step, env_block, script_node, context) + + +class RawGitHubActionsEnvNode(core.InterpretationNode): + """Interpretation node representing an env block in a GitHub Actions Workflow/Job/Step. + + Defines how to interpret the declarative env block to generate imperative constructs to + write the values to the env variables. + """ + + #: Parsed env block AST. + definition: github_workflow_model.Env + #: Outer context. + context: core.ContextRef[GitHubActionsWorkflowContext | GitHubActionsJobContext | GitHubActionsStepContext] + + def __init__( + self, + definition: github_workflow_model.Env, + context: core.ContextRef[GitHubActionsWorkflowContext | GitHubActionsJobContext | GitHubActionsStepContext], + ) -> None: + """Initialize env block node. + + Parameters + ---------- + definition: github_workflow_model.Env + Parsed env block AST. + context: core.ContextRef[GitHubActionsWorkflowContext | GitHubActionsJobContext | GitHubActionsStepContext] + Outer context. 
+ """ + super().__init__() + self.definition = definition + self.context = context + + def identify_interpretations(self, state: core.State) -> dict[core.InterpretationKey, Callable[[], core.Node]]: + """Interpret declarative env block to generate imperative constructs to write to the env vars.""" + env = self.definition + if isinstance(env, dict): + + def build_env_writes() -> core.Node: + env_writes: dict[str, facts.Value] = {} + for key, val in env.items(): + if isinstance(val, str): + var_scope = ( + self.context.ref.job_context.ref.job_variables.ref + if isinstance(self.context.ref, GitHubActionsStepContext) + else ( + self.context.ref.job_variables.ref + if isinstance(self.context.ref, GitHubActionsJobContext) + else None + ) + ) + parsed_val = github_expr.extract_value_from_expr_string(val, var_scope) + if parsed_val is not None: + env_writes[key] = parsed_val + elif isinstance(val, bool): + env_writes[key] = facts.StringLiteral("true") if val else facts.StringLiteral("false") + else: + env_writes[key] = facts.StringLiteral(str(val)) + + if len(env_writes) == 0: + return core.NoOpStatementNode() + + return core.SimpleSequence( + [ + models.VarAssignNode( + models.VarAssignKind.GITHUB_ENV_VAR, self.context.ref.env.ref, facts.StringLiteral(var), val + ) + for var, val in env_writes.items() + ] + ) + + return {"default": build_env_writes} + + def build_noop() -> core.Node: + return core.NoOpStatementNode() + + return {"default": build_noop} + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table containing the scopes.""" + result: dict[str, set[tuple[str | None, str]]] = {} + + printing.add_context_owned_scopes_to_properties_table(result, self.context) + return result + + +class RawGitHubActionsMatrixNode(core.InterpretationNode): + """Interpretation node representing a matrix block in a GitHub Actions Job. + + Defines how to interpret the declarative matrix block to generate imperative constructs to + write the values to the matrix variables. + """ + + #: Parsed matrix block AST. + definition: github_workflow_model.Matrix + #: Outer job context. + context: core.ContextRef[GitHubActionsJobContext] + + def __init__( + self, + definition: github_workflow_model.Matrix, + context: core.ContextRef[GitHubActionsJobContext], + ) -> None: + """Initialize matrix node. + + Parameters + ---------- + definition: github_workflow_model.Matrix + Parsed matrix block AST. + context: core.ContextRef[GitHubActionsJobContext] + Outer job context. 
+ """ + super().__init__() + self.definition = definition + self.context = context + + def identify_interpretations(self, state: core.State) -> dict[core.InterpretationKey, Callable[[], core.Node]]: + """Interpret declarative matrix block to generate imperative constructs to write to the matrix variables.""" + matrix = self.definition + if isinstance(matrix, dict): + + def build_matrix_writes() -> core.Node: + matrix_writes: dict[str, list[facts.Value]] = defaultdict(list) + if isinstance(matrix, dict): + for key, vals in matrix.items(): + if isinstance(vals, list): + var_scope = self.context.ref.job_variables.ref + + for val in vals: + if isinstance(val, str): + parsed_val = github_expr.extract_value_from_expr_string(val, var_scope) + if parsed_val is not None: + matrix_writes[key].append(parsed_val) + elif isinstance(val, bool): + matrix_writes[key].append( + facts.StringLiteral("true") if val else facts.StringLiteral("false") + ) + else: + matrix_writes[key].append(facts.StringLiteral(str(val))) + + if len(matrix_writes) == 0: + return core.NoOpStatementNode() + + return core.SimpleSequence( + [ + core.SimpleAlternatives( + [ + models.VarAssignNode( + models.VarAssignKind.GITHUB_JOB_VAR, + self.context.ref.job_variables.ref, + facts.StringLiteral("matrix." + key), + val, + ) + for val in vals + ] + ) + for key, vals in matrix_writes.items() + ] + ) + + return {"default": build_matrix_writes} + + def build_noop() -> core.Node: + return core.NoOpStatementNode() + + return {"default": build_noop} + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table containing the scopes.""" + result: dict[str, set[tuple[str | None, str]]] = {} + + printing.add_context_owned_scopes_to_properties_table(result, self.context) + return result diff --git a/src/macaron/code_analyzer/dataflow_analysis/github_expr.py b/src/macaron/code_analyzer/dataflow_analysis/github_expr.py new file mode 100644 index 000000000..8961750a4 --- /dev/null +++ b/src/macaron/code_analyzer/dataflow_analysis/github_expr.py @@ -0,0 +1,141 @@ +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""Parser for GitHub Actions expression language.""" + +from typing import cast + +from lark import Lark, Token, Tree + +from macaron.code_analyzer.dataflow_analysis import facts + +# Parser for GitHub Actions expression language grammar. +github_expr_parser = Lark( + r""" + _expr: literal + | identifier + | _operator_expr + | function_call + + literal: BOOLEAN_LITERAL + | NULL_LITERAL + | NUMBER_LITERAL + | STRING_LITERAL + + BOOLEAN_LITERAL: "true" | "false" + + NULL_LITERAL: "null" + + NUMBER_LITERAL: SIGNED_NUMBER + + STRING_LITERAL: "'" STRING_INNER + "'" + + STRING_INNER: /.*?/s + + CNAMEWITHDASH: ("_"|LETTER) ("_"|"-"|LETTER|DIGIT)* + + identifier: CNAMEWITHDASH + + _operator_expr: paren_expr + | property_deref + | property_deref_object_filter + | index_expr + | not_expr + | and_expr + | or_expr + | less_than_expr + | less_than_equal_expr + | greater_than_expr + | greater_than_equal_expr + | equal_expr + | not_equal_expr + + paren_expr: "(" _expr ")" + property_deref: _expr "." 
identifier + property_deref_object_filter: _expr "." "*" + index_expr: _expr "[" _expr "]" + not_expr: "!" _expr + and_expr: _expr "&&" _expr + or_expr: _expr "||" _expr + less_than_expr: _expr "<" _expr + less_than_equal_expr: _expr "<=" _expr + greater_than_expr: _expr ">" _expr + greater_than_equal_expr: _expr ">=" _expr + equal_expr: _expr "==" _expr + not_equal_expr: _expr "!=" _expr + + function_call: identifier "(" _expr ("," _expr)* ")" + + %import common.SIGNED_NUMBER + %import common.WS + %import common.LETTER + %import common.DIGIT + %import common._STRING_INNER + %ignore WS + """, + start="_expr", +) + + +def extract_expr_variable_name(node: Token | Tree[Token]) -> str | None: + """Return variable access path for token. + + If the given node is a variable access or sequence of property accesses, return the + access path as a string, otherwise return None. + """ + if isinstance(node, Tree) and node.data == "property_deref": + rest = extract_expr_variable_name(node.children[0]) + property_identifier = cast(Tree, node.children[1]) + if rest is not None: + identifier = cast(Token, property_identifier.children[0]) + return rest + "." + identifier + elif isinstance(node, Tree) and node.data == "identifier": + identifier = cast(Token, node.children[0]) + return cast(str, identifier.value) + + return None + + +def extract_value_from_expr_string(s: str, var_scope: facts.Scope | None) -> facts.Value | None: + """Return a value expression representation of a string containing GitHub Actions expressions. + + GitHub Action expressions within the string are denoted by "${{ }}". + + Returns None if it is unrepresentable. + """ + cur_idx = 0 + cur_expr_begin = s.find("${{") + values: list[facts.Value] = [] + while cur_expr_begin != -1: + cur_str = s[cur_idx:cur_expr_begin] + values.append(facts.StringLiteral(cur_str)) + cur_expr_end = s.find("}}", cur_expr_begin) + cur_expr = s[cur_expr_begin + 3 : cur_expr_end] + parse_tree = github_expr_parser.parse(cur_expr) + + node = parse_tree.children[0] + + var_str = extract_expr_variable_name(node) + if var_str is not None and var_scope is not None: + values.append( + facts.Read( + loc=facts.Location(scope=var_scope, loc=facts.Variable(name=facts.StringLiteral(literal=var_str))) + ) + ) + else: + return None + + cur_idx = cur_expr_end + 2 + cur_expr_begin = s.find("${{", cur_idx) + last_str = s[cur_idx:] + + values.append(facts.StringLiteral(last_str)) + + if len(values) == 1: + return values[0] + + cur_concat = facts.BinaryStringOp.get_string_concat(values[0], values[1]) + + for val in values[2:]: + cur_concat = facts.BinaryStringOp.get_string_concat(cur_concat, val) + return cur_concat diff --git a/src/macaron/code_analyzer/dataflow_analysis/models.py b/src/macaron/code_analyzer/dataflow_analysis/models.py new file mode 100644 index 000000000..4528c1bc1 --- /dev/null +++ b/src/macaron/code_analyzer/dataflow_analysis/models.py @@ -0,0 +1,679 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""Models of supported commands, actions, etc. that may be invoked by build pipelines. + +Defines how they are modelled by the dataflow analysis in terms of their effect on the abstract state. 
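+ Most models are expressed as a set of parameterised write statements (see BoundParameterisedStatementSet below) that is instantiated with concrete subexpressions when the corresponding node is constructed.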
+""" + +from __future__ import annotations + +from enum import Enum, auto +from functools import cache + +from macaron.code_analyzer.dataflow_analysis import core, evaluation, facts + + +class BoundParameterisedStatementSet: + """Representation of a set of (simultaneous) write operations. + + Defined as a reference to a set of generic parameterised statements, along with a set of parameter bindings + that instantiate the parameterised statements with concrete subexpressions. + """ + + #: Set of generic parameterised statements. + parameterised_stmts: evaluation.StatementSet + #: Parameter bindings for values. + value_parameter_binds: dict[str, facts.Value] + #: Parameter bindings for locations. + location_parameter_binds: dict[str, facts.LocationSpecifier] + #: Parameter bindings for scopes. + scope_parameter_binds: dict[str, facts.Scope] + #: Instantiated statements. + instantiated_statements: evaluation.StatementSet + + def __init__( + self, + parameterised_stmts: evaluation.StatementSet, + value_parameter_binds: dict[str, facts.Value] | None = None, + location_parameter_binds: dict[str, facts.LocationSpecifier] | None = None, + scope_parameter_binds: dict[str, facts.Scope] | None = None, + ) -> None: + """Initialize bound parameterised statement set. + + Parameters + ---------- + parameterised_stmts: evaluation.StatementSet + Set of generic parameterised statements. + value_parameter_binds: dict[str, facts.Value] | None + Parameter bindings for value. + location_parameter_binds: dict[str, facts.LocationSpecifier] | None + Parameter bindings for locations. + scope_parameter_binds: dict[str, facts.Scope] | None + Parameter bindings for scopes. + """ + self.parameterised_stmts = parameterised_stmts + self.value_parameter_binds = value_parameter_binds or {} + self.location_parameter_binds = location_parameter_binds or {} + self.scope_parameter_binds = scope_parameter_binds or {} + + transformer = evaluation.ParameterPlaceholderTransformer( + allow_unbound_params=False, + value_parameter_binds=self.value_parameter_binds, + location_parameter_binds=self.location_parameter_binds, + scope_parameter_binds=self.scope_parameter_binds, + ) + self.instantiated_statements = transformer.transform_statement_set(parameterised_stmts) + + def get_statements(self) -> evaluation.StatementSet: + """Return instantiated statement set.""" + return self.instantiated_statements + + +class BoundParameterisedModelNode(core.StatementNode): + """Statement node that applies effects as defined in a provided model. + + Subclasses will define a statement node with a specific model. + """ + + #: Statement effects model. + stmts: BoundParameterisedStatementSet + + def __init__(self, stmts: BoundParameterisedStatementSet) -> None: + """Initialise model statement node.""" + super().__init__() + + self.stmts = stmts + + def apply_effects(self, before_state: core.State) -> dict[core.ExitType, core.State]: + """Apply effects as defined in a provided model.""" + return {core.DEFAULT_EXIT: self.stmts.get_statements().apply_effects(before_state)} + + +class InstallPackageNode(BoundParameterisedModelNode): + """Model for package installation. + + Stores a representation of the installed package into the abstract "installed packages" location. 
+ """ + + @staticmethod + @cache + def get_model() -> evaluation.StatementSet: + """Return the model.""" + return evaluation.StatementSet( + { + evaluation.WriteStatement( + facts.Location( + facts.ParameterPlaceholderScope("install_scope"), + facts.Installed(name=facts.ParameterPlaceholderValue("name")), + ), + facts.InstalledPackage( + name=facts.ParameterPlaceholderValue("name"), + version=facts.ParameterPlaceholderValue("version"), + distribution=facts.ParameterPlaceholderValue("distribution"), + url=facts.ParameterPlaceholderValue("url"), + ), + ) + } + ) + + #: Scope into which to install. + install_scope: facts.Scope + #: Package name. + name: facts.Value + #: Package version. + version: facts.Value + #: Package distribution. + distribution: facts.Value + #: URL of package. + url: facts.Value + + def __init__( + self, + install_scope: facts.Scope, + name: facts.Value, + version: facts.Value, + distribution: facts.Value, + url: facts.Value, + ) -> None: + """Initialize install package node. + + Parameters + ---------- + install_scope: facts.Scope + Scope into which to install. + name: facts.Value + Package name. + version: facts.Value + Package version. + distribution: facts.Value + Package distribution. + url: facts.Value + URL of package. + """ + self.install_scope = install_scope + self.name = name + self.version = version + self.distribution = distribution + self.url = url + + bound_stmts = BoundParameterisedStatementSet( + parameterised_stmts=self.get_model(), + value_parameter_binds={"name": name, "version": version, "distribution": distribution, "url": url}, + scope_parameter_binds={"install_scope": install_scope}, + ) + + super().__init__(bound_stmts) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties tables with the model parameters.""" + return { + "install_scope": {(None, self.install_scope.to_datalog_fact_string())}, + "name": {(None, self.name.to_datalog_fact_string())}, + "version": {(None, self.version.to_datalog_fact_string())}, + "distribution": {(None, self.distribution.to_datalog_fact_string())}, + "url": {(None, self.url.to_datalog_fact_string())}, + } + + +class VarAssignKind(Enum): + """Kind of variable assignment.""" + + #: Bash environment variable. + BASH_ENV_VAR = auto() + #: Bash function declaration. + BASH_FUNC_DECL = auto() + #: GitHub job variable. + GITHUB_JOB_VAR = auto() + #: GitHub environment variable. + GITHUB_ENV_VAR = auto() + #: Other uncategorized variable. + OTHER = auto() + + +class VarAssignNode(BoundParameterisedModelNode): + """Model for variable assignment. + + Stores the assigned value to the variable location. + """ + + @staticmethod + @cache + def get_model() -> evaluation.StatementSet: + """Return the model.""" + return evaluation.StatementSet( + { + evaluation.WriteStatement( + facts.Location( + facts.ParameterPlaceholderScope("var_scope"), + facts.Variable(facts.ParameterPlaceholderValue("var_name")), + ), + facts.ParameterPlaceholderValue("value"), + ) + } + ) + + #: The kind of variable. + kind: VarAssignKind + #: The scope in which the variable is stored. + var_scope: facts.Scope + #: The name of the variable. + var_name: facts.Value + #: The value to assign to the variable. + value: facts.Value + + def __init__(self, kind: VarAssignKind, var_scope: facts.Scope, var_name: facts.Value, value: facts.Value) -> None: + """Initialize variable assignment node. + + Parameters + ---------- + kind: VarAssignKind + The kind of variable. 
+ var_scope: facts.Scope + The scope in which the variable is stored. + var_name: facts.Value + The name of the variable. + value: facts.Value + The value to assign to the variable. + """ + self.kind = kind + self.var_scope = var_scope + self.var_name = var_name + self.value = value + + bound_stmts = BoundParameterisedStatementSet( + parameterised_stmts=self.get_model(), + value_parameter_binds={"var_name": var_name, "value": value}, + scope_parameter_binds={"var_scope": var_scope}, + ) + + super().__init__(bound_stmts) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table with the model parameters.""" + return { + "kind": {(None, self.kind.name)}, + "var_scope": {(None, self.var_scope.to_datalog_fact_string())}, + "var_name": {(None, self.var_name.to_datalog_fact_string())}, + "value": {(None, self.value.to_datalog_fact_string())}, + } + + + class GitHubActionsGitCheckoutModelNode(core.StatementNode): + """Model for GitHub git checkout operation. + + Currently modelled as a no-op. + """ + + def apply_effects(self, before_state: core.State) -> dict[core.ExitType, core.State]: + """Apply effects for git checkout (currently nothing).""" + state = core.State() + core.transfer_state(before_state, state) + return {core.DEFAULT_EXIT: state} + + + class GitHubActionsUploadArtifactModelNode(BoundParameterisedModelNode): + """Model for uploading artifacts to GitHub pipeline artifact storage. + + Stores the content read from a file to the artifact storage location. + """ + + @staticmethod + @cache + def get_model() -> evaluation.StatementSet: + """Return the model.""" + return evaluation.StatementSet( + { + evaluation.WriteStatement( + facts.Location( + facts.ParameterPlaceholderScope("artifacts_scope"), + facts.Artifact( + name=facts.ParameterPlaceholderValue("artifact_name"), + file=facts.ParameterPlaceholderValue("artifact_file"), + ), + ), + facts.Read( + facts.Location( + facts.ParameterPlaceholderScope("filesystem_scope"), + facts.Filesystem(facts.ParameterPlaceholderValue("path")), + ) + ), + ) + } + ) + + #: Scope for pipeline artifact storage. + artifacts_scope: facts.Scope + #: Artifact name. + artifact_name: facts.Value + #: Artifact filename. + artifact_file: facts.Value + #: Scope for filesystem from which to read file. + filesystem_scope: facts.Scope + #: File path to read artifact content from. + path: facts.Value + + def __init__( + self, + artifacts_scope: facts.Scope, + artifact_name: facts.Value, + artifact_file: facts.Value, + filesystem_scope: facts.Scope, + path: facts.Value, + ) -> None: + """Initialize upload artifacts node. + + Parameters + ---------- + artifacts_scope: facts.Scope + Scope for pipeline artifact storage. + artifact_name: facts.Value + Artifact name. + artifact_file: facts.Value + Artifact filename. + filesystem_scope: facts.Scope + Scope for filesystem from which to read file. + path: facts.Value + File path to read artifact content from.
+ """ + self.artifacts_scope = artifacts_scope + self.artifact_name = artifact_name + self.artifact_file = artifact_file + self.filesystem_scope = filesystem_scope + self.path = path + + bound_stmts = BoundParameterisedStatementSet( + parameterised_stmts=self.get_model(), + value_parameter_binds={"artifact_name": artifact_name, "artifact_file": artifact_file, "path": path}, + scope_parameter_binds={"artifacts_scope": artifacts_scope, "filesystem_scope": filesystem_scope}, + ) + + super().__init__(bound_stmts) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties tables with the model parameters.""" + return { + "artifacts_scope": {(None, self.artifacts_scope.to_datalog_fact_string())}, + "artifact_name": {(None, self.artifact_name.to_datalog_fact_string())}, + "artifact_file": {(None, self.artifact_file.to_datalog_fact_string())}, + "filesystem_scope": {(None, self.filesystem_scope.to_datalog_fact_string())}, + "path": {(None, self.path.to_datalog_fact_string())}, + } + + +class GitHubActionsDownloadArtifactModelNode(BoundParameterisedModelNode): + """Model for downloading artifacts from GitHub pipeline artifact storage. + + For each file in the artifact, reads the content of that artifact and + stores it to the filesystem under the same filename. + """ + + @staticmethod + @cache + def get_model() -> evaluation.StatementSet: + """Return model.""" + return evaluation.StatementSet( + { + evaluation.WriteStatement( + facts.Location( + facts.ParameterPlaceholderScope("filesystem_scope"), + facts.Filesystem( + facts.Read( + facts.Location( + facts.ParameterPlaceholderScope("artifacts_scope"), + facts.ArtifactAnyFilename(facts.ParameterPlaceholderValue("artifact_name")), + ) + ) + ), + ), + facts.Read( + facts.Location( + facts.ParameterPlaceholderScope("artifacts_scope"), + facts.Artifact( + name=facts.ParameterPlaceholderValue("artifact_name"), + file=facts.Read( + facts.Location( + facts.ParameterPlaceholderScope("artifacts_scope"), + facts.ArtifactAnyFilename(facts.ParameterPlaceholderValue("artifact_name")), + ) + ), + ), + ) + ), + ) + } + ) + + #: Scope for pipeline artifact storage. + artifacts_scope: facts.Scope + #: Artifact name. + artifact_name: facts.Value + #: Scope for filesystem to store artifacts to. + filesystem_scope: facts.Scope + + def __init__(self, artifacts_scope: facts.Scope, artifact_name: facts.Value, filesystem_scope: facts.Scope) -> None: + """Initialize download artifacts node. + + Parameters + ---------- + artifacts_scope: facts.Scope + Scope for pipeline artifact storage. + artifact_name: facts.Value + Artifact name. + filesystem_scope: facts.Scope + Scope for filesystem to store artifacts to. 
+ """ + self.artifacts_scope = artifacts_scope + self.artifact_name = artifact_name + self.filesystem_scope = filesystem_scope + + bound_stmts = BoundParameterisedStatementSet( + parameterised_stmts=self.get_model(), + value_parameter_binds={"artifact_name": artifact_name}, + scope_parameter_binds={"artifacts_scope": artifacts_scope, "filesystem_scope": filesystem_scope}, + ) + + super().__init__(bound_stmts) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties tables with the model parameters.""" + return { + "artifacts_scope": {(None, self.artifacts_scope.to_datalog_fact_string())}, + "artifact_name": {(None, self.artifact_name.to_datalog_fact_string())}, + "filesystem_scope": {(None, self.filesystem_scope.to_datalog_fact_string())}, + } + + +class GitHubActionsReleaseModelNode(GitHubActionsUploadArtifactModelNode): + """Model for uploading artifacts to a GitHub release. + + Modelled in the same way as artifact upload. + """ + + +class BashEchoNode(BoundParameterisedModelNode): + """Model for Bash echo command, which writes the echoed value to some location.""" + + @staticmethod + @cache + def get_model() -> evaluation.StatementSet: + """Return model.""" + return evaluation.StatementSet( + { + evaluation.WriteStatement( + facts.Location( + facts.ParameterPlaceholderScope("out_loc_scope"), + facts.ParameterPlaceholderLocation("out_loc_spec"), + ), + facts.ParameterPlaceholderValue("value"), + ) + } + ) + + #: Output location. + out_loc: facts.Location + #: Value written. + value: facts.Value + + def __init__(self, out_loc: facts.Location, value: facts.Value) -> None: + """Initialize echo node. + + Parameters + ---------- + out_loc: facts.Location + Output location. + value: facts.Value + Value written. + """ + self.out_loc = out_loc + self.value = value + + bound_stmts = BoundParameterisedStatementSet( + parameterised_stmts=self.get_model(), + value_parameter_binds={"value": value}, + location_parameter_binds={"out_loc_spec": out_loc.loc}, + scope_parameter_binds={"out_loc_scope": out_loc.scope}, + ) + + super().__init__(bound_stmts) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties tables with the model parameters.""" + return { + "out_loc": {(None, self.out_loc.to_datalog_fact_string())}, + "value": {(None, self.value.to_datalog_fact_string())}, + } + + +class Base64EncodeNode(BoundParameterisedModelNode): + """Model for Base64 encode operation. + + Reads a value from some location, Base64-encodes it and writes the result to another location. + """ + + @staticmethod + @cache + def get_model() -> evaluation.StatementSet: + """Return model.""" + return evaluation.StatementSet( + { + evaluation.WriteStatement( + facts.Location( + facts.ParameterPlaceholderScope("out_loc_scope"), + facts.ParameterPlaceholderLocation("out_loc_spec"), + ), + facts.UnaryStringOp( + facts.UnaryStringOperator.BASE64_ENCODE, + facts.Read( + facts.Location( + facts.ParameterPlaceholderScope("in_loc_scope"), + facts.ParameterPlaceholderLocation("in_loc_spec"), + ) + ), + ), + ) + } + ) + + #: Location to read input from. + in_loc: facts.Location + #: Location to write encoded output to. + out_loc: facts.Location + + def __init__(self, in_loc: facts.Location, out_loc: facts.Location) -> None: + """Initialize Base64 encode node. + + Parameters + ---------- + in_loc: facts.Location + Location to read input from. + out_loc: facts.Location + Location to write encoded output to. 
+ """ + self.in_loc = in_loc + self.out_loc = out_loc + + bound_stmts = BoundParameterisedStatementSet( + parameterised_stmts=self.get_model(), + location_parameter_binds={"out_loc_spec": out_loc.loc, "in_loc_spec": in_loc.loc}, + scope_parameter_binds={"out_loc_scope": out_loc.scope, "in_loc_scope": in_loc.scope}, + ) + + super().__init__(bound_stmts) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties tables with the model parameters.""" + return { + "in_loc": {(None, self.in_loc.to_datalog_fact_string())}, + "out_loc": {(None, self.out_loc.to_datalog_fact_string())}, + } + + +class Base64DecodeNode(BoundParameterisedModelNode): + """Model for Base64 decode operation. + + Reads a value from some location, Base64-decodes it and writes the result to another location. + """ + + @staticmethod + @cache + def get_model() -> evaluation.StatementSet: + """Return model.""" + return evaluation.StatementSet( + { + evaluation.WriteStatement( + facts.Location( + facts.ParameterPlaceholderScope("out_loc_scope"), + facts.ParameterPlaceholderLocation("out_loc_spec"), + ), + facts.UnaryStringOp( + facts.UnaryStringOperator.BASE64DECODE, + facts.Read( + facts.Location( + facts.ParameterPlaceholderScope("in_loc_scope"), + facts.ParameterPlaceholderLocation("in_loc_spec"), + ) + ), + ), + ) + } + ) + + #: Location to read input from. + in_loc: facts.Location + #: Location to write decoded output to. + out_loc: facts.Location + + def __init__(self, in_loc: facts.Location, out_loc: facts.Location) -> None: + """Initialize Base64 decode node. + + Parameters + ---------- + in_loc: facts.Location + Location to read input from. + out_loc: facts.Location + Location to write decoded output to. + """ + self.in_loc = in_loc + self.out_loc = out_loc + + bound_stmts = BoundParameterisedStatementSet( + parameterised_stmts=self.get_model(), + location_parameter_binds={"out_loc_spec": out_loc.loc, "in_loc_spec": in_loc.loc}, + scope_parameter_binds={"out_loc_scope": out_loc.scope, "in_loc_scope": in_loc.scope}, + ) + + super().__init__(bound_stmts) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties tables with the model parameters.""" + return { + "in_loc": {(None, self.in_loc.to_datalog_fact_string())}, + "out_loc": {(None, self.out_loc.to_datalog_fact_string())}, + } + + +class MavenBuildModelNode(BoundParameterisedModelNode): + """Model for Maven build commands. + + Maven build behaviour is approximated as writing some files under the target directory. + """ + + @staticmethod + @cache + def get_model() -> evaluation.StatementSet: + """Return model.""" + return evaluation.StatementSet( + { + evaluation.WriteStatement( + facts.Location( + facts.ParameterPlaceholderScope("filesystem_scope"), + facts.FilesystemAnyUnderDir(facts.StringLiteral("./target")), + ), + facts.ArbitraryNewData("mvn"), # TODO something better? + ) + } + ) + + #: Scope for filesystem written to. + filesystem_scope: facts.Scope + + def __init__(self, filesystem_scope: facts.Scope) -> None: + """Initialize Maven build node. + + Parameters + ---------- + filesystem_scope: facts.Scope + Scope for filesystem written to. 
+ """ + self.filesystem_scope = filesystem_scope + + bound_stmts = BoundParameterisedStatementSet( + parameterised_stmts=self.get_model(), scope_parameter_binds={"filesystem_scope": filesystem_scope} + ) + + super().__init__(bound_stmts) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties tables with the model parameters.""" + return {"filesystem_scope": {(None, self.filesystem_scope.to_datalog_fact_string())}} diff --git a/src/macaron/code_analyzer/dataflow_analysis/printing.py b/src/macaron/code_analyzer/dataflow_analysis/printing.py new file mode 100644 index 000000000..0ffd61813 --- /dev/null +++ b/src/macaron/code_analyzer/dataflow_analysis/printing.py @@ -0,0 +1,681 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""Functions for printing/displaying dataflow analysis nodes in the form of graphviz (dot) output. + +Allows the analysis representation and results to be rendered as a human-readable node-link graph. + +Makes use of graphviz's html-like label feature to add detailed information to each node. +Tables are specified in the form of a dict[str, set[tuple[str | None, str]], which is rendered as +a two-column table, with the first column containing each of the keys of the dict, and the second +column containing the corresponding set of values, as a nested vertical table, with each value having +an optional label that, if present, will be rendered in a visually distinguished manner alongside the +value. +""" + +from __future__ import annotations + +import dataclasses +from dataclasses import dataclass +from typing import TextIO + +from macaron.code_analyzer.dataflow_analysis import core + + +def print_as_dot_graph(node: core.Node, out: TextIO, include_properties: bool, include_states: bool) -> None: + """Print root node as dot graph. + + Parameters + ---------- + node: core.Node + The root node to print. + out: TextIO + Output stream to print to. + include_properties: bool + Whether to include detail on the properties of each node (disable to make nodes simpler/smaller). + include_states: bool + Whether to include detail on the abstract state at each node (disable to make nodes simpler/smaller). + """ + out.write("digraph {\n") + out.write('node [style="filled", fillcolor="white"]\n') + print_as_dot_string(node, out, include_properties=include_properties, include_states=include_states) + out.write("}\n") + + +def get_printable_table_for_state( + state: core.State, state_filter: core.StateTransferFilter | None = None +) -> dict[str, set[tuple[str | None, str]]]: + """Return a table of the stringified representation of the state. + + Consists of a mapping of storage locations to the set of values they may contain + (see module comment for description of the return type). + + Values are additionally labeled with whether they were new and not copied, and whether + they will be excluded by the given filter. + """ + result: dict[str, set[tuple[str | None, str]]] = {} + for key, vals in state.state.items(): + vals_strs: set[tuple[str | None, str]] = { + ( + str(label.sequence_number) + + ("*" if not label.copied else "") + + ("!" 
if state_filter is not None and not state_filter.should_transfer(key) else ""), + val.to_datalog_fact_string(), + ) + for val, label in vals.items() + } + key_str = key.to_datalog_fact_string() + result[key_str] = vals_strs + return result + + +def print_as_dot_string(node: core.Node, out: TextIO, include_properties: bool, include_states: bool) -> None: + """Print node as dot representation (to be embedded within a dot graph). + + Parameters + ---------- + node: core.Node + The node to print. + out: TextIO + Output stream to print to. + include_properties: bool + Whether to include detail on the properties of each node (disable to make nodes simpler/smaller). + include_states: bool + Whether to include detail on the abstract state at each node (disable to make nodes simpler/smaller). + """ + match node: + case core.ControlFlowGraphNode(): + print_cfg_node_as_dot_string(node, out, include_properties, include_states) + case core.StatementNode(): + print_statement_node_as_dot_string(node, out, include_properties, include_states) + case core.InterpretationNode(): + print_interpretation_node_as_dot_string(node, out, include_properties, include_states) + + +def print_cfg_node_as_dot_string( + cfg_node: core.ControlFlowGraphNode, out: TextIO, include_properties: bool, include_states: bool +) -> None: + """Print control-flow-graph node as dot representation (to be embedded within a dot graph). + + Parameters + ---------- + cfg_node: core.ControlFlowGraphNode + The control-flow-graph node to print. + out: TextIO + Output stream to print to. + include_properties: bool + Whether to include detail on the properties of each node (disable to make nodes simpler/smaller). + include_states: bool + Whether to include detail on the abstract state at each node (disable to make nodes simpler/smaller). 
+ """ + out.write("subgraph cluster_n" + str(id(cfg_node)) + "{\n") + out.write("style=filled\n") + out.write('fillcolor="#fdf3e4ff"\n') + + subtables: list[tuple[str, dict[str, set[tuple[str | None, str]]], DotHtmlLikeTableConfiguration]] = [] + if include_properties: + properties_table = cfg_node.get_printable_properties_table() + if len(properties_table) > 0: + subtables.append( + ( + "Properties", + cfg_node.get_printable_properties_table(), + DOT_HTML_LIKE_TABLE_CONFIG_CONTROL_FLOW_GRAPH_NODE_PROPERTIES, + ) + ) + + if include_states: + subtables.append( + ( + "Before State", + get_printable_table_for_state(cfg_node.before_state), + DOT_HTML_LIKE_TABLE_CONFIG_CONTROL_FLOW_GRAPH_NODE_PROPERTIES, + ) + ) + if core.DEFAULT_EXIT in cfg_node.exit_states: + subtables.append( + ( + "Exit State", + get_printable_table_for_state( + cfg_node.exit_states[core.DEFAULT_EXIT], cfg_node.get_exit_state_transfer_filter() + ), + DOT_HTML_LIKE_TABLE_CONFIG_CONTROL_FLOW_GRAPH_NODE_PROPERTIES, + ) + ) + for exit_type, exit_state in cfg_node.exit_states.items(): + if not isinstance(exit_type, core.DefaultExit): + subtables.append( + ( + "Exit State (" + exit_type.__class__.__name__ + ")", + get_printable_table_for_state(exit_state, cfg_node.get_exit_state_transfer_filter()), + DOT_HTML_LIKE_TABLE_CONFIG_CONTROL_FLOW_GRAPH_NODE_PROPERTIES, + ) + ) + + out.write( + produce_node_dot_def( + node_id=("n" + str(id(cfg_node))), + node_kind="ControlFlowGraph", + node_type=cfg_node.__class__.__name__, + node_label=( + "[" + + ", ".join( + [str(cfg_node.created_debug_sequence_num)] + + ["(" + str(b) + "-" + str(e) + ")" for b, e in cfg_node.processed_log] + ) + + "]" + if include_states + else None + ), + config=DOT_HTML_LIKE_TABLE_CONFIG_CONTROL_FLOW_GRAPH_NODE, + subtables=subtables, + ) + + "\n" + ) + + i = 0 + out.write("n" + str(id(cfg_node)) + " -> " + "c" + str(id(cfg_node.get_entry())) + ' [label="entry"]\n') + + for child_node in cfg_node.children(): + out.write( + "c" + + str(id(child_node)) + + ' [label="' + + str(i) + + '", shape=circle, fontcolor="#ffffffff", fillcolor="#aa643bff"]\n' + ) + out.write( + "e" + + str(id(cfg_node)) + + '_exit [label="exit", shape=circle, fontcolor="#ffffffff", fillcolor="#aa643bff"]\n' + ) + next_alt_exit_id = 0 + alt_exit_ids: dict[core.ExitType, int] = {} + + for exit_type in child_node.exit_states: + successors = cfg_node.get_successors(child_node, exit_type) + for successor in successors: + if isinstance(successor, core.Node): + out.write("c" + str(id(child_node)) + " -> " + "c" + str(id(successor)) + ' [label=""]\n') + elif isinstance(successor, core.DefaultExit): + out.write("c" + str(id(child_node)) + " -> " + "e" + str(id(cfg_node)) + "_exit" + ' [label=""]\n') + else: + if successor not in alt_exit_ids: + alt_exit_ids[successor] = next_alt_exit_id + next_alt_exit_id = next_alt_exit_id + 1 + alt_exit_id = alt_exit_ids[successor] + out.write( + "c" + + str(id(child_node)) + + " -> " + + "e" + + str(id(cfg_node)) + + "_alt_exit_" + + str(alt_exit_id) + + ' [label=""]\n' + ) + + for alt_exit_id in alt_exit_ids.values(): + out.write( + "e" + + str(id(cfg_node)) + + "_alt_exit_" + + str(alt_exit_id) + + ' [label="alt-exit", shape=circle, fontcolor="#ffffffff", fillcolor="#aa643bff"]\n' + ) + i = i + 1 + out.write("}\n") + + for child_node in cfg_node.children(): + out.write("c" + str(id(child_node)) + " -> " + "n" + str(id(child_node)) + ' [label=""]\n') + + for child_node in cfg_node.children(): + print_as_dot_string(child_node, out, 
include_properties=include_properties, include_states=include_states) + + + def print_statement_node_as_dot_string( + node: core.StatementNode, out: TextIO, include_properties: bool, include_states: bool + ) -> None: + """Print statement node as dot representation (to be embedded within a dot graph). + + Parameters + ---------- + node: core.StatementNode + The statement node to print. + out: TextIO + Output stream to print to. + include_properties: bool + Whether to include detail on the properties of each node (disable to make nodes simpler/smaller). + include_states: bool + Whether to include detail on the abstract state at each node (disable to make nodes simpler/smaller). + """ + subtables: list[tuple[str, dict[str, set[tuple[str | None, str]]], DotHtmlLikeTableConfiguration]] = [] + + if include_properties: + properties_table = node.get_printable_properties_table() + if len(properties_table) > 0: + subtables.append( + ( + "Properties", + node.get_printable_properties_table(), + DOT_HTML_LIKE_TABLE_CONFIG_STATEMENT_NODE_PROPERTIES, + ) + ) + + if include_states: + subtables.append( + ( + "Before State", + get_printable_table_for_state(node.before_state), + DOT_HTML_LIKE_TABLE_CONFIG_STATEMENT_NODE_PROPERTIES, + ) + ) + if core.DEFAULT_EXIT in node.exit_states: + subtables.append( + ( + "Exit State", + get_printable_table_for_state( + node.exit_states[core.DEFAULT_EXIT], node.get_exit_state_transfer_filter() + ), + DOT_HTML_LIKE_TABLE_CONFIG_STATEMENT_NODE_PROPERTIES, + ) + ) + for exit_type, exit_state in node.exit_states.items(): + if not isinstance(exit_type, core.DefaultExit): + subtables.append( + ( + "Exit State (" + exit_type.__class__.__name__ + ")", + get_printable_table_for_state(exit_state, node.get_exit_state_transfer_filter()), + DOT_HTML_LIKE_TABLE_CONFIG_STATEMENT_NODE_PROPERTIES, + ) + ) + + out.write( + produce_node_dot_def( + node_id=("n" + str(id(node))), + node_kind="Statement", + node_type=node.__class__.__name__, + node_label=( + "[" + + ", ".join( + [str(node.created_debug_sequence_num)] + + ["(" + str(b) + "-" + str(e) + ")" for b, e in node.processed_log] + ) + + "]" + if include_states + else None + ), + config=DOT_HTML_LIKE_TABLE_CONFIG_STATEMENT_NODE, + subtables=subtables, + ) + + "\n" + ) + + + def print_interpretation_node_as_dot_string( + node: core.InterpretationNode, out: TextIO, include_properties: bool, include_states: bool + ) -> None: + """Print interpretation node as dot representation (to be embedded within a dot graph). + + Parameters + ---------- + node: core.InterpretationNode + The interpretation node to print. + out: TextIO + Output stream to print to. + include_properties: bool + Whether to include detail on the properties of each node (disable to make nodes simpler/smaller). + include_states: bool + Whether to include detail on the abstract state at each node (disable to make nodes simpler/smaller).
+ """ + subtables: list[tuple[str, dict[str, set[tuple[str | None, str]]], DotHtmlLikeTableConfiguration]] = [] + + if include_properties: + properties_table = node.get_printable_properties_table() + if len(properties_table) > 0: + subtables.append( + ( + "Properties", + node.get_printable_properties_table(), + DOT_HTML_LIKE_TABLE_CONFIG_INTERPRETATION_NODE_PROPERTIES, + ) + ) + + if include_states: + subtables.append( + ( + "Before State", + get_printable_table_for_state(node.before_state), + DOT_HTML_LIKE_TABLE_CONFIG_INTERPRETATION_NODE_PROPERTIES, + ) + ) + if core.DEFAULT_EXIT in node.exit_states: + subtables.append( + ( + "Exit State", + get_printable_table_for_state( + node.exit_states[core.DEFAULT_EXIT], node.get_exit_state_transfer_filter() + ), + DOT_HTML_LIKE_TABLE_CONFIG_INTERPRETATION_NODE_PROPERTIES, + ) + ) + for exit_type, exit_state in node.exit_states.items(): + if not isinstance(exit_type, core.DefaultExit): + subtables.append( + ( + "Exit State + (" + exit_type.__class__.__name__ + ")", + get_printable_table_for_state(exit_state, node.get_exit_state_transfer_filter()), + DOT_HTML_LIKE_TABLE_CONFIG_INTERPRETATION_NODE_PROPERTIES, + ) + ) + + out.write( + produce_node_dot_def( + node_id=("n" + str(id(node))), + node_kind="Interpretation", + node_type=node.__class__.__name__, + node_label=( + "[" + + ", ".join( + [str(node.created_debug_sequence_num)] + + ["(" + str(b) + "-" + str(e) + ")" for b, e in node.processed_log] + ) + + "]" + if include_states + else None + ), + config=DOT_HTML_LIKE_TABLE_CONFIG_INTERPRETATION_NODE, + subtables=subtables, + ) + + "\n" + ) + for child_node in node.interpretations.values(): + out.write("n" + str(id(node)) + " -> " + "n" + str(id(child_node)) + ' [label="interpretation"]\n') + for child_node in node.interpretations.values(): + print_as_dot_string(child_node, out, include_properties=include_properties, include_states=include_states) + + +def escape_for_dot_html_like_label(s: str) -> str: + """Return string escape for inclusion in a dot html-like label.""" + return s.replace("&", "&").replace('"', """).replace("<", "<").replace(">", ">") + + +@dataclass(frozen=True) +class DotHtmlLikeTableConfiguration: + """Configuration for rendering of dot html-like table.""" + + #: Background colour for table header. + header_colour: str + #: Font colour for table header. + header_font_colour: str + #: Font size for table header. + header_font_size: int + #: Whether font of table header should be bold. + header_font_bold: bool + #: Background colour for table body. + body_colour: str + #: Font colour for table body. + body_font_colour: str + #: Font size for table body. 
+
+
+DARK_BLUE = "#6f757eff"
+LIGHT_BLUE = "#dae2efff"
+DARK_BROWN = "#aa643bff"
+LIGHT_BROWN = "#f5debdff"
+DARK_PINK = "#a36472ff"
+LIGHT_PINK = "#f6dae1ff"
+LIGHT_TEXT = "#ffffffff"
+DARK_TEXT = "#161513ff"
+DARK_GREY = "#7a736eff"
+LIGHT_GREY = "#e4e1dcff"
+
+
+DOT_HTML_LIKE_TABLE_CONFIG_INTERPRETATION_NODE = DotHtmlLikeTableConfiguration(
+    header_colour=DARK_PINK,
+    header_font_colour=LIGHT_TEXT,
+    header_font_size=24,
+    header_font_bold=True,
+    body_colour=LIGHT_PINK,
+    body_font_colour=DARK_TEXT,
+    body_font_size=6,
+)
+
+DOT_HTML_LIKE_TABLE_CONFIG_INTERPRETATION_NODE_PROPERTIES = dataclasses.replace(
+    DOT_HTML_LIKE_TABLE_CONFIG_INTERPRETATION_NODE, header_font_size=12
+)
+
+DOT_HTML_LIKE_TABLE_CONFIG_CONTROL_FLOW_GRAPH_NODE = DotHtmlLikeTableConfiguration(
+    header_colour=DARK_BROWN,
+    header_font_colour=LIGHT_TEXT,
+    header_font_size=24,
+    header_font_bold=True,
+    body_colour=LIGHT_BROWN,
+    body_font_colour=DARK_TEXT,
+    body_font_size=6,
+)
+
+DOT_HTML_LIKE_TABLE_CONFIG_CONTROL_FLOW_GRAPH_NODE_PROPERTIES = dataclasses.replace(
+    DOT_HTML_LIKE_TABLE_CONFIG_CONTROL_FLOW_GRAPH_NODE, header_font_size=12
+)
+
+DOT_HTML_LIKE_TABLE_CONFIG_STATEMENT_NODE = DotHtmlLikeTableConfiguration(
+    header_colour=DARK_BLUE,
+    header_font_colour=LIGHT_TEXT,
+    header_font_size=24,
+    header_font_bold=True,
+    body_colour=LIGHT_BLUE,
+    body_font_colour=DARK_TEXT,
+    body_font_size=6,
+)
+
+DOT_HTML_LIKE_TABLE_CONFIG_STATEMENT_NODE_PROPERTIES = dataclasses.replace(
+    DOT_HTML_LIKE_TABLE_CONFIG_STATEMENT_NODE, header_font_size=12
+)
+
+DOT_HTML_LIKE_TABLE_CONFIG_STATE = DotHtmlLikeTableConfiguration(
+    header_colour=DARK_GREY,
+    header_font_colour=LIGHT_TEXT,
+    header_font_size=12,
+    header_font_bold=True,
+    body_colour=LIGHT_GREY,
+    body_font_colour=DARK_TEXT,
+    body_font_size=6,
+)
+
+
+def truncate_long_strings_for_display(s: str) -> str:
+    """Truncate a long string if necessary for display."""
+    if len(s) > 100:
+        return s[:100] + "..."
+    return s
+
+
+def produce_dot_html_like_table(
+    header: str, data: dict[str, set[tuple[str | None, str]]], config: DotHtmlLikeTableConfiguration
+) -> str:
+    """Return the given data table rendered as a dot html-like label table.
+
+    See module comment for description of how data tables are rendered.
+    """
+    lines: list[str] = []
+    lines.append('<table border="0" cellborder="1" cellspacing="0">')
+    lines.append(
+        '    <tr><td bgcolor="'
+        + config.header_colour
+        + '"><font color="'
+        + config.header_font_colour
+        + '" point-size="'
+        + str(config.header_font_size)
+        + '">'
+        + ("<b>" if config.header_font_bold else "")
+        + escape_for_dot_html_like_label(header)
+        + ("</b>" if config.header_font_bold else "")
+        + "</font></td></tr>"
+    )
+
+    for key, vals in data.items():
+        lines.append(
+            '    <tr><td bgcolor="'
+            + config.body_colour
+            + '"><font color="'
+            + config.body_font_colour
+            + '" point-size="'
+            + str(config.body_font_size)
+            + '">'
+            + escape_for_dot_html_like_label(key)
+            + "</font></td>"
+        )
+        if len(vals) > 0:
+            for val in vals:
+                label_part = (
+                    (
+                        '<font point-size="'
+                        + str(config.body_font_size)
+                        + '">['
+                        + escape_for_dot_html_like_label(val[0])
+                        + "]</font> "
+                    )
+                    if val[0] is not None
+                    else ""
+                )
+                lines.append(
+                    "        <td>"
+                    + label_part
+                    + '<font color="'
+                    + config.body_font_colour
+                    + '" point-size="'
+                    + str(config.body_font_size)
+                    + '">'
+                    + escape_for_dot_html_like_label(truncate_long_strings_for_display(val[1]))
+                    + "</font></td>"
+                )
+        else:
+            lines.append("        <td></td>")
+
+        lines.append("    </tr>")
+
+    lines.append("</table>")
+
+    return "\n".join(lines)
+
+
+def produce_node_dot_html_like_label(
+    node_kind: str,
+    node_type: str,
+    node_label: str | None,
+    config: DotHtmlLikeTableConfiguration,
+    subtables: list[tuple[str, dict[str, set[tuple[str | None, str]]], DotHtmlLikeTableConfiguration]],
+) -> str:
+    """Return the given node table data rendered as a dot html-like label table.
+
+    Contains nested tables for each subtable (see module comment for description of how data tables are rendered).
+    """
+    lines: list[str] = []
+    lines.append('< <table border="0" cellborder="1" cellspacing="0">')
+    lines.append(
+        '    <tr><td bgcolor="'
+        + config.header_colour
+        + '"><font color="'
+        + config.header_font_colour
+        + '" point-size="'
+        + str(config.header_font_size)
+        + '">'
+        + ("<b>" if config.header_font_bold else "")
+        + escape_for_dot_html_like_label(node_kind)
+        + ("</b>" if config.header_font_bold else "")
+        + "</font></td></tr>"
+    )
+    lines.append(
+        '    <tr><td bgcolor="'
+        + config.header_colour
+        + '"><font color="'
+        + config.header_font_colour
+        + '" point-size="'
+        + str(config.header_font_size)
+        + '">'
+        + ("<b>" if config.header_font_bold else "")
+        + escape_for_dot_html_like_label(node_type)
+        + ("</b>" if config.header_font_bold else "")
+        + "</font></td></tr>"
+    )
+    if node_label is not None:
+        lines.append(
+            "    <tr><td>"
+            + (
+                ('<font point-size="12">' + escape_for_dot_html_like_label(node_label) + "</font>")
+                if node_label is not None
+                else ""
+            )
+            + "</td></tr>"
+        )
+
+    for subtable in subtables:
+        subtable_header, subtable_data, subtable_config = subtable
+        lines.append(
+            "    <tr><td>"
+            + produce_dot_html_like_table(subtable_header, subtable_data, subtable_config)
+            + "</td></tr>"
+        )
+
+    lines.append("</table> >")
+
+    return "\n".join(lines)
+
+
+def produce_node_dot_def(
+    node_id: str,
+    node_kind: str,
+    node_type: str,
+    node_label: str | None,
+    config: DotHtmlLikeTableConfiguration,
+    subtables: list[tuple[str, dict[str, set[tuple[str | None, str]]], DotHtmlLikeTableConfiguration]],
+) -> str:
+    """Return the given node table data rendered as a dot node containing an html-like label table.
+
+    Contains nested tables for each subtable (see module comment for description of how data tables
+    are rendered).
+    """
+    return (
+        '"'
+        + node_id
+        + '" [shape=rectangle, fillcolor="'
+        + config.body_colour
+        + '" fontname="Oracle Sans Tab", label='
+        + produce_node_dot_html_like_label(node_kind, node_type, node_label, config, subtables)
+        + "]"
+    )
+
+
+def add_context_owned_scopes_to_properties_table(
+    table: dict[str, set[tuple[str | None, str]]], context: core.ContextRef[core.Context]
+) -> None:
+    """Add an entry to the given data table listing the scopes owned by the given context."""
+    owned_scopes = core.get_owned_scopes(context)
+    if len(owned_scopes) > 0:
+        table["scopes"] = {(None, scope.to_datalog_fact_string(include_outer_scope=True)) for scope in owned_scopes}
diff --git a/src/macaron/code_analyzer/dataflow_analysis/run_analysis_standalone.py b/src/macaron/code_analyzer/dataflow_analysis/run_analysis_standalone.py
new file mode 100644
index 000000000..faaf084ea
--- /dev/null
+++ b/src/macaron/code_analyzer/dataflow_analysis/run_analysis_standalone.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+"""Module providing entry point to run dataflow analysis independently of Macaron command.
+
+For experimentation and debugging purposes only.
+""" + +import sys + +from macaron.code_analyzer.dataflow_analysis import analysis, bash, core, github, printing +from macaron.slsa_analyzer.build_tool import Maven + + +def main() -> None: + """Entry point for running standalone analysis.""" + raw_workflow_node = analysis.analyse_github_workflow_file(sys.argv[1], None) + with open("dot", "w", encoding="utf-8") as f: + printing.print_as_dot_graph(raw_workflow_node, f, include_properties=True, include_states=True) + + nodes: list[core.Node] = [raw_workflow_node] + while len(nodes) > 0: + node = nodes.pop() + + if isinstance(node, github.GitHubActionsActionStepNode): + print("Action {") # noqa: T201 + print(" name: " + node.uses_name) # noqa: T201 + print(" version: " + node.uses_version if node.uses_version is not None else "") # noqa: T201 + print(" with {") # noqa: T201 + for key, val in node.with_parameters.items(): + print(" " + key + ": " + val.to_datalog_fact_string()) # noqa: T201 + print(" }") # noqa: T201 + print("}") # noqa: T201 + if isinstance(node, bash.BashSingleCommandNode): + print("REACHABLE SECRETS: " + str(analysis.get_reachable_secrets(node))) # noqa: T201 + for child in node.children(): + nodes.append(child) + + build_tool = Maven() + + for build_cmd in analysis.get_build_tool_commands(core.NodeForest([raw_workflow_node]), build_tool): + print("build command: " + str(build_cmd["command"])) # noqa: T201 + + +if __name__ == "__main__": + main() diff --git a/src/macaron/parsers/bashparser.py b/src/macaron/parsers/bashparser.py index 0d5cd66c1..ac2ceed68 100644 --- a/src/macaron/parsers/bashparser.py +++ b/src/macaron/parsers/bashparser.py @@ -13,65 +13,16 @@ import logging import os import subprocess # nosec B404 -from enum import Enum -from typing import Any +from typing import cast -from macaron.code_analyzer.call_graph import BaseNode from macaron.config.defaults import defaults from macaron.config.global_config import global_config -from macaron.errors import CallGraphError, ParseError -from macaron.parsers.actionparser import get_run_step -from macaron.parsers.github_workflow_model import Step +from macaron.errors import ParseError +from macaron.parsers.bashparser_model import File, Word logger: logging.Logger = logging.getLogger(__name__) -class BashScriptType(Enum): - """This class is used for different bash script types.""" - - NONE = "None" - INLINE = "inline" # Inline bash script. - FILE = "file" # Bash script file. - - -class BashNode(BaseNode): - """This class represents a callgraph node for bash commands.""" - - def __init__( - self, - name: str, - node_type: BashScriptType, - source_path: str, - parsed_step_obj: Step | None, - parsed_bash_obj: dict, - **kwargs: Any, - ) -> None: - """Initialize instance. - - Parameters - ---------- - name : str - Name of the bash script file or the step name if the script is inlined. - node_type : BashScriptType - The type of the script. - source_path : str - The path of the script. - parsed_step_obj : Step | None - The parsed step object. - parsed_bash_obj : dict - The parsed bash script object. - """ - super().__init__(**kwargs) - self.name = name - self.node_type: BashScriptType = node_type - self.source_path = source_path - self.parsed_step_obj = parsed_step_obj - self.parsed_bash_obj = parsed_bash_obj - - def __str__(self) -> str: - return f"BashNode({self.name},{self.node_type})" - - def parse_file(file_path: str, macaron_path: str | None = None) -> dict: """Parse a bash script file. 
@@ -157,111 +108,114 @@ def parse(bash_content: str, macaron_path: str | None = None) -> dict:
         raise ParseError("Error while loading the parsed bash script.") from error
 
 
-def create_bash_node(
-    name: str,
-    node_id: str | None,
-    node_type: BashScriptType,
-    source_path: str,
-    ci_step_ast: Step | None,
-    repo_path: str,
-    caller: BaseNode,
-    recursion_depth: int,
-    macaron_path: str | None = None,
-) -> BashNode:
-    """Create a callgraph node for a bash script.
+def parse_raw(bash_content: str, macaron_path: str | None = None) -> File:
+    """Parse a bash script's content into a full (raw) typed AST.
+
+    Parameters
+    ----------
+    bash_content : str
+        Bash script content.
+    macaron_path : str | None
+        Macaron's root path (optional).
+
+    Returns
+    -------
+    bashparser_model.File
+        The parsed bash script AST in typed JSON (dict) format.
+
+    Raises
+    ------
+    ParseError
+        When parsing fails with errors.
+    """
+    if not macaron_path:
+        macaron_path = global_config.macaron_path
+    cmd = [
+        os.path.join(macaron_path, "bin", "bashparser"),
+        "-input",
+        bash_content,
+        "-raw",
+    ]
+
+    try:
+        result = subprocess.run(  # nosec B603
+            cmd,
+            capture_output=True,
+            check=True,
+            cwd=macaron_path,
+            timeout=defaults.getint("bashparser", "timeout", fallback=30),
+        )
+    except (
+        subprocess.CalledProcessError,
+        subprocess.TimeoutExpired,
+        FileNotFoundError,
+    ) as error:
+        raise ParseError("Error while parsing bash script.") from error
+
+    try:
+        if result.returncode == 0:
+            return cast(File, json.loads(result.stdout.decode("utf-8")))
+
+        raise ParseError(f"Bash script parser failed: {result.stderr.decode('utf-8')}")
+
+    except json.JSONDecodeError as error:
+        raise ParseError("Error while loading the parsed bash script.") from error
 
-    A bash node can have the following types:
-    * :class:`BashScriptType.INLINE` when it is inlined in a CI workflow.
-    * :class:`BashScriptType.FILE` when it is a bash script file.
 
+def parse_expr(bash_expr_content: str, macaron_path: str | None = None) -> list[Word]:
+    """Parse a bash expression's content.
+
     Parameters
     ----------
-    name: str
-        A name to be used as the identifier of the node.
-    node_id: str | None
-        The node ID if defined.
-    node_type: BashScriptType
-        The type of the node.
-    source_path: str
-        The file that contains the bash script.
-    ci_step_ast: Step | None
-        The AST of the CI step that runs a bash script.
-    repo_path: str
-        The path to the target repo.
-    caller: BaseNode
-        The caller node.
-    recursion_depth: int
-        The number of times this function is called recursively.
-    macaron_path=None
-        The path to the Macaron module.
+    bash_expr_content : str
+        Bash expression content.
+    macaron_path : str | None
+        Macaron's root path (optional).
 
     Returns
     -------
-    BashNode
-        A bash node object.
+    list[bashparser_model.Word]
+        The parsed bash expression AST in typed JSON (dict) format.
 
     Raises
     ------
-    CallGraphError
-        When unable to create a bash node.
+    ParseError
+        When parsing fails with errors.
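+
+    Notes
+    -----
+    As an illustrative expectation (not a doctest): an expression such as
+    ``$FOO/bin`` should parse into a single ``Word`` whose ``Parts`` hold a
+    ``ParamExp`` for ``$FOO`` followed by a ``Lit`` for ``/bin``; see
+    ``bashparser_model`` for the exact node shapes.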
""" - if recursion_depth > defaults.getint("bashparser", "recursion_depth", fallback=3): - raise CallGraphError(f"The analysis has reached maximum recursion depth {recursion_depth} at {source_path}.") - parsed_bash_script = {} - working_dir = None - match node_type: - case BashScriptType.INLINE: - if ci_step_ast is None: - raise CallGraphError(f"Unable to find the parsed AST for the CI step at {source_path}.") - working_dir = ci_step_ast.get("working-directory") - run_script = get_run_step(ci_step_ast) - if run_script is None: - raise CallGraphError(f"Invalid run step at {source_path}.") - try: - parsed_bash_script = parse(run_script, macaron_path=macaron_path) - except ParseError as error: - logger.debug(error) - case BashScriptType.FILE: - try: - parsed_bash_script = parse_file(source_path, macaron_path=macaron_path) - except ParseError as error: - logger.debug(error) - bash_node = BashNode( - name, - node_type, - source_path, - parsed_step_obj=ci_step_ast, - parsed_bash_obj=parsed_bash_script, - node_id=node_id, - caller=caller, - ) - caller_commands = parsed_bash_script.get("commands", []) - - # Parse the bash script files called from the current script. - if caller_commands and repo_path: - for cmd in caller_commands: - # Parse the scripts that end with `.sh`. - # TODO: parse Makefiles for bash commands. - if not cmd or not cmd[0] or not cmd[0].endswith(".sh"): - continue - - # Check for path traversal patterns before analyzing a bash file. - bash_file_path = os.path.realpath(os.path.join(repo_path, working_dir or "", cmd[0])) - if os.path.exists(bash_file_path) and bash_file_path.startswith(repo_path): - try: - callee = create_bash_node( - name=cmd[0], - node_id=node_id, - node_type=BashScriptType.FILE, - source_path=bash_file_path, - ci_step_ast=None, - repo_path=repo_path, - caller=bash_node, - recursion_depth=recursion_depth + 1, - macaron_path=macaron_path, - ) - except CallGraphError as error: - raise error - bash_node.add_callee(callee) - return bash_node + if not macaron_path: + macaron_path = global_config.macaron_path + cmd = [ + os.path.join(macaron_path, "bin", "bashexprparser"), + "-input", + bash_expr_content, + ] + try: + result = subprocess.run( # nosec B603 + cmd, + capture_output=True, + check=True, + cwd=macaron_path, + timeout=defaults.getint("bashparser", "timeout", fallback=30), + ) + except ( + subprocess.CalledProcessError, + subprocess.TimeoutExpired, + FileNotFoundError, + ) as error: + raise ParseError("Error while parsing bash expr.") from error + + try: + if result.returncode == 0: + return cast(list[Word], json.loads(result.stdout.decode("utf-8"))) + + raise ParseError(f"Bash script parser failed: {result.stderr.decode('utf-8')}") + + except json.JSONDecodeError as error: + raise ParseError("Error while loading the parsed bash script.") from error diff --git a/src/macaron/parsers/bashparser_model.py b/src/macaron/parsers/bashparser_model.py new file mode 100644 index 000000000..09ca83813 --- /dev/null +++ b/src/macaron/parsers/bashparser_model.py @@ -0,0 +1,848 @@ +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""Type definitions for Bash AST as produced (and json-serialised) by the "mvdan.cc/sh/v3/syntax" bash parser.""" + +# Suppress warnings about non-ALL_CAPS names (that reference the original names and so are not ours to change). 
+# pylint: disable=invalid-name + +# Suppress false positive warnings caused by a field with the same name as its type (again, not our names to change) +# pylint: disable=used-before-assignment + +# Refer to original definitions in codebase for description, no need to comment here. +# pylint: disable=missing-class-docstring,missing-function-docstring + +# Forward-referencing union type defs are not expressible with "|" until python 3.12 type statement syntax is available. +# pylint: disable=consider-alternative-union-syntax + +# Suppress all flake8 warnings for the same reasons as above (it does not allow selective disabling at file level). +# flake8: noqa + +from __future__ import annotations + +from enum import Enum +from typing import Literal, NotRequired, TypedDict, TypeGuard, Union + + +class Pos(TypedDict): + Offset: int + Line: int + Col: int + + +class Comment(TypedDict): + Hash: Pos + Text: str + + +WordPart = Union[ + "Lit", "SglQuoted", "DblQuoted", "ParamExp", "CmdSubst", "ArithmExp", "ProcSubst", "ExtGlob", "BraceExp" +] + +ArithmExpr = Union["BinaryArithm", "UnaryArithm", "ParenArithm", "Word"] + +UnAritOperator = int + + +class UnAritOperators(Enum): + Not = 34 # ! + BitNegation = 35 # ~ + Inc = 36 # ++ + Dec = 37 # -- + Plus = 68 # + + Minus = 70 # - + + +class UnaryArithm(TypedDict): + Type: Literal["UnaryArithm"] + Pos: Pos + End: Pos + OpPos: Pos + Op: UnAritOperator + Post: NotRequired[bool] + X: ArithmExpr + + +def is_unary_arithm(expr: ArithmExpr) -> TypeGuard[UnaryArithm]: + return expr.get("Type", "") == "UnaryArithm" + + +BinAritOperator = int + + +class BinAritOperators(Enum): + Add = 68 # + + Sub = 70 # - + Mul = 38 # * + Quo = 85 # / + Rem = 76 # % + Pow = 39 # ** + Eql = 40 # == + Gtr = 54 # > + Lss = 56 # < + Neq = 41 # != + Leq = 42 # <= + Geq = 43 # >= + And = 9 # & + Or = 12 # | + Xor = 80 # ^ + Shr = 55 # >> + Shl = 61 # << + + AndArit = 10 # && + OrArit = 11 # || + Comma = 82 # , + TernQuest = 72 # ? 
+ TernColon = 87 # : + + Assgn = 74 # = + AddAssgn = 44 # += + SubAssgn = 45 # -= + MulAssgn = 46 # *= + QuoAssgn = 47 # /= + RemAssgn = 48 # %= + AndAssgn = 49 # &= + OrAssgn = 50 # |= + XorAssgn = 51 # ^= + ShlAssgn = 52 # <<= + ShrAssgn = 53 # >>= + + +class BinaryArithm(TypedDict): + Type: Literal["BinaryArithm"] + Pos: Pos + End: Pos + OpPos: Pos + Op: BinAritOperator + X: ArithmExpr + Y: ArithmExpr + + +def is_binary_arithm(expr: ArithmExpr) -> TypeGuard[BinaryArithm]: + return expr.get("Type", "") == "BinaryArithm" + + +class ParenArithm(TypedDict): + Type: Literal["ParenArithm"] + Pos: Pos + End: Pos + Lparen: Pos + Rparen: Pos + X: ArithmExpr + + +def is_paren_arithm(expr: ArithmExpr) -> TypeGuard[ParenArithm]: + return expr.get("Type", "") == "ParenArithm" + + +def is_word_arithm(expr: ArithmExpr) -> TypeGuard[Word]: + return "Type" not in expr + + +class Lit(TypedDict): + Type: Literal["Lit"] + Pos: Pos + End: Pos + ValuePos: Pos + ValueEnd: Pos + Value: str + + +def is_lit(part: WordPart) -> TypeGuard[Lit]: + return part["Type"] == "Lit" + + +class SglQuoted(TypedDict): + Type: Literal["SglQuoted"] + Pos: Pos + End: Pos + Left: Pos + Right: Pos + Dollar: NotRequired[bool] + Value: str + + +def is_sgl_quoted(part: WordPart) -> TypeGuard[SglQuoted]: + return part["Type"] == "SglQuoted" + + +class DblQuoted(TypedDict): + Type: Literal["DblQuoted"] + Pos: Pos + End: Pos + Left: Pos + Right: Pos + Dollar: NotRequired[bool] + Parts: NotRequired[list[WordPart]] + + +def is_dbl_quoted(part: WordPart) -> TypeGuard[DblQuoted]: + return part["Type"] == "DblQuoted" + + +class Slice(TypedDict): + Offset: ArithmExpr + Length: ArithmExpr + + +class Replace(TypedDict): + All: NotRequired[bool] + Orig: Word + With: Word + + +ParNamesOperator = int + + +class ParNamesOperators(Enum): + NamesPrefix = 38 # * + NamesPrefixWords = 84 # @ + + +ParExpOperator = int + + +class ParExpOperators(Enum): + AlternateUnset = 68 # + + AlternateUnsetOrNull = 69 # :+ + DefaultUnset = 70 # - + DefaultUnsetOrNull = 71 # :- + ErrorUnset = 72 # ? + ErrorUnsetOrNull = 73 # :? 
+    AssignUnset = 74  # =
+    AssignUnsetOrNull = 75  # :=
+    RemSmallSuffix = 76  # %
+    RemLargeSuffix = 77  # %%
+    RemSmallPrefix = 78  # #
+    RemLargePrefix = 79  # ##
+    UpperFirst = 80  # ^
+    UpperAll = 81  # ^^
+    LowerFirst = 82  # ,
+    LowerAll = 83  # ,,
+    OtherParamOps = 84  # @
+
+
+class Expansion(TypedDict):
+    Op: ParExpOperator
+    Word: Word
+
+
+class ParamExp(TypedDict):
+    Type: Literal["ParamExp"]
+    Pos: Pos
+    End: Pos
+    Dollar: NotRequired[Pos]
+    Rbrace: NotRequired[Pos]
+    Short: NotRequired[bool]
+    Excl: NotRequired[bool]
+    Length: NotRequired[bool]
+    Width: NotRequired[bool]
+    Param: Lit
+    Index: NotRequired[ArithmExpr]
+    Slice: NotRequired[Slice]
+    Repl: NotRequired[Replace]
+    Names: NotRequired[ParNamesOperator]
+    Exp: NotRequired[Expansion]
+
+
+def is_param_exp(part: WordPart) -> TypeGuard[ParamExp]:
+    return part["Type"] == "ParamExp"
+
+
+class CmdSubst(TypedDict):
+    Type: Literal["CmdSubst"]
+    Pos: Pos
+    End: Pos
+    Left: Pos
+    Right: Pos
+    Stmts: list[Stmt]
+    Last: NotRequired[list[Comment]]
+    Backquotes: NotRequired[bool]
+    TempFile: NotRequired[bool]
+    ReplyVar: NotRequired[bool]
+
+
+def is_cmd_subst(part: WordPart) -> TypeGuard[CmdSubst]:
+    return part["Type"] == "CmdSubst"
+
+
+class ArithmExp(TypedDict):
+    Type: Literal["ArithmExp"]
+    Pos: Pos
+    End: Pos
+    Left: Pos
+    Right: Pos
+    Bracket: NotRequired[bool]
+    Unsigned: NotRequired[bool]
+    X: ArithmExpr
+
+
+def is_arithm_exp(part: WordPart) -> TypeGuard[ArithmExp]:
+    return part["Type"] == "ArithmExp"
+
+
+ProcOperator = int
+
+
+class ProcOperators(Enum):
+    CmdIn = 66  # <(
+    CmdOut = 67  # >(
+
+
+class ProcSubst(TypedDict):
+    Type: Literal["ProcSubst"]
+    Pos: Pos
+    End: Pos
+    OpPos: Pos
+    Rparen: Pos
+    Op: ProcOperator
+    Stmts: list[Stmt]
+    Last: NotRequired[list[Comment]]
+
+
+def is_proc_subst(part: WordPart) -> TypeGuard[ProcSubst]:
+    return part["Type"] == "ProcSubst"
+
+
+GlobOperator = int
+
+
+class GlobOperators(Enum):
+    GlobZeroOrOne = 122  # ?(
+    GlobZeroOrMore = 123  # *(
+    GlobOneOrMore = 124  # +(
+    GlobOne = 125  # @(
+    GlobExcept = 126  # !(
+
+
+class ExtGlob(TypedDict):
+    Type: Literal["ExtGlob"]
+    Pos: Pos
+    End: Pos
+    OpPos: Pos
+    Op: GlobOperator
+    Pattern: Lit
+
+
+def is_ext_glob(part: WordPart) -> TypeGuard[ExtGlob]:
+    return part["Type"] == "ExtGlob"
+
+
+class BraceExp(TypedDict):
+    Type: Literal["BraceExp"]
+    Pos: Pos
+    End: Pos
+    Sequence: NotRequired[bool]
+    Elems: list[Word]
+
+
+def is_brace_exp(part: WordPart) -> TypeGuard[BraceExp]:
+    return part["Type"] == "BraceExp"
+
+
+class Word(TypedDict):
+    Parts: list[WordPart]
+
+
+RedirOperator = int
+
+
+class RedirOperators(Enum):
+    RdrOut = 54  # >
+    AppOut = 55  # >>
+    RdrIn = 56  # <
+    RdrInOut = 57  # <>
+    DplIn = 58  # <&
+    DplOut = 59  # >&
+    ClbOut = 60  # >|
+    Hdoc = 61  # <<
+    DashHdoc = 62  # <<-
+    WordHdoc = 63  # <<<
+    RdrAll = 64  # &>
+    AppAll = 65  # &>>
+
+
+class Redirect(TypedDict):
+    Pos: Pos
+    End: Pos
+    OpPos: Pos
+    Op: RedirOperator
+    N: NotRequired[Lit]
+    Word: NotRequired[Word]
+    Hdoc: NotRequired[Word]
+
+
+class ArrayElem(TypedDict):
+    Pos: Pos
+    End: Pos
+    Index: NotRequired[ArithmExpr]
+    Value: NotRequired[Word]
+    Comments: NotRequired[list[Comment]]
+
+
+class ArrayExpr(TypedDict):
+    Pos: Pos
+    End: Pos
+    Lparen: Pos
+    Rparen: Pos
+    Elems: list[ArrayElem]
+    Last: NotRequired[list[Comment]]
+
+
+class Assign(TypedDict):
+    Pos: Pos
+    End: Pos
+    Append: NotRequired[bool]
+    Naked: NotRequired[bool]
+    Name: Lit
+    Index: NotRequired[ArithmExpr]
+    Value: NotRequired[Word]
+    Array: NotRequired[ArrayExpr]
+
+
+Command = Union[
+    "CallExpr",
"IfClause", + "WhileClause", + "ForClause", + "CaseClause", + "Block", + "Subshell", + "BinaryCmd", + "FuncDecl", + "ArithmCmd", + "TestClause", + "DeclClause", + "LetClause", + "TimeClause", + "CoprocClause", + "TestDecl", +] + + +class CallExpr(TypedDict): + Type: Literal["CallExpr"] + Pos: Pos + End: Pos + Assigns: NotRequired[list[Assign]] + Args: NotRequired[list[Word]] + + +def is_call_expr(cmd: Command) -> TypeGuard[CallExpr]: + return cmd["Type"] == "CallExpr" + + +class IfClause(TypedDict): + Type: Literal["IfClause"] + Pos: Pos + End: Pos + Position: Pos + ThenPos: NotRequired[Pos] + FiPos: NotRequired[Pos] + Cond: list[Stmt] + CondLast: NotRequired[list[Comment]] + Then: list[Stmt] + ThenLast: NotRequired[list[Comment]] + Else: NotRequired[IfClause | ElseClause] + Last: NotRequired[list[Comment]] + + +def is_if_clause(cmd: Command) -> TypeGuard[IfClause]: + return cmd["Type"] == "IfClause" + + +class ElseClause(TypedDict): + Pos: Pos + End: Pos + Position: Pos + FiPos: NotRequired[Pos] + Then: list[Stmt] + ThenLast: NotRequired[list[Comment]] + Last: NotRequired[list[Comment]] + + +def is_else_clause(clause: IfClause | ElseClause) -> TypeGuard[ElseClause]: + return "Type" not in clause + + +class WhileClause(TypedDict): + Type: Literal["WhileClause"] + Pos: Pos + End: Pos + WhilePos: Pos + DoPos: Pos + DonePos: Pos + Cond: list[Stmt] + CondLast: NotRequired[list[Comment]] + Do: list[Stmt] + DoLast: NotRequired[list[Comment]] + + +def is_while_clause(cmd: Command) -> TypeGuard[WhileClause]: + return cmd["Type"] == "WhileClause" + + +Loop = Union["WordIter", "CStyleLoop"] + + +class WordIter(TypedDict): + Type: Literal["WordIter"] + Pos: Pos + End: Pos + Name: Lit + InPos: Pos + Items: list[Word] + + +def is_word_iter(loop: Loop) -> TypeGuard[WordIter]: + return loop["Type"] == "WordIter" + + +class CStyleLoop(TypedDict): + Type: Literal["CStyleLoop"] + Pos: Pos + End: Pos + Lparen: Pos + Rparen: Pos + Init: NotRequired[ArithmExpr] + Cond: NotRequired[ArithmExpr] + Post: NotRequired[ArithmExpr] + + +def is_cstyle_loop(loop: Loop) -> TypeGuard[CStyleLoop]: + return loop["Type"] == "CStyleLoop" + + +class ForClause(TypedDict): + Type: Literal["ForClause"] + Pos: Pos + End: Pos + ForPos: Pos + DoPos: Pos + DonePos: Pos + Select: NotRequired[bool] + Braces: NotRequired[bool] + Loop: Loop + Do: list[Stmt] + DoLast: NotRequired[list[Comment]] + + +def is_for_clause(cmd: Command) -> TypeGuard[ForClause]: + return cmd["Type"] == "ForClause" + + +CaseOperator = int + + +class CaseOperators(Enum): + Break = 30 # ;; + Fallthrough = 31 # ;& + Resume = 32 # ;;& + ResumeKorn = 33 # ;| + + +class CaseItem(TypedDict): + Pos: Pos + End: Pos + Op: CaseOperator + OpPos: Pos + Comments: NotRequired[list[Comment]] + Patterns: list[Word] + Stmts: list[Stmt] + Last: NotRequired[list[Comment]] + + +class CaseClause(TypedDict): + Type: Literal["CaseClause"] + Pos: Pos + End: Pos + Case: Pos + In: Pos + Esac: Pos + Braces: NotRequired[bool] + Word: Word + Items: list[CaseItem] + Last: NotRequired[list[Comment]] + + +def is_case_clause(cmd: Command) -> TypeGuard[CaseClause]: + return cmd["Type"] == "CaseClause" + + +class Block(TypedDict): + Type: Literal["Block"] + Pos: Pos + End: Pos + Lbrace: Pos + Rbrace: Pos + Stmts: list[Stmt] + Last: NotRequired[list[Comment]] + + +def is_block(cmd: Command) -> TypeGuard[Block]: + return cmd["Type"] == "Block" + + +class Subshell(TypedDict): + Type: Literal["Subshell"] + Pos: Pos + End: Pos + Lparen: Pos + Rparen: Pos + Stmts: list[Stmt] + Last: 
NotRequired[list[Comment]] + + +def is_subshell(cmd: Command) -> TypeGuard[Subshell]: + return cmd["Type"] == "Subshell" + + +BinCmdOperator = int + + +class BinCmdOperators(Enum): + AndStmt = 10 # && + OrStmt = 11 # || + Pipe = 12 # | + PipeAll = 13 # |& + + +class BinaryCmd(TypedDict): + Type: Literal["BinaryCmd"] + Pos: Pos + End: Pos + OpPos: Pos + Op: BinCmdOperator + X: Stmt + Y: Stmt + + +def is_binary_cmd(cmd: Command) -> TypeGuard[BinaryCmd]: + return cmd["Type"] == "BinaryCmd" + + +class FuncDecl(TypedDict): + Type: Literal["FuncDecl"] + Pos: Pos + End: Pos + Position: Pos + RsrvWord: NotRequired[bool] + Parens: NotRequired[bool] + Name: Lit + Body: Stmt + + +def is_func_decl(cmd: Command) -> TypeGuard[FuncDecl]: + return cmd["Type"] == "FuncDecl" + + +class ArithmCmd(TypedDict): + Type: Literal["ArithmCmd"] + Pos: Pos + End: Pos + Left: Pos + Right: Pos + Unsigned: NotRequired[bool] + X: ArithmExpr + + +def is_arithm_cmd(cmd: Command) -> TypeGuard[ArithmCmd]: + return cmd["Type"] == "ArithmCmd" + + +TestExpr = Union["BinaryTest", "UnaryTest", "ParenTest", "Word"] + +BinTestOperator = int + + +class BinTestOperators(Enum): + TsReMatch = 112 # =~ + TsNewer = 113 # -nt + TsOlder = 114 # -ot + TsDevIno = 115 # -ef + TsEql = 116 # -eq + TsNeq = 117 # -ne + TsLeq = 118 # -le + TsGeq = 119 # -ge + TsLss = 120 # -lt + TsGtr = 121 # -gt + AndTest = 10 # && + OrTest = 11 # || + TsMatchShort = 74 # = + TsMatch = 40 # == + TsNoMatch = 41 # != + TsBefore = 56 # < + TsAfter = 54 # > + + +class BinaryTest(TypedDict): + Type: Literal["BinaryTest"] + Pos: Pos + End: Pos + OpPos: Pos + Op: BinTestOperator + X: TestExpr + Y: TestExpr + + +def is_binary_test(test_expr: TestExpr) -> TypeGuard[BinaryTest]: + return test_expr.get("Type", "") == "BinaryTest" + + +UnTestOperator = int + + +class UnTestOperators(Enum): + TsExists = 88 # -e + TsRegFile = 89 # -f + TsDirect = 90 # -d + TsCharSp = 91 # -c + TsBlckSp = 92 # -b + TsNmPipe = 93 # -p + TsSocket = 94 # -S + TsSmbLink = 95 # -L + TsSticky = 96 # -k + TsGIDSet = 97 # -g + TsUIDSet = 98 # -u + TsGrpOwn = 99 # -G + TsUsrOwn = 100 # -O + TsModif = 101 # -N + TsRead = 102 # -r + TsWrite = 103 # -w + TsExec = 104 # -x + TsNoEmpty = 105 # -s + TsFdTerm = 106 # -t + TsEmpStr = 107 # -z + TsNempStr = 108 # -n + TsOptSet = 109 # -o + TsVarSet = 110 # -v + TsRefVar = 111 # -R + TsNot = 34 # ! 
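+
+# Illustrative sketch (not part of the model): the TypeGuard helpers in this
+# module are intended for narrowing these union types while walking a parsed
+# AST, e.g. (assuming a Stmt obtained from bashparser.parse_raw):
+#
+#     def first_arg_literal(stmt: Stmt) -> str | None:
+#         cmd = stmt["Cmd"]
+#         if is_call_expr(cmd):
+#             for word in cmd.get("Args", []):
+#                 for part in word["Parts"]:
+#                     if is_lit(part):
+#                         return part["Value"]
+#         return None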
+ + +class UnaryTest(TypedDict): + Type: Literal["UnaryTest"] + Pos: Pos + End: Pos + OpPos: Pos + Op: UnTestOperator + X: TestExpr + + +def is_unary_test(test_expr: TestExpr) -> TypeGuard[UnaryTest]: + return test_expr.get("Type", "") == "UnaryTest" + + +class ParenTest(TypedDict): + Type: Literal["ParenTest"] + Pos: Pos + End: Pos + Lparen: Pos + Rparen: Pos + X: TestExpr + + +def is_paren_test(test_expr: TestExpr) -> TypeGuard[ParenTest]: + return test_expr.get("Type", "") == "ParenTest" + + +def is_word_test(test_expr: TestExpr) -> TypeGuard[Word]: + return "Type" not in test_expr + + +class TestClause(TypedDict): + Type: Literal["TestClause"] + Pos: Pos + End: Pos + Left: Pos + Right: Pos + X: TestExpr + + +def is_test_clause(cmd: Command) -> TypeGuard[TestClause]: + return cmd["Type"] == "TestClause" + + +class DeclClause(TypedDict): + Type: Literal["DeclClause"] + Pos: Pos + End: Pos + Variant: Lit + Args: list[Assign] + + +def is_decl_clause(cmd: Command) -> TypeGuard[DeclClause]: + return cmd["Type"] == "DeclClause" + + +class LetClause(TypedDict): + Type: Literal["LetClause"] + Pos: Pos + End: Pos + Let: Pos + Exprs: list[ArithmExpr] + + +def is_let_clause(cmd: Command) -> TypeGuard[LetClause]: + return cmd["Type"] == "LetClause" + + +class TimeClause(TypedDict): + Type: Literal["TimeClause"] + Pos: Pos + End: Pos + Time: Pos + PosixFormat: NotRequired[bool] + Stmt: Stmt + + +def is_time_clause(cmd: Command) -> TypeGuard[TimeClause]: + return cmd["Type"] == "TimeClause" + + +class CoprocClause(TypedDict): + Type: Literal["CoprocClause"] + Pos: Pos + End: Pos + Coproc: Pos + Name: Word + Stmt: Stmt + + +def is_coproc_clause(cmd: Command) -> TypeGuard[CoprocClause]: + return cmd["Type"] == "CoprocClause" + + +class TestDecl(TypedDict): + Type: Literal["TestDecl"] + Pos: Pos + End: Pos + Position: Pos + Description: Word + Body: Stmt + + +def is_test_decl(cmd: Command) -> TypeGuard[TestDecl]: + return cmd["Type"] == "TestDecl" + + +class Stmt(TypedDict): + Comments: NotRequired[list[Comment]] + Cmd: Command + Pos: Pos + End: Pos + Position: Pos + Semicolon: NotRequired[Pos] + Negated: NotRequired[bool] + Background: NotRequired[bool] + Coprocess: NotRequired[bool] + Redirs: NotRequired[list[Redirect]] + + +class File(TypedDict): + Type: Literal["File"] + Name: NotRequired[str] + Pos: Pos + End: Pos + Stmts: list[Stmt] + Last: NotRequired[list[Comment]] diff --git a/src/macaron/slsa_analyzer/build_tool/base_build_tool.py b/src/macaron/slsa_analyzer/build_tool/base_build_tool.py index 48ddb8e52..d6f7f9d99 100644 --- a/src/macaron/slsa_analyzer/build_tool/base_build_tool.py +++ b/src/macaron/slsa_analyzer/build_tool/base_build_tool.py @@ -3,6 +3,8 @@ """This module contains the BaseBuildTool class to be inherited by other specific Build Tools.""" +from __future__ import annotations + import glob import itertools import json @@ -14,14 +16,16 @@ from dataclasses import dataclass from enum import Enum from pathlib import Path -from typing import TypedDict +from typing import TYPE_CHECKING, TypedDict -from macaron.code_analyzer.call_graph import BaseNode from macaron.config.defaults import defaults from macaron.dependency_analyzer.cyclonedx import DependencyAnalyzer, NoneDependencyAnalyzer from macaron.slsa_analyzer.build_tool.language import BuildLanguage from macaron.slsa_analyzer.checks.check_result import Confidence, Evidence, EvidenceWeightMap +if TYPE_CHECKING: + from macaron.code_analyzer.dataflow_analysis.core import Node + logger: logging.Logger = logging.getLogger(__name__) @@ 
-57,7 +61,7 @@ class BuildToolCommand(TypedDict): ci_path: str #: The CI step object that calls the command. - step_node: BaseNode | None + step_node: Node | None #: The list of name of reachable variables that contain secrets.""" reachable_secrets: list[str] diff --git a/src/macaron/slsa_analyzer/checks/build_as_code_check.py b/src/macaron/slsa_analyzer/checks/build_as_code_check.py index fd1260474..bf3693a78 100644 --- a/src/macaron/slsa_analyzer/checks/build_as_code_check.py +++ b/src/macaron/slsa_analyzer/checks/build_as_code_check.py @@ -5,27 +5,26 @@ import logging import os -from typing import cast from sqlalchemy import ForeignKey from sqlalchemy.orm import Mapped, mapped_column from sqlalchemy.sql.sqltypes import String +from macaron.code_analyzer.dataflow_analysis.analysis import get_build_tool_commands, get_containing_github_job +from macaron.code_analyzer.dataflow_analysis.core import traverse_bfs +from macaron.code_analyzer.dataflow_analysis.github import ( + GitHubActionsActionStepNode, + GitHubActionsReusableWorkflowCallNode, + GitHubActionsRunStepNode, +) from macaron.database.table_definitions import CheckFacts from macaron.errors import CallGraphError, ProvenanceError -from macaron.parsers.bashparser import BashNode -from macaron.parsers.github_workflow_model import ActionStep from macaron.provenance.provenance_extractor import ProvenancePredicate from macaron.slsa_analyzer.analyze_context import AnalyzeContext, store_inferred_build_info_results from macaron.slsa_analyzer.checks.base_check import BaseCheck from macaron.slsa_analyzer.checks.check_result import CheckResultData, CheckResultType, Confidence, JustificationType from macaron.slsa_analyzer.ci_service.base_ci_service import BaseCIService, NoneCIService from macaron.slsa_analyzer.ci_service.circleci import CircleCI -from macaron.slsa_analyzer.ci_service.github_actions.analyzer import ( - GitHubJobNode, - GitHubWorkflowNode, - GitHubWorkflowType, -) from macaron.slsa_analyzer.ci_service.gitlab_ci import GitLabCI from macaron.slsa_analyzer.ci_service.travis import Travis from macaron.slsa_analyzer.registry import registry @@ -147,95 +146,94 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: if isinstance(ci_service, NoneCIService): continue + callgraph = ci_info["callgraph"] + trusted_deploy_actions = tool.ci_deploy_kws["github_actions"] or [] # Check for use of a trusted GitHub Actions workflow to publish/deploy. # TODO: verify that deployment is legitimate and not a test if trusted_deploy_actions: - for callee in ci_info["callgraph"].bfs(): - if isinstance(callee, GitHubWorkflowNode) and callee.node_type in [ - GitHubWorkflowType.EXTERNAL, - GitHubWorkflowType.REUSABLE, - ]: - workflow_name = callee.name.split("@")[0] - - if not workflow_name: - logger.debug("Workflow %s is not relevant. Skipping...", callee.name) - continue - if workflow_name in trusted_deploy_actions: - job_id = None - step_id = None - step_name = None - caller_path = "" - job = callee.caller - - # We always expect the caller of the node that calls a third-party - # or Reusable GitHub Action to be a GitHubJobNode. - if not isinstance(job, GitHubJobNode): - continue - - job_id = job.parsed_obj.id - caller_path = job.source_path - - # Only third-party Actions can be called from a step. - # Reusable workflows have to be directly called from the job. 
- # See https://docs.github.com/en/actions/sharing-automations/ \ - # reusing-workflows#calling-a-reusable-workflow - if callee.node_type == GitHubWorkflowType.EXTERNAL: - callee_step_obj = cast(ActionStep, callee.parsed_obj) - if "id" in callee_step_obj: - step_id = callee_step_obj["id"] - if "name" in callee_step_obj: - step_name = callee_step_obj["name"] - - trigger_link = ci_service.api_client.get_file_link( - ctx.component.repository.full_name, - ctx.component.repository.commit_sha, - file_path=( - ci_service.api_client.get_relative_path_of_workflow( - os.path.basename(caller_path) - ) - if caller_path - else "" - ), - ) + for root in ci_info["callgraph"].root_nodes: + for callee in traverse_bfs(root): + if isinstance(callee, (GitHubActionsReusableWorkflowCallNode, GitHubActionsActionStepNode)): + workflow_name = callee.uses_name + + if workflow_name in trusted_deploy_actions: + job_id = None + step_id = None + step_name = None + caller_path = "" + job = ( + get_containing_github_job(callee, callgraph.parents) + if isinstance(callee, GitHubActionsActionStepNode) + else callee + ) - trusted_workflow_confidence = tool.infer_confidence_deploy_workflow( - ci_path=caller_path, provenance_workflow=prov_workflow - ) - # Store or update the inferred build information if the confidence - # for the current check fact is bigger than the maximum score. - if ( - not result_tables - or trusted_workflow_confidence - > max(result_tables, key=lambda item: item.confidence).confidence - ): - store_inferred_build_info_results( - ctx=ctx, - ci_info=ci_info, - ci_service=ci_service, - trigger_link=trigger_link, - job_id=job_id, - step_id=step_id, - step_name=step_name, - callee_node_type=callee.node_type.value, + if not job: + continue + + job_id = job.job_id + caller_path = job.context.ref.workflow_context.ref.source_filepath + + # Only third-party Actions can be called from a step. + # Reusable workflows have to be directly called from the job. + # See https://docs.github.com/en/actions/sharing-automations/ \ + # reusing-workflows#calling-a-reusable-workflow + if isinstance(callee, GitHubActionsActionStepNode): + callee_node_type = "external" + if "id" in callee.definition: + step_id = callee.definition["id"] + if "name" in callee.definition: + step_name = callee.definition["name"] + else: + callee_node_type = "reusable" + + trigger_link = ci_service.api_client.get_file_link( + ctx.component.repository.full_name, + ctx.component.repository.commit_sha, + file_path=( + ci_service.api_client.get_relative_path_of_workflow( + os.path.basename(caller_path) + ) + if caller_path + else "" + ), ) - result_tables.append( - BuildAsCodeFacts( - build_tool_name=tool.name, - ci_service_name=ci_service.name, - build_trigger=trigger_link, - language=tool.language.value, - deploy_command=workflow_name, - confidence=trusted_workflow_confidence, + + trusted_workflow_confidence = tool.infer_confidence_deploy_workflow( + ci_path=caller_path, provenance_workflow=prov_workflow ) - ) - overall_res = CheckResultType.PASSED - try: - for build_command in ci_service.get_build_tool_commands( - callgraph=ci_info["callgraph"], build_tool=tool - ): + # Store or update the inferred build information if the confidence + # for the current check fact is bigger than the maximum score. 
+ if ( + not result_tables + or trusted_workflow_confidence + > max(result_tables, key=lambda item: item.confidence).confidence + ): + store_inferred_build_info_results( + ctx=ctx, + ci_info=ci_info, + ci_service=ci_service, + trigger_link=trigger_link, + job_id=job_id, + step_id=step_id, + step_name=step_name, + callee_node_type=callee_node_type, + ) + result_tables.append( + BuildAsCodeFacts( + build_tool_name=tool.name, + ci_service_name=ci_service.name, + build_trigger=trigger_link, + language=tool.language.value, + deploy_command=workflow_name, + confidence=trusted_workflow_confidence, + ) + ) + overall_res = CheckResultType.PASSED + try: + for build_command in get_build_tool_commands(nodes=callgraph, build_tool=tool): # Yes or no with a confidence score. result, confidence = tool.is_deploy_command( build_command, @@ -256,23 +254,27 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: not result_tables or confidence > max(result_tables, key=lambda item: item.confidence).confidence ): + job_id = None + step_id = None + step_name = None + step_node = build_command["step_node"] + if step_node: + job_node = get_containing_github_job(step_node, callgraph.parents) + if job_node is not None: + job_id = job_node.job_id + + if isinstance(step_node, GitHubActionsRunStepNode): + step_id = step_node.definition.get("id") + step_name = step_node.definition.get("name") + store_inferred_build_info_results( ctx=ctx, ci_info=ci_info, ci_service=ci_service, trigger_link=trigger_link, - job_id=( - build_command["step_node"].caller.name - if build_command["step_node"] - and isinstance(build_command["step_node"].caller, GitHubJobNode) - else None - ), - step_id=build_command["step_node"].node_id if build_command["step_node"] else None, - step_name=( - build_command["step_node"].name - if isinstance(build_command["step_node"], BashNode) - else None - ), + job_id=job_id, + step_id=step_id, + step_name=step_name, ) result_tables.append( BuildAsCodeFacts( diff --git a/src/macaron/slsa_analyzer/checks/build_script_check.py b/src/macaron/slsa_analyzer/checks/build_script_check.py index ccd61cca1..76374eed1 100644 --- a/src/macaron/slsa_analyzer/checks/build_script_check.py +++ b/src/macaron/slsa_analyzer/checks/build_script_check.py @@ -10,6 +10,7 @@ from sqlalchemy.orm import Mapped, mapped_column from sqlalchemy.sql.sqltypes import String +from macaron.code_analyzer.dataflow_analysis.analysis import get_build_tool_commands from macaron.database.table_definitions import CheckFacts from macaron.errors import CallGraphError from macaron.slsa_analyzer.analyze_context import AnalyzeContext @@ -114,9 +115,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: if isinstance(ci_service, NoneCIService): continue try: - for build_command in ci_service.get_build_tool_commands( - callgraph=ci_info["callgraph"], build_tool=tool - ): + for build_command in get_build_tool_commands(ci_info["callgraph"], tool): trigger_link = ci_service.api_client.get_file_link( ctx.component.repository.full_name, ctx.component.repository.commit_sha, diff --git a/src/macaron/slsa_analyzer/checks/build_service_check.py b/src/macaron/slsa_analyzer/checks/build_service_check.py index cea689a7c..f2439d55a 100644 --- a/src/macaron/slsa_analyzer/checks/build_service_check.py +++ b/src/macaron/slsa_analyzer/checks/build_service_check.py @@ -10,6 +10,7 @@ from sqlalchemy.orm import Mapped, mapped_column from sqlalchemy.sql.sqltypes import String +from macaron.code_analyzer.dataflow_analysis.analysis import get_build_tool_commands 
from macaron.database.table_definitions import CheckFacts from macaron.errors import CallGraphError from macaron.slsa_analyzer.analyze_context import AnalyzeContext, store_inferred_build_info_results @@ -118,9 +119,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: continue try: - for build_command in ci_service.get_build_tool_commands( - callgraph=ci_info["callgraph"], build_tool=tool - ): + for build_command in get_build_tool_commands(nodes=ci_info["callgraph"], build_tool=tool): # Yes or no with a confidence score. result, confidence = tool.is_package_command( build_command, ci_service.get_third_party_configurations() diff --git a/src/macaron/slsa_analyzer/checks/github_actions_vulnerability_check.py b/src/macaron/slsa_analyzer/checks/github_actions_vulnerability_check.py index 967946bf1..48c6d445e 100644 --- a/src/macaron/slsa_analyzer/checks/github_actions_vulnerability_check.py +++ b/src/macaron/slsa_analyzer/checks/github_actions_vulnerability_check.py @@ -9,6 +9,12 @@ from sqlalchemy import ForeignKey, String from sqlalchemy.orm import Mapped, mapped_column +from macaron.code_analyzer.dataflow_analysis.analysis import get_containing_github_job +from macaron.code_analyzer.dataflow_analysis.core import traverse_bfs +from macaron.code_analyzer.dataflow_analysis.github import ( + GitHubActionsActionStepNode, + GitHubActionsReusableWorkflowCallNode, +) from macaron.database.db_custom_types import DBJsonList from macaron.database.table_definitions import CheckFacts from macaron.errors import APIAccessError @@ -16,7 +22,6 @@ from macaron.slsa_analyzer.analyze_context import AnalyzeContext from macaron.slsa_analyzer.checks.base_check import BaseCheck, CheckResultType from macaron.slsa_analyzer.checks.check_result import CheckResultData, Confidence, JustificationType -from macaron.slsa_analyzer.ci_service.github_actions.analyzer import GitHubWorkflowNode, GitHubWorkflowType from macaron.slsa_analyzer.package_registry.osv_dev import OSVDevService from macaron.slsa_analyzer.registry import registry from macaron.slsa_analyzer.slsa_req import ReqName @@ -87,47 +92,47 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: external_workflows: dict[str, list] = {} for ci_info in ci_services: - for callee in ci_info["callgraph"].bfs(): - if isinstance(callee, GitHubWorkflowNode) and callee.node_type in { - GitHubWorkflowType.EXTERNAL, - GitHubWorkflowType.REUSABLE, - }: - workflow_name = workflow_version = "" - if "@" in callee.name: - workflow_name, workflow_version = callee.name.split("@") - else: - # Most likely we have encountered an internal reusable workflow, which - # can be skipped. - logger.debug("GitHub Actions workflow %s misses a version. Skipping...", callee.name) - continue - - caller_path = callee.caller.source_path if callee.caller else None - - # Skip the workflow if `workflow_name` or `workflow_version` are missing, - # or if `callee.name` lacks an '@' which can indicate an internal workflow - # within the same repo . - if not workflow_name or not workflow_version: - logger.debug("Workflow %s is not relevant. 
Skipping...", callee.name)
-                    continue
-
-                ext_workflow: list = external_workflows.get(workflow_name, [])
-                ext_workflow.append(
-                    {
-                        "version": workflow_version,
-                        "caller_path": ci_info["service"].api_client.get_file_link(
-                            ctx.component.repository.full_name,
-                            ctx.component.repository.commit_sha,
-                            file_path=(
-                                ci_info["service"].api_client.get_relative_path_of_workflow(
-                                    os.path.basename(caller_path)
-                                )
-                                if caller_path
-                                else ""
-                            ),
-                        ),
-                    }
-                )
-                external_workflows[workflow_name] = ext_workflow
+            callgraph = ci_info["callgraph"]
+            for root in callgraph.root_nodes:
+                for callee in traverse_bfs(root):
+                    if isinstance(callee, (GitHubActionsReusableWorkflowCallNode, GitHubActionsActionStepNode)):
+                        workflow_name = callee.uses_name
+                        workflow_version = callee.uses_version
+                        if workflow_version is None:
+                            # Most likely we have encountered an internal reusable workflow, which
+                            # can be skipped.
+                            logger.debug("GitHub Actions workflow %s misses a version. Skipping...", workflow_name)
+                            continue
+
+                        job = (
+                            get_containing_github_job(callee, callgraph.parents)
+                            if isinstance(callee, GitHubActionsActionStepNode)
+                            else callee
+                        )
+
+                        if not job:
+                            continue
+
+                        caller_path = job.context.ref.workflow_context.ref.source_filepath
+
+                        ext_workflow: list = external_workflows.get(workflow_name, [])
+                        ext_workflow.append(
+                            {
+                                "version": workflow_version,
+                                "caller_path": ci_info["service"].api_client.get_file_link(
+                                    ctx.component.repository.full_name,
+                                    ctx.component.repository.commit_sha,
+                                    file_path=(
+                                        ci_info["service"].api_client.get_relative_path_of_workflow(
+                                            os.path.basename(caller_path)
+                                        )
+                                        if caller_path
+                                        else ""
+                                    ),
+                                ),
+                            }
+                        )
+                        external_workflows[workflow_name] = ext_workflow
 
         # If no external GitHub Actions are found, return passed result.
         if not external_workflows:
diff --git a/src/macaron/slsa_analyzer/checks/trusted_builder_l3_check.py b/src/macaron/slsa_analyzer/checks/trusted_builder_l3_check.py
index e9f629447..f6ef41014 100644
--- a/src/macaron/slsa_analyzer/checks/trusted_builder_l3_check.py
+++ b/src/macaron/slsa_analyzer/checks/trusted_builder_l3_check.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
 
@@ -11,16 +11,16 @@ from sqlalchemy.orm import Mapped, mapped_column from sqlalchemy.sql.sqltypes import String +from macaron.code_analyzer.dataflow_analysis.core import traverse_bfs +from macaron.code_analyzer.dataflow_analysis.github import ( + GitHubActionsActionStepNode, + GitHubActionsReusableWorkflowCallNode, +) from macaron.config.defaults import defaults from macaron.database.table_definitions import CheckFacts from macaron.slsa_analyzer.analyze_context import AnalyzeContext, store_inferred_build_info_results from macaron.slsa_analyzer.checks.base_check import BaseCheck from macaron.slsa_analyzer.checks.check_result import CheckResultData, CheckResultType, Confidence, JustificationType -from macaron.slsa_analyzer.ci_service.github_actions.analyzer import ( - GitHubJobNode, - GitHubWorkflowNode, - GitHubWorkflowType, -) from macaron.slsa_analyzer.ci_service.github_actions.github_actions_ci import GitHubActions from macaron.slsa_analyzer.registry import registry from macaron.slsa_analyzer.slsa_req import ReqName @@ -114,37 +114,36 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: trusted_builders = defaults.get_list("ci.github_actions", "trusted_builders", fallback=[]) # Look for trusted builders called as GitHub Actions. - for callee in ci_info["callgraph"].bfs(): - if isinstance(callee, GitHubWorkflowNode): - workflow_name = callee.name.split("@")[0] - - # Check if the action is called as a third-party or reusable workflow. - if not workflow_name or callee.node_type not in [ - GitHubWorkflowType.EXTERNAL, - GitHubWorkflowType.REUSABLE, - ]: - logger.debug("Workflow %s is not relevant. Skipping...", callee.name) - continue - if workflow_name in trusted_builders: - caller_path = callee.caller.source_path if isinstance(callee.caller, GitHubJobNode) else "" - caller_link = ci_service.api_client.get_file_link( - ctx.component.repository.full_name, - ctx.component.repository.commit_sha, - ci_service.api_client.get_relative_path_of_workflow(os.path.basename(caller_path)), - ) - - store_inferred_build_info_results( - ctx=ctx, ci_info=ci_info, ci_service=ci_service, trigger_link=caller_link - ) - - found_builder = True - result_values.append( - { - "build_tool_name": callee.name, - "build_trigger": caller_link, - "ci_service_name": ci_service.name, - } - ) + for root in ci_info["callgraph"].root_nodes: + for callee in traverse_bfs(root): + if isinstance(callee, (GitHubActionsReusableWorkflowCallNode, GitHubActionsActionStepNode)): + + workflow_name = callee.uses_name + + if workflow_name in trusted_builders: + if isinstance(callee, GitHubActionsReusableWorkflowCallNode): + caller_path = callee.context.ref.workflow_context.ref.source_filepath + else: + caller_path = callee.context.ref.job_context.ref.workflow_context.ref.source_filepath + + caller_link = ci_service.api_client.get_file_link( + ctx.component.repository.full_name, + ctx.component.repository.commit_sha, + ci_service.api_client.get_relative_path_of_workflow(os.path.basename(caller_path)), + ) + + store_inferred_build_info_results( + ctx=ctx, ci_info=ci_info, ci_service=ci_service, trigger_link=caller_link + ) + + found_builder = True + result_values.append( + { + "build_tool_name": workflow_name, + "build_trigger": caller_link, + "ci_service_name": ci_service.name, + } + ) result_tables = [TrustedBuilderFacts(**result, confidence=Confidence.HIGH) for result in result_values] diff --git a/src/macaron/slsa_analyzer/ci_service/base_ci_service.py b/src/macaron/slsa_analyzer/ci_service/base_ci_service.py index 
adaa3ce95..9df7e8e70 100644 --- a/src/macaron/slsa_analyzer/ci_service/base_ci_service.py +++ b/src/macaron/slsa_analyzer/ci_service/base_ci_service.py @@ -3,15 +3,14 @@ """This module contains the BaseCIService class to be inherited by a CI service.""" +from __future__ import annotations + import logging import os from abc import abstractmethod -from collections.abc import Iterable from datetime import datetime -from macaron.code_analyzer.call_graph import BaseNode, CallGraph -from macaron.errors import CallGraphError -from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool, BuildToolCommand +from macaron.code_analyzer.dataflow_analysis.core import NodeForest from macaron.slsa_analyzer.git_service.api_client import BaseAPIClient from macaron.slsa_analyzer.git_service.base_git_service import BaseGitService @@ -92,7 +91,7 @@ def is_detected( return exists @abstractmethod - def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph: + def build_call_graph(self, repo_path: str, macaron_path: str = "") -> NodeForest: """Build the call Graph for this CI service. Parameters @@ -104,7 +103,7 @@ def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph: Returns ------- - CallGraph : CallGraph + NodeForest The call graph built for the CI. """ raise NotImplementedError @@ -245,31 +244,6 @@ def workflow_run_deleted(self, timestamp: datetime) -> bool: """ return False - def get_build_tool_commands(self, callgraph: CallGraph, build_tool: BaseBuildTool) -> Iterable[BuildToolCommand]: - """ - Traverse the callgraph and find all the reachable build tool commands. - - Parameters - ---------- - callgraph: CallGraph - The callgraph reachable from the CI workflows. - build_tool: BaseBuildTool - The corresponding build tool for which shell commands need to be detected. - - Yields - ------ - BuildToolCommand - The object that contains the build command as well useful contextual information. - - Raises - ------ - CallGraphError - Error raised when an error occurs while traversing the callgraph. - """ - # By default we assume that there is no callgraph available for a CI service. - # Each CI service should override this method if a callgraph is generated for it. - raise CallGraphError("There is no callgraph for this CI service.") - def get_third_party_configurations(self) -> list[str]: """Get the list of third-party CI configuration files. @@ -309,7 +283,7 @@ def load_defaults(self) -> None: def set_api_client(self) -> None: """Set the API client using the personal access token.""" - def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph: + def build_call_graph(self, repo_path: str, macaron_path: str = "") -> NodeForest: """Build the call Graph for this CI service. Parameters @@ -321,33 +295,10 @@ def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph: Returns ------- - CallGraph : CallGraph + NodeForest The call graph built for the CI. """ - return CallGraph(BaseNode(), "") - - def get_build_tool_commands(self, callgraph: CallGraph, build_tool: BaseBuildTool) -> Iterable[BuildToolCommand]: - """ - Traverse the callgraph and find all the reachable build tool commands. - - Parameters - ---------- - callgraph: CallGraph - The callgraph reachable from the CI workflows. - build_tool: BaseBuildTool - The corresponding build tool for which shell commands need to be detected. - - Yields - ------ - BuildToolCommand - The object that contains the build command as well useful contextual information. 
- - Raises - ------ - CallGraphError - Error raised when an error occurs while traversing the callgraph. - """ - raise CallGraphError("There is no callgraph for this CI service.") + return NodeForest([]) def has_latest_run_passed( self, repo_full_name: str, branch_name: str | None, commit_sha: str, commit_date: str, workflow: str diff --git a/src/macaron/slsa_analyzer/ci_service/circleci.py b/src/macaron/slsa_analyzer/ci_service/circleci.py index 1ac05bd86..72a838218 100644 --- a/src/macaron/slsa_analyzer/ci_service/circleci.py +++ b/src/macaron/slsa_analyzer/ci_service/circleci.py @@ -1,10 +1,11 @@ -# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module analyze Circle CI.""" +from __future__ import annotations -from macaron.code_analyzer.call_graph import BaseNode, CallGraph +from macaron.code_analyzer.dataflow_analysis.core import NodeForest from macaron.config.defaults import defaults from macaron.slsa_analyzer.ci_service.base_ci_service import BaseCIService @@ -42,7 +43,7 @@ def load_defaults(self) -> None: def set_api_client(self) -> None: """Set the API client using the personal access token.""" - def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph: + def build_call_graph(self, repo_path: str, macaron_path: str = "") -> NodeForest: """Build the call Graph for this CI service. Parameters @@ -54,10 +55,10 @@ def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph: Returns ------- - CallGraph : CallGraph + NodeForest The call graph built for the CI. """ - return CallGraph(BaseNode(), "") + return NodeForest([]) def has_latest_run_passed( self, repo_full_name: str, branch_name: str | None, commit_sha: str, commit_date: str, workflow: str diff --git a/src/macaron/slsa_analyzer/ci_service/github_actions/analyzer.py b/src/macaron/slsa_analyzer/ci_service/github_actions/analyzer.py deleted file mode 100644 index 3c234d755..000000000 --- a/src/macaron/slsa_analyzer/ci_service/github_actions/analyzer.py +++ /dev/null @@ -1,801 +0,0 @@ -# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. -# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
- -"""This module provides the intermediate representations and analysis functions for GitHub Actions.""" - -import logging -import os -import re -from collections.abc import Iterable -from dataclasses import dataclass -from enum import Enum -from typing import Any, TypeGuard, cast - -from macaron.code_analyzer.call_graph import BaseNode -from macaron.config.global_config import global_config -from macaron.errors import CallGraphError, GitHubActionsValueError, ParseError -from macaron.parsers.actionparser import get_step_input -from macaron.parsers.actionparser import parse as parse_action -from macaron.parsers.bashparser import BashNode, BashScriptType, create_bash_node -from macaron.parsers.github_workflow_model import ( - ActionStep, - Identified, - Job, - NormalJob, - ReusableWorkflowCallJob, - Step, - Workflow, - is_action_step, - is_normal_job, - is_reusable_workflow_call_job, -) -from macaron.slsa_analyzer.build_tool.language import BuildLanguage, Language - -logger: logging.Logger = logging.getLogger(__name__) - - -@dataclass(frozen=True) -class ThirdPartyAction: - """The representation for a third-party GitHub Action.""" - - #: The name of the GitHub Action. - action_name: str - - #: The version of the GitHub Action. - action_version: str | None - - -class GitHubWorkflowType(str, Enum): - """This class represents different GitHub Actions workflow types.""" - - INTERNAL = "internal" # Workflows declared in the repo. - EXTERNAL = "external" # Third-party workflows. - REUSABLE = "reusable" # Reusable workflows. - - -class GitHubWorkflowNode(BaseNode): - """This class represents a callgraph node for GitHub Actions workflows.""" - - def __init__( - self, - name: str, - node_type: GitHubWorkflowType, - source_path: str, - parsed_obj: Workflow | Identified[ReusableWorkflowCallJob] | ActionStep, - model: ThirdPartyAction | None = None, - **kwargs: Any, - ) -> None: - """Initialize instance. - - Parameters - ---------- - name : str - Name of the workflow (or URL for reusable and external workflows). - node_type : GitHubWorkflowType - The type of workflow. - source_path : str - The path of the workflow. - parsed_obj : Workflow | Identified[ReusableWorkflowCallJob] | ActionStep - The parsed Actions workflow object. Actual type must correspond to node type. - (INTERNAL -> Workflow, REUSABLE -> Identified[ReusableWorkflowCallJob], EXTERNAL -> ActionStep) - caller: BaseNode | None - The caller node. - model: ThirdPartyAction | None - The static analysis abstraction for the third-party GitHub Action. - """ - super().__init__(**kwargs) - self.name = name - self.node_type: GitHubWorkflowType = node_type - self.source_path = source_path - self.parsed_obj = parsed_obj - self.model = model - - def __str__(self) -> str: - return f"GitHubWorkflowNode({self.name},{self.node_type})" - - -class GitHubJobNode(BaseNode): - """This class represents a callgraph node for GitHub Actions jobs.""" - - def __init__(self, name: str, source_path: str, parsed_obj: Identified[Job], **kwargs: Any) -> None: - """Initialize instance. - - Parameters - ---------- - name : str - Name of the workflow (or URL for reusable and external workflows). - source_path : str - The path of the workflow. - parsed_obj : Identified[Job] - The parsed Actions workflow object. - caller: BaseNode - The caller node. 
-        """
-        super().__init__(**kwargs)
-        self.name = name
-        self.source_path = source_path
-        self.parsed_obj = parsed_obj
-
-    def __str__(self) -> str:
-        return f"GitHubJobNode({self.name})"
-
-
-def is_parsed_obj_workflow(
-    parsed_obj: Workflow | Identified[ReusableWorkflowCallJob] | ActionStep,
-) -> TypeGuard[Workflow]:
-    """Type guard for Workflow parsed_obj."""
-    return not isinstance(parsed_obj, Identified) and "jobs" in parsed_obj
-
-
-def is_parsed_obj_reusable_workflow_call_job(
-    obj: Workflow | Identified[ReusableWorkflowCallJob] | ActionStep,
-) -> TypeGuard[Identified[ReusableWorkflowCallJob]]:
-    """Type guard for ReusableWorkflowCallJob parsed_obj."""
-    return isinstance(obj, Identified)
-
-
-def is_parsed_obj_action_step(
-    parsed_obj: Workflow | Identified[ReusableWorkflowCallJob] | ActionStep,
-) -> TypeGuard[ActionStep]:
-    """Type guard for ActionStep parsed_obj."""
-    return not isinstance(parsed_obj, Identified) and "uses" in parsed_obj
-
-
-def find_expression_variables(value: str, exp_var: str) -> Iterable[str]:
-    """Find all the matching GitHub Actions expression variables in a string value.
-
-    GitHub Actions Expression syntax: ${{ <expression> }}
-    See https://docs.github.com/en/actions/learn-github-actions/expressions#about-expressions
-
-    Parameters
-    ----------
-    value: str
-        The value in which the expression variables are searched.
-    exp_var: str
-        The expression variable name.
-
-    Yields
-    ------
-    str
-        The expression variable names.
-
-    Examples
-    --------
-    >>> list(find_expression_variables("echo ${{ inputs.foo }}", "inputs"))
-    ['foo']
-    >>> list(find_expression_variables("echo ${{ inputs.foo }} ${{ inputs.bar }}", "inputs"))
-    ['foo', 'bar']
-    >>> list(find_expression_variables("echo ${{ inputs.foo }} ${{ inputs.bar }}", "matrix"))
-    []
-    """
-    expressions = re.findall(r"\$\{\{.*?\}\}", value)
-    pattern = r"\$\{\{\s+" + exp_var + r"\.(?P<variable>(.*?))\s+\}\}"
-    for exp in expressions:
-        match = re.match(pattern, exp)
-        if match:
-            yield match.group("variable")
-
-
-def resolve_matrix_variable(job_node: GitHubJobNode, var: str) -> Iterable[str]:
-    """Resolve the value of a GitHub Actions matrix variable.
-
-    For the specification of matrix variables in GitHub Actions see:
-    https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs
-
-    Parameters
-    ----------
-    job_node: GitHubJobNode
-        The target GitHub Actions job.
-    var: str
-        The matrix variable that needs to be resolved.
-
-    Yields
-    ------
-    str
-        The possible values of the matrix variable.
-
-    Raises
-    ------
-    GitHubActionsValueError
-        When the matrix variable cannot be found.
-    """
-    job_obj = job_node.parsed_obj.obj
-    if "strategy" not in job_obj:
-        raise GitHubActionsValueError(f"Unable to find `strategy` in {job_node.source_path} GitHub Action.")
-    if "matrix" not in job_obj["strategy"]:
-        raise GitHubActionsValueError(f"Unable to find `matrix` in {job_node.source_path} GitHub Action.")
-    matrix = job_obj["strategy"]["matrix"]
-    if not isinstance(matrix, dict):
-        raise GitHubActionsValueError(f"Unable to resolve matrix in {job_node.source_path} GitHub Action.")
-
-    matrix_vals = matrix.get(var)
-    if matrix_vals is None:
-        raise GitHubActionsValueError(f"Unable to find variable {var} in {job_node.source_path} GitHub Action.")
-
-    if isinstance(matrix_vals, list):
-        for val in matrix_vals:
-            # TODO: type of val permits dict/list, how to handle it? Just return Configuration instead of str
-            # and let the caller handle it?
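# Editorial sketch (not part of the patch): the named group "variable" restored
# in the pattern above is what match.group("variable") relies on. For example:
import re

pattern = r"\$\{\{\s+" + "matrix" + r"\.(?P<variable>(.*?))\s+\}\}"
match = re.match(pattern, "${{ matrix.java }}")
assert match is not None and match.group("variable") == "java"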
-            if isinstance(val, str):
-                yield val
-            if isinstance(val, int):
-                yield str(val)
-            if isinstance(val, float):
-                yield str(val)
-            if isinstance(val, bool):
-                yield "true" if val else "false"
-    else:
-        raise GitHubActionsValueError(f"Unable to resolve matrix in {job_node.source_path} GitHub Action.")
-
-
-def is_expression(value: str) -> bool:
-    """Determine if a value is a GitHub Actions expression.
-
-    Parameters
-    ----------
-    value: str
-        The input value.
-
-    Returns
-    -------
-    bool
-        True if the input value is a GitHub Actions expression.
-
-    Examples
-    --------
-    >>> is_expression("${{ foo }}")
-    True
-    >>> is_expression("${{ foo }")
-    False
-    >>> is_expression("${ foo }")
-    False
-    """
-    return re.match(r"\$\{\{.*?\}\}", value) is not None
-
-
-def find_language_setup_action(job_node: GitHubJobNode, lang_name: BuildLanguage) -> Language | None:
-    """Find the step that calls a language setup GitHub Action and return the model.
-
-    Parameters
-    ----------
-    job_node: GitHubJobNode
-        The target GitHub Actions job node.
-    lang_name: BuildLanguage
-        The target language used in the build.
-
-    Returns
-    -------
-    Language | None
-        The language model for the language setup GitHub Action or None.
-    """
-    for callee in job_node.callee:
-        model = callee.model
-        # Check if the model implements the Language protocol.
-        if isinstance(model, Language):
-            if model.lang_name == lang_name:
-                return model
-    return None
-
-
-def build_call_graph_from_node(node: GitHubWorkflowNode, repo_path: str) -> None:
-    """Analyze the GitHub Actions node to build the call graph.
-
-    Parameters
-    ----------
-    node : GitHubWorkflowNode
-        The node for a single GitHub Actions workflow.
-    repo_path: str
-        The file system path to the repo.
-    """
-    if not is_parsed_obj_workflow(node.parsed_obj):
-        return
-    jobs = node.parsed_obj["jobs"]
-    for job_name, job in jobs.items():
-        job_with_id = Identified[Job](job_name, job)
-        job_node = GitHubJobNode(name=job_name, source_path=node.source_path, parsed_obj=job_with_id, caller=node)
-        node.add_callee(job_node)
-
-        if is_normal_job(job):
-            # Add third-party workflows.
-            steps = job.get("steps")
-            if steps is None:
-                continue
-            for step in steps:
-                if is_action_step(step):
-                    # TODO: change source_path for external workflows.
-                    action_name = step["uses"]
-                    external_node = GitHubWorkflowNode(
-                        name=action_name,
-                        node_type=GitHubWorkflowType.EXTERNAL,
-                        source_path="",
-                        parsed_obj=step,
-                        caller=job_node,
-                    )
-                    external_node.model = create_third_party_action_model(external_node)
-                    job_node.add_callee(external_node)
-
-                # Check the shell type configuration. We can currently support `bash` and `sh`.
-                # By default `bash` is used on non-Windows runners, which we support.
-                # See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#defaultsrunshell
-                # TODO: support Powershell for Windows runners, which is the default shell in GitHub Actions.
-                # Right now, the script with the default shell is passed to the parser, which will fail
-                # if the runner is Windows and Powershell is used. But there is no easy way to avoid passing
-                # the script because that means we need to accurately determine the runner's OS.
- elif step.get("run") and ("shell" not in step or step["shell"] in {"bash", "sh"}): - try: - name = "UNKNOWN" - node_id = None - if "id" in step: - node_id = step["id"] - if "name" in step: - name = step["name"] - - callee = create_bash_node( - name=name, - node_id=node_id, - node_type=BashScriptType.INLINE, - source_path=node.source_path, - ci_step_ast=step, - repo_path=repo_path, - caller=job_node, - recursion_depth=0, - ) - except CallGraphError as error: - logger.debug(error) - continue - job_node.add_callee(callee) - - elif is_reusable_workflow_call_job(job): - workflow_call_job_with_id = Identified[ReusableWorkflowCallJob](job_name, job) - # Add reusable workflows. - logger.debug("Found reusable workflow: %s.", job["uses"]) - # TODO: change source_path for reusable workflows. - reusable_node = GitHubWorkflowNode( - name=job["uses"], - node_type=GitHubWorkflowType.REUSABLE, - source_path="", - parsed_obj=workflow_call_job_with_id, - caller=job_node, - ) - reusable_node.model = create_third_party_action_model(reusable_node) - job_node.add_callee(reusable_node) - - -def build_call_graph_from_path(root: BaseNode, workflow_path: str, repo_path: str, macaron_path: str = "") -> BaseNode: - """Build the call Graph for GitHub Actions workflows. - - At the moment it does not analyze third-party workflows to include their callees. - - Parameters - ---------- - root : BaseNode - The root call graph node. - workflow_path: str - The path to the CI workflow file. - repo_path: str - The path to the target repository. - macaron_path: str - Macaron's root path (optional). - - Returns - ------- - BaseNode - The callgraph node for the GitHub Actions workflow. - - Raises - ------ - ParseError - When parsing the workflow fails with error. - """ - if not macaron_path: - macaron_path = global_config.macaron_path - - # Parse GitHub Actions workflows. - logger.debug( - "Parsing %s", - workflow_path, - ) - try: - parsed_obj: Workflow = parse_action(workflow_path) - except ParseError as error: - logger.debug("Unable to parse GitHub Actions at the target %s: %s", repo_path, error) - raise ParseError from error - - # Add internal workflows. - workflow_name = os.path.basename(workflow_path) - workflow_node = GitHubWorkflowNode( - name=workflow_name, - node_type=GitHubWorkflowType.INTERNAL, - source_path=workflow_path, - parsed_obj=parsed_obj, - caller=root, - ) - build_call_graph_from_node(workflow_node, repo_path=repo_path) - - return workflow_node - - -def get_reachable_secrets(step_node: BashNode) -> Iterable[str]: - """Get reachable secrets to a GitHub Actions step. - - Parameters - ---------- - step_node: BashNode - The target GitHub Action step node. - - Yields - ------ - str - The reachable secret variable name. - """ - job_node = step_node.caller - if not isinstance(job_node, GitHubJobNode): - return - - def _find_secret_keys(ast: NormalJob | ReusableWorkflowCallJob | Step | None) -> Iterable[str]: - if ast is None: - return - if "uses" in ast: - return - normal_job = cast(NormalJob, ast) - if "env" in normal_job: - env = normal_job["env"] - if isinstance(env, dict): - for key, val in env.items(): - if isinstance(val, str): - if list(find_expression_variables(value=val, exp_var="secrets")): - yield key - - # Get reachable secrets set as environment variables in the job. - yield from _find_secret_keys(job_node.parsed_obj.obj) - - # Get reachable secrets set as environment variables in the step. 
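# Editorial sketch (not part of the patch): how the env scan above flags
# reachable secrets. Given a hypothetical job env
#     env = {"TOKEN": "${{ secrets.PYPI_API_TOKEN }}", "CI": "true"}
# find_expression_variables(env["TOKEN"], "secrets") yields "PYPI_API_TOKEN",
# so the key "TOKEN" is yielded as a reachable secret; "CI" is not.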
- if step_node.node_type == BashScriptType.INLINE: - yield from _find_secret_keys(step_node.parsed_step_obj) - - -def get_ci_events(workflow_node: GitHubWorkflowNode) -> list[str] | None: - """Get the CI events that trigger the GitHub Action workflow. - - Parameters - ---------- - workflow_node: GitHubWorkflowNode - The target GitHub Action workflow node. - - Returns - ------- - list[str] | None - The list of event names or None. - """ - result: list[str] = [] - ast = workflow_node.parsed_obj - if not isinstance(ast, dict) or "on" not in ast: - raise GitHubActionsValueError(f"Unable to find `on` event in {workflow_node.source_path} GitHub Action.") - - on = cast(Workflow, ast)["on"] - - if isinstance(on, str): - result.append(on) - elif isinstance(on, list): - for hook in on: - result.append(hook) - else: - for key in on: - result.append(key) - - return result - - -class SetupJava(Language, ThirdPartyAction): - """This class models the official setup-java GitHub Action from GitHub. - - For the table of supported distributions see: - https://github.com/actions/setup-java?tab=readme-ov-file#supported-distributions - """ - - #: Name of the GitHub Action. - action_name = "actions/setup-java" - - #: Version of the GitHub Action. - action_version: None - - def __init__(self, external_node: GitHubWorkflowNode): - """Initialize the setup-java GitHub Action model. - - Parameters - ---------- - external_node: GitHubWorkflowNode - The external GitHub Action workflow node. - """ - # external_node is assumed to be an EXTERNAL node with ActionStep parsed_obj - step = external_node.parsed_obj - if not is_parsed_obj_action_step(step): - raise ValueError("Expected an action step node") - self._lang_name = BuildLanguage.JAVA - self._lang_distributions = None - self._lang_versions = None - self._lang_url = "https://github.com/actions/setup-java" - lang_distribution_exp = None - lang_version_exp = None - if distribution := get_step_input(step, key="distribution"): - if not is_expression(distribution): - self._lang_distributions = [distribution] - else: - lang_distribution_exp = distribution - if java_version := get_step_input(step, key="java-version"): - if not is_expression(java_version): - self._lang_versions = [java_version] - else: - lang_version_exp = java_version - # Handle matrix values. - matrix_values = {} - if lang_distribution_exp and "matrix." in lang_distribution_exp: - matrix_values["lang_distribution_var"] = find_expression_variables( - value=lang_distribution_exp, exp_var="matrix" - ) - if lang_version_exp and "matrix." 
in lang_version_exp:
-            matrix_values["lang_version_var"] = find_expression_variables(value=lang_version_exp, exp_var="matrix")
-
-        if matrix_values:
-            job_node = external_node.caller
-            if job_node is None:
-                logger.debug("Unable to find the caller GitHub Action job for step %s.", external_node.name)
-                return
-            try:
-                if (variables := matrix_values.get("lang_distribution_var")) is not None:
-                    values: list[str] = []
-                    for var in variables:
-                        values.extend(resolve_matrix_variable(job_node, var))
-                    if values:
-                        self._lang_distributions = values
-            except GitHubActionsValueError as error:
-                logger.debug(error)
-
-            try:
-                if (variables := matrix_values.get("lang_version_var")) is not None:
-                    values = []
-                    for var in variables:
-                        values.extend(resolve_matrix_variable(job_node, var))
-                    if values:
-                        self._lang_versions = values
-            except GitHubActionsValueError as error:
-                logger.debug(error)
-
-    @property
-    def lang_name(self) -> str:
-        """Get the name of the language."""
-        return self._lang_name
-
-    @property
-    def lang_versions(self) -> list[str] | None:
-        """Get the possible versions of the language."""
-        return self._lang_versions
-
-    @property
-    def lang_distributions(self) -> list[str] | None:
-        """Get the possible distributions of the language."""
-        return self._lang_distributions
-
-    @property
-    def lang_url(self) -> str | None:
-        """Get the URL that provides information about the language distributions and versions."""
-        return self._lang_url
-
-
-class OracleSetupJava(Language, ThirdPartyAction):
-    """This class models the Oracle setup-java GitHub Action.
-
-    For the table of supported distributions see:
-    https://github.com/oracle-actions/setup-java?tab=readme-ov-file#input-overview
-    """
-
-    #: Name of the GitHub Action.
-    action_name = "oracle-actions/setup-java"
-
-    #: Version of the GitHub Action.
-    action_version: None
-
-    def __init__(self, external_node: GitHubWorkflowNode):
-        """Initialize the Oracle setup-java GitHub Action model.
-
-        Parameters
-        ----------
-        external_node: GitHubWorkflowNode
-            The external GitHub Action workflow node.
-        """
-        # external_node is assumed to be an EXTERNAL node with ActionStep parsed_obj
-        step = external_node.parsed_obj
-        if not is_parsed_obj_action_step(step):
-            raise ValueError("Expected an action step node")
-        self._lang_name = BuildLanguage.JAVA
-        self._lang_distributions = None
-        self._lang_versions = None
-        self._lang_url = "https://github.com/oracle-actions/setup-java"
-        lang_distribution_exp = None
-        lang_version_exp = None
-        if website := get_step_input(step, key="website"):
-            if not is_expression(website):
-                self._lang_distributions = [website]
-            else:
-                lang_distribution_exp = website
-        if java_release := get_step_input(step, key="release"):
-            if not is_expression(java_release):
-                self._lang_versions = [java_release]
-            else:
-                lang_version_exp = java_release
-        # Handle matrix values.
-        matrix_values = {}
-        if lang_distribution_exp and "matrix." in lang_distribution_exp:
-            matrix_values["lang_distribution_var"] = find_expression_variables(
-                value=lang_distribution_exp, exp_var="matrix"
-            )
-        if lang_version_exp and "matrix."
in lang_version_exp:
-            matrix_values["lang_version_var"] = find_expression_variables(value=lang_version_exp, exp_var="matrix")
-
-        if matrix_values:
-            job_node = external_node.caller
-            if job_node is None:
-                logger.debug("Unable to find the caller GitHub Action job for step %s.", external_node.name)
-                return
-            try:
-                if (variables := matrix_values.get("lang_distribution_var")) is not None:
-                    values: list[str] = []
-                    for var in variables:
-                        values.extend(resolve_matrix_variable(job_node, var))
-                    if values:
-                        self._lang_distributions = values
-            except GitHubActionsValueError as error:
-                logger.debug(error)
-
-            try:
-                if (variables := matrix_values.get("lang_version_var")) is not None:
-                    values = []
-                    for var in variables:
-                        values.extend(resolve_matrix_variable(job_node, var))
-                    if values:
-                        self._lang_versions = values
-            except GitHubActionsValueError as error:
-                logger.debug(error)
-
-    @property
-    def lang_name(self) -> str:
-        """Get the name of the language."""
-        return self._lang_name
-
-    @property
-    def lang_versions(self) -> list[str] | None:
-        """Get the possible versions of the language."""
-        return self._lang_versions
-
-    @property
-    def lang_distributions(self) -> list[str] | None:
-        """Get the possible distributions of the language."""
-        return self._lang_distributions
-
-    @property
-    def lang_url(self) -> str | None:
-        """Get the URL that provides information about the language distributions and versions."""
-        return self._lang_url
-
-
-class GraalVMSetup(Language, ThirdPartyAction):
-    """This class models the GraalVM setup GitHub Action from GitHub.
-
-    For the table of supported distributions see:
-    https://github.com/graalvm/setup-graalvm
-    """
-
-    #: Name of the GitHub Action.
-    action_name = "graalvm/setup-graalvm"
-
-    #: Version of the GitHub Action.
-    action_version: None
-
-    def __init__(self, external_node: GitHubWorkflowNode):
-        """Initialize the setup-graalvm GitHub Action model.
-
-        Parameters
-        ----------
-        external_node: GitHubWorkflowNode
-            The external GitHub Action workflow node.
-        """
-        # external_node is assumed to be an EXTERNAL node with ActionStep parsed_obj
-        step = external_node.parsed_obj
-        if not is_parsed_obj_action_step(step):
-            raise ValueError("Expected an action step node")
-        self._lang_name = BuildLanguage.JAVA
-        self._lang_distributions = None
-        self._lang_versions = None
-        self._lang_url = "https://github.com/graalvm/setup-graalvm"
-        lang_distribution_exp = None
-        lang_version_exp = None
-        if distribution := get_step_input(step, key="distribution"):
-            if not is_expression(distribution):
-                self._lang_distributions = [distribution]
-            else:
-                lang_distribution_exp = distribution
-        if java_version := get_step_input(step, key="java-version"):
-            if not is_expression(java_version):
-                self._lang_versions = [java_version]
-            else:
-                lang_version_exp = java_version
-        # Handle matrix values.
-        matrix_values = {}
-        if lang_distribution_exp and "matrix." in lang_distribution_exp:
-            matrix_values["lang_distribution_var"] = find_expression_variables(
-                value=lang_distribution_exp, exp_var="matrix"
-            )
-        if lang_version_exp and "matrix."
in lang_version_exp:
-            matrix_values["lang_version_var"] = find_expression_variables(value=lang_version_exp, exp_var="matrix")
-
-        if matrix_values:
-            job_node = external_node.caller
-            if job_node is None:
-                logger.debug("Unable to find the caller GitHub Action job for step %s.", external_node.name)
-                return
-            try:
-                if (variables := matrix_values.get("lang_distribution_var")) is not None:
-                    values: list[str] = []
-                    for var in variables:
-                        values.extend(resolve_matrix_variable(job_node, var))
-                    if values:
-                        self._lang_distributions = values
-            except GitHubActionsValueError as error:
-                logger.debug(error)
-
-            try:
-                if (variables := matrix_values.get("lang_version_var")) is not None:
-                    values = []
-                    for var in variables:
-                        values.extend(resolve_matrix_variable(job_node, var))
-                    if values:
-                        self._lang_versions = values
-            except GitHubActionsValueError as error:
-                logger.debug(error)
-
-    @property
-    def lang_name(self) -> str:
-        """Get the name of the language."""
-        return self._lang_name
-
-    @property
-    def lang_versions(self) -> list[str] | None:
-        """Get the possible versions of the language."""
-        return self._lang_versions
-
-    @property
-    def lang_distributions(self) -> list[str] | None:
-        """Get the possible distributions of the language."""
-        return self._lang_distributions
-
-    @property
-    def lang_url(self) -> str | None:
-        """Get the URL that provides information about the language distributions and versions."""
-        return self._lang_url
-
-
-def create_third_party_action_model(external_node: GitHubWorkflowNode) -> ThirdPartyAction:
-    """Create an instance of a third-party action model object.
-
-    Parameters
-    ----------
-    external_node: GitHubWorkflowNode
-        The external GitHub Actions workflow node.
-
-    Returns
-    -------
-    ThirdPartyAction
-        An instance object for the ThirdPartyAction model.
- """ - action_name = external_node.name - action_version = None - if "@" in external_node.name: - action_name, action_version = external_node.name.split("@", maxsplit=1) - match action_name: - case "actions/setup-java": - return SetupJava(external_node=external_node) - case "oracle-actions/setup-java": - return OracleSetupJava(external_node=external_node) - case "graalvm/setup-graalvm": - return GraalVMSetup(external_node=external_node) - return ThirdPartyAction(action_name=action_name, action_version=action_version) diff --git a/src/macaron/slsa_analyzer/ci_service/github_actions/github_actions_ci.py b/src/macaron/slsa_analyzer/ci_service/github_actions/github_actions_ci.py index c0fd6aa46..b24dc5963 100644 --- a/src/macaron/slsa_analyzer/ci_service/github_actions/github_actions_ci.py +++ b/src/macaron/slsa_analyzer/ci_service/github_actions/github_actions_ci.py @@ -3,29 +3,19 @@ """This module analyzes GitHub Actions CI.""" +from __future__ import annotations import glob import logging import os -from collections.abc import Iterable from datetime import datetime, timedelta, timezone -from macaron.code_analyzer.call_graph import BaseNode, CallGraph +from macaron.code_analyzer.dataflow_analysis.analysis import analyse_github_workflow_file +from macaron.code_analyzer.dataflow_analysis.core import Node, NodeForest from macaron.config.defaults import defaults from macaron.config.global_config import global_config -from macaron.errors import CallGraphError, GitHubActionsValueError, ParseError -from macaron.parsers.bashparser import BashNode, BashScriptType -from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool, BuildToolCommand +from macaron.errors import GitHubActionsValueError, ParseError from macaron.slsa_analyzer.ci_service.base_ci_service import BaseCIService -from macaron.slsa_analyzer.ci_service.github_actions.analyzer import ( - GitHubJobNode, - GitHubWorkflowNode, - GitHubWorkflowType, - build_call_graph_from_path, - find_language_setup_action, - get_ci_events, - get_reachable_secrets, -) from macaron.slsa_analyzer.git_service.api_client import GhAPIClient, get_default_gh_client from macaron.slsa_analyzer.git_service.base_git_service import BaseGitService from macaron.slsa_analyzer.git_service.github import GitHub @@ -386,7 +376,7 @@ def workflow_run_in_date_time_range( raise GitHubActionsValueError("GitHub Actions workflow run misses jobs information.") for job in run_jobs["jobs"]: # If the deploy step is a Reusable Workflow, there won't be any steps in the caller job. - if callee_node_type == GitHubWorkflowType.REUSABLE.value: + if callee_node_type == "reusable": if not job["name"].startswith(job_id) or job["conclusion"] != "success": continue started_at = datetime.fromisoformat(job["started_at"]) @@ -576,7 +566,7 @@ def has_kws_in_log(self, latest_run: dict, build_log: list) -> bool: logger.info("No build kw in log file. Continue ...") return False - def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph: + def build_call_graph(self, repo_path: str, macaron_path: str = "") -> NodeForest: """Build the call Graph for GitHub Actions workflows. At the moment it does not analyze third-party workflows to include their callees. @@ -596,106 +586,18 @@ def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph: if not macaron_path: macaron_path = global_config.macaron_path - root: BaseNode = BaseNode() - gh_cg = CallGraph(root, repo_path) - # Parse GitHub Actions workflows. 
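# Editorial sketch (not part of the patch): the replacement loop below builds
# one dataflow node per workflow file and wraps them in a NodeForest, roughly:
#
#     nodes: list[Node] = []
#     for workflow_path in self.get_workflows(repo_path):
#         try:
#             nodes.append(analyse_github_workflow_file(workflow_path, repo_path))
#         except ParseError:
#             continue
#     return NodeForest(nodes)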
files = self.get_workflows(repo_path) + nodes: list[Node] = [] for workflow_path in files: try: - callee = build_call_graph_from_path( - root=root, workflow_path=workflow_path, repo_path=repo_path, macaron_path=macaron_path - ) + workflow_node = analyse_github_workflow_file(workflow_path, repo_path) + except ParseError: logger.debug("Skip adding workflow at %s to the callgraph.", workflow_path) continue - root.add_callee(callee) - return gh_cg - - def _get_build_tool_commands(self, callgraph: CallGraph, build_tool: BaseBuildTool) -> Iterable[BuildToolCommand]: - """Traverse the callgraph and find all the reachable build tool commands.""" - for node in callgraph.bfs(): - # We are just interested in nodes that have bash commands. - if isinstance(node, BashNode): - # We collect useful contextual information for the called BashNode. - caller_node = node.caller - # The GitHub Actions workflow that triggers the path in the callgraph. - workflow_node = None - # The GitHub Actions job that triggers the path in the callgraph. - job_node = None - # The step in GitHub Actions job that triggers the path in the callgraph. - step_node = node if node.node_type == BashScriptType.INLINE else None - - # Walk up the callgraph to find the relevant caller nodes. - # In GitHub Actions a `GitHubWorkflowNode` may call several `GitHubJobNode`s - # and a `GitHubJobNode` may call several steps, which can be external `GitHubWorkflowNode` - # or inlined run nodes. We currently support the run steps that call shell scripts as - # `BashNode`. An inlined `BashNode` can call `BashNode` as bash files. - # TODO: revisit this implementation if analysis of external workflows is supported in - # the future, and decide if setting the caller workflow and job nodes to the nodes in the - # main triggering workflow is still expected. - while caller_node is not None: - match caller_node: - case GitHubWorkflowNode(): - workflow_node = caller_node - case GitHubJobNode(): - job_node = caller_node - case BashNode(node_type=BashScriptType.INLINE): - step_node = caller_node - - caller_node = caller_node.caller - - # Check if there was an issue in finding any of the caller nodes. - if workflow_node is None or job_node is None or step_node is None: - raise CallGraphError("Unable to traverse the call graph to find build commands.") - - # Find the bash commands that call the build tool. - for cmd in node.parsed_bash_obj.get("commands", []): - if build_tool.is_build_command(cmd): - lang_versions = lang_distributions = lang_url = None - if lang_model := find_language_setup_action(job_node, build_tool.language): - lang_versions = lang_model.lang_versions - lang_distributions = lang_model.lang_distributions - lang_url = lang_model.lang_url - yield BuildToolCommand( - ci_path=workflow_node.source_path, - command=cmd, - step_node=step_node, - language=build_tool.language, - language_versions=lang_versions, - language_distributions=lang_distributions, - language_url=lang_url, - reachable_secrets=list(get_reachable_secrets(step_node)), - events=get_ci_events(workflow_node), - ) - - def get_build_tool_commands(self, callgraph: CallGraph, build_tool: BaseBuildTool) -> Iterable[BuildToolCommand]: - """Traverse the callgraph and find all the reachable build tool commands. - - This generator yields sorted build tool command objects to allow a deterministic behavior. - The objects are sorted based on the string representation of the build tool object. - - Parameters - ---------- - callgraph: CallGraph - The callgraph reachable from the CI workflows. 
-        build_tool: BaseBuildTool
-            The corresponding build tool for which shell commands need to be detected.
-
-        Yields
-        ------
-        BuildToolCommand
-            The object that contains the build command as well as useful contextual information.
-
-        Raises
-        ------
-        CallGraphError
-            Error raised when an error occurs while traversing the callgraph.
-        """
-        yield from sorted(
-            self._get_build_tool_commands(callgraph=callgraph, build_tool=build_tool),
-            key=str,
-        )
+            nodes.append(workflow_node)
+        return NodeForest(nodes)

     def get_third_party_configurations(self) -> list[str]:
         """Get the list of third-party CI configuration files.
diff --git a/src/macaron/slsa_analyzer/ci_service/gitlab_ci.py b/src/macaron/slsa_analyzer/ci_service/gitlab_ci.py
index cd7e3210d..ede49002f 100644
--- a/src/macaron/slsa_analyzer/ci_service/gitlab_ci.py
+++ b/src/macaron/slsa_analyzer/ci_service/gitlab_ci.py
@@ -1,9 +1,11 @@
-# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

 """This module analyzes GitLab CI."""

-from macaron.code_analyzer.call_graph import BaseNode, CallGraph
+from __future__ import annotations
+
+from macaron.code_analyzer.dataflow_analysis.core import NodeForest
 from macaron.config.defaults import defaults
 from macaron.slsa_analyzer.ci_service.base_ci_service import BaseCIService
@@ -41,7 +43,7 @@ def load_defaults(self) -> None:
     def set_api_client(self) -> None:
         """Set the API client using the personal access token."""

-    def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph:
+    def build_call_graph(self, repo_path: str, macaron_path: str = "") -> NodeForest:
         """Build the call Graph for this CI service.

         Parameters
@@ -53,10 +55,10 @@ def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph:

         Returns
         -------
-        CallGraph : CallGraph
+        NodeForest
             The call graph built for the CI.
         """
-        return CallGraph(BaseNode(), "")
+        return NodeForest([])

     def has_latest_run_passed(
         self, repo_full_name: str, branch_name: str | None, commit_sha: str, commit_date: str, workflow: str
diff --git a/src/macaron/slsa_analyzer/ci_service/jenkins.py b/src/macaron/slsa_analyzer/ci_service/jenkins.py
index ebef614ca..c95edb4cb 100644
--- a/src/macaron/slsa_analyzer/ci_service/jenkins.py
+++ b/src/macaron/slsa_analyzer/ci_service/jenkins.py
@@ -3,21 +3,17 @@

 """This module analyzes Jenkins CI."""

+from __future__ import annotations
+
 import glob
 import logging
 import os
 import re
-from collections.abc import Iterable
-from enum import Enum
-from typing import Any

-from macaron.code_analyzer.call_graph import BaseNode, CallGraph
+from macaron.code_analyzer.dataflow_analysis.analysis import analyse_bash_script
+from macaron.code_analyzer.dataflow_analysis.core import Node, NodeForest
 from macaron.config.defaults import defaults
 from macaron.config.global_config import global_config
-from macaron.errors import ParseError
-from macaron.parsers import bashparser
-from macaron.repo_verifier.repo_verifier import BaseBuildTool
-from macaron.slsa_analyzer.build_tool.base_build_tool import BuildToolCommand
 from macaron.slsa_analyzer.ci_service.base_ci_service import BaseCIService

 logger: logging.Logger = logging.getLogger(__name__)
@@ -66,7 +62,7 @@ def load_defaults(self) -> None:
     def set_api_client(self) -> None:
         """Set the API client using the personal access token."""

-    def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph:
+    def build_call_graph(self, repo_path: str, macaron_path: str = "") -> NodeForest:
         """Build the call Graph for this CI service.

         Parameters
@@ -78,114 +74,36 @@ def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph:

         Returns
         -------
-        CallGraph : CallGraph
+        NodeForest
             The call graph built for the CI.
         """
         if not macaron_path:
             macaron_path = global_config.macaron_path

-        root: BaseNode = BaseNode()
-        call_graph = CallGraph(root, repo_path)
-
         # To match lines that start with sh '' or sh ''' ''' (either single or triple quotes)
         # TODO: we need to support multi-line cases.
         pattern = r"^\s*sh\s+'{1,3}(.*?)'{1,3}$"
         workflow_files = self.get_workflows(repo_path)

+        nodes: list[Node] = []
+
         for workflow_path in workflow_files:
             try:
                 with open(workflow_path, encoding="utf-8") as wf:
                     lines = wf.readlines()
             except OSError as error:
                 logger.debug("Unable to read Jenkinsfile %s: %s", workflow_path, error)
-                return call_graph
-
-            # Add internal workflow.
-            workflow_name = os.path.basename(workflow_path)
-            workflow_node = JenkinsNode(
-                name=workflow_name,
-                node_type=JenkinsNodeType.INTERNAL,
-                source_path=workflow_path,
-                caller=root,
-            )
-            root.add_callee(workflow_node)
+                return NodeForest([])

             # Find matching lines.
             for line in lines:
                 match = re.match(pattern, line)
                 if not match:
                     continue
+                nodes.append(analyse_bash_script(match[1], workflow_path, repo_path))

-                try:
-                    parsed_bash_script = bashparser.parse(match.group(1), macaron_path=macaron_path)
-                except ParseError as error:
-                    logger.debug(error)
-                    continue
-
-                # TODO: Similar to GitHub Actions, we should enable support for recursive calls to bash scripts
-                # within Jenkinsfiles. While the implementation should be relatively straightforward, it’s
-                # recommended to first refactor the bashparser to make it agnostic to GitHub Actions.
-                bash_node = bashparser.BashNode(
-                    "jenkins_inline_cmd",
-                    bashparser.BashScriptType.INLINE,
-                    workflow_path,
-                    parsed_step_obj=None,
-                    parsed_bash_obj=parsed_bash_script,
-                    node_id=None,
-                    caller=workflow_node,
-                )
-                workflow_node.add_callee(bash_node)
-
-        return call_graph
-
-    def get_build_tool_commands(self, callgraph: CallGraph, build_tool: BaseBuildTool) -> Iterable[BuildToolCommand]:
-        """
-        Traverse the callgraph and find all the reachable build tool commands.
-
-        Parameters
-        ----------
-        callgraph: CallGraph
-            The callgraph reachable from the CI workflows.
-        build_tool: BaseBuildTool
-            The corresponding build tool for which shell commands need to be detected.
-
-        Yields
-        ------
-        BuildToolCommand
-            The object that contains the build command as well as useful contextual information.
-
-        Raises
-        ------
-        CallGraphError
-            Error raised when an error occurs while traversing the callgraph.
-        """
-        yield from sorted(
-            self._get_build_tool_commands(callgraph=callgraph, build_tool=build_tool),
-            key=str,
-        )
-
-    def _get_build_tool_commands(self, callgraph: CallGraph, build_tool: BaseBuildTool) -> Iterable[BuildToolCommand]:
-        """Traverse the callgraph and find all the reachable build tool commands."""
-        for node in callgraph.bfs():
-            # We are just interested in nodes that have bash commands.
-            if isinstance(node, bashparser.BashNode):
-                # The Jenkins configuration that triggers the path in the callgraph.
-                workflow_node = node.caller
-
-                # Find the bash commands that call the build tool.
-                for cmd in node.parsed_bash_obj.get("commands", []):
-                    if build_tool.is_build_command(cmd):
-                        yield BuildToolCommand(
-                            ci_path=workflow_node.source_path if workflow_node else "",
-                            command=cmd,
-                            step_node=None,
-                            language=build_tool.language,
-                            language_versions=None,
-                            language_distributions=None,
-                            language_url=None,
-                            reachable_secrets=[],
-                            events=None,
-                        )
+        return NodeForest(nodes)

     def has_latest_run_passed(
         self, repo_full_name: str, branch_name: str | None, commit_sha: str, commit_date: str, workflow: str
@@ -214,41 +132,3 @@ def has_latest_run_passed(
             The feed back of the check, or empty if no passing workflow is found.
         """
         return ""
-
-
-class JenkinsNodeType(str, Enum):
-    """This class represents Jenkins node type."""
-
-    INTERNAL = "internal"  # Configurations declared in one file.
-
-
-class JenkinsNode(BaseNode):
-    """This class represents a callgraph node for Jenkinsfile configuration."""
-
-    def __init__(
-        self,
-        name: str,
-        node_type: JenkinsNodeType,
-        source_path: str,
-        **kwargs: Any,
-    ) -> None:
-        """Initialize instance.
-
-        Parameters
-        ----------
-        name : str
-            Name of the workflow.
-        node_type : JenkinsNodeType
-            The type of node.
-        source_path : str
-            The path of the workflow.
-        caller: BaseNode | None
-            The caller node.
-        """
-        super().__init__(**kwargs)
-        self.name = name
-        self.node_type: JenkinsNodeType = node_type
-        self.source_path = source_path
-
-    def __str__(self) -> str:
-        return f"JenkinsNode({self.name},{self.node_type})"
diff --git a/src/macaron/slsa_analyzer/ci_service/travis.py b/src/macaron/slsa_analyzer/ci_service/travis.py
index 8b34d27e8..a50936860 100644
--- a/src/macaron/slsa_analyzer/ci_service/travis.py
+++ b/src/macaron/slsa_analyzer/ci_service/travis.py
@@ -1,9 +1,11 @@
-# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module analyzes Travis CI.""" -from macaron.code_analyzer.call_graph import BaseNode, CallGraph +from __future__ import annotations + +from macaron.code_analyzer.dataflow_analysis.core import NodeForest from macaron.config.defaults import defaults from macaron.slsa_analyzer.ci_service.base_ci_service import BaseCIService @@ -41,7 +43,7 @@ def load_defaults(self) -> None: def set_api_client(self) -> None: """Set the API client using the personal access token.""" - def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph: + def build_call_graph(self, repo_path: str, macaron_path: str = "") -> NodeForest: """Build the call Graph for this CI service. Parameters @@ -53,10 +55,10 @@ def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph: Returns ------- - CallGraph : CallGraph + NodeForest The call graph built for the CI. """ - return CallGraph(BaseNode(), "") + return NodeForest([]) def has_latest_run_passed( self, repo_full_name: str, branch_name: str | None, commit_sha: str, commit_date: str, workflow: str diff --git a/src/macaron/slsa_analyzer/specs/ci_spec.py b/src/macaron/slsa_analyzer/specs/ci_spec.py index 0f00e5bdb..ad928b792 100644 --- a/src/macaron/slsa_analyzer/specs/ci_spec.py +++ b/src/macaron/slsa_analyzer/specs/ci_spec.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module contains the BuildSpec class.""" @@ -6,7 +6,7 @@ from collections.abc import Sequence from typing import TypedDict -from macaron.code_analyzer.call_graph import CallGraph +from macaron.code_analyzer.dataflow_analysis.core import NodeForest from macaron.slsa_analyzer.asset import AssetLocator from macaron.slsa_analyzer.ci_service.base_ci_service import BaseCIService from macaron.slsa_analyzer.provenance.intoto import InTotoV01Payload @@ -19,7 +19,7 @@ class CIInfo(TypedDict): service: BaseCIService """The CI service data.""" - callgraph: CallGraph + callgraph: NodeForest """The call graph for this CI service.""" provenance_assets: list[AssetLocator] diff --git a/tests/conftest.py b/tests/conftest.py index 77223948f..cb2cee8c0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -10,14 +10,13 @@ import pytest from pytest_httpserver import HTTPServer -import macaron from macaron.build_spec_generator.cli_command_parser.gradle_cli_parser import GradleCLICommandParser from macaron.build_spec_generator.cli_command_parser.maven_cli_parser import MavenCLICommandParser -from macaron.code_analyzer.call_graph import BaseNode, CallGraph +from macaron.code_analyzer.dataflow_analysis.analysis import analyse_github_workflow +from macaron.code_analyzer.dataflow_analysis.core import NodeForest from macaron.config.defaults import create_defaults, defaults, load_defaults from macaron.database.table_definitions import Analysis, Component, RepoFinderMetadata, Repository -from macaron.parsers.bashparser import BashScriptType, create_bash_node -from macaron.parsers.github_workflow_model import Identified, Job, NormalJob, RunStep, Workflow +from macaron.parsers.github_workflow_model import NormalJob, RunStep, Workflow from macaron.slsa_analyzer.analyze_context import AnalyzeContext from 
macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool from macaron.slsa_analyzer.build_tool.conda import Conda @@ -33,11 +32,6 @@ from macaron.slsa_analyzer.build_tool.yarn import Yarn from macaron.slsa_analyzer.ci_service.base_ci_service import BaseCIService from macaron.slsa_analyzer.ci_service.circleci import CircleCI -from macaron.slsa_analyzer.ci_service.github_actions.analyzer import ( - GitHubJobNode, - GitHubWorkflowNode, - GitHubWorkflowType, -) from macaron.slsa_analyzer.ci_service.github_actions.github_actions_ci import GitHubActions from macaron.slsa_analyzer.ci_service.gitlab_ci import GitLabCI from macaron.slsa_analyzer.ci_service.jenkins import Jenkins @@ -489,7 +483,7 @@ def __init__( super().__init__(component, *args, **kwargs) -def build_github_actions_call_graph_for_commands(commands: list[str]) -> CallGraph: +def build_github_actions_call_graph_for_commands(commands: list[str]) -> NodeForest: """ Create a dummy callgraph that calls a list of bash commands for testing. @@ -498,37 +492,10 @@ def build_github_actions_call_graph_for_commands(commands: list[str]) -> CallGra commands: list[str] The list of bash commands. """ - root: BaseNode = BaseNode() - gh_cg = CallGraph(root, "") run_step: RunStep = {"run": ";".join(commands)} job_obj: NormalJob = {"runs-on": "", "steps": [run_step]} workflow_obj: Workflow = {"on": "release", "jobs": {"release": job_obj}} - workflow_node = GitHubWorkflowNode( - name="", - node_type=GitHubWorkflowType.INTERNAL, - source_path="", - parsed_obj=workflow_obj, - caller=root, - ) - root.add_callee(workflow_node) - job_obj_with_id: Identified[Job] = Identified("release", job_obj) - job_node = GitHubJobNode(name="", source_path="", parsed_obj=job_obj_with_id, caller=workflow_node) - workflow_node.add_callee(job_node) - - job_node.add_callee( - create_bash_node( - name="run", - node_id=None, - node_type=BashScriptType.INLINE, - source_path="", - ci_step_ast=run_step, - repo_path="", - caller=job_node, - recursion_depth=0, - macaron_path=macaron.MACARON_PATH, - ) - ) - + gh_cg = NodeForest([analyse_github_workflow(workflow_obj, "test.yaml", None)]) return gh_cg diff --git a/tests/parsers/bashparser/test_bashparser.py b/tests/parsers/bashparser/test_bashparser.py index 3f8ff5331..97c431034 100644 --- a/tests/parsers/bashparser/test_bashparser.py +++ b/tests/parsers/bashparser/test_bashparser.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """ @@ -12,9 +12,8 @@ import pytest from macaron import MACARON_PATH -from macaron.code_analyzer.call_graph import BaseNode -from macaron.errors import CallGraphError, ParseError -from macaron.parsers.bashparser import BashScriptType, create_bash_node, parse, parse_file +from macaron.errors import ParseError +from macaron.parsers.bashparser import parse, parse_file @pytest.mark.parametrize( @@ -47,36 +46,3 @@ def test_bashparser_parse_invalid() -> None: # Parse the bash script file. 
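# Editorial sketch (not part of the patch): the surviving bashparser API after
# this change. `parse` returns a parsed object whose "commands" entry other
# call sites iterate over (shape assumed from usage elsewhere in this patch):
#
#     parsed = parse("mvn -B package", macaron_path=MACARON_PATH)
#     commands = parsed.get("commands", [])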
with pytest.raises(ParseError): parse_file(file_path=file_path, macaron_path=MACARON_PATH) - - -def test_create_bash_node_recursively() -> None: - """Test creating bash nodes from recursive script.""" - resources_dir = Path(__file__).parent.joinpath("resources", "bash_files") - with pytest.raises(CallGraphError, match="The analysis has reached maximum recursion depth .*"): - create_bash_node( - name="run", - node_id=None, - node_type=BashScriptType.FILE, - source_path=os.path.join(resources_dir, "recursive.sh"), - ci_step_ast=None, - repo_path=str(resources_dir), - caller=BaseNode(), - recursion_depth=0, - macaron_path=MACARON_PATH, - ) - - -def test_create_bash_node_path_traversal_attack() -> None: - """Test creating bash nodes from a script that is vulnerable to path traversal attacks.""" - resources_dir = Path(__file__).parent.joinpath("resources", "bash_files") - assert not create_bash_node( - name="run", - node_id=None, - node_type=BashScriptType.FILE, - source_path=os.path.join(resources_dir, "path_traversal.sh"), - ci_step_ast=None, - repo_path=str(resources_dir), - caller=BaseNode(), - recursion_depth=0, - macaron_path=MACARON_PATH, - ).callee diff --git a/tests/provenance/test_provenance_finder.py b/tests/provenance/test_provenance_finder.py index 5a1148364..774d2ff9e 100644 --- a/tests/provenance/test_provenance_finder.py +++ b/tests/provenance/test_provenance_finder.py @@ -13,7 +13,7 @@ from packageurl import PackageURL from pydriller import Git -from macaron.code_analyzer.call_graph import BaseNode, CallGraph +from macaron.code_analyzer.dataflow_analysis.core import NodeForest from macaron.provenance.provenance_finder import ( find_gav_provenance, find_npm_provenance, @@ -165,7 +165,7 @@ def test_provenance_on_unsupported_ci(macaron_path: Path, service: BaseCIService ci_info = CIInfo( service=service, - callgraph=CallGraph(BaseNode(), ""), + callgraph=NodeForest([]), provenance_assets=[], release={}, provenances=[], @@ -190,7 +190,7 @@ def test_provenance_on_supported_ci(macaron_path: Path, test_dir: Path) -> None: ci_info = CIInfo( service=github_actions, - callgraph=CallGraph(BaseNode(), ""), + callgraph=NodeForest([]), provenance_assets=[], release={}, provenances=[], diff --git a/tests/slsa_analyzer/build_tool/test_conda.py b/tests/slsa_analyzer/build_tool/test_conda.py index 896abad13..5adec7688 100644 --- a/tests/slsa_analyzer/build_tool/test_conda.py +++ b/tests/slsa_analyzer/build_tool/test_conda.py @@ -7,7 +7,6 @@ import pytest -from macaron.code_analyzer.call_graph import BaseNode from macaron.slsa_analyzer.build_tool.base_build_tool import BuildToolCommand from macaron.slsa_analyzer.build_tool.conda import Conda from macaron.slsa_analyzer.build_tool.language import BuildLanguage @@ -120,7 +119,7 @@ def test_is_conda_deploy_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), @@ -220,7 +219,7 @@ def test_is_conda_package_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), diff --git a/tests/slsa_analyzer/build_tool/test_docker.py b/tests/slsa_analyzer/build_tool/test_docker.py index 17e8e0114..4f256e5c9 100644 --- a/tests/slsa_analyzer/build_tool/test_docker.py +++ b/tests/slsa_analyzer/build_tool/test_docker.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. 
All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module tests the Docker build functions.""" @@ -7,7 +7,6 @@ import pytest -from macaron.code_analyzer.call_graph import BaseNode from macaron.slsa_analyzer.build_tool.base_build_tool import BuildToolCommand from macaron.slsa_analyzer.build_tool.docker import Docker from macaron.slsa_analyzer.build_tool.language import BuildLanguage @@ -126,7 +125,7 @@ def test_is_docker_deploy_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), @@ -215,7 +214,7 @@ def test_is_docker_package_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), diff --git a/tests/slsa_analyzer/build_tool/test_flit.py b/tests/slsa_analyzer/build_tool/test_flit.py index 9a3757c78..6ffbed7f0 100644 --- a/tests/slsa_analyzer/build_tool/test_flit.py +++ b/tests/slsa_analyzer/build_tool/test_flit.py @@ -7,7 +7,6 @@ import pytest -from macaron.code_analyzer.call_graph import BaseNode from macaron.slsa_analyzer.build_tool.base_build_tool import BuildToolCommand from macaron.slsa_analyzer.build_tool.flit import Flit from macaron.slsa_analyzer.build_tool.language import BuildLanguage @@ -120,7 +119,7 @@ def test_is_flit_deploy_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), @@ -220,7 +219,7 @@ def test_is_flit_package_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), diff --git a/tests/slsa_analyzer/build_tool/test_go.py b/tests/slsa_analyzer/build_tool/test_go.py index 7f0cb431f..3f2796326 100644 --- a/tests/slsa_analyzer/build_tool/test_go.py +++ b/tests/slsa_analyzer/build_tool/test_go.py @@ -7,7 +7,6 @@ import pytest -from macaron.code_analyzer.call_graph import BaseNode from macaron.slsa_analyzer.build_tool.base_build_tool import BuildToolCommand from macaron.slsa_analyzer.build_tool.go import Go from macaron.slsa_analyzer.build_tool.language import BuildLanguage @@ -126,7 +125,7 @@ def test_is_go_deploy_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), @@ -215,7 +214,7 @@ def test_is_go_package_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), diff --git a/tests/slsa_analyzer/build_tool/test_gradle.py b/tests/slsa_analyzer/build_tool/test_gradle.py index 4298e7fb8..6896159df 100644 --- a/tests/slsa_analyzer/build_tool/test_gradle.py +++ b/tests/slsa_analyzer/build_tool/test_gradle.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
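(Editorial note: the test diffs in this series all make the same mechanical change: `step_node` is no longer a `BaseNode()` sentinel but `None`. A minimal sketch of the resulting fixture shape, with illustrative values not taken from the tests:)

    from macaron.slsa_analyzer.build_tool.base_build_tool import BuildToolCommand
    from macaron.slsa_analyzer.build_tool.language import BuildLanguage

    cmd = BuildToolCommand(
        ci_path=".github/workflows/release.yaml",  # illustrative path
        command=["gradle", "publish"],             # illustrative command
        step_node=None,                            # previously BaseNode()
        language=BuildLanguage.JAVA,
        language_versions=None,
        language_distributions=None,
        language_url=None,
        reachable_secrets=[],
        events=["release"],
    )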
"""This module tests the Gradle build functions.""" @@ -7,7 +7,6 @@ import pytest -from macaron.code_analyzer.call_graph import BaseNode from macaron.slsa_analyzer.build_tool.base_build_tool import BuildToolCommand from macaron.slsa_analyzer.build_tool.gradle import Gradle from macaron.slsa_analyzer.build_tool.language import BuildLanguage @@ -177,7 +176,7 @@ def test_is_gradle_deploy_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), @@ -332,7 +331,7 @@ def test_is_gradle_package_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), diff --git a/tests/slsa_analyzer/build_tool/test_hatch.py b/tests/slsa_analyzer/build_tool/test_hatch.py index 40e8d0f30..3fd687476 100644 --- a/tests/slsa_analyzer/build_tool/test_hatch.py +++ b/tests/slsa_analyzer/build_tool/test_hatch.py @@ -7,7 +7,6 @@ import pytest -from macaron.code_analyzer.call_graph import BaseNode from macaron.slsa_analyzer.build_tool.base_build_tool import BuildToolCommand from macaron.slsa_analyzer.build_tool.hatch import Hatch from macaron.slsa_analyzer.build_tool.language import BuildLanguage @@ -120,7 +119,7 @@ def test_is_hatch_deploy_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), @@ -220,7 +219,7 @@ def test_is_hatch_package_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), diff --git a/tests/slsa_analyzer/build_tool/test_maven.py b/tests/slsa_analyzer/build_tool/test_maven.py index 19cb9573f..c67f99298 100644 --- a/tests/slsa_analyzer/build_tool/test_maven.py +++ b/tests/slsa_analyzer/build_tool/test_maven.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module tests the Maven build functions.""" @@ -7,7 +7,6 @@ import pytest -from macaron.code_analyzer.call_graph import BaseNode from macaron.slsa_analyzer.build_tool.base_build_tool import BuildToolCommand from macaron.slsa_analyzer.build_tool.language import BuildLanguage from macaron.slsa_analyzer.build_tool.maven import Maven @@ -177,7 +176,7 @@ def test_is_maven_deploy_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), @@ -332,7 +331,7 @@ def test_is_maven_package_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), diff --git a/tests/slsa_analyzer/build_tool/test_npm.py b/tests/slsa_analyzer/build_tool/test_npm.py index 423e02199..f27b623f0 100644 --- a/tests/slsa_analyzer/build_tool/test_npm.py +++ b/tests/slsa_analyzer/build_tool/test_npm.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. 
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module tests the NPM build functions.""" @@ -7,7 +7,6 @@ import pytest -from macaron.code_analyzer.call_graph import BaseNode from macaron.slsa_analyzer.build_tool.base_build_tool import BuildToolCommand from macaron.slsa_analyzer.build_tool.language import BuildLanguage from macaron.slsa_analyzer.build_tool.npm import NPM @@ -124,7 +123,7 @@ def test_is_npm_deploy_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), @@ -213,7 +212,7 @@ def test_is_npm_package_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), diff --git a/tests/slsa_analyzer/build_tool/test_pip.py b/tests/slsa_analyzer/build_tool/test_pip.py index 1a069f31a..fa767bcce 100644 --- a/tests/slsa_analyzer/build_tool/test_pip.py +++ b/tests/slsa_analyzer/build_tool/test_pip.py @@ -5,7 +5,6 @@ import pytest -from macaron.code_analyzer.call_graph import BaseNode from macaron.slsa_analyzer.build_tool.base_build_tool import BuildToolCommand from macaron.slsa_analyzer.build_tool.language import BuildLanguage from macaron.slsa_analyzer.build_tool.pip import Pip @@ -91,7 +90,7 @@ def test_is_pip_deploy_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), @@ -202,7 +201,7 @@ def test_is_pip_package_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), diff --git a/tests/slsa_analyzer/build_tool/test_poetry.py b/tests/slsa_analyzer/build_tool/test_poetry.py index 4923d23ef..ae42669af 100644 --- a/tests/slsa_analyzer/build_tool/test_poetry.py +++ b/tests/slsa_analyzer/build_tool/test_poetry.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module tests the Poetry build functions.""" @@ -7,7 +7,6 @@ import pytest -from macaron.code_analyzer.call_graph import BaseNode from macaron.slsa_analyzer.build_tool.base_build_tool import BuildToolCommand from macaron.slsa_analyzer.build_tool.language import BuildLanguage from macaron.slsa_analyzer.build_tool.poetry import Poetry @@ -122,7 +121,7 @@ def test_is_poetry_deploy_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), @@ -222,7 +221,7 @@ def test_is_poetry_package_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), diff --git a/tests/slsa_analyzer/build_tool/test_yarn.py b/tests/slsa_analyzer/build_tool/test_yarn.py index 06f645028..48f49977c 100644 --- a/tests/slsa_analyzer/build_tool/test_yarn.py +++ b/tests/slsa_analyzer/build_tool/test_yarn.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. 
All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module tests the Yarn build functions.""" @@ -7,7 +7,6 @@ import pytest -from macaron.code_analyzer.call_graph import BaseNode from macaron.slsa_analyzer.build_tool.base_build_tool import BuildToolCommand from macaron.slsa_analyzer.build_tool.language import BuildLanguage from macaron.slsa_analyzer.build_tool.yarn import Yarn @@ -124,7 +123,7 @@ def test_is_yarn_deploy_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), @@ -213,7 +212,7 @@ def test_is_yarn_package_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), diff --git a/tests/slsa_analyzer/checks/test_build_as_code_check.py b/tests/slsa_analyzer/checks/test_build_as_code_check.py index d34ae64e2..0092e0f86 100644 --- a/tests/slsa_analyzer/checks/test_build_as_code_check.py +++ b/tests/slsa_analyzer/checks/test_build_as_code_check.py @@ -9,19 +9,14 @@ import pytest -from macaron.code_analyzer.call_graph import BaseNode, CallGraph -from macaron.parsers.actionparser import parse as parse_action +from macaron.code_analyzer.dataflow_analysis.analysis import analyse_github_workflow_file +from macaron.code_analyzer.dataflow_analysis.core import NodeForest from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool from macaron.slsa_analyzer.build_tool.gradle import Gradle from macaron.slsa_analyzer.build_tool.pip import Pip from macaron.slsa_analyzer.checks.build_as_code_check import BuildAsCodeCheck, BuildAsCodeFacts from macaron.slsa_analyzer.checks.check_result import CheckResultType from macaron.slsa_analyzer.ci_service.base_ci_service import BaseCIService -from macaron.slsa_analyzer.ci_service.github_actions.analyzer import ( - GitHubWorkflowNode, - GitHubWorkflowType, - build_call_graph_from_node, -) from macaron.slsa_analyzer.ci_service.github_actions.github_actions_ci import GitHubActions from macaron.slsa_analyzer.ci_service.jenkins import Jenkins from macaron.slsa_analyzer.provenance.intoto import InTotoV01Payload @@ -54,7 +49,7 @@ def test_build_as_code_check_no_callgraph( """Test the Build As Code Check when no callgraph is built for the CI service.""" ci_info = CIInfo( service=ci_services[ci_name], - callgraph=CallGraph(BaseNode(), ""), + callgraph=NodeForest([]), provenance_assets=[], release={}, provenances=[], @@ -146,7 +141,7 @@ def test_gha_workflow_deployment( check = BuildAsCodeCheck() ci_info = CIInfo( service=github_actions_service, - callgraph=CallGraph(BaseNode(), ""), + callgraph=NodeForest([]), provenance_assets=[], release={}, provenances=[], @@ -160,20 +155,8 @@ def test_gha_workflow_deployment( gha_deploy.dynamic_data["build_spec"]["tools"] = [pip_tool] gha_deploy.dynamic_data["ci_services"] = [ci_info] - root: BaseNode = BaseNode() - gh_cg = CallGraph(root, "") workflow_path = os.path.join(workflows_dir, workflow_name) - parsed_obj = parse_action(workflow_path) - callee = GitHubWorkflowNode( - name=os.path.basename(workflow_path), - node_type=GitHubWorkflowType.INTERNAL, - source_path=workflow_path, - parsed_obj=parsed_obj, - caller=root, - ) - root.add_callee(callee) - build_call_graph_from_node(callee, 
repo_path="") - ci_info["callgraph"] = gh_cg + ci_info["callgraph"] = NodeForest([analyse_github_workflow_file(workflow_path, None)]) assert check.run_check(gha_deploy).result_type == expected_result @@ -192,7 +175,7 @@ def test_travis_ci_deploy( ci_info = CIInfo( service=travis_service, - callgraph=CallGraph(BaseNode(), ""), + callgraph=NodeForest([]), provenance_assets=[], release={}, provenances=[], diff --git a/tests/slsa_analyzer/checks/test_build_service_check.py b/tests/slsa_analyzer/checks/test_build_service_check.py index 4a5496c39..21ab9c1fe 100644 --- a/tests/slsa_analyzer/checks/test_build_service_check.py +++ b/tests/slsa_analyzer/checks/test_build_service_check.py @@ -8,7 +8,7 @@ import pytest -from macaron.code_analyzer.call_graph import BaseNode, CallGraph +from macaron.code_analyzer.dataflow_analysis.core import NodeForest from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool from macaron.slsa_analyzer.checks.build_service_check import BuildServiceCheck, BuildServiceFacts from macaron.slsa_analyzer.checks.check_result import CheckResultType @@ -44,7 +44,7 @@ def test_build_service_check_no_callgraph( """Test the Build Service Check when no callgraph is built for the CI service.""" ci_info = CIInfo( service=ci_services[ci_name], - callgraph=CallGraph(BaseNode(), ""), + callgraph=NodeForest([]), provenance_assets=[], release={}, provenances=[], diff --git a/tests/slsa_analyzer/checks/test_github_actions_vulnerability_check.py b/tests/slsa_analyzer/checks/test_github_actions_vulnerability_check.py index 883dfcc09..a58ceaf2b 100644 --- a/tests/slsa_analyzer/checks/test_github_actions_vulnerability_check.py +++ b/tests/slsa_analyzer/checks/test_github_actions_vulnerability_check.py @@ -11,12 +11,12 @@ import pytest from pytest_httpserver import HTTPServer -from macaron.code_analyzer.call_graph import BaseNode, CallGraph +from macaron.code_analyzer.dataflow_analysis.analysis import analyse_github_workflow_file +from macaron.code_analyzer.dataflow_analysis.core import NodeForest from macaron.config.defaults import load_defaults from macaron.slsa_analyzer.checks.check_result import CheckResultType from macaron.slsa_analyzer.checks.github_actions_vulnerability_check import GitHubActionsVulnsCheck from macaron.slsa_analyzer.ci_service.base_ci_service import BaseCIService -from macaron.slsa_analyzer.ci_service.github_actions.analyzer import build_call_graph_from_path from macaron.slsa_analyzer.provenance.intoto import InTotoV01Payload from macaron.slsa_analyzer.specs.ci_spec import CIInfo from macaron.slsa_analyzer.specs.inferred_provenance import InferredProvenance @@ -29,17 +29,14 @@ def get_ci_info(ci_services: dict[str, BaseCIService], ci_name: str, workflow_pa """Get CIInfo instance.""" ci_info = CIInfo( service=ci_services[ci_name], - callgraph=CallGraph(BaseNode(), ""), + callgraph=NodeForest([]), provenance_assets=[], release={}, provenances=[], build_info_results=InTotoV01Payload(statement=InferredProvenance().payload), ) if ci_name == "github_actions": - root_node: BaseNode = BaseNode() - workflow_node = build_call_graph_from_path(root_node, workflow_path=workflow_path, repo_path="") - root_node.add_callee(workflow_node) - ci_info["callgraph"] = CallGraph(root_node, "") + ci_info["callgraph"] = NodeForest([analyse_github_workflow_file(workflow_path, None)]) return ci_info diff --git a/tests/slsa_analyzer/checks/test_provenance_l3_content_check.py b/tests/slsa_analyzer/checks/test_provenance_l3_content_check.py index 8584e5f35..4abf8df64 100644 --- 
a/tests/slsa_analyzer/checks/test_provenance_l3_content_check.py +++ b/tests/slsa_analyzer/checks/test_provenance_l3_content_check.py @@ -5,7 +5,7 @@ import os -from macaron.code_analyzer.call_graph import BaseNode, CallGraph +from macaron.code_analyzer.dataflow_analysis.core import NodeForest from macaron.slsa_analyzer.asset import VirtualReleaseAsset from macaron.slsa_analyzer.checks.check_result import CheckResultType from macaron.slsa_analyzer.checks.provenance_l3_content_check import ProvenanceL3ContentCheck @@ -82,7 +82,7 @@ def test_expectation_check(self) -> None: # Test GitHub Actions. ci_info = CIInfo( service=github_actions, - callgraph=CallGraph(BaseNode(), ""), + callgraph=NodeForest([]), provenance_assets=[], release={}, provenances=[], diff --git a/tests/slsa_analyzer/checks/test_trusted_builder_l3_check.py b/tests/slsa_analyzer/checks/test_trusted_builder_l3_check.py index c36eba0d5..6f72ab739 100644 --- a/tests/slsa_analyzer/checks/test_trusted_builder_l3_check.py +++ b/tests/slsa_analyzer/checks/test_trusted_builder_l3_check.py @@ -8,15 +8,10 @@ import pytest -from macaron.code_analyzer.call_graph import BaseNode, CallGraph -from macaron.parsers.actionparser import parse as parse_action +from macaron.code_analyzer.dataflow_analysis.analysis import analyse_github_workflow_file +from macaron.code_analyzer.dataflow_analysis.core import NodeForest from macaron.slsa_analyzer.checks.check_result import CheckResultType from macaron.slsa_analyzer.checks.trusted_builder_l3_check import TrustedBuilderL3Check -from macaron.slsa_analyzer.ci_service.github_actions.analyzer import ( - GitHubWorkflowNode, - GitHubWorkflowType, - build_call_graph_from_node, -) from macaron.slsa_analyzer.ci_service.github_actions.github_actions_ci import GitHubActions from macaron.slsa_analyzer.provenance.intoto import InTotoV01Payload from macaron.slsa_analyzer.specs.ci_spec import CIInfo @@ -47,7 +42,7 @@ def test_trusted_builder_l3_check( workflows_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", "github", "workflow_files") ci_info = CIInfo( service=github_actions_service, - callgraph=CallGraph(BaseNode(), ""), + callgraph=NodeForest([]), provenance_assets=[], release={}, provenances=[], @@ -57,18 +52,6 @@ def test_trusted_builder_l3_check( ctx = MockAnalyzeContext(macaron_path=macaron_path, output_dir="") ctx.dynamic_data["ci_services"] = [ci_info] - root: BaseNode = BaseNode() - gh_cg = CallGraph(root, "") workflow_path = os.path.join(workflows_dir, workflow_name) - parsed_obj = parse_action(workflow_path) - callee = GitHubWorkflowNode( - name=workflow_name, - node_type=GitHubWorkflowType.INTERNAL, - source_path=workflow_path, - parsed_obj=parsed_obj, - caller=root, - ) - build_call_graph_from_node(callee, repo_path="") - root.add_callee(callee) - ci_info["callgraph"] = gh_cg + ci_info["callgraph"] = NodeForest([analyse_github_workflow_file(workflow_path, None)]) assert check.run_check(ctx).result_type == expected_result diff --git a/tests/slsa_analyzer/ci_service/test_github_actions.py b/tests/slsa_analyzer/ci_service/test_github_actions.py index 1995c3705..4da4f7d2a 100644 --- a/tests/slsa_analyzer/ci_service/test_github_actions.py +++ b/tests/slsa_analyzer/ci_service/test_github_actions.py @@ -1,21 +1,13 @@ -# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
"""This module tests GitHub Actions CI service.""" -import os from datetime import datetime, timedelta from pathlib import Path import pytest -from macaron.code_analyzer.call_graph import BaseNode, CallGraph -from macaron.parsers.actionparser import parse as parse_action -from macaron.slsa_analyzer.ci_service.github_actions.analyzer import ( - GitHubWorkflowNode, - GitHubWorkflowType, - build_call_graph_from_node, -) from macaron.slsa_analyzer.ci_service.github_actions.github_actions_ci import GitHubActions mock_repos = Path(__file__).parent.joinpath("mock_repos") @@ -30,59 +22,6 @@ def github_actions_() -> GitHubActions: return GitHubActions() -@pytest.mark.parametrize( - ( - "workflow_name", - "expect", - ), - [ - ( - "valid1.yaml", - [ - "GitHubWorkflowNode(valid1.yaml,GitHubWorkflowType.INTERNAL)", - "GitHubJobNode(build)", - "GitHubWorkflowNode(apache/maven-gh-actions-shared/.github/workflows/maven-verify.yml@v2,GitHubWorkflowType.REUSABLE)", - ], - ), - ( - "valid2.yaml", - [ - "GitHubWorkflowNode(valid2.yaml,GitHubWorkflowType.INTERNAL)", - "GitHubJobNode(build)", - "GitHubWorkflowNode(actions/checkout@v3,GitHubWorkflowType.EXTERNAL)", - "GitHubWorkflowNode(actions/cache@v3,GitHubWorkflowType.EXTERNAL)", - "GitHubWorkflowNode(actions/setup-java@v3,GitHubWorkflowType.EXTERNAL)", - "BashNode(Publish to Sonatype Snapshots,BashScriptType.INLINE)", - ], - ), - ], - ids=[ - "Internal and reusable workflows", - "Internal and external workflows", - ], -) -def test_build_call_graph(workflow_name: str, expect: list[str]) -> None: - """Test building call graphs for GitHub Actions workflows.""" - resources_dir = Path(__file__).parent.joinpath("resources", "github") - - # Parse GitHub Actions workflows. - root: BaseNode = BaseNode() - gh_cg = CallGraph(root, "") - workflow_path = os.path.join(resources_dir, workflow_name) - parsed_obj = parse_action(workflow_path) - - callee = GitHubWorkflowNode( - name=os.path.basename(workflow_path), - node_type=GitHubWorkflowType.INTERNAL, - source_path=workflow_path, - parsed_obj=parsed_obj, - caller=root, - ) - root.add_callee(callee) - build_call_graph_from_node(callee, repo_path="") - assert [str(node) for node in gh_cg.bfs()] == expect - - def test_is_detected(github_actions: GitHubActions) -> None: """Test detecting GitHub Action config files.""" assert github_actions.is_detected(str(ga_has_build_kws)) diff --git a/tests/slsa_analyzer/test_analyze_context.py b/tests/slsa_analyzer/test_analyze_context.py index 40a4ad881..4b1b1e776 100644 --- a/tests/slsa_analyzer/test_analyze_context.py +++ b/tests/slsa_analyzer/test_analyze_context.py @@ -6,7 +6,7 @@ from unittest import TestCase from unittest.mock import MagicMock -from macaron.code_analyzer.call_graph import BaseNode, CallGraph +from macaron.code_analyzer.dataflow_analysis.core import NodeForest from macaron.json_tools import JsonType from macaron.slsa_analyzer.asset import VirtualReleaseAsset from macaron.slsa_analyzer.ci_service.github_actions.github_actions_ci import GitHubActions @@ -93,7 +93,7 @@ def test_provenances(self) -> None: gh_actions_ci_info = CIInfo( service=gh_actions, - callgraph=CallGraph(BaseNode(), ""), + callgraph=NodeForest([]), provenance_assets=[], release={}, provenances=[ From ad103cff5dcdf1a377e95816374140d687128dc2 Mon Sep 17 00:00:00 2001 From: Parth Govale Date: Mon, 15 Dec 2025 13:02:38 +0530 Subject: [PATCH 03/20] feat: prepare Macaron GitHub Action to publish on GitHub Marketplace (#1259) Prepare Macaron for publishing on GitHub Action Marketplace, and add the 
documentation. Signed-off-by: Demolus13 --- .github/workflows/test_macaron_action.yaml | 2 +- README.md | 18 +- action.yaml | 21 +-- docs/source/index.rst | 1 + docs/source/pages/macaron_action.rst | 175 ++++++++++++++++++ scripts/actions/run_macaron_analysis.sh | 2 +- .../run_macaron_policy_verification.sh | 2 +- scripts/actions/setup_macaron.sh | 72 +++++-- .../datalog/malware-detection.dl.template | 1 - 9 files changed, 255 insertions(+), 39 deletions(-) create mode 100644 docs/source/pages/macaron_action.rst diff --git a/.github/workflows/test_macaron_action.yaml b/.github/workflows/test_macaron_action.yaml index 5f3753a98..2621313c6 100644 --- a/.github/workflows/test_macaron_action.yaml +++ b/.github/workflows/test_macaron_action.yaml @@ -184,7 +184,7 @@ jobs: package_url: pkg:maven/io.github.behnazh-w.demo/example-maven-app@2.0?type=jar repo_path: https://github.com/behnazh-w/example-maven-app output_dir: macaron_output/detect_malicious_java_dep - sbom_path: ./resources/detect_malicious_java_dep/example-sbom.json + sbom_path: ./tests/tutorial_resources/detect_malicious_java_dep/example-sbom.json deps_depth: '1' - name: Run Macaron (verify policy - detect-malicious-upload) diff --git a/README.md b/README.md index d77aca1d5..925f6e127 100644 --- a/README.md +++ b/README.md @@ -4,11 +4,27 @@ ![Macaron](./docs/source/assets/macaron.svg) -[Full Documentation](https://oracle.github.io/macaron/index.html) | [Tutorials](https://oracle.github.io/macaron/pages/tutorials/index.html) | [Videos](https://www.youtube.com/watch?v=ebo0kGKP6bw) | [Papers](#publications) | [Presentations](#presentations) +[Full Documentation](https://oracle.github.io/macaron/index.html) | [Tutorials](https://oracle.github.io/macaron/pages/tutorials/index.html) | [Videos](https://www.youtube.com/watch?v=ebo0kGKP6bw) | [Papers](#publications) | [Presentations](#presentations) | [Macaron GitHub Action](https://oracle.github.io/macaron/pages/macaron_action.html) **Macaron** is a software supply chain security analysis tool from Oracle Labs focused on verifying the **build integrity** of artifacts and their dependencies. It helps developers, security teams, and researchers ensure that packages are built as expected and have not been tampered with. +## Use Macaron as a GitHub Action + +To use the Macaron GitHub Action, add the following step to your workflow (adjust the version as needed). In this example, we use an example policy. + +```yaml +- uses: oracle/macaron@v0.21.0 + with: + repo_path: 'https://github.com/example/project' + policy_file: check-github-actions + policy_purl: 'pkg:github.com/example/project' + output_dir: 'macaron-output' + upload_attestation: true +``` + +For detailed instructions and a comprehensive list of available options, please refer to the [Macaron GitHub Action documentation](https://oracle.github.io/macaron/pages/macaron_action.html).
+ ## Key Capabilities Macaron supports: diff --git a/action.yaml b/action.yaml index 0e77b216d..f28f9d2e9 100644 --- a/action.yaml +++ b/action.yaml @@ -58,28 +58,13 @@ outputs: runs: using: composite steps: - - name: Setup Python - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 - with: - python-version: 3.11.14 - - - name: Setup Go - uses: actions/setup-go@4dc6199c7b1a012772edbd06daecab0f50c9053c # v6.1.0 - with: - go-version: '1.23' - cache: false - - - name: Setup JDK - uses: actions/setup-java@b36c23c0d998641eff861008f374ee103c25ac73 # v4.4.0 - with: - java-version: '17' - distribution: oracle - - name: Setup Macaron - # Create or reuse a Python virtualenv with the macaron CLI and export the `MACARON` binary path via `$GITHUB_ENV` so later steps can use it. + # Create or reuse run_macaron.sh script run: | bash "$GITHUB_ACTION_PATH/scripts/actions/setup_macaron.sh" shell: bash + env: + ACTION_REF: ${{ github.action_ref }} - name: Run Macaron Analysis id: run-macaron-analysis diff --git a/docs/source/index.rst b/docs/source/index.rst index 43fe2af7f..bc9ab5a0d 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -121,6 +121,7 @@ intermediate representations as abstractions. Using such abstractions, Macaron i pages/installation pages/using pages/cli_usage/index + pages/macaron_action pages/tutorials/index pages/output_files pages/checks/slsa_builds diff --git a/docs/source/pages/macaron_action.rst b/docs/source/pages/macaron_action.rst new file mode 100644 index 000000000..6c7db9407 --- /dev/null +++ b/docs/source/pages/macaron_action.rst @@ -0,0 +1,175 @@ +Macaron GitHub Action +===================== + +Overview +-------- + +This document describes the composite GitHub Action defined in ``action.yaml`` at the repository root. The action uses the Macaron CLI to run supply-chain security analysis and policy verification from a GitHub Actions workflow. + +Quick usage +----------- + +When using this action you can reference the action in your workflow. Example: + +.. code-block:: yaml + + jobs: + analyze: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - name: Run Macaron Security Analysis + uses: oracle/macaron@v0.21.0 + with: + repo_path: 'https://github.com/example/project' + policy_file: check-github-actions + policy_purl: 'pkg:github.com/example/project' + output_dir: 'macaron-output' + upload_attestation: true + +Example: policy verification only +---------------------------------- + +To run only the policy verification step (when you already have an output +database), call the action with ``policy_file`` and set ``output_dir`` to the +directory containing ``macaron.db``: + +.. code-block:: yaml + + - name: Verify policy + uses: oracle/macaron@v0.21.0 + with: + policy_file: policy.dl + output_dir: macaron-output + upload_attestation: true + +Inputs +------ +The action exposes a number of inputs which map directly to Macaron CLI +options. Key inputs are listed below (see ``action.yaml`` for the full list): + +.. list-table:: + :header-rows: 1 + :widths: 20 60 20 + + * - Input + - Description + - Default + * - ``repo_path`` + - The path or URL of the repository to analyze. + - + * - ``package_url`` + - A PURL identifying a package to analyze instead of a repository. + - + * - ``sbom_path`` + - Path to an SBOM file to analyze. + - + * - ``python_venv`` + - Path to an existing Python virtualenv (used when analyzing Python + packages). 
+ - + * - ``defaults_path`` + - Path to a Macaron defaults configuration file. + - + * - ``policy_file`` + - Path to a Datalog policy file for policy verification. + - + * - ``policy_purl`` + - PURL for a pre-defined policy to use with verification. + - + * - ``branch`` / ``digest`` + - Checkout options when analyzing a repository (branch name or commit + digest). + - + * - ``provenance_expectation`` + - The path to provenance expectation file or directory. + - + * - ``provenance_file`` + - The path to the provenance file in in-toto format. + - + * - ``deps_depth`` + - Dependency resolution depth (how many levels of transitive dependencies + to resolve). + - ``0`` + * - ``show_prelude`` + - Shows the Datalog prelude for the database. + - + * - ``github_token`` + - Token used by Macaron to access GitHub (for cloning, API access, + etc.). + - ``${{ github.token }}`` + * - ``output_dir`` + - Directory where Macaron writes results (database, reports, artifacts). + - ``output`` + * - ``upload_attestation`` + - When ``true``, the action will attempt to upload a generated + verification attestation (VSA) after policy verification. + - ``false`` + * - ``subject_path`` + - Path to the artifact serving as the subject of the attestation. + - ``${{ github.workspace }}`` + +Outputs +------- + +The composite action exposes the following outputs (set by the +``run_macaron_policy_verification.sh`` script when applicable): + +.. list-table:: + :header-rows: 1 + :widths: 20 70 + + * - Output + - Description + * - ``policy_report`` + - Path to the generated policy report JSON file produced by + ``macaron verify-policy``. This file contains the policy evaluation + results. + * - ``vsa_report`` + - Path to the generated VSA (Verification Summary Attestation) in + `in-toto `_ JSONL format. If no VSA was produced + during verification, the action emits the string ``"VSA Not Generated."`` + instead of a path. + +Default Policies +---------------- + +Macaron provides policy templates to run pre-defined policies: + +.. list-table:: + :header-rows: 1 + :widths: 20 60 20 + + * - Policy name + - Description + - Template + * - ``check-github-actions`` + - Detects whether a component was built using GitHub Actions that + are known to be vulnerable or otherwise unsafe. The policy + evaluates a check named `mcn_githubactions_vulnerabilities_1` and + reports a passed/failed result for the component when applied. + - `check-github-actions template `_ + * - ``malware-detection`` + - Checks a component for indicators of malicious or suspicious content. + The policy evaluates a check named mcn_detect_malicious_metadata_1 + and reports a passed/failed result for the component when applied. + - `malware-detection template `_ + * - ``malware-detection-dependencies`` + - Checks the component and its transitive dependencies for indicators + of malicious or suspicious content. The policy ensures the component + and each dependency pass the `mcn_detect_malicious_metadata_1` check. + - `malware-detection-dependencies template `_ + +How the action works +-------------------- + +1. ``Setup Macaron``: downloads ``run_macaron.sh`` script to install and run macaron in the action. + +2. ``Run Macaron Analysis``: calls ``scripts/actions/run_macaron_analysis.sh`` + which assembles the ``macaron analyze`` command from the inputs and runs + it. Results are written into ``output_dir``. + +3. 
``Run Macaron Policy Verification``: if a policy file or PURL is supplied, + the corresponding script runs ``macaron verify-policy`` against the + analysis database and writes ``policy_report`` and ``vsa_report`` to + ``$GITHUB_OUTPUT`` when produced. diff --git a/scripts/actions/run_macaron_analysis.sh b/scripts/actions/run_macaron_analysis.sh index fc97fd916..34305479c 100644 --- a/scripts/actions/run_macaron_analysis.sh +++ b/scripts/actions/run_macaron_analysis.sh @@ -19,7 +19,7 @@ else fi OUTPUT_DIR=${OUTPUT_DIR:-output} -CMD="$CMD --output-dir ${OUTPUT_DIR} -lr . analyze" +CMD="$CMD --output ${OUTPUT_DIR} -lr . analyze" if [ -n "${REPO_PATH:-}" ]; then CMD="$CMD -rp ${REPO_PATH}" diff --git a/scripts/actions/run_macaron_policy_verification.sh b/scripts/actions/run_macaron_policy_verification.sh index fb6218e36..46eb9bee0 100644 --- a/scripts/actions/run_macaron_policy_verification.sh +++ b/scripts/actions/run_macaron_policy_verification.sh @@ -25,7 +25,7 @@ if [ -n "$DEFAULTS_PATH" ]; then else CMD="$MACARON" fi -CMD="$CMD --output-dir ${OUTPUT_DIR} verify-policy --database ${OUTPUT_DIR}/macaron.db" +CMD="$CMD --output ${OUTPUT_DIR} verify-policy --database ${OUTPUT_DIR}/macaron.db" if [ -n "$FILE" ] && [ -f "$FILE" ]; then CMD="$CMD --file $FILE" diff --git a/scripts/actions/setup_macaron.sh b/scripts/actions/setup_macaron.sh index fe2bd9b20..a002bb534 100644 --- a/scripts/actions/setup_macaron.sh +++ b/scripts/actions/setup_macaron.sh @@ -4,25 +4,65 @@ # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. set -euo pipefail -# Setup Macaron virtualenv and make available via GitHub Actions environment files. -# This script writes `MACARON=` to `$GITHUB_ENV` so later steps can invoke the macaron CLI, and appends the venv `bin` directory to `$GITHUB_PATH`. - MACARON_DIR="${RUNNER_TEMP:-/tmp}/macaron" -VENV_MACARON="$MACARON_DIR/.venv/bin/macaron" - mkdir -p "$MACARON_DIR" -if [ -x "$VENV_MACARON" ]; then - echo "Using macaron from existing venv: $VENV_MACARON" - echo "MACARON=$VENV_MACARON" >> "$GITHUB_ENV" - echo "$MACARON_DIR/.venv/bin" >> "$GITHUB_PATH" - exit 0 +ACTION_DIR="${RUNNER_TEMP:-/tmp}/macaron-action" +rm -rf "$ACTION_DIR" +mkdir -p "$ACTION_DIR" + +git clone --filter=blob:none --no-checkout https://github.com/oracle/macaron.git "$ACTION_DIR" + +TARGET_REF="${ACTION_REF:-main}" +MACARON_IMAGE_TAG="" +cd "$ACTION_DIR" +if [[ "$TARGET_REF" =~ ^[0-9a-f]{40}$ ]]; then + # Check for tags pointing directly at the SHA. + tags=$(git tag --points-at "$TARGET_REF") + if [[ -n "$tags" ]]; then + # Get the first tag (main or first one listed) + MACARON_IMAGE_TAG="$(echo "$tags" | head -n1)" + echo "SHA $TARGET_REF maps to exact tag: $MACARON_IMAGE_TAG" + else + # Search all tags that contain the commit (could be ancestor). + history_tags=$(git tag --contains "$TARGET_REF") + if [[ -n "$history_tags" ]]; then + MACARON_IMAGE_TAG="$(echo "$history_tags" | head -n1)" + echo "SHA $TARGET_REF is contained in tag: $MACARON_IMAGE_TAG" + else + echo "No tag found for SHA $TARGET_REF. Defaulting to 'latest'." + MACARON_IMAGE_TAG="latest" + fi + fi +elif [[ "$TARGET_REF" =~ ^v[0-9] ]]; then + MACARON_IMAGE_TAG="$TARGET_REF" + echo "Ref is a direct tag: $MACARON_IMAGE_TAG" +else + echo "Using 'latest' image." + MACARON_IMAGE_TAG="latest" fi cd "$MACARON_DIR" -git clone https://github.com/oracle/macaron.git . 
-make venv -export PATH="$MACARON_DIR/.venv/bin:$PATH" -make setup -echo "MACARON=$VENV_MACARON" >> "$GITHUB_ENV" -echo "$MACARON_DIR/.venv/bin" >> "$GITHUB_PATH" + +# Download image using macaron_image_tag else latest release +if [ "${MACARON_IMAGE_TAG}" != "latest" ]; then + echo "MACARON_IMAGE_TAG detected: ${MACARON_IMAGE_TAG}" + URL="https://raw.githubusercontent.com/oracle/macaron/refs/tags/${MACARON_IMAGE_TAG}/scripts/release_scripts/run_macaron.sh" + SCRIPT_NAME="run_macaron_${MACARON_IMAGE_TAG}.sh" +else + echo "Using default latest release." + URL="https://raw.githubusercontent.com/oracle/macaron/release/scripts/release_scripts/run_macaron.sh" + SCRIPT_NAME="run_macaron.sh" +fi + +# Get the run_macaron.sh script +if [ ! -f "$SCRIPT_NAME" ]; then + echo "Downloading $SCRIPT_NAME from: $URL" + curl -fSL -o "$SCRIPT_NAME" "$URL" +else + echo "$SCRIPT_NAME already exists, skipping download." +fi + +chmod +x "$SCRIPT_NAME" +echo "MACARON=$MACARON_DIR/$SCRIPT_NAME" >> "$GITHUB_ENV" +echo "MACARON_IMAGE_TAG=${MACARON_IMAGE_TAG}" >> "$GITHUB_ENV" diff --git a/src/macaron/resources/policies/datalog/malware-detection.dl.template b/src/macaron/resources/policies/datalog/malware-detection.dl.template index 77eedc5cf..4429cfec6 100644 --- a/src/macaron/resources/policies/datalog/malware-detection.dl.template +++ b/src/macaron/resources/policies/datalog/malware-detection.dl.template @@ -3,7 +3,6 @@ Policy("check-component", component_id, "Check component artifacts.") :- check_passed(component_id, "mcn_detect_malicious_metadata_1"). - apply_policy_to("check-component", component_id) :- is_component(component_id, purl), match("", purl). From 1665a69d745ffa3d5888545d40b9878b12390abc Mon Sep 17 00:00:00 2001 From: Abhinav Pradeep Date: Thu, 18 Dec 2025 10:34:55 +1000 Subject: [PATCH 04/20] feat: infer chronologically likeliest setuptools version (#1260) Signed-off-by: Abhinav Pradeep --- .../common_spec/base_spec.py | 2 +- .../common_spec/pypi_spec.py | 16 +++ .../metadata/similar_projects.py | 3 - src/macaron/repo_finder/repo_finder_pypi.py | 2 +- .../package_registry/pypi_registry.py | 103 ++++++++++++++++-- .../pypi/test_wheel_absence.py | 2 +- 6 files changed, 115 insertions(+), 13 deletions(-) diff --git a/src/macaron/build_spec_generator/common_spec/base_spec.py b/src/macaron/build_spec_generator/common_spec/base_spec.py index b410729fe..c567609f7 100644 --- a/src/macaron/build_spec_generator/common_spec/base_spec.py +++ b/src/macaron/build_spec_generator/common_spec/base_spec.py @@ -62,7 +62,7 @@ class BaseBuildSpecDict(TypedDict, total=False): build_commands: NotRequired[list[list[str]]] #: List of shell commands to test the project. - test_commands: NotRequired[list[str]] + test_commands: NotRequired[list[list[str]]] #: Environment variables required during build or test. 
environment: NotRequired[dict[str, str]] diff --git a/src/macaron/build_spec_generator/common_spec/pypi_spec.py b/src/macaron/build_spec_generator/common_spec/pypi_spec.py index bb90ba6a1..999afbb19 100644 --- a/src/macaron/build_spec_generator/common_spec/pypi_spec.py +++ b/src/macaron/build_spec_generator/common_spec/pypi_spec.py @@ -120,6 +120,8 @@ def resolve_fields(self, purl: PackageURL) -> None: python_version_set: set[str] = set() wheel_name_python_version_list: list[str] = [] wheel_name_platforms: set[str] = set() + # Precautionary fallback to default version + chronologically_likeliest_version: str = defaults.get("heuristic.pypi", "default_setuptools") if pypi_package_json is not None: if pypi_package_json.package_json or pypi_package_json.download(dest=""): @@ -150,6 +152,9 @@ def resolve_fields(self, purl: PackageURL) -> None: parsed_build_requires["setuptools"] = "==" + defaults.get( "heuristic.pypi", "setuptools_version_emitting_platform_unknown" ) + chronologically_likeliest_version = ( + pypi_package_json.get_chronologically_suitable_setuptools_version() + ) except SourceCodeError: logger.debug("Could not find pure wheel matching this PURL") @@ -165,6 +170,10 @@ def resolve_fields(self, purl: PackageURL) -> None: requires = json_extract(content, ["build-system", "requires"], list) if requires: build_requires_set.update(elem.replace(" ", "") for elem in requires) + # If we cannot find `requires` in `[build-system]`, we lean on the fact that setuptools + # was the de-facto build tool, and infer a setuptools version to include. + else: + build_requires_set.add(f"setuptools=={chronologically_likeliest_version}") backend = json_extract(content, ["build-system", "build-backend"], str) if backend: build_backends_set.add(backend.replace(" ", "")) @@ -177,6 +186,10 @@ def resolve_fields(self, purl: PackageURL) -> None: build_requires_set, build_backends_set, ) + # Here we have successfully analyzed the pyproject.toml file. Now, if we have a setup.py/cfg, + # we also need to infer a setuptools version to include. + if pypi_package_json.file_exists("setup.py") or pypi_package_json.file_exists("setup.cfg"): + build_requires_set.add(f"setuptools=={chronologically_likeliest_version}") except TypeError as error: logger.debug( "Found a type error while reading the pyproject.toml file from the sdist: %s", error @@ -185,6 +198,9 @@ def resolve_fields(self, purl: PackageURL) -> None: logger.debug("Failed to read the pyproject.toml file from the sdist: %s", error) except SourceCodeError as error: logger.debug("No pyproject.toml found: %s", error) + # Here we do not have a pyproject.toml file. Instead, we lean on the fact that setuptools + # was the de-facto build tool, and infer a setuptools version to include.
+ build_requires_set.add(f"setuptools=={chronologically_likeliest_version}") except SourceCodeError as error: logger.debug("No source distribution found: %s", error) diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/similar_projects.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/similar_projects.py index 872c1143d..b98686c99 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/metadata/similar_projects.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/metadata/similar_projects.py @@ -74,9 +74,6 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes False, pypi_package_json.pypi_registry, {}, - "", - "", - "", PyPIInspectorAsset("", [], {}), ) if not adjacent_pypi_json.download(""): diff --git a/src/macaron/repo_finder/repo_finder_pypi.py b/src/macaron/repo_finder/repo_finder_pypi.py index 8aa05d7ce..2941c71cf 100644 --- a/src/macaron/repo_finder/repo_finder_pypi.py +++ b/src/macaron/repo_finder/repo_finder_pypi.py @@ -59,7 +59,7 @@ def find_repo( if not pypi_registry: return "", RepoFinderInfo.PYPI_NO_REGISTRY pypi_asset = PyPIPackageJsonAsset( - purl.name, purl.version, False, pypi_registry, {}, "", "", "", PyPIInspectorAsset("", [], {}) + purl.name, purl.version, False, pypi_registry, {}, PyPIInspectorAsset("", [], {}) ) if not pypi_asset: diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py index a4306d905..ce8630d37 100644 --- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py @@ -4,6 +4,7 @@ """The module provides abstractions for the pypi package registry.""" from __future__ import annotations +import bisect import hashlib import logging import os @@ -15,7 +16,7 @@ import zipfile from collections.abc import Callable, Generator, Iterator from contextlib import contextmanager -from dataclasses import dataclass +from dataclasses import dataclass, field from datetime import datetime from typing import TYPE_CHECKING @@ -502,6 +503,42 @@ def get_maintainer_join_date(self, username: str) -> datetime | None: return res.replace(tzinfo=None) if res else None + def get_matching_setuptools_version(self, package_release_datetime: datetime) -> str: + """Find the setuptools version that would be "latest" for the input datetime. + + Parameters + ---------- + package_release_datetime: datetime + Release datetime of a package we wish to rebuild + + Returns + ------- + str: Matching version of setuptools + """ + setuptools_endpoint = urllib.parse.urljoin(self.registry_url, "pypi/setuptools/json") + setuptools_json = self.download_package_json(setuptools_endpoint) + releases = json_extract(setuptools_json, ["releases"], dict) + if releases: + release_tuples = [ + (version, release_info[0].get("upload_time")) + for version, release_info in releases.items() + if release_info + ] + # Cannot assume this is sorted, as releases is just a dict + release_tuples.sort(key=lambda x: x[1]) + # bisect_left gives position to insert package_release_datetime to maintain order, hence we do -1 + index = ( + bisect.bisect_left( + release_tuples, package_release_datetime, key=lambda x: datetime.strptime(x[1], "%Y-%m-%dT%H:%M:%S") + ) - 1 + ) + return str(release_tuples[index][0]) + # This realistically cannot happen: it would mean we somehow are trying to rebuild + # for a package and version with no releases. + # Return default just in case.
+ return defaults.get("heuristic.pypi", "default_setuptools") + @staticmethod def extract_attestation(attestation_data: dict) -> dict | None: """Extract the first attestation file from a PyPI attestation response. @@ -618,13 +655,16 @@ class PyPIPackageJsonAsset: package_json: dict #: The source code temporary location name. - package_sourcecode_path: str + package_sourcecode_path: str = field(init=False) #: The wheel temporary location name. - wheel_path: str + wheel_path: str = field(init=False) #: Name of the wheel file. - wheel_filename: str + wheel_filename: str = field(init=False) + + #: The datetime that the wheel was uploaded. + package_upload_time: datetime | None = field(default=None, init=False) #: The pypi inspector information about this package inspector_asset: PyPIInspectorAsset @@ -769,6 +809,11 @@ def get_wheel_url(self, tag: str = "none-any") -> str | None: if not urls: return None for distribution in urls: + # In this way we have a package_upload_time even if we cannot find the wheel. + try: + self.package_upload_time = datetime.strptime(distribution.get("upload_time") or "", "%Y-%m-%dT%H:%M:%S") + except ValueError: + logging.debug("Could not parse the uploaded datetime: %s", distribution.get("upload_time") or "") # Only examine wheels if distribution.get("packagetype") != "bdist_wheel": continue @@ -779,6 +824,12 @@ def get_wheel_url(self, tag: str = "none-any") -> str | None: # Continue to getting url wheel_url: str = distribution.get("url") or "" if wheel_url: + try: + self.package_upload_time = datetime.strptime( + distribution.get("upload_time") or "", "%Y-%m-%dT%H:%M:%S" + ) + except ValueError: + logging.debug("Could not parse the uploaded datetime: %s", distribution.get("upload_time") or "") try: parsed_url = urllib.parse.urlparse(wheel_url) except ValueError: @@ -919,6 +970,33 @@ def get_sourcecode_file_contents(self, path: str) -> bytes: logger.debug(error_msg) raise SourceCodeError(error_msg) from read_error + def file_exists(self, path: str) -> bool: + """Check if a file exists in the downloaded source code. + + The path can be relative to the package_sourcecode_path attribute, or an absolute path. + + Parameters + ---------- + path: str + The absolute or relative to package_sourcecode_path file path to check for. + + Returns + ------- + bool: Whether or not a file at path absolute or relative to package_sourcecode_path exists. + """ + if not self.package_sourcecode_path: + # No source code files were downloaded + return False + + if not os.path.isabs(path): + path = os.path.join(self.package_sourcecode_path, path) + + if not os.path.exists(path): + # Could not find a file at that path + return False + + return True + def iter_sourcecode(self) -> Iterator[tuple[str, bytes]]: """ Iterate through all source code files. @@ -1054,6 +1132,19 @@ def get_inspector_src_preview_links(self) -> bool: # If all distributions were invalid and went along a 'continue' path. return bool(self.inspector_asset) + def get_chronologically_suitable_setuptools_version(self) -> str: + """Find version of setuptools that would be "latest" for this package. 
+ + Returns + ------- + str + Chronologically likeliest setuptools version + """ + if self.package_upload_time: + return self.pypi_registry.get_matching_setuptools_version(self.package_upload_time) + # If we cannot infer upload time for the package, return the default + return defaults.get("heuristic.pypi", "default_setuptools") + def find_or_create_pypi_asset( asset_name: str, asset_version: str | None, pypi_registry_info: PackageRegistryInfo @@ -1091,8 +1182,6 @@ def find_or_create_pypi_asset( logger.debug("Failed to create PyPIPackageJson asset.") return None - asset = PyPIPackageJsonAsset( - asset_name, asset_version, False, package_registry, {}, "", "", "", PyPIInspectorAsset("", [], {}) - ) + asset = PyPIPackageJsonAsset(asset_name, asset_version, False, package_registry, {}, PyPIInspectorAsset("", [], {})) pypi_registry_info.metadata.append(asset) return asset diff --git a/tests/malware_analyzer/pypi/test_wheel_absence.py b/tests/malware_analyzer/pypi/test_wheel_absence.py index 7f6e4e593..c00feb2cb 100644 --- a/tests/malware_analyzer/pypi/test_wheel_absence.py +++ b/tests/malware_analyzer/pypi/test_wheel_absence.py @@ -125,7 +125,7 @@ def test_get_inspector_src_preview_links(mock_send_head_http_raw: MagicMock) -> mock_send_head_http_raw.return_value = MagicMock() # Assume valid URL for testing purposes. pypi_package_json = PyPIPackageJsonAsset( - package_name, version, False, pypi_registry, package_json, "", "", "", PyPIInspectorAsset("", [], {}) + package_name, version, False, pypi_registry, package_json, PyPIInspectorAsset("", [], {}) ) assert pypi_package_json.get_inspector_src_preview_links() is True From d190f653c1dc39c50c151972cd8e68aa60317dc7 Mon Sep 17 00:00:00 2001 From: Behnaz Hassanshahi Date: Fri, 19 Dec 2025 14:16:23 +1000 Subject: [PATCH 05/20] fix!: use --output option for Macaron Python Package (#1266) Fixes inconsistent output path handling between Docker CLI and Python package CLI in Macaron by introducing a --output option and HOST_OUTPUT env var, ensuring reports/logs use container-external paths when set. Signed-off-by: behnazh-w --- .gitignore | 1 + docs/source/pages/cli_usage/index.rst | 2 +- scripts/release_scripts/run_macaron.sh | 6 ++ src/macaron/__main__.py | 46 ++++++++++----- .../build_spec_generator.py | 7 ++- src/macaron/config/defaults.py | 11 ++-- src/macaron/config/global_config.py | 3 + src/macaron/dependency_analyzer/cyclonedx.py | 5 +- src/macaron/output_reporter/__init__.py | 59 ++++++++++++++++++- src/macaron/output_reporter/reporter.py | 13 ++-- src/macaron/provenance/provenance_verifier.py | 5 +- src/macaron/repo_finder/repo_utils.py | 7 ++- .../ci_service/base_ci_service.py | 5 +- tests/config/test_defaults.py | 7 +-- tests/conftest.py | 2 +- .../cases/apache_maven_sbom/test.yaml | 3 +- tests/integration/run.py | 19 +++--- tests/macaron_testcase.py | 2 +- tests/test_main.py | 2 +- 19 files changed, 146 insertions(+), 59 deletions(-) diff --git a/.gitignore b/.gitignore index 758a3d0cb..fe358ea54 100644 --- a/.gitignore +++ b/.gitignore @@ -167,6 +167,7 @@ gradlew.bat .macaron reports output +output_dir cdx_debug.json sbom_debug.json golang/internal/filewriter/mock_dir/result.json diff --git a/docs/source/pages/cli_usage/index.rst b/docs/source/pages/cli_usage/index.rst index dc169c3a2..668794ec9 100644 --- a/docs/source/pages/cli_usage/index.rst +++ b/docs/source/pages/cli_usage/index.rst @@ -42,7 +42,7 @@ Common Options Disable Rich UI output. This will turn off any rich formatting (e.g., colored output, tables, etc.) 
used in the terminal UI. -.. option:: -o OUTPUT_DIR, --output-dir OUTPUT_DIR +.. option:: -o OUTPUT, --output OUTPUT_DIR The output destination path for Macaron. This is where Macaron will store the results of the analysis. diff --git a/scripts/release_scripts/run_macaron.sh b/scripts/release_scripts/run_macaron.sh index 65dd08954..306ae1bf7 100755 --- a/scripts/release_scripts/run_macaron.sh +++ b/scripts/release_scripts/run_macaron.sh @@ -56,6 +56,9 @@ IMAGE="ghcr.io/oracle/macaron" # Workspace directory inside of the container. MACARON_WORKSPACE="/home/macaron" +# Host output path outside the container. +HOST_OUTPUT="" + # The entrypoint to run Macaron or the Policy Engine. # It it set by default to macaron. # We use an array here to preserve the arguments as provided by the user. @@ -388,8 +391,10 @@ fi if [[ -n "${arg_output:-}" ]]; then output="${arg_output}" argv_main+=("--output" "${MACARON_WORKSPACE}/output/") + HOST_OUTPUT="${arg_output}" else output=$(pwd)/output + HOST_OUTPUT="output" echo "Setting default output directory to ${output}." fi @@ -659,6 +664,7 @@ docker run \ --rm -i "${tty[@]}" \ -e "USER_UID=${USER_UID}" \ -e "USER_GID=${USER_GID}" \ + -e "HOST_OUTPUT=${HOST_OUTPUT}" \ "${proxy_vars[@]}" \ "${prod_vars[@]}" \ "${mounts[@]}" \ diff --git a/src/macaron/__main__.py b/src/macaron/__main__.py index e23844e5e..dd103eec6 100644 --- a/src/macaron/__main__.py +++ b/src/macaron/__main__.py @@ -22,6 +22,7 @@ from macaron.config.global_config import global_config from macaron.console import RichConsoleHandler, access_handler from macaron.errors import ConfigurationError +from macaron.output_reporter import find_report_output_path from macaron.output_reporter.reporter import HTMLReporter, JSONReporter, PolicyReporter from macaron.policy_engine.policy_engine import run_policy_engine, show_prelude from macaron.repo_finder import repo_finder @@ -280,14 +281,14 @@ def verify_policy(verify_policy_args: argparse.Namespace) -> int: rich_handler = access_handler.get_handler() if vsa is not None: vsa_filepath = os.path.join(global_config.output_path, "vsa.intoto.jsonl") - rich_handler.update_vsa(os.path.relpath(vsa_filepath, os.getcwd())) + rich_handler.update_vsa(find_report_output_path(vsa_filepath)) logger.info( "Generating the Verification Summary Attestation (VSA) to %s.", - os.path.relpath(vsa_filepath, os.getcwd()), + find_report_output_path(vsa_filepath), ) logger.info( "To decode and inspect the payload, run `cat %s | jq -r '.payload' | base64 -d | jq`.", - os.path.relpath(vsa_filepath, os.getcwd()), + find_report_output_path(vsa_filepath), ) try: with open(vsa_filepath, mode="w", encoding="utf-8") as file: @@ -295,7 +296,7 @@ def verify_policy(verify_policy_args: argparse.Namespace) -> int: except OSError as err: logger.error( "Could not generate the VSA to %s. Error: %s", - os.path.relpath(vsa_filepath, os.getcwd()), + find_report_output_path(vsa_filepath), err, ) else: @@ -372,7 +373,7 @@ def perform_action(action_args: argparse.Namespace) -> None: if not action_args.disable_rich_output: rich_handler.start("dump-defaults") # Create the defaults.ini file in the output dir and exit. 
- create_defaults(action_args.output_dir, os.getcwd()) + create_defaults(action_args.output) sys.exit(os.EX_OK) case "verify-policy": @@ -466,6 +467,9 @@ def main(argv: list[str] | None = None) -> None: global_config.gl_token = _get_token_from_dict_or_env("MCN_GITLAB_TOKEN", token_dict) global_config.gl_self_host_token = _get_token_from_dict_or_env("MCN_SELF_HOSTED_GITLAB_TOKEN", token_dict) + # Set the host output path, which would be set if Macaron is running inside a container. + global_config.host_output_path = _get_host_output_path_env() + main_parser = argparse.ArgumentParser(prog="macaron") main_parser.add_argument( @@ -492,7 +496,7 @@ def main(argv: list[str] | None = None) -> None: main_parser.add_argument( "-o", - "--output-dir", + "--output", default=os.path.join(os.getcwd(), "output"), help="The output destination path for Macaron", ) @@ -724,29 +728,29 @@ def main(argv: list[str] | None = None) -> None: try: # Set the output directory. - if not args.output_dir: + if not args.output: logger.error("The output path cannot be empty. Exiting ...") sys.exit(os.EX_USAGE) - if os.path.isfile(args.output_dir): + if os.path.isfile(args.output): logger.error("The output directory already exists. Exiting ...") sys.exit(os.EX_USAGE) - if os.path.isdir(args.output_dir): + if os.path.isdir(args.output): logger.info( "Setting the output directory to %s", - os.path.relpath(args.output_dir, os.getcwd()), + find_report_output_path(args.output), ) else: logger.info( "No directory at %s. Creating one ...", - os.path.relpath(args.output_dir, os.getcwd()), + find_report_output_path(args.output), ) - os.makedirs(args.output_dir) + os.makedirs(args.output) # Add file handler to the root logger. Remove stream handler from the # root logger to prevent dependencies printing logs to stdout. - debug_log_path = os.path.join(args.output_dir, "debug.log") + debug_log_path = os.path.join(args.output, "debug.log") log_file_handler = logging.FileHandler(debug_log_path, "w") log_file_handler.setFormatter(logging.Formatter(log_format)) if args.disable_rich_output: @@ -769,8 +773,8 @@ def main(argv: list[str] | None = None) -> None: # set through analyze sub-command. global_config.load( macaron_path=macaron.MACARON_PATH, - output_path=args.output_dir, - build_log_path=os.path.join(args.output_dir, "build_log"), + output_path=args.output, + build_log_path=os.path.join(args.output, "build_log"), debug_level=log_level, local_repos_path=args.local_repos_path, resources_path=os.path.join(macaron.MACARON_PATH, "resources"), @@ -800,5 +804,17 @@ def _get_token_from_dict_or_env(token: str, token_dict: dict[str, str]) -> str: return token_dict[token] if token in token_dict else os.environ.get(token) or "" +def _get_host_output_path_env() -> str: + """ + Get the host output path from the HOST_OUTPUT environment variable. + + Returns + ------- + str + The HOST_OUTPUT environment variable or an empty string. 
+ """ + return os.environ.get("HOST_OUTPUT") or "" + + if __name__ == "__main__": main() diff --git a/src/macaron/build_spec_generator/build_spec_generator.py b/src/macaron/build_spec_generator/build_spec_generator.py index c23fc3c80..9d7fd94ca 100644 --- a/src/macaron/build_spec_generator/build_spec_generator.py +++ b/src/macaron/build_spec_generator/build_spec_generator.py @@ -17,6 +17,7 @@ from macaron.build_spec_generator.reproducible_central.reproducible_central import gen_reproducible_central_build_spec from macaron.console import access_handler from macaron.errors import GenerateBuildSpecError +from macaron.output_reporter import find_report_output_path from macaron.path_utils.purl_based_path import get_purl_based_dir logger: logging.Logger = logging.getLogger(__name__) @@ -120,17 +121,17 @@ def gen_build_spec_for_purl( logger.info( "Generating the %s format build spec to %s", build_spec_format.value, - os.path.relpath(build_spec_file_path, os.getcwd()), + find_report_output_path(build_spec_file_path), ) rich_handler = access_handler.get_handler() - rich_handler.update_gen_build_spec("Build Spec Path:", os.path.relpath(build_spec_file_path, os.getcwd())) + rich_handler.update_gen_build_spec("Build Spec Path:", find_report_output_path(build_spec_file_path)) try: with open(build_spec_file_path, mode="w", encoding="utf-8") as file: file.write(build_spec_content) except OSError as error: logger.error( "Could not create the build spec at %s. Error: %s", - os.path.relpath(build_spec_file_path, os.getcwd()), + find_report_output_path(build_spec_file_path), error, ) return os.EX_OSERR diff --git a/src/macaron/config/defaults.py b/src/macaron/config/defaults.py index a5b487c0b..d0f355092 100644 --- a/src/macaron/config/defaults.py +++ b/src/macaron/config/defaults.py @@ -10,6 +10,7 @@ import shutil from macaron.console import access_handler +from macaron.output_reporter import find_report_output_path logger: logging.Logger = logging.getLogger(__name__) @@ -138,15 +139,13 @@ def load_defaults(user_config_path: str) -> bool: return False -def create_defaults(output_path: str, cwd_path: str) -> bool: +def create_defaults(output_path: str) -> bool: """Create the ``defaults.ini`` file at the Macaron's root dir for end users. Parameters ---------- output_path : str The path where the ``defaults.ini`` will be created. - cwd_path : str - The path to the current working directory. Returns ------- @@ -169,12 +168,12 @@ def create_defaults(output_path: str, cwd_path: str) -> bool: shutil.copy2(src_path, dest_path) logger.info( "Dumped the default values in %s.", - os.path.relpath(os.path.join(output_path, "defaults.ini"), cwd_path), + find_report_output_path(os.path.join(output_path, "defaults.ini")), ) - rich_handler.update_dump_defaults(os.path.relpath(dest_path, cwd_path)) + rich_handler.update_dump_defaults(find_report_output_path(dest_path)) return True # We catch OSError to support errors on different platforms. except OSError as error: - logger.error("Failed to create %s: %s.", os.path.relpath(dest_path, cwd_path), error) + logger.error("Failed to create %s: %s.", find_report_output_path(dest_path), error) rich_handler.update_dump_defaults("[bold red]Failed[/]") return False diff --git a/src/macaron/config/global_config.py b/src/macaron/config/global_config.py index 4e2befa6f..78bedc34b 100644 --- a/src/macaron/config/global_config.py +++ b/src/macaron/config/global_config.py @@ -49,6 +49,9 @@ class GlobalConfig: #: The path to the local .m2 Maven repository. 
This attribute is None if there is no available .m2 directory. local_maven_repo: str | None = None + #: The host output path, if Macaron is executed as a container. + host_output_path: str = "" + def load( self, macaron_path: str, diff --git a/src/macaron/dependency_analyzer/cyclonedx.py b/src/macaron/dependency_analyzer/cyclonedx.py index 9fec0536f..c46a8a773 100644 --- a/src/macaron/dependency_analyzer/cyclonedx.py +++ b/src/macaron/dependency_analyzer/cyclonedx.py @@ -26,6 +26,7 @@ from macaron.config.target_config import Configuration from macaron.database.table_definitions import Component from macaron.errors import CycloneDXParserError, DependencyAnalyzerError +from macaron.output_reporter import find_report_output_path from macaron.output_reporter.scm import SCMStatus from macaron.repo_finder.repo_finder import find_repo from macaron.repo_finder.repo_finder_enums import RepoFinderInfo @@ -359,7 +360,7 @@ def resolve_dependencies(main_ctx: Any, sbom_path: str, recursive: bool = False) "Running %s version %s dependency analyzer on %s", dep_analyzer.tool_name, dep_analyzer.tool_version, - os.path.relpath(main_ctx.component.repository.fs_path, os.getcwd()), + find_report_output_path(main_ctx.component.repository.fs_path), ) log_path = os.path.join( @@ -397,7 +398,7 @@ def resolve_dependencies(main_ctx: Any, sbom_path: str, recursive: bool = False) logger.info( "Stored dependency resolver log for %s to %s.", dep_analyzer.tool_name, - os.path.relpath(log_path, os.getcwd()), + find_report_output_path(log_path), ) # Use repo finder to find more repositories to analyze. diff --git a/src/macaron/output_reporter/__init__.py b/src/macaron/output_reporter/__init__.py index f29d8ac6c..bbcaa8ea3 100644 --- a/src/macaron/output_reporter/__init__.py +++ b/src/macaron/output_reporter/__init__.py @@ -1,2 +1,59 @@ -# Copyright (c) 2022 - 2022, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""This module contains helper functions for reporting.""" + +import logging +import os +from pathlib import Path + +from macaron.config.global_config import global_config + +logger: logging.Logger = logging.getLogger(__name__) + + +def find_report_output_path(file_name: str, host_output_path: str | None = None) -> str: + """ + Determine the output path for a report file. + + If ``host_output_path`` is empty or None, returns the file path relative + to the current working directory. Otherwise, prefixes the path (stripping + the first directory component) with the provided container host output path. + Returns empty string if path has no parts to strip. + + Parameters + ---------- + file_name : str + Path to the input file (absolute or relative). + host_output_path : str | None + Base output directory path. + + Returns + ------- + str + Output path as string. 
+ + Examples + -------- + >>> find_report_output_path("output/reports/maven/foo/bar", host_output_path=None) + 'output/reports/maven/foo/bar' + >>> find_report_output_path("output/reports/maven/foo/bar", host_output_path="output_dir") + 'output_dir/reports/maven/foo/bar' + >>> find_report_output_path("foo", host_output_path="output") + 'output/' + >>> find_report_output_path("", host_output_path="output") + '' + """ + if not file_name: + return "" + if host_output_path is None: + host_output_path = global_config.host_output_path + try: + file_path = Path(os.path.relpath(file_name, os.getcwd())) + except (ValueError, OSError) as error: + logger.debug("Failed to create path for %s: %s", file_name, error) + return "" + if not host_output_path: + return str(file_path) + + return os.path.join(host_output_path, file_path.relative_to(file_path.parts[0])).rstrip(".") diff --git a/src/macaron/output_reporter/reporter.py b/src/macaron/output_reporter/reporter.py index 45589836e..418c79160 100644 --- a/src/macaron/output_reporter/reporter.py +++ b/src/macaron/output_reporter/reporter.py @@ -20,6 +20,7 @@ import macaron.output_reporter.jinja2_extensions as jinja2_extensions # pylint: disable=consider-using-from-import from macaron.console import access_handler +from macaron.output_reporter import find_report_output_path from macaron.output_reporter.results import Report from macaron.output_reporter.scm import SCMStatus @@ -62,13 +63,13 @@ def write_file(self, file_path: str, data: str) -> bool: """ try: with open(file_path, mode=self.mode, encoding=self.encoding) as file: - logger.info("Writing to file %s", os.path.relpath(file_path, os.getcwd())) + logger.info("Writing to file %s", find_report_output_path(file_path)) file.write(data) return True except OSError as error: logger.error( "Cannot write to %s. 
Error: %s", - os.path.relpath(file_path, os.getcwd()), + find_report_output_path(file_path), error, ) return False @@ -128,7 +129,7 @@ def generate(self, target_dir: str, report: Report | dict) -> None: dep_file_name = os.path.join(target_dir, "dependencies.json") serialized_configs = list(report.get_serialized_configs()) self.write_file(dep_file_name, json.dumps(serialized_configs, indent=self.indent)) - self.rich_handler.update_report_table("Dependencies Report", os.path.relpath(dep_file_name, os.getcwd())) + self.rich_handler.update_report_table("Dependencies Report", find_report_output_path(dep_file_name)) for record in report.get_records(): if record.context and record.status == SCMStatus.AVAILABLE: @@ -136,7 +137,7 @@ def generate(self, target_dir: str, report: Report | dict) -> None: json_data = json.dumps(record.get_dict(), indent=self.indent) self.write_file(file_name, json_data) self.rich_handler.update_report_table( - "JSON Report", os.path.relpath(file_name, os.getcwd()), record.record_id + "JSON Report", find_report_output_path(file_name), record.record_id ) except TypeError as error: logger.critical("Cannot serialize output report to JSON: %s", error) @@ -231,7 +232,7 @@ def generate(self, target_dir: str, report: Report | dict) -> None: html = self.template.render(deepcopy(record.get_dict())) self.write_file(file_name, html) self.rich_handler.update_report_table( - "HTML Report", os.path.relpath(file_name, os.getcwd()), record.record_id + "HTML Report", find_report_output_path(file_name), record.record_id ) except TemplateSyntaxError as error: location = f"line {error.lineno}" @@ -285,7 +286,7 @@ def generate(self, target_dir: str, report: Report | dict) -> None: json.dumps(report, indent=self.indent), ) self.rich_handler.update_policy_report( - os.path.relpath(os.path.join(target_dir, "policy_report.json"), os.getcwd()) + find_report_output_path(os.path.join(target_dir, "policy_report.json")) ) except (TypeError, ValueError, OSError) as error: logger.critical("Cannot serialize the policy report to JSON: %s", error) diff --git a/src/macaron/provenance/provenance_verifier.py b/src/macaron/provenance/provenance_verifier.py index 0be9920fa..06356eff6 100644 --- a/src/macaron/provenance/provenance_verifier.py +++ b/src/macaron/provenance/provenance_verifier.py @@ -16,6 +16,7 @@ from macaron.config.defaults import defaults from macaron.config.global_config import global_config +from macaron.output_reporter import find_report_output_path from macaron.provenance.provenance_extractor import ProvenancePredicate, SLSAGithubGenericBuildDefinitionV01 from macaron.provenance.provenance_finder import ProvenanceAsset from macaron.repo_finder.commit_finder import AbstractPurlType, determine_abstract_purl_type @@ -336,7 +337,7 @@ def _verify_slsa( verified = "PASSED: SLSA verification passed" in output log_path = os.path.join(global_config.build_log_path, f"{os.path.basename(source_path)}.slsa_verifier.log") with open(log_path, mode="a", encoding="utf-8") as log_file: - logger.info("Storing SLSA verifier output for %s to %s", asset_name, os.path.relpath(log_path, os.getcwd())) + logger.info("Storing SLSA verifier output for %s to %s", asset_name, find_report_output_path(log_path)) log_file.writelines( [f"SLSA verifier output for cmd: {' '.join(cmd)}\n", output, "--------------------------------\n"] ) @@ -359,7 +360,7 @@ def _verify_slsa( ) with open(error_log_path, mode="a", encoding="utf-8") as log_file: logger.info( - "Storing SLSA verifier log for%s to %s", asset_name, 
os.path.relpath(error_log_path, os.getcwd()) + "Storing SLSA verifier log for%s to %s", asset_name, find_report_output_path(error_log_path) ) log_file.write(f"SLSA verifier output for cmd: {' '.join(cmd)}\n") log_file.writelines(errors) diff --git a/src/macaron/repo_finder/repo_utils.py b/src/macaron/repo_finder/repo_utils.py index d99280e0f..56d48b42a 100644 --- a/src/macaron/repo_finder/repo_utils.py +++ b/src/macaron/repo_finder/repo_utils.py @@ -14,6 +14,7 @@ from macaron.config.global_config import global_config from macaron.console import access_handler +from macaron.output_reporter import find_report_output_path from macaron.slsa_analyzer.git_service import GIT_SERVICES, BaseGitService from macaron.slsa_analyzer.git_service.base_git_service import NoneGitService from macaron.slsa_analyzer.git_url import GIT_REPOS_DIR, decode_git_tags, parse_git_tags @@ -78,7 +79,7 @@ def generate_report(purl: str, commit: str, repo: str, target_dir: str) -> bool: fullpath = f"{target_dir}/{filename}" os.makedirs(os.path.dirname(fullpath), exist_ok=True) - logger.info("Writing report to: %s", os.path.relpath(fullpath, os.getcwd())) + logger.info("Writing report to: %s", find_report_output_path(fullpath)) try: with open(fullpath, "w", encoding="utf-8") as file: @@ -87,10 +88,10 @@ def generate_report(purl: str, commit: str, repo: str, target_dir: str) -> bool: logger.debug("Failed to write report to file: %s", error) return False - logger.info("Report written to: %s", os.path.relpath(fullpath, os.getcwd())) + logger.info("Report written to: %s", find_report_output_path(fullpath)) rich_handler = access_handler.get_handler() - rich_handler.update_find_source_table("JSON Report:", os.path.relpath(fullpath, os.getcwd())) + rich_handler.update_find_source_table("JSON Report:", find_report_output_path(fullpath)) return True diff --git a/src/macaron/slsa_analyzer/ci_service/base_ci_service.py b/src/macaron/slsa_analyzer/ci_service/base_ci_service.py index 9df7e8e70..56979e055 100644 --- a/src/macaron/slsa_analyzer/ci_service/base_ci_service.py +++ b/src/macaron/slsa_analyzer/ci_service/base_ci_service.py @@ -11,6 +11,7 @@ from datetime import datetime from macaron.code_analyzer.dataflow_analysis.core import NodeForest +from macaron.output_reporter import find_report_output_path from macaron.slsa_analyzer.git_service.api_client import BaseAPIClient from macaron.slsa_analyzer.git_service.base_git_service import BaseGitService @@ -146,9 +147,7 @@ def has_kws_in_config(self, kws: list, build_tool_name: str, repo_path: str) -> line.strip(), ) return keyword, config - logger.info( - "No build command found for %s in %s", build_tool_name, os.path.relpath(file_path, os.getcwd()) - ) + logger.info("No build command found for %s in %s", build_tool_name, find_report_output_path(file_path)) return "", "" except FileNotFoundError as error: logger.debug(error) diff --git a/tests/config/test_defaults.py b/tests/config/test_defaults.py index 45d138590..d0b09c5ee 100644 --- a/tests/config/test_defaults.py +++ b/tests/config/test_defaults.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
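All of the logging call sites above now route report paths through find_report_output_path. A minimal sketch of the resulting behaviour, using the helper added in this patch; the paths and host directory below are illustrative, and the input path is assumed to be relative to the current working directory:

from macaron.config.global_config import global_config
from macaron.output_reporter import find_report_output_path

# Outside a container: host_output_path stays empty, so the helper behaves
# like the old os.path.relpath(file_path, os.getcwd()) calls it replaces.
global_config.host_output_path = ""
print(find_report_output_path("output/reports/maven/foo/bar.json"))
# -> output/reports/maven/foo/bar.json

# Inside a container: the first path component ("output") is swapped for the
# host-side output directory, so logged paths remain valid on the host.
global_config.host_output_path = "/home/user/macaron_output"
print(find_report_output_path("output/reports/maven/foo/bar.json"))
# -> /home/user/macaron_output/reports/maven/foo/bar.json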
"""This module tests the defaults module.""" @@ -9,7 +9,6 @@ import pytest from macaron.config.defaults import create_defaults, defaults, load_defaults -from macaron.config.global_config import global_config def test_load_defaults() -> None: @@ -29,7 +28,7 @@ def test_load_defaults() -> None: def test_create_defaults() -> None: """Test dumping the default values.""" output_dir = os.path.dirname(os.path.abspath(__file__)) - assert create_defaults(output_dir, global_config.macaron_path) is True + assert create_defaults(output_dir) is True @pytest.mark.xfail( @@ -38,7 +37,7 @@ def test_create_defaults() -> None: ) def test_create_defaults_without_permission() -> None: """Test dumping default config in cases where the user does not have write permission to the output location.""" - assert create_defaults(output_path="/", cwd_path="/") is False + assert create_defaults(output_path="/") is False @pytest.mark.parametrize( diff --git a/tests/conftest.py b/tests/conftest.py index cb2cee8c0..413de3498 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -82,7 +82,7 @@ def setup_test(test_dir: Path, macaron_path: Path) -> NoReturn: # type: ignore """ # Load values from defaults.ini. if not test_dir.joinpath("defaults.ini").exists(): - create_defaults(str(test_dir), str(macaron_path)) + create_defaults(str(test_dir)) load_defaults(str(macaron_path)) yield diff --git a/tests/integration/cases/apache_maven_sbom/test.yaml b/tests/integration/cases/apache_maven_sbom/test.yaml index 2e2e47a34..bd7a861a3 100644 --- a/tests/integration/cases/apache_maven_sbom/test.yaml +++ b/tests/integration/cases/apache_maven_sbom/test.yaml @@ -13,6 +13,7 @@ steps: kind: analyze options: ini: config.ini + output: output_dir command_args: - -purl - pkg:maven/org.apache.maven/maven@4.0.0-alpha-1-SNAPSHOT?type=pom @@ -28,5 +29,5 @@ steps: kind: compare options: kind: deps_report - result: output/reports/maven/org_apache_maven/maven/dependencies.json + result: output_dir/reports/maven/org_apache_maven/maven/dependencies.json expected: dependencies.json diff --git a/tests/integration/run.py b/tests/integration/run.py index e1cec81cf..e78cf57a6 100644 --- a/tests/integration/run.py +++ b/tests/integration/run.py @@ -390,6 +390,7 @@ class AnalyzeStepOptions(TypedDict): expectation: str | None provenance: str | None sbom: str | None + output: str | None @dataclass @@ -404,15 +405,7 @@ def options_schema(cwd: str) -> cfgv.Map: None, *[ cfgv.NoAdditionalKeys( - [ - "main_args", - "command_args", - "env", - "ini", - "expectation", - "provenance", - "sbom", - ], + ["main_args", "command_args", "env", "ini", "expectation", "provenance", "sbom", "output"], ), cfgv.Optional( key="main_args", @@ -444,6 +437,11 @@ def options_schema(cwd: str) -> cfgv.Map: check_fn=check_required_file(cwd), default=None, ), + cfgv.Optional( + key="output", + check_fn=cfgv.check_string, + default=None, + ), ], ) @@ -454,6 +452,9 @@ def cmd(self, macaron_cmd: str) -> list[str]: ini_file = self.options.get("ini", None) if ini_file is not None: args.extend(["--defaults-path", ini_file]) + output = self.options.get("output", None) + if output is not None: + args.extend(["--output", output]) args.append("analyze") expectation_file = self.options.get("expectation", None) if expectation_file is not None: diff --git a/tests/macaron_testcase.py b/tests/macaron_testcase.py index a799d8f6e..9f94b4f39 100644 --- a/tests/macaron_testcase.py +++ b/tests/macaron_testcase.py @@ -26,7 +26,7 @@ def setUpClass(cls) -> None: """Set up the necessary values for the tests.""" 
# Load values from defaults.ini. if not cls.macaron_test_dir.joinpath("defaults.ini").exists(): - create_defaults(str(cls.macaron_test_dir), str(cls.macaron_path)) + create_defaults(str(cls.macaron_test_dir)) load_defaults(os.path.join(str(cls.macaron_test_dir), "defaults.ini")) diff --git a/tests/test_main.py b/tests/test_main.py index ce7f9d7c4..50f57b7d9 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2023, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """Simple tests for the main method.""" From bd6880c5e899da1086dd24fa27d29a2950523747 Mon Sep 17 00:00:00 2001 From: Behnaz Hassanshahi Date: Thu, 8 Jan 2026 16:22:13 +1000 Subject: [PATCH 06/20] test: update log4j package version to the latest (#1276) Update the log4j package version in integration tests to the latest version, for which GitHub workflow runs are not deleted, and update the documentations. Signed-off-by: behnazh-w --- .github/workflows/test_macaron_action.yaml | 4 ++-- .../pages/tutorials/detect_malicious_java_dep.rst | 12 ++++++------ .../tutorials/detect_vulnerable_github_actions.rst | 2 +- .../cases/org_apache_logging_log4j/policy_purl.dl | 4 ++-- .../cases/org_apache_logging_log4j/test.yaml | 4 ++-- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/test_macaron_action.yaml b/.github/workflows/test_macaron_action.yaml index 2621313c6..930863d30 100644 --- a/.github/workflows/test_macaron_action.yaml +++ b/.github/workflows/test_macaron_action.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2025 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
name: Test Macaron Action (tutorials) @@ -111,7 +111,7 @@ jobs: - name: Run Macaron (analyze purl - log4j-core example) uses: ./ with: - package_url: pkg:maven/org.apache.logging.log4j/log4j-core@3.0.0-beta3 + package_url: pkg:maven/org.apache.logging.log4j/log4j-core@2.25.3 output_dir: macaron_output/detect_vulnerable_github_actions - name: Run Macaron (verify policy - github_actions_vulns for purl) diff --git a/docs/source/pages/tutorials/detect_malicious_java_dep.rst b/docs/source/pages/tutorials/detect_malicious_java_dep.rst index 27bac87ba..02a2bc6df 100644 --- a/docs/source/pages/tutorials/detect_malicious_java_dep.rst +++ b/docs/source/pages/tutorials/detect_malicious_java_dep.rst @@ -25,7 +25,7 @@ dependencies: * - Artifact name - `Package URL (PURL) `_ * - `log4j-core `_ - - ``pkg:maven/org.apache.logging.log4j/log4j-core@3.0.0-beta3?type=jar`` + - ``pkg:maven/org.apache.logging.log4j/log4j-core@2.25.3?type=jar`` * - `jackson-databind `_ - ``pkg:maven/io.github.behnazh-w.demo/jackson-databind@1.0?type=jar`` @@ -110,20 +110,20 @@ As you scroll down in the HTML report, you will see a section for the dependenci | Macaron has found the two dependencies as expected: * ``io.github.behnazh-w.demo:jackson-databind:1.0`` -* ``org.apache.logging.log4j:log4j-core:3.0.0-beta3`` +* ``org.apache.logging.log4j:log4j-core:2.25.3`` -When we open the reports for each dependency, we see that ``mcn_find_artifact_pipeline_1`` is passed for ``org.apache.logging.log4j:log4j-core:3.0.0-beta3`` -and a GitHub Actions workflow run is found for publishing version ``3.0.0-beta3``. However, this check is failing for ``io.github.behnazh-w.demo:jackson-databind:1.0``. +When we open the reports for each dependency, we see that ``mcn_find_artifact_pipeline_1`` is passed for ``org.apache.logging.log4j:log4j-core:2.25.3`` +and a GitHub Actions workflow run is found for publishing version ``2.25.3``. However, this check is failing for ``io.github.behnazh-w.demo:jackson-databind:1.0``. This means that ``io.github.behnazh-w.demo:jackson-databind:1.0`` could have been built and published manually to Maven Central and could potentially be malicious. .. _fig_find_artifact_pipeline_log4j: .. figure:: ../../_static/images/tutorial_log4j_find_pipeline.png - :alt: mcn_find_artifact_pipeline_1 for org.apache.logging.log4j:log4j-core:3.0.0-beta3 + :alt: mcn_find_artifact_pipeline_1 for org.apache.logging.log4j:log4j-core:2.25.3 :align: center - ``org.apache.logging.log4j:log4j-core:3.0.0-beta3`` + ``org.apache.logging.log4j:log4j-core:2.25.3`` .. _fig_infer_artifact_pipeline_bh_jackson_databind: diff --git a/docs/source/pages/tutorials/detect_vulnerable_github_actions.rst b/docs/source/pages/tutorials/detect_vulnerable_github_actions.rst index f6f1747cf..953523a82 100644 --- a/docs/source/pages/tutorials/detect_vulnerable_github_actions.rst +++ b/docs/source/pages/tutorials/detect_vulnerable_github_actions.rst @@ -107,7 +107,7 @@ Alternatively, run the ``analyze`` command with the PURL of a package: .. code-block:: shell - ./run_macaron.sh analyze -purl pkg:maven/org.apache.logging.log4j/log4j-core@3.0.0-beta3 + ./run_macaron.sh analyze -purl pkg:maven/org.apache.logging.log4j/log4j-core@2.25.3 Then, ensure that the ``mcn_githubactions_vulnerabilities_1`` check passes for the component. 
You can create a similar policy to the one shown earlier and store it in a file (e.g., ``check_github_actions_vuln.dl``): diff --git a/tests/integration/cases/org_apache_logging_log4j/policy_purl.dl b/tests/integration/cases/org_apache_logging_log4j/policy_purl.dl index f81ac7b07..49df9eba0 100644 --- a/tests/integration/cases/org_apache_logging_log4j/policy_purl.dl +++ b/tests/integration/cases/org_apache_logging_log4j/policy_purl.dl @@ -1,4 +1,4 @@ -/* Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. */ +/* Copyright (c) 2024 - 2026, Oracle and/or its affiliates. All rights reserved. */ /* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ #include "prelude.dl" @@ -20,4 +20,4 @@ Policy("test_policy", component_id, "") :- is_repo_url(component_id, "https://github.com/apache/logging-log4j2"). apply_policy_to("test_policy", component_id) :- - is_component(component_id, "pkg:maven/org.apache.logging.log4j/log4j-core@3.0.0-beta3"). + is_component(component_id, "pkg:maven/org.apache.logging.log4j/log4j-core@2.25.3"). diff --git a/tests/integration/cases/org_apache_logging_log4j/test.yaml b/tests/integration/cases/org_apache_logging_log4j/test.yaml index 8da5f01b6..7871c7a5a 100644 --- a/tests/integration/cases/org_apache_logging_log4j/test.yaml +++ b/tests/integration/cases/org_apache_logging_log4j/test.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. description: | @@ -15,7 +15,7 @@ steps: options: command_args: - -purl - - pkg:maven/org.apache.logging.log4j/log4j-core@3.0.0-beta3 + - pkg:maven/org.apache.logging.log4j/log4j-core@2.25.3 - name: Run macaron verify-policy to verify passed/failed checks kind: verify options: From ed3a80f2a768542cc9598769ca4a0ec9f6a91d56 Mon Sep 17 00:00:00 2001 From: Abhinav Pradeep Date: Thu, 15 Jan 2026 21:00:21 +1000 Subject: [PATCH 07/20] chore: infer interpreter version via Python tags in distribution names (#1265) Signed-off-by: Abhinav Pradeep --- .../build_spec_generator.py | 4 +- .../build_spec_generator/common_spec/core.py | 2 +- .../common_spec/pypi_spec.py | 36 +++++++---- .../dockerfile/pypi_dockerfile_output.py | 64 +++++++++++++++++-- 4 files changed, 84 insertions(+), 22 deletions(-) diff --git a/src/macaron/build_spec_generator/build_spec_generator.py b/src/macaron/build_spec_generator/build_spec_generator.py index 9d7fd94ca..e66be4ac2 100644 --- a/src/macaron/build_spec_generator/build_spec_generator.py +++ b/src/macaron/build_spec_generator/build_spec_generator.py @@ -98,8 +98,8 @@ def gen_build_spec_for_purl( case BuildSpecFormat.DOCKERFILE: try: build_spec_content = gen_dockerfile(build_spec) - except ValueError as error: - logger.error("Error while serializing the build spec: %s.", error) + except GenerateBuildSpecError as error: + logger.error("Error while generating the build spec: %s.", error) return os.EX_DATAERR build_spec_file_path = os.path.join(build_spec_dir_path, "dockerfile.buildspec") diff --git a/src/macaron/build_spec_generator/common_spec/core.py b/src/macaron/build_spec_generator/common_spec/core.py index 26b2f329f..4c2cf1ecd 100644 --- a/src/macaron/build_spec_generator/common_spec/core.py +++ b/src/macaron/build_spec_generator/common_spec/core.py @@ -378,7 +378,7 @@ def gen_generic_build_spec( "purl": 
str(purl), "language": target_language, "build_tools": build_tool_names, - "build_commands": [selected_build_command], + "build_commands": [selected_build_command] if selected_build_command else [], } ) ECOSYSTEMS[purl.type.upper()].value(base_build_spec_dict).resolve_fields(purl) diff --git a/src/macaron/build_spec_generator/common_spec/pypi_spec.py b/src/macaron/build_spec_generator/common_spec/pypi_spec.py index 999afbb19..d9bfd4b82 100644 --- a/src/macaron/build_spec_generator/common_spec/pypi_spec.py +++ b/src/macaron/build_spec_generator/common_spec/pypi_spec.py @@ -155,6 +155,16 @@ def resolve_fields(self, purl: PackageURL) -> None: chronologically_likeliest_version = ( pypi_package_json.get_chronologically_suitable_setuptools_version() ) + try: + # Get information from the wheel file name. + logger.debug(pypi_package_json.wheel_filename) + _, _, _, tags = parse_wheel_filename(pypi_package_json.wheel_filename) + for tag in tags: + wheel_name_python_version_list.append(tag.interpreter) + wheel_name_platforms.add(tag.platform) + logger.debug(python_version_set) + except InvalidWheelFilename: + logger.debug("Could not parse wheel file name to extract version") except SourceCodeError: logger.debug("Could not find pure wheel matching this PURL") @@ -214,17 +224,6 @@ def resolve_fields(self, purl: PackageURL) -> None: except (InvalidRequirement, InvalidSpecifier) as error: logger.debug("Malformed requirement encountered %s : %s", requirement, error) - try: - # Get information from the wheel file name. - logger.debug(pypi_package_json.wheel_filename) - _, _, _, tags = parse_wheel_filename(pypi_package_json.wheel_filename) - for tag in tags: - wheel_name_python_version_list.append(tag.interpreter) - wheel_name_platforms.add(tag.platform) - logger.debug(python_version_set) - except InvalidWheelFilename: - logger.debug("Could not parse wheel file name to extract version") - self.data["language_version"] = list(python_version_set) or wheel_name_python_version_list # Use the default build command for pure Python packages. @@ -243,9 +242,18 @@ def resolve_fields(self, purl: PackageURL) -> None: if not patched_build_commands: # Resolve and patch build commands. - selected_build_commands = self.data["build_commands"] or self.get_default_build_commands( - self.data["build_tools"] - ) + + # To ensure that selected_build_commands is never empty, we seed with the fallback + # command of python -m build --wheel -n + if self.data["build_commands"]: + selected_build_commands = self.data["build_commands"] + else: + self.data["build_commands"] = ["python -m build --wheel -n".split()] + selected_build_commands = ( + self.get_default_build_commands(self.data["build_tools"]) or self.data["build_commands"] + ) + + logger.debug(selected_build_commands) patched_build_commands = ( patch_commands( diff --git a/src/macaron/build_spec_generator/dockerfile/pypi_dockerfile_output.py b/src/macaron/build_spec_generator/dockerfile/pypi_dockerfile_output.py index ef2360a5c..457cfe15c 100644 --- a/src/macaron/build_spec_generator/dockerfile/pypi_dockerfile_output.py +++ b/src/macaron/build_spec_generator/dockerfile/pypi_dockerfile_output.py @@ -1,9 +1,10 @@ -# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2025 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
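The relocated block above relies on packaging's wheel-name parser to pull interpreter and platform tags out of the distribution name. A self-contained sketch with made-up filenames:

from packaging.utils import InvalidWheelFilename, parse_wheel_filename

for filename in (
    "cachetools-6.2.1-py3-none-any.whl",
    "tree_sitter-0.25.2-cp310-abi3-manylinux_2_17_x86_64.whl",
):
    try:
        _name, _version, _build, tags = parse_wheel_filename(filename)
        for tag in tags:
            # tag.interpreter feeds the Python-version inference; tag.platform
            # is later checked against "any" to detect pure wheels.
            print(filename, "->", tag.interpreter, tag.abi, tag.platform)
    except InvalidWheelFilename:
        print(filename, "-> not a valid wheel filename")
# cachetools-6.2.1-py3-none-any.whl -> py3 none any
# tree_sitter-0.25.2-cp310-abi3-manylinux_2_17_x86_64.whl -> cp310 abi3 manylinux_2_17_x86_64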
"""This module implements the logic to generate a dockerfile from a Python buildspec.""" import logging +import re from textwrap import dedent from packaging.specifiers import InvalidSpecifier, SpecifierSet @@ -35,8 +36,7 @@ def gen_dockerfile(buildspec: BaseBuildSpecDict) -> str: """ language_version: str | None = pick_specific_version(buildspec) if language_version is None: - logger.debug("Could not derive a specific interpreter version.") - raise GenerateBuildSpecError("Could not derive specific interpreter version.") + raise GenerateBuildSpecError("Could not derive specific interpreter version") backend_install_commands: str = " && ".join(build_backend_commands(buildspec)) build_tool_install: str = "" if ( @@ -124,8 +124,18 @@ def pick_specific_version(buildspec: BaseBuildSpecDict) -> str | None: try: version_set &= SpecifierSet(version) except InvalidSpecifier as error: - logger.debug("Malformed interpreter version encountered: %s (%s)", version, error) - return None + logger.debug("Non-standard interpreter version encountered: %s (%s)", version, error) + # Whilst the Python tags specify interpreter implementation + # as well as version, with no standard way to parse out the + # implementation, we can attempt to heuristically: + try_parse_version = infer_interpreter_version(version) + if try_parse_version: + try: + version_set &= SpecifierSet(f">={try_parse_version}") + except InvalidSpecifier as error_for_retry: + logger.debug("Could not parse interpreter version from: %s (%s)", version, error_for_retry) + + logger.debug(version_set) # Now to get the latest acceptable one, we can step through all interpreter # versions. For the most accurate result, we can query python.org for a @@ -141,6 +151,50 @@ def pick_specific_version(buildspec: BaseBuildSpecDict) -> str | None: return None +def infer_interpreter_version(specifier: str) -> str | None: + """Infer interpreter version from Python-tag. + + Note: This function is called on version specifiers + that we cannot trivially parse. In the case that + it is a Python-tag, which is obtained from the + wheel name, we attempt to infer the corresponding + interpreter version. + + Parameters + ---------- + specifier: str + specifier string that could not be trivially parsed. + + Returns + ------- + str | None + The interpreter version inferred from the specifier, or + None if we cannot parse the specifier as a Python-tag. + + Examples + -------- + >>> infer_interpreter_version("py3") + '3' + >>> infer_interpreter_version("cp314") + '3.14' + >>> infer_interpreter_version("pypy311") + '3.11' + >>> infer_interpreter_version("malformed123") + """ + # The primary alternative interpreter implementations are documented here: + # https://www.python.org/download/alternatives/ + # We parse tags for these implementations using below regular expression: + pattern = re.compile(r"^(py|cp|ip|pp|pypy|jy|graalpy)(\d{1,3})$") + parsed_tag = pattern.match(specifier) + if parsed_tag: + digits = parsed_tag.group(2) + # As match succeeded len(digits) \in {1,2,3} + if len(digits) == 1: + return parsed_tag.group(2) + return f"{digits[0]}.{digits[1:]}" + return None + + def build_backend_commands(buildspec: BaseBuildSpecDict) -> list[str]: """Generate the installation commands for each inferred build backend. 
From d995156f23e54cb7c879fc4ffdcc787505849f8b Mon Sep 17 00:00:00 2001 From: Behnaz Hassanshahi Date: Mon, 19 Jan 2026 18:54:59 +1000 Subject: [PATCH 08/20] chore: address Value field issue for empty strings in dataflow analysis (#1281) This patch addresses an issue in the dataflow analysis related to parsing single-quoted strings in bash scripts. Previously, for empty single-quoted strings the code incorrectly assumed that a Value field with an empty string would always be present. Signed-off-by: behnazh-w --- src/macaron/code_analyzer/dataflow_analysis/bash.py | 8 +++++--- src/macaron/parsers/bashparser_model.py | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/macaron/code_analyzer/dataflow_analysis/bash.py b/src/macaron/code_analyzer/dataflow_analysis/bash.py index f350448a5..4a4903c86 100644 --- a/src/macaron/code_analyzer/dataflow_analysis/bash.py +++ b/src/macaron/code_analyzer/dataflow_analysis/bash.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2025 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """Dataflow analysis implementation for analysing Bash shell scripts.""" @@ -1811,7 +1811,7 @@ def convert_shell_word_to_value( if dbl_quoted_parts is not None: return convert_shell_value_sequence_to_fact_value(dbl_quoted_parts, context), True - sgl_quoted_str = parse_sql_quoted_string(word) + sgl_quoted_str = parse_sgl_quoted_string(word) if sgl_quoted_str is not None: return facts.StringLiteral(sgl_quoted_str), True @@ -1842,7 +1842,7 @@ def parse_dbl_quoted_string(word: bashparser_model.Word) -> list[LiteralOrEnvVar return None -def parse_sql_quoted_string(word: bashparser_model.Word) -> str | None: +def parse_sgl_quoted_string(word: bashparser_model.Word) -> str | None: """Parse single quoted string. If the given word is a single quoted string, return the string @@ -1851,6 +1851,8 @@ def parse_sql_quoted_string(word: bashparser_model.Word) -> str | None: if len(word["Parts"]) == 1: part = word["Parts"][0] if bashparser_model.is_sgl_quoted(part): + if "Value" not in part: + return "" return part["Value"] return None diff --git a/src/macaron/parsers/bashparser_model.py b/src/macaron/parsers/bashparser_model.py index 09ca83813..edd2a6063 100644 --- a/src/macaron/parsers/bashparser_model.py +++ b/src/macaron/parsers/bashparser_model.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """Type definitions for Bash AST as produced (and json-serialised) by the "mvdan.cc/sh/v3/syntax" bash parser.""" @@ -159,7 +159,7 @@ class SglQuoted(TypedDict): Left: Pos Right: Pos Dollar: NotRequired[bool] - Value: str + Value: NotRequired[str] def is_sgl_quoted(part: WordPart) -> TypeGuard[SglQuoted]: From 65ecc4ca5cfca9674cc315cb840a28630e3a1a50 Mon Sep 17 00:00:00 2001 From: Behnaz Hassanshahi Date: Tue, 20 Jan 2026 17:51:52 +1000 Subject: [PATCH 09/20] fix: add the missing provenance asset links to the reports (#1271) Fixes missing provenance asset links in reports, by adding explicit links to provenance files. 
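For example, callers can now surface the attestation link alongside the payload. A sketch with placeholder values; in the real code the payload is an InTotoPayload produced by validate_intoto_payload:

from macaron.provenance import ProvenanceAsset

asset = ProvenanceAsset(
    payload=None,  # placeholder: the real code passes a validated InTotoPayload
    name="1a2b3c...",  # placeholder artifact hash
    url="https://api.github.com/repos/owner/repo/attestations/sha256:1a2b3c...",
)
print(asset.url)  # the explicit provenance link that now reaches the reports
# The dataclass is frozen, so asset.url = "..." would raise FrozenInstanceError.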
Signed-off-by: behnazh-w --- src/macaron/provenance/__init__.py | 13 ++++++++++++ src/macaron/provenance/provenance_finder.py | 12 +---------- src/macaron/slsa_analyzer/analyzer.py | 3 ++- .../slsa_analyzer/git_service/api_client.py | 12 ++++++----- .../slsa_analyzer/git_service/github.py | 21 ++++++++++++------- 5 files changed, 36 insertions(+), 25 deletions(-) diff --git a/src/macaron/provenance/__init__.py b/src/macaron/provenance/__init__.py index a99afa31c..7e3c5a63b 100644 --- a/src/macaron/provenance/__init__.py +++ b/src/macaron/provenance/__init__.py @@ -2,3 +2,16 @@ # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This package contains the provenance tools for software components.""" + +from dataclasses import dataclass + +from macaron.slsa_analyzer.provenance.intoto import InTotoPayload + + +@dataclass(frozen=True) +class ProvenanceAsset: + """This class exists to hold a provenance payload with the original asset's name and URL.""" + + payload: InTotoPayload + name: str + url: str diff --git a/src/macaron/provenance/provenance_finder.py b/src/macaron/provenance/provenance_finder.py index 4935ca62d..0c1385d0f 100644 --- a/src/macaron/provenance/provenance_finder.py +++ b/src/macaron/provenance/provenance_finder.py @@ -6,7 +6,6 @@ import logging import os import tempfile -from dataclasses import dataclass from functools import partial from packageurl import PackageURL @@ -14,6 +13,7 @@ from macaron.artifact.local_artifact import get_local_artifact_hash from macaron.config.defaults import defaults +from macaron.provenance import ProvenanceAsset from macaron.repo_finder.commit_finder import AbstractPurlType, determine_abstract_purl_type from macaron.repo_finder.repo_finder_deps_dev import DepsDevRepoFinder from macaron.repo_finder.repo_utils import get_repo_tags @@ -30,7 +30,6 @@ ) from macaron.slsa_analyzer.package_registry.npm_registry import NPMAttestationAsset from macaron.slsa_analyzer.package_registry.pypi_registry import find_or_create_pypi_asset -from macaron.slsa_analyzer.provenance.intoto import InTotoPayload from macaron.slsa_analyzer.provenance.intoto.errors import LoadIntotoAttestationError from macaron.slsa_analyzer.provenance.loader import load_provenance_payload from macaron.slsa_analyzer.provenance.slsa import SLSAProvenanceData @@ -41,15 +40,6 @@ logger: logging.Logger = logging.getLogger(__name__) -@dataclass(frozen=True) -class ProvenanceAsset: - """This class exists to hold a provenance payload with the original asset's name and URL.""" - - payload: InTotoPayload - name: str - url: str - - class ProvenanceFinder: """This class is used to find and retrieve provenance files from supported registries.""" diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py index 31b4f0937..a76e45e1b 100644 --- a/src/macaron/slsa_analyzer/analyzer.py +++ b/src/macaron/slsa_analyzer/analyzer.py @@ -527,9 +527,10 @@ def run_single( # Try to discover GitHub attestation for the target software component. 
artifact_hash = get_artifact_hash(parsed_purl, local_artifact_dirs, package_registries_info) if artifact_hash: - provenance_payload = git_service.get_attestation_payload( + provenance_asset = git_service.get_attestation( analyze_ctx.component.repository.full_name, artifact_hash ) + provenance_payload = provenance_asset.payload if provenance_asset else None if provenance_payload: try: provenance_repo_url, provenance_commit_digest = extract_repo_and_commit_from_provenance( diff --git a/src/macaron/slsa_analyzer/git_service/api_client.py b/src/macaron/slsa_analyzer/git_service/api_client.py index 98012fae8..9921c2dc9 100644 --- a/src/macaron/slsa_analyzer/git_service/api_client.py +++ b/src/macaron/slsa_analyzer/git_service/api_client.py @@ -648,8 +648,8 @@ def download_asset(self, url: str, download_path: str) -> bool: return download_file_with_size_limit(url, headers, download_path, timeout, size_limit) - def get_attestation(self, full_name: str, artifact_hash: str) -> dict: - """Download and return the attestation associated with the passed artifact hash, if any. + def get_attestation(self, full_name: str, artifact_hash: str) -> tuple[str | None, dict]: + """Download and return the attestation url and content associated with the passed artifact hash, if any. Parameters ---------- @@ -660,12 +660,14 @@ def get_attestation(self, full_name: str, artifact_hash: str) -> dict: Returns ------- - dict - The attestation data, or an empty dict if not found. + tuple[str|None,dict] + The attestation url and data, or None and an empty dict if not found. """ url = f"{GhAPIClient._REPO_END_POINT}/{full_name}/attestations/sha256:{artifact_hash}" response_data = send_get_http(url, self.headers) - return response_data or {} + if not response_data: + return (None, {}) + return (url, response_data) def get_default_gh_client(access_token: str) -> GhAPIClient: diff --git a/src/macaron/slsa_analyzer/git_service/github.py b/src/macaron/slsa_analyzer/git_service/github.py index ff7ecc593..d5e1c8548 100644 --- a/src/macaron/slsa_analyzer/git_service/github.py +++ b/src/macaron/slsa_analyzer/git_service/github.py @@ -9,10 +9,11 @@ from macaron.config.global_config import global_config from macaron.errors import ConfigurationError, RepoCheckOutError from macaron.json_tools import json_extract +from macaron.provenance import ProvenanceAsset from macaron.slsa_analyzer import git_url from macaron.slsa_analyzer.git_service.api_client import GhAPIClient, get_default_gh_client from macaron.slsa_analyzer.git_service.base_git_service import BaseGitService -from macaron.slsa_analyzer.provenance.intoto import InTotoPayload, ValidateInTotoPayloadError, validate_intoto_payload +from macaron.slsa_analyzer.provenance.intoto import ValidateInTotoPayloadError, validate_intoto_payload from macaron.slsa_analyzer.provenance.loader import decode_provenance logger: logging.Logger = logging.getLogger(__name__) @@ -96,7 +97,7 @@ def check_out_repo(self, git_obj: Git, branch: str, digest: str, offline_mode: b return git_obj - def get_attestation_payload(self, repository_name: str, artifact_hash: str) -> InTotoPayload | None: + def get_attestation(self, repository_name: str, artifact_hash: str) -> ProvenanceAsset | None: """Get the GitHub attestation associated with the given PURL, or None if it cannot be found. 
The schema of GitHub attestation can be found on the API page: @@ -111,12 +112,12 @@ def get_attestation_payload(self, repository_name: str, artifact_hash: str) -> I Returns ------- - InTotoPayload | None - The attestation payload, if found. + ProvenanceAsset | None + The provenance asset, if found. """ - git_attestation_dict = self.api_client.get_attestation(repository_name, artifact_hash) + attestation_url, git_attestation_dict = self.api_client.get_attestation(repository_name, artifact_hash) - if not git_attestation_dict: + if not attestation_url or not git_attestation_dict: return None git_attestation_list = json_extract(git_attestation_dict, ["attestations"], list) @@ -124,9 +125,13 @@ def get_attestation_payload(self, repository_name: str, artifact_hash: str) -> I return None payload = decode_provenance(git_attestation_list[0]) - + validated_payload = None try: - return validate_intoto_payload(payload) + validated_payload = validate_intoto_payload(payload) except ValidateInTotoPayloadError as error: logger.debug("Invalid attestation payload: %s", error) return None + if not validated_payload: + return None + + return ProvenanceAsset(validated_payload, artifact_hash, attestation_url) From 2d54593de3098777db2ac5df0d941885561a1fd9 Mon Sep 17 00:00:00 2001 From: Behnaz Hassanshahi Date: Wed, 28 Jan 2026 14:24:13 +1000 Subject: [PATCH 10/20] chore(deps): update semgrep to v1.149.0 and ignore GHSA-7gcm-g887-7qv7 temporarily (#1290) This PR updates semgrep to v1.149.0 and temporarily ignores GHSA-7gcm-g887-7qv7 until the fix is available. Signed-off-by: behnazh-w --- Makefile | 5 +++-- pyproject.toml | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 173298790..83b0d2d5c 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2022 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. # Use bash as the shell when executing a rule's recipe. For more details: @@ -274,12 +274,13 @@ requirements.txt: pyproject.toml # editable mode (like the one in development here) because they may not have # a PyPI entry; also print out CVE description and potential fixes if audit # found an issue. +# Ignoring GHSA-7gcm-g887-7qv7: remove the exception when a fix is available. .PHONY: audit audit: if ! $$(python -c "import pip_audit" &> /dev/null); then \ echo "No package pip_audit installed, upgrade your environment!" && exit 1; \ fi; - python -m pip_audit --skip-editable --desc on --fix --dry-run + python -m pip_audit --skip-editable --desc on --fix --dry-run --ignore-vuln GHSA-7gcm-g887-7qv7 # Run some or all checks over the package code base. .PHONY: check check-code check-bandit check-flake8 check-lint check-mypy check-go check-actionlint diff --git a/pyproject.toml b/pyproject.toml index 65fd534dc..0c0f16641 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2022 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
# https://flit.pypa.io/en/latest/pyproject_toml.html @@ -37,7 +37,7 @@ dependencies = [ "beautifulsoup4 >= 4.12.0,<5.0.0", "problog >= 2.2.6,<3.0.0", "cryptography >=44.0.0,<45.0.0", - "semgrep == 1.113.0", + "semgrep == 1.149.0", "email-validator >=2.2.0,<3.0.0", "rich >=13.5.3,<15.0.0", "lark >= 1.3.0,<2.0.0", From aebdf78e7eab15e5c7ac510f741fd9e7dcb0943f Mon Sep 17 00:00:00 2001 From: Abhinav Pradeep Date: Wed, 4 Feb 2026 15:25:50 +0530 Subject: [PATCH 11/20] feat: include has_binaries flag in build spec (#1278) Signed-off-by: Abhinav Pradeep --- .../common_spec/base_spec.py | 5 +- .../common_spec/pypi_spec.py | 111 ++++++++++-------- .../dockerfile/pypi_dockerfile_output.py | 2 + src/macaron/errors.py | 6 +- .../package_registry/pypi_registry.py | 62 +++++++++- .../dockerfile/test_pypi_dockerfile_output.py | 3 +- .../expected_default.buildspec | 3 +- .../expected_default.buildspec | 3 +- .../pypi_toga/expected_default.buildspec | 1 + .../expected_default.buildspec | 26 ++++ .../cases/pypi_tree-sitter/test.yaml | 38 ++++++ 11 files changed, 199 insertions(+), 61 deletions(-) create mode 100644 tests/integration/cases/pypi_tree-sitter/expected_default.buildspec create mode 100644 tests/integration/cases/pypi_tree-sitter/test.yaml diff --git a/src/macaron/build_spec_generator/common_spec/base_spec.py b/src/macaron/build_spec_generator/common_spec/base_spec.py index c567609f7..698a0b948 100644 --- a/src/macaron/build_spec_generator/common_spec/base_spec.py +++ b/src/macaron/build_spec_generator/common_spec/base_spec.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2025 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module includes base build specification and helper classes.""" @@ -81,6 +81,9 @@ class BaseBuildSpecDict(TypedDict, total=False): #: be a list of these that were used in building the wheel alongside their version. build_backends: NotRequired[list[str]] + #: Flag to indicate if the artifact includes binaries. + has_binaries: NotRequired[bool] + class BaseBuildSpec(ABC): """Abstract base class for build specification behavior and field resolution.""" diff --git a/src/macaron/build_spec_generator/common_spec/pypi_spec.py b/src/macaron/build_spec_generator/common_spec/pypi_spec.py index d9bfd4b82..0471afd72 100644 --- a/src/macaron/build_spec_generator/common_spec/pypi_spec.py +++ b/src/macaron/build_spec_generator/common_spec/pypi_spec.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2025 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
"""This module includes build specification and helper classes for PyPI packages.""" @@ -14,10 +14,9 @@ from packaging.specifiers import InvalidSpecifier from packaging.utils import InvalidWheelFilename, parse_wheel_filename -from macaron.build_spec_generator.build_command_patcher import CLI_COMMAND_PATCHES, patch_commands from macaron.build_spec_generator.common_spec.base_spec import BaseBuildSpec, BaseBuildSpecDict from macaron.config.defaults import defaults -from macaron.errors import GenerateBuildSpecError, SourceCodeError +from macaron.errors import GenerateBuildSpecError, SourceCodeError, WheelTagError from macaron.json_tools import json_extract from macaron.slsa_analyzer.package_registry import pypi_registry from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo @@ -114,9 +113,9 @@ def resolve_fields(self, purl: PackageURL) -> None: pypi_package_json = pypi_registry.find_or_create_pypi_asset(purl.name, purl.version, registry_info) patched_build_commands: list[list[str]] = [] - build_requires_set: set[str] = set() build_backends_set: set[str] = set() parsed_build_requires: dict[str, str] = {} + sdist_build_requires: dict[str, str] = {} python_version_set: set[str] = set() wheel_name_python_version_list: list[str] = [] wheel_name_platforms: set[str] = set() @@ -134,8 +133,16 @@ def resolve_fields(self, purl: PackageURL) -> None: if py_version := json_extract(release, ["requires_python"], str): python_version_set.add(py_version.replace(" ", "")) + self.data["has_binaries"] = not pypi_package_json.has_pure_wheel() + + if self.data["has_binaries"]: + logger.debug("Can not find a pure wheel") + else: + logger.debug("Found pure wheel matching this PURL") + try: - with pypi_package_json.wheel(): + # The wheel function handles downloading binaries in the case that we cannot find a pure wheel. + with pypi_package_json.wheel(download_binaries=self.data["has_binaries"]): logger.debug("Wheel at %s", pypi_package_json.wheel_path) # Should only have .dist-info directory. logger.debug("It has directories %s", ",".join(os.listdir(pypi_package_json.wheel_path))) @@ -165,8 +172,10 @@ def resolve_fields(self, purl: PackageURL) -> None: logger.debug(python_version_set) except InvalidWheelFilename: logger.debug("Could not parse wheel file name to extract version") + except WheelTagError: + logger.debug("Can not analyze non-pure wheels") except SourceCodeError: - logger.debug("Could not find pure wheel matching this PURL") + logger.debug("Could not download wheel matching this PURL") logger.debug("From .dist_info:") logger.debug(parsed_build_requires) @@ -179,27 +188,32 @@ def resolve_fields(self, purl: PackageURL) -> None: content = tomli.loads(pyproject_content.decode("utf-8")) requires = json_extract(content, ["build-system", "requires"], list) if requires: - build_requires_set.update(elem.replace(" ", "") for elem in requires) + for requirement in requires: + self.add_parsed_requirement(sdist_build_requires, requirement) # If we cannot find `requires` in `[build-system]`, we lean on the fact that setuptools # was the de-facto build tool, and infer a setuptools version to include. 
else: - build_requires_set.add(f"setuptools=={chronologically_likeliest_version}") + self.add_parsed_requirement( + sdist_build_requires, f"setuptools=={chronologically_likeliest_version}" + ) backend = json_extract(content, ["build-system", "build-backend"], str) if backend: build_backends_set.add(backend.replace(" ", "")) python_version_constraint = json_extract(content, ["project", "requires-python"], str) if python_version_constraint: python_version_set.add(python_version_constraint.replace(" ", "")) - self.apply_tool_specific_inferences(build_requires_set, python_version_set, content) + self.apply_tool_specific_inferences(sdist_build_requires, python_version_set, content) logger.debug( "After analyzing pyproject.toml from the sdist: build-requires: %s, build_backend: %s", - build_requires_set, + sdist_build_requires, build_backends_set, ) # Here we have successfully analyzed the pyproject.toml file. Now, if we have a setup.py/cfg, # we also need to infer a setuptools version to infer. if pypi_package_json.file_exists("setup.py") or pypi_package_json.file_exists("setup.cfg"): - build_requires_set.add(f"setuptools=={chronologically_likeliest_version}") + self.add_parsed_requirement( + sdist_build_requires, f"setuptools=={chronologically_likeliest_version}" + ) except TypeError as error: logger.debug( "Found a type error while reading the pyproject.toml file from the sdist: %s", error @@ -210,26 +224,23 @@ def resolve_fields(self, purl: PackageURL) -> None: logger.debug("No pyproject.toml found: %s", error) # Here we do not have a pyproject.toml file. Instead, we lean on the fact that setuptools # was the de-facto build tool, and infer a setuptools version to include. - build_requires_set.add(f"setuptools=={chronologically_likeliest_version}") + self.add_parsed_requirement( + sdist_build_requires, f"setuptools=={chronologically_likeliest_version}" + ) except SourceCodeError as error: logger.debug("No source distribution found: %s", error) + logger.debug("After complete analysis of the sdist:") + logger.debug(sdist_build_requires) + # Merge in pyproject.toml information only when the wheel dist_info does not contain the same. # Hatch is an interesting example of this merge being required. - for requirement in build_requires_set: - try: - parsed_requirement = Requirement(requirement) - if parsed_requirement.name not in parsed_build_requires: - parsed_build_requires[parsed_requirement.name] = str(parsed_requirement.specifier) - except (InvalidRequirement, InvalidSpecifier) as error: - logger.debug("Malformed requirement encountered %s : %s", requirement, error) + for requirement_name, specifier in sdist_build_requires.items(): + if requirement_name not in parsed_build_requires: + parsed_build_requires[requirement_name] = specifier self.data["language_version"] = list(python_version_set) or wheel_name_python_version_list - # Use the default build command for pure Python packages. - if "any" in wheel_name_platforms: - patched_build_commands = self.get_default_build_commands(self.data["build_tools"]) - # If we were not able to find any build and backends, use the default setuptools. 
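The merge above is deliberately first-wins. A toy illustration with made-up requirement names, showing that entries parsed from the wheel's .dist-info take precedence over sdist-derived ones:

parsed_build_requires = {"hatchling": ">=1.24"}        # from the wheel's .dist-info
sdist_build_requires = {"hatchling": "", "wheel": ""}  # from pyproject.toml in the sdist

for requirement_name, specifier in sdist_build_requires.items():
    if requirement_name not in parsed_build_requires:
        parsed_build_requires[requirement_name] = specifier

print(parsed_build_requires)  # {'hatchling': '>=1.24', 'wheel': ''}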
if not parsed_build_requires: parsed_build_requires["setuptools"] = "==" + defaults.get("heuristic.pypi", "default_setuptools") @@ -239,44 +250,39 @@ def resolve_fields(self, purl: PackageURL) -> None: logger.debug("Combined build-requires: %s", parsed_build_requires) self.data["build_requires"] = parsed_build_requires self.data["build_backends"] = list(build_backends_set) + # We do not generate a build command for non-pure packages + if not self.data["has_binaries"]: + patched_build_commands = self.get_default_build_commands(self.data["build_tools"]) + self.data["build_commands"] = patched_build_commands - if not patched_build_commands: - # Resolve and patch build commands. - - # To ensure that selected_build_commands is never empty, we seed with the fallback - # command of python -m build --wheel -n - if self.data["build_commands"]: - selected_build_commands = self.data["build_commands"] - else: - self.data["build_commands"] = ["python -m build --wheel -n".split()] - selected_build_commands = ( - self.get_default_build_commands(self.data["build_tools"]) or self.data["build_commands"] - ) - - logger.debug(selected_build_commands) - - patched_build_commands = ( - patch_commands( - cmds_sequence=selected_build_commands, - patches=CLI_COMMAND_PATCHES, - ) - or [] - ) - if not patched_build_commands: - raise GenerateBuildSpecError(f"Failed to patch command sequences {selected_build_commands}.") + def add_parsed_requirement(self, build_requirements: dict[str, str], requirement: str) -> None: + """ + Parse a requirement string and add it to build_requirements, doing appropriate error handling. - self.data["build_commands"] = patched_build_commands + Parameters + ---------- + build_requirements: dict[str,str] + Dictionary of build requirements to populate. + requirement: str + Requirement string to parse. + """ + try: + parsed_requirement = Requirement(requirement) + if parsed_requirement.name not in build_requirements: + build_requirements[parsed_requirement.name] = str(parsed_requirement.specifier) + except (InvalidRequirement, InvalidSpecifier) as error: + logger.debug("Malformed requirement encountered %s : %s", requirement, error) def apply_tool_specific_inferences( - self, build_requires_set: set[str], python_version_set: set[str], pyproject_contents: dict[str, Any] + self, build_requirements: dict[str, str], python_version_set: set[str], pyproject_contents: dict[str, Any] ) -> None: """ Based on build tools inferred, look into the pyproject.toml for related additional dependencies. Parameters ---------- - build_requires_set: set[str] - Set of build requirements to populate. + build_requirements: dict[str,str] + Dictionary of build requirements to populate. python_version_set: set[str] Set of compatible interpreter versions to populate. pyproject_contents: dict[str, Any] @@ -291,7 +297,8 @@ def apply_tool_specific_inferences( for _, section in hatch_build_hooks.items(): dependencies = section.get("dependencies") if dependencies: - build_requires_set.update(elem.replace(" ", "") for elem in dependencies) + for requirement in dependencies: + self.add_parsed_requirement(build_requirements, requirement) # If we have flit as a build_tool, we will check if the legacy header [tool.flit.metadata] exists, # and if so, check to see if we can use its "requires-python". 
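add_parsed_requirement is a thin wrapper over packaging's Requirement parser. A stand-alone sketch with illustrative requirement strings (the real method also catches and logs InvalidSpecifier):

from packaging.requirements import InvalidRequirement, Requirement

build_requirements: dict[str, str] = {}
for raw in ("setuptools >= 43", "flit_core <4,>=3.4", "not a requirement !!"):
    try:
        parsed = Requirement(raw)
        if parsed.name not in build_requirements:
            build_requirements[parsed.name] = str(parsed.specifier)
    except InvalidRequirement:
        pass  # malformed requirements are only logged in the real code

print(build_requirements)  # {'setuptools': '>=43', 'flit_core': '<4,>=3.4'}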
if "flit" in self.data["build_tools"]: diff --git a/src/macaron/build_spec_generator/dockerfile/pypi_dockerfile_output.py b/src/macaron/build_spec_generator/dockerfile/pypi_dockerfile_output.py index 457cfe15c..adb956346 100644 --- a/src/macaron/build_spec_generator/dockerfile/pypi_dockerfile_output.py +++ b/src/macaron/build_spec_generator/dockerfile/pypi_dockerfile_output.py @@ -34,6 +34,8 @@ def gen_dockerfile(buildspec: BaseBuildSpecDict) -> str: GenerateBuildSpecError Raised if dockerfile cannot be generated. """ + if buildspec["has_binaries"]: + raise GenerateBuildSpecError("We currently do not support generating a dockerfile for non-pure Python packages") language_version: str | None = pick_specific_version(buildspec) if language_version is None: raise GenerateBuildSpecError("Could not derive specific interpreter version") diff --git a/src/macaron/errors.py b/src/macaron/errors.py index d088914de..569ec1817 100644 --- a/src/macaron/errors.py +++ b/src/macaron/errors.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module contains error classes for Macaron.""" @@ -129,3 +129,7 @@ class QueryMacaronDatabaseError(Exception): class GenerateBuildSpecError(Exception): """Happens when there is an unexpected error while generating the build spec file.""" + + +class WheelTagError(MacaronError): + """Happens when a Python wheel with unsupported tags is requested for analysis.""" diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py index ce8630d37..e11c8260a 100644 --- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """The module provides abstractions for the pypi package registry.""" @@ -22,9 +22,10 @@ import requests from bs4 import BeautifulSoup, Tag +from packaging.utils import InvalidWheelFilename, parse_wheel_filename from macaron.config.defaults import defaults -from macaron.errors import ConfigurationError, InvalidHTTPResponseError, SourceCodeError +from macaron.errors import ConfigurationError, InvalidHTTPResponseError, SourceCodeError, WheelTagError from macaron.json_tools import json_extract from macaron.malware_analyzer.datetime_parser import parse_datetime from macaron.slsa_analyzer.package_registry.package_registry import PackageRegistry @@ -863,8 +864,29 @@ def get_latest_release_upload_time(self) -> str | None: return None @contextmanager - def wheel(self) -> Generator[None]: - """Download and cleanup wheel of the package with a context manager.""" + def wheel(self, download_binaries: bool) -> Generator[None]: + """Download and cleanup wheel of the package with a context manager. + + Parameters + ---------- + download_binaries: bool + Whether or not to download a wheel with binaries. 
+ + Returns + ------- + Generator[None] + Generator that yields None and takes care of resource cleanup on + exiting the context in which it was called + + Raises + ------ + WheelTagError + If download_binaries is True + SourceCodeError + If we are unable to download the requested wheel + """ + if download_binaries: + raise WheelTagError("Macaron does not currently support analysis of non-pure Python wheels.") if not self.download_wheel(): raise SourceCodeError("Unable to download requested wheel.") yield @@ -889,6 +911,38 @@ def download_wheel(self) -> bool: logger.debug(error) return False + def has_pure_wheel(self) -> bool: + """Check whether the PURL has a pure wheel from its package json. + + Returns + ------- + bool + Whether the PURL has a pure wheel or not. + """ + if self.component_version: + urls = json_extract(self.package_json, ["releases", self.component_version], list) + else: + # Get the latest version. + urls = json_extract(self.package_json, ["urls"], list) + if not urls: + return False + for distribution in urls: + file_name: str = distribution.get("filename") or "" + # Parse out and check none and any + # Catch exceptions + try: + _, _, _, tags = parse_wheel_filename(file_name) + # Check if none and any are in the tags (i.e. the wheel is pure) + # Technically a wheel can have multiple tag sets. Our condition for + # a pure wheel is that it has only one tag set with abi "none" and + # platform "any" + if len(tags) == 1 and all(tag.abi == "none" and tag.platform == "any" for tag in tags): + return True + except InvalidWheelFilename: + logger.debug("Could not parse wheel name.") + return False + return False + @contextmanager def sourcecode(self) -> Generator[None]: """Download and cleanup source code of the package with a context manager.""" diff --git a/tests/build_spec_generator/dockerfile/test_pypi_dockerfile_output.py b/tests/build_spec_generator/dockerfile/test_pypi_dockerfile_output.py index b62ea049a..c8d4d8882 100644 --- a/tests/build_spec_generator/dockerfile/test_pypi_dockerfile_output.py +++ b/tests/build_spec_generator/dockerfile/test_pypi_dockerfile_output.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2025 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
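has_pure_wheel ultimately reduces to a single predicate over the parsed wheel tags. A self-contained sketch of that check, with illustrative filenames:

from packaging.utils import InvalidWheelFilename, parse_wheel_filename

def looks_pure(filename: str) -> bool:
    try:
        _, _, _, tags = parse_wheel_filename(filename)
    except InvalidWheelFilename:
        return False
    # A pure wheel carries exactly one tag set, with abi "none" and platform "any".
    return len(tags) == 1 and all(tag.abi == "none" and tag.platform == "any" for tag in tags)

print(looks_pure("cachetools-6.2.1-py3-none-any.whl"))                        # True
print(looks_pure("tree_sitter-0.25.2-cp310-abi3-manylinux_2_17_x86_64.whl"))  # False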
""" @@ -27,6 +27,7 @@ def fixture_base_build_spec() -> BaseBuildSpecDict: "ecosystem": "pypi", "purl": "pkg:pypi/cachetools@6.2.1", "language": "python", + "has_binaries": False, "build_tools": ["pip"], "build_commands": [["python", "-m", "build"]], "build_requires": {"setuptools": "==80.9.0", "wheel": ""}, diff --git a/tests/integration/cases/pypi_cachetools/expected_default.buildspec b/tests/integration/cases/pypi_cachetools/expected_default.buildspec index 0b5d8acfa..2a05c0e95 100644 --- a/tests/integration/cases/pypi_cachetools/expected_default.buildspec +++ b/tests/integration/cases/pypi_cachetools/expected_default.buildspec @@ -1,5 +1,5 @@ { - "macaron_version": "0.18.0", + "macaron_version": "0.20.0", "group_id": null, "artifact_id": "cachetools", "version": "6.2.1", @@ -24,6 +24,7 @@ "-n" ] ], + "has_binaries": false, "build_requires": { "setuptools": "==80.9.0", "wheel": "" diff --git a/tests/integration/cases/pypi_markdown-it-py/expected_default.buildspec b/tests/integration/cases/pypi_markdown-it-py/expected_default.buildspec index e7842d046..e610ee866 100644 --- a/tests/integration/cases/pypi_markdown-it-py/expected_default.buildspec +++ b/tests/integration/cases/pypi_markdown-it-py/expected_default.buildspec @@ -1,5 +1,5 @@ { - "macaron_version": "0.18.0", + "macaron_version": "0.20.0", "group_id": null, "artifact_id": "markdown-it-py", "version": "4.0.0", @@ -21,6 +21,7 @@ "build" ] ], + "has_binaries": false, "build_requires": { "flit": "==3.12.0", "flit_core": "<4,>=3.4" diff --git a/tests/integration/cases/pypi_toga/expected_default.buildspec b/tests/integration/cases/pypi_toga/expected_default.buildspec index 819113207..875523655 100644 --- a/tests/integration/cases/pypi_toga/expected_default.buildspec +++ b/tests/integration/cases/pypi_toga/expected_default.buildspec @@ -24,6 +24,7 @@ "-n" ] ], + "has_binaries": false, "build_requires": { "setuptools": "==80.3.1", "setuptools_dynamic_dependencies": "==1.0.0", diff --git a/tests/integration/cases/pypi_tree-sitter/expected_default.buildspec b/tests/integration/cases/pypi_tree-sitter/expected_default.buildspec new file mode 100644 index 000000000..5eccc6d34 --- /dev/null +++ b/tests/integration/cases/pypi_tree-sitter/expected_default.buildspec @@ -0,0 +1,26 @@ +{ + "macaron_version": "0.20.0", + "group_id": null, + "artifact_id": "tree-sitter", + "version": "0.25.2", + "git_repo": "https://github.com/tree-sitter/py-tree-sitter", + "git_tag": "e2a5b21449c30c6a4fb49a55567a4699c3271f10", + "newline": "lf", + "language_version": [ + ">=3.10" + ], + "ecosystem": "pypi", + "purl": "pkg:pypi/tree-sitter@0.25.2", + "language": "python", + "build_tools": [ + "pip" + ], + "build_commands": [], + "has_binaries": true, + "build_requires": { + "setuptools": ">=43" + }, + "build_backends": [ + "setuptools.build_meta" + ] +} diff --git a/tests/integration/cases/pypi_tree-sitter/test.yaml b/tests/integration/cases/pypi_tree-sitter/test.yaml new file mode 100644 index 000000000..13cf9d7d7 --- /dev/null +++ b/tests/integration/cases/pypi_tree-sitter/test.yaml @@ -0,0 +1,38 @@ +# Copyright (c) 2025 - 2026, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +description: | + Test buildspec generation for a non-pure wheel. 
+ +tags: +- macaron-python-package +- tutorial + +steps: +- name: Run macaron analyze + kind: analyze + options: + command_args: + - -purl + - pkg:pypi/tree-sitter@0.25.2 +- name: Generate the buildspec + kind: gen-build-spec + options: + command_args: + - -purl + - pkg:pypi/tree-sitter@0.25.2 +- name: Compare Buildspec. + kind: compare + options: + kind: default_build_spec + result: output/buildspec/pypi/tree-sitter/macaron.buildspec + expected: expected_default.buildspec +- name: Generate the buildspec + kind: gen-build-spec + expect_fail: true + options: + command_args: + - -purl + - pkg:pypi/tree-sitter@0.25.2 + - --output-format + - dockerfile From f1d9ac405621b3430d73df6db912ff935210c739 Mon Sep 17 00:00:00 2001 From: Behnaz Hassanshahi Date: Fri, 6 Feb 2026 14:16:57 +1000 Subject: [PATCH 12/20] refactor!: improve Macaron wheel name and add a new install script (#1291) This PR introduces breaking changes by improving the Macaron wheel naming for clearer platform and architecture identification, and by adding a new installation script. It also makes the slsa-verifier installation optional and adjusts dependency handling for security advisories. Signed-off-by: behnazh-w --- CONTRIBUTING.md | 6 + Makefile | 95 +++++++++----- docker/Dockerfile.final | 9 ++ docs/source/pages/installation.rst | 53 +++++++- .../release_scripts/install_macaron_python.sh | 123 ++++++++++++++++++ src/macaron/provenance/provenance_verifier.py | 35 ++++- .../micronaut-test.dl | 3 +- .../test.yaml | 6 +- .../cases/ossf_scorecard/test.yaml | 3 +- .../cases/urllib3_expectation_dir/test.yaml | 6 +- .../cases/urllib3_expectation_file/test.yaml | 3 +- .../urllib3_invalid_expectation/test.yaml | 3 +- 12 files changed, 290 insertions(+), 55 deletions(-) create mode 100755 scripts/release_scripts/install_macaron_python.sh diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7e003927e..78e293085 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -116,6 +116,12 @@ make setup **Note**: Running the above command will prompt you for sudo access to install [Soufflé Datalog engine](https://github.com/souffle-lang/souffle). You can install Soufflé on your system before running `make setup` to avoid getting prompted. +**Note**: The [slsa-verifier](https://github.com/slsa-framework/slsa-verifier) dependency needs to be installed separately using the following command. This dependency is only used to verify some provenances, so you might not always need it for development. + +```bash +make install-slsa-verifier +``` + With that in place, you’re ready to build and contribute to Macaron! ### Updating dependent packages diff --git a/Makefile b/Makefile index 83b0d2d5c..1209a07b3 100644 --- a/Makefile +++ b/Makefile @@ -5,12 +5,35 @@ # https://www.gnu.org/software/make/manual/html_node/Choosing-the-Shell.html SHELL := bash -# Set the package's name, version, and path for use throughout the Makefile. +# Set the package's name and version for use throughout the Makefile. PACKAGE_NAME := macaron PACKAGE_VERSION := $(shell python -c $$'try: import $(PACKAGE_NAME); print($(PACKAGE_NAME).__version__);\nexcept: print("unknown");') + +# Determine the OS, architecture, and number of cores.
+OS := $(shell uname -s) +ifeq ($(OS),Darwin) + PLATFORM_NAME := macosx + OS_DISTRO := "Darwin" +else + ifeq ($(OS),Linux) + PLATFORM_NAME := linux + OS_DISTRO := "$(shell grep '^NAME=' /etc/os-release | sed 's/^NAME=//' | sed 's/"//g')" + OS_MAJOR_VERSION := "$(shell grep '^VERSION=' /etc/os-release | sed -r 's/^[^0-9]+([0-9]+)\..*/\1/')" + endif +endif +ARCH := $(shell uname -m) +NPROC := $(shell nproc) + +# Construct short package identifier. +PACKAGE_SDIST_NAME := $(PACKAGE_NAME)-$(PACKAGE_VERSION) + +# Construct full package identifier. +PACKAGE_WHEEL_DIST_NAME := $(PACKAGE_NAME)-$(PACKAGE_VERSION)-py3-none-$(PLATFORM_NAME)_$(ARCH) + +# Set the Python version, package, and repo paths. +PYTHON ?= python3.11 PACKAGE_PATH := $(shell pwd)/src/$(PACKAGE_NAME) REPO_PATH := $(shell pwd) -PYTHON ?= python3.11 # This variable contains the first goal that matches any of the listed goals # here, else it contains an empty string. The net effect is to filter out @@ -93,26 +116,28 @@ setup: force-upgrade setup-go setup-binaries setup-schemastore go install github.com/CycloneDX/cyclonedx-gomod/cmd/cyclonedx-gomod@v1.3.0 setup-go: go build -o $(PACKAGE_PATH)/bin/ $(REPO_PATH)/golang/cmd/... -setup-binaries: $(PACKAGE_PATH)/bin/slsa-verifier souffle gnu-sed +setup-binaries: souffle gnu-sed -# Install SLSA Verifier. +# Install SLSA Verifier if not already installed. +# Get the checksum from https://github.com/slsa-framework/slsa-verifier/blob/main/SHA256SUM.md. SLSA_VERIFIER_TAG := v2.7.1 SLSA_VERIFIER_BIN := slsa-verifier-linux-amd64 -SLSA_VERIFIER_BIN_PATH := $(PACKAGE_PATH)/bin/$(SLSA_VERIFIER_BIN) -SLSA_VERIFIER_PROVENANCE := $(SLSA_VERIFIER_BIN).intoto.jsonl -SLSA_VERIFIER_PROVENANCE_PATH := $(PACKAGE_PATH)/bin/$(SLSA_VERIFIER_PROVENANCE) - -$(PACKAGE_PATH)/bin/slsa-verifier: - mkdir -p $(PACKAGE_PATH)/bin \ - && wget -O $(PACKAGE_PATH)/bin/slsa-verifier https://github.com/slsa-framework/slsa-verifier/releases/download/$(SLSA_VERIFIER_TAG)/$(SLSA_VERIFIER_BIN) \ - && wget -O $(SLSA_VERIFIER_PROVENANCE_PATH) https://github.com/slsa-framework/slsa-verifier/releases/download/$(SLSA_VERIFIER_TAG)/$(SLSA_VERIFIER_PROVENANCE) \ - && chmod +x $(PACKAGE_PATH)/bin/slsa-verifier \ - && EXPECTED_HASH=$$(jq -r '.payload' $(SLSA_VERIFIER_PROVENANCE_PATH) | base64 -d | jq -r '.subject[] | select(.name == "$(SLSA_VERIFIER_BIN)") | .digest.sha256') \ - && ACTUAL_HASH=$$(sha256sum $(PACKAGE_PATH)/bin/slsa-verifier | awk '{print $$1}'); \ - if [ "$$EXPECTED_HASH" != "$$ACTUAL_HASH" ]; then \ - echo "Hash mismatch: expected $$EXPECTED_HASH, got $$ACTUAL_HASH"; \ - exit 1; \ - fi +SLSA_VERIFIER_BIN_PATH := $(HOME)/.local/bin +SLSA_VERIFIER_CHECKSUM := 946dbec729094195e88ef78e1734324a27869f03e2c6bd2f61cbc06bd5350339 +.PHONY: install-slsa-verifier +install-slsa-verifier: + if ! command -v slsa-verifier >/dev/null 2>&1; then \ + mkdir -p $(SLSA_VERIFIER_BIN_PATH) \ + && curl --fail -L -o $(SLSA_VERIFIER_BIN_PATH)/slsa-verifier https://github.com/slsa-framework/slsa-verifier/releases/download/$(SLSA_VERIFIER_TAG)/$(SLSA_VERIFIER_BIN) \ + && SLSA_VERIFIER_COMPUTED_HASH=$$(sha256sum $(SLSA_VERIFIER_BIN_PATH)/slsa-verifier | cut -d' ' -f1) \ + && if [ $$SLSA_VERIFIER_COMPUTED_HASH != $(SLSA_VERIFIER_CHECKSUM) ]; then \ + echo "slsa-verifier checksum could not be verified. Removing slsa-verifier binary and exiting." 
>&2 \ + && rm -f ${SLSA_VERIFIER_BIN_PATH}/slsa-verifier \ + && exit 1; \ + fi; \ + chmod +x $(SLSA_VERIFIER_BIN_PATH)/slsa-verifier \ + && command -v $(SLSA_VERIFIER_BIN_PATH)/slsa-verifier; \ + fi; # Set up schemastore for GitHub Actions specs. setup-schemastore: $(PACKAGE_PATH)/resources/schemastore/github-workflow.json $(PACKAGE_PATH)/resources/schemastore/LICENSE $(PACKAGE_PATH)/resources/schemastore/NOTICE @@ -238,8 +263,8 @@ setup-integration-test-utility-for-docker: # Generate a Software Bill of Materials (SBOM). .PHONY: sbom sbom: requirements - cyclonedx-py requirements --output-format json --output-file dist/$(PACKAGE_NAME)-$(PACKAGE_VERSION)-sbom.json - $$HOME/go/bin/cyclonedx-gomod mod -json -output dist/$(PACKAGE_NAME)-$(PACKAGE_VERSION)-sbom-go.json $(REPO_PATH) + cyclonedx-py requirements --output-format json --output-file dist/$(PACKAGE_WHEEL_DIST_NAME)-sbom.json + $$HOME/go/bin/cyclonedx-gomod mod -json -output dist/$(PACKAGE_WHEEL_DIST_NAME)-sbom-go.json $(REPO_PATH) # Generate a requirements.txt file containing version and integrity hashes for all # packages currently installed in the virtual environment. There's no easy way to @@ -261,26 +286,25 @@ requirements.txt: pyproject.toml [[ $$pkg =~ (.*)==(.*) ]] && curl -s https://pypi.org/pypi/$${BASH_REMATCH[1]}/$${BASH_REMATCH[2]}/json | python -c "import json, sys; print(''.join(f''' \\\\\n --hash=sha256:{pkg['digests']['sha256']}''' for pkg in json.load(sys.stdin)['urls']));" >> requirements.txt; \ done echo -e -n "$(PACKAGE_NAME)==$(PACKAGE_VERSION)" >> requirements.txt - if [ -f dist/$(PACKAGE_NAME)-$(PACKAGE_VERSION).tar.gz ]; then \ - echo -e -n " \\\\\n $$(python -m pip hash --algorithm sha256 dist/$(PACKAGE_NAME)-$(PACKAGE_VERSION).tar.gz | grep '^\-\-hash')" >> requirements.txt; \ + if [ -f dist/$(PACKAGE_SDIST_NAME).tar.gz ]; then \ + echo -e -n " \\\\\n $$(python -m pip hash --algorithm sha256 dist/$(PACKAGE_SDIST_NAME).tar.gz | grep '^\-\-hash')" >> requirements.txt; \ fi - if [ -f dist/$(PACKAGE_NAME)-$(PACKAGE_VERSION)-py3-none-any.whl ]; then \ - echo -e -n " \\\\\n $$(python -m pip hash --algorithm sha256 dist/$(PACKAGE_NAME)-$(PACKAGE_VERSION)-py3-none-any.whl | grep '^\-\-hash')" >> requirements.txt; \ + if [ -f dist/$(PACKAGE_WHEEL_DIST_NAME).whl ]; then \ + echo -e -n " \\\\\n $$(python -m pip hash --algorithm sha256 dist/$(PACKAGE_WHEEL_DIST_NAME).whl | grep '^\-\-hash')" >> requirements.txt; \ fi echo "" >> requirements.txt - cp requirements.txt dist/$(PACKAGE_NAME)-$(PACKAGE_VERSION)-requirements.txt + cp requirements.txt dist/$(PACKAGE_WHEEL_DIST_NAME)-requirements.txt # Audit the currently installed packages. Skip packages that are installed in # editable mode (like the one in development here) because they may not have # a PyPI entry; also print out CVE description and potential fixes if audit # found an issue. -# Ignoring GHSA-7gcm-g887-7qv7: remove the exception when a fix is available. .PHONY: audit audit: if ! $$(python -c "import pip_audit" &> /dev/null); then \ echo "No package pip_audit installed, upgrade your environment!" && exit 1; \ fi; - python -m pip_audit --skip-editable --desc on --fix --dry-run --ignore-vuln GHSA-7gcm-g887-7qv7 + python -m pip_audit --skip-editable --desc on --fix --dry-run # Run some or all checks over the package code base. 
.PHONY: check check-code check-bandit check-flake8 check-lint check-mypy check-go check-actionlint @@ -360,15 +384,16 @@ integration-test-update: # When building these artifacts, we need the environment variable SOURCE_DATE_EPOCH # set to the build date/epoch. For more details, see: https://flit.pypa.io/en/latest/reproducible.html .PHONY: dist -dist: dist/$(PACKAGE_NAME)-$(PACKAGE_VERSION)-py3-none-any.whl dist/$(PACKAGE_NAME)-$(PACKAGE_VERSION).tar.gz dist/$(PACKAGE_NAME)-$(PACKAGE_VERSION)-docs-html.zip dist/$(PACKAGE_NAME)-$(PACKAGE_VERSION)-build-epoch.txt -dist/$(PACKAGE_NAME)-$(PACKAGE_VERSION)-py3-none-any.whl: check test integration-test - flit build --setup-py --format wheel -dist/$(PACKAGE_NAME)-$(PACKAGE_VERSION).tar.gz: check test integration-test - flit build --setup-py --format sdist +dist: dist/$(PACKAGE_WHEEL_DIST_NAME).whl dist/$(PACKAGE_SDIST_NAME).tar.gz dist/$(PACKAGE_NAME)-$(PACKAGE_VERSION)-docs-html.zip dist/$(PACKAGE_WHEEL_DIST_NAME)-build-epoch.txt +dist/$(PACKAGE_WHEEL_DIST_NAME).whl: check test integration-test + SOURCE_DATE_EPOCH=$(SOURCE_DATE_EPOCH) flit build --setup-py --format wheel + mv dist/$(PACKAGE_NAME)-$(PACKAGE_VERSION)-py3-none-any.whl dist/$(PACKAGE_WHEEL_DIST_NAME).whl +dist/$(PACKAGE_SDIST_NAME).tar.gz: check test integration-test + SOURCE_DATE_EPOCH=$(SOURCE_DATE_EPOCH) flit build --setup-py --format sdist dist/$(PACKAGE_NAME)-$(PACKAGE_VERSION)-docs-html.zip: docs python -m zipfile -c dist/$(PACKAGE_NAME)-$(PACKAGE_VERSION)-docs-html.zip docs/_build/html -dist/$(PACKAGE_NAME)-$(PACKAGE_VERSION)-build-epoch.txt: - echo $(SOURCE_DATE_EPOCH) > dist/$(PACKAGE_NAME)-$(PACKAGE_VERSION)-build-epoch.txt +dist/$(PACKAGE_WHEEL_DIST_NAME)-build-epoch.txt: + echo $(SOURCE_DATE_EPOCH) > dist/$(PACKAGE_WHEEL_DIST_NAME)-build-epoch.txt # Build the HTML documentation from the package's source. .PHONY: docs diff --git a/docker/Dockerfile.final b/docker/Dockerfile.final index 2fc470904..b8c0a8a24 100644 --- a/docker/Dockerfile.final +++ b/docker/Dockerfile.final @@ -43,6 +43,15 @@ RUN : \ && rm -rf $HOME/dist \ && deactivate +# Install slsa-verifier. +# Copy only the Makefile from the build context +COPY Makefile . +RUN : \ + && make install-slsa-verifier \ + # Test that slsa-verifier exists and is executable. + && ls -l $HOME/.local/bin/slsa-verifier \ + && test -x $HOME/.local/bin/slsa-verifier + COPY --chown=macaron:macaron docker/user.sh $HOME/user.sh # We enable the root user here so that the user.sh script can modify the diff --git a/docs/source/pages/installation.rst b/docs/source/pages/installation.rst index 6e10c7005..6c3fb7cb2 100644 --- a/docs/source/pages/installation.rst +++ b/docs/source/pages/installation.rst @@ -7,6 +7,8 @@ Installation Guide ================== +.. contents:: :local: + ------------- Prerequisites ------------- @@ -23,9 +25,11 @@ Prerequisites Download -------- -Macaron is currently distributed as a Docker image. We provide a bash script ``run_macaron.sh`` to easily download and run it. +Macaron is currently distributed as a Docker image and Python package. Note that the Python package is only published as a GitHub release asset. We provide bash scripts to easily download and run Macaron. -.. note:: When run, Macaron will create output files inside the current directory where ``run_macaron.sh`` is run. If you run Docker Desktop, please make sure that the current directory is bind mountable for Docker (see the `File Sharing settings `_). 
+ +''''''''''''''''''''''''''''''''' +Install Macaron as a Docker image +''''''''''''''''''''''''''''''''' Download the ``run_macaron.sh`` script and make it executable by running the commands (replace ``tag`` with the version you want or ``release`` for the latest version): @@ -34,9 +38,11 @@ Download the ``run_macaron.sh`` script and make it executable by running the com curl -O https://raw.githubusercontent.com/oracle/macaron/refs/tags/<tag>/scripts/release_scripts/run_macaron.sh chmod +x run_macaron.sh ----------------------------------------- -Verify that the installation is complete ---------------------------------------- +.. note:: When run, Macaron will create output files inside the current directory where ``run_macaron.sh`` is run. If you run Docker Desktop, please make sure that the current directory is bind mountable for Docker (see the `File Sharing settings `_). + +'''''''''''''''''''''' +Check the Docker image +'''''''''''''''''''''' To verify your setup, go to the directory containing the downloaded ``run_macaron.sh`` script and run this command in order to print out the help message for Macaron: @@ -51,6 +57,43 @@ To verify your setup, go to the directory containing the downloaded ``run_macaro .. note:: By default, the script will always check the docker registry to ensure the docker image is up-to-date. This can be overridden if necessary (e.g. if running offline with a pre-installed image) by assigning the environment variable ``DOCKER_PULL``. For example: ``DOCKER_PULL=never ./run_macaron.sh --help`` +''''''''''''''''''''''''''''''''''' +Install Macaron as a Python package +''''''''''''''''''''''''''''''''''' + +Download the ``install_macaron_python.sh`` script and make it executable by running the commands (replace ``tag`` with the version you want or ``release`` for the latest version): + +.. code-block:: shell + + curl -O https://raw.githubusercontent.com/oracle/macaron/refs/tags/<tag>/scripts/release_scripts/install_macaron_python.sh + chmod +x install_macaron_python.sh + +Install the package by providing a version. The installation will automatically create a virtual environment at ``./.venv`` if one does not already exist. The script uses your system's ``python3`` interpreter, and requires Python ``3.11.14`` or later to be available: + +.. code-block:: shell + + ./install_macaron_python.sh 0.20.0 + +Macaron might call `slsa-verifier <https://github.com/slsa-framework/slsa-verifier>`_ for the ``mcn_provenance_verified_1`` check if it is already installed on your machine. You can also pass the ``--install-slsa-verifier`` option to the script to install it for you. Note that if slsa-verifier is not installed, we only log an error but proceed with the rest of the analysis. For further information run: + +.. code-block:: shell + + ./install_macaron_python.sh --help + +If you run Macaron as a Python package and would like to run the :ref:`verify-policy ` or :ref:`gen-build-spec ` commands, you need to install the Datalog engine `souffle <https://github.com/souffle-lang/souffle>`_ separately. + +'''''''''''''''''''''''' +Check the Python package +'''''''''''''''''''''''' + +To verify your setup, activate the virtual environment and run this command in order to print out the help message for Macaron: + +.. code-block:: shell + + source .venv/bin/activate + macaron --help + + ..
_prepare-github-token: --------------------------- diff --git a/scripts/release_scripts/install_macaron_python.sh b/scripts/release_scripts/install_macaron_python.sh new file mode 100755 index 000000000..b6e97fc4c --- /dev/null +++ b/scripts/release_scripts/install_macaron_python.sh @@ -0,0 +1,123 @@ +#!/bin/bash + +# Copyright (c) 2026 - 2026, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. +set -euo pipefail + +print_help() { +cat << EOF +Usage: $0 <macaron-version> [--install-slsa-verifier] [-h|--help] + +Arguments: + <macaron-version> Version of Macaron to install. + --install-slsa-verifier (Optional) Install the SLSA Verifier binary. + -h, --help Show this help and exit. + +Examples: + $0 0.21.0 + $0 0.21.0 --install-slsa-verifier +EOF +} + +# SLSA Verifier Installer +# Get the checksum from https://github.com/slsa-framework/slsa-verifier/blob/main/SHA256SUM.md. +install_slsa_verifier() { + SLSA_VERIFIER_TAG="v2.7.1" + SLSA_VERIFIER_BIN="slsa-verifier-linux-amd64" + SLSA_VERIFIER_BIN_PATH="${HOME}/.local/bin" + SLSA_VERIFIER_CHECKSUM="946dbec729094195e88ef78e1734324a27869f03e2c6bd2f61cbc06bd5350339" + + if ! command -v slsa-verifier >/dev/null 2>&1; then + echo "[Info] Installing slsa-verifier..." + mkdir -p "$SLSA_VERIFIER_BIN_PATH" + curl --fail -L -o "${SLSA_VERIFIER_BIN_PATH}/slsa-verifier" "https://github.com/slsa-framework/slsa-verifier/releases/download/${SLSA_VERIFIER_TAG}/${SLSA_VERIFIER_BIN}" + SLSA_VERIFIER_COMPUTED_HASH=$(sha256sum "${SLSA_VERIFIER_BIN_PATH}/slsa-verifier" | cut -d' ' -f1) + if [ "$SLSA_VERIFIER_COMPUTED_HASH" != "$SLSA_VERIFIER_CHECKSUM" ]; then + echo "[Error] slsa-verifier checksum verification failed. Removing slsa-verifier binary and exiting." >&2 + rm -f "${SLSA_VERIFIER_BIN_PATH}/slsa-verifier" + exit 1 + fi + chmod +x "${SLSA_VERIFIER_BIN_PATH}/slsa-verifier" + echo "[Info] slsa-verifier installed at: ${SLSA_VERIFIER_BIN_PATH}/slsa-verifier" + else + echo "[Info] slsa-verifier already installed." + fi +} + +# Handle arguments. +INSTALL_SLSA=0 +MACARON_VERSION="" + +for arg in "$@"; do + case "$arg" in + -h|--help) + print_help + exit 0 + ;; + --install-slsa-verifier) + INSTALL_SLSA=1 + ;; + *) + if [[ -z "$MACARON_VERSION" ]]; then + MACARON_VERSION="$arg" + fi + ;; + esac +done + +if [[ -z "$MACARON_VERSION" ]]; then + echo "Error: Please provide the Macaron version as an argument." + print_help + exit 1 +fi + +if [[ "$INSTALL_SLSA" -eq 1 ]]; then + install_slsa_verifier +fi + +# Macaron Installer + +# Configuration. +PYTHON_VERSION="3" +MACARON_DISTRO="py3-none-linux_x86_64" +MACARON_WHEEL="macaron-${MACARON_VERSION}-${MACARON_DISTRO}.whl" +MACARON_REQUIREMENTS="macaron-${MACARON_VERSION}-${MACARON_DISTRO}-requirements.txt" +MACARON_REPO="https://github.com/oracle/macaron" +VENV_DIR=".venv" + +echo "Using Macaron version: $MACARON_VERSION" + +# Download Macaron release assets if not already downloaded. +echo "Checking for release files..." +if [[ ! -f "$MACARON_WHEEL" ]]; then + echo "Downloading wheel: $MACARON_WHEEL" + wget "${MACARON_REPO}/releases/download/v${MACARON_VERSION}/${MACARON_WHEEL}" +else + echo "Using existing wheel: $MACARON_WHEEL" +fi + +if [[ ! -f "$MACARON_REQUIREMENTS" ]]; then + echo "Downloading requirements: $MACARON_REQUIREMENTS" + wget "${MACARON_REPO}/releases/download/v${MACARON_VERSION}/${MACARON_REQUIREMENTS}" +else + echo "Using existing requirements: $MACARON_REQUIREMENTS" +fi + +# Set up Python virtual environment. +if [[ !
-d "$VENV_DIR" ]]; then + echo "Creating virtual environment with Python ${PYTHON_VERSION}..." + python${PYTHON_VERSION} -m venv "${VENV_DIR}" +fi + +# shellcheck disable=SC1091 +source "${VENV_DIR}/bin/activate" +export PATH="${VENV_DIR}/bin:$PATH" + +# Install Macaron package and dependencies. +echo "Installing Macaron..." +pip install --no-deps "${MACARON_WHEEL}" +pip install --no-deps -r "${MACARON_REQUIREMENTS}" + +# Check version. +echo "Macaron successfully installed:" +macaron --version diff --git a/src/macaron/provenance/provenance_verifier.py b/src/macaron/provenance/provenance_verifier.py index 06356eff6..72b457ca0 100644 --- a/src/macaron/provenance/provenance_verifier.py +++ b/src/macaron/provenance/provenance_verifier.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module contains methods for verifying provenance files.""" @@ -6,6 +6,7 @@ import hashlib import logging import os +import shutil import subprocess # nosec B404 import tarfile import zipfile @@ -202,7 +203,6 @@ def verify_ci_provenance(analyze_ctx: AnalyzeContext, ci_info: CIInfo, download_ return False sub_verified = _verify_slsa( - analyze_ctx.macaron_path, download_path, provenance.asset, sub_asset["name"], @@ -303,19 +303,42 @@ def _validate_path_traversal(path: str) -> bool: return False -def _verify_slsa( - macaron_path: str, download_path: str, prov_asset: AssetLocator, asset_name: str, repository_url: str -) -> bool: +def _is_slsa_verifier_installed() -> bool: + """Check if slsa-verifier is present on the execution path. + + Returns + ------- + bool + True if slsa-verifier is present on the execution path. + """ + if shutil.which("slsa-verifier") is None: + logger.debug("slsa-verifier is not on the execution path.") + return False + return True + + +def _verify_slsa(download_path: str, prov_asset: AssetLocator, asset_name: str, repository_url: str) -> bool: """Run SLSA verifier to verify the artifact.""" source_path = get_repo_dir_name(repository_url, sanitize=False) if not source_path: logger.error("Invalid repository source path to verify: %s.", repository_url) return False + if not _is_slsa_verifier_installed(): + os.environ["PATH"] = os.path.join(Path.home(), ".local", "bin") + os.pathsep + os.environ.get("PATH", "") + logger.debug("PATH: %s", os.environ["PATH"]) + # Try the ~/.local/bin path. + if not _is_slsa_verifier_installed(): + logger.error( + "slsa-verifier is not installed or is not present on the execution path." + " See https://github.com/slsa-framework/slsa-verifier for instructions." + ) + return False + errors: list[str] = [] verified = False cmd = [ - os.path.join(macaron_path, "bin/slsa-verifier"), + "slsa-verifier", "verify-artifact", os.path.join(download_path, asset_name), "--provenance-path", diff --git a/tests/integration/cases/micronaut-projects_micronaut-test/micronaut-test.dl b/tests/integration/cases/micronaut-projects_micronaut-test/micronaut-test.dl index e0f43e2ce..e307a28b8 100644 --- a/tests/integration/cases/micronaut-projects_micronaut-test/micronaut-test.dl +++ b/tests/integration/cases/micronaut-projects_micronaut-test/micronaut-test.dl @@ -1,4 +1,4 @@ -/* Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. */ +/* Copyright (c) 2024 - 2026, Oracle and/or its affiliates. All rights reserved.
*/ /* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ #include "prelude.dl" @@ -9,6 +9,7 @@ Policy("test_policy", component_id, "") :- check_passed(component_id, "mcn_build_service_1"), check_passed(component_id, "mcn_version_control_system_1"), check_passed(component_id, "mcn_provenance_available_1"), + check_passed(component_id, "mcn_provenance_verified_1"), check_passed(component_id, "mcn_provenance_derived_repo_1"), check_passed(component_id, "mcn_build_tool_1"), build_tool_check(gradle_id, "gradle", "java"), diff --git a/tests/integration/cases/micronaut-projects_micronaut-test/test.yaml b/tests/integration/cases/micronaut-projects_micronaut-test/test.yaml index c7cda9fc2..4bf43d20e 100644 --- a/tests/integration/cases/micronaut-projects_micronaut-test/test.yaml +++ b/tests/integration/cases/micronaut-projects_micronaut-test/test.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. description: | @@ -9,6 +9,10 @@ tags: - macaron-python-package steps: +- name: Install slsa-verifier + kind: shell + options: + cmd: make --file ../../../../Makefile install-slsa-verifier - name: Run macaron analyze micronaut-projects/micronaut-test kind: analyze options: diff --git a/tests/integration/cases/ossf_scorecard/test.yaml b/tests/integration/cases/ossf_scorecard/test.yaml index 653140505..c16124433 100644 --- a/tests/integration/cases/ossf_scorecard/test.yaml +++ b/tests/integration/cases/ossf_scorecard/test.yaml @@ -1,11 +1,10 @@ -# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. description: > Test CUE provenance expectation for ossf/scorecard, policy verification, and VSA generation. tags: -- macaron-python-package - macaron-docker-image steps: diff --git a/tests/integration/cases/urllib3_expectation_dir/test.yaml b/tests/integration/cases/urllib3_expectation_dir/test.yaml index 8646c8edd..88d589ef4 100644 --- a/tests/integration/cases/urllib3_expectation_dir/test.yaml +++ b/tests/integration/cases/urllib3_expectation_dir/test.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. description: | @@ -10,6 +10,10 @@ tags: - macaron-docker-image steps: +- name: Install slsa-verifier + kind: shell + options: + cmd: make --file ../../../../Makefile install-slsa-verifier - name: Run macaron analyze with expectation directory kind: analyze options: diff --git a/tests/integration/cases/urllib3_expectation_file/test.yaml b/tests/integration/cases/urllib3_expectation_file/test.yaml index 5b204387b..fc6593160 100644 --- a/tests/integration/cases/urllib3_expectation_file/test.yaml +++ b/tests/integration/cases/urllib3_expectation_file/test.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2026, Oracle and/or its affiliates. All rights reserved. 
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. description: | @@ -6,7 +6,6 @@ description: | The CUE expectation file is provided as a single file path. tags: -- macaron-python-package - macaron-docker-image - tutorial diff --git a/tests/integration/cases/urllib3_invalid_expectation/test.yaml b/tests/integration/cases/urllib3_invalid_expectation/test.yaml index 960e10ebe..697a9a83e 100644 --- a/tests/integration/cases/urllib3_invalid_expectation/test.yaml +++ b/tests/integration/cases/urllib3_invalid_expectation/test.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. description: | @@ -6,7 +6,6 @@ description: | The CUE expectation file is invalid. tags: -- macaron-python-package - macaron-docker-image steps: From 66a64bbfcf774cd332a383a0e3786f704ced4191 Mon Sep 17 00:00:00 2001 From: Behnaz Hassanshahi Date: Tue, 10 Feb 2026 08:49:08 +1000 Subject: [PATCH 13/20] chore(deps): update Go dependencies (#1295) This PR updates the Go version to 1.24 and also updates the Go dependencies. Signed-off-by: behnazh-w --- .pre-commit-config.yaml | 6 +-- CONTRIBUTING.md | 2 +- go.mod | 15 +++---- go.sum | 42 ++++++++++--------- golang/README.md | 2 +- golang/internal/bashparser/bashparser_test.go | 5 +-- golang/internal/filewriter/filewriter_test.go | 5 +-- 7 files changed, 39 insertions(+), 38 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d9321295a..6166236b5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2022 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. # See https://pre-commit.com for more information @@ -224,7 +224,7 @@ repos: # A linter for Golang - repo: https://github.com/golangci/golangci-lint - rev: v2.3.0 + rev: v2.8.0 hooks: - id: golangci-lint @@ -236,7 +236,7 @@ repos: # Other staged files shouldn't trigger these hooks. # Documentation: https://github.com/TekWizely/pre-commit-golang/blob/v1.0.0-rc.1/README.md. 
- repo: https://github.com/tekwizely/pre-commit-golang - rev: v1.0.0-rc.1 + rev: v1.0.0-rc.4 hooks: - id: go-build-mod - id: go-build-repo-mod diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 78e293085..5b27488d0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -85,7 +85,7 @@ Please see the [README for the malware analyzer](./src/macaron/malware_analyzer/ ### Prerequisites - Python 3.11.14 -- Go 1.23 +- Go 1.24 - JDK 17 ### Prepare the environment diff --git a/go.mod b/go.mod index d724fc781..36f40d378 100644 --- a/go.mod +++ b/go.mod @@ -3,12 +3,12 @@ module github.com/oracle/macaron -go 1.23.0 +go 1.24.0 -toolchain go1.23.2 +toolchain go1.24.13 require ( - cuelang.org/go v0.14.1 + cuelang.org/go v0.15.4 mvdan.cc/sh/v3 v3.12.0 ) @@ -18,8 +18,9 @@ require ( github.com/google/uuid v1.6.0 // indirect github.com/mitchellh/go-wordwrap v1.0.1 // indirect github.com/pelletier/go-toml/v2 v2.2.4 // indirect - github.com/protocolbuffers/txtpbfmt v0.0.0-20250627152318-f293424e46b5 // indirect - golang.org/x/net v0.42.0 // indirect - golang.org/x/text v0.27.0 // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect + github.com/protocolbuffers/txtpbfmt v0.0.0-20251016062345-16587c79cd91 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect + golang.org/x/net v0.46.0 // indirect + golang.org/x/text v0.30.0 // indirect + google.golang.org/protobuf v1.33.0 // indirect ) diff --git a/go.sum b/go.sum index 53aa010f2..167c82781 100644 --- a/go.sum +++ b/go.sum @@ -1,7 +1,7 @@ -cuelabs.dev/go/oci/ociregistry v0.0.0-20250715075730-49cab49c8e9d h1:lX0EawyoAu4kgMJJfy7MmNkIHioBcdBGFRSKDZ+CWo0= -cuelabs.dev/go/oci/ociregistry v0.0.0-20250715075730-49cab49c8e9d/go.mod h1:4WWeZNxUO1vRoZWAHIG0KZOd6dA25ypyWuwD3ti0Tdc= -cuelang.org/go v0.14.1 h1:kxFAHr7bvrCikbtVps2chPIARazVdnRmlz65dAzKyWg= -cuelang.org/go v0.14.1/go.mod h1:aSP9UZUM5m2izHAHUvqtq0wTlWn5oLjuv2iBMQZBLLs= +cuelabs.dev/go/oci/ociregistry v0.0.0-20250722084951-074d06050084 h1:4k1yAtPvZJZQTu8DRY8muBo0LHv6TqtrE0AO5n6IPYs= +cuelabs.dev/go/oci/ociregistry v0.0.0-20250722084951-074d06050084/go.mod h1:4WWeZNxUO1vRoZWAHIG0KZOd6dA25ypyWuwD3ti0Tdc= +cuelang.org/go v0.15.4 h1:lrkTDhqy8dveHgX1ZLQ6WmgbhD8+rXa0fD25hxEKYhw= +cuelang.org/go v0.15.4/go.mod h1:NYw6n4akZcTjA7QQwJ1/gqWrrhsN4aZwhcAL0jv9rZE= github.com/cockroachdb/apd/v3 v3.2.1 h1:U+8j7t0axsIgvQUqthuNm82HIrYXodOV2iWLWtEaIwg= github.com/cockroachdb/apd/v3 v3.2.1/go.mod h1:klXJcjp+FffLTHlhIG69tezTDvdP065naDsHzKhYSqc= github.com/emicklei/proto v1.14.2 h1:wJPxPy2Xifja9cEMrcA/g08art5+7CGJNFNk35iXC1I= @@ -28,26 +28,28 @@ github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJw github.com/opencontainers/image-spec v1.1.1/go.mod h1:qpqAh3Dmcf36wStyyWU+kCeDgrGnAve2nCC8+7h8Q0M= github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4= github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY= -github.com/protocolbuffers/txtpbfmt v0.0.0-20250627152318-f293424e46b5 h1:WWs1ZFnGobK5ZXNu+N9If+8PDNVB9xAqrib/stUXsV4= -github.com/protocolbuffers/txtpbfmt v0.0.0-20250627152318-f293424e46b5/go.mod h1:BnHogPTyzYAReeQLZrOxyxzS739DaTNtTvohVdbENmA= +github.com/protocolbuffers/txtpbfmt v0.0.0-20251016062345-16587c79cd91 h1:s1LvMaU6mVwoFtbxv/rCZKE7/fwDmDY684FfUe4c1Io= +github.com/protocolbuffers/txtpbfmt v0.0.0-20251016062345-16587c79cd91/go.mod h1:JSbkp0BviKovYYt9XunS95M3mLPibE9bGg+Y95DsEEY= github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= github.com/rogpeppe/go-internal v1.14.1/go.mod 
h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= -golang.org/x/mod v0.26.0 h1:EGMPT//Ezu+ylkCijjPc+f4Aih7sZvaAr+O3EHBxvZg= -golang.org/x/mod v0.26.0/go.mod h1:/j6NAhSk8iQ723BGAUyoAcn7SlD7s15Dp9Nd/SfeaFQ= -golang.org/x/net v0.42.0 h1:jzkYrhi3YQWD6MLBJcsklgQsoAcw89EcZbJw8Z614hs= -golang.org/x/net v0.42.0/go.mod h1:FF1RA5d3u7nAYA4z2TkclSCKh68eSXtiFwcWQpPXdt8= -golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= -golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= -golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= -golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= -golang.org/x/text v0.27.0 h1:4fGWRpyh641NLlecmyl4LOe6yDdfaYNrGb2zdfo4JV4= -golang.org/x/text v0.27.0/go.mod h1:1D28KMCvyooCX9hBiosv5Tz/+YLxj0j7XhWjpSUF7CU= -golang.org/x/tools v0.35.0 h1:mBffYraMEf7aa0sB+NuKnuCy8qI/9Bughn8dC2Gu5r0= -golang.org/x/tools v0.35.0/go.mod h1:NKdj5HkL/73byiZSJjqJgKn3ep7KjFkBOkR/Hps3VPw= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +golang.org/x/mod v0.29.0 h1:HV8lRxZC4l2cr3Zq1LvtOsi/ThTgWnUk/y64QSs8GwA= +golang.org/x/mod v0.29.0/go.mod h1:NyhrlYXJ2H4eJiRy/WDBO6HMqZQ6q9nk4JzS3NuCK+w= +golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4= +golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210= +golang.org/x/oauth2 v0.32.0 h1:jsCblLleRMDrxMN29H3z/k1KliIvpLgCkE6R8FXXNgY= +golang.org/x/oauth2 v0.32.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= +golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug= +golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/text v0.30.0 h1:yznKA/E9zq54KzlzBEAWn1NXSQ8DIp/NYMy88xJjl4k= +golang.org/x/text v0.30.0/go.mod h1:yDdHFIX9t+tORqspjENWgzaCVXgk0yYnYuSZ8UzzBVM= +golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ= +golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs= +google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI= +google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= -gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= mvdan.cc/sh/v3 v3.12.0 h1:ejKUR7ONP5bb+UGHGEG/k9V5+pRVIyD+LsZz7o8KHrI= mvdan.cc/sh/v3 v3.12.0/go.mod h1:Se6Cj17eYSn+sNooLZiEUnNNmNxg0imoYlTu4CyaGyg= diff --git a/golang/README.md b/golang/README.md index 4cefbe323..6ab4b1695 100644 --- a/golang/README.md +++ b/golang/README.md @@ -1,7 +1,7 @@ # Go module documentation ## Quick start Prerequisites -- Go (tested on `go 1.23.0 linux/amd64`). Installation instructions [here](https://go.dev/doc/install). +- Go (tested on `go 1.24 linux/amd64`). Installation instructions [here](https://go.dev/doc/install). 
- Prepare the required libraries by running this command from the root dir of this repository: ```bash diff --git a/golang/internal/bashparser/bashparser_test.go b/golang/internal/bashparser/bashparser_test.go index 4cf0a6813..3825f459b 100644 --- a/golang/internal/bashparser/bashparser_test.go +++ b/golang/internal/bashparser/bashparser_test.go @@ -1,4 +1,4 @@ -/* Copyright (c) 2022 - 2022, Oracle and/or its affiliates. All rights reserved. */ +/* Copyright (c) 2022 - 2026, Oracle and/or its affiliates. All rights reserved. */ /* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ package bashparser @@ -29,7 +29,6 @@ func Test_parse_valid_bash_script(t *testing.T) { var result map[string]interface{} err := json.Unmarshal([]byte(json_content), &result) if err != nil { - t.Errorf(string(err.Error())) - t.Errorf("Cannot unmarshal the returned JSON content from parsing %s.", json_content) + t.Errorf("Cannot unmarshal the returned JSON content from parsing %s: %v.", json_content, err) } } diff --git a/golang/internal/filewriter/filewriter_test.go b/golang/internal/filewriter/filewriter_test.go index f8faa73cd..7f8e273a3 100644 --- a/golang/internal/filewriter/filewriter_test.go +++ b/golang/internal/filewriter/filewriter_test.go @@ -1,4 +1,4 @@ -/* Copyright (c) 2022 - 2022, Oracle and/or its affiliates. All rights reserved. */ +/* Copyright (c) 2022 - 2026, Oracle and/or its affiliates. All rights reserved. */ /* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ package filewriter @@ -26,8 +26,7 @@ func Test_store_to_file(t *testing.T) { read_content, err := os.ReadFile(out_path) if err != nil { - t.Errorf("Error when trying to store to %s.", out_path) - t.Errorf(err.Error()) + t.Errorf("Error when trying to store to %s: %v.", out_path, err) } else { if string(read_content) != store_content { t.Errorf("The store content is not correct") From 20c11695020663da06ee4ae3585633e784ec046d Mon Sep 17 00:00:00 2001 From: Behnaz Hassanshahi Date: Thu, 12 Feb 2026 10:13:11 +1000 Subject: [PATCH 14/20] chore(deps): update Python dependencies (#1299) Updates the Python dependencies. In particular, this PR updates cryptography to address CVE-2026-26007. Signed-off-by: behnazh-w --- pyproject.toml | 21 +++++++++---------- src/macaron/__main__.py | 4 ++-- .../repo_finder/repo_finder_deps_dev.py | 4 ++-- src/macaron/slsa_analyzer/git_url.py | 4 ++-- tests/integration/run.py | 4 ++-- 5 files changed, 18 insertions(+), 19 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0c0f16641..a30882001 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "requests >=2.32.3,<3.0.0", "pydriller >=2.0,<3.0.0", "yamale >=6.0.0,<7.0.0", - "packaging >=24.0,<25.0.0", + "packaging >=25.0,<27.0.0", "jinja2 >=3.1.2,<4.0.0", "SQLAlchemy >=2.0.0,<3.0.0", "defusedxml >=0.7.1,<1.0.0", @@ -33,11 +33,11 @@ dependencies = [ "ruamel.yaml >= 0.18.6,<1.0.0", "jsonschema >= 4.22.0,<5.0.0", "cyclonedx-bom >=7.0.0,<8.0.0", - "cyclonedx-python-lib[validation] >=8.0.0,<11.0.0", + "cyclonedx-python-lib[validation] >=9.0.0,<12.0.0", "beautifulsoup4 >= 4.12.0,<5.0.0", "problog >= 2.2.6,<3.0.0", - "cryptography >=44.0.0,<45.0.0", - "semgrep == 1.149.0", + "cryptography >=46.0.5,<47.0.0", + "semgrep == 1.151.0", "email-validator >=2.2.0,<3.0.0", "rich >=13.5.3,<15.0.0", "lark >= 1.3.0,<2.0.0", @@ -71,16 +71,16 @@ macaron = 'macaron.__main__:main' # installed. 
Make sure to keep the requirements in sync with the workflows! actions = [ "commitizen >=4.0.0,<5.0.0", - "twine >=5.0.0,<6.0.0", + "twine >=6.0.0,<7.0.0", ] dev = [ "flit >=3.2.0,<4.0.0", - "mypy >=1.0.0,<1.16", + "mypy >=1.19.1,<1.20", "types-pyyaml >=6.0.4,<7.0.0", "types-requests >=2.25.6,<3.0.0", "types-jsonschema >=4.22.0,<5.0.0", "pip-audit >=2.5.6,<3.0.0", - "pylint >=3.0.3,<4.0.0", + "pylint >=4.0.4,<5.0.0", "cyclonedx-bom >=7.0.0,<8.0.0", "types-beautifulsoup4 >= 4.12.0,<5.0.0", ] @@ -98,12 +98,12 @@ hooks = [ # Note that the `custom_exit_code` and `env` plugins may currently be unmaintained. test = [ "hypothesis >=6.100.1,<7.0.0", - "pytest >=8.2.2,<9.0.0", + "pytest >=9.0.2,<10.0.0", "pytest-custom_exit_code >=0.3.0,<1.0.0", - "pytest-cov >=6.0.0,<7.0.0", + "pytest-cov >=7.0.0,<8.0.0", "pytest-env >=1.0.0,<2.0.0", "pytest_httpserver >=1.0.10,<2.0.0", - "syrupy >=4.0.0,<5.0.0", + "syrupy >=5.1.0,<6.0.0", ] test-docker = [ @@ -217,7 +217,6 @@ ignore_missing_imports = true # https://pylint.pycqa.org/en/latest/user_guide/configuration/index.html [tool.pylint.MASTER] fail-under = 10.0 -suggestion-mode = true # Remove this setting when pylint v4 is released. load-plugins = [ "pylint.extensions.check_elif", "pylint.extensions.for_any_all", diff --git a/src/macaron/__main__.py b/src/macaron/__main__.py index dd103eec6..addb0f881 100644 --- a/src/macaron/__main__.py +++ b/src/macaron/__main__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2022 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This is the main entrypoint to run Macaron.""" @@ -92,7 +92,7 @@ def analyze_slsa_levels_single(analyzer_single_args: argparse.Namespace) -> None local_maven_repo = os.path.join(home_dir, ".m2") if not os.path.isdir(local_maven_repo): - logger.debug("The default local Maven repo at %s does not exist. Ignore ...") + logger.debug("The default local Maven repo at %s does not exist. Ignore ...", local_maven_repo) global_config.local_maven_repo = None global_config.local_maven_repo = local_maven_repo diff --git a/src/macaron/repo_finder/repo_finder_deps_dev.py b/src/macaron/repo_finder/repo_finder_deps_dev.py index 07b5e4f34..e3f92cc4c 100644 --- a/src/macaron/repo_finder/repo_finder_deps_dev.py +++ b/src/macaron/repo_finder/repo_finder_deps_dev.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module contains the PythonRepoFinderDD class to be used for finding repositories using deps.dev.""" @@ -179,7 +179,7 @@ def get_attestation(purl: PackageURL) -> tuple[dict | None, str | None, bool]: and a flag for whether the attestation is verified. """ if purl.type != "pypi": - logger.debug("PURL type (%s) attestation not yet supported via deps.dev.") + logger.debug("PURL type (%s) attestation not yet supported via deps.dev.", purl.type) return None, None, False if not purl.version: diff --git a/src/macaron/slsa_analyzer/git_url.py b/src/macaron/slsa_analyzer/git_url.py index 62a40833f..6fa019991 100644 --- a/src/macaron/slsa_analyzer/git_url.py +++ b/src/macaron/slsa_analyzer/git_url.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. 
+# Copyright (c) 2022 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module provides methods to perform generic actions on Git URLS.""" @@ -338,7 +338,7 @@ def clone_remote_repo(clone_dir: str, url: str) -> Repo | None: ) return Repo(path=clone_dir) except (subprocess.CalledProcessError, OSError): - logger.debug("The clone dir %s is not empty. An attempt to update it failed.") + logger.debug("The clone dir %s is not empty. An attempt to update it failed.", clone_dir) return None # Ensure that the parent directory where the repo is cloned into exists. diff --git a/tests/integration/run.py b/tests/integration/run.py index e78cf57a6..45d7ed93a 100644 --- a/tests/integration/run.py +++ b/tests/integration/run.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """Integration test utility.""" @@ -1220,7 +1220,7 @@ def main(argv: Sequence[str] | None = None) -> int: path = shutil.which(args.macaron) if path is None: - logger.error("'%s' is not a command.") + logger.error("'%s' is not a command.", args.macaron) return 1 macaron_cmd = os.path.abspath(path) From 750038bacb4ed4bad3114043db0abf4f82c25e1c Mon Sep 17 00:00:00 2001 From: Nicholas Allen Date: Tue, 17 Feb 2026 14:11:19 +1000 Subject: [PATCH 15/20] fix: handle GitHub Actions job needs field case-insensitively in analysis. (#1305) Signed-off-by: Nicholas Allen --- .../code_analyzer/dataflow_analysis/github.py | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/src/macaron/code_analyzer/dataflow_analysis/github.py b/src/macaron/code_analyzer/dataflow_analysis/github.py index 6da30e745..222f55fb1 100644 --- a/src/macaron/code_analyzer/dataflow_analysis/github.py +++ b/src/macaron/code_analyzer/dataflow_analysis/github.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2025 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """Dataflow analysis implementation for analysing GitHub Actions Workflow build pipelines.""" @@ -316,6 +316,15 @@ def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str] return result + @staticmethod + def _find_job_id_case_insensitive(jobs: dict[str, RawGitHubActionsJobNode], job_id: str) -> str | None: + if job_id in jobs: + return job_id + for actual_job_id in jobs: + if actual_job_id.lower() == job_id.lower(): + return actual_job_id + return None + @staticmethod def create( workflow: github_workflow_model.Workflow, context: core.NonOwningContextRef[GitHubActionsWorkflowContext] @@ -352,10 +361,16 @@ def create( needs = job_node.definition["needs"] if isinstance(needs, list): for need in needs: - # TODO invalid needs id? 
- edges.append(need) + actual_need = GitHubActionsWorkflowNode._find_job_id_case_insensitive(jobs, need) + if actual_need is None: + raise CallGraphError("needs refers to invalid job") + edges.append(actual_need) elif isinstance(needs, str): - edges.append(needs) + actual_need = GitHubActionsWorkflowNode._find_job_id_case_insensitive(jobs, needs) + if actual_need is None: + raise CallGraphError("needs refers to invalid job") + edges.append(actual_need) + dependency_graph[job_id] = edges ts = TopologicalSorter(dependency_graph) From bf788f3b548d0ed3821006368a311ff5a740e4e3 Mon Sep 17 00:00:00 2001 From: Behnaz Hassanshahi Date: Wed, 18 Feb 2026 09:50:52 +1000 Subject: [PATCH 16/20] fix(gen-build-spec): handle errors gracefully when build tool is not supported (#1303) This PR improves error handling in the build spec generation process for unsupported build tools. Signed-off-by: behnazh-w --- .../build_command_patcher.py | 17 ++-- .../common_spec/maven_spec.py | 16 +-- .../common_spec/pypi_spec.py | 13 +-- .../common_spec/test_core.py | 98 ++++++++++++++++++- .../test_build_command_patcher.py | 25 ++++- 5 files changed, 140 insertions(+), 29 deletions(-) diff --git a/src/macaron/build_spec_generator/build_command_patcher.py b/src/macaron/build_spec_generator/build_command_patcher.py index 4fe26f2ba..224ec5715 100644 --- a/src/macaron/build_spec_generator/build_command_patcher.py +++ b/src/macaron/build_spec_generator/build_command_patcher.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2025 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module contains the implementation of the build command patching.""" @@ -83,23 +83,26 @@ def _patch_commands( which just holds the original command as a list of string, without any changes. """ result: list[CLICommand] = [] - for cmds in cmds_sequence: + for cmd in cmds_sequence: + # Checking if the command is a valid non-empty list. + if not cmd: + continue effective_cli_parser = None for cli_parser in cli_parsers: - if cli_parser.is_build_tool(cmds[0]): + if cli_parser.is_build_tool(cmd[0]): effective_cli_parser = cli_parser break if not effective_cli_parser: - result.append(UnparsedCLICommand(original_cmds=cmds)) + result.append(UnparsedCLICommand(original_cmds=cmd)) continue try: - cli_command = effective_cli_parser.parse(cmds) + cli_command = effective_cli_parser.parse(cmd) except CommandLineParseError as error: logger.error( "Failed to patch the cli command %s. Error %s.", - " ".join(cmds), + " ".join(cmd), error, ) return None @@ -117,7 +120,7 @@ def _patch_commands( except PatchBuildCommandError as error: logger.error( "Failed to patch the build command %s. Error %s.", - " ".join(cmds), + " ".join(cmd), error, ) return None diff --git a/src/macaron/build_spec_generator/common_spec/maven_spec.py b/src/macaron/build_spec_generator/common_spec/maven_spec.py index 1d0abf4f8..de0b4c5df 100644 --- a/src/macaron/build_spec_generator/common_spec/maven_spec.py +++ b/src/macaron/build_spec_generator/common_spec/maven_spec.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2025 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
"""This module includes build specification and helper classes for Maven packages.""" @@ -12,7 +12,6 @@ from macaron.build_spec_generator.common_spec.base_spec import BaseBuildSpec, BaseBuildSpecDict from macaron.build_spec_generator.common_spec.jdk_finder import find_jdk_version_from_central_maven_repo from macaron.build_spec_generator.common_spec.jdk_version_normalizer import normalize_jdk_version -from macaron.errors import GenerateBuildSpecError logger: logging.Logger = logging.getLogger(__name__) @@ -46,11 +45,6 @@ def get_default_build_commands( ------- list[list[str]] The build command as a list[list[str]]. - - Raises - ------ - GenerateBuildSpecError - If there is no default build command available for the specified build tool. """ default_build_commands = [] @@ -65,11 +59,10 @@ def get_default_build_commands( pass if not default_build_commands: - logger.critical( + logger.debug( "There is no default build command available for the build tools %s.", build_tool_names, ) - raise GenerateBuildSpecError("Unable to find a default build command.") return default_build_commands @@ -118,12 +111,13 @@ def resolve_fields(self, purl: PackageURL) -> None: selected_build_commands = self.data["build_commands"] or self.get_default_build_commands( self.data["build_tools"] ) - patched_build_commands = patch_commands( cmds_sequence=selected_build_commands, patches=CLI_COMMAND_PATCHES, ) if not patched_build_commands: - raise GenerateBuildSpecError(f"Failed to patch command sequences {selected_build_commands}.") + logger.debug("Failed to patch build command sequences %s", selected_build_commands) + self.data["build_commands"] = [] + return self.data["build_commands"] = patched_build_commands diff --git a/src/macaron/build_spec_generator/common_spec/pypi_spec.py b/src/macaron/build_spec_generator/common_spec/pypi_spec.py index 0471afd72..097648214 100644 --- a/src/macaron/build_spec_generator/common_spec/pypi_spec.py +++ b/src/macaron/build_spec_generator/common_spec/pypi_spec.py @@ -16,7 +16,7 @@ from macaron.build_spec_generator.common_spec.base_spec import BaseBuildSpec, BaseBuildSpecDict from macaron.config.defaults import defaults -from macaron.errors import GenerateBuildSpecError, SourceCodeError, WheelTagError +from macaron.errors import SourceCodeError, WheelTagError from macaron.json_tools import json_extract from macaron.slsa_analyzer.package_registry import pypi_registry from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo @@ -55,11 +55,6 @@ def get_default_build_commands( ------- list[list[str]] The build command as a list[list[str]]. - - Raises - ------ - GenerateBuildSpecError - If there is no default build command available for the specified build tool. """ default_build_commands = [] @@ -77,16 +72,16 @@ def get_default_build_commands( case "hatch": default_build_commands.append("hatch build".split()) case "conda": - default_build_commands.append('echo("Not supported")'.split()) + # TODO: update this if a build command can be used for conda. 
+ pass case _: pass if not default_build_commands: - logger.critical( + logger.debug( "There is no default build command available for the build tools %s.", build_tool_names, ) - raise GenerateBuildSpecError("Unable to find a default build command.") return default_build_commands diff --git a/tests/build_spec_generator/common_spec/test_core.py b/tests/build_spec_generator/common_spec/test_core.py index 7df8b1615..a0620c869 100644 --- a/tests/build_spec_generator/common_spec/test_core.py +++ b/tests/build_spec_generator/common_spec/test_core.py @@ -1,11 +1,15 @@ -# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2025 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module contains the tests for build spec generation""" import pytest +from packageurl import PackageURL +from macaron.build_spec_generator.common_spec.base_spec import BaseBuildSpecDict from macaron.build_spec_generator.common_spec.core import ( + ECOSYSTEMS, + LANGUAGES, MacaronBuildToolName, compose_shell_commands, get_language_version, @@ -141,3 +145,95 @@ def test_get_language_version( ) -> None: """Test the get_language_version function.""" assert get_language_version(build_command_info) == expected + + +@pytest.mark.parametrize( + ("base_build_spec_dict"), + [ + pytest.param( + BaseBuildSpecDict( + { + "macaron_version": "0.20.0", + "group_id": "foo", + "artifact_id": "bar", + "version": "1.0.0", + "git_repo": "bla", + "git_tag": "bla", + "newline": "lf", + "language_version": [], + "ecosystem": "maven", + "purl": "pkg:maven/foo/bar@1.0.0", + "language": LANGUAGES.MAVEN.value, + "build_tools": [MacaronBuildToolName.MAVEN], + "build_commands": [], + } + ), + id="empty build command for maven", + ), + pytest.param( + BaseBuildSpecDict( + { + "macaron_version": "0.20.0", + "group_id": "foo", + "artifact_id": "bar", + "version": "1.0.0", + "git_repo": "bla", + "git_tag": "bla", + "newline": "lf", + "language_version": [], + "ecosystem": "maven", + "purl": "pkg:maven/foo/bar@1.0.0", + "language": LANGUAGES.MAVEN.value, + "build_tools": ["ant"], + "build_commands": [["ant", "dist"]], + } + ), + id="unsupported build tool for maven", + ), + pytest.param( + BaseBuildSpecDict( + { + "macaron_version": "0.20.0", + "group_id": None, + "artifact_id": "bar", + "version": "1.0.0", + "git_repo": "bla", + "git_tag": "bla", + "newline": "lf", + "language_version": [], + "ecosystem": "pypi", + "purl": "pkg:pypi/bar@1.0.0", + "language": LANGUAGES.PYPI.value, + "build_tools": [MacaronBuildToolName.FLIT], + "build_commands": [], + } + ), + id="empty build command for pypi", + ), + pytest.param( + BaseBuildSpecDict( + { + "macaron_version": "0.20.0", + "group_id": None, + "artifact_id": "bar", + "version": "1.0.0", + "git_repo": "bla", + "git_tag": "bla", + "newline": "lf", + "language_version": [], + "ecosystem": "pypi", + "purl": "pkg:pypi/bar@1.0.0", + "language": LANGUAGES.PYPI.value, + "build_tools": ["uv"], + "build_commands": [["python", "-m", "build"]], + } + ), + id="unsupported build tool for pypi", + ), + ], +) +def test_resolve_fields(base_build_spec_dict: BaseBuildSpecDict) -> None: + """Test the buildspec field resolution for each ecosystem.""" + ECOSYSTEMS[base_build_spec_dict["ecosystem"].upper()].value(base_build_spec_dict).resolve_fields( + PackageURL.from_string(base_build_spec_dict["purl"]) + ) diff --git 
a/tests/build_spec_generator/test_build_command_patcher.py b/tests/build_spec_generator/test_build_command_patcher.py index b83359698..dad1f04ee 100644 --- a/tests/build_spec_generator/test_build_command_patcher.py +++ b/tests/build_spec_generator/test_build_command_patcher.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2025 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module contains the test for the build command patcher.""" @@ -559,3 +559,26 @@ def test_patching_multiple_commands_error( ) is None ) + + +@pytest.mark.parametrize( + ("original_cmd_sequence"), + [ + pytest.param( + [], + id="empty sequence", + ), + pytest.param( + [[]], + id="empty command", + ), + ], +) +def test_empty_command(maven_cli_parser: MavenCLICommandParser, original_cmd_sequence: list[list[str]]) -> None: + """Test the patch command for empty commands.""" + patch_cmds = _patch_commands( + cmds_sequence=original_cmd_sequence, + cli_parsers=[maven_cli_parser], + patches={PatchCommandBuildTool.MAVEN: {}}, + ) + assert patch_cmds == [] From bdd3448373d761eda42ef20c582143f69c8bfea8 Mon Sep 17 00:00:00 2001 From: Abhinav Pradeep Date: Wed, 18 Feb 2026 11:59:03 +1000 Subject: [PATCH 17/20] feat: improve buildspec and dockerfile generation (#1279) Signed-off-by: Abhinav Pradeep --- .../common_spec/pypi_spec.py | 30 +++- .../dockerfile/pypi_dockerfile_output.py | 161 ++++++++++++++++-- .../package_registry/pypi_registry.py | 46 +++++ .../test_pypi_dockerfile_output.ambr | 29 +++- .../expected_dockerfile.buildspec | 29 +++- .../expected_default.buildspec | 3 +- .../expected_dockerfile.buildspec | 27 ++- .../pypi_toga/expected_default.buildspec | 3 +- .../pypi_toga/expected_dockerfile.buildspec | 29 +++- .../expected_default.buildspec | 2 +- 10 files changed, 313 insertions(+), 46 deletions(-) diff --git a/src/macaron/build_spec_generator/common_spec/pypi_spec.py b/src/macaron/build_spec_generator/common_spec/pypi_spec.py index 097648214..328481f45 100644 --- a/src/macaron/build_spec_generator/common_spec/pypi_spec.py +++ b/src/macaron/build_spec_generator/common_spec/pypi_spec.py @@ -112,8 +112,9 @@ def resolve_fields(self, purl: PackageURL) -> None: parsed_build_requires: dict[str, str] = {} sdist_build_requires: dict[str, str] = {} python_version_set: set[str] = set() - wheel_name_python_version_list: list[str] = [] + wheel_name_python_version_set: set[str] = set() wheel_name_platforms: set[str] = set() + dependency_python_version_set: set[str] = set() # Precautionary fallback to default version chronologically_likeliest_version: str = defaults.get("heuristic.pypi", "default_setuptools") @@ -128,6 +129,8 @@ def resolve_fields(self, purl: PackageURL) -> None: if py_version := json_extract(release, ["requires_python"], str): python_version_set.add(py_version.replace(" ", "")) + logger.debug("From package JSON inferred Python constraints: %s", python_version_set) + self.data["has_binaries"] = not pypi_package_json.has_pure_wheel() if self.data["has_binaries"]: @@ -162,9 +165,13 @@ def resolve_fields(self, purl: PackageURL) -> None: logger.debug(pypi_package_json.wheel_filename) _, _, _, tags = parse_wheel_filename(pypi_package_json.wheel_filename) for tag in tags: - wheel_name_python_version_list.append(tag.interpreter) + wheel_name_python_version_set.add(tag.interpreter) wheel_name_platforms.add(tag.platform) - 
logger.debug(python_version_set) + if wheel_name_python_version_set: + logger.debug( + "From wheel name inferred Python constraints: %s", wheel_name_python_version_set + ) + python_version_set.update(wheel_name_python_version_set) except InvalidWheelFilename: logger.debug("Could not parse wheel file name to extract version") except WheelTagError: @@ -234,8 +241,6 @@ def resolve_fields(self, purl: PackageURL) -> None: if requirement_name not in parsed_build_requires: parsed_build_requires[requirement_name] = specifier - self.data["language_version"] = list(python_version_set) or wheel_name_python_version_list - # If we were not able to find any build and backends, use the default setuptools. if not parsed_build_requires: parsed_build_requires["setuptools"] = "==" + defaults.get("heuristic.pypi", "default_setuptools") @@ -243,6 +248,21 @@ def resolve_fields(self, purl: PackageURL) -> None: build_backends_set.add("setuptools.build_meta") logger.debug("Combined build-requires: %s", parsed_build_requires) + + for package, constraint in parsed_build_requires.items(): + package_requirement = package + constraint + python_version_constraints = registry.get_python_requires_for_package_requirement(package_requirement) + if python_version_constraints: + dependency_python_version_set.add(python_version_constraints) + + # We will prefer to use Python version constraints from the package's + # dependencies. In the case that such inference was unsuccessful, we default + # to the Python version constraints inferred from other sources. + if dependency_python_version_set: + self.data["language_version"] = sorted(dependency_python_version_set) + else: + self.data["language_version"] = sorted(python_version_set) + self.data["build_requires"] = parsed_build_requires self.data["build_backends"] = list(build_backends_set) # We do not generate a build command for non-pure packages diff --git a/src/macaron/build_spec_generator/dockerfile/pypi_dockerfile_output.py b/src/macaron/build_spec_generator/dockerfile/pypi_dockerfile_output.py index adb956346..87e5a1d0d 100644 --- a/src/macaron/build_spec_generator/dockerfile/pypi_dockerfile_output.py +++ b/src/macaron/build_spec_generator/dockerfile/pypi_dockerfile_output.py @@ -7,11 +7,13 @@ import re from textwrap import dedent +from bs4 import BeautifulSoup, FeatureNotFound from packaging.specifiers import InvalidSpecifier, SpecifierSet from packaging.version import InvalidVersion, Version from macaron.build_spec_generator.common_spec.base_spec import BaseBuildSpecDict from macaron.errors import GenerateBuildSpecError +from macaron.util import send_get_http_raw logger: logging.Logger = logging.getLogger(__name__) @@ -36,9 +38,18 @@ def gen_dockerfile(buildspec: BaseBuildSpecDict) -> str: """ if buildspec["has_binaries"]: raise GenerateBuildSpecError("We currently do not support generating a dockerfile for non-pure Python packages") - language_version: str | None = pick_specific_version(buildspec) + language_version: str | None = pick_specific_version(buildspec["language_version"]) if language_version is None: raise GenerateBuildSpecError("Could not derive specific interpreter version") + try: + version = Version(language_version) + except InvalidVersion as error: + logger.debug("Ran into issue converting %s to a version: %s", language_version, error) + raise GenerateBuildSpecError("Derived interpreter version could not be parsed") from error + if not buildspec["build_tools"]: + raise GenerateBuildSpecError("Cannot generate dockerfile when build tool is unknown") + if 
not buildspec["build_commands"]:
+        raise GenerateBuildSpecError("Cannot generate dockerfile when build command is unknown")
     backend_install_commands: str = " && ".join(build_backend_commands(buildspec))
     build_tool_install: str = ""
     if (
@@ -51,6 +62,12 @@ def gen_dockerfile(buildspec: BaseBuildSpecDict) -> str:
         build_tool_install = (
             f"pip install {buildspec['build_tools'][0]} && if test -f \"flit.ini\"; then python -m flit.tomlify; fi && "
         )
+    modern_build_command = build_tool_install + " ".join(x for x in buildspec["build_commands"][0])
+    legacy_build_command = (
+        'if test -f "setup.py"; then pip install wheel && python setup.py bdist_wheel; '
+        "else python -m build --wheel -n; fi"
+    )
+
     dockerfile_content = f"""
     #syntax=docker/dockerfile:1.10
     FROM oraclelinux:9
@@ -73,13 +90,22 @@ def gen_dockerfile(buildspec: BaseBuildSpecDict) -> str:
         gcc-c++ gdb lzma glibc-devel libstdc++-devel openssl-devel \\
         readline-devel zlib-devel libzstd-devel libffi-devel bzip2-devel \\
         xz-devel sqlite sqlite-devel sqlite-libs libuuid-devel gdbm-libs \\
-        perf expat expat-devel mpdecimal python3-pip
+        perf expat expat-devel mpdecimal python3-pip \\
+        perl perl-File-Compare
+
+    {openssl_install_commands(version)}
+
+    ENV LD_LIBRARY_PATH=/opt/openssl/lib
+    ENV CPPFLAGS=-I/opt/openssl/include
+    ENV LDFLAGS=-L/opt/openssl/lib

     # Build interpreter and create venv
     RUN <<EOF
@@ ... @@ def gen_dockerfile(buildspec: BaseBuildSpecDict) -> str:
     EOF

     # Run the build
-    RUN {"source /deps/bin/activate && " + build_tool_install + " ".join(x for x in buildspec["build_commands"][0])}
+    RUN source /deps/bin/activate && {modern_build_command if version in SpecifierSet(">=3.6") else legacy_build_command}
     """
     return dedent(dockerfile_content)


-def pick_specific_version(buildspec: BaseBuildSpecDict) -> str | None:
+def openssl_install_commands(version: Version) -> str:
+    """Return the appropriate OpenSSL install commands for a given CPython version.
+
+    Parameters
+    ----------
+    version: Version
+        The CPython version we are trying to build.
+
+    Returns
+    -------
+    str
+        The install commands for the corresponding OpenSSL version.
+    """
+    # As per https://peps.python.org/pep-0644, Python >= 3.10 requires at least OpenSSL 1.1.1,
+    # and 3.6 to 3.9 can be compiled with OpenSSL 1.1.1. Therefore, we compile as below:
+    if version in SpecifierSet(">=3.6"):
+        openssl_version = "1.1.1w"
+        source_url = "https://www.openssl.org/source/old/1.1.1/openssl-1.1.1w.tar.gz"
+    # From the same document, "Python versions 3.6 to 3.9 are compatible with OpenSSL 1.0.2,
+    # 1.1.0, and 1.1.1". As an attempt to generalize for any >= 3.3, we use OpenSSL 1.0.2.
+    else:
+        openssl_version = "1.0.2u"
+        source_url = "https://www.openssl.org/source/old/1.0.2/openssl-1.0.2u.tar.gz"
+
+    return f"""# Build OpenSSL {openssl_version}
+    RUN <<EOF
@@ ... @@
+def pick_specific_version(inferred_constraints: list[str]) -> str | None:
     """Find the latest python interpreter version satisfying inferred constraints.

     Parameters
     ----------
-    buildspec: BaseBuildSpecDict
-        The base build spec generated for the artifact.
+    inferred_constraints: list[str]
+        List of inferred Python version constraints.

     Returns
     -------
     str | None
         String in format major.minor.patch for the latest valid Python
         interpreter version, or None if no such version can be found.
+
+    Examples
+    --------
+    >>> pick_specific_version([">=3.0"])
+    '3.4.10'
+    >>> pick_specific_version([">=3.8"])
+    '3.8.20'
+    >>> pick_specific_version([">=3.0", "!=3.4", "!=3.3", "!=3.5"])
+    '3.6.15'
+    >>> pick_specific_version(["<=3.12"])
+    '3.4.10'
+    >>> pick_specific_version(["<=3.12", "==3.6"])
+    '3.6.15'
     """
-    # We can most smoothly rebuild Python 3.0.0 and above on OL
-    version_set = SpecifierSet(">=3.0.0")
-    for version in buildspec["language_version"]:
+    # We cannot create virtual environments for Python versions <= 3.3.0: the
+    # venv module did not exist before Python 3.3 and only bundles pip from
+    # Python 3.4 onwards.
+    version_set = SpecifierSet(">=3.4.0")
+    for version in inferred_constraints:
         try:
             version_set &= SpecifierSet(version)
         except InvalidSpecifier as error:
@@ -139,14 +214,14 @@ def pick_specific_version(buildspec: BaseBuildSpecDict) -> str | None:

     logger.debug(version_set)

-    # Now to get the latest acceptable one, we can step through all interpreter
+    # Now to get the earliest acceptable one, we can step through all interpreter
     # versions. For the most accurate result, we can query python.org for a
-    # list of all versions, but for now we can approximate by stepping down
-    # through every minor version from 3.14.0 to 3.0.0
-    for minor in range(14, -1, -1):
+    # list of all versions, but for now we can approximate by stepping up
+    # through every minor version from 3.3.0 to 3.14.0
+    for minor in range(3, 15, 1):
         try:
             if Version(f"3.{minor}.0") in version_set:
-                return f"3.{minor}.0"
+                return get_latest_cpython_patch(3, minor)
         except InvalidVersion as error:
             logger.debug("Ran into issue converting %s to a version: %s", minor, error)
     return None
@@ -197,6 +272,59 @@ def infer_interpreter_version(specifier: str) -> str | None:
     return None


+def get_latest_cpython_patch(major: int, minor: int) -> str:
+    """Given a major and minor interpreter version, return the latest CPython patch version.
+
+    Parameters
+    ----------
+    major: int
+        The major component of the version.
+    minor: int
+        The minor component of the version.
+
+    Returns
+    -------
+    str
+        The full major.minor.patch version string corresponding to the
+        latest patch release for the input major and minor version.
+    """
+    latest_patch: Version | None = None
+    # We build CPython from source, so query the index of source releases.
+    response = send_get_http_raw("https://www.python.org/ftp/python/")
+    if not response:
+        raise GenerateBuildSpecError("Failed to fetch index of CPython versions.")
+
+    html: str = ""
+    soup: BeautifulSoup | None = None
+
+    try:
+        html = response.content.decode("utf-8")
+        soup = BeautifulSoup(html, "html.parser")
+    except (UnicodeDecodeError, FeatureNotFound) as error:
+        raise GenerateBuildSpecError("Failed to parse index of CPython versions.") from error
+
+    # Versions can most reliably be found in anchor tags like:
+    # <a href="{Version}/">{Version}/</a>
+    for anchor in soup.find_all("a", href=True):
+        # Get the text enclosed in the anchor tag, stripping spaces.
+        text = anchor.get_text(strip=True)
+        sanitized_text = text.rstrip("/")
+        # Try to convert it to a version.
+        try:
+            parsed_version = Version(sanitized_text)
+            if parsed_version.major == major and parsed_version.minor == minor:
+                if latest_patch is None or parsed_version > latest_patch:
+                    latest_patch = parsed_version
+        except InvalidVersion:
+            # Try the next tag.
+            continue
+
+    if not latest_patch:
+        raise GenerateBuildSpecError(f"Failed to infer latest patch for CPython {major}.{minor}")
+
+    return str(latest_patch)
+
+
 def build_backend_commands(buildspec: BaseBuildSpecDict) -> list[str]:
     """Generate the installation commands for each inferred build backend.
@@ -214,7 +342,10 @@ def build_backend_commands(buildspec: BaseBuildSpecDict) -> list[str]:
         return []
     commands: list[str] = []
     for backend, version_constraint in buildspec["build_requires"].items():
-        commands.append(f'/deps/bin/pip install "{backend}{version_constraint}"')
+        if backend == "setuptools":
+            commands.append("/deps/bin/pip install --upgrade setuptools")
+        else:
+            commands.append(f'/deps/bin/pip install "{backend}{version_constraint}"')
     # For a stable order on the install commands
     commands.sort()
     return commands
diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py
index e11c8260a..935f662c7 100644
--- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py
+++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py
@@ -22,7 +22,9 @@
 import requests
 from bs4 import BeautifulSoup, Tag
+from packaging.requirements import InvalidRequirement, Requirement
 from packaging.utils import InvalidWheelFilename, parse_wheel_filename
+from packaging.version import InvalidVersion, Version

 from macaron.config.defaults import defaults
 from macaron.errors import ConfigurationError, InvalidHTTPResponseError, SourceCodeError, WheelTagError
@@ -540,6 +542,50 @@ def get_matching_setuptools_version(self, package_release_datetime: datetime) ->
         # Return default just in case.
         return defaults.get("heuristic.pypi", "default_setuptools")

+    def get_python_requires_for_package_requirement(self, package_requirement: str) -> str | None:
+        """Return the Python version constraint string for the earliest version of the package satisfying package_requirement.
+
+        Parameters
+        ----------
+        package_requirement: str
+            pip-style requirement string.
+
+        Returns
+        -------
+        str | None
+            Corresponding Python version constraint string.
+        """
+        try:
+            parsed_requirement = Requirement(package_requirement)
+            endpoint = urllib.parse.urljoin(self.registry_url, f"pypi/{parsed_requirement.name}/json")
+            json = self.download_package_json(endpoint)
+            releases = json_extract(json, ["releases"], dict)
+            if releases:
+                # Find the smallest release version satisfying the requirement's specifier.
+                version_tuples: list[tuple[str, Version]] = []
+                for version in releases.keys():
+                    try:
+                        version_name = str(version)
+                        parsed_version = Version(version_name)
+                        if parsed_version in parsed_requirement.specifier:
+                            version_tuple = (version_name, parsed_version)
+                            version_tuples.append(version_tuple)
+                    except InvalidVersion:
+                        continue
+                if not version_tuples:
+                    return None
+                lowest_satisfying_version = min(version_tuples, key=lambda version_tuple: version_tuple[1])
+                release_info = releases[lowest_satisfying_version[0]]
+                if isinstance(release_info, list) and release_info:
+                    release = release_info[0]
+                    if isinstance(release, dict):
+                        constraint_specification = release.get("requires_python")
+                        if isinstance(constraint_specification, str):
+                            return constraint_specification
+            return None
+        except InvalidRequirement:
+            return None
+
     @staticmethod
     def extract_attestation(attestation_data: dict) -> dict | None:
         """Extract the first attestation file from a PyPI attestation response.
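
A minimal sketch of the release-selection logic that the new get_python_requires_for_package_requirement helper implements, written against a hand-rolled stand-in for the PyPI JSON API's "releases" mapping (the function name and sample data below are illustrative, not part of the patch):

    from packaging.requirements import Requirement
    from packaging.version import InvalidVersion, Version

    def lowest_matching_requires_python(requirement: str, releases: dict) -> str | None:
        """Pick the lowest release satisfying a pip-style requirement; return its requires_python."""
        req = Requirement(requirement)
        candidates = []
        for name in releases:
            try:
                parsed = Version(name)
            except InvalidVersion:
                continue  # Skip release names that are not valid PEP 440 versions.
            if parsed in req.specifier:
                candidates.append((name, parsed))
        if not candidates:
            return None
        lowest_name, _ = min(candidates, key=lambda pair: pair[1])
        files = releases[lowest_name]
        if isinstance(files, list) and files and isinstance(files[0], dict):
            value = files[0].get("requires_python")
            if isinstance(value, str):
                return value
        return None

    # Shaped like the "releases" field of a PyPI JSON API response.
    releases = {
        "3.4": [{"requires_python": ">=3.6"}],
        "3.9.0": [{"requires_python": ">=3.8"}],
    }
    print(lowest_matching_requires_python("flit_core>=3.4", releases))  # ">=3.6"

The real helper additionally downloads the JSON from the registry endpoint and swallows InvalidRequirement by returning None, so callers can pass it unvalidated requirement strings.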
diff --git a/tests/build_spec_generator/dockerfile/__snapshots__/test_pypi_dockerfile_output.ambr b/tests/build_spec_generator/dockerfile/__snapshots__/test_pypi_dockerfile_output.ambr index 696ee6f8d..655628572 100644 --- a/tests/build_spec_generator/dockerfile/__snapshots__/test_pypi_dockerfile_output.ambr +++ b/tests/build_spec_generator/dockerfile/__snapshots__/test_pypi_dockerfile_output.ambr @@ -13,8 +13,8 @@ # Download and unzip interpreter RUN <=3.10" + ">=3.6", + ">=3.8" ], "ecosystem": "pypi", "purl": "pkg:pypi/markdown-it-py@4.0.0", diff --git a/tests/integration/cases/pypi_markdown-it-py/expected_dockerfile.buildspec b/tests/integration/cases/pypi_markdown-it-py/expected_dockerfile.buildspec index 981619196..e4133eb2c 100644 --- a/tests/integration/cases/pypi_markdown-it-py/expected_dockerfile.buildspec +++ b/tests/integration/cases/pypi_markdown-it-py/expected_dockerfile.buildspec @@ -10,8 +10,8 @@ RUN dnf -y install gcc make # Download and unzip interpreter RUN <=3.8", ">=3.9" ], "ecosystem": "pypi", diff --git a/tests/integration/cases/pypi_toga/expected_dockerfile.buildspec b/tests/integration/cases/pypi_toga/expected_dockerfile.buildspec index 47e1e012a..d50340d8b 100644 --- a/tests/integration/cases/pypi_toga/expected_dockerfile.buildspec +++ b/tests/integration/cases/pypi_toga/expected_dockerfile.buildspec @@ -10,8 +10,8 @@ RUN dnf -y install gcc make # Download and unzip interpreter RUN <=3.10" + "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" ], "ecosystem": "pypi", "purl": "pkg:pypi/tree-sitter@0.25.2", From fe4a048b18e327b7d01ae5dc05b7554715fc90ba Mon Sep 17 00:00:00 2001 From: Abhinav Pradeep Date: Wed, 18 Feb 2026 15:47:54 +1000 Subject: [PATCH 18/20] feat: validate buildspec dockerfile (#1280) Signed-off-by: Abhinav Pradeep --- .../common_spec/base_spec.py | 3 ++ .../common_spec/pypi_spec.py | 5 +++ .../dockerfile/pypi_dockerfile_output.py | 38 ++++++++++++++++++- .../package_registry/pypi_registry.py | 8 ++++ .../test_pypi_dockerfile_output.ambr | 26 ++++++++++++- .../dockerfile/test_pypi_dockerfile_output.py | 10 +++++ .../expected_default.buildspec | 10 ++++- .../expected_dockerfile.buildspec | 26 ++++++++++++- .../expected_default.buildspec | 10 ++++- .../expected_dockerfile.buildspec | 26 ++++++++++++- .../pypi_toga/expected_default.buildspec | 10 ++++- .../pypi_toga/expected_dockerfile.buildspec | 26 ++++++++++++- .../expected_default.buildspec | 5 ++- .../cases/pypi_tree-sitter/test.yaml | 2 +- 14 files changed, 195 insertions(+), 10 deletions(-) diff --git a/src/macaron/build_spec_generator/common_spec/base_spec.py b/src/macaron/build_spec_generator/common_spec/base_spec.py index 698a0b948..6477801fd 100644 --- a/src/macaron/build_spec_generator/common_spec/base_spec.py +++ b/src/macaron/build_spec_generator/common_spec/base_spec.py @@ -84,6 +84,9 @@ class BaseBuildSpecDict(TypedDict, total=False): #: Flag to indicate if the artifact includes binaries. has_binaries: NotRequired[bool] + #: The artifacts that were analyzed in generating the build specification. 
+ upstream_artifacts: dict[str, list[str]] + class BaseBuildSpec(ABC): """Abstract base class for build specification behavior and field resolution.""" diff --git a/src/macaron/build_spec_generator/common_spec/pypi_spec.py b/src/macaron/build_spec_generator/common_spec/pypi_spec.py index 328481f45..ee67578c9 100644 --- a/src/macaron/build_spec_generator/common_spec/pypi_spec.py +++ b/src/macaron/build_spec_generator/common_spec/pypi_spec.py @@ -106,6 +106,7 @@ def resolve_fields(self, purl: PackageURL) -> None: metadata=[], ) + upstream_artifacts: dict[str, list[str]] = {} pypi_package_json = pypi_registry.find_or_create_pypi_asset(purl.name, purl.version, registry_info) patched_build_commands: list[list[str]] = [] build_backends_set: set[str] = set() @@ -141,6 +142,7 @@ def resolve_fields(self, purl: PackageURL) -> None: try: # The wheel function handles downloading binaries in the case that we cannot find a pure wheel. with pypi_package_json.wheel(download_binaries=self.data["has_binaries"]): + upstream_artifacts["wheels"] = pypi_package_json.wheel_urls logger.debug("Wheel at %s", pypi_package_json.wheel_path) # Should only have .dist-info directory. logger.debug("It has directories %s", ",".join(os.listdir(pypi_package_json.wheel_path))) @@ -184,6 +186,8 @@ def resolve_fields(self, purl: PackageURL) -> None: try: with pypi_package_json.sourcecode(): + upstream_artifacts["sdist"] = [pypi_package_json.sdist_url] + logger.debug("sdist url at %s", upstream_artifacts["sdist"]) try: # Get the build time requirements from ["build-system", "requires"] pyproject_content = pypi_package_json.get_sourcecode_file_contents("pyproject.toml") @@ -269,6 +273,7 @@ def resolve_fields(self, purl: PackageURL) -> None: if not self.data["has_binaries"]: patched_build_commands = self.get_default_build_commands(self.data["build_tools"]) self.data["build_commands"] = patched_build_commands + self.data["upstream_artifacts"] = upstream_artifacts def add_parsed_requirement(self, build_requirements: dict[str, str], requirement: str) -> None: """ diff --git a/src/macaron/build_spec_generator/dockerfile/pypi_dockerfile_output.py b/src/macaron/build_spec_generator/dockerfile/pypi_dockerfile_output.py index 87e5a1d0d..67d1c6308 100644 --- a/src/macaron/build_spec_generator/dockerfile/pypi_dockerfile_output.py +++ b/src/macaron/build_spec_generator/dockerfile/pypi_dockerfile_output.py @@ -62,18 +62,30 @@ def gen_dockerfile(buildspec: BaseBuildSpecDict) -> str: build_tool_install = ( f"pip install {buildspec['build_tools'][0]} && if test -f \"flit.ini\"; then python -m flit.tomlify; fi && " ) + modern_build_command = build_tool_install + " ".join(x for x in buildspec["build_commands"][0]) legacy_build_command = ( 'if test -f "setup.py"; then pip install wheel && python setup.py bdist_wheel; ' "else python -m build --wheel -n; fi" ) + wheel_url: str = "" + wheel_name: str = "" + + wheel_urls = buildspec["upstream_artifacts"]["wheels"] + # We currently only look for the pure wheel, if it exists + if wheel_urls: + wheel_url = list(wheel_urls)[0] + wheel_name = wheel_url.rsplit("/", 1)[-1] + else: + logger.debug("We could not find an upstream artifact, and therefore we cannot run validation") + dockerfile_content = f""" #syntax=docker/dockerfile:1.10 FROM oraclelinux:9 # Install core tools - RUN dnf -y install which wget tar git + RUN dnf -y install which wget tar unzip git # Install compiler and make RUN dnf -y install gcc make @@ -127,6 +139,30 @@ def gen_dockerfile(buildspec: BaseBuildSpecDict) -> str: # Run the build RUN 
source /deps/bin/activate && {modern_build_command if version in SpecifierSet(">=3.6") else legacy_build_command} + + # Validate script + RUN cat <<'EOF' >/validate + [ -n "{wheel_url}" ] || {{ echo "No upstream artifact to validate against."; exit 1; }} + # Capture artifacts generated + WHEELS=(/src/dist/*.whl) + # Ensure we only have one artifact + [ ${{#WHEELS[@]}} -eq 1 ] || {{ echo "Unexpected artifacts produced!"; exit 1; }} + # BUILT_WHEEL is the artifact we built + BUILT_WHEEL=${{WHEELS[0]}} + # Ensure the artifact produced is not the literal returned by the glob + [ -e $BUILT_WHEEL ] || {{ echo "No wheels found!"; exit 1; }} + # Download the wheel + wget -q {wheel_url} + # Compare wheel names + [ $(basename $BUILT_WHEEL) == "{wheel_name}" ] || {{ echo "Wheel name does not match!"; exit 1; }} + # Compare file tree + (unzip -Z1 $BUILT_WHEEL | grep -v '\\.dist-info' | sort) > built.tree + (unzip -Z1 "{wheel_name}" | grep -v '\\.dist-info' | sort ) > pypi_artifact.tree + diff -u built.tree pypi_artifact.tree || {{ echo "File trees do not match!"; exit 1; }} + echo "Success!" + EOF + + ENTRYPOINT ["/bin/bash","/validate"] """ return dedent(dockerfile_content) diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py index 935f662c7..9f988ed80 100644 --- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py @@ -704,6 +704,12 @@ class PyPIPackageJsonAsset: #: The source code temporary location name. package_sourcecode_path: str = field(init=False) + #: URL of the sdist file. + sdist_url: str = field(init=False) + + #: URL of the wheel file. + wheel_urls: list[str] = field(init=False) + #: The wheel temporary location name. 
wheel_path: str = field(init=False) @@ -832,6 +838,7 @@ def get_sourcecode_url(self, package_type: str = "sdist") -> str | None: fragment="", ).geturl() logger.debug("Found source URL: %s", configured_source_url) + self.sdist_url = configured_source_url return configured_source_url return None @@ -892,6 +899,7 @@ def get_wheel_url(self, tag: str = "none-any") -> str | None: fragment="", ).geturl() logger.debug("Found wheel URL: %s", configured_wheel_url) + self.wheel_urls = [configured_wheel_url] return configured_wheel_url return None diff --git a/tests/build_spec_generator/dockerfile/__snapshots__/test_pypi_dockerfile_output.ambr b/tests/build_spec_generator/dockerfile/__snapshots__/test_pypi_dockerfile_output.ambr index 655628572..8ff65b0da 100644 --- a/tests/build_spec_generator/dockerfile/__snapshots__/test_pypi_dockerfile_output.ambr +++ b/tests/build_spec_generator/dockerfile/__snapshots__/test_pypi_dockerfile_output.ambr @@ -6,7 +6,7 @@ FROM oraclelinux:9 # Install core tools - RUN dnf -y install which wget tar git + RUN dnf -y install which wget tar unzip git # Install compiler and make RUN dnf -y install gcc make @@ -69,5 +69,29 @@ # Run the build RUN source /deps/bin/activate && python -m build + # Validate script + RUN cat <<'EOF' >/validate + [ -n "https://files.pythonhosted.org/packages/96/c5/1e741d26306c42e2bf6ab740b2202872727e0f606033c9dd713f8b93f5a8/cachetools-6.2.1-py3-none-any.whl" ] || { echo "No upstream artifact to validate against."; exit 1; } + # Capture artifacts generated + WHEELS=(/src/dist/*.whl) + # Ensure we only have one artifact + [ ${#WHEELS[@]} -eq 1 ] || { echo "Unexpected artifacts produced!"; exit 1; } + # BUILT_WHEEL is the artifact we built + BUILT_WHEEL=${WHEELS[0]} + # Ensure the artifact produced is not the literal returned by the glob + [ -e $BUILT_WHEEL ] || { echo "No wheels found!"; exit 1; } + # Download the wheel + wget -q https://files.pythonhosted.org/packages/96/c5/1e741d26306c42e2bf6ab740b2202872727e0f606033c9dd713f8b93f5a8/cachetools-6.2.1-py3-none-any.whl + # Compare wheel names + [ $(basename $BUILT_WHEEL) == "cachetools-6.2.1-py3-none-any.whl" ] || { echo "Wheel name does not match!"; exit 1; } + # Compare file tree + (unzip -Z1 $BUILT_WHEEL | grep -v '\.dist-info' | sort) > built.tree + (unzip -Z1 "cachetools-6.2.1-py3-none-any.whl" | grep -v '\.dist-info' | sort ) > pypi_artifact.tree + diff -u built.tree pypi_artifact.tree || { echo "File trees do not match!"; exit 1; } + echo "Success!" 
+ EOF + + ENTRYPOINT ["/bin/bash","/validate"] + ''' # --- diff --git a/tests/build_spec_generator/dockerfile/test_pypi_dockerfile_output.py b/tests/build_spec_generator/dockerfile/test_pypi_dockerfile_output.py index c8d4d8882..4c8902325 100644 --- a/tests/build_spec_generator/dockerfile/test_pypi_dockerfile_output.py +++ b/tests/build_spec_generator/dockerfile/test_pypi_dockerfile_output.py @@ -32,6 +32,16 @@ def fixture_base_build_spec() -> BaseBuildSpecDict: "build_commands": [["python", "-m", "build"]], "build_requires": {"setuptools": "==80.9.0", "wheel": ""}, "build_backends": ["setuptools.build_meta"], + "upstream_artifacts": { + "wheels": [ + "https://files.pythonhosted.org/packages/96/c5/" + "1e741d26306c42e2bf6ab740b2202872727e0f606033c9dd713f8b93f5a8/cachetools-6.2.1-py3-none-any.whl" + ], + "sdist": [ + "https://files.pythonhosted.org/packages/cc/7e/" + "b975b5814bd36faf009faebe22c1072a1fa1168db34d285ef0ba071ad78c/cachetools-6.2.1.tar.gz" + ], + }, } ) diff --git a/tests/integration/cases/pypi_cachetools/expected_default.buildspec b/tests/integration/cases/pypi_cachetools/expected_default.buildspec index 2a05c0e95..87859fbd4 100644 --- a/tests/integration/cases/pypi_cachetools/expected_default.buildspec +++ b/tests/integration/cases/pypi_cachetools/expected_default.buildspec @@ -31,5 +31,13 @@ }, "build_backends": [ "setuptools.build_meta" - ] + ], + "upstream_artifacts": { + "wheels": [ + "https://files.pythonhosted.org/packages/96/c5/1e741d26306c42e2bf6ab740b2202872727e0f606033c9dd713f8b93f5a8/cachetools-6.2.1-py3-none-any.whl" + ], + "sdist": [ + "https://files.pythonhosted.org/packages/cc/7e/b975b5814bd36faf009faebe22c1072a1fa1168db34d285ef0ba071ad78c/cachetools-6.2.1.tar.gz" + ] + } } diff --git a/tests/integration/cases/pypi_cachetools/expected_dockerfile.buildspec b/tests/integration/cases/pypi_cachetools/expected_dockerfile.buildspec index 254f0b56e..9fbfdddd3 100644 --- a/tests/integration/cases/pypi_cachetools/expected_dockerfile.buildspec +++ b/tests/integration/cases/pypi_cachetools/expected_dockerfile.buildspec @@ -3,7 +3,7 @@ FROM oraclelinux:9 # Install core tools -RUN dnf -y install which wget tar git +RUN dnf -y install which wget tar unzip git # Install compiler and make RUN dnf -y install gcc make @@ -65,3 +65,27 @@ EOF # Run the build RUN source /deps/bin/activate && python -m build --wheel -n + +# Validate script +RUN cat <<'EOF' >/validate + [ -n "https://files.pythonhosted.org/packages/96/c5/1e741d26306c42e2bf6ab740b2202872727e0f606033c9dd713f8b93f5a8/cachetools-6.2.1-py3-none-any.whl" ] || { echo "No upstream artifact to validate against."; exit 1; } + # Capture artifacts generated + WHEELS=(/src/dist/*.whl) + # Ensure we only have one artifact + [ ${#WHEELS[@]} -eq 1 ] || { echo "Unexpected artifacts produced!"; exit 1; } + # BUILT_WHEEL is the artifact we built + BUILT_WHEEL=${WHEELS[0]} + # Ensure the artifact produced is not the literal returned by the glob + [ -e $BUILT_WHEEL ] || { echo "No wheels found!"; exit 1; } + # Download the wheel + wget -q https://files.pythonhosted.org/packages/96/c5/1e741d26306c42e2bf6ab740b2202872727e0f606033c9dd713f8b93f5a8/cachetools-6.2.1-py3-none-any.whl + # Compare wheel names + [ $(basename $BUILT_WHEEL) == "cachetools-6.2.1-py3-none-any.whl" ] || { echo "Wheel name does not match!"; exit 1; } + # Compare file tree + (unzip -Z1 $BUILT_WHEEL | grep -v '\.dist-info' | sort) > built.tree + (unzip -Z1 "cachetools-6.2.1-py3-none-any.whl" | grep -v '\.dist-info' | sort ) > pypi_artifact.tree + diff -u built.tree 
pypi_artifact.tree || { echo "File trees do not match!"; exit 1; } + echo "Success!" +EOF + +ENTRYPOINT ["/bin/bash","/validate"] diff --git a/tests/integration/cases/pypi_markdown-it-py/expected_default.buildspec b/tests/integration/cases/pypi_markdown-it-py/expected_default.buildspec index 3fbb4fcbc..de0634640 100644 --- a/tests/integration/cases/pypi_markdown-it-py/expected_default.buildspec +++ b/tests/integration/cases/pypi_markdown-it-py/expected_default.buildspec @@ -29,5 +29,13 @@ }, "build_backends": [ "flit_core.buildapi" - ] + ], + "upstream_artifacts": { + "wheels": [ + "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl" + ], + "sdist": [ + "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz" + ] + } } diff --git a/tests/integration/cases/pypi_markdown-it-py/expected_dockerfile.buildspec b/tests/integration/cases/pypi_markdown-it-py/expected_dockerfile.buildspec index e4133eb2c..e6596fc1b 100644 --- a/tests/integration/cases/pypi_markdown-it-py/expected_dockerfile.buildspec +++ b/tests/integration/cases/pypi_markdown-it-py/expected_dockerfile.buildspec @@ -3,7 +3,7 @@ FROM oraclelinux:9 # Install core tools -RUN dnf -y install which wget tar git +RUN dnf -y install which wget tar unzip git # Install compiler and make RUN dnf -y install gcc make @@ -65,3 +65,27 @@ EOF # Run the build RUN source /deps/bin/activate && pip install flit && if test -f "flit.ini"; then python -m flit.tomlify; fi && flit build + +# Validate script +RUN cat <<'EOF' >/validate + [ -n "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl" ] || { echo "No upstream artifact to validate against."; exit 1; } + # Capture artifacts generated + WHEELS=(/src/dist/*.whl) + # Ensure we only have one artifact + [ ${#WHEELS[@]} -eq 1 ] || { echo "Unexpected artifacts produced!"; exit 1; } + # BUILT_WHEEL is the artifact we built + BUILT_WHEEL=${WHEELS[0]} + # Ensure the artifact produced is not the literal returned by the glob + [ -e $BUILT_WHEEL ] || { echo "No wheels found!"; exit 1; } + # Download the wheel + wget -q https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl + # Compare wheel names + [ $(basename $BUILT_WHEEL) == "markdown_it_py-4.0.0-py3-none-any.whl" ] || { echo "Wheel name does not match!"; exit 1; } + # Compare file tree + (unzip -Z1 $BUILT_WHEEL | grep -v '\.dist-info' | sort) > built.tree + (unzip -Z1 "markdown_it_py-4.0.0-py3-none-any.whl" | grep -v '\.dist-info' | sort ) > pypi_artifact.tree + diff -u built.tree pypi_artifact.tree || { echo "File trees do not match!"; exit 1; } + echo "Success!" 
+EOF + +ENTRYPOINT ["/bin/bash","/validate"] diff --git a/tests/integration/cases/pypi_toga/expected_default.buildspec b/tests/integration/cases/pypi_toga/expected_default.buildspec index 076503858..ac873e87f 100644 --- a/tests/integration/cases/pypi_toga/expected_default.buildspec +++ b/tests/integration/cases/pypi_toga/expected_default.buildspec @@ -33,5 +33,13 @@ }, "build_backends": [ "setuptools.build_meta" - ] + ], + "upstream_artifacts": { + "wheels": [ + "https://files.pythonhosted.org/packages/2b/1a/6a9c8230ad30e819f0965bbd596c736a03e16003d27b0363c632c84d4861/toga-0.5.1-py3-none-any.whl" + ], + "sdist": [ + "https://files.pythonhosted.org/packages/17/e7/0924150329474d61e9f40f8bba1056d640cba22438e05355924019111b46/toga-0.5.1.tar.gz" + ] + } } diff --git a/tests/integration/cases/pypi_toga/expected_dockerfile.buildspec b/tests/integration/cases/pypi_toga/expected_dockerfile.buildspec index d50340d8b..a8918d0ce 100644 --- a/tests/integration/cases/pypi_toga/expected_dockerfile.buildspec +++ b/tests/integration/cases/pypi_toga/expected_dockerfile.buildspec @@ -3,7 +3,7 @@ FROM oraclelinux:9 # Install core tools -RUN dnf -y install which wget tar git +RUN dnf -y install which wget tar unzip git # Install compiler and make RUN dnf -y install gcc make @@ -65,3 +65,27 @@ EOF # Run the build RUN source /deps/bin/activate && python -m build --wheel -n + +# Validate script +RUN cat <<'EOF' >/validate + [ -n "https://files.pythonhosted.org/packages/2b/1a/6a9c8230ad30e819f0965bbd596c736a03e16003d27b0363c632c84d4861/toga-0.5.1-py3-none-any.whl" ] || { echo "No upstream artifact to validate against."; exit 1; } + # Capture artifacts generated + WHEELS=(/src/dist/*.whl) + # Ensure we only have one artifact + [ ${#WHEELS[@]} -eq 1 ] || { echo "Unexpected artifacts produced!"; exit 1; } + # BUILT_WHEEL is the artifact we built + BUILT_WHEEL=${WHEELS[0]} + # Ensure the artifact produced is not the literal returned by the glob + [ -e $BUILT_WHEEL ] || { echo "No wheels found!"; exit 1; } + # Download the wheel + wget -q https://files.pythonhosted.org/packages/2b/1a/6a9c8230ad30e819f0965bbd596c736a03e16003d27b0363c632c84d4861/toga-0.5.1-py3-none-any.whl + # Compare wheel names + [ $(basename $BUILT_WHEEL) == "toga-0.5.1-py3-none-any.whl" ] || { echo "Wheel name does not match!"; exit 1; } + # Compare file tree + (unzip -Z1 $BUILT_WHEEL | grep -v '\.dist-info' | sort) > built.tree + (unzip -Z1 "toga-0.5.1-py3-none-any.whl" | grep -v '\.dist-info' | sort ) > pypi_artifact.tree + diff -u built.tree pypi_artifact.tree || { echo "File trees do not match!"; exit 1; } + echo "Success!" 
+EOF + +ENTRYPOINT ["/bin/bash","/validate"] diff --git a/tests/integration/cases/pypi_tree-sitter/expected_default.buildspec b/tests/integration/cases/pypi_tree-sitter/expected_default.buildspec index c0612c42d..2173ac78b 100644 --- a/tests/integration/cases/pypi_tree-sitter/expected_default.buildspec +++ b/tests/integration/cases/pypi_tree-sitter/expected_default.buildspec @@ -22,5 +22,8 @@ }, "build_backends": [ "setuptools.build_meta" - ] + ], + "upstream_artifacts": { + "sdist": ["https://files.pythonhosted.org/packages/66/7c/0350cfc47faadc0d3cf7d8237a4e34032b3014ddf4a12ded9933e1648b55/tree-sitter-0.25.2.tar.gz"] + } } diff --git a/tests/integration/cases/pypi_tree-sitter/test.yaml b/tests/integration/cases/pypi_tree-sitter/test.yaml index 13cf9d7d7..0b15a8bce 100644 --- a/tests/integration/cases/pypi_tree-sitter/test.yaml +++ b/tests/integration/cases/pypi_tree-sitter/test.yaml @@ -33,6 +33,6 @@ steps: options: command_args: - -purl - - pkg:pypi/markdown-it-py@0.25.2 + - pkg:pypi/tree-sitter@0.25.2 - --output-format - dockerfile From cbf03c7194c3a8b32e11753eb49e88184b1ff964 Mon Sep 17 00:00:00 2001 From: Abhinav Pradeep Date: Thu, 19 Feb 2026 14:15:46 +1000 Subject: [PATCH 19/20] chore: improve pure wheel check (#1309) Signed-off-by: Abhinav Pradeep --- src/macaron/slsa_analyzer/package_registry/pypi_registry.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py index 9f988ed80..bce197890 100644 --- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py @@ -987,10 +987,7 @@ def has_pure_wheel(self) -> bool: try: _, _, _, tags = parse_wheel_filename(file_name) # Check if none and any are in the tags (i.e. the wheel is pure) - # Technically a wheel can have multiple tag sets. Our condition for - # a pure wheel is that it has only one tag set with abi "none" and - # platform "any" - if len(tags) == 1 and all(tag.abi == "none" and tag.platform == "any" for tag in tags): + if all(tag.abi == "none" and tag.platform == "any" for tag in tags): return True except InvalidWheelFilename: logger.debug("Could not parse wheel name.") From 41162501345aa2dab3c0e84bd4eb55f47b536391 Mon Sep 17 00:00:00 2001 From: Behnaz Hassanshahi Date: Thu, 19 Feb 2026 15:43:45 +1000 Subject: [PATCH 20/20] fix(gen-build-spec): remove the default -Dmaven.test.skip=true mvn option from the default spec (#1301) This PR removes the default -Dmaven.test.skip=true option from the default buildspec for Maven artifacts. 
Signed-off-by: behnazh-w --- .../tutorials/rebuild_third_party_artifacts.rst | 3 +-- .../build_spec_generator/build_command_patcher.py | 1 - .../build_spec_generator/common_spec/jdk_finder.py | 6 +++--- .../reproducible_central/reproducible_central.py | 12 ++++++++++-- .../test_reproducible_central.py | 8 +++++--- .../expected_macaron.buildspec | 2 +- .../expected_reproducible_central.buildspec | 2 +- .../computer-k8s/expected_default.buildspec | 1 - 8 files changed, 21 insertions(+), 14 deletions(-) diff --git a/docs/source/pages/tutorials/rebuild_third_party_artifacts.rst b/docs/source/pages/tutorials/rebuild_third_party_artifacts.rst index aaef86b8e..24bb9fd22 100644 --- a/docs/source/pages/tutorials/rebuild_third_party_artifacts.rst +++ b/docs/source/pages/tutorials/rebuild_third_party_artifacts.rst @@ -131,7 +131,6 @@ By default we generate the buildspec in JSON format as follows: [ "mvn", "-DskipTests=true", - "-Dmaven.test.skip=true", "-Dmaven.site.skip=true", "-Drat.skip=true", "-Dmaven.javadoc.skip=true", @@ -161,7 +160,7 @@ The resulting file will be saved as ``output/buildspec/maven/org_apache_hugegrap tool=mvn jdk=8 newline=lf - command="mvn -DskipTests=true -Dmaven.test.skip=true -Dmaven.site.skip=true -Drat.skip=true -Dmaven.javadoc.skip=true clean package" + command="mvn -Dmaven.test.skip=true -DskipTests=true -Dmaven.site.skip=true -Drat.skip=true -Dmaven.javadoc.skip=true clean package" buildinfo=target/computer-k8s-1.0.0.buildinfo You can now use this file to automate rebuilding artifacts, for example as part of the Reproducible Central infrastructure. diff --git a/src/macaron/build_spec_generator/build_command_patcher.py b/src/macaron/build_spec_generator/build_command_patcher.py index 224ec5715..cbfd32722 100644 --- a/src/macaron/build_spec_generator/build_command_patcher.py +++ b/src/macaron/build_spec_generator/build_command_patcher.py @@ -47,7 +47,6 @@ # To remove "-Dgpg.passphrase=$MACARON_UNKNOWN" "gpg.passphrase": None, "skipTests": "true", - "maven.test.skip": "true", "maven.site.skip": "true", "rat.skip": "true", "maven.javadoc.skip": "true", diff --git a/src/macaron/build_spec_generator/common_spec/jdk_finder.py b/src/macaron/build_spec_generator/common_spec/jdk_finder.py index 538a57b28..45d5b71dd 100644 --- a/src/macaron/build_spec_generator/common_spec/jdk_finder.py +++ b/src/macaron/build_spec_generator/common_spec/jdk_finder.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2025 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module includes the functions for obtaining the JDK version from a Java artifact.""" @@ -198,7 +198,7 @@ def find_jdk_version_from_remote_maven_repo_standalone( local_artifact_path, ) except InvalidHTTPResponseError as error: - logger.error("Failed why trying to download jar file. Error: %s", error) + logger.debug("Failed while trying to download jar file. Error: %s", error) return None except OSError as os_error: logger.critical("Critical %s", os_error) @@ -278,7 +278,7 @@ def find_jdk_version_from_remote_maven_repo_cache( local_artifact_path, ) except InvalidHTTPResponseError as error: - logger.error("Failed why trying to download jar file. Error: %s", error) + logger.debug("Failed while trying to download jar file. 
Error: %s", error) return None except OSError as os_error: logger.critical("Critical %s", os_error) diff --git a/src/macaron/build_spec_generator/reproducible_central/reproducible_central.py b/src/macaron/build_spec_generator/reproducible_central/reproducible_central.py index 5a6ec8389..c5f861c90 100644 --- a/src/macaron/build_spec_generator/reproducible_central/reproducible_central.py +++ b/src/macaron/build_spec_generator/reproducible_central/reproducible_central.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2025 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module contains the logic to generate a build spec in the Reproducible Central format.""" @@ -85,6 +85,14 @@ def gen_reproducible_central_build_spec(build_spec: BaseBuildSpecDict) -> str | if build_spec["group_id"] is None: raise GenerateBuildSpecError(f"Version is missing in PURL {build_spec['purl']}") + # Add -Dmaven.test.skip for Maven builds. + # TODO: Use the build tool associated with the build command once + # https://github.com/oracle/macaron/issues/1300 is closed. + adapted_build_commands = [ + cmd[:1] + ["-Dmaven.test.skip=true"] + cmd[1:] if ReproducibleCentralBuildTool.MAVEN in cmd[0] else cmd + for cmd in build_spec["build_commands"] + ] + template_format_values: dict[str, str] = { "macaron_version": importlib_metadata.version("macaron"), "group_id": build_spec["group_id"], @@ -96,7 +104,7 @@ def gen_reproducible_central_build_spec(build_spec: BaseBuildSpecDict) -> str | "newline": build_spec["newline"], "buildinfo": f"target/{build_spec['artifact_id']}-{build_spec['version']}.buildinfo", "jdk": build_spec["language_version"][0], - "command": compose_shell_commands(build_spec["build_commands"]), + "command": compose_shell_commands(adapted_build_commands), } return STRING_TEMPLATE.format_map(template_format_values) diff --git a/tests/build_spec_generator/reproducible_central/test_reproducible_central.py b/tests/build_spec_generator/reproducible_central/test_reproducible_central.py index f28b93f66..f95fefeb7 100644 --- a/tests/build_spec_generator/reproducible_central/test_reproducible_central.py +++ b/tests/build_spec_generator/reproducible_central/test_reproducible_central.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2025 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
"""This module contains tests for Reproducible Central build spec generation.""" @@ -40,7 +40,7 @@ def test_successful_build_spec(base_build_spec: BaseBuildSpecDict) -> None: assert "groupId=com.oracle" in content assert "artifactId=example-artifact" in content assert "tool=mvn" in content - assert 'command="mvn package"' in content + assert 'command="mvn -Dmaven.test.skip=true package"' in content def test_unsupported_build_tool(base_build_spec: BaseBuildSpecDict) -> None: @@ -80,6 +80,8 @@ def test_compose_shell_commands_integration(base_build_spec: BaseBuildSpecDict) """Test that the correct compose_shell_commands function is used.""" base_build_spec["build_commands"] = [["mvn", "clean", "package"], ["echo", "done"]] content = gen_reproducible_central_build_spec(base_build_spec) - expected_commands = compose_shell_commands([["mvn", "clean", "package"], ["echo", "done"]]) + expected_commands = compose_shell_commands( + [["mvn", "-Dmaven.test.skip=true", "clean", "package"], ["echo", "done"]] + ) assert content assert f'command="{expected_commands}"' in content diff --git a/tests/integration/cases/behnazh-w_example-maven-app_gen_rc_build_spec/expected_macaron.buildspec b/tests/integration/cases/behnazh-w_example-maven-app_gen_rc_build_spec/expected_macaron.buildspec index 2fe2e28c8..d5f9ec1ae 100644 --- a/tests/integration/cases/behnazh-w_example-maven-app_gen_rc_build_spec/expected_macaron.buildspec +++ b/tests/integration/cases/behnazh-w_example-maven-app_gen_rc_build_spec/expected_macaron.buildspec @@ -1 +1 @@ -{"macaron_version": "0.17.0", "group_id": "io.github.behnazh-w.demo", "artifact_id": "core", "version": "2.0.3", "git_repo": "https://github.com/behnazh-w/example-maven-provenance", "git_tag": "597be192fb50f03b86c34f1bfc494fea1eab264f", "newline": "lf", "language_version": "17", "ecosystem": "maven", "purl": "pkg:maven/io.github.behnazh-w.demo/core@2.0.3", "language": "java", "build_tool": "maven", "build_commands": [["./mvnw", "-DskipTests=true", "-Dmaven.test.skip=true", "-Dmaven.site.skip=true", "-Drat.skip=true", "-Dmaven.javadoc.skip=true", "clean", "package"]]} +{"macaron_version": "0.17.0", "group_id": "io.github.behnazh-w.demo", "artifact_id": "core", "version": "2.0.3", "git_repo": "https://github.com/behnazh-w/example-maven-provenance", "git_tag": "597be192fb50f03b86c34f1bfc494fea1eab264f", "newline": "lf", "language_version": "17", "ecosystem": "maven", "purl": "pkg:maven/io.github.behnazh-w.demo/core@2.0.3", "language": "java", "build_tool": "maven", "build_commands": [["./mvnw", "-DskipTests=true", "-Dmaven.site.skip=true", "-Drat.skip=true", "-Dmaven.javadoc.skip=true", "clean", "package"]]} diff --git a/tests/integration/cases/behnazh-w_example-maven-app_gen_rc_build_spec/expected_reproducible_central.buildspec b/tests/integration/cases/behnazh-w_example-maven-app_gen_rc_build_spec/expected_reproducible_central.buildspec index f1622f4e7..1586f4512 100644 --- a/tests/integration/cases/behnazh-w_example-maven-app_gen_rc_build_spec/expected_reproducible_central.buildspec +++ b/tests/integration/cases/behnazh-w_example-maven-app_gen_rc_build_spec/expected_reproducible_central.buildspec @@ -14,6 +14,6 @@ jdk=17 newline=lf -command="./mvnw -DskipTests=true -Dmaven.test.skip=true -Dmaven.site.skip=true -Drat.skip=true -Dmaven.javadoc.skip=true clean package" +command="./mvnw -Dmaven.test.skip=true -DskipTests=true -Dmaven.site.skip=true -Drat.skip=true -Dmaven.javadoc.skip=true clean package" buildinfo=target/core-2.0.3.buildinfo diff --git 
a/tests/integration/cases/org_apache_hugegraph/computer-k8s/expected_default.buildspec b/tests/integration/cases/org_apache_hugegraph/computer-k8s/expected_default.buildspec index 86325ad7f..05dbdb6f2 100644 --- a/tests/integration/cases/org_apache_hugegraph/computer-k8s/expected_default.buildspec +++ b/tests/integration/cases/org_apache_hugegraph/computer-k8s/expected_default.buildspec @@ -19,7 +19,6 @@ [ "mvn", "-DskipTests=true", - "-Dmaven.test.skip=true", "-Dmaven.site.skip=true", "-Drat.skip=true", "-Dmaven.javadoc.skip=true",
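
As a worked example of the command adaptation introduced in reproducible_central.py above, the following stand-alone sketch mirrors the list comprehension; the helper name is made up, and the literal "mvn" stands in for ReproducibleCentralBuildTool.MAVEN:

    def add_maven_test_skip(commands: list[list[str]]) -> list[list[str]]:
        """Insert -Dmaven.test.skip=true right after the executable of Maven commands only."""
        return [
            cmd[:1] + ["-Dmaven.test.skip=true"] + cmd[1:] if "mvn" in cmd[0] else cmd
            for cmd in commands
        ]

    print(add_maven_test_skip([["./mvnw", "clean", "package"], ["echo", "done"]]))
    # [['./mvnw', '-Dmaven.test.skip=true', 'clean', 'package'], ['echo', 'done']]

This keeps the flag out of the shared default buildspec while still skipping tests in the Reproducible Central output, matching the expected_reproducible_central.buildspec fixture above.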