diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 0a4a65c..fd7a3f9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,48 +1,153 @@ -#========================== STEPS USED IN WORKFLOWS ==================================== -.step_python_setup_install: &step_python_setup_install - - echo -e "\e[95m===== Setup Python" - - python --version ; pip --version - - python3 -m venv venv - - source venv/bin/activate +stages: + - build + - test + - notify -.step_install_pyrrha_test: &step_install_pyrrha_test - - echo -e "\e[95m===== Install Pyrrha with test extension" - - pip install '.[test]' +# Paths that should trigger test and build pipelines. +# Anchored here so test and build jobs stay in sync. +.source_paths: &source_paths + - tests/**/* + - src/**/* + - pyproject.toml + - .gitlab-ci.yml + - ci/ghidra/* -.step_configure_disassembler: &step_configure_disassembler - - if [[ ${DISASSEMBLER} == "ida" ]]; then - echo -e "\e[95m===== Configure IDA" && - mkdir -p ~/.idapro/ && - echo $KEY | base64 -d > ~/.idapro/$KEY_NAME && - echo $REG | base64 -d > ~/.idapro/ida.reg && - export IDA_LICENSE=keyfile=$KEY_NAME && - idapyswitch -a ; fi; +#======================== BUILD DOCKER IMAGE AND PUSH TO REGISTRY ====================== +build_image: + stage: build + image: docker:29-dind + tags: + - dind + rules: + # Release tags always build, regardless of which files changed. + - if: '$CI_COMMIT_TAG =~ /^v\d+\.\d+\.\d+$/' + # Branch builds only when source, tests, or packaging metadata changed. 
+ - if: '$CI_COMMIT_BRANCH == "main"' + changes: *source_paths + - if: '$CI_COMMIT_BRANCH == "dev"' + changes: *source_paths + parallel: + matrix: + - BACKEND: "ida" + VERSION: [91, 93] + LATEST: 93 + - BACKEND: "ghidra" + LATEST: "12.1" + VERSION: "12.0.4" + GHIDRA_SHA256: "c3b458661d69e26e203d739c0c82d143cc8a4a29d9e571f099c2cf4bda62a120" + DATE: "20260303" + - BACKEND: "ghidra" + LATEST: "12.1" + VERSION: "12.1" + GHIDRA_SHA256: "aa5cbcbbf48f41ca185fce900e19592f1ade4cd5994eb6e0ede468dac8a6f302" + DATE: "20260513" + variables: + DOCKER_IMAGE_NAME: $CI_REGISTRY_IMAGE/pyrrha-$BACKEND + DOCKER_HOST: unix:///var/run/docker.sock + DOCKER_TLS_CERTDIR: "" + before_script: + - echo "$CA_CERT" > /usr/local/share/ca-certificates/local-ca.pem + - update-ca-certificates + - | + dockerd-entrypoint.sh --host=unix:///var/run/docker.sock & + for i in $(seq 1 30); do + if docker info >/dev/null 2>&1; then + echo "Docker daemon is ready" + break + fi + echo "Waiting for docker daemon... ($i/30)" + sleep 2 + done + - echo "$CI_REGISTRY_PASSWORD" | docker login --username "$CI_REGISTRY_USER" --password-stdin "$CI_REGISTRY" + - apk add --no-cache bash + script: + # Resolve the pyrrha version component from the ref that triggered the pipeline. + # - tag v1.2.3 -> "1.2.3" + # - main branch -> "main" + # - dev branch -> "dev" + - | + if [[ -n "$CI_COMMIT_TAG" && "$CI_COMMIT_TAG" =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + PYRRHA_VERSION="${CI_COMMIT_TAG#v}" + REF_KIND="tag" + elif [[ "$CI_COMMIT_BRANCH" == "main" ]]; then + PYRRHA_VERSION="main" + REF_KIND="main" + elif [[ "$CI_COMMIT_BRANCH" == "dev" ]]; then + PYRRHA_VERSION="dev" + REF_KIND="dev" + else + echo "Unexpected ref (branch=$CI_COMMIT_BRANCH, tag=$CI_COMMIT_TAG) — aborting." 
+ exit 1 + fi + echo "Resolved PYRRHA_VERSION=$PYRRHA_VERSION (REF_KIND=$REF_KIND)" + + - | + if [ "$BACKEND" = "ghidra" ]; then + ci/ghidra/build.sh --version "$VERSION" --date "$DATE" --sha256 "$GHIDRA_SHA256" --name "$CI_REGISTRY_IMAGE/ci/$BACKEND" + fi + - | + if [ "$BACKEND" = "ghidra" ]; then + docker push "$CI_REGISTRY_IMAGE/ci/$BACKEND:$VERSION" + fi -.step_gen_artifacts: &step_gen_artifacts - - echo -e "\e[95m===== Generate artifacts" - - mkdir -p ${ARTIFACTS} - - cp -r tests ${ARTIFACTS} - - (cd ${ARTIFACTS} && pyrrha $MAPPER --db ${DB} --debug tests/test_fw ${MAPPER_OPTIONS}) - - ls ${ARTIFACTS} + # Primary image tag: ${VERSION}-${PYRRHA_VERSION} (backend version, then pyrrha version) + - PRIMARY_TAG="${VERSION}-${PYRRHA_VERSION}" + - echo "Building $DOCKER_IMAGE_NAME:$PRIMARY_TAG" + - | + docker build --pull \ + -t "$DOCKER_IMAGE_NAME:$PRIMARY_TAG" \ + -f ci/pyrrha/Dockerfile \ + --build-arg DISASS_IMAGE=$CI_REGISTRY_IMAGE/ci/${BACKEND} \ + --build-arg DISASS_IMAGE_VERSION=$VERSION \ + . + - docker push "$DOCKER_IMAGE_NAME:$PRIMARY_TAG" -.step_run_tests: &step_run_tests - - echo -e "\e[95m===== Tests" - - coverage run --source=${TEST_COVERAGE_SOURCE} -m pytest --junitxml=report.xml -vvv -x ${TEST_SUP_OPTIONS} ${TEST_PATH} - - coverage xml - - coverage report + # Additional floating tags, only for the LATEST backend version of this matrix row. 
+ # - main -> `latest` and `$VERSION` + # - dev -> `latest-dev` + # - tag -> `stable` + - | + if [[ "$VERSION" == "$LATEST" ]]; then + case "$REF_KIND" in + main) + for t in "latest" "$VERSION"; do + docker tag "$DOCKER_IMAGE_NAME:$PRIMARY_TAG" "$DOCKER_IMAGE_NAME:$t" + docker push "$DOCKER_IMAGE_NAME:$t" + done + ;; + dev) + docker tag "$DOCKER_IMAGE_NAME:$PRIMARY_TAG" "$DOCKER_IMAGE_NAME:latest-dev" + docker push "$DOCKER_IMAGE_NAME:latest-dev" + ;; + tag) + docker tag "$DOCKER_IMAGE_NAME:$PRIMARY_TAG" "$DOCKER_IMAGE_NAME:stable" + docker push "$DOCKER_IMAGE_NAME:stable" + ;; + esac + fi + after_script: + - docker logout $CI_REGISTRY #========================== OBJECTS TESTS ==================================== -test_data_structures: +test_data_structures: stage: test - before_script: - - *step_python_setup_install - - *step_install_pyrrha_test + # Only run tests when source, tests, or packaging metadata changed. + # Inherited via `extends` by jobs that do not define their own `rules` (test_fs-cg overrides them). + rules: + - if: '$CI_COMMIT_TAG =~ /^v\d+\.\d+\.\d+$/' + - changes: *source_paths + before_script: + - echo -e "\e[95m===== Install Pyrrha with test extension" + - pip install '.[test]' script: - - *step_run_tests + - echo -e "\e[95m===== Tests" + - coverage run --source=${TEST_COVERAGE_SOURCE} -m pytest --junitxml=report.xml -vvv -x ${TEST_SUP_OPTIONS} ${TEST_PATH} + - coverage xml + - coverage report image: python:latest variables: - TEST_COVERAGE_SOURCE: pyrrha_mapper.common.objects + TEST_COVERAGE_SOURCE: pyrrha_mapper.mappers.objects TEST_PATH: tests/test_filesystem_objects.py coverage: '/(?i)total.*? 
(100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/' artifacts: @@ -52,88 +157,132 @@ test_data_structures: coverage_format: cobertura path: coverage.xml +test_decomp_objects: + extends: + - test_data_structures + variables: + TEST_COVERAGE_SOURCE: pyrrha_mapper.mappers.decomp_objects + TEST_PATH: tests/test_decomp_objects.py + #========================== MAPPERS TESTS ==================================== -.run_pyrrha_test_artifacts: - stage: test - before_script: - - *step_python_setup_install - - *step_install_pyrrha_test - script: - - *step_gen_artifacts - - *step_run_tests - coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/' +test_fs: + extends: + - test_data_structures artifacts: name: db_$CI_JOB_NAME_SLUG - when: always paths: - - ${ARTIFACTS}/${DB}.srctrldb - - ${ARTIFACTS}/${DB}.srctrlprj - reports: - junit: report.xml - coverage_report: - coverage_format: cobertura - path: coverage.xml - variables: - ARTIFACTS: tmp/artifacts - -test_fs: - extends: - - .run_pyrrha_test_artifacts - image: python:latest + - test_artifacts/ + when: always variables: DB: fs MAPPER: fs - TEST_COVERAGE_SOURCE: pyrrha_mapper.common.filesystem_mapper,pyrrha_mapper.fs + TEST_COVERAGE_SOURCE: pyrrha_mapper.mappers.imports_mapper TEST_PATH: tests/test_cli.py::TestFSMapper - + PYTEST_ARTIFACTS_DIR: test_artifacts -.test_fs-cg: - extends: - - .run_pyrrha_test_artifacts - before_script: - - !reference [.run_pyrrha_test_artifacts, before_script] - - *step_configure_disassembler - image: - name: $CONTAINER_PATH/${DISASSEMBLER}:${VERSION} +test_fs-cg: + extends: + - test_fs + # Pull the image built in this pipeline when on main, dev, or a release tag + # (tagged as ${VERSION}-main, ${VERSION}-dev, or ${VERSION}-X.Y.Z for release tag vX.Y.Z). + # For any other branch (e.g. feature branches) no image is built, so we + # fall back to `latest` which always tracks the last successful main build. 
+ image: + name: $CI_REGISTRY_IMAGE/pyrrha-${BACKEND}:${DISASS_IMAGE_TAG} docker: user: user + before_script: + - echo -e "\e[95m===== Install Pyrrha with test extension" + - pip install '.[test]' variables: - DB: ${DISASSEMBLER}_${VERSION}_${EXPORTER} + DB: ${BACKEND}_${VERSION} MAPPER: fs-cg - MAPPER_OPTIONS: '--disassembler ${DISASSEMBLER} --exporter ${EXPORTER}' + MAPPER_OPTIONS: '--backend ${BACKEND}' - TEST_COVERAGE_SOURCE: pyrrha_mapper.common.filesystem_mapper,pyrrha_mapper.intercg + TEST_COVERAGE_SOURCE: pyrrha_mapper.mappers.intercg_bin_loader,pyrrha_mapper.mappers.intercg_mapper TEST_PATH: tests/test_cli.py::TestFsCgMapper TEST_SUP_OPTIONS: ${MAPPER_OPTIONS} - -test_fs-cg_ghidra: - extends: - - .test_fs-cg - variables: - DISASSEMBLER: ghidra + HEXRAYS_LICENSE: "${IDA_LICENSE}" + # Default: fall back to latest (last successful main build). + # Overridden per-ref by the rules below. + DISASS_IMAGE_TAG: "latest" + # Point pyghidra at the JDK as proper job variables so they are present in + # the environment of EVERY process in the job — including the + # `coverage run -m pytest` process and any multiprocessing workers spawned + # by the mapper (which use the "spawn" start method and re-exec Python). + # Shell `export`s in before_script are not guaranteed to reach those + # re-exec'd children, but GitLab job variables always are. + # /opt/java/openjdk is the temurin JDK path, confirmed present in all + # ghidra images. pyghidra reads JAVA_HOME_OVERRIDE first (launcher.py:202), + # bypassing LaunchSupport entirely. + JAVA_HOME: /opt/java/openjdk + JAVA_HOME_OVERRIDE: /opt/java/openjdk + rules: + - if: '$CI_COMMIT_TAG =~ /^v\d+\.\d+\.\d+$/' + variables: + DISASS_IMAGE_TAG: "${VERSION}-${CI_COMMIT_TAG#v}" # intended: strip the leading v. NOTE(review): GitLab CI variable expansion documents only $VAR, ${VAR} and %VAR%, not shell-style ${VAR#prefix}; confirm this resolves. Expected result, e.g. 
12.0.4-1.2.3 + - if: '$CI_COMMIT_BRANCH == "main"' + changes: *source_paths + variables: + DISASS_IMAGE_TAG: "${VERSION}-main" + - if: '$CI_COMMIT_BRANCH == "dev"' + changes: *source_paths + variables: + DISASS_IMAGE_TAG: "${VERSION}-dev" + - changes: *source_paths # any other branch: use latest parallel: matrix: - - VERSION: 11.1.2 - EXPORTER: binexport + - BACKEND: "ida" + VERSION: [91, 93] + - BACKEND: "ghidra" + VERSION: ["12.0.4"] -test_fs-cg_ida: +test_decomp: extends: - - .test_fs-cg + - test_fs-cg variables: - DISASSEMBLER: ida - parallel: - matrix: - - VERSION: 84 - EXPORTER: [quokka, binexport] - - VERSION: 91 - EXPORTER: quokka + DB: decomp_${BACKEND}_${VERSION} + MAPPER: decomp + MAPPER_OPTIONS: '--backend ${BACKEND}' + TEST_COVERAGE_SOURCE: pyrrha_mapper.mappers.decomp_mapper,pyrrha_mapper.mappers.decomp_objects + TEST_PATH: tests/test_cli.py::TestDecompMapper + TEST_SUP_OPTIONS: ${MAPPER_OPTIONS} + +#========================== TRIGGER INTERNAL DOC UPDATE ================================ +.trigger_docs_base: + stage: notify + trigger: + project: firmware-re/cartography/pyrrha-internal-documentation + branch: main + # strategy: depend # flip on to surface downstream failure on pyrrha + # CI. Off for now — a broken docs build shouldn't + # block pyrrha. 
+ variables: + UPSTREAM_PIPELINE_URL: $CI_PIPELINE_URL + +trigger_docs_main: + extends: .trigger_docs_base + variables: + UPSTREAM_REF: "main" + UPSTREAM_SHA: $CI_COMMIT_SHA + rules: + - if: $CI_COMMIT_BRANCH == "main" + when: on_success + +trigger_docs_dev: + extends: .trigger_docs_base + variables: + UPSTREAM_REF: "dev" + UPSTREAM_SHA: $CI_COMMIT_SHA + rules: + - if: $CI_COMMIT_BRANCH == "dev" + when: on_success + +trigger_docs_tag: + extends: .trigger_docs_base + variables: + UPSTREAM_REF: $CI_COMMIT_TAG + UPSTREAM_TAG: $CI_COMMIT_TAG + UPSTREAM_SHA: $CI_COMMIT_SHA rules: - - if: $VERSION == "84" - variables: - KEY: $IDA_KEY - KEY_NAME: ida.key - REG: $IDA84_REG - - if: $VERSION == "91" - variables: - KEY: $LICENSE - KEY_NAME: ida_license.hexlic - REG: $IDA_REG + - if: $CI_COMMIT_TAG =~ /^v\d+\.\d+/ + when: on_success \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 582e682..586cfbf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,29 @@ +## Unreleased + +### Features +- All mappers now share a single disassembler `--backend` value (`ida`/`ghidra`) implemented in one common place, replacing the previous per-mapper disassembler/exporter selection. +- Remove the `qbinary`/Quokka dependency: `fs-cg` and `decomp` now interact directly with the disassemblers (IDA, Ghidra), so Pyrrha can run on systems without Quokka. +- Add ELF SONAME support: binaries are indexed by their `DT_SONAME` so imports referencing a SONAME resolve even without a matching symlink. +- `decomp` mapper: full rework around a class-based object integrated with the common backend layer. +- `decomp` mapper: add a `-e/--export` option to dump the result as JSON, loadable through the new `ExportedDecompilation` object exposed by Pyrrha. +- Expose `Binary` image base and relocatable information on the internal representation. 
+- Improve the documentation (installation, quick summary, decomp mapper) and add unit tests for the `decomp` export model plus functional tests for the `decomp` mapper. + +### Fixes +- `intercg` mapper: various fixes around addresses and demangled names, missing Ghidra thunks, extended ignore list, and argument renaming. +- `fs-cg` mapper: avoid an infinite loop in trampoline resolution and add a real timeout to program loading. +- `fs`/`fs-cg` mappers: pass `load_binary` arguments through a `partial` mechanism and improve multiprocessing error handling. +- `decomp` mapper: fix the mapping run (`map` now reports success/failure, runs the decompilation and call-graph indexing phases, and records the binary node so functions get a valid parent). +- `decomp` mapper: fix call-graph source cross-references (call-site locations are looked up by callee address and no longer raise on the first reference). +- `decomp` mapper: fix command-line arguments and improve the decompilation script (correct `NamedTemporaryFile` usage, better IDA decompilation output). +- `decomp` mapper (IDA backend): use the `ida_domain` 0.5.0 pseudocode API (`get_pseudocode(func).to_text(...)`), fixing a `TypeError` that broke every IDA decompilation run. +- `decomp` mapper: skip imported functions during source and call-graph indexing (they have no decompiled body), removing spurious per-function error/warning logs. +- `cli`: keep an existing suffix in the DB path and annotate the `decomp` mapper variable with its base type to fix a type-checking error. + +### Internal +- Reorganize the repository into two submodules (`backend` and `mappers`) and rework the mappers so backend support lives in a single common place; remove unused modules and the `heimdallr`/disassembly-sync prototype. +- CI: build and test IDA and Ghidra Docker images, run the `decomp` export-model and functional tests, export test artifacts, and trigger builds only on relevant changes. 
+ ## v1.0.1—Improve exe-decomp mapper ### Features diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 1eb160f..0000000 --- a/Dockerfile +++ /dev/null @@ -1,38 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2023-2025 Quarkslab -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -FROM docker.io/library/python:3.11-slim -SHELL ["/bin/bash", "-c"] - -ENV PYRRHA_INSTALL_DIR=/tmp/pyrrha_install -ENV PYRRHA_WORKING_DIR=/tmp/pyrrha - -RUN mkdir -p $PYRRHA_INSTALL_DIR - -WORKDIR ${PYRRHA_INSTALL_DIR} - -RUN python3 -m pip install --no-cache-dir -U pip - -COPY src src/ -COPY pyproject.toml ./ -COPY README.md ./ - -RUN python3 -m pip install --no-cache-dir . && \ - rm -rf $PYRRHA_INSTALL_DIR - -WORKDIR ${PYRRHA_WORKING_DIR} - -ENTRYPOINT ["pyrrha"] diff --git a/README.md b/README.md index d441e1b..020f963 100644 --- a/README.md +++ b/README.md @@ -33,12 +33,11 @@ path to function. ## Installation -The installation is done in three parts: +The installation is done in two parts: -1. Install mapper external dependencies: IDA dissassembler (with the decompilation option for the `exe-decomp` mapper) and [`Quokka` IDA plugin](https://github.com/quarkslab/quokka/releases). 1. Install `Pyrrha` itself. 1. Install [`NumbatUI`](https://github.com/quarkslab/NumbatUI) (or [`Sourcetrail`](https://github.com/CoatiSoftware/Sourcetrail)) to be able to visualize Pyrrha's results. - +1. _(Optional)_ Install Ghidra or IDA if you want to use `fs-cg` or `decomp` mappers. 
> [!NOTE] > A quick start installation is available on [Pyrrha documentation](https://quarkslab.github.io/pyrrha/#installation). @@ -46,11 +45,11 @@ The installation is done in three parts: ## Usage The usage workflow is composed of two steps which allow you to separate DB creation and result visualization. + 1. Run Pyrrha to obtain NumbatUI compatible files (`*.srctrlprj` for the project file and `*.srctrldb` for the DB file). With the python package, you can just launch the command `pyrrha`. 2. Visualize your results with Sourcetrail/NumbatUI. - > [!NOTE] > The detailed documentation of each mapper is available in the [documentation](https://quarkslab.github.io/pyrrha/mappers/mappers/). diff --git a/ci/ghidra/Dockerfile b/ci/ghidra/Dockerfile index 69b5785..3e4e4c0 100644 --- a/ci/ghidra/Dockerfile +++ b/ci/ghidra/Dockerfile @@ -14,56 +14,118 @@ # See the License for the specific language governing permissions and # limitations under the License. -FROM openjdk:21-jdk-slim -SHELL ["/bin/bash", "-c"] +# ======================== Ghidra Download and Extraction ============================== +# Use a dedicated stage so that wget, unzip, and the zip archive itself are +# never committed to the final image layer. 
+FROM debian:bookworm-slim AS ghidra-download -# ======================== Ghidra Installation ================================= - -ARG GHIDRA_VERSION=11.1.2 -ARG GHIDRA_RELEASE_DATE=20240709 +ARG GHIDRA_VERSION=12.0.4 +ARG GHIDRA_RELEASE_DATE=20260303 +ARG GHIDRA_SHA256=c3b458661d69e26e203d739c0c82d143cc8a4a29d9e571f099c2cf4bda62a120 ARG GHIDRA_URL=https://github.com/NationalSecurityAgency/ghidra/releases/download/Ghidra_${GHIDRA_VERSION}_build/ghidra_${GHIDRA_VERSION}_PUBLIC_${GHIDRA_RELEASE_DATE}.zip -ENV GHIDRA_INSTALL_DIR=/opt/ghidra_${GHIDRA_VERSION}_PUBLIC RUN apt-get update && \ - DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \ + apt-get install --yes --no-install-recommends \ ca-certificates \ - libfreetype6 \ - libmagic1 \ - libpython3-dev \ - python3-minimal \ - python3-pip \ - python3-venv \ - python-is-python3 \ unzip \ wget \ - && rm -rf /var/lib/apt/lists/* + && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* + +# Download, verify checksum, extract, then discard the archive immediately. +# Also strip files not needed at runtime to reduce COPY --from size. +RUN wget --no-verbose "${GHIDRA_URL}" -O /tmp/ghidra.zip && \ + echo "${GHIDRA_SHA256} /tmp/ghidra.zip" | sha256sum --check --strict && \ + unzip -q /tmp/ghidra.zip -d /opt/ && \ + rm /tmp/ghidra.zip && \ + find /opt/ghidra_${GHIDRA_VERSION}_PUBLIC -name "*.bat" -delete && \ + rm -rf \ + /opt/ghidra_${GHIDRA_VERSION}_PUBLIC/docs \ + /opt/ghidra_${GHIDRA_VERSION}_PUBLIC/Extensions/Eclipse \ + /opt/ghidra_${GHIDRA_VERSION}_PUBLIC/licenses + +# ======================== Ghidra Installation and Runtime Image ======================= +# eclipse-temurin:21-jdk-noble provides JDK 21, required by Ghidra 12.0.4+. +# JPype1==1.5.2 (hard-pinned by pyghidra) is compatible with JDK 21 on Linux. +# +# Ubuntu 24.04 (Noble) is used instead of 22.04 (Jammy) because Noble ships +# openjdk-21-jdk in its default repos whereas Jammy does not. 
+# +# Problem: eclipse-temurin sets JAVA_HOME=/opt/java/openjdk. pyghidra's launcher +# runs LaunchSupport to discover the JDK home, then sets JAVA_HOME to that result +# before calling jpype.startJVM(None). LaunchSupport may return /opt/java/openjdk +# (the temurin path), which JPype cannot reliably use to locate libjvm.so. +# +# Fix: install the standard Ubuntu openjdk-21-jdk package (predictable path at +# /usr/lib/jvm/java-21-openjdk-amd64) and inject JAVA_HOME_OVERRIDE into Ghidra's +# support/launch.properties. pyghidra reads this file and uses the override path +# directly, bypassing LaunchSupport's JDK search entirely. +FROM eclipse-temurin:21-jdk-noble -RUN wget $GHIDRA_URL -O ghidra.zip && unzip ghidra.zip -d /opt/ && rm ghidra.zip +ARG GHIDRA_VERSION=12.0.4 +# Exported so that pyghidra and other tools spawned inside the container can +# locate the Ghidra installation without extra configuration. +ENV GHIDRA_INSTALL_DIR=/opt/ghidra_${GHIDRA_VERSION}_PUBLIC ENV PATH=${GHIDRA_INSTALL_DIR}:${PATH} -# ======================== Plugin Installation ============================== +# User creation. +# eclipse-temurin:21-jdk-noble already has a user at UID 1000 ("ubuntu"), +# so we use UID 1001 to avoid conflicts. +# Created here (before COPY) so --chown can reference it directly, avoiding +# a separate chown -R layer that would double Ghidra's disk footprint. +RUN useradd --create-home -u 1001 -m user -ARG BINEXPORT_URL=https://github.com/google/binexport/archive/refs/heads/main.zip -ARG GRADLE_VERSION=8.14.3 -ARG GRADLE_URL=https://services.gradle.org/distributions/gradle-${GRADLE_VERSION}-bin.zip -ARG GHIDRA_PLUGIN_DIR=/root/.config/ghidra/ghidra_${GHIDRA_VERSION}_PUBLIC/Extensions +# Copy only the extracted Ghidra tree from the download stage, owned by user +# from the start — no separate chown layer needed. 
+COPY --chown=user:user --from=ghidra-download /opt/ghidra_${GHIDRA_VERSION}_PUBLIC ${GHIDRA_INSTALL_DIR} -RUN wget ${GRADLE_URL} -O gradle.zip \ - && unzip gradle.zip -d gradle \ - && wget ${BINEXPORT_URL} -O binexport.zip \ - && mkdir -p ${GHIDRA_PLUGIN_DIR} \ - && unzip binexport.zip binexport-main/java/* -d binexport \ - && (cd binexport/binexport-main/java/ && /gradle/gradle-${GRADLE_VERSION}/bin/gradle buildExtension -PGHIDRA_INSTALL_DIR=${GHIDRA_INSTALL_DIR} && unzip dist/ghidra_${GHIDRA_VERSION}_PUBLIC_$( date +%Y%m%d)_BinExport.zip -d ${GHIDRA_PLUGIN_DIR}) \ - && rm -rf gradle.zip gradle binexport.zip binexport \ - && apt-get purge --yes wget unzip && apt --yes autoremove +# Install the minimal runtime dependencies for Ghidra headless + pyghidra and +# for building/installing the pyrrha-mapper Python package. +# - openjdk-21-jdk: provides a standard, well-known JDK path that both +# LaunchSupport and JPype handle correctly. Available in Ubuntu 24.04 (Noble). +# - Python 3.11 is installed via the deadsnakes PPA as Ubuntu 24.04 (Noble) +# ships 3.12 by default. +# All apt artefacts are removed in the same RUN layer to keep layer size down. +RUN apt-get update && \ + apt-get install --yes --no-install-recommends \ + openjdk-21-jdk \ + software-properties-common \ + && add-apt-repository ppa:deadsnakes/ppa && \ + apt-get update && \ + apt-get install --yes --no-install-recommends \ + libfreetype6 \ + libmagic1 \ + python3.11 \ + python3.11-venv \ + python3-pip \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* -# ======================== USER CREATION ============================== -ARG USER_GHIDRA_PLUGIN_DIR=/home/user/.config/ghidra/ghidra_${GHIDRA_VERSION}_PUBLIC/Extensions +# Override JAVA_HOME to the Ubuntu OpenJDK path so that both LaunchSupport and +# JPype use the same predictable JDK installation, overriding the temurin default +# of /opt/java/openjdk which JPype cannot reliably resolve libjvm.so from. 
+ENV JAVA_HOME=/usr/lib/jvm/java-21-openjdk-amd64 + +# Inject JAVA_HOME_OVERRIDE into Ghidra's launch.properties so pyghidra reads +# it directly and skips LaunchSupport's JDK search entirely. This is the +# supported mechanism (see pyghidra/launcher.py _jvm_args()). +RUN echo "JAVA_HOME_OVERRIDE=/usr/lib/jvm/java-21-openjdk-amd64" \ + >> "${GHIDRA_INSTALL_DIR}/support/launch.properties" + +# Make python3.11 the default python3 and python. +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \ + update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 -RUN useradd --create-home -u 1000 -m user && chown -R user:user $GHIDRA_INSTALL_DIR -RUN mkdir -p ${USER_GHIDRA_PLUGIN_DIR} && mv ${GHIDRA_PLUGIN_DIR}/* ${USER_GHIDRA_PLUGIN_DIR} && chown -R user:user /home/user/.config USER user WORKDIR /home/user +# Python virtual environment — activated automatically for both interactive +# shells (.bashrc) and non-interactive processes (ENV PATH). +ENV VIRTUAL_ENV=/home/user/.venv +ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" + +RUN python3.11 -m venv "${VIRTUAL_ENV}" && \ + pip install --no-cache-dir --upgrade pip && \ + echo "source ${VIRTUAL_ENV}/bin/activate" >> /home/user/.bashrc + CMD ["/bin/bash"] \ No newline at end of file diff --git a/ci/ghidra/build.sh b/ci/ghidra/build.sh new file mode 100755 index 0000000..7375489 --- /dev/null +++ b/ci/ghidra/build.sh @@ -0,0 +1,191 @@ +#!/usr/bin/env bash +# -*- coding: utf-8 -*- + +# Copyright 2023-2025 Quarkslab +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Build a Docker image with Ghidra and pyrrha-mapper. +# +# The Ghidra release is downloaded and verified at build time — no local +# installer file is required. The produced image is tagged NAME:VERSION. +# +# Usage: +# ./build.sh [OPTIONS] +# +# Options: +# -v, --version Ghidra version (default: 12.0.4). +# -d, --date Ghidra release date string (default: 20260303). +# -s, --sha256 Expected SHA-256 of the Ghidra zip (required when +# overriding --version, to ensure integrity). +# -n, --name Base image name (default: ghidra). +# Image tagged NAME:VERSION, also NAME:latest. +# -h, --help Print this help and exit. +# +# Examples: +# # Build with defaults: +# ./build.sh +# +# # Build a specific version: +# ./build.sh --version 12.0.4 --date 20260303 \ +# --sha256 c3b458661d69e26e203d739c0c82d143cc8a4a29d9e571f099c2cf4bda62a120 +# +# # Build under a custom image name: +# ./build.sh --name myorg/ghidra + +set -euo pipefail + +# ── Docker command resolution ───────────────────────────────────────────────── + +# Determine whether docker must be run via sudo. A plain `docker info` is +# attempted first; if it fails (e.g. the current user is not in the docker +# group), sudo is prepended for all subsequent docker calls. +if docker info > /dev/null 2>&1; then + DOCKER="docker" +elif sudo docker info > /dev/null 2>&1; then + DOCKER="sudo docker" +else + echo "ERROR: Cannot connect to the Docker daemon (tried both 'docker' and 'sudo docker')." >&2 + exit 1 +fi +readonly DOCKER + +# ── Constants ───────────────────────────────────────────────────────────────── + +readonly SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +readonly DOCKERFILE="${SCRIPT_DIR}/Dockerfile" +readonly IMAGE_NAME_DEFAULT="ghidra" + +# Default Ghidra release — update these when a new version is published. 
+readonly DEFAULT_VERSION="12.0.4"
+readonly DEFAULT_DATE="20260303"
+readonly DEFAULT_SHA256="c3b458661d69e26e203d739c0c82d143cc8a4a29d9e571f099c2cf4bda62a120"
+
+# ── Helpers ───────────────────────────────────────────────────────────────────
+
+# Print the help text on stdout and exit successfully (used by -h/--help).
+usage() {
+    cat <<EOF
+Usage: $(basename "$0") [OPTIONS]
+
+Build a Ghidra Docker image, tagged NAME:VERSION and also NAME:latest.
+
+Options:
+  -v, --version VERSION  Ghidra version (default: ${DEFAULT_VERSION}).
+  -d, --date DATE        Ghidra release date string (default: ${DEFAULT_DATE}).
+  -s, --sha256 HASH      Expected SHA-256 of the Ghidra zip.
+                         Required when overriding --version.
+                         Default: ${DEFAULT_SHA256}
+  -n, --name NAME        Base image name (default: ${IMAGE_NAME_DEFAULT}).
+                         Image tagged NAME:VERSION and NAME:latest.
+  -h, --help             Print this help and exit.
+
+Examples:
+  # Build with defaults:
+  $(basename "$0")
+
+  # Build a specific version:
+  $(basename "$0") --version 12.0.4 --date 20260303 \\
+      --sha256 c3b458661d69e26e203d739c0c82d143cc8a4a29d9e571f099c2cf4bda62a120
+
+  # Build under a custom image name:
+  $(basename "$0") --name myorg/ghidra
+EOF
+    exit 0
+}
+
+# Report an error followed by the help text on stderr, then exit with status 1.
+die() {
+    echo "ERROR: $*" >&2
+    echo >&2
+    # usage ends with 'exit 0'. Running it in a subshell confines that exit to
+    # the subshell, so the failure status below is actually reached.
+    (usage) >&2
+    exit 1
+}
+
+# ── Argument parsing ──────────────────────────────────────────────────────────
+
+ghidra_version="${DEFAULT_VERSION}"
+ghidra_date="${DEFAULT_DATE}"
+ghidra_sha256="${DEFAULT_SHA256}"
+image_name="${IMAGE_NAME_DEFAULT}"
+# Track explicit overrides so a custom --version without a matching --sha256
+# can be rejected after parsing.
+version_overridden=false
+sha256_overridden=false
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        -v|--version)
+            [[ -n "${2:-}" ]] || die "--version requires an argument."
+            ghidra_version="$2"
+            version_overridden=true
+            shift 2
+            ;;
+        -d|--date)
+            [[ -n "${2:-}" ]] || die "--date requires an argument."
+            ghidra_date="$2"
+            shift 2
+            ;;
+        -s|--sha256)
+            [[ -n "${2:-}" ]] || die "--sha256 requires an argument."
+            ghidra_sha256="$2"
+            sha256_overridden=true
+            shift 2
+            ;;
+        -n|--name)
+            [[ -n "${2:-}" ]] || die "--name requires an argument."
+ image_name="$2" + shift 2 + ;; + -h|--help) + usage + ;; + *) + die "Unknown option: $1" + ;; + esac +done + +# If the user overrode --version but not --sha256, the default SHA-256 is +# almost certainly wrong for a different version. +if [[ "${version_overridden}" == true && "${sha256_overridden}" == false ]]; then + die "You overrode --version but not --sha256. " \ + "Please provide the correct SHA-256 for Ghidra ${ghidra_version} via --sha256." +fi + +# ── Pre-flight checks ───────────────────────────────────────────────────────── + +[[ -f "${DOCKERFILE}" ]] || die "Dockerfile not found at: ${DOCKERFILE}" + +# ── Build ───────────────────────────────────────────────────────────────────── + +image_tag="${image_name}:${ghidra_version}" + +echo "==> Building ${image_tag}" +echo " Ghidra version : ${ghidra_version}" +echo " Release date : ${ghidra_date}" +echo " SHA-256 : ${ghidra_sha256}" + +${DOCKER} build \ + --build-arg "GHIDRA_VERSION=${ghidra_version}" \ + --build-arg "GHIDRA_RELEASE_DATE=${ghidra_date}" \ + --build-arg "GHIDRA_SHA256=${ghidra_sha256}" \ + --tag "${image_tag}" \ + --file "${DOCKERFILE}" \ + "${SCRIPT_DIR}" + +${DOCKER} tag "${image_tag}" "${image_name}:latest" + +echo "==> Successfully built ${image_tag}" +echo "==> Also tagged as ${image_name}:latest" +echo "==> Done." \ No newline at end of file diff --git a/ci/ida/.dockerignore b/ci/ida/.dockerignore new file mode 100644 index 0000000..bf9ed0a --- /dev/null +++ b/ci/ida/.dockerignore @@ -0,0 +1,6 @@ +# Exclude version-specific ida.reg backups; only the current ida.reg +# (copied by the build script before invoking docker build) is needed. +ida_*.reg + +# Exclude licence files — never baked into images, mounted at runtime only. +*.hexlic diff --git a/ci/ida/Dockerfile b/ci/ida/Dockerfile index 02087cb..ad8029a 100644 --- a/ci/ida/Dockerfile +++ b/ci/ida/Dockerfile @@ -14,73 +14,145 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-FROM docker.io/library/debian:testing-slim -SHELL ["/bin/bash", "-c"] - # ============= How to generate the required data =========================== -# paths can be changed from commandline if needed -# idapro.hexlic: license file, downloaded from your account on hex-rays website -# ida-pro_91.run: executable file, downloaded from your account on hex-rays website -# ida.reg: history file, to be generated manually. Keep in memory that the licence has alredy been accepted. -# 1. Build this docker with an empty ida.reg and launch it. -# 2. Launch idat and accept the license. -# 3. In another terminal, get the id of the current ida docker with `docker ps` -# 4. Run `docker cp ID:/root/.idapro/ida.reg ./` where ID is the id get at the -# previous step, you know have a correct ida.reg. -# 5. Rebuild your image with the correct ida.reg +# All sensitive files are passed exclusively via --mount=type=secret so they +# never appear in any image layer or in `docker history`. +# +# Required files (place them next to this Dockerfile): +# idapro.hexlic : license file, downloaded from your account on hex-rays website +# ida-pro_91.run: executable installer, downloaded from your account on hex-rays website +# +# Build command: +# docker build \ +# --build-arg IDA_VERSION=91 \ +# --build-arg IDA_INSTALLER=ida-pro_91.run \ +# -t pyrrha-ida . +# +# ida.reg is injected by Dockerfile.final after interactive EULA acceptance. +# See build_ida.sh for the full two-phase build procedure. +# +# Run command (licence is mounted at runtime, never stored in the image): +# docker run --rm \ +# --mount type=bind,dst=/home/user/.idapro/ida_license.hexlic,src="$(pwd)"/idapro.hexlic,ro \ +# pyrrha-ida +# # =========================================================================== # ======================== IDA Installation ================================= +# Contains build-only packages and the installer; none of this reaches the +# final image. 
+FROM docker.io/library/debian:testing-slim AS ida-install ARG IDA_VERSION=91 +ARG IDA_INSTALL_DIR=/opt/ida_${IDA_VERSION} + +# Path to the installer .run file on the build host, passed via --build-arg. +# It is bind-mounted into the build container at a fixed target path so that +# the ARG value (which Docker cannot expand inside --mount options) is only +# used inside the shell command where substitution works normally. ARG IDA_INSTALLER=ida-pro_${IDA_VERSION}.run -ENV IDA_INSTALL_DIR=/opt/ida_${IDA_VERSION} +# Build-time dependencies: +# - libxcb-xinerama0 : required by the .run installer's Qt bootstrap +# - libopengl0 : required by IDA's Qt layer for rendering +# - remaining libs : runtime deps also needed at install time RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \ - ca-certificates \ - ccache \ - cmake \ - g++ \ - gcc \ - git \ - libpython3-dev \ + libfontconfig1 \ + libmagic1 \ + libopengl0 \ libqt5gui5 \ + libsecret-1-0 \ + libxcb-xinerama0 \ + python3-minimal \ + python-is-python3 \ + && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* \ + && mkdir -p "${IDA_INSTALL_DIR}" ~/.local/share/applications/ + +# The installer is bind-mounted from the build context so the large .run +# binary is never written to any layer. Docker cannot expand ARGs inside +# --mount options, so src=. mounts the whole context and the ARG is used +# only inside the shell command where substitution works normally. 
+ARG IDA_INSTALLER +RUN --mount=type=bind,src=.,target=/build/context \ + cp "/build/context/${IDA_INSTALLER}" /tmp/ida_installer.run && \ + chmod +x /tmp/ida_installer.run && \ + DEBIAN_FRONTEND=noninteractive /tmp/ida_installer.run \ + --mode unattended \ + --prefix "${IDA_INSTALL_DIR}" && \ + rm /tmp/ida_installer.run && \ + # Strip unneeded files from the IDA tree to reduce COPY --from size: + # - documentation + # - desktop integration files + # - uninstaller + rm -rf \ + "${IDA_INSTALL_DIR}/README_python3.txt" \ + "${IDA_INSTALL_DIR}/Uninstall IDA"* \ + "${IDA_INSTALL_DIR}/uninstall"* \ + "${IDA_INSTALL_DIR}/appico.png" \ + "${IDA_INSTALL_DIR}/hvui.png" + +# ======================== IDA Runtime image ================================= +# Only the IDA tree and the runtime shared libraries are present here. +# No installer, no build tooling. +# ida.reg is NOT present at this stage — it is injected by Dockerfile.final +# after the user has accepted the EULA interactively (see build.sh). +FROM docker.io/library/debian:testing-slim + +ARG IDA_VERSION=91 + +# Exported so that qbinary/idascript and other tools spawned inside the +# container can locate the IDA installation without extra configuration. +ENV IDA_INSTALL_DIR=/opt/ida_${IDA_VERSION} +ENV PATH=${IDA_INSTALL_DIR}:${PATH} +# IDA looks for the licence at the path provided by this variable. +# At runtime, bind-mount the licence read-only at that exact path (docker run has no type=secret mount): +# docker run --mount type=bind,src=/abs/path/idapro.hexlic,dst=/run/secrets/ida_license,ro ... +ENV HEXRAYS_LICENSE=/run/secrets/ida_license +# IDADIR is required by ida_domain to locate the IDA installation at runtime. +ENV IDADIR=${IDA_INSTALL_DIR} + +# Runtime-only shared libraries required by IDA and its Qt layer. +# apt-get clean and list removal are in the same layer to avoid bloat. 
+RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \ libfontconfig1 \ libmagic1 \ + libopengl0 \ + libpython3.13 \ + libqt5gui5 \ libsecret-1-0 \ - make \ - ninja-build \ + libxcb-cursor0 \ python3-minimal \ - python3-pip \ python3-venv \ python-is-python3 \ - unzip \ - xcb-proto \ - wget \ - zlib1g-dev \ - && mkdir -p $IDA_INSTALL_DIR ~/.local/share/applications/ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* -RUN --mount=type=bind,src=${IDA_INSTALLER},target=${IDA_INSTALLER} DEBIAN_FRONTEND=noninteractive apt-get install --yes --reinstall libxcb-xinerama0 && \ - ./${IDA_INSTALLER} --mode unattended --prefix ${IDA_INSTALL_DIR} +# Copy only the installed IDA tree from the build stage. +COPY --from=ida-install ${IDA_INSTALL_DIR} ${IDA_INSTALL_DIR} -ENV PATH=${IDA_INSTALL_DIR}:${PATH} +# ── User creation ───────────────────────────────────────────────────────────── +RUN useradd --create-home -u 1000 -m user && \ + chown -R user:user "${IDA_INSTALL_DIR}" -# ======================== Plugin Installation ============================== - -ARG QUOKKA_VERSION=v0.6.1 -ARG QUOKKA_URL=https://github.com/quarkslab/quokka/releases/download/${QUOKKA_VERSION}/${IDA_VERSION}-quokka_plugin0064.so -ARG BINEXPORT_URL=https://github.com/google/binexport/releases/download/v12-20240417-ghidra_11.0.3/BinExport-Linux.zip -RUN if [[ ${IDA_VERSION} -eq 84 ]]; then \ - wget ${QUOKKA_URL} -O ${IDA_INSTALL_DIR}/plugins/quokka64.so \ - && wget ${BINEXPORT_URL} -O binexport.zip \ - && unzip -j binexport.zip ida/binexport12_ida.so ida/binexport12_ida64.so -d ${IDA_INSTALL_DIR}/plugins/ \ - && rm -f binexport.zip ; \ - else wget ${QUOKKA_URL} -O ${IDA_INSTALL_DIR}/plugins/quokka.so ; fi \ - && apt-get purge --yes wget \ - && rm -rf /var/lib/apt/lists/* - -RUN useradd --create-home -u 1000 -m user && chown -R user:user $IDA_INSTALL_DIR USER user -RUN $IDA_INSTALL_DIR/idapyswitch -a WORKDIR /home/user + +# 
virtualenv (automatically activated at runtime for both interactive and +# non-interactive sessions via ENV PATH and .bashrc respectively) +ENV VIRTUAL_ENV=/home/user/.venv +ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" +RUN python -m venv "${VIRTUAL_ENV}" && \ + echo "source ${VIRTUAL_ENV}/bin/activate" >> /home/user/.bashrc + +# Register the venv's Python interpreter with IDA so that idascript and +# qbinary can drive it programmatically. +# idapyswitch -a (auto) relies on python3-config which is not installed; +# instead point it directly to the system libpython via --force-path. +# libpython3.13 provides the shared library on Debian testing. +RUN libpython="$(find /usr /lib -name 'libpython3*.so*' 2>/dev/null | head -1)" && \ + [ -n "${libpython}" ] || { echo "ERROR: libpython not found"; exit 1; } && \ + "${IDA_INSTALL_DIR}/idapyswitch" --force-path "${libpython}" + +CMD ["/bin/bash"] diff --git a/ci/ida/Dockerfile.final b/ci/ida/Dockerfile.final new file mode 100644 index 0000000..82e9d1d --- /dev/null +++ b/ci/ida/Dockerfile.final @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023-2025 Quarkslab +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Phase 2 Dockerfile: extends the setup image produced by Dockerfile by +# injecting the ida.reg file generated during the interactive phase 1 run. +# Invoked by build_ida.sh after ida.reg has been extracted from the container. 
+ +ARG IMAGE_NAME=pyrrha-ida +ARG IDA_VERSION=91 + +FROM ${IMAGE_NAME}:${IDA_VERSION}-setup + +# Cache-busting ARG: set to the md5 hash of ida.reg by the build script so +# Docker does not reuse a cached layer for the COPY below. +ARG IDA_REG_HASH + +# Copy ida.reg generated during the interactive phase 1 run. +# The build script copies it into the build context as ida.reg before +# invoking docker build, so it is available as a plain build context file. +COPY --chown=user:user ida.reg /home/user/.idapro/ida.reg diff --git a/ci/ida/build.sh b/ci/ida/build.sh new file mode 100755 index 0000000..a062c2a --- /dev/null +++ b/ci/ida/build.sh @@ -0,0 +1,291 @@ +#!/usr/bin/env bash +# -*- coding: utf-8 -*- + +# Copyright 2023-2025 Quarkslab +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Build one Docker image per requested IDA version. Each version is built from +# its own installer file (ida-pro_.run) located next to this script. +# Produced images are tagged :; the numerically greatest version is +# additionally tagged :latest. +# +# IDA 9.x requires interactive EULA acceptance before it writes ida.reg. +# The build therefore proceeds in two phases per version: +# Phase 1 — build a setup image without ida.reg, run it interactively so +# the user can accept the EULA, then extract the resulting ida.reg +# from the stopped container. Skipped if ida_.reg already +# exists on disk from a prior run. 
+# Phase 2 — build the final image via Dockerfile.final, which extends the +# setup image and injects ida.reg. +# +# The installer is passed via bind-mount (no size limit, never committed to any +# layer). It must be located next to this script as ida-pro_<version>.run. +# The licence file (idapro.hexlic) is NEVER baked into any image layer. +# Pass it at runtime via a Docker bind mount: +# docker run --mount type=bind,dst=/home/user/.idapro/ida_license.hexlic,src=idapro.hexlic,ro <name>:<version> + +set -euo pipefail + +# ── Docker command resolution ───────────────────────────────────────────────── + +# Determine whether docker must be run via sudo. A plain `docker info` is +# attempted first; if it fails (e.g. the current user is not in the docker +# group), sudo is prepended for all subsequent docker calls. +if docker info > /dev/null 2>&1; then + DOCKER="docker" +elif sudo docker info > /dev/null 2>&1; then + DOCKER="sudo docker" +else + echo "ERROR: Cannot connect to the Docker daemon (tried both 'docker' and 'sudo docker')." >&2 + exit 1 +fi +readonly DOCKER + +# ── Constants ───────────────────────────────────────────────────────────────── + +readonly SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +readonly DOCKERFILE="${SCRIPT_DIR}/Dockerfile" +readonly DOCKERFILE_FINAL="${SCRIPT_DIR}/Dockerfile.final" +readonly DEFAULT_LICENSE="${SCRIPT_DIR}/idapro.hexlic" +readonly IMAGE_NAME_DEFAULT="ida" + +# ── Helpers ─────────────────────────────────────────────────────────────────── + +usage() { + cat <<EOF +Usage: $(basename "$0") --version <version> [--version <version> ...] [OPTIONS] + +Build one Docker image per requested IDA version. Each version is built from +its own installer file (ida-pro_<version>.run) located next to this script. +Produced images are tagged <name>:<version>; the numerically greatest version is +additionally tagged <name>:latest. + +IDA 9.x requires interactive EULA acceptance on first launch.
The build runs +in two phases: phase 1 launches IDA so you can accept the EULA and extracts +the resulting ida.reg; phase 2 builds the final image with ida.reg injected. +Phase 1 is skipped if ida_.reg already exists from a prior run. + +The licence file (idapro.hexlic) is NEVER baked into any image layer. +Pass it at runtime via a Docker secret: + docker run --mount type=secret,id=ida_license,src=idapro.hexlic : + +The installer is passed via bind-mount and is never committed to any layer. + +Options: + -v, --version IDA version number (e.g. 91). Repeatable. + Installer resolved as ./ida-pro_.run. + -n, --name Base image name (default: pyrrha-ida). + Images tagged :, newest also :latest. + -l, --license Path to idapro.hexlic (default: ./idapro.hexlic). + Validated at startup, never passed to docker build. + -h, --help Print this help and exit. + +Examples: + # Build a single version with defaults: + $(basename "$0") --version 91 + + # Build two versions under a custom image name: + $(basename "$0") --version 91 --version 92 --name myorg/ida + + # Build with a licence file stored elsewhere: + $(basename "$0") --version 91 --license /secure/idapro.hexlic +EOF + exit 0 +} + +die() { + echo "ERROR: $*" >&2 + echo >&2 + usage + exit 1 +} + +# ── Argument parsing ────────────────────────────────────────────────────────── + +versions=() +image_name="${IMAGE_NAME_DEFAULT}" +license_path="${DEFAULT_LICENSE}" + +while [[ $# -gt 0 ]]; do + case "$1" in + -v|--version) + [[ -n "${2:-}" ]] || die "--version requires an argument." + versions+=("$2") + shift 2 + ;; + -n|--name) + [[ -n "${2:-}" ]] || die "--name requires an argument." + image_name="$2" + shift 2 + ;; + -l|--license) + [[ -n "${2:-}" ]] || die "--license requires an argument." + license_path="$2" + shift 2 + ;; + -h|--help) + usage + ;; + *) + die "Unknown option: $1" + ;; + esac +done + +[[ ${#versions[@]} -gt 0 ]] || die "At least one --version is required." 
+ +# ── Pre-flight checks ───────────────────────────────────────────────────────── + +[[ -f "${DOCKERFILE}" ]] || die "Dockerfile not found at: ${DOCKERFILE}" +[[ -f "${DOCKERFILE_FINAL}" ]] || die "Dockerfile.final not found at: ${DOCKERFILE_FINAL}" +[[ -f "${license_path}" ]] || die "Licence file not found at: ${license_path}" + +# ── Build loop ──────────────────────────────────────────────────────────────── + +# The latest version is the numerically greatest one; it will also be tagged +# as :latest after its build. +latest_version="$(printf '%s\n' "${versions[@]}" | sort -n | tail -1)" + +for version in "${versions[@]}"; do + # Each version has its own installer: ida-pro_.run + installer_path="${SCRIPT_DIR}/ida-pro_${version}.run" + + [[ -f "${installer_path}" ]] || \ + die "Installer not found for version ${version}: ${installer_path}" + + # IDA_INSTALLER is a plain filename relative to the build context so that + # Docker bind-mounting the context can resolve it without path doubling. + installer_filename="$(basename "${installer_path}")" + + image_tag="${image_name}:${version}" + tmp_image="${image_name}:${version}-setup" + tmp_container="ida-setup-${version}" + ida_reg_path="${SCRIPT_DIR}/ida_${version}.reg" + + echo "==> Building ${image_tag}" + echo " Installer : ${installer_path}" + echo " Licence : ${license_path}" + echo " ida.reg : ${ida_reg_path} (exists: $([ -f "${ida_reg_path}" ] && echo yes || echo no))" + + # ── Phase 1: build setup image and extract ida.reg ──────────────────── + # IDA 9.x requires interactive EULA acceptance before writing ida.reg. + # Skipped if ida_.reg already exists from a prior run. + if [[ ! -f "${ida_reg_path}" ]]; then + echo "==> [Phase 1] ida.reg not found at ${ida_reg_path}, running setup..." + + echo "==> [Phase 1] Building setup image ${tmp_image}..." 
+ ${DOCKER} build \ + --build-arg "IDA_VERSION=${version}" \ + --build-arg "IDA_INSTALLER=${installer_filename}" \ + --tag "${tmp_image}" \ + --file "${DOCKERFILE}" \ + "${SCRIPT_DIR}" + + echo "==> [Phase 1] Starting temporary container." + echo " Accept the IDA EULA when prompted, then close IDA." + echo " The container will stop automatically afterwards." + + # Remove any leftover container from a previous failed attempt. + ${DOCKER} rm "${tmp_container}" 2>/dev/null || true + + # Find the IDA GUI binary. In IDA 9.x it is simply 'ida' (no suffix); + # older versions used 'ida64'. We match exactly those two names. + ida_binary="$(${DOCKER} run --rm "${tmp_image}" \ + find "/opt/ida_${version}" -maxdepth 1 -type f -executable \ + \( -name 'ida' -o -name 'ida64' \) | head -1)" + [[ -n "${ida_binary}" ]] || \ + die "Could not find IDA GUI binary (ida or ida64) in /opt/ida_${version}." + echo " IDA binary: ${ida_binary}" + + # Allow the root-owned Docker container to connect to the user's X + # display. Revoked immediately after the container stops. + xhost +local:root + + # Do NOT use --rm: we need the stopped container's filesystem to + # extract ida.reg after the user has accepted the EULA and closed IDA. + # Note: docker run does not support --mount type=secret (build-only); + # the licence is passed as a read-only bind mount instead. + # '|| true' prevents set -e from aborting the script when IDA exits + # with a non-zero code (which it does on normal close). + ${DOCKER} run --name "${tmp_container}" \ + -v "${license_path}:/run/secrets/ida_license:ro" \ + -e DISPLAY="${DISPLAY:-:0}" \ + -v /tmp/.X11-unix:/tmp/.X11-unix \ + "${tmp_image}" \ + "${ida_binary}" || true + + # Revoke the X display permission as soon as the container exits. + xhost -local:root + + echo "==> [Phase 1] Container stopped. Extracting ida.reg..." 
+ ${DOCKER} cp "${tmp_container}:/home/user/.idapro/ida.reg" "${ida_reg_path}" || \ + die "docker cp failed — ida.reg not found in container '${tmp_container}'." \ + "Make sure you accepted the EULA and closed IDA before the container exited." + + [[ -f "${ida_reg_path}" ]] || \ + die "ida.reg was not saved to ${ida_reg_path} after docker cp." + + ${DOCKER} rm "${tmp_container}" + ${DOCKER} rmi "${tmp_image}" 2>/dev/null || true + echo "==> [Phase 1] ida.reg saved to: ${ida_reg_path}" + else + echo "==> [Phase 1] Skipped — ida.reg already exists at: ${ida_reg_path}" + fi + + # ── Phase 2: build the final image with ida.reg injected ───────────── + # Uses Dockerfile.final which extends the setup image and only adds + # ida.reg, avoiding the COPY-in-wrong-stage problem of a single Dockerfile. + echo "==> [Phase 2] Building final image ${image_tag}..." + + cp "${ida_reg_path}" "${SCRIPT_DIR}/ida.reg" || \ + die "Failed to copy ${ida_reg_path} into build context." + [[ -f "${SCRIPT_DIR}/ida.reg" ]] || \ + die "ida.reg is missing from build context (${SCRIPT_DIR}/ida.reg)." + + # Pass the md5 hash of ida.reg as a build arg to bust the Docker cache at + # the COPY instruction, preventing reuse of a layer built before the file + # existed. + ida_reg_hash="$(md5sum "${SCRIPT_DIR}/ida.reg" | cut -d' ' -f1)" + + # Phase 2 needs the setup image as its base; build it if it was cleaned up. + if ! ${DOCKER} image inspect "${tmp_image}" > /dev/null 2>&1; then + echo "==> [Phase 2] Setup image not found, rebuilding ${tmp_image}..." 
+ ${DOCKER} build \ + --build-arg "IDA_VERSION=${version}" \ + --build-arg "IDA_INSTALLER=${installer_filename}" \ + --tag "${tmp_image}" \ + --file "${DOCKERFILE}" \ + "${SCRIPT_DIR}" + fi + ${DOCKER} build \ + --build-arg "IMAGE_NAME=${image_name}" \ + --build-arg "IDA_VERSION=${version}" \ + --build-arg "IDA_REG_HASH=${ida_reg_hash}" \ + --tag "${image_tag}" \ + --file "${DOCKERFILE_FINAL}" \ + "${SCRIPT_DIR}" + + rm -f "${SCRIPT_DIR}/ida.reg" + + # Tag the newest version as :latest. + if [[ "${version}" == "${latest_version}" ]]; then + ${DOCKER} tag "${image_tag}" "${image_name}:latest" + echo "==> Also tagged ${image_tag} as ${image_name}:latest" + fi + + echo "==> Successfully built ${image_tag}" +done + +echo "==> Done." diff --git a/ci/pyrrha/Dockerfile b/ci/pyrrha/Dockerfile new file mode 100644 index 0000000..4788010 --- /dev/null +++ b/ci/pyrrha/Dockerfile @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023-2025 Quarkslab +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ----------------------------------------------------------------------------- +# Pyrrha image +# +# Layered on top of an image which contained one of the supported disassembler. 
+# The image should provide: +# * the disassembler already installed and on PATH (or correct env variable set) +# * a non-root "user" (uid 1000) +# * a Python >= 3.11 venv already on PATH +# +# The base image tag is parameterised so callers can reuse a pre-built image or +# have it built on-demand by scripts/build-docker.sh. +# +# Manual usage: +# docker build -t pyrrha-ghidra:latest ci/ghidra +# docker build -t pyrrha-ida:latest \ +# --build-arg DISASS_IMAGE=ida --build-arg DISASS_IMAGE_VERSION=latest . +# ----------------------------------------------------------------------------- + +ARG GHIDRA_IMAGE=pyrrha-ghidra:latest + +ARG DISASS_IMAGE=ghidra +ARG DISASS_IMAGE_VERSION=latest + +FROM ${DISASS_IMAGE}:${DISASS_IMAGE_VERSION} +SHELL ["/bin/bash", "-c"] + +ENV DISASS_IMAGE_USER=user +ENV PYRRHA_INSTALL_DIR=/tmp/pyrrha_install +ENV PYRRHA_WORKING_DIR=/tmp/pyrrha + +USER ${DISASS_IMAGE_USER} +WORKDIR ${PYRRHA_INSTALL_DIR} + +RUN python3 -m pip install --no-cache-dir -U pip + +COPY --chown=${DISASS_IMAGE_USER}:${DISASS_IMAGE_USER} src src/ +COPY --chown=${DISASS_IMAGE_USER}:${DISASS_IMAGE_USER} pyproject.toml ./ +COPY --chown=${DISASS_IMAGE_USER}:${DISASS_IMAGE_USER} README.md ./ + + +# Overwrite the PyPI install from the base image with the local working copy. +# --force-reinstall guarantees we replace the base layer's version even when +# the local pyproject.toml reports an identical version number. +RUN pip install --no-cache-dir --force-reinstall . && \ + rm -rf ${PYRRHA_INSTALL_DIR} + +WORKDIR ${PYRRHA_WORKING_DIR} + +CMD ["pyrrha"] diff --git a/docs/contributing/dev_mapper.md b/docs/contributing/dev_mapper.md index f46fd8f..2fd1054 100644 --- a/docs/contributing/dev_mapper.md +++ b/docs/contributing/dev_mapper.md @@ -6,7 +6,7 @@ First develop your mapper. We are using `numbat` to manipulate the db used by so Then, add the required dependencies into `pyproject.toml`. 
## Integration into the main program -Once the mapper is ready, it should be integrated into `pyrrha` CLI by adding the corresponding subcommand in the `src/pyrrha_mapper/__main__.py`. The CLI system is handled with [click](https://click.palletsprojects.com) +Once the mapper is ready, it should be integrated into `pyrrha` CLI by adding the corresponding subcommand in the `src/pyrrha_mapper/__main__.py`. The CLI system is handled with [click](https://click.palletsprojects.com). The subcommand corresponds to a function implementing the main of your mapper and some decorators to declare the subcommand name, its options and its arguments. @@ -100,7 +100,7 @@ Finally, you should add a page relative to your mapper inside the documentation. 1. Write your documentation in a markdown file that should be place into the `docs/mappers` folder. !!! tip - We are using `material` theme of the `mkdocs` doc system. It provides a lot of nice features to improve your documentation like this note block. Do not hesitate to take a look at their [documentation](https://squidfunk.github.io/mkdocs-material/reference/)! + We are using `materialx` theme of the `mkdocs` doc system. It provides a lot of nice features to improve your documentation like this note block. Do not hesitate to take a look at their [documentation](https://jaywhj.github.io/mkdocs-materialx/)! 2. Add your mapper in mapper lists (in `README.md` and in `docs/mappers/mappers.md`). 3. Complete the `nav` section in the `mkdocs.yml` file to add your file in the site navigation system. diff --git a/docs/disassembler.md b/docs/disassembler.md deleted file mode 100644 index 6b0046d..0000000 --- a/docs/disassembler.md +++ /dev/null @@ -1,143 +0,0 @@ -# Disassembler Integration - -Some pyrrha mappers and especially the `exe-decomp` enables jumping in a disassembler from the UI -by right-clicking on a function and selecting "Open in disassembler". 
Executing arbitrary command -is made available by the [Numbat feature](https://quarkslab.github.io/numbat/customization/) and -requires opening a Sourcetrail DB with ``NumbatUI``. - -The link between Numbat and a disassembler is made by implementing custom URL protocol handlers. As such, -clicking "Open in disassembler" will trigger a command like: - -```bash -xdg-open 'disas://e62f747cf47383858bd563febb813e20?idb=inadyn.i64&offset=0x0124c8' -``` - -On Linux `xdg-open` will open the URL with the default application associated with the `disas` protocol. -For windows and MacOS, application opened are respectively `start` and `open`. For it to work, -we need to register a custom URL handler for the `disas` protocol. This is done by using [heimdallr](https://github.com/interruptlabs/heimdallr-client) developped by [Interrupt Labs](https://interruptlabs.com/). - - -## Heimdallr - -Heimdallr is a custom URL handler that allows you to open a disassembler from the UI. Developpres provides -an [IDA plugin](https://github.com/interruptlabs/heimdallr-ida) to support it and some folks added a [Ghidra -support](https://github.com/foundryzero/ghidra-deep-links). It works by running a gRPC server in the disassembler -that will listen for incoming requests. The image below summarizes the workflow on Linux: - -![](img/heimdallr.svg) - -As shown on the image the Linux system handles URL handlers with `.desktop` files that needs to be registered. -The handler will call `heimdallr_client` utility that is in charge of identifying running gRPC servers to send -the query to a running disassembler or to start it. - -## Installation - -Heimdallr is fairly unmaintained and undocumented. Still, it works rather well. In order to get it working -one need to perform the following steps: - -1. Install `heimdallr-ida` plugin in IDA -2. Install `heimdallr-client` "globally" so that it is reachable by the URL handler dispatcher -3. Configure a `settings.json` file to specify disassembler path etc. 
-4. Create and register a `.desktop` file to handle the `disas://` protocol. - -**1-heimdallr-ida**: The plugin is available on the [Github page](https://github.com/interruptlabs/heimdallr-ida). -The README.md provides installation steps. The ``install()`` command will automatically copy files in the IDA Pro -directory and creates a default `settings.json` file in `$HOME/.config/heimdallr/settings.json`. - -!!! tip - The install command might be a bit buggy, so it is recommended to install the plugin manually by copying the - files in IDA. - -**2-heimdallr-client**: The client is available on the [Github page](https://github.com/interruptlabs/heimdallr-client). -It can be installed with `pip`: - -```bash -pip3 install git+https://git@github.com/interruptlabs/heimdallr-client.git#egg=heimdallr_client -``` - -!!! note - It should be installed globally so that it is reachable by the URL handler dispatcher. Thus it is recommended - to install it with `--user`. - -**3-Configuring settings**: The `$HOME/.config/heimdallr` will contain all files used by `heimdallr` to locate -running RPC server instances in order to send them requests. The file `settings.json` is used to configure -the disassembler path and paths where to look for binaries. Thus configure carefully your IDA path inside. - -```json -{ - "ida_location": "/my/path/to/ida", - "idb_path": [ - ], - "heimdallr_client": "heimdallr_client" -} -``` - -!!! note - The IDA location binary provided should be a non-blocking IDA or bash script, as `heimdallr-client` - will run it with `subprocess.run` and wait for it before sending the request. - - -**4-Creating protocol handler**: The `.desktop` file is used to register the `disas://` protocol handler. -On Linux, it is usually located in `~/.local/share/applications/`. 
Creates a file in this directory with -the following content: - -???+ "`heimdallr.desktop`" - ```ini - [Desktop Entry] - Name=Heimdallr-handler - Comment=Disas URL handler - GenericName=heimdallr-handler-generic - Exec=heimdallr_client %u - Type=Application - StartupNotify=true - Categories=GNOME;GTK;Utility; - MimeType=x-scheme-handler/disas; - ``` - -Then you need to update the associated `mimeinfo.cache` file with: - -```bash -update-desktop-database ~/.local/share/applications -``` -This will allow you to handle URLs with the `disas://` scheme. -*It shall add the line: `x-scheme-handler/disas=heimdallr.desktop` in the file.* - - -## Testing - -You can test that URL are properly resolved by running: - -```bash -xdg-mime query default x-scheme-handler/disas -``` -This should return `heimdallr.desktop`. Then you can try opening a binary with: - -```bash -xdg-open 'disas://e62f747cf47383858bd563febb813e20?idb=inadyn.i64&offset=0x0124c8' -``` - -Where you provide the MD5 hash of the binary, its DB name and the offset to jump to. -By default, heimdallr look in the IDA Pro history to locate the idb. Otherwise, it search -for directories referenced in the "idb_path" field of the `settings.json` file. - - - -## Usage in Pyrrha mappers - -Pyrrha uses `heimdallr` to resolve binaries location and offsets. Thus when working -on a specific firmware you might need to specify its root directory in the `ida_path` -of the `settings.json` file. Pyrrha provides an utility command to list, add and remove -entries in this file. 
- -```bash -pyrrha workspace-utils --list # list all entries -``` - -```bash -pyrrha workspace-utils --add /path/to/firmware/rootfs # add directory in search path -``` - -```bash -pyrrha workspace-utils --delete /path/to/firmware/rootfs # remove directory from search path -``` - diff --git a/docs/index.md b/docs/index.md index c6743eb..07e3913 100644 --- a/docs/index.md +++ b/docs/index.md @@ -8,89 +8,110 @@ ## Installation -The installation is done in three parts: + -1. Install mapper external dependencies: IDA dissassembler (with the decompilation option for the `exe-decomp` mapper) and [`Quokka` IDA plugin](https://github.com/quarkslab/quokka/releases). -1. Install `Pyrrha` itself. -1. Install [`NumbatUI`](https://github.com/quarkslab/NumbatUI) (or [`Sourcetrail`](https://github.com/CoatiSoftware/Sourcetrail)) to be able to visualize Pyrrha's results. - -!!! example "Quick Start" +??? code "Install Visualisation Tool" === "Sourcetrail" - 1. Install Quokka plugin by downloaded the appropriate version from its [release](https://github.com/quarkslab/quokka/releases) page. Then follow the instructions according to your OS. - - 2. Install Sourcetrail and Pyrrha. 
- - === "Linux" - ```bash - SOURCETRAIL_URL='https://github.com/CoatiSoftware/Sourcetrail/releases/download/2021.4.19/Sourcetrail_2021_4_19_Linux_64bit.tar.gz' - CHECKSUM=""f65a401daad8e16f29f7b2ff062a559999b6a8d44606db36cdf803de0cd7816d - EXTRACTION_DIR="/tmp/Sourcetrail_2021_4_19_Linux_64bit" - DOWNLOAD_PATH="$EXTRACTION_PATH.tar.gz" + === "Linux" + ```bash + SOURCETRAIL_URL='https://github.com/CoatiSoftware/Sourcetrail/releases/download/2021.4.19/Sourcetrail_2021_4_19_Linux_64bit.tar.gz' + CHECKSUM="f65a401daad8e16f29f7b2ff062a559999b6a8d44606db36cdf803de0cd7816d" + EXTRACTION_DIR="/tmp/Sourcetrail_2021_4_19_Linux_64bit" + DOWNLOAD_PATH="$EXTRACTION_DIR.tar.gz" - wget $SOURCETRAIL_URL -O $DOWNLOAD_PATH - echo $CHECKSUM $DOWNLOAD_PATH | sha256sum -c + wget $SOURCETRAIL_URL -O $DOWNLOAD_PATH + echo $CHECKSUM $DOWNLOAD_PATH | sha256sum -c - if [ $? == 0 ]; then - echo '==== Install Sourcetrail' - tar xf $DOWNLOAD_PATH -C $EXTRACTION_DIR - sudo $EXTRACTION_DIR/Sourcetrail/install.sh - rm -rf $DOWNLOAD_PATH $EXTRACTION_DIR - fi + if [ $? == 0 ]; then + echo '==== Install Sourcetrail' + tar xf $DOWNLOAD_PATH -C $EXTRACTION_DIR + sudo $EXTRACTION_DIR/Sourcetrail/install.sh + rm -rf $DOWNLOAD_PATH $EXTRACTION_DIR + fi + ``` + === "Windows" - # Install pyrrha - if [ $? == 0 ]; then - echo '==== Install Pyrrha' - pip install pyrrha-mapper - fi - ``` - === "Windows" + Download last Sourcetrail [release](https://github.com/CoatiSoftware/Sourcetrail/releases), unzip it and run the `setup.exe`. - 1. Download last Sourcetrail [release](https://github.com/CoatiSoftware/Sourcetrail/releases), unzip it and run the `setup.exe`. - 2. Install pyrrha: `pip install pyrrha-mapper` + === "MacOS" - === "MacOS" - - 1. Download last Sourcetrail [release](https://github.com/CoatiSoftware/Sourcetrail/releases), and install it following [Sourcetrail documentation](https://github.com/CoatiSoftware/Sourcetrail/releases). - 2. 
Install pyrrha: `pip install pyrrha-mapper` + Download the latest Sourcetrail [release](https://github.com/CoatiSoftware/Sourcetrail/releases), and install it following the [Sourcetrail documentation](https://github.com/CoatiSoftware/Sourcetrail/releases). === "NumbatUI (Ubuntu/Debian)" - _Tested only for last Ubuntu/Debian._ - - First install Quokka plugin by downloaded the appropriate version from its [release](https://github.com/quarkslab/quokka/releases) page. + _Tested only on the latest Ubuntu/Debian releases._ + + Run the following script to clone and build `NumbatUI`. The `NumbatUI` executable will be in `numbatui/build/Release/app`. + + ```sh + # Prerequisites for Numbat UI + sudo apt-get update + sudo apt-get install -y \ + cmake \ + git \ + build-essential \ + libboost-filesystem-dev libboost-program-options-dev libboost-system-dev libboost-date-time-dev \ + qt6-svg-dev qt6-base-dev qt6-5compat-dev \ + unzip wget \ + libclang-17-dev clang-17 + + # Clone and Build NumbatUI + git clone https://github.com/quarkslab/NumbatUI.git numbatui + cd numbatui + mkdir -p build/Release + cd build/Release + cmake -DCMAKE_BUILD_TYPE="Release" -DBUILD_CXX_LANGUAGE_PACKAGE=ON -DBUILD_PYTHON_LANGUAGE_PACKAGE=ON ../.. && make NumbatUI -j $(nproc) + ``` + +!!! code "Install Pyrrha" + === ":fontawesome-brands-python: Python Package" + Requires a local installation of **IDA Pro 9.1+** and/or **Ghidra 12.0+**, except for the `fs` mapper. + ```sh + # in a virtualenv + pip install pyrrha-mapper + ``` + === ":fontawesome-brands-docker: Docker Image" + Download the docker image from the GitHub Registry; this image is backed by Ghidra. + + ```sh + docker pull ghcr.io/quarkslab/pyrrha:latest + ``` - Then run the following script that will clone and build `NumbatUI` and install `Pyrrha`. `NumbatUI` will in `numbatui/build/Release/app`. 
- - ``` - # Prerequisites for Numbat UI - sudo apt-get update - sudo apt-get install -y \ - cmake \ - git \ - build-essential \ - libboost-filesystem-dev libboost-program-options-dev libboost-system-dev libboost-date-time-dev \ - qt6-svg-dev qt6-base-dev qt6-5compat-dev \ - unzip wget \ - libclang-17-dev clang-17 - - # Clone and Build NumbatUI - git clone https://github.com/quarkslab/NumbatUI.git numbatui - cd numbatui - mkdir -p build/Release - cd build/Release - cmake -DCMAKE_BUILD_TYPE="Release" -DBUILD_CXX_LANGUAGE_PACKAGE=ON -DBUILD_PYTHON_LANGUAGE_PACKAGE=ON ../.. && make NumbatUI -j $(nproc) - - # Install pyrrha - pip install pyrrha-mapper - ``` !!! note Detailed instructions can be found on the [dedicated documentation page](installation.md). --8<-- "README.md:usage" + +!!! code "Run Pyrrha" + === ":fontawesome-brands-python: Python Package" + If your backend is not on `PATH`, indicate its directory using the matching environment variable. + ```sh + export IDADIR=/opt/idapro + export GHIDRA_INSTALL_DIR=/opt/ghidra_12.0.4_PUBLIC + ``` + Run **Pyrrha** to obtain NumbatUI/Sourcetrail-compatible files. + ``` + pyrrha MAPPER [OPTIONS] ROOT_DIRECTORY + ``` + + === ":fontawesome-brands-docker: Docker Image" + Run the docker image (backed by Ghidra) from the parent directory of `ROOT_DIRECTORY`, which is mounted into the container. + + ```sh + cd ROOT_DIRECTORY/.. + docker run --rm -t -v $PWD:/tmp/pyrrha ghcr.io/quarkslab/pyrrha:latest MAPPER [OPTIONS] ROOT_DIRECTORY + ``` + +!!! code "Visualize results" + You should have a `*.srctrlprj` file corresponding to the project file and a `*.srctrldb` file for the DB. + Run `NumbatUI` or `Sourcetrail` on the project file. You can now navigate through the results. + + The user interface is described in depth in the [NumbatUI documentation](https://github.com/quarkslab/NumbatUI/blob/main/DOCUMENTATION.md#user-interface). 
+ Do not hesitate to take a look at all the possibilities offered by NumbatUI, especially [Custom Trails](https://github.com/quarkslab/NumbatUI/blob/main/DOCUMENTATION.md#custom-trail-dialog). !!! note The detailed documentation of each mapper is available in the [documentation](mappers/mappers.md). diff --git a/docs/installation.md b/docs/installation.md index ba09749..e5c3317 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -1,77 +1,50 @@ # Installation The installation is done in three parts: -- installing `Pyrrha` (as a Python module); -- installing mappers external dependencies if required; -- installing `NumbatUI` to be able to visualize Pyrrha's results. +- installing **Pyrrha** (as a Python module); +- installing **NumbatUI** to be able to visualize Pyrrha's results. !!! info - It is also possible to visualize results with `Sourcetrail`, it is the base from which `NumbatUI` was forked. The user won't be able to use new features like the renaming of the node or the launch of external programs from Sourcetrail/NumbatUI. + It is also possible to visualize results with **Sourcetrail**, which is the base from which **NumbatUI** was forked. In that case, the user won't be able to use new features such as node renaming. ## Pyrrha Installation -=== "Python Package" - Pyrrha requires a Python version >= 3.10. - It is recommended to install the Python package inside a virtualenv. You can use `pip` to install it. - ```python - pip install pyrrha-mapper +=== ":fontawesome-brands-python: Python Package" + **Pyrrha** relies on a backend (IDA or Ghidra) to generate its results, except for the light mapper `fs`. Installing the backend is not covered here; we assume the following prerequisites: + + - Python **≥ 3.11**. + - A local installation of **IDA Pro 9.1+** and/or **Ghidra 12.0+** — + required by every mapper except `fs`. + + Then you can install the **Pyrrha** Python package in a virtual environment with `pip`. 
+ ```sh + # Do not forget to activate your virtualenv + pip install pyrrha-mapper ``` If you prefer using sources to install Pyrrha, do the following: - ```commandline + ```sh # Do not forget to activate your virtualenv pip install 'pyrrha @ git+https://github.com/quarkslab/pyrrha' - - # If you prefer, you can manually clone the repository and then install the package - git clone https://github.com/quarkslab/pyrrha - cd pyrrha - pip install '.' ``` -=== "Docker" - `pyrrha` can be used with a docker. It provides Pyrrha, but you still need to install NumbatUI on your system as described in the [NumbatUI Installation](#numbatui-installation) section. +=== ":fontawesome-brands-docker: Docker Image" + **Pyrrha** can be used with a docker. It provides **Pyrrha** with a backend (**Ghidra**), but you still need to install NumbatUI on your system as described in the [**NumbatUI** Installation](#numbatui-installation) section. The docker image is directly available from our [Github registry](https://github.com/orgs/quarkslab/packages/container/package/pyrrha). ```commandline cd ROOT_DIRECTORY/.. - docker run --rm -t -v $PWD:/tmp/pyrrha ghcr.io/quarkslab/pyrrha:latest fs [OPTIONS] ROOT_DIRECTORY + docker run --rm -t -v $PWD:/tmp/pyrrha ghcr.io/quarkslab/pyrrha:latest MAPPER [OPTIONS] ROOT_DIRECTORY ``` - !!! warning - The docker image has only be built for the `fs` mapper. + ## Visualizer Installation - -=== "NumbatUI" - NumbatUI should be compiled locally, as explained in its [README](https://github.com/quarkslab/NumbatUI/blob/main/README.md). For the moment it has only be tested on Ubuntu/Debian distributions. 
- Here are the summarized compilation instructions: - - **Prerequisites** - ```commandline - apt-get update - apt-get install -y \ - cmake \ - git \ - build-essential \ - libboost-filesystem-dev libboost-program-options-dev libboost-system-dev libboost-date-time-dev \ - qt6-svg-dev qt6-base-dev qt6-5compat-dev \ - unzip wget \ - libclang-17-dev clang-17 - ``` - - **Compilation** - ```commandline - git clone https://github.com/quarkslab/NumbatUI.git numbatui - cd numbatui - mkdir -p build/Release - cd build/Release - cmake -DCMAKE_BUILD_TYPE="Release" -DBUILD_CXX_LANGUAGE_PACKAGE=ON -DBUILD_PYTHON_LANGUAGE_PACKAGE=ON ../.. && make NumbatUI -j $(nproc) - ``` -=== "Sourcetrail" +=== "**Sourcetrail**" === "Linux" - ```bash + ```sh SOURCETRAIL_URL='https://github.com/CoatiSoftware/Sourcetrail/releases/download/2021.4.19/Sourcetrail_2021_4_19_Linux_64bit.tar.gz' CHECKSUM=""f65a401daad8e16f29f7b2ff062a559999b6a8d44606db36cdf803de0cd7816d EXTRACTION_DIR="/tmp/Sourcetrail_2021_4_19_Linux_64bit" @@ -86,12 +59,6 @@ The installation is done in three parts: sudo $EXTRACTION_DIR/Sourcetrail/install.sh rm -rf $DOWNLOAD_PATH $EXTRACTION_DIR fi - - # Install pyrrha - if [ $? == 0 ]; then - echo '==== Install Pyrrha' - pip install pyrrha-mapper - fi ``` === "Windows" @@ -101,16 +68,31 @@ The installation is done in three parts: Download last Sourcetrail [release](https://github.com/CoatiSoftware/Sourcetrail/releases), and install it following [Sourcetrail documentation](https://github.com/CoatiSoftware/Sourcetrail/releases). +=== "**NumbatUI**" + **NumbatUI** should be compiled locally, as explained in its [README](https://github.com/quarkslab/NumbatUI/blob/main/README.md). For the moment it has only been tested on Ubuntu/Debian distributions. + Here are the summarized compilation instructions: -## External Dependencies - -The `fs-cg` and the `exec-decomp` mappers require to have a proper installation of [Quokka](https://github.com/quarkslab/quokka) and so of IDA. 
The `exec-decomp` also requires to have an IDA license with decompiler. - -The Quokka plugin for IDA can directly be downloaded from the [Release page](https://github.com/quarkslab/quokka/releases). The associated Python package is directly installed during Pyrrha Python package installation. - -!!! note - The `fs-cg` and the `exec-decomp` mappers could be used without Quokka and IDA if you already have the cache files for your firmware (`.decompiled` and `.quokka` files). More details in the corresponding mapper documentation. + **Prerequisites** + ```sh + apt-get update + apt-get install -y \ + cmake \ + git \ + build-essential \ + libboost-filesystem-dev libboost-program-options-dev libboost-system-dev libboost-date-time-dev \ + qt6-svg-dev qt6-base-dev qt6-5compat-dev \ + unzip wget \ + libclang-17-dev clang-17 + ``` + **Compilation** + ```sh + git clone https://github.com/quarkslab/NumbatUI.git numbatui + cd numbatui + mkdir -p build/Release + cd build/Release + cmake -DCMAKE_BUILD_TYPE="Release" -DBUILD_CXX_LANGUAGE_PACKAGE=ON -DBUILD_PYTHON_LANGUAGE_PACKAGE=ON ../.. && make NumbatUI -j $(nproc) + ``` ## Documentation diff --git a/docs/mappers/exe-decomp.md b/docs/mappers/exe-decomp.md index a27a96d..1657d76 100644 --- a/docs/mappers/exe-decomp.md +++ b/docs/mappers/exe-decomp.md @@ -1,30 +1,44 @@ -# `exe-decomp`: Executable Decompilation mapper +# `decomp`: Executable Decompilation mapper ## Introduction This mapper is not a firmware mapper but an executable mapper. It will map its call graph and its decompiled code with cross-references within the source code. 
In order the mapper will: -* Export the executable (Quokka) to extract its call graph -* Decompile all functions (with Hex-Rays) to dump the whole decompiled code +* Decompile all functions (with Hex-Rays or Ghidra) to dump the whole decompiled code * Index all functions with the associated decompilation * Apply cross-references between functions ## Usage ```commandline -Usage: pyrrha exe-decomp [OPTIONS] EXECUTABLE +Usage: pyrrha decomp [OPTIONS] EXECUTABLE - Map a single executable call graph into a numbatui-compatible database.It also index the decompiled code - along with all call cross-references. + Map a single executable call graph into a NumbatUI-compatible database. Also indexes the decompiled code along with + all call cross-references. Options: - -d, --debug Set log level to DEBUG - --db PATH NumbatUI DB file path (.srctrldb). [default: pyrrha.srctrldb] - --disassembler DISASSEMBLER Disassembler to use for disassembly. [default: Disassembler.AUTO] - -h, --help Show this message and exit. + -d, --debug Set log level to DEBUG. + --db PATH NumbatUI DB file path (.srctrldb). [default: decomp.srctrldb] + -b, --backend [ida|ghidra] Backend to use. [default: Backend.IDA] + -e, --export Create a JSON export of the resulting decompilation mapping. + -h, --help Show this message and exit. ``` +## JSON export + +With the `-e/--export` option, the mapper writes a JSON file next to the database (`.json`) describing the result of the run. It is loaded back into an `ExportedDecompilation` object exposed by Pyrrha, so results can be post-processed without re-running a disassembler: + +```python +from pyrrha_mapper.mappers import ExportedDecompilation + +result = ExportedDecompilation.from_json_export("my_binary.json") +for func in result.iter_functions(): + print(hex(func.addr), func.name, func.type) +``` + +An `ExportedDecompilation` stores the analysed binary identity (`path`, `id`, `name`) and its functions, keyed by their parser-space entry-point address. 
Each function is an `ExportedFunction` carrying its `Symbol`, its `FuncType`, the addresses it calls and is called by, its decompiled `source`, and the in-source locations of its declaration and call sites (`ExportedLocation`). + !!! note This mapper create the Quokka export of the binary as well as a cache version of all the decompiled function of the analyzed binary.. If these files already exist, it loads them without regenerate them. Like that it also allowed to use `pyrrha` in systems without Quokka and/or IDA. diff --git a/docs/mappers/mappers.md b/docs/mappers/mappers.md index 8b801bb..56170dd 100644 --- a/docs/mappers/mappers.md +++ b/docs/mappers/mappers.md @@ -5,4 +5,4 @@ Pyrrha provides the following mappers: - [`fs`](fs.md): a filesystem mapper. It maps ELF/PE files, their imports and their exports. Also map symlinks which target ELF files. - [`fs-cg`](fs-cg.md): a filesystem call graph mapper. It maps the whole firmware by interconnecting call graphs of all executables (requires disassembly). -- [`exe-decomp`](exe-decomp.md): Map an executable call graph along with its decompiled code. The mapper will use Sourcetrail source code indexing features to cross-reference calls within the source code. +- [`decomp`](exe-decomp.md): Map an executable call graph along with its decompiled code. The mapper will use Sourcetrail source code indexing features to cross-reference calls within the source code. 
diff --git a/mkdocs.yml b/mkdocs.yml index 289cbbd..60dc132 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -5,10 +5,14 @@ site_author: "Quarkslab" repo_url: "https://github.com/quarkslab/pyrrha" repo_name: "quarkslab/pyrrha" watch: [ mkdocs.yml, README.md, CHANGELOG.md, src/pyrrha_mapper ] -copyright: Copyright © 2023-2025 Quarkslab +copyright: Copyright © 2023-2026 Quarkslab theme: - name: "material" + name: "materialx" + admonition: + code: + icon: octicons/file-code-24 + color: rgba(158, 158, 158, 0.7) palette: # Palette toggle for light mode - media: "(prefers-color-scheme: light)" @@ -30,6 +34,7 @@ theme: features: - content.code.annotate - content.code.copy + - content.tabs.link - footer nav: @@ -39,10 +44,9 @@ nav: - mappers/mappers.md - Filesystem (Overview): mappers/fs.md - Filesystem (CallGraph): mappers/fs-cg.md - - Exe-Decomp: mappers/exe-decomp.md + - Decompilation Graph: mappers/exe-decomp.md - Contributing: - Mapper Development: contributing/dev_mapper.md - - Disassembler Integration: disassembler.md - Changelog: changelog.md - License: license.md diff --git a/pyproject.toml b/pyproject.toml index 4301a80..acdb934 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,11 +25,10 @@ authors = [ ] readme = "README.md" description = "A mapper collection for firmware analysis" -requires-python = ">=3.10" +requires-python = ">=3.11" license = { text = "Apache License 2.0" } classifiers = [ 'Development Status :: 4 - Beta', - 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', 'Programming Language :: Python :: 3.12', 'Programming Language :: Python :: 3.13', @@ -41,12 +40,12 @@ classifiers = [ dependencies = [ 'click>=8.2.0', 'coloredlogs', - 'lief>=0.15.0', + 'lief>=0.17.0', 'numbat>=0.2.6', 'pydantic', 'rich', - # InterCG mapper - "qbinary>=0.0.3", # will also install idascript + "ida_domain", + "pyghidra" ] dynamic = ['version'] @@ -65,7 +64,7 @@ doc = [ 'mkdocs', 'mkdocs-autorefs', 'mkdocs-glightbox', - 
'mkdocs-material[imaging]', + 'mkdocs-materialx[imaging]', 'mkdocs-section-index', 'mike', 'pymdown-extensions' @@ -75,7 +74,7 @@ test = ['pytest', 'pytest-cov'] typechecking = ['mypy'] [tool.setuptools.dynamic] -version = {attr = "pyrrha_mapper.__version__"} +version = {attr = "pyrrha_mapper.__version__"} [tool.mypy] plugins = ['pydantic.mypy'] @@ -105,5 +104,3 @@ convention = "numpy" [tool.ruff.format] line-ending = "auto" docstring-code-format = true - - diff --git a/src/pyrrha_mapper/__init__.py b/src/pyrrha_mapper/__init__.py index d2092aa..075f0dc 100644 --- a/src/pyrrha_mapper/__init__.py +++ b/src/pyrrha_mapper/__init__.py @@ -16,8 +16,8 @@ """Pyrrha is a mapper collection for firmware analysis.""" -from pyrrha_mapper.common import Binary, FileSystem, FileSystemMapper, Symbol, Symlink +from pyrrha_mapper.mappers import Binary, FileSystem, Symbol, Symlink __version__ = "1.0.1" -__all__ = ["Binary", "FileSystem", "FileSystemMapper", "Symbol", "Symlink"] +__all__ = ["Binary", "FileSystem", "Symbol", "Symlink"] diff --git a/src/pyrrha_mapper/__main__.py b/src/pyrrha_mapper/__main__.py index d30407f..0c02421 100644 --- a/src/pyrrha_mapper/__main__.py +++ b/src/pyrrha_mapper/__main__.py @@ -15,25 +15,121 @@ # limitations under the License. 
"""CLI Module.""" -import json +import functools import logging import multiprocessing -import os -import shutil -import sys from pathlib import Path import click import coloredlogs # type: ignore # no typing used in this library from numbat import SourcetrailDB -from qbinary.types import Disassembler, ExportFormat -from pyrrha_mapper import exedecomp, fs, intercg -from pyrrha_mapper.common import FileSystem -from pyrrha_mapper.types import ResolveDuplicateOption +from pyrrha_mapper.mappers import ( + FileSystem, + FileSystemImportsMapper, + GhidraDecompilMapper, + IdaDecompilMapper, + InterImageCGMapper, +) +from pyrrha_mapper.mappers.decomp_mapper import DecompilMapper +from pyrrha_mapper.types import Backend, ResolveDuplicateOption # ------------------------------------------------------------------------------- -# Common stuff for mappers +# Shared option decorators +# ------------------------------------------------------------------------------- + + +def resolve_duplicates_options(f): + """Add the three mutually exclusive resolve-duplicate options (decorator).""" + + @click.option( + "--ignore", + "resolve_duplicates", + flag_value=ResolveDuplicateOption.IGNORE, + help="When resolving duplicate imports, ignore them.", + default=True, + ) + @click.option( + "--arbitrary", + "resolve_duplicates", + flag_value=ResolveDuplicateOption.ARBITRARY, + help="When resolving duplicate imports, select the first one available.", + ) + @click.option( + "--interactive", + "resolve_duplicates", + flag_value=ResolveDuplicateOption.INTERACTIVE, + help="When resolving duplicate imports, user manually selects which one to use.", + ) + @functools.wraps(f) + def wrapper(*args, **kwargs): + return f(*args, **kwargs) + + return wrapper + + +def jobs_option(max_fraction: float = 1.0): + """Add a ``--jobs`` option (decorator). + + :param max_fraction: fraction of CPU count to use as the upper bound (default 1.0). 
+ """ + + def decorator(f): + max_jobs = max(1, int(multiprocessing.cpu_count() * max_fraction)) + + @click.option( + "-j", + "--jobs", + help="Number of parallel jobs.", + type=click.IntRange(1, max_jobs, clamp=True), + metavar="INT", + default=1, + show_default=True, + ) + @functools.wraps(f) + def wrapper(*args, **kwargs): + return f(*args, **kwargs) + + return wrapper + + return decorator + + +def backend_option(f): + """Add the ``--backend`` option (decorator).""" + + @click.option( + "-b", + "--backend", + required=False, + type=click.Choice([Backend.IDA, Backend.GHIDRA], case_sensitive=False), + default=Backend.IDA, + show_default=True, + help="Backend to use.", + ) + @functools.wraps(f) + def wrapper(*args, **kwargs): + return f(*args, **kwargs) + + return wrapper + + +def root_directory_argument(f): + """Add the ``root_directory`` argument (decorator).""" + + @click.argument( + "root_directory", + type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path), + ) + @functools.wraps(f) + def wrapper(*args, **kwargs): + return f(*args, **kwargs) + + return wrapper + + +# ------------------------------------------------------------------------------- +# Common command helpers # ------------------------------------------------------------------------------- @@ -57,16 +153,16 @@ def __init__(self, *args, **kwargs): ) self.params.insert( 0, - click.core.Option(("-d", "--debug"), is_flag=True, help="Set log level to DEBUG"), + click.core.Option(("-d", "--debug"), is_flag=True, help="Set log level to DEBUG."), ) self.no_args_is_help = True def setup_logs(is_debug_level: bool, db_path: Path | None = None) -> None: - """Set up logs. + """Set up coloured console logging and an optional log file. - :param is_debug_level: if True set the log level as DEBUG else INFO - :param db_path: if provided, save a collocated log file. + :param is_debug_level: if True, set the log level to DEBUG, else INFO. + :param db_path: if provided, write a collocated ``.log`` file. 
""" log_format = dict(fmt="[%(asctime)s][%(levelname)s]: %(message)s", datefmt="%Y-%m-%d %H:%M:%S") level = logging.DEBUG if is_debug_level else logging.INFO @@ -82,17 +178,14 @@ def setup_logs(is_debug_level: bool, db_path: Path | None = None) -> None: field_styles={"asctime": {"color": "green"}, "levelname": {"bold": True}}, **log_format, ) - if db_path: - log_file = db_path.with_suffix(".log") - # add file handler - file_handler = logging.FileHandler(log_file, mode="w") - file_handler.setLevel(level) - file_handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")) - logging.root.addHandler(file_handler) + handler = logging.FileHandler(db_path.with_suffix(".log"), mode="w") + handler.setLevel(level) + handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")) + logging.root.addHandler(handler) -def setup_db(db_path, overwrite_db: bool = True) -> SourcetrailDB: +def setup_db(db_path: Path, overwrite_db: bool = True) -> SourcetrailDB: """Create and/or open the corresponding Sourcetrail DB. :param db_path: path of the db to open/create @@ -100,15 +193,12 @@ def setup_db(db_path, overwrite_db: bool = True) -> SourcetrailDB: cleared else not :return: the created or opened Sourcetrail DB """ - # db creation/and or opening if SourcetrailDB.exists(db_path): - db = SourcetrailDB.open(db_path, clear=overwrite_db) - else: - path = Path(db_path) - if path.suffix != SourcetrailDB.SOURCETRAIL_DB_EXT: - path = path.with_suffix(f"{path.suffix}{SourcetrailDB.SOURCETRAIL_DB_EXT}") - db = SourcetrailDB.create(path) - return db + return SourcetrailDB.open(db_path, clear=overwrite_db) + path = Path(db_path) + if path.suffix != SourcetrailDB.SOURCETRAIL_DB_EXT: + path = path.with_suffix(f"{path.suffix}{SourcetrailDB.SOURCETRAIL_DB_EXT}") + return SourcetrailDB.create(path) # ------------------------------------------------------------------------------- @@ -127,82 +217,43 @@ def pyrrha(): # noqa: D103 pass -""" - Filesystem mapper. 
- Map ELF/PE files, their imports and their exports. - Also map symlinks which target ELF/PE files. -""" - - @pyrrha.command( "fs", cls=MapperCommand, - short_help="Map PE and ELF files of a filesystem into a numbatui-compatible db.", - help="Map a filesystem into a numbatui-compatible db. It maps ELF and PE files, \ -their imports/exports plus the symlinks that points on these executable files.", + short_help="Map PE and ELF files of a filesystem into a NumbatUI-compatible db.", + help=( + "Map a filesystem into a NumbatUI-compatible db. " + "It maps ELF and PE files, their imports/exports, " + "plus the symlinks that point to these executable files." + ), ) @click.option( "-e", "--export", - help="Create an export of the resulting FileSystem mapping (in JSON).", + help="Create a JSON export of the resulting FileSystem mapping.", is_flag=True, default=False, - show_default=False, -) -@click.option( - "-j", - "--jobs", - help="Number of parallel jobs created (threads).", - type=click.IntRange(1, multiprocessing.cpu_count(), clamp=True), - metavar="INT", - default=1, - show_default=True, -) -@click.option( - "--ignore", - "resolve_duplicates", - flag_value=ResolveDuplicateOption.IGNORE, - help="When resolving duplicate imports, ignore them", - default=True, -) -@click.option( - "--arbitrary", - "resolve_duplicates", - flag_value=ResolveDuplicateOption.ARBITRARY, - help="When resolving duplicate imports, select the first one available", -) -@click.option( - "--interactive", - "resolve_duplicates", - flag_value=ResolveDuplicateOption.INTERACTIVE, - help="When resolving duplicate imports, user manually select which one to use", -) -@click.argument( - "root_directory", - # help='Path of the directory containing the filesystem to map.', - type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path), ) -def fs_mapper( # noqa: D103 +@jobs_option(max_fraction=1.0) +@resolve_duplicates_options +@root_directory_argument +def fs_mapper( debug: bool, db: Path, 
export: bool, jobs: int, resolve_duplicates: ResolveDuplicateOption, root_directory: Path, -): # noqa: D103 +): + """Map PE and ELF files of a filesystem.""" setup_logs(debug) db_instance = setup_db(db) - root_directory = root_directory.absolute() - fs_mapper = fs.FileSystemImportsMapper(root_directory, db_instance) - filesystem = fs_mapper.map(jobs, resolve_duplicates) + filesystem = FileSystemImportsMapper(root_directory, db_instance).map(jobs, resolve_duplicates) - # if enabled export enabled, save FileSystem object in a JSON if export: - # maybe in the future a user can choose the output path ? - output_file = db_instance.path.with_suffix(".json") - filesystem.write(output_file) + filesystem.write(db_instance.path.with_suffix(".json")) db_instance.close() @@ -210,99 +261,42 @@ def fs_mapper( # noqa: D103 @pyrrha.command( "fs-cg", cls=MapperCommand, - short_help="Map the Call Graph of every firmware executable into a NumbatUI db.", - help="Map a the Inter-Image Call Graph of a whole filesystem into a NumbatUI db." - "It disassembles executables using a disassembler and extract the call graph." - "It then results all call references across binaries.", -) -@click.option( - "-j", - "--jobs", - help="Number of parallel jobs created (threads).", - type=click.IntRange(1, int(multiprocessing.cpu_count() * 0.7), clamp=True), # 70% of threads - metavar="INT", - default=1, - show_default=True, -) -@click.option( - "--ignore", - "resolve_duplicates", - flag_value=ResolveDuplicateOption.IGNORE, - help="When resolving duplicate imports, ignore them", - default=True, -) -@click.option( - "--arbitrary", - "resolve_duplicates", - flag_value=ResolveDuplicateOption.ARBITRARY, - help="When resolving duplicate imports, select the first one available", + short_help="Map the call graph of every firmware executable into a NumbatUI db.", + help=( + "Map the inter-image call graph of a whole filesystem into a NumbatUI db. 
" + "It disassembles executables, extracts the call graph, " + "and resolves all call references across binaries." + ), ) -@click.option( - "--interactive", - "resolve_duplicates", - flag_value=ResolveDuplicateOption.INTERACTIVE, - help="When resolving duplicate imports, user manually select which one to use", -) -@click.option( - "--disassembler", - required=False, - type=click.Choice(Disassembler, case_sensitive=False), - default=Disassembler.AUTO, - show_default=True, - help="Disassembler to use", -) -@click.option( - "--exporter", - required=False, - type=click.Choice(ExportFormat, case_sensitive=False), - default=ExportFormat.AUTO, - show_default=True, - help="Binary exporter", -) -@click.argument( - "root_directory", - # help='Path of the directory containing the filesystem to map.', - type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path), -) -def fs_call_graph_mapper( # noqa: D103 +@jobs_option(max_fraction=0.7) +@resolve_duplicates_options +@backend_option +@root_directory_argument +def fs_call_graph_mapper( debug: bool, db: Path, jobs: int, resolve_duplicates: ResolveDuplicateOption, - disassembler: Disassembler, - exporter: ExportFormat, + backend: Backend, root_directory: Path, ): + """Map the inter-image call graph of a firmware filesystem.""" setup_logs(debug, db) db_instance = setup_db(db) - if disassembler not in [Disassembler.AUTO, Disassembler.IDA, Disassembler.GHIDRA]: - click.echo("disassembler not yet supported") - # TODO: add support for other disassembler + if backend not in ( + Backend.IDA, + Backend.GHIDRA, + ): + click.echo("Backend not yet supported") return 1 - if disassembler is Disassembler.GHIDRA: - ghidra_env_var = "GHIDRA_PATH" - ghidra_dir = os.environ.get(ghidra_env_var) - if not ghidra_dir: - for ghidra_name in ["ghidra", "ghidraRun"]: - if ghidra_path := shutil.which(ghidra_name): - os.environ[ghidra_env_var] = str(Path(ghidra_path).resolve().parent) - - intercg.InterImageCGMapper.DISASS = disassembler - 
intercg.InterImageCGMapper.EXPORT = exporter - root_directory = root_directory.absolute() - # Create InterCG mapper and launch mapping try: - intercg_mapper = intercg.InterImageCGMapper(root_directory, db_instance) + intercg_mapper = InterImageCGMapper(root_directory, db_instance, backend) fs_object: FileSystem = intercg_mapper.map(jobs, resolve_duplicates) - - # systematically save the FileSystem object (shall be enriched with calls) - output_file = db_instance.path.with_suffix(intercg_mapper.FS_EXT) - fs_object.write(output_file) - + fs_object.write(db_instance.path.with_suffix(intercg_mapper.FS_EXT)) except RuntimeError: pass @@ -311,49 +305,56 @@ def fs_call_graph_mapper( # noqa: D103 @pyrrha.command( - "exe-decomp", + "decomp", cls=MapperCommand, short_help="Map an executable call graph with its decompiled code.", - help="Map a single executable call graph into a numbatui-compatible database." - "It also index the decompiled code along with all call cross-references.", + help=( + "Map a single executable call graph into a NumbatUI-compatible database. " + "Also indexes the decompiled code along with all call cross-references." 
+ ), ) +@backend_option @click.option( - "--disassembler", - required=False, - type=click.Choice(Disassembler, case_sensitive=False), - default=Disassembler.AUTO, - show_default=True, - help="Disassembler to use for disassembly and decompilation.", -) -@click.option( - "--exporter", - required=False, - type=click.Choice(ExportFormat, case_sensitive=False), - default=ExportFormat.AUTO, - show_default=True, - help="Binary export format to use for binary analysis.", + "-e", + "--export", + help="Create a JSON export of the resulting decompilation mapping.", + is_flag=True, + default=False, ) @click.argument( "executable", type=click.Path(exists=False, file_okay=True, dir_okay=False, path_type=Path), ) -def fs_exe_decompiled_mapper( # noqa: D103 - debug: bool, db: Path, disassembler: Disassembler, exporter: ExportFormat, executable: Path +def fs_exe_decompiled_mapper( + debug: bool, + db: Path, + backend: Backend, + export: bool, + executable: Path, ): - # Change default db name. By default will be .srctrldb - if db.name == "exe-decomp.srctrldb": + """Map a single executable with decompiled code.""" + if db.name == "decomp.srctrldb": db = Path(str(executable) + ".srctrldb") setup_logs(debug, db) db_instance = setup_db(db) - if disassembler not in [Disassembler.AUTO, Disassembler.IDA]: - click.echo(f"disassembler {disassembler.name} not yet supported") - # TODO: add support for other disassembler (forward parameter to mapper) - return 1 - - if exedecomp.map_binary(db_instance, executable, disassembler, exporter): + mapper: DecompilMapper + match backend: + case Backend.IDA: + mapper = IdaDecompilMapper(db_instance, executable) + case Backend.GHIDRA: + mapper = GhidraDecompilMapper(db_instance, executable) + case _: + click.echo(f"Backend {backend.name} not yet supported") + return 1 + + if mapper.map(): logging.info("success.") + if export: + export_path = db_instance.path.with_suffix(".json") + mapper.to_export().write(export_path) + logging.info(f"write export into: 
{export_path}") else: logging.error("failure.") @@ -362,54 +363,5 @@ def fs_exe_decompiled_mapper( # noqa: D103 db_instance.close() -@pyrrha.command( - "workspace-utils", short_help="Help managing workspaces (for cross-binary referencing)." -) -@click.option("-l", "--list", is_flag=True, default=False, help="List all workspaces.") -@click.option("-a", "--add", is_flag=True, default=False, help="Add a rootfs as workspace.") -@click.option("-d", "--delete", is_flag=True, default=False, help="Remove a rootfs as workspace.") -@click.argument( - "path", - type=click.Path(exists=True, file_okay=True, dir_okay=True, path_type=Path), - required=False, -) -def workspace_utils(list: bool, add: bool, delete: bool, path: Path): - """Manage workspaces for cross-binary referencing.""" - # Configure logs (there is not debug ones) - setup_logs(False) - - # Get the base config directory - if sys.platform == "win32": - heimdallr_settings = Path(os.path.expandvars("%APPDATA%/heimdallr/settings.json")) - else: - heimdallr_settings = Path(os.path.expandvars("$HOME/.config/heimdallr/settings.json")) - if not heimdallr_settings.exists(): - click.echo(f"heimdallr config directory {heimdallr_settings} does not exists") - return -1 - - # Load settings - settings = json.loads(heimdallr_settings.read_text()) - idb_path = settings.get("idb_path") - if idb_path is None: - click.echo(f"heimdallr settings file {heimdallr_settings} does not contain idb_path") - return -1 - - if list: - for path in idb_path: - logging.info(f"- {path}") - - if add: - settings["idb_path"].append(str(Path(path).absolute())) - heimdallr_settings.write_text(json.dumps(settings, indent=4)) # Write it back - - if delete: - try: - settings["idb_path"].remove(str(path)) - heimdallr_settings.write_text(json.dumps(settings, indent=4)) # Write it back - except ValueError: - click.echo(f"Path {path} not in idb_path of settings.") - return -1 - - if __name__ == "__main__": pyrrha() diff --git 
a/src/pyrrha_mapper/intercg/__init__.py b/src/pyrrha_mapper/backend/__init__.py similarity index 85% rename from src/pyrrha_mapper/intercg/__init__.py rename to src/pyrrha_mapper/backend/__init__.py index da2fca5..571d0cd 100644 --- a/src/pyrrha_mapper/intercg/__init__.py +++ b/src/pyrrha_mapper/backend/__init__.py @@ -15,6 +15,8 @@ # limitations under the License. """Module for the intercg mapper.""" -from .fwmapper import InterImageCGMapper +from .base import Backend +from .ghidra import Ghidra +from .ida import IDA -__all__ = ["InterImageCGMapper"] \ No newline at end of file +__all__ = ["Backend", "Ghidra", "IDA"] diff --git a/src/pyrrha_mapper/backend/base.py b/src/pyrrha_mapper/backend/base.py new file mode 100644 index 0000000..fe5400f --- /dev/null +++ b/src/pyrrha_mapper/backend/base.py @@ -0,0 +1,86 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023-2025 Quarkslab +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Interface for backends used by mappers.""" + +from abc import ABCMeta, abstractmethod +from collections.abc import Iterator +from pathlib import Path + +from pyrrha_mapper.types import FuncType + + +class Backend(object, metaclass=ABCMeta): + """Abstraction of any backend used to run analysis.""" + + def __init__( + self, + bin_path: Path, + root_directory: Path | None, + decompilation: bool = False, + image_base: int = 0, + ) -> None: + """Open the binary parser and run any required analysis.""" + pass + + @abstractmethod + def close(self) -> None: + """Close the binary parser and release all resources.""" + ... + + @abstractmethod + def is_func_start(self, addr: int) -> bool: + """:return: True if *addr* (parser space) is the entry point of a function.""" + ... + + @property + @abstractmethod + def func_addrs(self) -> Iterator[int]: + """Yield the parser-space entry-point address of every known function.""" + + @abstractmethod + def func_mangled_name(self, addr: int) -> str: + """:return:: the raw name of a function at *addr*.""" + ... + + @abstractmethod + def func_demangled_name(self, addr: int) -> str: + """:return: the demangled name, falling back to the mangled name.""" + ... + + @abstractmethod + def func_children(self, addr: int) -> list[int]: + """:return: entry-point addresses of callees of the function at *addr*.""" + ... + + @abstractmethod + def func_parents(self, addr: int) -> list[int]: + """:return: entry-point addresses of callers of the function at *addr*.""" + ... + + @abstractmethod + def func_type(self, addr: int) -> FuncType: + """:return: the FuncType of the function at *addr*. + + Thunk stubs that resolve to external/imported functions must return + ``FuncType.IMPORTED`` so the trampoline resolution in ``__init__`` + correctly forwards callers to the imported symbol name. + """ + ... + + @abstractmethod + def func_decompiled(self, addr: int) -> str: + """:return: decompilation result of the function""" + ... 
diff --git a/src/pyrrha_mapper/backend/ghidra.py b/src/pyrrha_mapper/backend/ghidra.py new file mode 100644 index 0000000..22ebbee --- /dev/null +++ b/src/pyrrha_mapper/backend/ghidra.py @@ -0,0 +1,385 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023-2025 Quarkslab +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Ghidra backend implementation of Backend abstract interface.""" + +import logging +import os +import re +import shutil +import tempfile +from collections.abc import Iterator +from pathlib import Path + +import pyghidra + +from pyrrha_mapper.backend import Backend +from pyrrha_mapper.types import FuncType + +# Analyzers required for call-graph extraction. +# Everything else is explicitly disabled to minimise analysis time. 
+_GHIDRA_REQUIRED_ANALYZERS: frozenset[str] = frozenset( + [ + # --- Function discovery --- + "Disassemble Entry Points", + "Function Start Search", + "Function Start Search After Code", + "Non-Returning Functions - Discovered", + "Non-Returning Functions - Known", + # --- Call graph / cross-references --- + "Call Convention ID", + "Call-Fixup Installer", + "Subroutine References", + "Subroutine References - One Time", + # --- Thunk resolution --- + "Thunk Function", + # --- Format-specific import/export tables --- + # ELF + "ELF Scalar Operand References", + "External Entry References", + # PE + "PE Entry Point", + "Windows x86 PE Thunk Functions", + # Mach-O (no extra analyzer needed beyond the loader itself) + # --- Demangling --- + "Demangler GNU", + "Demangler Microsoft", + ] +) + +# Additional analyzers required when the Ghidra decompiler is used to produce +# pseudocode (i.e. in GhidraLoader but NOT in GhidraParser). +# +# Stack — stack-frame analysis; needed for correct local- +# variable naming in pseudocode (param_N / local_N). +# Stack Variable References — accurate tracking of stack-slot references +# across basic blocks used by the decompiler. +# Shared Return Calls — identifies tail-call / shared-epilogue patterns; +# without it some call edges are absent from the +# decompiled output. +# Data Type Propagation — propagates inferred struct/pointer types through +# the program; without it the decompiler emits +# ``undefined *`` for most pointer arguments, +# making call-site name matching less reliable. +_GHIDRA_DECOMPILER_EXTRA_ANALYZERS: frozenset[str] = frozenset( + [ + "Stack", + "Stack Variable References", + "Shared Return Calls", + "Data Type Propagation", + ] +) + +# Tool-generated fallback names emitted by Ghidra when the real symbol name is +# unknown. Callees matching this pattern cannot be resolved as meaningful +# targets and must be skipped. 
+# FUN_ unnamed Ghidra function +# _INIT_ ELF .init_array slot +# _FINI_ ELF .fini_array slot +_GHIDRA_SYNTHETIC_NAME_RE: re.Pattern[str] = re.compile( + r"^(?:FUN_[0-9A-Fa-f]+|_INIT_\d+|_FINI_\d+)$" +) + + +class Ghidra(Backend): + """Ghidra backend.""" + + def __init__( + self, + bin_path: Path, + root_directory: Path | None, + decompilation: bool = False, + image_base: int = 0, + timeout = 600, + ) -> None: + """Open the binary parser and run any required analysis.""" + self.decompilation_activated = decompilation + self.image_base = image_base + self._timeout = timeout + + # Initialise all attributes upfront so _close_ghidra is always safe. + self._pyghidra_ctx = None + self._ghidra_program = None + self._ghidra_project_dir: Path | None = None + self._ghidra_func_manager = None + self._ghidra_symbol_table = None + self._ghidra_demangler = None + self._ghidra_cached_func = None + self._ghidra_load_base: int = 0 + self._ghidra_monitor = None + + self._ghidra_project_dir = Path(tempfile.mkdtemp(prefix=f"ghidra_{os.getpid()}_")) + + # Start the JVM once per worker process (no-op if already running). + if not pyghidra.started(): + from pyghidra.launcher import HeadlessPyGhidraLauncher # type: ignore + + launcher = HeadlessPyGhidraLauncher() + launcher.add_vmargs("-Xms512m", "-Xmx2g", "-XX:+UseG1GC") + launcher.start() + + # Ghidra imports must come after JVM start. + from ghidra.app.decompiler import DecompInterface # type: ignore + from ghidra.app.util.demangler.gnu import GnuDemangler # type: ignore + from ghidra.util.task import ConsoleTaskMonitor # type: ignore + + self._ghidra_monitor = ConsoleTaskMonitor() + + # Open without running analysis yet so we can configure the analyser set. 
+ self._pyghidra_ctx = pyghidra.open_program( + str(bin_path) if root_directory is None else str(root_directory / bin_path), + project_location=str(self._ghidra_project_dir), + project_name="p", + analyze=False, + ) + flat_api = self._pyghidra_ctx.__enter__() + program = flat_api.getCurrentProgram() + + # Build the effective analyser set and apply it. + if self.decompilation_activated: + active_analyzers = _GHIDRA_REQUIRED_ANALYZERS | _GHIDRA_DECOMPILER_EXTRA_ANALYZERS + else: + active_analyzers = _GHIDRA_REQUIRED_ANALYZERS + analyzer_options = program.getOptions("Analyzers") + for option_name in analyzer_options.getOptionNames(): + enabled = option_name in active_analyzers + try: + analyzer_options.setBoolean(option_name, enabled) + except Exception: + # Some option names are not simple booleans; skip them silently. + pass + + flat_api.analyzeAll(program) + + self._ghidra_program = program + # Derive load base from the program itself, not from LIEF's image_base, + # so that _to_ghidra_address / _to_parser_addr are always consistent. 
+ self._ghidra_load_base = program.getImageBase().getOffset() + self._ghidra_func_manager = program.getFunctionManager() + self._ghidra_symbol_table = program.getSymbolTable() + + demangler = GnuDemangler() + self._ghidra_demangler = demangler if demangler.canDemangle(program) else None + + if self.decompilation_activated: + self.ifc = DecompInterface() + self.ifc.openProgram(self._ghidra_program) + self.monitor = ConsoleTaskMonitor() + + def close(self) -> None: + """Close the binary parser and release all resources.""" + if self.decompilation_activated: + self.ifc.dispose() + if self._pyghidra_ctx is not None: + try: + self._pyghidra_ctx.__exit__(None, None, None) + except Exception: + pass + self._pyghidra_ctx = None + if self._ghidra_project_dir is not None: + shutil.rmtree(self._ghidra_project_dir, ignore_errors=True) + self._ghidra_project_dir = None + + def is_func_start(self, addr: int) -> bool: + """:return: True if *addr* (parser space) is the entry point of a function.""" + return self._get_ghidra_func(addr) is not None + + @property + def func_addrs(self) -> Iterator[int]: + """Yield the parser-space entry-point address of every known function.""" + seen_addrs: set[int] = set() + for func in self._ghidra_func_manager.getFunctions(True): # type: ignore + if func.isExternal(): + continue + self._ghidra_cached_func = func + parser_addr = self._to_parser_addr(func.getEntryPoint().getOffset()) + if parser_addr in seen_addrs: + continue + seen_addrs.add(parser_addr) + yield self._to_parser_addr(func.getEntryPoint().getOffset()) + + def func_mangled_name(self, addr: int) -> str: + """Return the raw (mangled) name of the function at *addr*. + + Queries the symbol table for ``_Z``-prefixed (Itanium ABI) symbols + first, then falls back to ``func.getName()``, rejecting partial + demangles. Returns ``FUN_`` when no usable name is found. + + :param addr: function entry-point address in parser space. + :return: mangled symbol name or ``FUN_``. 
+ """ + func = self._get_ghidra_func(addr) + if func is None: + return f"FUN_{addr:X}" + + ghidra_addr = self._to_ghidra_address(addr) + for sym in self._ghidra_symbol_table.getSymbols(ghidra_addr): # type: ignore + raw = sym.getName() + if raw and raw.startswith("_Z"): + return raw + + name = func.getName() + if name and not ( + name.startswith("~") + or name.startswith("operator") + or (name.startswith("<") and name.endswith(">")) + ): + return name + + return f"FUN_{addr:X}" + + def func_demangled_name(self, addr: int) -> str: + """Return the demangled name of the function at *addr*. + + Uses ``getName()`` on the ``DemangledObject`` (bare function name + without return type or parameter signature). Falls back to the + mangled name when the demangler is unavailable or returns ``None``. + + :param addr: function entry-point address in parser space. + :return: demangled name, or mangled name if demangling is unavailable. + """ + mangled = self.func_mangled_name(addr) + if self._ghidra_demangler is not None: + try: + result = self._ghidra_demangler.demangle(mangled, True) + if result is not None: + name = result.getName() + if name: + return name + except Exception: + pass + return mangled + + def func_children(self, addr: int) -> list[int]: + """:return: entry-point addresses of callees of the function at *addr*.""" + func = self._get_ghidra_func(addr) + if func is None: + return [] + listing = self._ghidra_program.getListing() # type: ignore + seen: set[int] = set() + result: list[int] = [] + for cu in listing.getCodeUnits(func.getBody(), True): + for ref in cu.getReferencesFrom(): + if not ref.getReferenceType().isCall(): + continue + target_offset = ref.getToAddress().getOffset() + parser_addr = self._to_parser_addr(target_offset) + if parser_addr in seen: + continue + seen.add(parser_addr) + result.append(parser_addr) + return result + + def func_parents(self, addr: int) -> list[int]: + """:return: entry-point addresses of callers of the function at *addr*.""" 
+ func = self._get_ghidra_func(addr) + seen: set[str] = set() + result: list[int] = [] + for caller in func.getCallingFunctions(self._ghidra_monitor) if func is not None else []: + if caller.isExternal(): + continue + name = caller.getName() + if name in seen: + continue + seen.add(name) + result.append(self._to_parser_addr(caller.getEntryPoint().getOffset())) + return result + + def func_type(self, addr: int) -> FuncType: + """:return: the FuncType of the function at *addr*. + + Thunk stubs that resolve to external/imported functions must return + ``FuncType.IMPORTED`` so the trampoline resolution in ``__init__`` + correctly forwards callers to the imported symbol name. + """ + func = self._get_ghidra_func(addr) + if func is None: + return FuncType.NORMAL + if func.isExternal(): + return FuncType.IMPORTED + if func.isThunk(): + return FuncType.THUNK + return FuncType.NORMAL + + def func_decompiled(self, addr: int) -> str: + """:return: decompilation result of the function""" + assert self.decompilation_activated + func = self._get_ghidra_func(addr) + if func is None: + return "" + addr = self._to_parser_addr(func.getEntryPoint().getOffset()) + try: + res = self.ifc.decompileFunction(func, self._timeout, self.monitor) + if res is None or not res.decompileCompleted(): + return "" + return str(res.getDecompiledFunction().getC()) + except Exception as exc: + logging.debug(f"[Ghidra] skipping {addr:#x} ({self.func_mangled_name(addr)!r}): {exc}") + return "" + + # ------------------------------------------------------------------ + # Shared Ghidra primitives + # ------------------------------------------------------------------ + + def _to_ghidra_address(self, parser_addr: int): + """Convert a parser-space address to a Ghidra ``Address`` object. + + :param parser_addr: address in parser space. + :return: Ghidra ``Address`` object. 
+ """ + abs_addr = (parser_addr + self._ghidra_load_base) & 0xFFFFFFFFFFFFFFFF + if abs_addr >= 0x8000000000000000: + abs_addr -= 0x10000000000000000 + return ( + self._ghidra_program.getAddressFactory().getDefaultAddressSpace().getAddress(abs_addr) # type: ignore + ) + + def _to_parser_addr(self, ghidra_offset: int) -> int: + """Convert an absolute Ghidra address offset to parser space. + + :param ghidra_offset: raw offset returned by ``getOffset()``. + :return: address in parser space. + """ + return ghidra_offset - self._ghidra_load_base + + def _get_ghidra_func(self, parser_addr: int): + """Return the Ghidra ``Function`` at *parser_addr*, using a single-entry cache. + + Falls back to ``getFunctionContaining`` when ``getFunctionAt`` returns + ``None``, handling the ARM THUMB ±1 offset case. Only accepts the + fallback result when the entry point matches within ±1 byte. + + :param parser_addr: address in parser space. + :return: Ghidra ``Function``, or ``None`` if not found. + """ + if ( + self._ghidra_cached_func is not None + and self._to_parser_addr(self._ghidra_cached_func.getEntryPoint().getOffset()) + == parser_addr + ): + return self._ghidra_cached_func + + ghidra_addr = self._to_ghidra_address(parser_addr) + func = self._ghidra_func_manager.getFunctionAt(ghidra_addr) # type: ignore + if func is None: + func = self._ghidra_func_manager.getFunctionContaining(ghidra_addr) # type: ignore + if func is not None: + entry_parser_addr = self._to_parser_addr(func.getEntryPoint().getOffset()) + if abs(entry_parser_addr - parser_addr) > 1: + func = None + + if func is not None: + self._ghidra_cached_func = func + return func diff --git a/src/pyrrha_mapper/backend/ida.py b/src/pyrrha_mapper/backend/ida.py new file mode 100644 index 0000000..a959ed9 --- /dev/null +++ b/src/pyrrha_mapper/backend/ida.py @@ -0,0 +1,220 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023-2025 Quarkslab +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use 
this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""IDA Pro backend implementation of Backend abstract interface.""" + +from __future__ import annotations + +import logging +from collections.abc import Iterator +from pathlib import Path + +from pyrrha_mapper.backend import Backend +from pyrrha_mapper.types import FuncType + + +class IDA(Backend): + """IDA Pro backend.""" + + def __init__( + self, + bin_path: Path, + root_directory: Path | None, + decompilation: bool = False, + image_base: int = 0, + ) -> None: + from ida_domain.database import Database, IdaCommandOptions + + self.decompilation_activated = decompilation + self.image_base = image_base + self._bin_path = bin_path + self._ida_cached_func = None # single-entry cache used by _get_ida_func + self._ida_db: Database = Database.open( + str(bin_path) if root_directory is None else str(root_directory / bin_path), + args=IdaCommandOptions(auto_analysis=True, new_database=False), + ) + + def close(self) -> None: + """Close the binary parser and release all resources.""" + self._ida_db.close(save=False) + + def is_func_start(self, addr: int) -> bool: + """:return: True if *addr* (parser space) is the entry point of a function.""" + from ida_domain.base import InvalidEAError + + try: + if self._ida_cached_func is not None and addr == self._ida_cached_func.start_ea: + return True + func = self._ida_db.functions.get_at(addr) + return func is not None and func.start_ea == addr + except InvalidEAError: + return False + + @property + def func_addrs(self) -> Iterator[int]: + """Yield the parser-space 
entry-point address of every known function.""" + from ida_domain.functions import FunctionFlags + + for func in self._ida_db.functions.get_all(): + if FunctionFlags.TAIL in self._ida_db.functions.get_flags(func): + continue + self._ida_cached_func = func + yield func.start_ea + + def func_mangled_name(self, addr: int) -> str: + """Return the raw (mangled) name of the function at *addr*. + + Resolution order: + + 1. Import table — preferred for genuine PLT stubs so the name matches + LIEF's :attr:`~pyrrha_mapper.common.Binary.imported_symbol_names`. + 2. ``get_name`` on the ``func_t`` — covers normal and library functions. + 3. ``sub_`` fallback when IDA could not recover any name. + + :param addr: function entry-point address. + :return: mangled symbol name or ``sub_``. + """ + func = self._get_ida_func(addr) + import_info = self._ida_db.imports.get_import_at(addr) + if import_info is not None and import_info.name is not None: + return import_info.name + if func is not None: + name = self._ida_db.functions.get_name(func) + if name: + return name + return f"sub_{addr:X}" + + def func_demangled_name(self, addr: int) -> str: + """:return: the demangled name, falling back to the mangled name.""" + mangled = self.func_mangled_name(addr) + demangled = self._ida_db.names.demangle_name(mangled) + return demangled if demangled is not None else mangled + + def func_children(self, addr: int) -> list[int]: + """Return parser-space addresses of callees of the function at *addr*. + + When IDA's ``get_callees`` returns a ``FUNC_TAIL`` chunk, the chunk's + own callee list is followed one level to obtain the real parent + ``start_ea``. Unresolvable self-referential chunks are dropped silently. + + :param addr: function entry-point address. + :return: list of callee entry-point addresses. 
+ """ + from ida_domain.functions import FunctionFlags + + func = self._get_ida_func(addr) + result: list[int] = [] + for callee in self._ida_db.functions.get_callees(func) if func is not None else []: + if FunctionFlags.TAIL in self._ida_db.functions.get_flags(callee): + parents = list(self._ida_db.functions.get_callees(callee)) + if parents and parents[0].start_ea != callee.start_ea: + result.append(parents[0].start_ea) + else: + result.append(callee.start_ea) + return result + + def func_parents(self, addr: int) -> list[int]: + """:return: parser-space addresses of callers of the function at *addr*.""" + func = self._get_ida_func(addr) + if func is None: + return [] + return [caller.start_ea for caller in self._ida_db.functions.get_callers(func)] + + def func_type(self, addr: int) -> FuncType: + """:return: the FuncType of the function at *addr*. + + Classification order: + + 1. No callees + present in import table → ``IMPORTED`` (bare PLT stub). + 2. Versioned symbol (``name@@VERSION``) → ``IMPORTED``. + 3. ``FUNC_THUNK`` + single callee whose name is a known import → + ``IMPORTED`` (thunk wrapping an external symbol). + 4. ``FUNC_THUNK`` otherwise → ``THUNK``. + 5. ``FUNC_LIB`` → ``LIBRARY``. + 6. Default → ``NORMAL``. 
+ """ + from ida_domain.functions import FunctionFlags + + func = self._get_ida_func(addr) + if func is None: + return FuncType.NORMAL + + flags = self._ida_db.functions.get_flags(func) + callees = list(self._ida_db.functions.get_callees(func)) + + if len(callees) == 0 and self._ida_db.imports.get_import_at(func.start_ea): + return FuncType.IMPORTED + if len(func.name.split("@@")) == 2: + return FuncType.IMPORTED + if FunctionFlags.THUNK in flags: + if len(callees) == 1: + callee_name = self._ida_db.functions.get_name(callees[0]) + if self._ida_db.imports.exists(callee_name): + return FuncType.IMPORTED + return FuncType.THUNK + if FunctionFlags.LIB in flags: + return FuncType.LIBRARY + return FuncType.NORMAL + + def func_decompiled(self, addr: int) -> str: + """:return: decompilation result of the function""" + from ida_domain.base import IdaDomainError + + func = self._get_ida_func(addr) + if func is None: + return "" + try: + pseudocode = self._ida_db.functions.get_pseudocode(func) + lines = pseudocode.to_text(remove_tags=True) + except IdaDomainError as exc: + logging.debug( + f"[IDA] skipping {func.start_ea:#x} " + f"({self._ida_db.functions.get_name(func)!r}): {exc}" + ) + return "" + return "\n".join(lines) + + # ------------------------------------------------------------------ + # Internal IDA method + # ------------------------------------------------------------------ + + def _get_ida_func(self, addr: int): + """Return the IDA ``func_t`` at *addr*, using a single-entry cache. + + :param addr: function entry-point address. + :return: the IDA ``func_t`` object, or ``None`` if not found. + """ + if self._ida_cached_func is not None and addr == self._ida_cached_func.start_ea: + return self._ida_cached_func + return self._ida_db.functions.get_at(addr) + + @property + def _ida_funcs(self) -> Iterator: + """Yield every non-tail ``func_t`` in the IDA database. 
+ + ``FUNC_TAIL`` entries are non-contiguous chunks that share the parent + function's name and address space. Yielding them would produce + duplicate or misleading entries in any downstream mapping, so they are + filtered out here at the source. + + :return: iterator of ``func_t`` objects with ``FUNC_TAIL`` excluded. + """ + from ida_domain.functions import FunctionFlags + + for func in self._ida_db.functions.get_all(): + if FunctionFlags.TAIL in self._ida_db.functions.get_flags(func): + continue + self._ida_cached_func = func + yield func diff --git a/src/pyrrha_mapper/common/filesystem_mapper.py b/src/pyrrha_mapper/common/filesystem_mapper.py deleted file mode 100755 index 3225fb7..0000000 --- a/src/pyrrha_mapper/common/filesystem_mapper.py +++ /dev/null @@ -1,338 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2023-2025 Quarkslab -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Base classes for mapping binaries of a filesystem.""" - -import logging -from abc import ABC, abstractmethod -from contextlib import contextmanager -from pathlib import Path -from typing import overload - -from numbat import SourcetrailDB -from numbat.exceptions import DBException -from rich.progress import ( - BarColumn, - MofNCompleteColumn, - Progress, - TextColumn, - TimeElapsedColumn, -) - -from pyrrha_mapper.common.objects import Binary, FileSystem, Symlink, Symbol -from pyrrha_mapper.exceptions import PyrrhaError -from pyrrha_mapper.types import ResolveDuplicateOption - - -@contextmanager -def hide_progress(progress: Progress): - """Context Manager which temporally hide a `rich` progress bar. - - Code from https://github.com/Textualize/rich/issues/1535#issuecomment-1745297594 - """ - transient = progress.live.transient # save the old value - progress.live.transient = True - progress.stop() - progress.live.transient = transient # restore the old value - try: - yield - finally: - # make space for the progress to use so it doesn't overwrite any previous lines - print("\n" * (len(progress.tasks) - 2)) - progress.start() - - -class FileSystemMapper(ABC): - """Abstract class which is a base mapper to binaries of a filesystem. - - It maps a filesystem in the following order: - - binaries - - symlinks - - lib imports - - symbol_imports. - To change the behavior of these mapping you can reimplement the - map_* corresponding method. - - Init params - :param root_directory: directory containing the filesystem to map - :param db: interface to the DB - """ - - def __init__(self, root_directory: Path | str, db: SourcetrailDB | None): - self.root_directory = Path(root_directory).resolve().absolute() - self.db_interface = db - self.fs = FileSystem(root_dir=self.root_directory) - self._dry_run = not bool(db) - - @property - def dry_run_mode(self) -> bool: - """Returns whether a Sourcetrail DB as been provided or not. 
- - If not, only produce the FileSystem object that can also - be used independently. - """ - return self._dry_run - - @dry_run_mode.setter - def dry_run_mode(self, value: bool) -> None: - """If True does not record in db.""" - self._dry_run = value - - # ===================== Records in DB (NumbatUI DB) =============================== - - def record_import_in_db( - self, source_id: int | None, dest_id: int | None, log_prefix: str = "" - ) -> None: - """Record in DB the import of dest by source.""" - if self.dry_run_mode: - return None - assert self.db_interface is not None - if source_id is None or dest_id is None: - logging.error(f"{log_prefix}: Cannot record import, src and/or dest are unknown") - else: - self.db_interface.record_ref_import(source_id, dest_id) - - def record_binary_in_db(self, binary: Binary, log_prefix: str = "") -> Binary: - """Record the binary inside the DB as well as its internal symbols. - - Update 'bin_obj.id' with the id of the created object in DB and does the same - thing for its symbol. It will record symbols using their demangled names. 
- - :warning: do not record calls as well as any links between several binaries - - :param binary: the Binary object to map - :return: the updated object - """ - # If dry run do not store the binary in DB - if self.dry_run_mode: - return binary - - assert self.db_interface is not None - binary.id = self.db_interface.record_class( - binary.name, prefix=f"{binary.path.parent}/", delimiter=":" - ) - if binary.id is None: - logging.error(f"{log_prefix}: Record of binary failed.") - return binary - # dict demangled_name -> id to check if a demangled name has already been recorded - recorded_symb: dict[str, int] = dict() - for symbol in set(binary.iter_exported_symbols()): - if symbol.demangled_name in recorded_symb: - logging.debug( - f"{log_prefix}: demangled name {symbol.demangled_name} already in db " - "common node for these symbols" - ) - symbol.id = recorded_symb[symbol.demangled_name] - continue - if symbol.is_func: - symbol.id = self.db_interface.record_method( - symbol.demangled_name, - parent_id=binary.id, - prefix=hex(symbol.addr) if symbol.addr is not None else "None", - ) - if symbol.id is not None: - self.db_interface.change_node_color( - symbol.id, fill_color="#bee0af", border_color="#395f33" - ) - else: - symbol.id = self.db_interface.record_field( - symbol.demangled_name, - parent_id=binary.id, - prefix=hex(symbol.addr) if symbol.addr is not None else "None", - ) - - if symbol.id is None: - logging.error(f"{log_prefix}: Record of symbol '{symbol.demangled_name}' failed.") - else: - try: - self.symbol_recorded(binary, symbol) - self.db_interface.record_public_access(symbol.id) - recorded_symb[symbol.demangled_name] = symbol.id - except DBException as e: - raise PyrrhaError( - f"{log_prefix}: Cannot register access to symbol {symbol.demangled_name}: " - f"{e}" - ) from e - - for symbol in set(binary.iter_not_exported_functions()): - symbol.id = self.db_interface.record_method( - symbol.demangled_name, - parent_id=binary.id, - prefix=hex(symbol.addr) if 
symbol.addr is not None else "None", - ) - if symbol.id is None: - logging.error(f"{log_prefix}: Record of symbol '{symbol.demangled_name}' failed.") - else: - try: - self.symbol_recorded(binary, symbol) - self.db_interface.record_private_access(symbol.id) - except DBException as e: - raise PyrrhaError( - f"{log_prefix}: Cannot register access to symbol" - f" {symbol.demangled_name}: {e}" - ) from e - - return binary - - def symbol_recorded(self, binary: Binary, symbol: Symbol) -> None: - """Hook called when a symbol is recorded in the DB. - - This method can be overridden to add custom behavior. - - :param binary: the Binary object containing the method - :param symbol: the Symbol object representing the method - """ - pass # Default implementation does nothing - - def record_symlink_in_db(self, sym: Symlink, log_prefix: str = "") -> Symlink: - """Record into DB the symlink and its link to its target. - - Update 'sym.id' with the id of the created object. - :param sym: symlink object - :return: the updated object - """ - if self.dry_run_mode: - return sym - assert self.db_interface is not None - sym.id = self.db_interface.record_typedef_node( - sym.name, prefix=f"{sym.path.parent}/", delimiter=":" - ) - if sym.id is None: - logging.error(f"{log_prefix}: Record of symlink failed.") - else: - self.record_import_in_db(sym.id, sym.target.id) - return sym - - # =============================== Utils =============================== - - @overload - @staticmethod - def _select_fs_component( - strategy: ResolveDuplicateOption, - matching_objects: list[Binary], - log_prefix: str, - target_name: str, - cache: set[Binary] | None = None, - ) -> Binary | None: ... - - @overload - @staticmethod - def _select_fs_component( - strategy: ResolveDuplicateOption, - matching_objects: list[Symlink], - log_prefix: str, - target_name: str, - cache: set[Symlink] | None = None, - ) -> Symlink | None: ... 
- - @staticmethod - def _select_fs_component( - strategy: ResolveDuplicateOption, - matching_objects: list[Binary] | list[Symlink], - log_prefix: str, - target_name: str, - cache: set[Binary] | set[Symlink] | None = None, - ) -> Binary | Symlink | None: - """Choice of one element of a given list according to the strategy. - - Given a list of objects which match a target, select one or None among - the given list according the strategy given It also logs the choice made - (debug level). If requireds by the strategy, an interaction with the user could - be made. - :param strategy: the resolution strategy - :param matching_objects: a list of FileSystemComponents (NOT empty, not - check by the function) - :param log_prefix: Prefix used at the beginning of each log - :param target_name: Target name, used in logs (and user interaction) - :param resolve_cache: cache of previously selected choices for this target - :return: the selected FileSystemComponent | None if resolution strategy - is IGNORE - """ - if len(matching_objects) > 1 and strategy is ResolveDuplicateOption.IGNORE: - logging.debug( - f"{log_prefix}: several matches for {target_name} but strategy is " - f"{ResolveDuplicateOption.IGNORE.name} so nothing selected" - ) - return None - selected_index = None - selected_bin = None - if len(matching_objects) > 1 and strategy is ResolveDuplicateOption.INTERACTIVE: - for cache_entry in cache or {}: - if cache_entry in matching_objects: # reuse already selected entry - logging.debug( - f"{log_prefix}: manually selected entry to disambiguate {target_name}" - ) - selected_bin = cache_entry - - while ( - selected_bin is None - or selected_index is None - or selected_index < 0 - or selected_index >= len(matching_objects) - ): - print(f"{log_prefix}: several matches for {target_name}, select one\n") - for i in range(len(matching_objects)): - print(f"{i}: {matching_objects[i].path}") - try: - selected_index = int(input()) - except ValueError: - print("Enter a valid number") 
- else: # "arbitrary" option - selected_index = 0 - if selected_bin is None: - selected_bin = matching_objects[selected_index] - return selected_bin - - def commit(self) -> None: - """Commit changes in database.""" - if not self.dry_run_mode and self.db_interface is not None: - self.db_interface.commit() - - # ================================ Main function ================================== - - def map( - self, - threads: int, - resolution_strategy: ResolveDuplicateOption = ResolveDuplicateOption.IGNORE, - ) -> FileSystem: - """Wrap mapper_main with usefull elements for CLI rendering. - - :param threads: number of threads to use - :param resolution_strategy: the chosen option for duplicate import resolution - :return: The FileSystem object filled - """ - with Progress( - TextColumn("[progress.description]{task.description}"), - BarColumn(), - MofNCompleteColumn(), - TimeElapsedColumn(), - ) as progress: - return self.mapper_main(threads, progress, resolution_strategy) - - @abstractmethod - def mapper_main( - self, - threads: int, - progress: Progress, - resolution_strategy: ResolveDuplicateOption = ResolveDuplicateOption.IGNORE, - ) -> FileSystem: - """Main function of the mapper, return the result stored in a FileSytsem. - - :param threads: number of threads to use - :param progress: a progress bar ready to be filled - :param resolution_strategy: the chosen option for duplicate import resolution - :return: The FileSystem object filled - """ # noqa: D401 - pass diff --git a/src/pyrrha_mapper/exedecomp/__init__.py b/src/pyrrha_mapper/exedecomp/__init__.py deleted file mode 100644 index 50d429b..0000000 --- a/src/pyrrha_mapper/exedecomp/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2023-2025 Quarkslab -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Module for the decomp mapper.""" - -from .binmapper import map_binary - -__all__ = ["map_binary"] \ No newline at end of file diff --git a/src/pyrrha_mapper/exedecomp/binmapper.py b/src/pyrrha_mapper/exedecomp/binmapper.py deleted file mode 100644 index 7544279..0000000 --- a/src/pyrrha_mapper/exedecomp/binmapper.py +++ /dev/null @@ -1,381 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2023-2025 Quarkslab -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Decompilation code binary mapper.""" - -import logging -import json -from pathlib import Path -from collections import defaultdict -from dataclasses import dataclass -import sys -from typing import NamedTuple -from tempfile import NamedTemporaryFile -import hashlib - -# third-party imports -from qbinary import Program, Function, FunctionType -from qbinary.types import Disassembler, ExportFormat, DisassExportNotImplemented, ExportException - - -from numbat import SourcetrailDB -from idascript import IDA -from numbat import SourcetrailDB -from rich.progress import ( - BarColumn, - MofNCompleteColumn, - Progress, - TextColumn, - TimeElapsedColumn, -) - -# local imports -from pyrrha_mapper.exceptions import FsMapperError - -DECOMPILE_SCRIPT = Path(__file__).parent / "decompile.py" - -# Determine the command to open URLs based on the platform -try: - URL_OPEN_CMD = { - "linux": "xdg-open", - "win32": "start", - "darwin": "open" - }[sys.platform] -except KeyError: - logging.warning(f"Unsupported platform: {sys.platform} (will not add URL handler)") - URL_OPEN_CMD = "" # type: ignore - - -once_check = True - - - -class Location(NamedTuple): - start_line: int - start_col: int - end_line: int - end_col: int - - -@dataclass -class DecompiledFunction: - """Class used to represent a decompiled function.""" - - address: int - name: str # demangled (pp_print) - text: str - location: Location # location of the function name within text - references: dict[ - int, list[Location] - ] # callee_addr -> list(start_line, start_col, end_line, end_col) - numbat_id: int = -1 - - -def normalize_name(name: str) -> str: - """Transform function name.""" - return name.strip("_").strip(".") - - -def find_all_call_references(p:Program, f: Function, source: str, - log_prefix: str = "") -> tuple[Location, dict[int, list[Location]]]: - decl_loc = None - refs: dict[int, list[Location]] = defaultdict(list) # dict: call_addr -> list[Location] - #ppname = lambda name: name.strip("_").strip(".") - - 
# NOTE: we exclude by design calls that don't have a name, usually these are calls - # to unrecognized function e.g: loc_185CC - call_name_to_addr = {normalize_name(p[c].name): c for c in f.children if p[c].name} - call_addr_to_name = {c: normalize_name(p[c].name) for c in f.children if p[c].name} - - - for idx, line in enumerate(source.splitlines()): - # try to find function declaration - if decl_loc is None: - ppname = normalize_name(f.name) - col = line.find(normalize_name(f.name)) - if col != -1: - decl_loc = Location(idx + 1, col + 1, idx + 1, col + len(ppname)) - - # For a given line, this dict keeps the column (index) of all call matched - matches: dict[int, tuple[int, str]] = {} - - # iterate each calls and try to find them in the line - for cname, caddr in call_name_to_addr.items(): - if cname.endswith(")"): # to handle cases of func name with typing of parameter - name = cname.split("(")[0] - else: - name = cname - col = line.find(f"{name}(") - if col != -1: - matches[col] = (caddr, cname) - - # Iterate all matches in a sorted manner to avoid having overlap matches: - # e.g: If a function calls both lxstat() and xstat() for each line we search - # any occurence of this two functions. But if we have a line like: "int c = lxstats()" - # we will match both functions! Thus we sort them by the column index. In that case we - # keep lxstats(). 
- sorted_matches = sorted(list(matches.items()), key=lambda x: x[0]) - cursor = 0 - previous = (0, "") - while sorted_matches: - col, (caddr, cname) = sorted_matches.pop(0) - if col < cursor: # means the match is overlapping a previous match - if col + len(cname) == cursor and previous[1].endswith(cname): - logging.debug(f"{log_prefix}: skip match {cname}, end of the {previous[1]}") - else: - logging.warning( - f"{log_prefix}: skip match {cname} [col {col}] overlap with previous one " - f"{previous[1]} [col: {previous[0]}]" - ) - else: # its okay we add it - refs[caddr].append(Location(idx + 1, col + 1, idx + 1, col + len(cname))) - cursor = col + len(cname) - previous = (col, cname) - - if decl_loc is None: - logging.error(f"{log_prefix}: function declaration not found in source code") - - if not is_thunk_to_import(p, f): # it is normal no to find the call in thunks to imports - for ref in (x for x in call_addr_to_name if x not in refs): - logging.warning(f"{log_prefix}: call to {ref:#08x}: '{call_addr_to_name[ref]}' not found in source code") - - return decl_loc, refs - - -def decompile_program(program: Program) -> None: - """Generate a PROGRAM_NAME.decompiled file which contained the binary decompilee obtained with IDA. - - :param program: Program object of the file to decompiled - :return: path of the created decompiled file. 
- """ - bin_path: str = program.exec_path - assert bin_path, "program.exec_path is not set, can't decompile" - ida = IDA(bin_path, str(DECOMPILE_SCRIPT), [], timeout=600, exit_virtualenv=True) - ida.start() - ida.wait() - - -def load_decompiled(program: Program, progress: Progress, - log_prefix: str = "") -> dict[int, DecompiledFunction]: - decompile_file = Path(f"{program.exec_path}.decompiled") - - if decompile_file.exists(): - logging.info(f"{log_prefix}: load file: {decompile_file}") - data = {int(k): v for k, v in json.loads(decompile_file.read_text()).items()} - final_data: dict[int, DecompiledFunction] = {} - # Iterate the decompiled data to try make references inside - decomp_load = progress.add_task("[deep_pink2]Decompiled binary loading", total=len(data)) - for f_addr, source_text in data.items(): - f: Function = program.get(f_addr) - if f is None: - logging.warning(f"{log_prefix}: function at {f_addr:#08x} referenced " - "in decompiled code not found in exported program") - continue - - decl, refs = find_all_call_references(program, f, source_text, f"{log_prefix} {f.name}") - - assert decl is not None, f"function {f.name} declaration not found in source code" - - final_data[f_addr] = DecompiledFunction( - address=f_addr, name=f.name, text=source_text, location=decl, references=refs - ) - progress.update(decomp_load, advance=1) - - return final_data - else: - logging.info(f"{log_prefix}: extracting decompilation file {decompile_file} (with idascript)") - decompile_program(program) - if decompile_file.exists(): - return load_decompiled(program, progress, log_prefix) # call ourselves again - else: - raise FileNotFoundError("can't find decompilation file (idascript failed)") - - -def load_program(bin_path: Path, disass: Disassembler, format: ExportFormat) -> Program | None: - # First try to find pre-existing exported files if format is AUTO - try: - return Program.from_binary(bin_path, - export_format=format, - disassembler=disass, - timeout= 600, # TODO: 
Receive through command line ? - override=False, # if export exists use it - ) - except DisassExportNotImplemented as e: - logging.error(f"Disassembler {disass} does not support export format {format}: {e}") - except ExportException as e: - logging.error(f"Error while loading binary {bin_path}: {e}") - return None - - -def set_function_color(db: SourcetrailDB, p: Program, fun: Function, f_id: int) -> None: - # Change node color based on its type - if is_thunk_to_import(p, fun): - db.change_node_color(f_id, fill_color="#bee0af", border_color="#395f33") - elif fun.type == FunctionType.thunk: - db.change_node_color(f_id, fill_color="gray") - # elif fun.type == FunctionType.EXTERN: - # db.change_node_color(f_id, fill_color="magenta") - # elif fun.type == FunctionType.IMPORTED: - # db.change_node_color(f_id, fill_color="mediumvioletred") - else: - pass # Normal function let default color - - -def add_source_file( - db: SourcetrailDB, - mangled_name: str, - symbol_id: int, - info: DecompiledFunction, - log_prefix: str = "", -) -> bool: - """:return: True if successfully added source info.text as a source file in DB.""" - with NamedTemporaryFile(mode="wt", delete_on_close=True) as tmp: - tmp.write(info.text) - tmp.flush() # Ensure the file is written before we try to record it - # Record file - file_id = db.record_file(Path(tmp.name), name=mangled_name) - if file_id is None: - return False - db.record_file_language(file_id, "cpp") - tmp.close() - - # Add the function to the file - logging.debug(f"{log_prefix}: add function {mangled_name} to file {file_id}") - info.numbat_id = file_id - # record de symbol declaration - if info.location: - l1, col1, l2, col2 = info.location - db.record_symbol_location(symbol_id, file_id, l1, col1, l2, col2) - else: - logging.warning(f"{log_prefix}: declaration not found in source code") - - return True - - -def is_thunk_to_import(p: Program, f: Function) -> bool: - if f.type == FunctionType.thunk: - if len(f.children) == 1: - c = 
list(f.children)[0] - callee: Function = p[c] - if callee.type == FunctionType.imported: - return True - return False - else: - return False - - -def add_url_handler(db: SourcetrailDB, program: Program, hash: str, function: Function, f_id: int) -> None: - """ Open the function using a dedicated URL handler. (Use Heimdallr) """ - if URL_OPEN_CMD and program.exec_path: - url = f"disas://{hash}?idb={Path(program.exec_path).name+'.i64'}&offset={function.addr:#08x}" - cmd: list[str] = [URL_OPEN_CMD, url] - db.set_custom_command(f_id, cmd, "Open in Disassembler") # type: ignore - else: - pass # Can't add URL unsuported platform - - -def map_binary(db: SourcetrailDB, program_path: Path, disass: Disassembler, format: ExportFormat) -> bool: - # Load the Quokka file - with Progress( - TextColumn("[progress.description]{task.description}"), - BarColumn(), - MofNCompleteColumn(), - TimeElapsedColumn(), - ) as progress: - # Load the decompilation and quokka files - log_prefix = "[binary loading]" - try: - program = load_program(program_path, disass, format) - if program is None: - logging.error(f"{log_prefix} can't generate exported binary") - return False - except FileNotFoundError as e: - logging.error(f"{log_prefix}: Cannot found {program_path}: {e}") - return False - except FsMapperError as e: - logging.error(f"{log_prefix}: Error during Quokka export generation/loading: {e}") - return False - - # Try loading the decompiled file - try: - decompiled = load_decompiled(program, progress, log_prefix) - except FileNotFoundError as e: - logging.error(f"{log_prefix}: failed to obtain decompiled code: {e}") - return False - - # Compute MD5 hash for URL handler - p_hash = hashlib.md5(Path(program.exec_path).read_bytes()).hexdigest() - - # Index all the functions - f_mapping = {} # f_addr -> numbat_id - func_map = progress.add_task("[orange_red1]Functions analysis", total=len(program)) - for f_addr, f in program.items(): - log_prefix = f"[func analysis] {f.name} ({f.type})" - if 
f.type == FunctionType.imported: - logging.debug(f"{log_prefix}: extern function, skip") - progress.update(func_map, advance=1) - continue # do not add EXTERN functions - is_imp = is_thunk_to_import(program, f) - f_id = db.record_function(f.name, parent_id=None, is_indexed=not is_imp) - if f_id is None: - logging.error(f"{log_prefix}: error while recording function in db") - progress.update(func_map, advance=1) - continue - f_mapping[f_addr] = f_id - - # Change node color based on its type - set_function_color(db, program, f, f_id) - - # Add custom command to open that function in IDA - add_url_handler(db, program, p_hash, f, f_id) - - # Add source code if any - if f_addr in decompiled and not is_imp: - info = decompiled[f_addr] - if not add_source_file(db, f.mangled_name, f_id, info): - logging.warning(f"{log_prefix}: failed to add decompiled code") - elif f_addr not in decompiled and not is_imp: - logging.warning(f"{log_prefix}: function not in decompiled dict") - else: - pass # do not add decompiled code for thunks to imports - - progress.update(func_map, advance=1) - - - # Index the call graph - cg_map = progress.add_task("[orange1]Call Graph Indexing", total=len(program)) - - for f_addr, f in program.items(): - log_prefix = f"[callgraph indexing] {f.name}" - decomp_fun = decompiled.get(f_addr, None) - - for callee in f.children: - try: - callee_id = f_mapping[callee] - db.record_ref_call(f_mapping[f_addr], callee_id) # record the call - - if decomp_fun: # if we have info about the decompiled function - if refs := decomp_fun.references.get(callee): # get the refs associated with callee - for li, coli, le, cole in refs: # iterate them and add them - db.record_reference_location(callee_id, decomp_fun.numbat_id, li, coli, le, cole) - else: - logging.warning(f"{log_prefix} calls {program[callee].name} " - "but not references in DecompiledFunction") - - except KeyError: - pass # ignore call to non recognized functions - - progress.update(cg_map, advance=1) - return 
True diff --git a/src/pyrrha_mapper/exedecomp/decompile.py b/src/pyrrha_mapper/exedecomp/decompile.py deleted file mode 100644 index 4fb8078..0000000 --- a/src/pyrrha_mapper/exedecomp/decompile.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -import json -from pathlib import Path -import sys - -try: - import ida_auto - import idautils - import ida_nalt - import ida_pro - import ida_hexrays - INSIDE_IDA = True -except ImportError: - INSIDE_IDA = False - - from idascript import MultiIDA, iter_binary_files, IDA - - -def main_ida(): - ida_auto.auto_wait() - - input_file = ida_nalt.get_input_file_path() - output_file = input_file+".decompiled" - raw_file = input_file+".c" - - # First decompile the whole program - ida_hexrays.clear_cached_cfuncs() - ida_hexrays.decompile_many(raw_file, None, - ida_hexrays.VDRUN_NEWFILE | ida_hexrays.VDRUN_MAYSTOP | ida_hexrays.VDRUN_SILENT) - - funs = {} - - # Then reiterate all functions to get them individually - for fun_ea in idautils.Functions(): - decomp = ida_hexrays.decompile(fun_ea) - if decomp is not None: - funs[fun_ea] = str(decomp) - - with open(output_file, "w") as f: - f.write(json.dumps(funs)) - - ida_pro.qexit(0) - - -def file_iterator(path): - for file in iter_binary_files(path): - ida_i64 = Path(str(file)+".i64") - if ida_i64.exists(): - yield file - - -def main_main(): - """ - Main function called when launched normally - """ - if len(sys.argv) != 2: - print("Usage: decompile_program.py dir/") - sys.exit(1) - - root = sys.argv[1] - - # For each file identified launch many IDA in parrallel this very same script - for (file, retcode) in MultiIDA.map(file_iterator(root), __file__, [], 6): - print(f"Processed {file} [{retcode}]") - - -if __name__ == "__main__": - if INSIDE_IDA: - main_ida() - else: - main_main() diff --git a/src/pyrrha_mapper/fs/__init__.py b/src/pyrrha_mapper/fs/__init__.py deleted file mode 100644 index c1cac54..0000000 --- a/src/pyrrha_mapper/fs/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# 
-*- coding: utf-8 -*- - -# Copyright 2023-2025 Quarkslab -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Module for the FS mapper.""" - -from .imports_mapper import FileSystemImportsMapper - -__all__ = ["FileSystemImportsMapper"] diff --git a/src/pyrrha_mapper/intercg/loader.py b/src/pyrrha_mapper/intercg/loader.py deleted file mode 100644 index ff53778..0000000 --- a/src/pyrrha_mapper/intercg/loader.py +++ /dev/null @@ -1,249 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2023-2025 Quarkslab -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Load information used by InterCGMapper from the files on the disk.""" - -import logging -from typing import NamedTuple - -# third-party imports -from qbinary import Program, FunctionType, DisassExportNotImplemented, ExportException, \ - Disassembler, ExportFormat - -# local imports -from pyrrha_mapper.common import Binary, Symbol -from pyrrha_mapper.exceptions import FsMapperError - - - -def load_program(binary: Binary, disass: Disassembler, - export: ExportFormat, log_prefix: str = "") -> dict[Symbol, list[str]]: - """Create a Binary object from a given file using lief and qbinary. - - It modifies the provided binary object in place. - - In order, it performs the following actions: - 1. load the program object - 2. use lief to extract exported symbols (handle conflicts with IDA names) - 3. checks if exported functions have been missed by IDA (but referenced in LIEF) - 4. Mangle the call graph to make external call to .PLT to directly jump on the - external symbol - - raise: FsMapperError if cannot load it - - :param binary: a Binary object that will be completed - :param disass: Disassembler enum to use for program loading - :param export: Export format to use for program loading - - :return: a dict of called done by each symbol of the binary - """ - file_path = binary.real_path - if file_path is None: - raise FileNotFoundError(file_path) - - try: - program = Program.from_binary(file_path, - export_format=export, - disassembler=disass, - timeout=600, # TODO: Receive through command line ? 
- override=False) # if export exists use it - # Load the call graph - return compute_call_graph(binary, program, log_prefix) # type: ignore - except DisassExportNotImplemented as e: - logging.error(f"Disassembler {disass} does not support export format {export}: {e}") - raise FsMapperError(f"{e}") from e - except ExportException as e: - logging.error(f"Error while loading binary {file_path}: {e}") - raise FsMapperError(f"{e}") from e - return None - - -class _FuncData(NamedTuple): - symbol: Symbol - type: FunctionType - calls: list[int] - callers: list[int] - - @property - def name(self) -> str: - return self.symbol.name - - @property - def demangled_name(self) -> str: - return self.symbol.demangled_name - - @property - def addr(self) -> int: - assert self.symbol.addr - return self.symbol.addr - - -def _generate_calls_list(func: _FuncData, call_graph: dict[int, _FuncData], log_prefix: str) -> list[str]: - """Given a function return its call list. - - It only contains functions that are contained in the call graph and have a name. - """ - res = list() - for c in [call_graph[x] for x in func.calls if x in call_graph]: - if c.name: # Has a true name - res.append(c.name) # Add it normally - else: # ignore function without name - logging.warning( - f"{log_prefix}: {func.symbol} calls a function without name (at {c.addr:#08x})" - ) - return res - - -def combine_program_analysis_binary(binary: Binary, program: Program, log_prefix: str) -> dict[int, _FuncData]: - """Combine program and binary objects by computing useful data. - - It updates binary object if new functions are determined. 
- - :param binary: binary object to update, contain data already analyzed - :param program: Program object in which to extract data - :return: a dict [addr, FuncData object associated to this address] - """ - exports = binary.exported_funcs_by_addr - program_data: dict[int, _FuncData] = {} - for f_addr, f in program.items(): - if f_addr in exports or f_addr + 1 in exports: # function exported (and visible in LIEF) - all_symbs = exports.get( - f_addr, exports.get(f_addr + 1, []) - ) # In THUMB mode address is address+1 - f_symb = disambiguate_export(all_symbs, log_prefix) - if f.name != f_symb.demangled_name: - logging.debug(f"{log_prefix}: change fun name {f.name} -> {f_symb.demangled_name}") - if len(all_symbs) > 1: # all the symbols will point on the chosen one - map(lambda x: binary.replace_function(f_symb, x, True), all_symbs) - else: - f_symb = Symbol(name=f.mangled_name, demangled_name=f.name, is_func=True, addr=f_addr) - binary.add_function(f_symb) - - program_data[f_addr] = _FuncData( - symbol=f_symb, - type=f.type, - calls=list(f.children), - callers=list(f.parents), - ) - return program_data - - -def compute_call_graph(binary: Binary, program: Program, log_prefix: str = "") -> dict[Symbol, list[str]]: - """Compute the call graph of the program using Quokka/Binexport. - - It fill the call attribute of binary. 
- - :param binary: binary object to update, contain data already analyzed - :param program: Program object in which to extract data - """ - - def _nb_initial_underscore(x: str) -> int: - return len(x) - len(x.strip("_.")) - - # Call graph fun_name -> [callee_name1, callee_name2] - call_graph: dict[Symbol, list[str]] = {} - exports = binary.exported_funcs_by_addr - - # Combine program and binary objects by computing useful data - program_data = combine_program_analysis_binary(binary, program, log_prefix) - - # Check if some exports don't have any associated function (not detected by IDA) - for exp_addr in exports.keys() - program.keys(): - all_symbs = exports[exp_addr] - canon = disambiguate_export(all_symbs, log_prefix) - if p_fun := program.get(exp_addr - 1): - # IDA keeps ARM address while LIEF use THUMB addresses - if p_fun.mangled_name in [s.name for s in all_symbs]: - # Check that we have a match on names - continue - # else case - logging.debug(f"{log_prefix}: export {canon.name}: {hex(exp_addr)} address not found in program.") - call_graph[canon] = [] - if len(all_symbs) > 1: # all the symbols will point on the chosen one - map(lambda x: binary.replace_function(canon, x, True), all_symbs) - - # Iterate back the temporary dict to fill the real call graph - # The deal here is to fast-forward call to imported function directly on the - # imported symbol and not on the PLT (to make the graph more straightforward) - removed_trampoline: dict[str, str] = dict() - for f in program_data.values(): - if ( - f.type in [FunctionType.normal, FunctionType.library] - # If thunk AND exported or thunk AND call several func, keep it (for later resolution) - or ( - f.type == FunctionType.thunk - and ((f.addr in exports) or (f.addr + 1 in exports) or len(f.calls) > 1) - ) - ): - call_graph[f.symbol] = _generate_calls_list(f, program_data, log_prefix) - continue - - # Replace thunk calling only one function (and only one) - elif f.type == FunctionType.thunk and len(f.calls) == 1 
and f.calls[0] in program_data: - sub_callee = program_data[f.calls[0]] - if sub_callee.type == FunctionType.imported: - # Keep the name of the thunk "strcpy, sprintf" - name, target = sub_callee.name, f.name - # in case of nested functions (starting with _, keep the less nested one) - if _nb_initial_underscore(target) > _nb_initial_underscore(name): - name, target = target, name - else: # Forward the call to the underlying function name - name, target = f.name, sub_callee[0].name - # resolve trampoline and update associated dict - while target in removed_trampoline and removed_trampoline[target] != target: - target = removed_trampoline[target] - removed_trampoline[name] = target - for key, val in removed_trampoline.items(): - if val == name: - removed_trampoline[key] = target - - # If terminal thunk keep it in binary - elif f.type == FunctionType.thunk and len(f.calls) == 0 and len(f.callers) > 0: - continue - - # remove any function not explicitely kept (THUNK, IMPORTED, EXTERN) - if binary.get_function_by_name(f.name).addr == f.addr: - binary.remove_function(f.name) - - return { - symb: [removed_trampoline[c] if c in removed_trampoline else c for c in calls] - for symb, calls in call_graph.items() - } - - -def disambiguate_export(symbs: list[Symbol], log_prefix: str = "") -> Symbol: - """Given a list of symbols associated with one address, chose one.""" - if len(symbs) == 1: - return symbs[0] # If only one no ambiguity - - chosen = None - for symb in symbs: - if symb.demangled_name.startswith("_"): - continue - if chosen is None: - chosen = symb - elif chosen == symb: - continue - else: - # print(f"multiple options for name: {chosen}, {name}") - if len(symb.demangled_name) < len(chosen.demangled_name): - chosen = symb - - # all exports starts with _ - if chosen is None: - options = [s.demangled_name for s in symbs] - logging.debug(f"{log_prefix}: cannot disambiguate, select shortest name: {options}") - chosen = min(symbs, key=lambda x: len(x.demangled_name)) - 
return chosen diff --git a/src/pyrrha_mapper/common/__init__.py b/src/pyrrha_mapper/mappers/__init__.py similarity index 50% rename from src/pyrrha_mapper/common/__init__.py rename to src/pyrrha_mapper/mappers/__init__.py index b4be036..239b7c5 100644 --- a/src/pyrrha_mapper/common/__init__.py +++ b/src/pyrrha_mapper/mappers/__init__.py @@ -13,16 +13,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Common objects and functions that can be used for any mapper.""" - -from .filesystem_mapper import FileSystemMapper, hide_progress +"""Module for the intercg mapper.""" +from .decomp_mapper import GhidraDecompilMapper, IdaDecompilMapper +from .decomp_objects import ExportedDecompilation, ExportedFunction, ExportedLocation +from .imports_mapper import FileSystemImportsMapper, hide_progress +from .intercg_mapper import InterImageCGMapper from .objects import Binary, FileSystem, Symbol, Symlink -__all__ = [ - "FileSystemMapper", - "Binary", - "FileSystem", - "hide_progress", - "Symbol", - "Symlink", -] +__all__ = ["IdaDecompilMapper", + "GhidraDecompilMapper", + "ExportedDecompilation", + "ExportedFunction", + "ExportedLocation", + "InterImageCGMapper", + "FileSystemImportsMapper", + "hide_progress", + "Binary", + "FileSystem", + "Symbol", + "Symlink"] \ No newline at end of file diff --git a/src/pyrrha_mapper/mappers/decomp_mapper.py b/src/pyrrha_mapper/mappers/decomp_mapper.py new file mode 100644 index 0000000..ccf6df8 --- /dev/null +++ b/src/pyrrha_mapper/mappers/decomp_mapper.py @@ -0,0 +1,384 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023-2025 Quarkslab +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
class Location(NamedTuple):
    """Span of one or more words inside a text.

    The field order is significant: instances are unpacked positionally
    (``*location``) when symbol and reference locations are recorded in the DB.
    """

    # Line on which the span starts (the mapper fills it with 1-based line numbers).
    start_line: int
    # Column of the first character of the span (1-based as filled by the mapper).
    start_col: int
    # Line on which the span ends (same as start_line for spans built by the mapper).
    end_line: int
    # Column of the last character of the span (inclusive as filled by the mapper).
    end_col: int
def normalize_name(name: str) -> str:
    """Return ``name`` stripped of surrounding underscores, then of surrounding dots.

    The two strips are applied one after the other: underscores hidden behind
    a leading or trailing dot are therefore kept (``"._foo"`` -> ``"_foo"``).
    """
    without_underscores = name.strip("_")
    return without_underscores.strip(".")
def index_function(self, addr: int, log_prefix: str) -> None:
    """Collect the data of the function located at ``addr`` and register it.

    The function symbol is added to ``self.bin``, the function is recorded in
    the DB through ``record_function`` and the resulting data is stored in
    ``self.functions`` under ``addr``. Imported functions are not decompiled:
    their source is the empty string.

    :param addr: address of the function to treat
    :param log_prefix: string prepended to every log message.
    """
    kind = self.func_type(addr)
    symbol = Symbol(
        name=self.func_mangled_name(addr),
        demangled_name=self.func_demangled_name(addr),
        is_func=True,
        addr=addr,
    )
    callees = self.func_children(addr)
    callers = self.func_parents(addr)
    if kind == FuncType.IMPORTED:
        decompiled = ""
    else:
        decompiled = self.func_decompiled(addr)

    entry = FuncData(
        symbol=symbol,
        type=kind,
        calls=callees,
        callers=callers,
        source=decompiled,
    )
    self.bin.add_function(symbol)
    self.functions[addr] = self.record_function(entry, log_prefix)
def index_decompiled(self, addr, log_prefix) -> None:
    """Locate the declaration and the call sites inside the source of the function at ``addr``.

    The declaration is the first occurrence of the normalized function name
    in the decompiled source. Call sites are all the occurrences of
    ``<callee identifier>(`` for each registered callee. Locations are stored
    in the function data (``declaration`` and ``source_calls_loc``, the latter
    keyed by callee address), then the source is recorded with
    ``record_source``. Imported functions are skipped.

    :param addr: address of the function to treat
    :param log_prefix: string prepended to every log message.
    """
    func = self.functions[addr]

    # Imported functions have no decompiled body (their source is ""), so
    # there is nothing to locate or record.
    if func.type == FuncType.IMPORTED:
        return

    # Identifier searched for each callee, computed once for all the lines.
    # The name is normalized so that e.g. "__memcpy" and "memcpy" match the
    # same token, and a stored signature ("func(int)") is reduced to its
    # bare identifier ("func"). Keyed by identifier: a later callee with the
    # same identifier replaces an earlier one.
    callee_addr_by_token: dict[str, int] = {}
    for callee_addr in func.calls:
        callee = self.functions.get(callee_addr)
        if callee is None or not callee.name:
            continue
        token = normalize_name(callee.name)
        if token.endswith(")"):
            token = token.split("(")[0]
        if token:  # an empty identifier would match every "(" of the source
            callee_addr_by_token[token] = callee_addr
    needles = [(f"{token}(", token, target) for token, target in callee_addr_by_token.items()]

    func_name = normalize_name(func.name)

    # Lines in Location are 1-based.
    for line_number, line_text in enumerate(func.source.splitlines(), start=1):
        # The declaration is the first occurrence of the function name; an
        # empty name is ignored as it would "match" at column 0 of any line.
        if func.declaration is None and func_name:
            decl_col = line_text.find(func_name)
            if decl_col != -1:
                func.declaration = Location(
                    line_number,
                    decl_col + 1,
                    line_number,
                    decl_col + len(func_name),
                )

        # Collect every occurrence of every callee on this line.
        # key: 0-based start column, value: (callee_addr, identifier)
        hits_by_col: dict[int, tuple[int, str]] = {}
        for needle, token, callee_addr in needles:
            hit_col = line_text.find(needle)
            while hit_col != -1:
                hits_by_col[hit_col] = (callee_addr, token)
                hit_col = line_text.find(needle, hit_col + 1)

        # Process hits left-to-right so that a longer earlier match
        # (e.g. "lxstat") shadows a shorter substring of it (e.g. "xstat").
        end_of_last_accepted = 0
        last_accepted_col = 0
        last_accepted_token = ""
        for start_col, (callee_addr, token) in sorted(hits_by_col.items()):
            # Spans are measured on the identifier actually found in the
            # text, not on the stored name which may carry a signature.
            end_col = start_col + len(token)
            if start_col < end_of_last_accepted:
                # This hit starts inside the span of the previous match.
                if end_col == end_of_last_accepted and last_accepted_token.endswith(token):
                    # Suffix of the accepted match: expected, not a real
                    # overlap (e.g. "stat" at the end of "lxstat").
                    logging.debug(
                        f"{log_prefix}: skip '{token}' — suffix of '{last_accepted_token}'"
                    )
                else:
                    logging.warning(
                        f"{log_prefix}: skip '{token}' [col {start_col}] — "
                        f"overlaps '{last_accepted_token}' [col {last_accepted_col}]"
                    )
                continue

            func.source_calls_loc[callee_addr].append(
                Location(line_number, start_col + 1, line_number, end_col)
            )
            end_of_last_accepted = end_col
            last_accepted_col = start_col
            last_accepted_token = token

    if func.declaration is None:
        logging.error(f"{log_prefix}: function declaration not found in source code")

    self.functions[addr] = self.record_source(func, log_prefix)
def map(self) -> bool:
    """Run the mapping passes: binary node, functions, sources, then call graph.

    :return: False when the binary node cannot be recorded (no indexing is
             done in that case), True once all the passes have run.
    """
    # Functions are attached to the binary node through parent_id, so the
    # binary must be recorded as a class node before any function.
    self.bin.id = self.db_interface.record_class(
        self.bin.name, prefix=f"{self.bin.path.parent}/", delimiter=":"
    )
    if self.bin.id is None:
        logging.error(f"[binary indexing] {self.bin.name}: record of binary failed")
        return False

    columns = (
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
    )
    # Passes run, in this order, over the functions collected by the first pass.
    later_passes = (
        ("[orange_red1]Source indexing", "[source indexing]", self.index_decompiled),
        ("[gold1]Call graph indexing", "[call graph indexing]", self.index_call_graph),
    )

    with Progress(*columns) as bar:
        entry_points = list(self.func_addrs)
        task = bar.add_task("[red]Function indexing", total=len(entry_points))
        for entry in entry_points:
            self.index_function(entry, f"[function indexing] {entry:#x}")
            bar.update(task, advance=1)

        for title, tag, run_pass in later_passes:
            # The total is read when the pass starts, as functions are only
            # known once the function indexing is over.
            task = bar.add_task(title, total=len(self.functions))
            for entry in self.functions:
                run_pass(entry, f"{tag} {self.functions[entry].name}")
                bar.update(task, advance=1)

    return True
class IdaDecompilMapper(DecompilMapper, IDA):
    """Decompilation mapper combining ``DecompilMapper`` with the ``IDA`` backend."""


class GhidraDecompilMapper(DecompilMapper, Ghidra):
    """Decompilation mapper combining ``DecompilMapper`` with the ``Ghidra`` backend."""
class ExportedLocation(BaseModel):
    """JSON-friendly counterpart of ``decomp_mapper.Location``.

    It holds the span of a word (or more) inside a decompiled source. Lines
    and columns are 1-based, as in the locations recorded by the mapper.
    Instances are ordered by comparing their dumped field values in
    declaration order.
    """

    start_line: int
    start_col: int
    end_line: int
    end_col: int

    @classmethod
    def from_location(cls, location: Location) -> Self:
        """:return: an ExportedLocation holding the same span as a mapper Location."""
        field_names = ("start_line", "start_col", "end_line", "end_col")
        return cls(**{field: getattr(location, field) for field in field_names})

    def as_tuple(self) -> tuple[int, int, int, int]:
        """:return: the span as ``(start_line, start_col, end_line, end_col)``."""
        return self.start_line, self.start_col, self.end_line, self.end_col

    # Ordering, from https://github.com/pydantic/pydantic/discussions/2910
    def __lt__(self, other):  # noqa: D105
        mine, theirs = self.model_dump(), other.model_dump()
        return tuple(mine.values()) < tuple(theirs.values())

    def __le__(self, other):  # noqa: D105
        mine, theirs = self.model_dump(), other.model_dump()
        return tuple(mine.values()) <= tuple(theirs.values())

    def __gt__(self, other):  # noqa: D105
        mine, theirs = self.model_dump(), other.model_dump()
        return tuple(mine.values()) > tuple(theirs.values())

    def __ge__(self, other):  # noqa: D105
        mine, theirs = self.model_dump(), other.model_dump()
        return tuple(mine.values()) >= tuple(theirs.values())
@classmethod
def from_func_data(cls, func: FuncData) -> Self:
    """:return: the serialisable projection of a mapper FuncData object.

    Locations are converted to ``ExportedLocation`` and the call lists are
    copied; the symbol itself is reused as is.
    """
    exported_declaration = None
    if func.declaration is not None:
        exported_declaration = ExportedLocation.from_location(func.declaration)

    exported_call_sites = {}
    for callee_addr, locations in func.source_calls_loc.items():
        exported_call_sites[callee_addr] = [
            ExportedLocation.from_location(loc) for loc in locations
        ]

    return cls(
        symbol=func.symbol,
        type=func.type,
        calls=list(func.calls),
        callers=list(func.callers),
        source=func.source,
        source_id=func.source_id,
        declaration=exported_declaration,
        source_calls_loc=exported_call_sites,
    )
@model_validator(mode="after")
def validate_keys_match_addr(self) -> Self:
    """Check that no function is stored under an address other than its own.

    Functions whose symbol has no address are accepted under any key.

    :raise ValueError: if a symbol address differs from its key.
    """
    for key, entry in self.functions.items():
        own_addr = entry.symbol.addr
        if own_addr is None or own_addr == key:
            continue
        raise ValueError(
            f"function '{entry.name}' stored under address {key} but its symbol "
            f"address is {own_addr}"
        )
    return self
def get_function_by_name(self, name: str) -> ExportedFunction:
    """:return: the first recorded function whose (mangled) name is ``name``.

    :raise KeyError: if no function has this name.
    """
    matches = (candidate for candidate in self.functions.values() if candidate.name == name)
    found = next(matches, None)
    if found is None:
        raise KeyError(name)
    return found
@contextmanager
def hide_progress(progress: Progress):
    """Context manager which temporarily hides a `rich` progress bar.

    The progress display is stopped on entry and started again on exit, even
    if the body of the ``with`` block raises.

    Code from https://github.com/Textualize/rich/issues/1535#issuecomment-1745297594

    :param progress: the progress bar to hide while the block runs
    """
    transient = progress.live.transient  # save the old value
    # Stop the display while it is flagged as transient, then put the flag back.
    # NOTE(review): presumably so that rich erases the bars on stop — confirm.
    progress.live.transient = True
    progress.stop()
    progress.live.transient = transient  # restore the old value
    try:
        yield
    finally:
        # make space for the progress to use so it doesn't overwrite any previous lines
        print("\n" * (len(progress.tasks) - 2))
        progress.start()
-class FileSystemImportsMapper(FileSystemMapper): - """Filesystem mapper based on Lief, which computes imports and exports.""" + Init params + :param root_directory: directory containing the filesystem to map + :param db: interface to the DB + """ def __init__(self, root_directory: Path | str, db: SourcetrailDB | None): - super(FileSystemImportsMapper, self).__init__(root_directory, db) + import lief + + lief.logging.disable() + + self.root_directory = Path(root_directory).resolve().absolute() + self.db_interface = db + self.fs = FileSystem(root_dir=self.root_directory) + self._dry_run = not bool(db) if not self.dry_run_mode and self.db_interface is not None: # Setup graph customisation in NumbatUI @@ -56,16 +101,254 @@ def is_binary_supported(p: Path) -> bool: :param p: the path of the file to analyzed :return: True is the path point on a file """ + import lief + + lief.logging.disable() + return p.is_file() and not p.is_symlink() and (lief.is_elf(str(p)) or lief.is_pe(str(p))) - + + @property + def dry_run_mode(self) -> bool: + """Returns whether a Sourcetrail DB as been provided or not. + + If not, only produce the FileSystem object that can also + be used independently. 
+ """ + return self._dry_run + + @dry_run_mode.setter + def dry_run_mode(self, value: bool) -> None: + """If True does not record in db.""" + self._dry_run = value + + # ===================== Records in DB (NumbatUI DB) =============================== + + def record_import_in_db( + self, source_id: int | None, dest_id: int | None, log_prefix: str = "" + ) -> None: + """Record in DB the import of dest by source.""" + if self.dry_run_mode: + return None + assert self.db_interface is not None + if source_id is None or dest_id is None: + logging.error(f"{log_prefix}: Cannot record import, src and/or dest are unknown") + else: + self.db_interface.record_ref_import(source_id, dest_id) + + def record_binary_in_db(self, binary: Binary, log_prefix: str = "") -> Binary: + """Record the binary inside the DB as well as its internal symbols. + + Update 'bin_obj.id' with the id of the created object in DB and does the same + thing for its symbol. It will record symbols using their demangled names. + + :warning: do not record calls as well as any links between several binaries + + :param binary: the Binary object to map + :return: the updated object + """ + # If dry run do not store the binary in DB + if self.dry_run_mode: + return binary + + assert self.db_interface is not None + binary.id = self.db_interface.record_class( + binary.name, prefix=f"{binary.path.parent}/", delimiter=":" + ) + if binary.id is None: + logging.error(f"{log_prefix}: Record of binary failed.") + return binary + + recorded_symb: dict[str, int] = dict() + for symbol in set(binary.iter_exported_symbols()): + if symbol.demangled_name in recorded_symb: + logging.debug( + f"{log_prefix}: demangled name {symbol.demangled_name} already in db " + "common node for these symbols" + ) + symbol.id = recorded_symb[symbol.demangled_name] + # Also propagate the id to any other symbol registered under + # the same mangled name (e.g. secondary demangled-key entries). 
+ for other in binary.exported_functions.values(): + if other.name == symbol.name and other.id is None: + other.id = symbol.id + continue + if symbol.is_func: + symbol.id = self.db_interface.record_method( + symbol.demangled_name, + parent_id=binary.id, + prefix=hex(symbol.addr) if symbol.addr is not None else "None", + ) + if symbol.id is not None: + self.db_interface.change_node_color( + symbol.id, fill_color="#bee0af", border_color="#395f33" + ) + else: + symbol.id = self.db_interface.record_field( + symbol.demangled_name, + parent_id=binary.id, + prefix=hex(symbol.addr) if symbol.addr is not None else "None", + ) + + if symbol.id is None: + logging.error(f"{log_prefix}: Record of symbol '{symbol.demangled_name}' failed.") + else: + try: + self.db_interface.record_public_access(symbol.id) + recorded_symb[symbol.demangled_name] = symbol.id + # Propagate id to all symbols sharing the same mangled name + # (covers secondary demangled-key registrations). + for other in binary.exported_functions.values(): + if other.name == symbol.name and other.id is None: + other.id = symbol.id + except DBException as e: + raise PyrrhaError( + f"{log_prefix}: Cannot register access to symbol {symbol.demangled_name}: " + f"{e}" + ) from e + + for symbol in set(binary.iter_not_exported_functions()): + # Skip if this demangled name was already recorded as an exported + # symbol — same demangled name means same DB node, and calling + # record_private_access on it would violate the UNIQUE constraint. 
def record_symlink_in_db(self, sym: Symlink, log_prefix: str = "") -> Symlink:
    """Record into DB the symlink and its link to its target.

    Update 'sym.id' with the id of the created object. Nothing is recorded
    in dry-run mode.

    :param sym: symlink object
    :param log_prefix: string prepended to every log message.
    :return: the updated object
    """
    if self.dry_run_mode:
        return sym
    assert self.db_interface is not None
    sym.id = self.db_interface.record_typedef_node(
        sym.name, prefix=f"{sym.path.parent}/", delimiter=":"
    )
    if sym.id is None:
        logging.error(f"{log_prefix}: Record of symlink failed.")
    else:
        # Forward the prefix so that a failure to record the link is logged
        # with the same context as the other messages of this symlink.
        self.record_import_in_db(sym.id, sym.target.id, log_prefix)
    return sym
@staticmethod
def _select_fs_component(
    strategy: ResolveDuplicateOption,
    matching_objects: list[Binary] | list[Symlink],
    log_prefix: str,
    target_name: str,
    cache: set[Binary] | set[Symlink] | None = None,
) -> Binary | Symlink | None:
    """Choose one element of a given list according to the strategy.

    Given a list of objects which match a target, select one (or None) among
    them according to the given strategy. With several matches:

    - IGNORE: nothing is selected (logged at debug level);
    - INTERACTIVE: an entry of ``cache`` which is among the matches is reused
      if any, otherwise the user is asked to pick an index until a valid one
      is entered;
    - any other strategy ("arbitrary"): the first match is selected.

    With a single match, this match is returned whatever the strategy.

    :param strategy: the resolution strategy
    :param matching_objects: a list of FileSystemComponents (NOT empty, not
                             checked by the function)
    :param log_prefix: Prefix used at the beginning of each log
    :param target_name: Target name, used in logs (and user interaction)
    :param cache: previously selected choices, looked up before asking the user
    :return: the selected FileSystemComponent | None if there are several
             matches and the resolution strategy is IGNORE
    """
    several_matches = len(matching_objects) > 1

    if several_matches and strategy is ResolveDuplicateOption.IGNORE:
        logging.debug(
            f"{log_prefix}: several matches for {target_name} but strategy is "
            f"{ResolveDuplicateOption.IGNORE.name} so nothing selected"
        )
        return None

    if several_matches and strategy is ResolveDuplicateOption.INTERACTIVE:
        # Reuse an already selected entry instead of asking again.
        for cache_entry in cache or ():
            if cache_entry in matching_objects:
                logging.debug(
                    f"{log_prefix}: manually selected entry to disambiguate {target_name}"
                )
                return cache_entry

        # Ask until the answer is an index of the list.
        selected_index = None
        while selected_index is None or not 0 <= selected_index < len(matching_objects):
            print(f"{log_prefix}: several matches for {target_name}, select one\n")
            for i, candidate in enumerate(matching_objects):
                print(f"{i}: {candidate.path}")
            try:
                selected_index = int(input())
            except ValueError:
                print("Enter a valid number")
        return matching_objects[selected_index]

    # "arbitrary" option (or a single match)
    return matching_objects[0]
+ for dyn_entry in parsing_res.dynamic_entries: + if dyn_entry.tag == lief.ELF.DynamicEntry.TAG.SONAME: + bin_obj.soname = str(dyn_entry.name) + break # parse imported libs for lib in parsing_res.libraries: bin_obj.add_imported_library_name(str(lib)) @@ -97,27 +391,50 @@ def load_binary(root_directory: Path, file_path: Path) -> tuple[Binary, Any] | s # store exported symbols s: lief.ELF.Symbol is_kernel_module = bin_obj.path.suffix == ".ko" + seen_symbol_names: set[str] = set() for s in parsing_res.symbols: + sym_name = str(s.name) if s.imported: - bin_obj.add_imported_symbol_name(str(s.name)) + bin_obj.add_imported_symbol_name(sym_name) elif s.exported or is_kernel_module and s.name: is_func = s.is_function or s.type == lief.ELF.Symbol.TYPE.GNU_IFUNC if not is_func and is_kernel_module: continue - bin_obj.add_exported_symbol( - Symbol( - name=str(s.name), - is_func=is_func, - demangled_name=s.demangled_name, - addr=s.value, - ) + # LIEF may yield the same symbol name from both .symtab + # and .dynsym; only register the first occurrence to avoid + # duplicate DB entries (UNIQUE constraint on node_id). + if sym_name in seen_symbol_names: + continue + seen_symbol_names.add(sym_name) + # Use the mangled name as demangled_name when LIEF's + # demangled_name is identical to the mangled name (i.e. + # demangling was not available or not needed). + lief_demangled = str(s.demangled_name) + demangled = lief_demangled if lief_demangled != sym_name else sym_name + sym = Symbol( + name=sym_name, + is_func=is_func, + demangled_name=demangled, + addr=s.value, ) + # Register under the mangled name as primary key. + # Also register under the demangled name if it differs, + # so that call-graph resolution can match short callee + # strings against exported_functions keys. 
+ bin_obj.add_exported_symbol(sym) + if demangled != sym_name: + bin_obj.add_exported_symbol(sym, symbol_name=demangled) elif s.is_function: + # Skip symbols already registered as exported functions to + # avoid duplicate DB entries. + if sym_name in seen_symbol_names: + continue + seen_symbol_names.add(sym_name) bin_obj.add_function( Symbol( - name=str(s.name), + name=sym_name, is_func=s.is_function, - demangled_name=s.demangled_name, + demangled_name=str(s.demangled_name), addr=s.value, ) ) @@ -135,6 +452,7 @@ def load_binary(root_directory: Path, file_path: Path) -> tuple[Binary, Any] | s res: lief.Binary | None = lief.parse(str(file_path)) if res is None: return f"ERROR: Lief cannot parse {file_path}" + bin_obj.image_base = res.imagebase # parse imported libs for lib in res.libraries: bin_obj.add_imported_library_name(str(lib)) @@ -154,26 +472,30 @@ def load_binary(root_directory: Path, file_path: Path) -> tuple[Binary, Any] | s @classmethod def parse_binary_job(cls, ingress: Queue, egress: Queue, parse_func: Callable) -> None: - """Parse an executable file and create the associated Binary object. + """Parse an executable file and create the associated Binary object, used to multiprocess. - It is used for multiprocessing. 
- :param ingress: input Queue, contain a Path - :param egress: output Queue, send back (file path, Binary result or - logging string if an issue happen) - :param parse_func: func which take a path as argument (called file_path) and parse it + :param ingress: input Queue, contains Path items or None as a stop sentinel + :param egress: output Queue, sends back (file path, Binary result) or + (file path, Exception) if an issue occurred + :param parse_func: func which takes a path as argument (called file_path) and parses it """ while True: try: path = ingress.get(timeout=0.5) - try: - egress.put((path, parse_func(file_path = path))) - except Exception as e: - egress.put((path, e)) except queue.Empty: - pass + continue except KeyboardInterrupt: break + if path is None: + break + + try: + egress.put((path, parse_func(file_path=path))) + except Exception as e: + logging.error(f"[worker] Failed on {path}: {e}") + egress.put((path, e)) + def map_binary(self, bin_object: Binary, additional_res: Any = None) -> None: """Given a Binary object add it to the DB. @@ -184,6 +506,8 @@ def map_binary(self, bin_object: Binary, additional_res: Any = None) -> None: if not self.dry_run_mode: self.record_binary_in_db(bin_object, f"[binary mapping] {bin_object.name}") + # =============================== Symlinks ================================== + def map_symlink(self, path: Path) -> None: """Given a symlink, resolve it and create the associated objects if needed. 
@@ -228,6 +552,8 @@ def map_symlink(self, path: Path) -> None: else: logging.warning(f"{log_prefix}: '{target}' does not correspond to a recorded binary") + # =============================== Imports ================================== + @dataclass(frozen=True) class _LibImport(ABC): initial_import: Symlink | Binary | None @@ -287,6 +613,14 @@ def _resolve_lib_import( if dest is None: return self._PartialLibImport(initial_import=sym_obj) return self._SolvedLibImport(initial_import=sym_obj, final_import=dest) + elif self.fs.soname_exists(lib_name): + # The imported name matches the SONAME of a binary whose filename + # differs (e.g. libpthread.so.0 is the SONAME of libpthread-2.11.1.so). + matching_binaries = self.fs.get_binaries_by_soname(lib_name) + lib_obj = self._select_fs_component(strategy, matching_binaries, log_prefix, lib_name) + if lib_obj is None: + return self._FailedLibImport() + return self._SolvedLibImport(initial_import=lib_obj, final_import=lib_obj) else: return self._FailedLibImport() @@ -307,6 +641,7 @@ def map_lib_imports( targeted Binary object in the case of a Symlink) """ log_prefix = f"[lib imports] {binary.path}" + for lib_name in binary.imported_library_names: res = self._resolve_lib_import(lib_name, resolution_strategy, log_prefix) match res: @@ -315,7 +650,14 @@ def map_lib_imports( # resolution, the final target of the symlink is considered to be # imported and not the symlink itself self.record_import_in_db(binary.id, res.initial_import.id, log_prefix) - binary.add_imported_library(res.final_import) + + if lib_name != res.final_import.name: + # SONAME case: store the resolved binary under the + # original import name (the SONAME) rather than the + # binary's filename, to avoid a spurious extra entry. 
+ binary.imported_libraries[lib_name] = res.final_import + else: + binary.add_imported_library(res.final_import) case self._PartialLibImport(): self.record_import_in_db(binary.id, res.initial_import.id, log_prefix) logging.warning( @@ -522,6 +864,25 @@ def map_symbol_imports_main( progress.update(symbol_imports, advance=1) self.commit() + def map( + self, + threads: int, + resolution_strategy: ResolveDuplicateOption = ResolveDuplicateOption.IGNORE, + ) -> FileSystem: + """Wrap mapper_main with usefull elements for CLI rendering. + + :param threads: number of threads to use + :param resolution_strategy: the chosen option for duplicate import resolution + :return: The FileSystem object filled + """ + with Progress( + TextColumn("[progress.description]{task.description}"), + BarColumn(), + MofNCompleteColumn(), + TimeElapsedColumn(), + ) as progress: + return self.mapper_main(threads, progress, resolution_strategy) + def mapper_main( self, threads: int, diff --git a/src/pyrrha_mapper/mappers/intercg_bin_loader.py b/src/pyrrha_mapper/mappers/intercg_bin_loader.py new file mode 100644 index 0000000..c9cb4db --- /dev/null +++ b/src/pyrrha_mapper/mappers/intercg_bin_loader.py @@ -0,0 +1,495 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023-2025 Quarkslab +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Load information used by InterCGMapper from the files on the disk.""" + +import logging +import re +from pathlib import Path +from typing import NamedTuple + +from pyrrha_mapper.backend import IDA, Backend, Ghidra +from pyrrha_mapper.exceptions import FsMapperError +from pyrrha_mapper.types import FuncType + +from .imports_mapper import FileSystemImportsMapper +from .objects import Binary, Symbol + + +class FuncData(NamedTuple): + """Store function data collected by the binary parser. + + All addresses are in **parser space** (the native address space of the + underlying tool — IDA, Ghidra, etc.). + """ + + symbol: Symbol + type: FuncType + calls: list[int] + callers: list[int] + + @property + def name(self) -> str: + """:return: mangled name of the function""" + return self.symbol.name + + @property + def demangled_name(self) -> str: + """:return: demangled name of the function""" + return self.symbol.demangled_name + + @property + def addr(self) -> int: + """:return: address of the function in the Binary""" + assert self.symbol.addr is not None + return self.symbol.addr + + +def _count_leading_underscores(name: str) -> int: + """:return: the number of leading underscores/dots in name""" + return len(name) - len(name.lstrip("_.")) + + +# Tool-generated fallback names (FUN_, sub_, _INIT_, _FINI_). +# A trampoline destination that matches one of these cannot be resolved as a +# cross-binary callee — skip the substitution to preserve the original name. +_SYNTHETIC_FUNC_NAME_RE: re.Pattern[str] = re.compile( + r"^(?:FUN_[0-9A-Fa-f]+|sub_[0-9A-Fa-f]+|_INIT_\d+|_FINI_\d+)$" +) + + +class BinaryParser(Backend): + """Abstract base class that parses a binary and extracts call-graph data. + + Subclasses implement the parser-specific methods (IDA, Ghidra, …). + Adresses are the one used in the backend, which can differ from LIEF ones + (relative vs virtual). 
+ """ + + def __init__(self, root_directory: Path, file_path: Path) -> None: + self.log_prefix = f"[binary parsing] {file_path.name}" + self._is_relocatable: bool = False + self._binary = self._generate_lief_bin(root_directory, file_path) + self._is_relocatable = self._binary.is_relocatable + super().__init__( + file_path, root_directory, decompilation=False, image_base=self._binary.image_base + ) + + image_base = self._binary.image_base + + # Remap LIEF export addresses to parser space. + parser_exports: dict[int, list[Symbol]] = { + lief_addr - image_base: symbols + for lief_addr, symbols in self._binary.exported_funcs_by_addr.items() + } + + # Step 1 — merge parser functions with LIEF export metadata. + program_data: dict[int, FuncData] = self._combine_program_analysis_binary(parser_exports) + + # Step 2 — add exported symbols not discovered by the parser. + # Skipped for ET_REL: LIEF addresses are section-relative and incompatible + # with the parser address space; Step 1 already matched exports by name. + parser_addrs: set[int] = set(self.func_addrs) + call_graph: dict[Symbol, list[str]] = {} + + if not self._is_relocatable: + for parser_addr, symbols in parser_exports.items(): + if parser_addr in parser_addrs: + continue + canon = self._disambiguate_export(symbols) + # ARM THUMB: parser may use address - 1 (THUMB bit cleared) + if self.is_func_start(parser_addr - 1): + if self.func_mangled_name(parser_addr - 1) in {s.name for s in symbols}: + continue + logging.debug( + f"{self.log_prefix}: export {canon.name} @ {parser_addr:#x} " + f"not found in parser output" + ) + call_graph[canon] = [] + if len(symbols) > 1: + for sym in symbols: + self._binary.replace_function(canon, sym, True) + + # Step 3 — build the call graph, resolving thunk trampolines. 
+ trampoline_map: dict[str, str] = {} + # LIEF-confirmed imported names (.dynsym): distinguishes genuine PLT stubs + # (IMPORTED + name in this set) from inlined C++ functions mis-classified + # as external thunks by the disassembler (IMPORTED + name NOT in this set). + lief_imported_names: set[str] = set(self._binary.imported_symbol_names) + to_analyse = program_data + + while len(to_analyse) > 0: + missed_data = dict() + for func_data in to_analyse.values(): + exported = ( + func_data.addr in parser_exports + or func_data.addr + 1 in parser_exports # ARM THUMB + ) + + # Keep the function in the call graph when: + # (a) it is a normal/library function, + # (b) it is an exported or multi-callee thunk, OR + # (c) it was classified IMPORTED by the disassembler but its + # name is absent from LIEF's imported-symbol table AND it + # is registered in the binary — the disassembler + # mis-classified an inlined C++ function (e.g. D0Ev + # deleting-destructor, virtual thunks) as an external + # stub. Keeping it lets callers resolve it as a local + # call rather than generating an unresolved-callee error. + # The function_exists guard prevents promoting functions + # that were never registered (e.g. genuine C-linkage + # imports whose unmangled name happens to be absent from + # lief_imported_names). 
+ if ( + func_data.type in (FuncType.LIBRARY, FuncType.NORMAL) + or (func_data.type == FuncType.THUNK and (exported or len(func_data.calls) > 1)) + or ( + func_data.type == FuncType.IMPORTED + and func_data.name not in lief_imported_names + and self._binary.function_exists(func_data.name) + ) + ): + call_graph[func_data.symbol] = self._build_calls_list(func_data, program_data) + continue + + if func_data.type == FuncType.THUNK and len(func_data.calls) == 1: + if func_data.calls[0] not in program_data: + mangled_name = self.func_mangled_name(func_data.calls[0]) + if mangled_name == "": + logging.warning("Nothing found ") + continue + + func_symbol = Symbol( + name=mangled_name, + demangled_name=self.func_demangled_name(func_data.calls[0]), + is_func=True, + addr=func_data.calls[0], + ) + self._binary.add_function(func_symbol) + func = FuncData( + symbol=func_symbol, + type=self.func_type(func_data.calls[0]), + calls=self.func_children(func_data.calls[0]), + callers=self.func_parents(func_data.calls[0]), + ) + missed_data[func_data.calls[0]] = func + callee_data = func + else: + callee_data = program_data[func_data.calls[0]] + if callee_data.type == FuncType.IMPORTED: + # Keep the name of the thunk "strcpy, sprintf" + trampoline_name = func_data.name + destination_name = callee_data.name + # in case of nested functions (starting with _, keep the less nested one) + if _count_leading_underscores(trampoline_name) > _count_leading_underscores( + destination_name + ): + trampoline_name, destination_name = destination_name, trampoline_name + else: # Forward the call to the underlying function name + trampoline_name = func_data.name + destination_name = callee_data.name + # Resolve chains: A→B, B→C becomes A→C + while ( + destination_name in trampoline_map + and trampoline_map[destination_name] != destination_name + ): + destination_name = trampoline_map[destination_name] + # Do not record a trampoline substitution when the destination + # is a tool-generated synthetic name 
(e.g. "FUN_1234" or + # "sub_5678"): the disassembler could not identify the branch + # target, so replacing the original stub name with a synthetic + # placeholder would drop the cross-binary call edge entirely. + # Skipping the substitution leaves the stub name intact so + # fwmapper can still resolve it against exported_functions. + if not _SYNTHETIC_FUNC_NAME_RE.match(destination_name): + trampoline_map[trampoline_name] = destination_name + for key, val in trampoline_map.items(): + if val == trampoline_name: + trampoline_map[key] = destination_name + + # Only remove the thunk stub when it wraps a true external + # (IMPORTED) symbol — i.e. it is a genuine PLT stub. Internal + # forwarding thunks (callee type is NORMAL or another THUNK) + # must stay registered in the binary so their callers can + # resolve them as local calls. + if callee_data.type != FuncType.IMPORTED: + continue + + elif func_data.type == FuncType.THUNK and not func_data.calls and func_data.callers: + # Terminal thunk with callers but no callees — keep it + continue + + # Remove functions not kept as exported/library/normal. + # _Z-prefixed names are preserved: a statically linked binary can + # contain a private copy of a C++ symbol also present in the + # dynamic import table — removing it would break intra-binary edges. 
+ if func_data.name.startswith("_Z"): + continue + if ( + self._binary.function_exists(func_data.name) + and self._binary.get_function_by_name(func_data.name).addr == func_data.addr + ): + self._binary.remove_function(func_data.name) + to_analyse = missed_data + program_data.update(missed_data) + + # Apply trampoline substitutions to the final call graph + self._call_graph: dict[Symbol, list[str]] = { + sym: [trampoline_map.get(c, c) for c in callees] for sym, callees in call_graph.items() + } + + self.close() + + # ------------------------------------------------------------------ + # Public properties + # ------------------------------------------------------------------ + + @property + def binary(self) -> Binary: + """:return: the Binary produced by the parser.""" + return self._binary + + @property + def call_graph(self) -> dict[Symbol, list[str]]: + """:return: mapping from each Symbol to its list of callee names.""" + return self._call_graph + + # ------------------------------------------------------------------ + # Concrete helpers + # ------------------------------------------------------------------ + + def _generate_lief_bin(self, root_directory: Path, file_path: Path) -> Binary: + """Load the binary via LIEF and return a populated Binary object. + + :raises FsMapperError: on load failure or missing path information. + """ + result = FileSystemImportsMapper.load_binary(root_directory, file_path) + if isinstance(result, str): + raise FsMapperError(result) + lief_binary, _ = result + if lief_binary.real_path is None: + raise FsMapperError(f"{self.log_prefix}: real_path not set (skip)") + if not lief_binary.real_path.exists(): + raise FsMapperError(f"{self.log_prefix}: executable not found (skip)") + return lief_binary + + def _build_calls_list( + self, + func: FuncData, + call_graph: dict[int, FuncData], + ) -> list[str]: + """Given a function return its call list. + + It only contains functions that are contained in the call graph and have a name. 
+ + :return: a list of string (function names) + """ + res: list[str] = list() + for callee in [call_graph[addr] for addr in func.calls if addr in call_graph]: + if callee.name is not None and callee.name != "": + res.append(callee.name) + else: + logging.warning( + f"{self.log_prefix}: {func.symbol} calls unnamed function @ {callee.addr:#08x}" + ) + return res + + def _combine_program_analysis_binary( + self, + parser_exports: dict[int, list[Symbol]], + ) -> dict[int, FuncData]: + """Build a ``{parser_addr: FuncData}`` dict merging parser and LIEF data. + + For each function discovered by the parser: + + - If its parser-space address matches a LIEF export entry, the export + Symbol is used. + - Otherwise a new internal Symbol is created — unless the function name + matches a known imported symbol (e.g. a PLT stub already tracked by + LIEF as an import), in which case the function is skipped entirely. + + :param parser_exports: LIEF exports already remapped to parser space. + :return: mapping from parser-space address to FuncData. + """ + imported_names: set[str] = set(self._binary.imported_symbol_names) + program_data: dict[int, FuncData] = {} + + for parser_addr in self.func_addrs: + if parser_addr in parser_exports or parser_addr + 1 in parser_exports: + # Exported function — adopt the LIEF symbol. + symbols = parser_exports.get(parser_addr, parser_exports.get(parser_addr + 1, [])) + func_symbol = self._disambiguate_export(symbols) + parser_name = self.func_demangled_name(parser_addr) + if parser_name != func_symbol.demangled_name: + logging.debug( + f"{self.log_prefix}: rename {parser_name} → {func_symbol.demangled_name}" + ) + if len(symbols) > 1: + for sym in symbols: + self._binary.replace_function(func_symbol, sym, True) + else: + # Internal function — create a new Symbol in parser space. 
+ mangled_name = self.func_mangled_name(parser_addr) + # Skip LIEF-imported names except: (a) PLT thunks — must reach + # Step 3 to build trampoline_map; (b) _Z-prefixed names — a + # statically linked binary may contain a private copy of a symbol + # whose mangled name also appears in the dynamic import table. + if ( + mangled_name in imported_names + and not mangled_name.startswith("_Z") + and self.func_type(parser_addr) != FuncType.THUNK + ): + continue + func_symbol = Symbol( + name=mangled_name, + demangled_name=self.func_demangled_name(parser_addr), + is_func=True, + addr=parser_addr, + ) + self._binary.add_function(func_symbol) + + program_data[parser_addr] = FuncData( + symbol=func_symbol, + type=self.func_type(parser_addr), + calls=self.func_children(parser_addr), + callers=self.func_parents(parser_addr), + ) + + return program_data + + def _disambiguate_export(self, symbols: list[Symbol]) -> Symbol: + """Choose the most appropriate Symbol when multiple share the same address. + + Prefers the shortest name that does not start with ``_``. + Falls back to the globally shortest name if all names start with ``_``. 
+ """ + if len(symbols) == 1: + return symbols[0] + + chosen: Symbol | None = None + for sym in symbols: + if sym.demangled_name.startswith("_"): + continue + if chosen is None or len(sym.demangled_name) < len(chosen.demangled_name): + chosen = sym + + if chosen is None: + logging.debug( + f"{self.log_prefix}: all exports start with '_', " + f"picking shortest: {[s.demangled_name for s in symbols]}" + ) + chosen = min(symbols, key=lambda s: len(s.demangled_name)) + + return chosen + + +class IDABinaryParser(BinaryParser, IDA): + """Binary parser backed by IDA Pro.""" + + pass + + +class GhidraBinaryParser(BinaryParser, Ghidra): + """Binary parser backed by Ghidra.""" + + def __init__(self, *args, **kwargs)-> None: + super().__init__(*args, **kwargs) + program = self._ghidra_program + + # Build the exported-address set once so _func_type can check cheaply. + self._ghidra_exported_parser_addrs: set[int] = { + lief_addr - self._binary.image_base for lief_addr in self._binary.exported_funcs_by_addr + } + + # ET_REL (kernel modules, object files): Ghidra lays sections out at a + # fake base (0x10000); LIEF reports raw section-relative offsets. + # The two coordinate systems are incompatible — match by name instead. + self._ghidra_is_relocatable: bool = bool( + program.getOptions(program.PROGRAM_INFO).getBoolean("Relocatable", False) # type: ignore + ) + # Name → LIEF Symbol map, populated only for relocatable binaries. + self._ghidra_exported_names: dict = ( + { + sym.name: sym + for symbols in self._binary.exported_funcs_by_addr.values() + for sym in symbols + } + if self._ghidra_is_relocatable + else {} + ) + + def _combine_program_analysis_binary(self, parser_exports: dict) -> dict: + """Override for relocatable binaries (ET_REL, e.g. kernel modules). + + For ``ET_REL`` files Ghidra places sections in a fake address space + while LIEF reports raw section-relative offsets. Address-based + matching is impossible — exported symbols are matched by name instead. 
+ For non-relocatable binaries the base-class implementation is used. + + :param parser_exports: LIEF exports already remapped to parser space. + :return: mapping from parser-space address to FuncData. + """ + # Only GhidraParser sets _ghidra_is_relocatable; GhidraLoader doesn't + # call BaseParser.__init__ so this method is never reached from there. + if not getattr(self, "_ghidra_is_relocatable", False): + return super()._combine_program_analysis_binary(parser_exports) + + imported_names: set[str] = set(self._binary.imported_symbol_names) + program_data: dict[int, FuncData] = {} + + for parser_addr in self.func_addrs: + mangled_name = self.func_mangled_name(parser_addr) + + if mangled_name in self._ghidra_exported_names: + # ET_REL: adopt name/demangled from the LIEF export symbol but + # use the Ghidra parser-space address so the rest of BaseParser + # sees a consistent address space. + lief_sym = self._ghidra_exported_names[mangled_name] + func_symbol = Symbol( + name=lief_sym.name, + demangled_name=lief_sym.demangled_name, + is_func=True, + addr=parser_addr, + ) + parser_name = self.func_demangled_name(parser_addr) + if parser_name != func_symbol.demangled_name: + logging.debug( + f"{getattr(self, 'log_prefix', '')}: " + f"rename {parser_name} → {func_symbol.demangled_name}" + ) + self._binary.add_exported_symbol(func_symbol) + else: + if ( + mangled_name in imported_names + and not mangled_name.startswith("_Z") + and self.func_type(parser_addr) != FuncType.THUNK + ): + continue + func_symbol = Symbol( + name=mangled_name, + demangled_name=self.func_demangled_name(parser_addr), + is_func=True, + addr=parser_addr, + ) + self._binary.add_function(func_symbol) + + program_data[parser_addr] = FuncData( + symbol=func_symbol, + type=self.func_type(parser_addr), + calls=self.func_children(parser_addr), + callers=self.func_parents(parser_addr), + ) + + return program_data diff --git a/src/pyrrha_mapper/intercg/fwmapper.py b/src/pyrrha_mapper/mappers/intercg_mapper.py 
similarity index 57% rename from src/pyrrha_mapper/intercg/fwmapper.py rename to src/pyrrha_mapper/mappers/intercg_mapper.py index 270a674..d4dd3b4 100644 --- a/src/pyrrha_mapper/intercg/fwmapper.py +++ b/src/pyrrha_mapper/mappers/intercg_mapper.py @@ -16,58 +16,113 @@ """InterCGMapper implementation.""" import logging +import re from collections import defaultdict from pathlib import Path from typing import Any -from hashlib import md5 -import sys # third-party imports from numbat import SourcetrailDB from rich.progress import Progress -# local imports -from pyrrha_mapper.common import ( +from pyrrha_mapper.exceptions import FsMapperError +from pyrrha_mapper.mappers.intercg_bin_loader import ( + BinaryParser, + GhidraBinaryParser, + IDABinaryParser, +) +from pyrrha_mapper.types import Backend, ResolveDuplicateOption + +from .imports_mapper import ( + FileSystemImportsMapper, + hide_progress, +) +from .objects import ( Binary, FileSystem, Symbol, Symlink, - hide_progress, ) -from pyrrha_mapper.exceptions import FsMapperError -from pyrrha_mapper.fs import FileSystemImportsMapper -from pyrrha_mapper.intercg.loader import load_program -from pyrrha_mapper.types import ResolveDuplicateOption -from qbinary.types import Disassembler, ExportFormat -IGNORE_LIST = ["__gmon_start__"] +IGNORE_LIST: frozenset[str] = frozenset( + [ + # Linker-injected bookkeeping stubs + "__gmon_start__", + "_ITM_deregisterTMCloneTable", + "_ITM_registerTMCloneTable", + "__TMC_END__", + "deregister_tm_clones", + "register_tm_clones", + # ITM runtime helpers + "_ITM_RU1", + "_ITM_addUserCommitAction", + "_ITM_memcpyRnWt", + "_ITM_memcpyRtWn", + # C++ operators (Ghidra partial-demangle form) + "operator new", + "operator new[]", + "operator delete", + "operator delete[]", + "new[]", + "operator==", + "operator!=", + "operator<", + "operator>", + "operator<=", + "operator>=", + "operator=", + "operator+", + "operator-", + "operator*", + "operator/", + "operator[]", + "operator()", + "operator<<", + 
"operator>>", + "operator+=", + "operator-=", + # GCC exception helpers — never valid cross-binary callees + "__throw_bad_alloc", + "__throw_bad_array_new_length", + "__throw_bad_cast", + "__throw_bad_function_call", + "__throw_future_error", + "__throw_invalid_argument", + "__throw_length_error", + "__throw_logic_error", + "__throw_out_of_range", + "__throw_out_of_range_fmt", + "__throw_overflow_error", + "__throw_range_error", + "__throw_regex_error", + "__throw_runtime_error", + "__throw_system_error", + "__throw_underflow_error", + # C++ ABI internal + "__do_upcast", + ] +) -QUOKKA_EXT = ".quokka" +# Tool-generated synthetic names (FUN_, _INIT_, _FINI_) that can +# never be resolved as cross-binary callees. +_GHIDRA_SYNTHETIC_NAME_RE: re.Pattern[str] = re.compile( + r"^(?:FUN_[0-9A-Fa-f]+|_INIT_\d+|_FINI_\d+)$" +) NUMBAT_UI_BIN = "NumbatUi" -# Determine the command to open URLs based on the platform -try: - URL_OPEN_CMD = { - "linux": "xdg-open", - "win32": "start", - "darwin": "open" - }[sys.platform] -except KeyError: - logging.warning(f"Unsupported platform: {sys.platform} (will not add URL handler)") - URL_OPEN_CMD = "" # type: ignore - - class InterImageCGMapper(FileSystemImportsMapper): """Filesystem mapper based on Lief, which computes imports and exports.""" FS_EXT = ".fs.json" - DISASS = Disassembler.AUTO - EXPORT = ExportFormat.AUTO - - def __init__(self, root_directory: Path | str, db: SourcetrailDB | None): + def __init__( + self, + root_directory: Path | str, + db: SourcetrailDB | None, + backend: Backend, + ): super(InterImageCGMapper, self).__init__(root_directory, db) # super initialize root_directory, db_interface, fs and _dry_run variables @@ -84,7 +139,7 @@ def __init__(self, root_directory: Path | str, db: SourcetrailDB | None): self.exports_to_bins: dict[str, list[Binary]] = {} self.progress: Progress | None = None self.unresolved_callgraph: dict[Path, dict[Symbol, list[str]]] = dict() - self._current_binary_hash = "" + self.backend = 
backend def _correct_map_result(self, res: Any) -> bool: return ( @@ -98,23 +153,21 @@ def _correct_map_result(self, res: Any) -> bool: ) ) ) - + def load_binary_args(self) -> dict[str, Any]: """Return dict of args for load_binary that are always the same for the wholde firmware. - + Use to optimize multiprocessing. Set here there real values. """ res = super().load_binary_args() - res["disass"] = self.DISASS - res["exporter"] = self.EXPORT + res["backend"] = self.backend return res @staticmethod def load_binary( root_directory: Path, file_path: Path, - disass: Disassembler = DISASS, - exporter: ExportFormat = EXPORT, + backend: Backend = Backend.IDA, ) -> tuple[Binary, dict[Symbol, list[str]] | None] | str: """Load all the binaries located in the filesystem as Binary objects. @@ -124,41 +177,18 @@ def load_binary( enrich it with InterCG-mapper required data. It includes call graphs and some function normalization in case collisions. It modifies the FileSystem object in place. - - :param cache_file: Cache file to load binaries from (if exists) """ - res = FileSystemImportsMapper.load_binary(root_directory, file_path) - if isinstance(res, str): # error message - return res - else: - binary, _ = res - if binary.real_path is None: - return f"ERROR: Path on the filesystem of {binary.name} not set (skip)" - if not binary.real_path.exists(): - return ( - f"ERROR cannot find executable mentioned in 'fs' mapper: " - f"{binary.real_path.name} (skip)" - ) - try: - prefix = f"[binary mapping] {binary.name}" - unresolved_cg = load_program(binary, disass, exporter, prefix) - return binary, unresolved_cg + if backend == Backend.IDA: + ida_parser: BinaryParser = IDABinaryParser(root_directory, file_path) + return ida_parser.binary, ida_parser.call_graph + elif backend == Backend.GHIDRA: + ghidra_parser = GhidraBinaryParser(root_directory, file_path) + return ghidra_parser.binary, ghidra_parser.call_graph + else: + return f" disassembler {backend} is not supported" except 
(FileNotFoundError, FsMapperError, SyntaxError) as e: - logging.error(f"ERROR: Loading error: {binary.name}: {e}") - return binary, None - - - def add_url_handler(self, hash: str, binary: Binary, symbol: Symbol) -> None: - """ Open the function using a dedicated URL handler. (Use Heimdallr) """ - if not hash: - return # no hash, no URL handler - if URL_OPEN_CMD: - url = f"disas://{hash}?idb={binary.name+'.i64'}&offset={symbol.addr:#08x}" - cmd: list[str] = ["xdg-open", url] - self.db_interface.set_custom_command(symbol.id, cmd, "Open in Disassembler") # type: ignore - else: - pass # Can't add URL unsuported platform + return f"[binary mapping] {file_path.name}: ERROR: Loading error: {e}" def map_binary( self, @@ -170,27 +200,19 @@ def map_binary( This function updates the filesystem representation stored as `self.fs`. :param bin_object: Binary object """ - self._current_binary_hash = md5(Path(bin_object.real_path).read_bytes()).hexdigest() - super().map_binary(bin_object) if additional_res is not None: self.unresolved_callgraph[bin_object.path] = additional_res if bin_object.id is not None: self.node_ids[bin_object.id] = bin_object - if additional_res is not None: - self._record_custom_command(bin_object, f"[bin mapping] {bin_object.name}") - - def symbol_recorded(self, binary: Binary, symbol: Symbol) -> None: - """ - Register a symbol recorded handler to add a custom command. 
- """ - self.add_url_handler(self._current_binary_hash, binary, symbol) def _treat_bin_parsing_result(self, path: Path, res: Any): """Handle load_binary res, map it or display error.""" - log_prefix = f"[binary mapping] {path.name}" + log_prefix = f"[binary parsing] {path.name}" if isinstance(res, str): logging.error(f"{log_prefix}: {res}") + elif isinstance(res, BaseException): + logging.error(f"{log_prefix}: {repr(res)}") elif self._correct_map_result(res): bin_obj, additional_info = res self.map_binary(bin_obj, additional_info) @@ -198,7 +220,61 @@ def _treat_bin_parsing_result(self, path: Path, res: Any): self.map_binary(res[0], None) logging.info(f"{log_prefix}: fallback to lief results, internal analysis failed") else: - logging.warning(f"{log_prefix}: impossible to parse the following result {res}") + logging.warning(f"{log_prefix}: impossible to parse the following result {res.args}") + + @staticmethod + def _merge_parser_functions_into_cached_binary( + parser_bin: Binary, cached_bin: Binary, log_prefix: str = "" + ) -> None: + """Merge disassembler-discovered functions from *parser_bin* into *cached_bin*. + + The .fs.json cache only contains LIEF-visible data. After a cache + reload the disassembler is re-run to rebuild the call graph, but the + resulting ``Binary`` object (``parser_bin``) is discarded — only the + cached binary (``cached_bin``) stays in ``self.fs``. This means that + any function the disassembler registered that was not already present in + the LIEF binary (e.g. internal functions discovered via ``add_function`` + during ``_combine_program_analysis_binary``) will be absent from + ``cached_bin``. The CG mapping loop then hits + ``not binary.function_exists(f_symb.name)`` for every such function and + silently drops the associated call edges. + + This helper bridges the gap by: + + 1. Registering internal functions present in ``parser_bin`` but absent + from ``cached_bin``. 
These functions have no DB id (they were never + recorded in Numbat), which is correct — only exported symbols are + recorded. + 2. Registering exported functions present in ``parser_bin`` but absent + from ``cached_bin``. This handles symbols the disassembler promoted + to exports that LIEF did not see. No DB id is assigned. + + The operation is intentionally conservative: it never removes existing + functions from ``cached_bin`` and never overwrites a symbol that already + has an id. + + :param parser_bin: freshly-parsed Binary produced by the disassembler. + :param cached_bin: Binary loaded from the .fs.json cache (has DB ids). + :param log_prefix: prefix for log messages. + """ + # Step 1 — register internal functions discovered only by the disassembler. + for func_name, func_symb in parser_bin.internal_functions.items(): + if not cached_bin.function_exists(func_name): + cached_bin.add_function(func_symb, func_name=func_name) + logging.debug( + f"{log_prefix}: merged internal function '{func_name}' from parser into cache" + ) + + # Step 2 — ensure the cached binary's exported function set is a + # superset of the parser's. Symbols exported by the disassembler but + # absent from the cached binary are added so function_exists() succeeds. + # No DB id is assigned — these symbols were not recorded in Numbat. + for func_name, func_symb in parser_bin.exported_functions.items(): + if not cached_bin.exported_function_exists(func_name): + cached_bin.add_exported_symbol(func_symb, symbol_name=func_name) + logging.debug( + f"{log_prefix}: merged exported function '{func_name}' from parser into cache" + ) def map_binaries_main(self, threads: int, progress: Progress) -> None: """Parse and map binaries of a given directory. @@ -219,13 +295,42 @@ def map_binaries_main(self, threads: int, progress: Progress) -> None: binaries_map = progress.add_task( "[red]Binaries recording", total=len(list(self.fs.iter_binaries())) ) + # The .fs.json cache only serialises LIEF-visible data. 
The + # disassembler call graph is transient and internal functions + # discovered by the disassembler (absent from LIEF's symbol table) + # are not persisted either. We must therefore: + # 1. Re-run the disassembler for each binary to rebuild + # unresolved_callgraph and recover internal functions. + # 2. Merge those internal functions into the cached Binary + # BEFORE calling record_binary_in_db, so that Numbat receives + # DB ids for them. Without ids, _record_call_ref silently + # drops every call whose caller is an internal function. for binary in self.fs.iter_binaries(): log_prefix = f"[bin mapping] {binary.name}" - # Create the node entry in numbat and create the custom command + if binary.real_path is not None: + res = self.load_binary(file_path=binary.real_path, **self.load_binary_args()) + if isinstance(res, str): + logging.error(f"{log_prefix}: CG reload failed: {res}") + elif self._correct_map_result(res): + parser_bin, call_graph = res + # Merge disassembler functions before recording in DB + # even when call_graph is empty — the binary may still + # expose internal functions needed as call targets. + self._merge_parser_functions_into_cached_binary( + parser_bin, binary, log_prefix + ) + if call_graph is not None: + self.unresolved_callgraph[binary.path] = call_graph + else: + logging.warning(f"{log_prefix}: unexpected result during CG reload") + else: + logging.warning(f"{log_prefix}: no real_path set, skipping CG reload") + + # Record in DB after the merge so internal functions discovered + # by the disassembler are included and receive DB ids. 
self.record_binary_in_db(binary, log_prefix) if binary.id is not None: self.node_ids[binary.id] = binary - self._record_custom_command(binary, log_prefix) progress.update(binaries_map, advance=1) else: @@ -252,15 +357,13 @@ def mapper_main( # Step1: Load FileSystem object and enrich it if needed self.map_binaries_main(threads, progress) self.map_symlinks_main(progress) - self.dry_run_mode = True # (do not record lib imports in numbat db) + self.dry_run_mode = True # do not record lib imports in numbat db self.map_lib_imports_main(progress, resolution_strategy) if self.db_interface is not None: self.dry_run_mode = False - self.progress = progress # need to be able to hide it further down in calls+ - - # Dict of: exported-funs -> [binaries] - self.exports_to_bins = self.make_export_to_binaries_map() + self.progress = progress + self.exports_to_bins = self._make_export_to_binaries_map() # Iterate again all binaries to create call edges (all numbat_id are created) cg_map = progress.add_task( @@ -273,16 +376,27 @@ def mapper_main( count_res = {True: 0, False: 0} if binary.path in self.unresolved_callgraph: for f_symb, targets in self.unresolved_callgraph[binary.path].items(): - if targets and not binary.function_exists(f_symb.name): + if not binary.function_exists(f_symb.name): + if targets: + addr_log = {hex(f_symb.addr) if f_symb.addr is not None else None} + logging.error( + f"function {f_symb.name} ({addr_log}) not in binary: {binary.name}" + ) + continue + + try: + caller = binary.get_function_by_name(f_symb.name) + except KeyError: logging.error( - f"function {f_symb.name} ({hex(f_symb.addr) if f_symb.addr is not None else None}) not in binary: {binary.name}" + f"{log_prefix}: caller {f_symb.name} not found in binary {binary.name}" ) continue + for target in targets: try: res = self._record_one_call( binary, - f_symb, + caller, target, resolution_strategy, unindex_symbols, @@ -307,20 +421,6 @@ def mapper_main( # return the filesystem object return self.fs - def 
_record_custom_command(self, binary: Binary, log_prefix: str = "") -> None: - """Add a custom command to call numbat-ui on the underlying Sourcetrail. - - :param binary: binary on which to apply the custom command - """ - if self.dry_run_mode: - return None - assert self.db_interface is not None - cmd = [NUMBAT_UI_BIN, str(binary.real_path) + ".srctrlprj"] - if binary.id is None: - logging.warning(f"{log_prefix}: cannot record command as binary has no id") - else: - self.db_interface.set_custom_command(binary.id, cmd, f"Open in {NUMBAT_UI_BIN}") - def _record_call_ref(self, src: Symbol, dst: Symbol, log_prefix: str = "") -> bool: """Add call reference between two symbols in DB. @@ -332,8 +432,8 @@ def _record_call_ref(self, src: Symbol, dst: Symbol, log_prefix: str = "") -> bo assert self.db_interface is not None if src.id is None or dst.id is None: logging.error( - f"{log_prefix}: Cannot record call ref between {src.name} and " - f"{dst.name}, missing ids ({src.name}: {src.id}, {dst.name}: {dst.id})" + f"{log_prefix}: Cannot record call ref between '{src.name}' and " + f"'{dst.name}', missing ids ({src.name}: {src.id}, {dst.name}: {dst.id})" ) return False self.db_interface.record_ref_call(src.id, dst.id) @@ -361,7 +461,7 @@ def _record_unindexed_call(self, src: Symbol, dst: str, log_prefix: str = "") -> return None self.db_interface.record_ref_call(src.id, tgt_id) - def make_export_to_binaries_map(self) -> dict[str, list[Binary]]: + def _make_export_to_binaries_map(self) -> dict[str, list[Binary]]: """Compute dict mapping: exported-funs -> binaries (exporting the function). Indeed multiple binaries can export the same symbol ! @@ -399,20 +499,31 @@ def _record_one_call( :return: True if target function was found """ - # local call + # Ghidra emits template arguments in callee names (e.g. "_M_insert"); + # strip them so lookups match the base-name key in exported_functions. 
+ if "<" in callee: + callee = callee[: callee.index("<")] + + # The disassembler may emit versioned symbol names (e.g. "getenv@@GLIBC_2.4"). + # All export/import keys are stored without the version suffix, so strip it. + if "@@" in callee: + callee = callee[: callee.index("@@")] + if binary.function_exists(callee): callee_symb = binary.get_function_by_name(callee) binary.add_call(caller, callee_symb) - return self._record_call_ref(caller, callee_symb) + return self._record_call_ref(caller, callee_symb, f"{log_prefix}: local call") - if callee in IGNORE_LIST: + if callee in IGNORE_LIST or _GHIDRA_SYNTHETIC_NAME_RE.match(callee): return False # already solved import if binary.imported_symbol_exists(callee, is_resolved=True): callee_symb = binary.get_imported_symbol(callee) binary.add_call(caller, callee_symb) - return self._record_call_ref(caller, callee_symb) + return self._record_call_ref( + caller, callee_symb, f"{log_prefix}: already solved import" + ) # solve import from listed imported libraries tmp = self.resolve_symbol_import(binary, callee, resolver, log_prefix) @@ -422,7 +533,9 @@ def _record_one_call( binary.add_imported_library(target_bin) binary.add_imported_symbol(target_symb) binary.add_call(caller, target_symb) - return self._record_call_ref(caller, target_symb) + return self._record_call_ref( + caller, target_symb, f"{log_prefix}: import in listed imported lib" + ) # Get binaries exporting this symbol served_by: list[Binary] = self.exports_to_bins[callee] @@ -448,10 +561,12 @@ def _record_one_call( callee_symb = served_by[0].get_exported_symbol(callee) binary.add_imported_symbol(callee_symb) binary.add_call(caller, callee_symb) - return self._record_call_ref(caller, callee_symb) + return self._record_call_ref(caller, callee_symb, log_prefix) else: # still not resolved - self._record_unindexed_call(caller, callee) + self._record_unindexed_call(caller, callee, log_prefix) if binary.path.suffix != ".ko": unindex_symbols.add(callee) - 
logging.debug(f"{log_prefix}: no match found for edge {caller.name} -> {callee}") + logging.warning(f"{log_prefix}: no match found for edge {caller.name} -> {callee}") + else: + logging.debug(f"{log_prefix}: no match found for edge {caller.name} -> {callee}") return False diff --git a/src/pyrrha_mapper/common/objects.py b/src/pyrrha_mapper/mappers/objects.py similarity index 93% rename from src/pyrrha_mapper/common/objects.py rename to src/pyrrha_mapper/mappers/objects.py index 1c4dd9a..62cc2b7 100644 --- a/src/pyrrha_mapper/common/objects.py +++ b/src/pyrrha_mapper/mappers/objects.py @@ -102,16 +102,20 @@ class Binary(FileSystemComponent): ) # warning only symbols which are not functions exported_functions: dict[str, Symbol] = Field(default_factory=dict) - # Fields for call graph representation - # functions is both: internal functions + exported functions + # Call graph fields internal_functions: dict[str, Symbol] = Field(default_factory=dict) calls: dict[str, list[Symbol]] = Field(default_factory=dict) - # ELF specific fields + # ELF-specific fields + soname: str | None = Field(default=None) # ELF DT_SONAME (e.g. "libpthread.so.0") version_requirement: dict[str, list[str]] = Field( default_factory=dict ) # dict(symbol_name, list(requirements)) + # Runtime-only (excluded from serialisation) + image_base: int = Field(default=0, exclude=True) + is_relocatable: bool = Field(default=False, exclude=True) + @field_validator("internal_functions", "exported_functions", mode="after") @classmethod def validate_functions_field(cls, value: dict[str, Symbol]) -> dict[str, Symbol]: @@ -173,6 +177,9 @@ def add_exported_symbol(self, symbol: Symbol, symbol_name: str = "") -> None: if symbol.is_func: self.exported_functions[symbol_name] = symbol self.exported_symbols.pop(symbol_name, None) + # Remove from internal_functions if it was previously registered + # there (e.g. LIEF yields the same symbol via .symtab then .dynsym). 
+ self.internal_functions.pop(symbol_name, None) else: self.exported_symbols[symbol_name] = symbol self.exported_functions.pop(symbol_name, None) @@ -413,6 +420,7 @@ class FileSystem(BaseModel): symlinks: dict[Path, Symlink] = Field(default_factory=dict) _binary_names: dict[str, list[Binary]] = PrivateAttr(default_factory=dict, init=False) _symlink_names: dict[str, list[Symlink]] = PrivateAttr(default_factory=dict, init=False) + _soname_to_binaries: dict[str, list[Binary]] = PrivateAttr(default_factory=dict, init=False) def __repr__(self): # noqa: D105 return ( @@ -420,8 +428,7 @@ def __repr__(self): # noqa: D105 f"bins={len(self.binaries)}, symlinks={len(self.symlinks)})" ) - # ------------------------------ Overload Pydantic methods ------------------------- - # Always export by aliases, set always excluded attributes + # Pydantic overrides: always export by aliases, exclude runtime-only fields. @field_serializer( "binaries", mode="plain", when_used="always", return_type=dict[str | Path, dict] ) @@ -441,6 +448,7 @@ def fs_bin_serializer(self, v: dict[Path, Binary], info: SerializationInfo) -> A "id": True, "path": True, "name": True, + "soname": True, "imported_symbols": True, "exported_symbols": True, "exported_functions": True, @@ -521,11 +529,13 @@ def fs_bin_validate(cls, data: Any, info: ValidationInfo) -> Any: ) from e if lib_path_obj not in res: raise ValueError(f"Imported lib '{lib_path}' not listed in binaries") - res[bin_path].add_imported_library(res[lib_path_obj]) + # Store under the original import name (the dict key) rather + # than the binary's filename so SONAME keys survive + # serialization round-trips (e.g. "libc.so.6" stays as-is + # instead of being replaced by "libc-2.11.1.so"). + res[bin_path].imported_libraries[name] = res[lib_path_obj] - # optmimize version by replacing every iteration of the same symbol (same id) - # by one object - # 1. 
generate dict of symbols by ids + # Deduplicate: replace repeated Symbol instances with the same id by one object. symbols_by_ids: dict[int, Symbol] = { s.id: s for bin in res.values() for s in bin.iter_exported_symbols() if s.id is not None } @@ -671,6 +681,15 @@ def _record_component_name(self, fs_object: Binary | Symlink) -> None: names_dict[fs_object.name].append(fs_object) # type: ignore else: names_dict[fs_object.name] = [fs_object] # type: ignore + # Index binaries by their ELF SONAME so that imports referencing the + # SONAME (e.g. "libpthread.so.0") can be resolved even when no symlink + # with that name exists in the firmware filesystem. + if isinstance(fs_object, Binary) and fs_object.soname: + soname = fs_object.soname + if soname in self._soname_to_binaries: + self._soname_to_binaries[soname].append(fs_object) + else: + self._soname_to_binaries[soname] = [fs_object] def _set_object_realpath(self, obj: FileSystemComponent) -> None: obj.real_path = Path(self.root_dir) / ("." + str(obj.path)) @@ -709,6 +728,14 @@ def symlink_name_exists(self, name: str) -> bool: """return: true if the given name is stored in the current FS instance.""" return name in self._symlink_names + def soname_exists(self, soname: str) -> bool: + """return: true if the given SONAME is stored in the current FS instance.""" + return soname in self._soname_to_binaries + + def get_binaries_by_soname(self, soname: str) -> list[Binary]: + """:return: the binaries with the given SONAME.""" + return self._soname_to_binaries[soname] + def get_binaries_by_name(self, name: str) -> list[Binary]: """:return: the binaries with the given path.""" return self._binary_names[name] diff --git a/src/pyrrha_mapper/types.py b/src/pyrrha_mapper/types.py index 30c336d..86b043a 100644 --- a/src/pyrrha_mapper/types.py +++ b/src/pyrrha_mapper/types.py @@ -15,24 +15,17 @@ # limitations under the License. 
"""Types shared in multiple mappers.""" -from enum import Enum, auto +from enum import Enum, StrEnum, auto -class Disassembler(Enum): - """Represent a SRE (Software Reverse Engineering tool, a disassembler).""" +class Backend(Enum): + """Represent the backend used for Pyrrha.""" - AUTO = auto() # doc: Disassembler shall selected automatically IDA = auto() # doc: IDA Pro disassembler GHIDRA = auto() # doc: GHIDRA disassembler BINARY_NINJA = auto() # doc: Binary Ninja disassembler - - -class Exporters(Enum): - """Represent export file formats used in some of the mappers.""" - - AUTO = auto() # doc: The exporter shall be automatically selected - BINEXPORT = auto() # doc: Use Binexport as exporter - QUOKKA = auto() # doc: Use Quokka as exporter + QUOKKA_IDA = auto() # doc: Use Quokka as exporter of IDA + QUOKKA_GHIDRA = auto() # doc: Use Quokka as exporter of Ghidra class ResolveDuplicateOption(Enum): @@ -41,3 +34,12 @@ class ResolveDuplicateOption(Enum): IGNORE = 1 # doc: The mapper will let the conflict as unresolved. ARBITRARY = 2 # doc: The mapper will choose a default one. INTERACTIVE = 3 # doc: The user can interactively solve the conflict. + + +class FuncType(StrEnum): + """Represent the type of a function.""" + + IMPORTED = "imported" + LIBRARY = "library" + NORMAL = "normal" + THUNK = "thunk" diff --git a/tests/conftest.py b/tests/conftest.py index b354e3c..20f6f7e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,16 +1,62 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023-2025 Quarkslab +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Pytest configuration and shared fixtures.""" + +import os +import shutil +from pathlib import Path + import pytest -from qbinary.types import ExportFormat, Disassembler -def pytest_addoption(parser): +from pyrrha_mapper.types import Backend + + +def pytest_addoption(parser: pytest.Parser) -> None: + """Register custom CLI options.""" parser.addoption( - "--disassembler", + "--backend", action="store", - help="disassembler", - choices={x.name.lower() for x in Disassembler}, + help="backend", + choices={x.name.lower() for x in [Backend.IDA, Backend.GHIDRA]}, ) - parser.addoption( - "--exporter", - action="store", - help="exporter", - choices={x.name.lower() for x in ExportFormat}, - ) \ No newline at end of file + + +@pytest.fixture(autouse=True) +def _collect_export_artifacts(request: pytest.FixtureRequest) -> None: + """Copy artifacts produced by export_res to PYTEST_ARTIFACTS_DIR when it is set.""" + artifacts_dir = os.environ.get("PYTEST_ARTIFACTS_DIR") + if not artifacts_dir: + return + # Only act when the test used the export_res fixture. 
+ if "export_res" not in request.fixturenames: + return + + def _copy() -> None: + try: + export_res = request.getfixturevalue("export_res") + except pytest.FixtureLookupError: + return + dest = Path(artifacts_dir) + dest.mkdir(parents=True, exist_ok=True) + for path in [ + export_res.export_path, + export_res.db_path, + export_res.project_path, + ]: + if path.exists(): + shutil.copy2(path, dest / path.name) + + request.addfinalizer(_copy) diff --git a/tests/test_cli.py b/tests/test_cli.py index 8ead6e6..65ad0a4 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -23,21 +23,74 @@ from click import Command from click.testing import CliRunner, Result +from pyrrha_mapper import FileSystem, Symbol from pyrrha_mapper.__main__ import pyrrha -from pyrrha_mapper.common import FileSystem, Symbol -from pyrrha_mapper.intercg.fwmapper import InterImageCGMapper +from pyrrha_mapper.mappers import ExportedDecompilation, InterImageCGMapper def check_click_result(res: Result) -> None: """Raise Assertion error if issue.""" - assert res.exit_code == 0 + assert res.exit_code == 0, res.output assert not res.exception, res.exception for log in res.stderr.splitlines(): - assert ( - "ERROR" not in log - and "WARNING" not in log - and "CRITICAL" not in log - ), f"Error log: {log}" + assert "ERROR" not in log and "WARNING" not in log and "CRITICAL" not in log, ( + f"Error log: {log}" + ) + + +def check_click_result_allow_logs(res: Result) -> None: + """Like check_click_result but tolerates per-function ERROR/WARNING logs. + + The decomp mapper legitimately logs warnings/errors for individual + functions (e.g. a declaration not located in some decompiled body); these + do not make the run fail. Only the exit code and absence of an exception + are checked here, plus that no CRITICAL message was emitted. 
+ """ + assert res.exit_code == 0, res.output + assert not res.exception, res.exception + for log in res.stderr.splitlines(): + assert "CRITICAL" not in log, f"Critical log: {log}" + + +class _SubprocessResult(NamedTuple): + """Mimic the subset of click ``Result`` used by ``check_click_result``. + + Backends that start a JVM (Ghidra via pyghidra/JPype) cannot be launched + reliably with ``CliRunner.invoke``: it runs the command *in-process*, and + starting the JVM inside the already-initialised pytest/coverage process + aborts JVM start-up (surfacing as + ``module '_jpype' has no attribute '_java_lang_Class'``). Running pyrrha + in a fresh subprocess - exactly how it is used in production and in the + standalone CLI - avoids this entirely. + """ + + exit_code: int + output: str + stderr: str + exception: BaseException | None = None + + +def run_pyrrha_subprocess(args: list) -> "_SubprocessResult": + """Run the pyrrha CLI in a separate process and adapt the result. + + :param args: CLI arguments (without the leading ``pyrrha``). + :return: a ``Result``-compatible object accepted by ``check_click_result``. 
+ """ + import subprocess + import sys + + completed = subprocess.run( + [sys.executable, "-m", "pyrrha_mapper", *map(str, args)], + capture_output=True, + text=True, + ) + return _SubprocessResult( + exit_code=completed.returncode, + output=completed.stdout + completed.stderr, + stderr=completed.stderr, + exception=None, + ) + class TestCLI: """Tests to check that the CLI works and display correct messages.""" @@ -83,7 +136,8 @@ def SUBCOMMAND(self) -> str: FW_TEST_BIN_PATHS = { FW_TEST_LD, Path("/lib/libc.so.6"), - Path("/lib/libcrypto.so.1.1"), + # Path("/lib/libcrypto.so.1.1"), + Path("/lib/libcrypto.so.FOR_SONAME_TESTING"), Path("/lib/libdl.so.2"), Path("/lib/libpthread.so.0"), Path("/lib/libssl.so.1.1"), @@ -91,12 +145,24 @@ def SUBCOMMAND(self) -> str: } FW_TEST_SYMLINKS_PATHS = {Path("/lib/libssl.so")} + FW_TEST_SONAMES = { + "ld-linux.so.3": "ld-linux.so.3", + "libcrypto.so.FOR_SONAME_TESTING": "libcrypto.so.1.1", + "libdl.so.2": "libdl.so.2", + "libpthread.so.0": "libpthread.so.0", + "libssl.so.1.1": "libssl.so.1.1", + } + # =============================== INTERNAL STUFFS ================================== class ExecResults(NamedTuple): # noqa: D106 res: Result db_path: Path + @property + def project_path(self) -> Path: # noqa: D102 + return self.db_path.with_suffix(".srctrlprj") + @property def export_path(self) -> Path: # noqa: D102 return self.db_path.with_suffix(".json") @@ -110,22 +176,23 @@ def _path_id(val): # =============================== FIXTURES ======================================== @pytest.fixture(scope="class") - def pyrrha_exec(self, request, tmp_path_factory) -> ExecResults: + @classmethod + def pyrrha_exec(cls, request, tmp_path_factory) -> ExecResults: """Run pyrrha whith the given thread number and the given db path.""" runner = CliRunner() tmp_path = ( tmp_path_factory.mktemp("db", numbered=True) - / f"{self.SUBCOMMAND}-{request.param}.srctrldb" + / f"{cls.SUBCOMMAND}-{request.param}.srctrldb" ) args = [ - self.SUBCOMMAND, + 
cls.SUBCOMMAND, "--db", f"{tmp_path}", "-j", request.param, - f"{self.FW_TEST_PATH}", + f"{cls.FW_TEST_PATH}", ] - return self.ExecResults(res=runner.invoke(self.COMMAND, args), db_path=tmp_path) + return cls.ExecResults(res=runner.invoke(cls.COMMAND, args), db_path=tmp_path) @abstractmethod @pytest.fixture(scope="class") @@ -134,7 +201,8 @@ def export_res(self, tmp_path_factory, request) -> ExecResults: ... @pytest.fixture(scope="class") - def export_dump(self, export_res: ExecResults) -> FileSystem: + @classmethod + def export_dump(cls, export_res: ExecResults) -> FileSystem: """Load JSON export into a FileSystem object.""" return FileSystem.from_json_export(export_res.export_path) @@ -224,23 +292,24 @@ class TestFSMapper(BaseTestFsMapper): # =============================== FIXTURES ======================================== @pytest.fixture(scope="class") - def export_res(self, tmp_path_factory, request) -> BaseTestFsMapper.ExecResults: + @classmethod + def export_res(cls, tmp_path_factory, request) -> BaseTestFsMapper.ExecResults: """Run Pyrrha with export activated.""" runner = CliRunner() tmp_path = ( tmp_path_factory.mktemp("db", numbered=True) - / f"{self.SUBCOMMAND}-{request.param}-export.srctrldb" + / f"{cls.SUBCOMMAND}-{request.param}-export.srctrldb" ) args = [ - self.SUBCOMMAND, + cls.SUBCOMMAND, "-e", "--db", f"{tmp_path}", "-j", request.param, - f"{self.FW_TEST_PATH}", + f"{cls.FW_TEST_PATH}", ] - return self.ExecResults(res=runner.invoke(self.COMMAND, args), db_path=tmp_path) + return cls.ExecResults(res=runner.invoke(cls.COMMAND, args), db_path=tmp_path) # =================================== TESTS ======================================== @@ -256,6 +325,18 @@ def test_resolved_imported_symbols(self, bin_path: Path, export_dump: FileSystem "Some imported symbols have not been resolved" ) + @pytest.mark.parametrize("export_res", [1, 16], indirect=True) + @pytest.mark.parametrize( + "bin_path", BaseTestFsMapper.FW_TEST_BIN_PATHS, 
ids=BaseTestFsMapper._path_id + ) + def test_sonames(self, bin_path: Path, export_dump: FileSystem) -> None: + """Imported symbols correspond to a symbol object.""" + _bin = export_dump.get_binary_by_path(bin_path) + if _bin.path.name in BaseTestFsMapper.FW_TEST_SONAMES.keys(): + assert BaseTestFsMapper.FW_TEST_SONAMES[_bin.path.name] == _bin.soname, ( + "Some sonames are not matching" + ) + class TestFsCgMapper(BaseTestFsMapper): """Main functional test class for the fs-cg mapper. Tests are done from the CLI.""" @@ -272,48 +353,52 @@ def export_path(self) -> Path: # noqa: D102 # =============================== FIXTURES ========================================= @pytest.fixture(scope="class") - def pyrrha_exec(self, request, tmp_path_factory) -> BaseTestFsMapper.ExecResults: - """Run pyrrha whith the given thread number and the given db path.""" - runner = CliRunner() + @classmethod + def pyrrha_exec(cls, request, tmp_path_factory) -> BaseTestFsMapper.ExecResults: + """Run pyrrha whith the given thread number and the given db path. + + Uses a subprocess (not CliRunner) because the Ghidra backend starts a + JVM, which cannot be launched in-process inside pytest. 
+ """ tmp_path = ( tmp_path_factory.mktemp("db", numbered=True) - / f"{self.SUBCOMMAND}-{request.param}.srctrldb" + / f"{cls.SUBCOMMAND}-{request.param}.srctrldb" ) args = [ - self.SUBCOMMAND, - "--disassembler", - f"{request.config.getoption('--disassembler')}", - "--exporter", - f"{request.config.getoption('--exporter')}", + cls.SUBCOMMAND, + "--backend", + f"{request.config.getoption('--backend')}", "--db", f"{tmp_path}", "-j", request.param, - f"{self.FW_TEST_PATH}", + f"{cls.FW_TEST_PATH}", ] - return self.ExecResults(res=runner.invoke(self.COMMAND, args), db_path=tmp_path) + return cls.ExecResults(res=run_pyrrha_subprocess(args), db_path=tmp_path) @pytest.fixture(scope="class") - def export_res(self, tmp_path_factory, request) -> BaseTestFsMapper.ExecResults: - """Run Pyrrha with export activated.""" - runner = CliRunner() + @classmethod + def export_res(cls, tmp_path_factory, request) -> BaseTestFsMapper.ExecResults: + """Run Pyrrha with export activated. + + Uses a subprocess (not CliRunner) because the Ghidra backend starts a + JVM, which cannot be launched in-process inside pytest. 
+ """ tmp_path = ( tmp_path_factory.mktemp("db", numbered=True) - / f"{self.SUBCOMMAND}-{request.param}-export.srctrldb" + / f"{cls.SUBCOMMAND}-{request.param}-export.srctrldb" ) args = [ - self.SUBCOMMAND, - "--disassembler", - f"{request.config.getoption('--disassembler')}", - "--exporter", - f"{request.config.getoption('--exporter')}", + cls.SUBCOMMAND, + "--backend", + f"{request.config.getoption('--backend')}", "--db", f"{tmp_path}", "-j", request.param, - f"{self.FW_TEST_PATH}", + f"{cls.FW_TEST_PATH}", ] - return self.ExecResults(res=runner.invoke(self.COMMAND, args), db_path=tmp_path) + return cls.ExecResults(res=run_pyrrha_subprocess(args), db_path=tmp_path) # =================================== TESTS ======================================== @@ -342,3 +427,113 @@ def test_resolved_imported_symbols(self, bin_path: Path, export_dump: FileSystem assert target.name in _bin.imported_symbol_names assert _bin.imported_symbol_exists(target.name) assert isinstance(_bin.get_imported_symbol(target.name), Symbol) + + +class TestDecompMapper: + """Functional tests for the decomp mapper. Tests are done from the CLI. + + The decomp mapper runs on a single executable, so each binary of the test + firmware triggers its own ``decomp`` invocation. A subprocess (not + CliRunner) is used because the Ghidra backend starts a JVM, which cannot be + launched in-process inside pytest. + """ + + COMMAND: Command = pyrrha + SUBCOMMAND = "decomp" + + FW_TEST_PATH = Path(__file__).parent / "test_fw" + # Same set of executables as the fs-cg functional tests. 
+ FW_TEST_BIN_PATHS = BaseTestFsMapper.FW_TEST_BIN_PATHS + + class ExecResults(NamedTuple): # noqa: D106 + res: Result + db_path: Path + + @property + def project_path(self) -> Path: # noqa: D102 + return self.db_path.with_suffix(".srctrlprj") + + @property + def export_path(self) -> Path: # noqa: D102 + return self.db_path.with_suffix(".json") + + @staticmethod + def _path_id(val): + if isinstance(val, Path): + return str(val) + return val + + @classmethod + def _host_path(cls, bin_path: Path) -> Path: + """:return: the on-host path of a firmware-relative binary path.""" + return cls.FW_TEST_PATH / bin_path.relative_to(bin_path.anchor) + + # =============================== FIXTURES ========================================= + + @pytest.fixture(scope="class") + @classmethod + def export_res(cls, tmp_path_factory, request) -> "TestDecompMapper.ExecResults": + """Run the decomp mapper with export activated on a single executable.""" + bin_path: Path = request.param + executable = cls._host_path(bin_path) + tmp_path = ( + tmp_path_factory.mktemp("db", numbered=True) + / f"{cls.SUBCOMMAND}-{bin_path.name}.srctrldb" + ) + args = [ + cls.SUBCOMMAND, + "--backend", + f"{request.config.getoption('--backend')}", + "--db", + f"{tmp_path}", + "--export", + f"{executable}", + ] + return cls.ExecResults(res=run_pyrrha_subprocess(args), db_path=tmp_path) + + @pytest.fixture(scope="class") + @classmethod + def export_dump(cls, export_res: "TestDecompMapper.ExecResults") -> ExportedDecompilation: + """Load the JSON export into an ExportedDecompilation object.""" + return ExportedDecompilation.from_json_export(export_res.export_path) + + # =================================== TESTS ======================================== + + @pytest.mark.parametrize("export_res", FW_TEST_BIN_PATHS, indirect=True, ids=_path_id) + def test_db_creation(self, export_res: "TestDecompMapper.ExecResults") -> None: + """The NumbatUI DB and project files are generated.""" + 
check_click_result_allow_logs(export_res.res) + assert export_res.db_path.with_suffix(".srctrldb").exists(), "Missing DB file" + assert export_res.db_path.with_suffix(".srctrlprj").exists(), "Missing project file" + + @pytest.mark.parametrize("export_res", FW_TEST_BIN_PATHS, indirect=True, ids=_path_id) + def test_export_creation(self, export_res: "TestDecompMapper.ExecResults") -> None: + """The JSON export file exists.""" + check_click_result_allow_logs(export_res.res) + assert export_res.export_path.exists(), "Export file does not exist" + + @pytest.mark.parametrize("export_res", FW_TEST_BIN_PATHS, indirect=True, ids=_path_id) + def test_export_format(self, export_dump: ExportedDecompilation) -> None: + """The JSON export loads as an ExportedDecompilation object.""" + assert isinstance(export_dump, ExportedDecompilation), "Export cannot be loaded correctly" + + @pytest.mark.parametrize("export_res", FW_TEST_BIN_PATHS, indirect=True, ids=_path_id) + def test_functions_present(self, request, export_dump: ExportedDecompilation) -> None: + """The export records functions and binds them to the analysed binary.""" + bin_path: Path = request.node.callspec.params["export_res"] + assert export_dump.path.name == bin_path.name + assert len(list(export_dump.iter_functions())) > 0, "No function recorded" + + @pytest.mark.parametrize("export_res", FW_TEST_BIN_PATHS, indirect=True, ids=_path_id) + def test_function_addr_keys(self, export_dump: ExportedDecompilation) -> None: + """Every function is stored under its own (parser-space) address.""" + for addr, func in export_dump.functions.items(): + assert func.addr == addr, ( + f"{func.name} stored under {addr:#x} but addr is {func.addr:#x}" + ) + + @pytest.mark.parametrize("export_res", FW_TEST_BIN_PATHS, indirect=True, ids=_path_id) + def test_decompiled_source(self, export_dump: ExportedDecompilation) -> None: + """At least one non-imported function carries decompiled source.""" + with_source = [f for f in 
export_dump.iter_functions() if f.type != "imported" and f.source] + assert with_source, "No decompiled source recorded for any local function" diff --git a/tests/test_decomp_objects.py b/tests/test_decomp_objects.py new file mode 100644 index 0000000..9d20446 --- /dev/null +++ b/tests/test_decomp_objects.py @@ -0,0 +1,355 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023-2025 Quarkslab +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit tests for the decompilation export model (decomp_objects).""" + +import json +from pathlib import Path + +import pytest + +from pyrrha_mapper.mappers.decomp_objects import ( + ExportedDecompilation, + ExportedFunction, + ExportedLocation, +) +from pyrrha_mapper.mappers.objects import Symbol +from pyrrha_mapper.types import FuncType + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def callee_symbol() -> Symbol: + """:return: the symbol used as a callee function.""" + return Symbol(name="bar", demangled_name="bar", is_func=True, id=6, addr=0x2000) + + +@pytest.fixture +def caller_symbol() -> Symbol: + """:return: the symbol used as a caller function.""" + return Symbol(name="foo", demangled_name="foo(int)", is_func=True, id=5, addr=0x1000) + + +@pytest.fixture +def declaration_loc() -> ExportedLocation: + """:return: a location pointing at a function declaration.""" + return 
ExportedLocation(start_line=1, start_col=6, end_line=1, end_col=9) + + +@pytest.fixture +def call_loc() -> ExportedLocation: + """:return: a location pointing at a call site.""" + return ExportedLocation(start_line=2, start_col=3, end_line=2, end_col=6) + + +@pytest.fixture +def callee_func(callee_symbol: Symbol) -> ExportedFunction: + """:return: an ExportedFunction with no calls (a leaf callee).""" + return ExportedFunction(symbol=callee_symbol, type=FuncType.NORMAL, source="void bar(){}") + + +@pytest.fixture +def caller_func( + caller_symbol: Symbol, declaration_loc: ExportedLocation, call_loc: ExportedLocation +) -> ExportedFunction: + """:return: an ExportedFunction that calls bar (addr 0x2000).""" + return ExportedFunction( + symbol=caller_symbol, + type=FuncType.NORMAL, + calls=[0x2000], + callers=[], + source="void foo(int a){\n bar();\n}", + source_id=9, + declaration=declaration_loc, + source_calls_loc={0x2000: [call_loc]}, + ) + + +@pytest.fixture +def imported_func() -> ExportedFunction: + """:return: an imported (extern) ExportedFunction with no source.""" + return ExportedFunction( + symbol=Symbol(name="puts", demangled_name="puts", is_func=True, id=7, addr=0x3000), + type=FuncType.IMPORTED, + ) + + +@pytest.fixture +def example_decomp( + caller_func: ExportedFunction, + callee_func: ExportedFunction, + imported_func: ExportedFunction, +) -> ExportedDecompilation: + """:return: an ExportedDecompilation with caller, callee and an import.""" + return ExportedDecompilation( + path=Path("/bin/example"), + id=1, + functions={ + 0x1000: caller_func, + 0x2000: callee_func, + 0x3000: imported_func, + }, + ) + + +# --------------------------------------------------------------------------- +# ExportedLocation +# --------------------------------------------------------------------------- + + +class TestExportedLocation: + """Tests for the ExportedLocation model.""" + + def test_from_location(self) -> None: + """from_location copies the four coordinates.""" + 
from pyrrha_mapper.mappers.decomp_mapper import Location + + loc = Location(start_line=3, start_col=4, end_line=3, end_col=10) + exported = ExportedLocation.from_location(loc) + assert exported.as_tuple() == (3, 4, 3, 10) + + def test_as_tuple(self, call_loc: ExportedLocation) -> None: + """as_tuple returns the coordinates in declaration order.""" + assert call_loc.as_tuple() == (2, 3, 2, 6) + + def test_ordering(self) -> None: + """Locations are ordered by their dumped tuple.""" + small = ExportedLocation(start_line=1, start_col=1, end_line=1, end_col=2) + big = ExportedLocation(start_line=2, start_col=1, end_line=2, end_col=2) + assert small < big + assert small <= big + assert big > small + assert big >= small + assert small <= small + assert small >= small + + def test_roundtrip(self, call_loc: ExportedLocation) -> None: + """A location survives a JSON round-trip.""" + reloaded = ExportedLocation.model_validate_json(call_loc.model_dump_json()) + assert reloaded == call_loc + + +# --------------------------------------------------------------------------- +# ExportedFunction +# --------------------------------------------------------------------------- + + +class TestExportedFunction: + """Tests for the ExportedFunction model.""" + + def test_property_delegation(self, caller_func: ExportedFunction) -> None: + """id/name/demangled_name/addr delegate to the embedded symbol.""" + assert caller_func.id == 5 + assert caller_func.name == "foo" + assert caller_func.demangled_name == "foo(int)" + assert caller_func.addr == 0x1000 + + def test_id_setter(self, caller_func: ExportedFunction) -> None: + """Setting id updates the embedded symbol.""" + caller_func.id = 42 + assert caller_func.symbol.id == 42 + + def test_repr(self, caller_func: ExportedFunction) -> None: + """The repr uses the mangled name.""" + assert repr(caller_func) == "ExportedFunction('foo')" + + def test_non_func_symbol_rejected(self) -> None: + """A symbol with is_func=False cannot back an 
ExportedFunction.""" + with pytest.raises(ValueError): + ExportedFunction( + symbol=Symbol(name="data", demangled_name="data", is_func=False, addr=1), + type=FuncType.NORMAL, + ) + + def test_from_func_data(self) -> None: + """from_func_data converts a mapper FuncData into an ExportedFunction.""" + from pyrrha_mapper.mappers.decomp_mapper import FuncData, Location + + symbol = Symbol(name="foo", demangled_name="foo", is_func=True, id=5, addr=0x1000) + func = FuncData( + symbol=symbol, + type=FuncType.NORMAL, + calls=[0x2000], + callers=[0x500], + source="void foo(){ bar(); }", + source_id=9, + declaration=Location(1, 6, 1, 9), + ) + func.source_calls_loc[0x2000].append(Location(1, 13, 1, 16)) + + exported = ExportedFunction.from_func_data(func) + assert exported.name == "foo" + assert exported.calls == [0x2000] + assert exported.callers == [0x500] + assert exported.source_id == 9 + assert exported.declaration is not None + assert exported.declaration.as_tuple() == (1, 6, 1, 9) + assert exported.source_calls_loc[0x2000][0].as_tuple() == (1, 13, 1, 16) + + def test_from_func_data_no_declaration(self, callee_symbol: Symbol) -> None: + """from_func_data tolerates a missing declaration.""" + from pyrrha_mapper.mappers.decomp_mapper import FuncData + + func = FuncData( + symbol=callee_symbol, + type=FuncType.NORMAL, + calls=[], + callers=[], + source="", + ) + exported = ExportedFunction.from_func_data(func) + assert exported.declaration is None + assert exported.source_calls_loc == {} + + def test_roundtrip(self, caller_func: ExportedFunction) -> None: + """An ExportedFunction survives a JSON round-trip.""" + reloaded = ExportedFunction.model_validate_json(caller_func.model_dump_json()) + assert reloaded == caller_func + + +# --------------------------------------------------------------------------- +# ExportedDecompilation +# --------------------------------------------------------------------------- + + +class TestExportedDecompilation: + """Tests for the 
ExportedDecompilation model.""" + + def test_name_from_path(self, example_decomp: ExportedDecompilation) -> None: + """The name is derived from the path.""" + assert example_decomp.name == "example" + + def test_repr(self, example_decomp: ExportedDecompilation) -> None: + """The repr reports the path and the function count.""" + assert repr(example_decomp) == "ExportedDecompilation('/bin/example', funcs=3)" + + def test_function_exists(self, example_decomp: ExportedDecompilation) -> None: + """function_exists checks membership by address.""" + assert example_decomp.function_exists(0x1000) + assert not example_decomp.function_exists(0xDEAD) + + def test_function_name_exists(self, example_decomp: ExportedDecompilation) -> None: + """function_name_exists checks membership by mangled name.""" + assert example_decomp.function_name_exists("foo") + assert not example_decomp.function_name_exists("missing") + + def test_get_function_by_addr( + self, example_decomp: ExportedDecompilation, caller_func: ExportedFunction + ) -> None: + """get_function_by_addr retrieves the stored function.""" + assert example_decomp.get_function_by_addr(0x1000) == caller_func + + def test_get_function_by_name( + self, example_decomp: ExportedDecompilation, caller_func: ExportedFunction + ) -> None: + """get_function_by_name retrieves by mangled name and raises otherwise.""" + assert example_decomp.get_function_by_name("foo") == caller_func + with pytest.raises(KeyError): + example_decomp.get_function_by_name("missing") + + def test_add_function(self, callee_func: ExportedFunction) -> None: + """add_function stores a function under its address.""" + decomp = ExportedDecompilation(path=Path("/bin/x")) + decomp.add_function(callee_func) + assert decomp.function_exists(callee_func.addr) + assert decomp.get_function_by_addr(callee_func.addr) == callee_func + + def test_iter_functions(self, example_decomp: ExportedDecompilation) -> None: + """iter_functions yields every stored function.""" + names = 
sorted(f.name for f in example_decomp.iter_functions()) + assert names == ["bar", "foo", "puts"] + + def test_python_dump_keeps_int_keys(self, example_decomp: ExportedDecompilation) -> None: + """A python-mode dump keeps integer address keys.""" + dump = example_decomp.model_dump() + assert set(dump["functions"].keys()) == {0x1000, 0x2000, 0x3000} + + def test_json_dump_stringifies_keys(self, example_decomp: ExportedDecompilation) -> None: + """A JSON-mode dump stringifies the integer address keys.""" + dump_json = json.loads(example_decomp.model_dump_json()) + assert set(dump_json["functions"].keys()) == {"4096", "8192", "12288"} + + def test_roundtrip_equal(self, example_decomp: ExportedDecompilation) -> None: + """A full JSON round-trip preserves equality and int keys.""" + reloaded = ExportedDecompilation.model_validate_json(example_decomp.model_dump_json()) + assert reloaded == example_decomp + assert set(reloaded.functions.keys()) == {0x1000, 0x2000, 0x3000} + # nested int-keyed source_calls_loc is restored too + assert reloaded.functions[0x1000].source_calls_loc[0x2000][0].as_tuple() == (2, 3, 2, 6) + + def test_write_and_from_json_export( + self, example_decomp: ExportedDecompilation, tmp_path: Path + ) -> None: + """Calling write then from_json_export round-trips through a file.""" + export_path = tmp_path / "decomp.json" + example_decomp.write(export_path) + assert json.loads(export_path.read_text()), "exported data cannot be loaded as JSON" + reloaded = ExportedDecompilation.from_json_export(export_path) + assert reloaded == example_decomp + + def test_from_json_export_accepts_str_path( + self, example_decomp: ExportedDecompilation, tmp_path: Path + ) -> None: + """from_json_export also accepts a plain string path.""" + export_path = tmp_path / "decomp.json" + example_decomp.write(export_path) + reloaded = ExportedDecompilation.from_json_export(str(export_path)) + assert reloaded == example_decomp + + def test_validate_rejects_non_dict_functions(self) -> 
None: + """A non-dict functions payload is rejected.""" + with pytest.raises(ValueError): + ExportedDecompilation.model_validate({"path": "/bin/x", "functions": "nope"}) + + def test_validate_rejects_non_int_key(self, example_decomp: ExportedDecompilation) -> None: + """A function key that cannot be coerced to int is rejected.""" + dump = json.loads(example_decomp.model_dump_json()) + dump["functions"]["not_an_int"] = dump["functions"].pop("4096") + with pytest.raises(ValueError): + ExportedDecompilation.model_validate(dump) + + def test_validate_rejects_addr_key_mismatch(self, caller_func: ExportedFunction) -> None: + """A function stored under a key different from its symbol addr is rejected.""" + with pytest.raises(ValueError): + ExportedDecompilation(path=Path("/bin/x"), functions={0x9999: caller_func}) + + def test_from_mapper(self, monkeypatch: pytest.MonkeyPatch) -> None: + """from_mapper projects a DecompilMapper's bin and functions.""" + from pyrrha_mapper.mappers.decomp_mapper import FuncData + from pyrrha_mapper.mappers.objects import Binary + + symbol = Symbol(name="foo", demangled_name="foo", is_func=True, id=5, addr=0x1000) + + class _FakeMapper: + def __init__(self) -> None: + self.bin = Binary(path=Path("/bin/example"), id=1) + self.functions = { + 0x1000: FuncData( + symbol=symbol, + type=FuncType.NORMAL, + calls=[], + callers=[], + source="void foo(){}", + ) + } + + export = ExportedDecompilation.from_mapper(_FakeMapper()) # type: ignore[arg-type] + assert export.path == Path("/bin/example") + assert export.id == 1 + assert export.name == "example" + assert export.get_function_by_addr(0x1000).name == "foo" diff --git a/tests/test_filesystem_objects.py b/tests/test_filesystem_objects.py index 5df62dd..3658aab 100644 --- a/tests/test_filesystem_objects.py +++ b/tests/test_filesystem_objects.py @@ -22,8 +22,8 @@ import pytest -from pyrrha_mapper.common import Binary, FileSystem, Symbol, Symlink -from pyrrha_mapper.common.objects import TargetType 
+from pyrrha_mapper import Binary, FileSystem, Symbol, Symlink +from pyrrha_mapper.mappers.objects import TargetType class SerializedFS(NamedTuple): diff --git a/tests/test_fw/lib/libcrypto.so.1.1 b/tests/test_fw/lib/libcrypto.so.FOR_SONAME_TESTING similarity index 100% rename from tests/test_fw/lib/libcrypto.so.1.1 rename to tests/test_fw/lib/libcrypto.so.FOR_SONAME_TESTING