From 997a591a5b50014762fcbc1954a4d81eb91e945b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Fri, 27 Mar 2026 16:37:04 +0100 Subject: [PATCH 01/62] fs-cg: replace qbinary by direct interactions with disassemblers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- pyproject.toml | 2 + src/pyrrha_mapper/intercg/fwmapper.py | 72 +-- src/pyrrha_mapper/intercg/loader.py | 800 ++++++++++++++++++++------ 3 files changed, 646 insertions(+), 228 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4301a80..fef14b0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,8 @@ dependencies = [ 'rich', # InterCG mapper "qbinary>=0.0.3", # will also install idascript + "ida_domain", + "pyghidra" ] dynamic = ['version'] diff --git a/src/pyrrha_mapper/intercg/fwmapper.py b/src/pyrrha_mapper/intercg/fwmapper.py index 270a674..a8f1df5 100644 --- a/src/pyrrha_mapper/intercg/fwmapper.py +++ b/src/pyrrha_mapper/intercg/fwmapper.py @@ -16,11 +16,11 @@ """InterCGMapper implementation.""" import logging +import sys from collections import defaultdict +from hashlib import md5 from pathlib import Path from typing import Any -from hashlib import md5 -import sys # third-party imports from numbat import SourcetrailDB @@ -36,9 +36,8 @@ ) from pyrrha_mapper.exceptions import FsMapperError from pyrrha_mapper.fs import FileSystemImportsMapper -from pyrrha_mapper.intercg.loader import load_program -from pyrrha_mapper.types import ResolveDuplicateOption -from qbinary.types import Disassembler, ExportFormat +from pyrrha_mapper.intercg.loader import BinaryParser, GhidraParser, IDAParser +from pyrrha_mapper.types import Disassembler, Exporter, ResolveDuplicateOption IGNORE_LIST = ["__gmon_start__"] @@ -48,15 +47,10 @@ # Determine the command to open URLs based on the platform try: - URL_OPEN_CMD = { - "linux": "xdg-open", - "win32": "start", - "darwin": "open" - }[sys.platform] + URL_OPEN_CMD 
= {"linux": "xdg-open", "win32": "start", "darwin": "open"}[sys.platform] except KeyError: logging.warning(f"Unsupported platform: {sys.platform} (will not add URL handler)") - URL_OPEN_CMD = "" # type: ignore - + URL_OPEN_CMD = "" # type: ignore class InterImageCGMapper(FileSystemImportsMapper): @@ -64,8 +58,8 @@ class InterImageCGMapper(FileSystemImportsMapper): FS_EXT = ".fs.json" - DISASS = Disassembler.AUTO - EXPORT = ExportFormat.AUTO + DISASS = Disassembler.IDA + EXPORT = Exporter.NONE def __init__(self, root_directory: Path | str, db: SourcetrailDB | None): super(InterImageCGMapper, self).__init__(root_directory, db) @@ -98,10 +92,10 @@ def _correct_map_result(self, res: Any) -> bool: ) ) ) - + def load_binary_args(self) -> dict[str, Any]: """Return dict of args for load_binary that are always the same for the wholde firmware. - + Use to optimize multiprocessing. Set here there real values. """ res = super().load_binary_args() @@ -114,7 +108,7 @@ def load_binary( root_directory: Path, file_path: Path, disass: Disassembler = DISASS, - exporter: ExportFormat = EXPORT, + exporter: Exporter = EXPORT, ) -> tuple[Binary, dict[Symbol, list[str]] | None] | str: """Load all the binaries located in the filesystem as Binary objects. 
@@ -127,36 +121,26 @@ def load_binary( :param cache_file: Cache file to load binaries from (if exists) """ - res = FileSystemImportsMapper.load_binary(root_directory, file_path) - if isinstance(res, str): # error message - return res - else: - binary, _ = res - if binary.real_path is None: - return f"ERROR: Path on the filesystem of {binary.name} not set (skip)" - if not binary.real_path.exists(): - return ( - f"ERROR cannot find executable mentioned in 'fs' mapper: " - f"{binary.real_path.name} (skip)" - ) - try: - prefix = f"[binary mapping] {binary.name}" - unresolved_cg = load_program(binary, disass, exporter, prefix) - return binary, unresolved_cg + if disass == Disassembler.IDA: + ida_parser: BinaryParser = IDAParser(root_directory, file_path) + return ida_parser.binary, ida_parser.call_graph + elif disass == Disassembler.GHIDRA: + ghidra_parser = GhidraParser(root_directory, file_path) + return ghidra_parser.binary, ghidra_parser.call_graph + else: + return f" disassembler {disass} is not supported" except (FileNotFoundError, FsMapperError, SyntaxError) as e: - logging.error(f"ERROR: Loading error: {binary.name}: {e}") - return binary, None - + return f"[binary mapping] {file_path.name}: ERROR: Loading error: {e}" def add_url_handler(self, hash: str, binary: Binary, symbol: Symbol) -> None: - """ Open the function using a dedicated URL handler. (Use Heimdallr) """ + """Open the function using a dedicated URL handler. 
(Use Heimdallr)""" if not hash: return # no hash, no URL handler if URL_OPEN_CMD: - url = f"disas://{hash}?idb={binary.name+'.i64'}&offset={symbol.addr:#08x}" + url = f"disas://{hash}?idb={binary.name + '.i64'}&offset={symbol.addr:#08x}" cmd: list[str] = ["xdg-open", url] - self.db_interface.set_custom_command(symbol.id, cmd, "Open in Disassembler") # type: ignore + self.db_interface.set_custom_command(symbol.id, cmd, "Open in Disassembler") # type: ignore else: pass # Can't add URL unsuported platform @@ -181,16 +165,16 @@ def map_binary( self._record_custom_command(bin_object, f"[bin mapping] {bin_object.name}") def symbol_recorded(self, binary: Binary, symbol: Symbol) -> None: - """ - Register a symbol recorded handler to add a custom command. - """ + """Register a symbol recorded handler to add a custom command.""" self.add_url_handler(self._current_binary_hash, binary, symbol) def _treat_bin_parsing_result(self, path: Path, res: Any): """Handle load_binary res, map it or display error.""" - log_prefix = f"[binary mapping] {path.name}" + log_prefix = f"[binary parsing] {path.name}" if isinstance(res, str): logging.error(f"{log_prefix}: {res}") + elif isinstance(res, BaseException): + logging.error(f"{log_prefix}: {repr(res)}") elif self._correct_map_result(res): bin_obj, additional_info = res self.map_binary(bin_obj, additional_info) @@ -198,7 +182,7 @@ def _treat_bin_parsing_result(self, path: Path, res: Any): self.map_binary(res[0], None) logging.info(f"{log_prefix}: fallback to lief results, internal analysis failed") else: - logging.warning(f"{log_prefix}: impossible to parse the following result {res}") + logging.warning(f"{log_prefix}: impossible to parse the following result {res.args}") def map_binaries_main(self, threads: int, progress: Progress) -> None: """Parse and map binaries of a given directory. 
diff --git a/src/pyrrha_mapper/intercg/loader.py b/src/pyrrha_mapper/intercg/loader.py index ff53778..117a1e1 100644 --- a/src/pyrrha_mapper/intercg/loader.py +++ b/src/pyrrha_mapper/intercg/loader.py @@ -16,234 +16,666 @@ """Load information used by InterCGMapper from the files on the disk.""" import logging +from abc import abstractmethod +from collections.abc import Iterator +from enum import StrEnum +from pathlib import Path from typing import NamedTuple -# third-party imports -from qbinary import Program, FunctionType, DisassExportNotImplemented, ExportException, \ - Disassembler, ExportFormat - -# local imports from pyrrha_mapper.common import Binary, Symbol from pyrrha_mapper.exceptions import FsMapperError +from pyrrha_mapper.fs import FileSystemImportsMapper +class FuncType(StrEnum): + """Represent the type of a function.""" -def load_program(binary: Binary, disass: Disassembler, - export: ExportFormat, log_prefix: str = "") -> dict[Symbol, list[str]]: - """Create a Binary object from a given file using lief and qbinary. - - It modifies the provided binary object in place. - - In order, it performs the following actions: - 1. load the program object - 2. use lief to extract exported symbols (handle conflicts with IDA names) - 3. checks if exported functions have been missed by IDA (but referenced in LIEF) - 4. Mangle the call graph to make external call to .PLT to directly jump on the - external symbol + IMPORTED = "imported" + LIBRARY = "library" + NORMAL = "normal" + THUNK = "thunk" - raise: FsMapperError if cannot load it - :param binary: a Binary object that will be completed - :param disass: Disassembler enum to use for program loading - :param export: Export format to use for program loading +class FuncData(NamedTuple): + """Store function data collected by the binary parser. - :return: a dict of called done by each symbol of the binary + All addresses are in **parser space** (the native address space of the + underlying tool — IDA, Ghidra, etc.). 
""" - file_path = binary.real_path - if file_path is None: - raise FileNotFoundError(file_path) - - try: - program = Program.from_binary(file_path, - export_format=export, - disassembler=disass, - timeout=600, # TODO: Receive through command line ? - override=False) # if export exists use it - # Load the call graph - return compute_call_graph(binary, program, log_prefix) # type: ignore - except DisassExportNotImplemented as e: - logging.error(f"Disassembler {disass} does not support export format {export}: {e}") - raise FsMapperError(f"{e}") from e - except ExportException as e: - logging.error(f"Error while loading binary {file_path}: {e}") - raise FsMapperError(f"{e}") from e - return None - - -class _FuncData(NamedTuple): + symbol: Symbol - type: FunctionType + type: FuncType calls: list[int] callers: list[int] @property def name(self) -> str: + """:return: mangled name of the function""" return self.symbol.name @property def demangled_name(self) -> str: + """:return: demangled name of the function""" return self.symbol.demangled_name @property def addr(self) -> int: + """:return: address of the function in the Binary""" assert self.symbol.addr return self.symbol.addr -def _generate_calls_list(func: _FuncData, call_graph: dict[int, _FuncData], log_prefix: str) -> list[str]: - """Given a function return its call list. +def _count_leading_underscores(name: str) -> int: + """:return: the number of leading underscores/dots in name""" + return len(name) - len(name.lstrip("_.")) + - It only contains functions that are contained in the call graph and have a name. +class BinaryParser: + """Abstract base class that parses a binary and extracts call-graph data. + + Subclasses implement the parser-specific methods (IDA, Ghidra, …). + Adresses are the one used in the backend, which can differ from LIEF ones + (relative vs virtual). 
""" - res = list() - for c in [call_graph[x] for x in func.calls if x in call_graph]: - if c.name: # Has a true name - res.append(c.name) # Add it normally - else: # ignore function without name - logging.warning( - f"{log_prefix}: {func.symbol} calls a function without name (at {c.addr:#08x})" + + def __init__(self, root_directory: Path, file_path: Path) -> None: + self.log_prefix = f"[binary parsing] {file_path.name}" + self._binary = self._generate_lief_bin(root_directory, file_path) + self._initiate_bin_parser(root_directory, file_path, self._binary.image_base) + + image_base = self._binary.image_base + + # Remap LIEF export addresses to parser space once. + # Keys are parser-space addresses; values are lists of LIEF Symbols. + parser_exports: dict[int, list[Symbol]] = { + lief_addr - image_base: symbols + for lief_addr, symbols in self._binary.exported_funcs_by_addr.items() + } + + # ------------------------------------------------------------------ + # Step 1 — combine parser functions with LIEF export metadata + # ------------------------------------------------------------------ + program_data: dict[int, FuncData] = self._combine_program_analysis_binary(parser_exports) + + # ------------------------------------------------------------------ + # Step 2 — find exported symbols not discovered by the parser and + # add them to the call graph with an empty call list + # ------------------------------------------------------------------ + parser_addrs: set[int] = set(self._iter_func_addr()) + call_graph: dict[Symbol, list[str]] = {} + + for parser_addr, symbols in parser_exports.items(): + if parser_addr in parser_addrs: + continue + canon = self._disambiguate_export(symbols) + # ARM THUMB: parser may use address - 1 (THUMB bit cleared) + if self._is_func_start(parser_addr - 1): + if self._func_mangled_name(parser_addr - 1) in {s.name for s in symbols}: + continue + logging.debug( + f"{self.log_prefix}: export {canon.name} @ {parser_addr:#x} " + f"not found in 
parser output" + ) + call_graph[canon] = [] + if len(symbols) > 1: + for sym in symbols: + self._binary.replace_function(canon, sym, True) + + # ------------------------------------------------------------------ + # Step 3 — build the call graph, resolving thunk trampolines + # ------------------------------------------------------------------ + # Maps a trampoline name → the canonical name it should forward to. + trampoline_map: dict[str, str] = {} + + for func_data in program_data.values(): + exported = ( + func_data.addr in parser_exports + or func_data.addr + 1 in parser_exports # ARM THUMB ) - return res + if func_data.type in (FuncType.LIBRARY, FuncType.NORMAL) or ( + func_data.type == FuncType.THUNK and (exported or len(func_data.calls) > 1) + ): + call_graph[func_data.symbol] = self._build_calls_list(func_data, program_data) + continue -def combine_program_analysis_binary(binary: Binary, program: Program, log_prefix: str) -> dict[int, _FuncData]: - """Combine program and binary objects by computing useful data. 
+ if ( + func_data.type == FuncType.THUNK + and len(func_data.calls) == 1 + and func_data.calls[0] in program_data + ): + callee_data = program_data[func_data.calls[0]] + if callee_data.type == FuncType.IMPORTED: + # Keep the less-decorated name as the canonical one + trampoline_name = func_data.name + destination_name = callee_data.name + if _count_leading_underscores(trampoline_name) > _count_leading_underscores( + destination_name + ): + trampoline_name, destination_name = destination_name, trampoline_name + else: + trampoline_name = func_data.name + destination_name = callee_data.name + + # Resolve chains: A→B, B→C becomes A→C + while ( + destination_name in trampoline_map + and trampoline_map[destination_name] != destination_name + ): + destination_name = trampoline_map[destination_name] + trampoline_map[trampoline_name] = destination_name + for key, val in trampoline_map.items(): + if val == trampoline_name: + trampoline_map[key] = destination_name + + elif func_data.type == FuncType.THUNK and not func_data.calls and func_data.callers: + # Terminal thunk with callers but no callees — keep it + continue - It updates binary object if new functions are determined. 
+ # Remove functions not kept as exported/library/normal + if self._binary.get_function_by_name(func_data.name).addr == func_data.addr: + self._binary.remove_function(func_data.name) - :param binary: binary object to update, contain data already analyzed - :param program: Program object in which to extract data - :return: a dict [addr, FuncData object associated to this address] - """ - exports = binary.exported_funcs_by_addr - program_data: dict[int, _FuncData] = {} - for f_addr, f in program.items(): - if f_addr in exports or f_addr + 1 in exports: # function exported (and visible in LIEF) - all_symbs = exports.get( - f_addr, exports.get(f_addr + 1, []) - ) # In THUMB mode address is address+1 - f_symb = disambiguate_export(all_symbs, log_prefix) - if f.name != f_symb.demangled_name: - logging.debug(f"{log_prefix}: change fun name {f.name} -> {f_symb.demangled_name}") - if len(all_symbs) > 1: # all the symbols will point on the chosen one - map(lambda x: binary.replace_function(f_symb, x, True), all_symbs) - else: - f_symb = Symbol(name=f.mangled_name, demangled_name=f.name, is_func=True, addr=f_addr) - binary.add_function(f_symb) - - program_data[f_addr] = _FuncData( - symbol=f_symb, - type=f.type, - calls=list(f.children), - callers=list(f.parents), - ) - return program_data + # Apply trampoline substitutions to the final call graph + self._call_graph: dict[Symbol, list[str]] = { + sym: [trampoline_map.get(c, c) for c in callees] for sym, callees in call_graph.items() + } + self._close_bin_parser() -def compute_call_graph(binary: Binary, program: Program, log_prefix: str = "") -> dict[Symbol, list[str]]: - """Compute the call graph of the program using Quokka/Binexport. + # ------------------------------------------------------------------ + # Useful public properties + # ------------------------------------------------------------------ - It fill the call attribute of binary. 
+ @property + def binary(self) -> Binary: + """:return: the Binary produced by the parser.""" + return self._binary - :param binary: binary object to update, contain data already analyzed - :param program: Program object in which to extract data - """ + @property + def call_graph(self) -> dict[Symbol, list[str]]: + """:return: mapping from each Symbol to its list of callee names.""" + return self._call_graph + + # ------------------------------------------------------------------ + # Abstract interface — implemented by each parser backend + # ------------------------------------------------------------------ + + @abstractmethod + def _initiate_bin_parser(self, root_directory: Path, file_path: Path, image_base: int = 0): + """Open the binary parser and run any required analysis.""" + + @abstractmethod + def _close_bin_parser(self): + """Close the binary parser and release all resources.""" + + @abstractmethod + def _is_func_start(self, addr: int) -> bool: + """:return: True if *addr* (parser space) is the entry point of a function.""" + + @abstractmethod + def _iter_func_addr(self) -> Iterator[int]: + """Yield the parser-space entry-point address of every known function.""" + + @abstractmethod + def _func_mangled_name(self, addr: int) -> str: + """:return: the raw (mangled) name of the function at *addr*""" + + @abstractmethod + def _func_demangled_name(self, addr: int) -> str: + """:return: the demangled name of the function at *addr*""" + + @abstractmethod + def _func_children(self, addr: int) -> list[int]: + """:return: entry-point addresses of callees of the function at *addr*.""" + + @abstractmethod + def _func_parents(self, addr: int) -> list[int]: + """:return: entry-point addresses of callers of the function at *addr*.""" + + @abstractmethod + def _func_type(self, addr: int) -> FuncType: + """:return: the FuncType of the function at *addr*. 
+ + Thunk stubs that resolve to external/imported functions must return + ``FuncType.IMPORTED`` so the trampoline resolution in ``__init__`` + correctly forwards callers to the imported symbol name. + """ + + # ------------------------------------------------------------------ + # Concrete helpers + # ------------------------------------------------------------------ + + def _generate_lief_bin(self, root_directory: Path, file_path: Path) -> Binary: + """Load the binary via LIEF and return a populated Binary object. + + :raises FsMapperError: on load failure or missing path information. + """ + result = FileSystemImportsMapper.load_binary(root_directory, file_path) + if isinstance(result, str): + raise FsMapperError(result) + lief_binary, _ = result + if lief_binary.real_path is None: + raise FsMapperError(f"{self.log_prefix}: real_path not set (skip)") + if not lief_binary.real_path.exists(): + raise FsMapperError(f"{self.log_prefix}: executable not found (skip)") + return lief_binary + + def _build_calls_list( + self, + func: FuncData, + call_graph: dict[int, FuncData], + ) -> list[str]: + """Given a function return its call list. + + It only contains functions that are contained in the call graph and have a name. + + :return: a list of string (function names) + """ + res: list[str] = list() + for callee in [call_graph[addr] for addr in func.calls if addr in call_graph]: + if callee.name is not None and callee.name != "": + res.append(callee.name) + else: + logging.warning( + f"{self.log_prefix}: {func.symbol} calls unnamed function @ {callee.addr:#08x}" + ) + return res + + def _combine_program_analysis_binary( + self, + parser_exports: dict[int, list[Symbol]], + ) -> dict[int, FuncData]: + """Build a ``{parser_addr: FuncData}`` dict merging parser and LIEF data. + + For each function discovered by the parser: + + - If its parser-space address matches a LIEF export entry, the export + Symbol is used. 
+ - Otherwise a new internal Symbol is created — unless the function name + matches a known imported symbol (e.g. a PLT stub already tracked by + LIEF as an import), in which case the function is skipped entirely. + + :param parser_exports: LIEF exports already remapped to parser space. + :return: mapping from parser-space address to FuncData. + """ + imported_names: set[str] = set(self._binary.imported_symbol_names) + program_data: dict[int, FuncData] = {} + + for parser_addr in self._iter_func_addr(): + if parser_addr in parser_exports or parser_addr + 1 in parser_exports: + # Exported function — adopt the LIEF symbol + symbols = parser_exports.get(parser_addr, parser_exports.get(parser_addr + 1, [])) + func_symbol = self._disambiguate_export(symbols) + parser_name = self._func_demangled_name(parser_addr) + if parser_name != func_symbol.demangled_name: + logging.debug( + f"{self.log_prefix}: rename {parser_name} → {func_symbol.demangled_name}" + ) + if len(symbols) > 1: + for sym in symbols: + self._binary.replace_function(func_symbol, sym, True) + else: + # Internal function — create a new Symbol in parser space + mangled_name = self._func_mangled_name(parser_addr) + # Skip PLT stubs and functions already tracked as imports by LIEF + if mangled_name in imported_names: + continue + func_symbol = Symbol( + name=mangled_name, + demangled_name=self._func_demangled_name(parser_addr), + is_func=True, + addr=parser_addr, + ) + self._binary.add_function(func_symbol) + + program_data[parser_addr] = FuncData( + symbol=func_symbol, + type=self._func_type(parser_addr), + calls=self._func_children(parser_addr), + callers=self._func_parents(parser_addr), + ) - def _nb_initial_underscore(x: str) -> int: - return len(x) - len(x.strip("_.")) + return program_data - # Call graph fun_name -> [callee_name1, callee_name2] - call_graph: dict[Symbol, list[str]] = {} - exports = binary.exported_funcs_by_addr + def _disambiguate_export(self, symbols: list[Symbol]) -> Symbol: + """Choose 
the most appropriate Symbol when multiple share the same address. - # Combine program and binary objects by computing useful data - program_data = combine_program_analysis_binary(binary, program, log_prefix) + Prefers the shortest name that does not start with ``_``. + Falls back to the globally shortest name if all names start with ``_``. + """ + if len(symbols) == 1: + return symbols[0] - # Check if some exports don't have any associated function (not detected by IDA) - for exp_addr in exports.keys() - program.keys(): - all_symbs = exports[exp_addr] - canon = disambiguate_export(all_symbs, log_prefix) - if p_fun := program.get(exp_addr - 1): - # IDA keeps ARM address while LIEF use THUMB addresses - if p_fun.mangled_name in [s.name for s in all_symbs]: - # Check that we have a match on names + chosen: Symbol | None = None + for sym in symbols: + if sym.demangled_name.startswith("_"): continue - # else case - logging.debug(f"{log_prefix}: export {canon.name}: {hex(exp_addr)} address not found in program.") - call_graph[canon] = [] - if len(all_symbs) > 1: # all the symbols will point on the chosen one - map(lambda x: binary.replace_function(canon, x, True), all_symbs) - - # Iterate back the temporary dict to fill the real call graph - # The deal here is to fast-forward call to imported function directly on the - # imported symbol and not on the PLT (to make the graph more straightforward) - removed_trampoline: dict[str, str] = dict() - for f in program_data.values(): - if ( - f.type in [FunctionType.normal, FunctionType.library] - # If thunk AND exported or thunk AND call several func, keep it (for later resolution) - or ( - f.type == FunctionType.thunk - and ((f.addr in exports) or (f.addr + 1 in exports) or len(f.calls) > 1) + if chosen is None or len(sym.demangled_name) < len(chosen.demangled_name): + chosen = sym + + if chosen is None: + logging.debug( + f"{self.log_prefix}: all exports start with '_', " + f"picking shortest: {[s.demangled_name for s in 
symbols]}" ) + chosen = min(symbols, key=lambda s: len(s.demangled_name)) + + return chosen + + +# ====================================================================== +# IDA Pro backend +# ====================================================================== + + +class IDAParser(BinaryParser): + """BinaryParser implementation using IDA Pro as the analysis backend.""" + + def _initiate_bin_parser(self, root_directory: Path, file_path: Path, image_base: int = 0): + """Open the IDA database, running auto-analysis if needed.""" + from ida_domain.database import Database, IdaCommandOptions + + self._ida_cached_func = None # single-entry cache used by _get_ida_func + self._ida_db = Database.open( + str(root_directory / file_path), + args=IdaCommandOptions(auto_analysis=True, new_database=False), + ) + + def _close_bin_parser(self): + """Close the IDA database without saving.""" + self._ida_db.close(save=False) + + def _get_ida_func(self, addr: int): + """:return: the IDA function at *addr*, using a single-entry cache.""" + if self._ida_cached_func is not None and addr == self._ida_cached_func.start_ea: + return self._ida_cached_func + return self._ida_db.functions.get_at(addr) + + def _is_func_start(self, addr: int) -> bool: + """:return: True if *addr* is the entry point of a known IDA function.""" + from ida_domain.base import InvalidEAError + + try: + if self._ida_cached_func is not None and addr == self._ida_cached_func.start_ea: + return True + func = self._ida_db.functions.get_at(addr) + return func is not None and func.start_ea == addr + except InvalidEAError: + return False + + def _iter_func_addr(self) -> Iterator[int]: + """Yield the entry-point address of every function known to IDA.""" + for func in self._ida_db.functions.get_all(): + self._ida_cached_func = func + yield func.start_ea + + def _func_mangled_name(self, addr: int) -> str: + """:return: the raw name of the function at *addr*, or ``sub_``.""" + func = self._get_ida_func(addr) + if func is not 
None: + name = self._ida_db.functions.get_name(func) + if name: + return name + return f"sub_{addr:X}" + + def _func_demangled_name(self, addr: int) -> str: + """:return: the demangled name, falling back to the mangled name.""" + mangled = self._func_mangled_name(addr) + demangled = self._ida_db.names.demangle_name(mangled) + return demangled if demangled is not None else mangled + + def _func_children(self, addr: int) -> list[int]: + """:return: parser-space addresses of callees of the function at *addr*.""" + func = self._get_ida_func(addr) + if func is None: + return [] + return [callee.start_ea for callee in self._ida_db.functions.get_callees(func)] + + def _func_parents(self, addr: int) -> list[int]: + """:return: parser-space addresses of callers of the function at *addr*.""" + func = self._get_ida_func(addr) + if func is None: + return [] + return [caller.start_ea for caller in self._ida_db.functions.get_callers(func)] + + def _func_type(self, addr: int) -> FuncType: + """:return: the FuncType of the function at *addr*. + + Thunks whose sole callee is an imported symbol are classified as + ``IMPORTED`` so the trampoline resolution correctly forwards callers. 
+ """ + from ida_domain.functions import FunctionFlags + + func = self._get_ida_func(addr) + if func is None: + return FuncType.NORMAL + + match self._ida_db.functions.get_flags(func): + case FunctionFlags.LIB: + return FuncType.LIBRARY + case FunctionFlags.THUNK: + callees = list(self._ida_db.functions.get_callees(func)) + if len(callees) == 1: + callee_name = self._ida_db.functions.get_name(callees[0]) + if self._ida_db.imports.exists(callee_name): + return FuncType.IMPORTED + return FuncType.THUNK + case _: + func_name = self._ida_db.functions.get_name(func) + if self._ida_db.imports.exists(func_name): + return FuncType.IMPORTED + return FuncType.NORMAL + + +# ====================================================================== +# Ghidra backend +# ====================================================================== + + +class GhidraParser(BinaryParser): + """BinaryParser backed by Ghidra 12.0+ via PyGhidra.""" + + def _initiate_bin_parser(self, root_directory: Path, file_path: Path, image_base: int = 0): + """Start the JVM, open and fully analyse the binary, and initialise handles.""" + import os + import tempfile + + import pyghidra + + # Initialise all attributes upfront so _close_bin_parser is always safe + self._pyghidra_ctx = None + self._ghidra_program = None + self._ghidra_project_dir: Path | None = None + self._ghidra_func_manager = None + self._ghidra_symbol_table = None + self._ghidra_ext_manager = None + self._ghidra_demangler = None + self._ghidra_cached_func = None + self._ghidra_load_base: int = 0 + self._ghidra_monitor = None + + full_path = root_directory / file_path + self._ghidra_project_dir = Path(tempfile.mkdtemp(prefix=f"ghidra_{os.getpid()}_")) + + # Start the JVM once per worker process (no-op if already running) + if not pyghidra.started(): + from pyghidra.launcher import HeadlessPyGhidraLauncher + + launcher = HeadlessPyGhidraLauncher() + launcher.add_vmargs("-Xms512m", "-Xmx2g", "-XX:+UseG1GC") + launcher.start() + + # Ghidra 
imports must come after JVM start + from ghidra.app.util.demangler.gnu import GnuDemangler + from ghidra.util.task import ConsoleTaskMonitor + + self._ghidra_monitor = ConsoleTaskMonitor() + + # open_program(analyze=True) runs full blocking analysis and correctly + # populates all cross-references including the call graph. + # Note: open_program is deprecated in PyGhidra 3.0 but is currently + # the only reliable path for complete headless analysis. + self._pyghidra_ctx = pyghidra.open_program( + str(full_path), + project_location=str(self._ghidra_project_dir), + project_name="p", + analyze=True, + ) + flat_api = self._pyghidra_ctx.__enter__() + program = flat_api.getCurrentProgram() + + self._ghidra_program = program + self._ghidra_load_base = program.getImageBase().getOffset() + self._ghidra_func_manager = program.getFunctionManager() + self._ghidra_symbol_table = program.getSymbolTable() + self._ghidra_ext_manager = program.getExternalManager() + + demangler = GnuDemangler() + self._ghidra_demangler = demangler if demangler.canDemangle(program) else None + + def _close_bin_parser(self): + """Exit the PyGhidra context and delete the temporary project directory.""" + import shutil + + if self._pyghidra_ctx is not None: + try: + self._pyghidra_ctx.__exit__(None, None, None) + except Exception: + pass + if self._ghidra_project_dir is not None: + shutil.rmtree(self._ghidra_project_dir, ignore_errors=True) + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _to_ghidra_address(self, parser_addr: int): + """Convert a parser-space address to a Ghidra ``Address`` object. + + Adds ``_ghidra_load_base`` to restore the absolute Ghidra address, then + masks to a signed 64-bit integer to satisfy JPype's type requirements. 
+ """ + abs_addr = (parser_addr + self._ghidra_load_base) & 0xFFFFFFFFFFFFFFFF + if abs_addr >= 0x8000000000000000: + abs_addr -= 0x10000000000000000 + return ( + self._ghidra_program.getAddressFactory().getDefaultAddressSpace().getAddress(abs_addr) + ) + + def _to_parser_addr(self, ghidra_offset: int) -> int: + """Convert an absolute Ghidra address offset to parser space.""" + return ghidra_offset - self._ghidra_load_base + + def _get_ghidra_func(self, parser_addr: int): + """:return: the Ghidra Function at *parser_addr*, with a single-entry cache.""" + if ( + self._ghidra_cached_func is not None + and self._to_parser_addr(self._ghidra_cached_func.getEntryPoint().getOffset()) + == parser_addr ): - call_graph[f.symbol] = _generate_calls_list(f, program_data, log_prefix) - continue - - # Replace thunk calling only one function (and only one) - elif f.type == FunctionType.thunk and len(f.calls) == 1 and f.calls[0] in program_data: - sub_callee = program_data[f.calls[0]] - if sub_callee.type == FunctionType.imported: - # Keep the name of the thunk "strcpy, sprintf" - name, target = sub_callee.name, f.name - # in case of nested functions (starting with _, keep the less nested one) - if _nb_initial_underscore(target) > _nb_initial_underscore(name): - name, target = target, name - else: # Forward the call to the underlying function name - name, target = f.name, sub_callee[0].name - # resolve trampoline and update associated dict - while target in removed_trampoline and removed_trampoline[target] != target: - target = removed_trampoline[target] - removed_trampoline[name] = target - for key, val in removed_trampoline.items(): - if val == name: - removed_trampoline[key] = target - - # If terminal thunk keep it in binary - elif f.type == FunctionType.thunk and len(f.calls) == 0 and len(f.callers) > 0: - continue - - # remove any function not explicitely kept (THUNK, IMPORTED, EXTERN) - if binary.get_function_by_name(f.name).addr == f.addr: - binary.remove_function(f.name) - 
- return { - symb: [removed_trampoline[c] if c in removed_trampoline else c for c in calls] - for symb, calls in call_graph.items() - } - - -def disambiguate_export(symbs: list[Symbol], log_prefix: str = "") -> Symbol: - """Given a list of symbols associated with one address, chose one.""" - if len(symbs) == 1: - return symbs[0] # If only one no ambiguity - - chosen = None - for symb in symbs: - if symb.demangled_name.startswith("_"): - continue - if chosen is None: - chosen = symb - elif chosen == symb: - continue - else: - # print(f"multiple options for name: {chosen}, {name}") - if len(symb.demangled_name) < len(chosen.demangled_name): - chosen = symb - - # all exports starts with _ - if chosen is None: - options = [s.demangled_name for s in symbs] - logging.debug(f"{log_prefix}: cannot disambiguate, select shortest name: {options}") - chosen = min(symbs, key=lambda x: len(x.demangled_name)) - return chosen + return self._ghidra_cached_func + return self._ghidra_func_manager.getFunctionAt(self._to_ghidra_address(parser_addr)) + + # ------------------------------------------------------------------ + # BinaryParser interface + # ------------------------------------------------------------------ + + def _is_func_start(self, addr: int) -> bool: + """:return: True if *addr* (parser space) is a known Ghidra function entry.""" + return self._ghidra_func_manager.getFunctionAt(self._to_ghidra_address(addr)) is not None + + def _iter_func_addr(self) -> Iterator[int]: + """Yield parser-space entry-point addresses of every Ghidra function.""" + for func in self._ghidra_func_manager.getFunctions(True): + self._ghidra_cached_func = func + yield self._to_parser_addr(func.getEntryPoint().getOffset()) + + def _func_mangled_name(self, addr: int) -> str: + """:return: the raw name of the function at *addr*, or ``sub_``.""" + func = self._get_ghidra_func(addr) + if func is not None: + name = func.getName() + if name: + return name + return f"sub_{addr:X}" + + def 
_func_demangled_name(self, addr: int) -> str: + """:return: the demangled name, falling back to the mangled name.""" + mangled = self._func_mangled_name(addr) + if self._ghidra_demangler is not None: + try: + result = self._ghidra_demangler.demangle(mangled, True) + if result is not None: + return result.getSignature(False) + except Exception: + pass + return mangled + + def _func_children(self, addr: int) -> list[int]: + """:return: parser-space addresses of callees of the function at *addr*. + + External callees are not returned here — they are handled by + ``_func_type`` classifying their PLT thunk stubs as ``FuncType.IMPORTED``. + """ + func = self._get_ghidra_func(addr) + if func is None: + return [] + + seen: set[str] = set() + result: list[int] = [] + for callee in func.getCalledFunctions(self._ghidra_monitor): + name = callee.getName() + if name in seen: + continue + seen.add(name) + result.append(self._to_parser_addr(callee.getEntryPoint().getOffset())) + return result + + def _func_parents(self, addr: int) -> list[int]: + """:return: parser-space addresses of callers of the function at *addr*.""" + func = self._get_ghidra_func(addr) + if func is None: + return [] + + seen: set[str] = set() + result: list[int] = [] + for caller in func.getCallingFunctions(self._ghidra_monitor): + name = caller.getName() + if name in seen: + continue + seen.add(name) + result.append(self._to_parser_addr(caller.getEntryPoint().getOffset())) + return result + + def _func_type(self, addr: int) -> FuncType: + """:return: the FuncType of the function at *addr* (parser space). + + Thunk stubs that resolve to external functions are classified as + ``IMPORTED`` so the trampoline resolution in ``BinaryParser`` correctly + forwards all callers to the imported symbol name. 
+ """ + func = self._get_ghidra_func(addr) + if func is None: + return FuncType.NORMAL + + if func.isExternal(): + return FuncType.IMPORTED + + if func.isThunk(): + # Resolve thunk chain; classify as IMPORTED if it ends at an external + thunked = func.getThunkedFunction(True) + if thunked is not None and thunked.isExternal(): + return FuncType.IMPORTED + return FuncType.THUNK + + # Heuristic: function in a namespace matching a known external library + from ghidra.program.model.symbol import SourceType + + symbol = self._ghidra_symbol_table.getPrimarySymbol(self._to_ghidra_address(addr)) + if symbol is not None and symbol.getSource() == SourceType.ANALYSIS: + namespace = func.getParentNamespace() + if namespace is not None and self._ghidra_ext_manager.contains(namespace.getName(True)): + return FuncType.LIBRARY + + return FuncType.NORMAL From 540357e6fb642080623962b70aeb06b16012d61f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Mon, 30 Mar 2026 17:36:25 +0200 Subject: [PATCH 02/62] [fix] fs: multiprocess error handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- src/pyrrha_mapper/fs/imports_mapper.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/pyrrha_mapper/fs/imports_mapper.py b/src/pyrrha_mapper/fs/imports_mapper.py index b92092d..ebd798b 100644 --- a/src/pyrrha_mapper/fs/imports_mapper.py +++ b/src/pyrrha_mapper/fs/imports_mapper.py @@ -154,26 +154,30 @@ def load_binary(root_directory: Path, file_path: Path) -> tuple[Binary, Any] | s @classmethod def parse_binary_job(cls, ingress: Queue, egress: Queue, parse_func: Callable) -> None: - """Parse an executable file and create the associated Binary object. + """Parse an executable file and create the associated Binary object, used to multiprocess. - It is used for multiprocessing. 
- :param ingress: input Queue, contain a Path - :param egress: output Queue, send back (file path, Binary result or - logging string if an issue happen) - :param parse_func: func which take a path as argument (called file_path) and parse it + :param ingress: input Queue, contains Path items or None as a stop sentinel + :param egress: output Queue, sends back (file path, Binary result) or + (file path, Exception) if an issue occurred + :param parse_func: func which takes a path as argument (called file_path) and parses it """ while True: try: path = ingress.get(timeout=0.5) - try: - egress.put((path, parse_func(file_path = path))) - except Exception as e: - egress.put((path, e)) except queue.Empty: - pass + continue except KeyboardInterrupt: break + if path is None: + break + + try: + egress.put((path, parse_func(file_path=path))) + except Exception as e: + logging.error(f"[worker] Failed on {path}: {e}") + egress.put((path, e)) + def map_binary(self, bin_object: Binary, additional_res: Any = None) -> None: """Given a Binary object add it to the DB. From 01570a1fb57add1416144e0cb073fa60db8e2871 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Mon, 30 Mar 2026 17:36:53 +0200 Subject: [PATCH 03/62] objects, fs: add Binary image base MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- src/pyrrha_mapper/common/objects.py | 3 +++ src/pyrrha_mapper/fs/imports_mapper.py | 7 ++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/pyrrha_mapper/common/objects.py b/src/pyrrha_mapper/common/objects.py index 1c4dd9a..32a3c0a 100644 --- a/src/pyrrha_mapper/common/objects.py +++ b/src/pyrrha_mapper/common/objects.py @@ -112,6 +112,9 @@ class Binary(FileSystemComponent): default_factory=dict ) # dict(symbol_name, list(requirements)) + # Runtime-only field: virtual address at which the binary is loaded. 
+ image_base: int = Field(default=0, exclude=True) + @field_validator("internal_functions", "exported_functions", mode="after") @classmethod def validate_functions_field(cls, value: dict[str, Symbol]) -> dict[str, Symbol]: diff --git a/src/pyrrha_mapper/fs/imports_mapper.py b/src/pyrrha_mapper/fs/imports_mapper.py index ebd798b..48a3809 100644 --- a/src/pyrrha_mapper/fs/imports_mapper.py +++ b/src/pyrrha_mapper/fs/imports_mapper.py @@ -57,15 +57,14 @@ def is_binary_supported(p: Path) -> bool: :return: True is the path point on a file """ return p.is_file() and not p.is_symlink() and (lief.is_elf(str(p)) or lief.is_pe(str(p))) - + def load_binary_args(self) -> dict[str, Any]: """Return dict of args for load_binary that are always the same for the wholde firmware. - + Use to optimize multiprocessing. Set here there real values. """ return {"root_directory": self.root_directory} - @staticmethod def load_binary(root_directory: Path, file_path: Path) -> tuple[Binary, Any] | str: """Create a Binary object from a given file using lief. 
@@ -88,6 +87,7 @@ def load_binary(root_directory: Path, file_path: Path) -> tuple[Binary, Any] | s if parsing_res is None: return f"Lief cannot parse {file_path}" + bin_obj.image_base = parsing_res.imagebase # parse imported libs for lib in parsing_res.libraries: bin_obj.add_imported_library_name(str(lib)) @@ -135,6 +135,7 @@ def load_binary(root_directory: Path, file_path: Path) -> tuple[Binary, Any] | s res: lief.Binary | None = lief.parse(str(file_path)) if res is None: return f"ERROR: Lief cannot parse {file_path}" + bin_obj.image_base = res.imagebase # parse imported libs for lib in res.libraries: bin_obj.add_imported_library_name(str(lib)) From 41505a6b6bda5cf7567a24be9272b1bda74a3e18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Mon, 30 Mar 2026 18:26:17 +0200 Subject: [PATCH 04/62] main: factorize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- src/pyrrha_mapper/__main__.py | 364 +++++++++++++++++----------------- 1 file changed, 181 insertions(+), 183 deletions(-) diff --git a/src/pyrrha_mapper/__main__.py b/src/pyrrha_mapper/__main__.py index d30407f..723d87f 100644 --- a/src/pyrrha_mapper/__main__.py +++ b/src/pyrrha_mapper/__main__.py @@ -15,25 +15,135 @@ # limitations under the License. 
"""CLI Module.""" +import functools import json import logging import multiprocessing import os -import shutil import sys from pathlib import Path import click import coloredlogs # type: ignore # no typing used in this library from numbat import SourcetrailDB -from qbinary.types import Disassembler, ExportFormat from pyrrha_mapper import exedecomp, fs, intercg from pyrrha_mapper.common import FileSystem -from pyrrha_mapper.types import ResolveDuplicateOption +from pyrrha_mapper.types import Disassembler, Exporter, ResolveDuplicateOption # ------------------------------------------------------------------------------- -# Common stuff for mappers +# Shared option decorators +# ------------------------------------------------------------------------------- + + +def resolve_duplicates_options(f): + """Add the three mutually exclusive resolve-duplicate options (decorator).""" + + @click.option( + "--ignore", + "resolve_duplicates", + flag_value=ResolveDuplicateOption.IGNORE, + help="When resolving duplicate imports, ignore them.", + default=True, + ) + @click.option( + "--arbitrary", + "resolve_duplicates", + flag_value=ResolveDuplicateOption.ARBITRARY, + help="When resolving duplicate imports, select the first one available.", + ) + @click.option( + "--interactive", + "resolve_duplicates", + flag_value=ResolveDuplicateOption.INTERACTIVE, + help="When resolving duplicate imports, user manually selects which one to use.", + ) + @functools.wraps(f) + def wrapper(*args, **kwargs): + return f(*args, **kwargs) + + return wrapper + + +def jobs_option(max_fraction: float = 1.0): + """Add a ``--jobs`` option (decorator). + + :param max_fraction: fraction of CPU count to use as the upper bound (default 1.0). 
+ """ + + def decorator(f): + max_jobs = max(1, int(multiprocessing.cpu_count() * max_fraction)) + + @click.option( + "-j", + "--jobs", + help="Number of parallel jobs.", + type=click.IntRange(1, max_jobs, clamp=True), + metavar="INT", + default=1, + show_default=True, + ) + @functools.wraps(f) + def wrapper(*args, **kwargs): + return f(*args, **kwargs) + + return wrapper + + return decorator + + +def disassembler_option(f): + """*Add the ``--disassembler`` option.""" + + @click.option( + "-b", + "--backend", + required=False, + type=click.Choice(Disassembler, case_sensitive=False), + default=Disassembler.IDA, + show_default=True, + help="Disassembler to use.", + ) + @functools.wraps(f) + def wrapper(*args, **kwargs): + return f(*args, **kwargs) + + return wrapper + + +def exporter_option(f): + """Add the ``--exporter`` option (decorator).""" + @click.option( + "--exporter", + required=False, + type=click.Choice(Exporter, case_sensitive=False), + default=Exporter.NONE, + show_default=True, + help="Binary export format to use for binary analysis.", + ) + @functools.wraps(f) + def wrapper(*args, **kwargs): + return f(*args, **kwargs) + + return wrapper + + +def root_directory_argument(f): + """Add the ``root_directory`` argument (decorator).""" + + @click.argument( + "root_directory", + type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path), + ) + @functools.wraps(f) + def wrapper(*args, **kwargs): + return f(*args, **kwargs) + + return wrapper + + +# ------------------------------------------------------------------------------- +# Common command helpers # ------------------------------------------------------------------------------- @@ -57,16 +167,16 @@ def __init__(self, *args, **kwargs): ) self.params.insert( 0, - click.core.Option(("-d", "--debug"), is_flag=True, help="Set log level to DEBUG"), + click.core.Option(("-d", "--debug"), is_flag=True, help="Set log level to DEBUG."), ) self.no_args_is_help = True def setup_logs(is_debug_level: 
bool, db_path: Path | None = None) -> None: - """Set up logs. + """Set up coloured console logging and an optional log file. - :param is_debug_level: if True set the log level as DEBUG else INFO - :param db_path: if provided, save a collocated log file. + :param is_debug_level: if True, set the log level to DEBUG, else INFO. + :param db_path: if provided, write a collocated ``.log`` file. """ log_format = dict(fmt="[%(asctime)s][%(levelname)s]: %(message)s", datefmt="%Y-%m-%d %H:%M:%S") level = logging.DEBUG if is_debug_level else logging.INFO @@ -82,17 +192,14 @@ def setup_logs(is_debug_level: bool, db_path: Path | None = None) -> None: field_styles={"asctime": {"color": "green"}, "levelname": {"bold": True}}, **log_format, ) - if db_path: - log_file = db_path.with_suffix(".log") - # add file handler - file_handler = logging.FileHandler(log_file, mode="w") - file_handler.setLevel(level) - file_handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")) - logging.root.addHandler(file_handler) + handler = logging.FileHandler(db_path.with_suffix(".log"), mode="w") + handler.setLevel(level) + handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")) + logging.root.addHandler(handler) -def setup_db(db_path, overwrite_db: bool = True) -> SourcetrailDB: +def setup_db(db_path: Path, overwrite_db: bool = True) -> SourcetrailDB: """Create and/or open the corresponding Sourcetrail DB. 
:param db_path: path of the db to open/create @@ -100,15 +207,12 @@ def setup_db(db_path, overwrite_db: bool = True) -> SourcetrailDB: cleared else not :return: the created or opened Sourcetrail DB """ - # db creation/and or opening if SourcetrailDB.exists(db_path): - db = SourcetrailDB.open(db_path, clear=overwrite_db) - else: - path = Path(db_path) - if path.suffix != SourcetrailDB.SOURCETRAIL_DB_EXT: - path = path.with_suffix(f"{path.suffix}{SourcetrailDB.SOURCETRAIL_DB_EXT}") - db = SourcetrailDB.create(path) - return db + return SourcetrailDB.open(db_path, clear=overwrite_db) + path = Path(db_path) + if path.suffix != SourcetrailDB.SOURCETRAIL_DB_EXT: + path = path.with_suffix(f"{path.suffix}{SourcetrailDB.SOURCETRAIL_DB_EXT}") + return SourcetrailDB.create(path) # ------------------------------------------------------------------------------- @@ -126,83 +230,45 @@ def setup_db(db_path, overwrite_db: bool = True) -> SourcetrailDB: def pyrrha(): # noqa: D103 pass - -""" - Filesystem mapper. - Map ELF/PE files, their imports and their exports. - Also map symlinks which target ELF/PE files. -""" - - @pyrrha.command( "fs", cls=MapperCommand, - short_help="Map PE and ELF files of a filesystem into a numbatui-compatible db.", - help="Map a filesystem into a numbatui-compatible db. It maps ELF and PE files, \ -their imports/exports plus the symlinks that points on these executable files.", + short_help="Map PE and ELF files of a filesystem into a NumbatUI-compatible db.", + help=( + "Map a filesystem into a NumbatUI-compatible db. " + "It maps ELF and PE files, their imports/exports, " + "plus the symlinks that point to these executable files." 
+ ), ) @click.option( "-e", "--export", - help="Create an export of the resulting FileSystem mapping (in JSON).", + help="Create a JSON export of the resulting FileSystem mapping.", is_flag=True, default=False, - show_default=False, -) -@click.option( - "-j", - "--jobs", - help="Number of parallel jobs created (threads).", - type=click.IntRange(1, multiprocessing.cpu_count(), clamp=True), - metavar="INT", - default=1, - show_default=True, -) -@click.option( - "--ignore", - "resolve_duplicates", - flag_value=ResolveDuplicateOption.IGNORE, - help="When resolving duplicate imports, ignore them", - default=True, ) -@click.option( - "--arbitrary", - "resolve_duplicates", - flag_value=ResolveDuplicateOption.ARBITRARY, - help="When resolving duplicate imports, select the first one available", -) -@click.option( - "--interactive", - "resolve_duplicates", - flag_value=ResolveDuplicateOption.INTERACTIVE, - help="When resolving duplicate imports, user manually select which one to use", -) -@click.argument( - "root_directory", - # help='Path of the directory containing the filesystem to map.', - type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path), -) -def fs_mapper( # noqa: D103 +@jobs_option(max_fraction=1.0) +@resolve_duplicates_options +@root_directory_argument +def fs_mapper( debug: bool, db: Path, export: bool, jobs: int, resolve_duplicates: ResolveDuplicateOption, root_directory: Path, -): # noqa: D103 +): + """Map PE and ELF files of a filesystem.""" setup_logs(debug) db_instance = setup_db(db) - root_directory = root_directory.absolute() - fs_mapper = fs.FileSystemImportsMapper(root_directory, db_instance) - filesystem = fs_mapper.map(jobs, resolve_duplicates) + filesystem = fs.FileSystemImportsMapper(root_directory, db_instance).map( + jobs, resolve_duplicates + ) - # if enabled export enabled, save FileSystem object in a JSON if export: - # maybe in the future a user can choose the output path ? 
- output_file = db_instance.path.with_suffix(".json") - filesystem.write(output_file) + filesystem.write(db_instance.path.with_suffix(".json")) db_instance.close() @@ -210,99 +276,41 @@ def fs_mapper( # noqa: D103 @pyrrha.command( "fs-cg", cls=MapperCommand, - short_help="Map the Call Graph of every firmware executable into a NumbatUI db.", - help="Map a the Inter-Image Call Graph of a whole filesystem into a NumbatUI db." - "It disassembles executables using a disassembler and extract the call graph." - "It then results all call references across binaries.", -) -@click.option( - "-j", - "--jobs", - help="Number of parallel jobs created (threads).", - type=click.IntRange(1, int(multiprocessing.cpu_count() * 0.7), clamp=True), # 70% of threads - metavar="INT", - default=1, - show_default=True, -) -@click.option( - "--ignore", - "resolve_duplicates", - flag_value=ResolveDuplicateOption.IGNORE, - help="When resolving duplicate imports, ignore them", - default=True, -) -@click.option( - "--arbitrary", - "resolve_duplicates", - flag_value=ResolveDuplicateOption.ARBITRARY, - help="When resolving duplicate imports, select the first one available", -) -@click.option( - "--interactive", - "resolve_duplicates", - flag_value=ResolveDuplicateOption.INTERACTIVE, - help="When resolving duplicate imports, user manually select which one to use", -) -@click.option( - "--disassembler", - required=False, - type=click.Choice(Disassembler, case_sensitive=False), - default=Disassembler.AUTO, - show_default=True, - help="Disassembler to use", -) -@click.option( - "--exporter", - required=False, - type=click.Choice(ExportFormat, case_sensitive=False), - default=ExportFormat.AUTO, - show_default=True, - help="Binary exporter", -) -@click.argument( - "root_directory", - # help='Path of the directory containing the filesystem to map.', - type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path), + short_help="Map the call graph of every firmware executable into a NumbatUI 
db.", + help=( + "Map the inter-image call graph of a whole filesystem into a NumbatUI db. " + "It disassembles executables, extracts the call graph, " + "and resolves all call references across binaries." + ), ) -def fs_call_graph_mapper( # noqa: D103 +@jobs_option(max_fraction=0.7) +@resolve_duplicates_options +@disassembler_option +@exporter_option +@root_directory_argument +def fs_call_graph_mapper( debug: bool, db: Path, jobs: int, resolve_duplicates: ResolveDuplicateOption, - disassembler: Disassembler, - exporter: ExportFormat, + backend: Disassembler, + exporter: Exporter, root_directory: Path, ): + """Map the inter-image call graph of a firmware filesystem.""" setup_logs(debug, db) db_instance = setup_db(db) - if disassembler not in [Disassembler.AUTO, Disassembler.IDA, Disassembler.GHIDRA]: - click.echo("disassembler not yet supported") - # TODO: add support for other disassembler + if backend not in (Disassembler.IDA, Disassembler.GHIDRA,): + click.echo("Backend not yet supported") return 1 - - if disassembler is Disassembler.GHIDRA: - ghidra_env_var = "GHIDRA_PATH" - ghidra_dir = os.environ.get(ghidra_env_var) - if not ghidra_dir: - for ghidra_name in ["ghidra", "ghidraRun"]: - if ghidra_path := shutil.which(ghidra_name): - os.environ[ghidra_env_var] = str(Path(ghidra_path).resolve().parent) - - intercg.InterImageCGMapper.DISASS = disassembler - intercg.InterImageCGMapper.EXPORT = exporter - + root_directory = root_directory.absolute() - # Create InterCG mapper and launch mapping try: - intercg_mapper = intercg.InterImageCGMapper(root_directory, db_instance) + intercg_mapper = intercg.InterImageCGMapper(root_directory, db_instance, backend, exporter) fs_object: FileSystem = intercg_mapper.map(jobs, resolve_duplicates) - - # systematically save the FileSystem object (shall be enriched with calls) - output_file = db_instance.path.with_suffix(intercg_mapper.FS_EXT) - fs_object.write(output_file) - + 
fs_object.write(db_instance.path.with_suffix(intercg_mapper.FS_EXT)) except RuntimeError: pass @@ -314,45 +322,36 @@ def fs_call_graph_mapper( # noqa: D103 "exe-decomp", cls=MapperCommand, short_help="Map an executable call graph with its decompiled code.", - help="Map a single executable call graph into a numbatui-compatible database." - "It also index the decompiled code along with all call cross-references.", -) -@click.option( - "--disassembler", - required=False, - type=click.Choice(Disassembler, case_sensitive=False), - default=Disassembler.AUTO, - show_default=True, - help="Disassembler to use for disassembly and decompilation.", -) -@click.option( - "--exporter", - required=False, - type=click.Choice(ExportFormat, case_sensitive=False), - default=ExportFormat.AUTO, - show_default=True, - help="Binary export format to use for binary analysis.", + help=( + "Map a single executable call graph into a NumbatUI-compatible database. " + "Also indexes the decompiled code along with all call cross-references." + ), ) +@disassembler_option +@exporter_option @click.argument( "executable", type=click.Path(exists=False, file_okay=True, dir_okay=False, path_type=Path), ) -def fs_exe_decompiled_mapper( # noqa: D103 - debug: bool, db: Path, disassembler: Disassembler, exporter: ExportFormat, executable: Path +def fs_exe_decompiled_mapper( + debug: bool, + db: Path, + backend: Disassembler, + exporter: Exporter, + executable: Path, ): - # Change default db name. 
By default will be .srctrldb + """Map a single executable with decompiled code.""" if db.name == "exe-decomp.srctrldb": db = Path(str(executable) + ".srctrldb") setup_logs(debug, db) db_instance = setup_db(db) - if disassembler not in [Disassembler.AUTO, Disassembler.IDA]: - click.echo(f"disassembler {disassembler.name} not yet supported") - # TODO: add support for other disassembler (forward parameter to mapper) + if backend not in (Disassembler.IDA,): + click.echo(f"Backend {backend.name} not yet supported") return 1 - if exedecomp.map_binary(db_instance, executable, disassembler, exporter): + if exedecomp.map_binary(db_instance, executable, backend, exporter): logging.info("success.") else: logging.error("failure.") @@ -375,7 +374,6 @@ def fs_exe_decompiled_mapper( # noqa: D103 ) def workspace_utils(list: bool, add: bool, delete: bool, path: Path): """Manage workspaces for cross-binary referencing.""" - # Configure logs (there is not debug ones) setup_logs(False) # Get the base config directory From e6ea4ad0c97b5ec486a80e977d7edf0c761eaeb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Mon, 30 Mar 2026 18:27:33 +0200 Subject: [PATCH 05/62] types: adapt enum to new backends MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- src/pyrrha_mapper/intercg/fwmapper.py | 20 ++++++++++++-------- src/pyrrha_mapper/types.py | 5 ++--- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/pyrrha_mapper/intercg/fwmapper.py b/src/pyrrha_mapper/intercg/fwmapper.py index a8f1df5..ba49170 100644 --- a/src/pyrrha_mapper/intercg/fwmapper.py +++ b/src/pyrrha_mapper/intercg/fwmapper.py @@ -58,10 +58,13 @@ class InterImageCGMapper(FileSystemImportsMapper): FS_EXT = ".fs.json" - DISASS = Disassembler.IDA - EXPORT = Exporter.NONE - - def __init__(self, root_directory: Path | str, db: SourcetrailDB | None): + def __init__( + self, + root_directory: Path | str, + db: SourcetrailDB | 
None, + disassembler: Disassembler, + exporter: Exporter, + ): super(InterImageCGMapper, self).__init__(root_directory, db) # super initialize root_directory, db_interface, fs and _dry_run variables @@ -79,6 +82,7 @@ def __init__(self, root_directory: Path | str, db: SourcetrailDB | None): self.progress: Progress | None = None self.unresolved_callgraph: dict[Path, dict[Symbol, list[str]]] = dict() self._current_binary_hash = "" + self.disassembler, self.exporter = disassembler, exporter def _correct_map_result(self, res: Any) -> bool: return ( @@ -99,16 +103,16 @@ def load_binary_args(self) -> dict[str, Any]: Use to optimize multiprocessing. Set here there real values. """ res = super().load_binary_args() - res["disass"] = self.DISASS - res["exporter"] = self.EXPORT + res["disass"] = self.disassembler + res["exporter"] = self.exporter return res @staticmethod def load_binary( root_directory: Path, file_path: Path, - disass: Disassembler = DISASS, - exporter: Exporter = EXPORT, + disass: Disassembler = Disassembler.IDA, + exporter: Exporter = Exporter.NONE, ) -> tuple[Binary, dict[Symbol, list[str]] | None] | str: """Load all the binaries located in the filesystem as Binary objects. 
diff --git a/src/pyrrha_mapper/types.py b/src/pyrrha_mapper/types.py index 30c336d..4bee410 100644 --- a/src/pyrrha_mapper/types.py +++ b/src/pyrrha_mapper/types.py @@ -21,17 +21,16 @@ class Disassembler(Enum): """Represent a SRE (Software Reverse Engineering tool, a disassembler).""" - AUTO = auto() # doc: Disassembler shall selected automatically IDA = auto() # doc: IDA Pro disassembler GHIDRA = auto() # doc: GHIDRA disassembler BINARY_NINJA = auto() # doc: Binary Ninja disassembler -class Exporters(Enum): +class Exporter(Enum): """Represent export file formats used in some of the mappers.""" - AUTO = auto() # doc: The exporter shall be automatically selected BINEXPORT = auto() # doc: Use Binexport as exporter + NONE = auto() # doc: Use no exporter and interact directly with disassembler QUOKKA = auto() # doc: Use Quokka as exporter From b19d5360ebf2214be7e4cde06b2eaa861a2ad4e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 31 Mar 2026 10:52:37 +0200 Subject: [PATCH 06/62] all: remove heimdallr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- docs/disassembler.md | 143 ------------------ src/pyrrha_mapper/__main__.py | 51 ------- src/pyrrha_mapper/common/filesystem_mapper.py | 18 +-- src/pyrrha_mapper/exedecomp/binmapper.py | 24 --- src/pyrrha_mapper/intercg/fwmapper.py | 29 ---- 5 files changed, 3 insertions(+), 262 deletions(-) delete mode 100644 docs/disassembler.md diff --git a/docs/disassembler.md b/docs/disassembler.md deleted file mode 100644 index 6b0046d..0000000 --- a/docs/disassembler.md +++ /dev/null @@ -1,143 +0,0 @@ -# Disassembler Integration - -Some pyrrha mappers and especially the `exe-decomp` enables jumping in a disassembler from the UI -by right-clicking on a function and selecting "Open in disassembler". 
Executing arbitrary command -is made available by the [Numbat feature](https://quarkslab.github.io/numbat/customization/) and -requires opening a Sourcetrail DB with ``NumbatUI``. - -The link between Numbat and a disassembler is made by implementing custom URL protocol handlers. As such, -clicking "Open in disassembler" will trigger a command like: - -```bash -xdg-open 'disas://e62f747cf47383858bd563febb813e20?idb=inadyn.i64&offset=0x0124c8' -``` - -On Linux `xdg-open` will open the URL with the default application associated with the `disas` protocol. -For windows and MacOS, application opened are respectively `start` and `open`. For it to work, -we need to register a custom URL handler for the `disas` protocol. This is done by using [heimdallr](https://github.com/interruptlabs/heimdallr-client) developped by [Interrupt Labs](https://interruptlabs.com/). - - -## Heimdallr - -Heimdallr is a custom URL handler that allows you to open a disassembler from the UI. Developpres provides -an [IDA plugin](https://github.com/interruptlabs/heimdallr-ida) to support it and some folks added a [Ghidra -support](https://github.com/foundryzero/ghidra-deep-links). It works by running a gRPC server in the disassembler -that will listen for incoming requests. The image below summarizes the workflow on Linux: - -![](img/heimdallr.svg) - -As shown on the image the Linux system handles URL handlers with `.desktop` files that needs to be registered. -The handler will call `heimdallr_client` utility that is in charge of identifying running gRPC servers to send -the query to a running disassembler or to start it. - -## Installation - -Heimdallr is fairly unmaintained and undocumented. Still, it works rather well. In order to get it working -one need to perform the following steps: - -1. Install `heimdallr-ida` plugin in IDA -2. Install `heimdallr-client` "globally" so that it is reachable by the URL handler dispatcher -3. Configure a `settings.json` file to specify disassembler path etc. 
-4. Create and register a `.desktop` file to handle the `disas://` protocol. - -**1-heimdallr-ida**: The plugin is available on the [Github page](https://github.com/interruptlabs/heimdallr-ida). -The README.md provides installation steps. The ``install()`` command will automatically copy files in the IDA Pro -directory and creates a default `settings.json` file in `$HOME/.config/heimdallr/settings.json`. - -!!! tip - The install command might be a bit buggy, so it is recommended to install the plugin manually by copying the - files in IDA. - -**2-heimdallr-client**: The client is available on the [Github page](https://github.com/interruptlabs/heimdallr-client). -It can be installed with `pip`: - -```bash -pip3 install git+https://git@github.com/interruptlabs/heimdallr-client.git#egg=heimdallr_client -``` - -!!! note - It should be installed globally so that it is reachable by the URL handler dispatcher. Thus it is recommended - to install it with `--user`. - -**3-Configuring settings**: The `$HOME/.config/heimdallr` will contain all files used by `heimdallr` to locate -running RPC server instances in order to send them requests. The file `settings.json` is used to configure -the disassembler path and paths where to look for binaries. Thus configure carefully your IDA path inside. - -```json -{ - "ida_location": "/my/path/to/ida", - "idb_path": [ - ], - "heimdallr_client": "heimdallr_client" -} -``` - -!!! note - The IDA location binary provided should be a non-blocking IDA or bash script, as `heimdallr-client` - will run it with `subprocess.run` and wait for it before sending the request. - - -**4-Creating protocol handler**: The `.desktop` file is used to register the `disas://` protocol handler. -On Linux, it is usually located in `~/.local/share/applications/`. 
Creates a file in this directory with -the following content: - -???+ "`heimdallr.desktop`" - ```ini - [Desktop Entry] - Name=Heimdallr-handler - Comment=Disas URL handler - GenericName=heimdallr-handler-generic - Exec=heimdallr_client %u - Type=Application - StartupNotify=true - Categories=GNOME;GTK;Utility; - MimeType=x-scheme-handler/disas; - ``` - -Then you need to update the associated `mimeinfo.cache` file with: - -```bash -update-desktop-database ~/.local/share/applications -``` -This will allow you to handle URLs with the `disas://` scheme. -*It shall add the line: `x-scheme-handler/disas=heimdallr.desktop` in the file.* - - -## Testing - -You can test that URL are properly resolved by running: - -```bash -xdg-mime query default x-scheme-handler/disas -``` -This should return `heimdallr.desktop`. Then you can try opening a binary with: - -```bash -xdg-open 'disas://e62f747cf47383858bd563febb813e20?idb=inadyn.i64&offset=0x0124c8' -``` - -Where you provide the MD5 hash of the binary, its DB name and the offset to jump to. -By default, heimdallr look in the IDA Pro history to locate the idb. Otherwise, it search -for directories referenced in the "idb_path" field of the `settings.json` file. - - - -## Usage in Pyrrha mappers - -Pyrrha uses `heimdallr` to resolve binaries location and offsets. Thus when working -on a specific firmware you might need to specify its root directory in the `ida_path` -of the `settings.json` file. Pyrrha provides an utility command to list, add and remove -entries in this file. 
- -```bash -pyrrha workspace-utils --list # list all entries -``` - -```bash -pyrrha workspace-utils --add /path/to/firmware/rootfs # add directory in search path -``` - -```bash -pyrrha workspace-utils --delete /path/to/firmware/rootfs # remove directory from search path -``` - diff --git a/src/pyrrha_mapper/__main__.py b/src/pyrrha_mapper/__main__.py index 723d87f..66bbaf2 100644 --- a/src/pyrrha_mapper/__main__.py +++ b/src/pyrrha_mapper/__main__.py @@ -16,11 +16,8 @@ """CLI Module.""" import functools -import json import logging import multiprocessing -import os -import sys from pathlib import Path import click @@ -361,53 +358,5 @@ def fs_exe_decompiled_mapper( db_instance.close() -@pyrrha.command( - "workspace-utils", short_help="Help managing workspaces (for cross-binary referencing)." -) -@click.option("-l", "--list", is_flag=True, default=False, help="List all workspaces.") -@click.option("-a", "--add", is_flag=True, default=False, help="Add a rootfs as workspace.") -@click.option("-d", "--delete", is_flag=True, default=False, help="Remove a rootfs as workspace.") -@click.argument( - "path", - type=click.Path(exists=True, file_okay=True, dir_okay=True, path_type=Path), - required=False, -) -def workspace_utils(list: bool, add: bool, delete: bool, path: Path): - """Manage workspaces for cross-binary referencing.""" - setup_logs(False) - - # Get the base config directory - if sys.platform == "win32": - heimdallr_settings = Path(os.path.expandvars("%APPDATA%/heimdallr/settings.json")) - else: - heimdallr_settings = Path(os.path.expandvars("$HOME/.config/heimdallr/settings.json")) - if not heimdallr_settings.exists(): - click.echo(f"heimdallr config directory {heimdallr_settings} does not exists") - return -1 - - # Load settings - settings = json.loads(heimdallr_settings.read_text()) - idb_path = settings.get("idb_path") - if idb_path is None: - click.echo(f"heimdallr settings file {heimdallr_settings} does not contain idb_path") - return -1 - - if list: - for 
path in idb_path: - logging.info(f"- {path}") - - if add: - settings["idb_path"].append(str(Path(path).absolute())) - heimdallr_settings.write_text(json.dumps(settings, indent=4)) # Write it back - - if delete: - try: - settings["idb_path"].remove(str(path)) - heimdallr_settings.write_text(json.dumps(settings, indent=4)) # Write it back - except ValueError: - click.echo(f"Path {path} not in idb_path of settings.") - return -1 - - if __name__ == "__main__": pyrrha() diff --git a/src/pyrrha_mapper/common/filesystem_mapper.py b/src/pyrrha_mapper/common/filesystem_mapper.py index 3225fb7..62813b1 100755 --- a/src/pyrrha_mapper/common/filesystem_mapper.py +++ b/src/pyrrha_mapper/common/filesystem_mapper.py @@ -31,7 +31,7 @@ TimeElapsedColumn, ) -from pyrrha_mapper.common.objects import Binary, FileSystem, Symlink, Symbol +from pyrrha_mapper.common.objects import Binary, FileSystem, Symbol, Symlink from pyrrha_mapper.exceptions import PyrrhaError from pyrrha_mapper.types import ResolveDuplicateOption @@ -157,7 +157,6 @@ def record_binary_in_db(self, binary: Binary, log_prefix: str = "") -> Binary: logging.error(f"{log_prefix}: Record of symbol '{symbol.demangled_name}' failed.") else: try: - self.symbol_recorded(binary, symbol) self.db_interface.record_public_access(symbol.id) recorded_symb[symbol.demangled_name] = symbol.id except DBException as e: @@ -165,7 +164,7 @@ def record_binary_in_db(self, binary: Binary, log_prefix: str = "") -> Binary: f"{log_prefix}: Cannot register access to symbol {symbol.demangled_name}: " f"{e}" ) from e - + for symbol in set(binary.iter_not_exported_functions()): symbol.id = self.db_interface.record_method( symbol.demangled_name, @@ -176,7 +175,6 @@ def record_binary_in_db(self, binary: Binary, log_prefix: str = "") -> Binary: logging.error(f"{log_prefix}: Record of symbol '{symbol.demangled_name}' failed.") else: try: - self.symbol_recorded(binary, symbol) self.db_interface.record_private_access(symbol.id) except DBException as e: raise 
PyrrhaError( @@ -185,17 +183,7 @@ def record_binary_in_db(self, binary: Binary, log_prefix: str = "") -> Binary: ) from e return binary - - def symbol_recorded(self, binary: Binary, symbol: Symbol) -> None: - """Hook called when a symbol is recorded in the DB. - - This method can be overridden to add custom behavior. - - :param binary: the Binary object containing the method - :param symbol: the Symbol object representing the method - """ - pass # Default implementation does nothing - + def record_symlink_in_db(self, sym: Symlink, log_prefix: str = "") -> Symlink: """Record into DB the symlink and its link to its target. diff --git a/src/pyrrha_mapper/exedecomp/binmapper.py b/src/pyrrha_mapper/exedecomp/binmapper.py index 7544279..e517c78 100644 --- a/src/pyrrha_mapper/exedecomp/binmapper.py +++ b/src/pyrrha_mapper/exedecomp/binmapper.py @@ -20,7 +20,6 @@ from pathlib import Path from collections import defaultdict from dataclasses import dataclass -import sys from typing import NamedTuple from tempfile import NamedTemporaryFile import hashlib @@ -46,16 +45,6 @@ DECOMPILE_SCRIPT = Path(__file__).parent / "decompile.py" -# Determine the command to open URLs based on the platform -try: - URL_OPEN_CMD = { - "linux": "xdg-open", - "win32": "start", - "darwin": "open" - }[sys.platform] -except KeyError: - logging.warning(f"Unsupported platform: {sys.platform} (will not add URL handler)") - URL_OPEN_CMD = "" # type: ignore once_check = True @@ -276,16 +265,6 @@ def is_thunk_to_import(p: Program, f: Function) -> bool: return False -def add_url_handler(db: SourcetrailDB, program: Program, hash: str, function: Function, f_id: int) -> None: - """ Open the function using a dedicated URL handler. 
(Use Heimdallr) """ - if URL_OPEN_CMD and program.exec_path: - url = f"disas://{hash}?idb={Path(program.exec_path).name+'.i64'}&offset={function.addr:#08x}" - cmd: list[str] = [URL_OPEN_CMD, url] - db.set_custom_command(f_id, cmd, "Open in Disassembler") # type: ignore - else: - pass # Can't add URL unsuported platform - - def map_binary(db: SourcetrailDB, program_path: Path, disass: Disassembler, format: ExportFormat) -> bool: # Load the Quokka file with Progress( @@ -338,9 +317,6 @@ def map_binary(db: SourcetrailDB, program_path: Path, disass: Disassembler, form # Change node color based on its type set_function_color(db, program, f, f_id) - # Add custom command to open that function in IDA - add_url_handler(db, program, p_hash, f, f_id) - # Add source code if any if f_addr in decompiled and not is_imp: info = decompiled[f_addr] diff --git a/src/pyrrha_mapper/intercg/fwmapper.py b/src/pyrrha_mapper/intercg/fwmapper.py index ba49170..f5c436a 100644 --- a/src/pyrrha_mapper/intercg/fwmapper.py +++ b/src/pyrrha_mapper/intercg/fwmapper.py @@ -16,9 +16,7 @@ """InterCGMapper implementation.""" import logging -import sys from collections import defaultdict -from hashlib import md5 from pathlib import Path from typing import Any @@ -41,17 +39,8 @@ IGNORE_LIST = ["__gmon_start__"] -QUOKKA_EXT = ".quokka" - NUMBAT_UI_BIN = "NumbatUi" -# Determine the command to open URLs based on the platform -try: - URL_OPEN_CMD = {"linux": "xdg-open", "win32": "start", "darwin": "open"}[sys.platform] -except KeyError: - logging.warning(f"Unsupported platform: {sys.platform} (will not add URL handler)") - URL_OPEN_CMD = "" # type: ignore - class InterImageCGMapper(FileSystemImportsMapper): """Filesystem mapper based on Lief, which computes imports and exports.""" @@ -81,7 +70,6 @@ def __init__( self.exports_to_bins: dict[str, list[Binary]] = {} self.progress: Progress | None = None self.unresolved_callgraph: dict[Path, dict[Symbol, list[str]]] = dict() - self._current_binary_hash = "" 
self.disassembler, self.exporter = disassembler, exporter def _correct_map_result(self, res: Any) -> bool: @@ -137,17 +125,6 @@ def load_binary( except (FileNotFoundError, FsMapperError, SyntaxError) as e: return f"[binary mapping] {file_path.name}: ERROR: Loading error: {e}" - def add_url_handler(self, hash: str, binary: Binary, symbol: Symbol) -> None: - """Open the function using a dedicated URL handler. (Use Heimdallr)""" - if not hash: - return # no hash, no URL handler - if URL_OPEN_CMD: - url = f"disas://{hash}?idb={binary.name + '.i64'}&offset={symbol.addr:#08x}" - cmd: list[str] = ["xdg-open", url] - self.db_interface.set_custom_command(symbol.id, cmd, "Open in Disassembler") # type: ignore - else: - pass # Can't add URL unsuported platform - def map_binary( self, bin_object: Binary, @@ -158,8 +135,6 @@ def map_binary( This function updates the filesystem representation stored as `self.fs`. :param bin_object: Binary object """ - self._current_binary_hash = md5(Path(bin_object.real_path).read_bytes()).hexdigest() - super().map_binary(bin_object) if additional_res is not None: self.unresolved_callgraph[bin_object.path] = additional_res @@ -168,10 +143,6 @@ def map_binary( if additional_res is not None: self._record_custom_command(bin_object, f"[bin mapping] {bin_object.name}") - def symbol_recorded(self, binary: Binary, symbol: Symbol) -> None: - """Register a symbol recorded handler to add a custom command.""" - self.add_url_handler(self._current_binary_hash, binary, symbol) - def _treat_bin_parsing_result(self, path: Path, res: Any): """Handle load_binary res, map it or display error.""" log_prefix = f"[binary parsing] {path.name}" From 8dcc65100cef8c2b66f6b2c3712a5990501cd4ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Fri, 3 Apr 2026 11:16:43 +0200 Subject: [PATCH 07/62] fs-cg, decomp: replace disass/exporter with unique backend value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
Signed-off-by: Eloïse Brocas --- ci/ida/Dockerfile | 146 +++++++++++++++-------- src/pyrrha_mapper/__main__.py | 56 ++++----- src/pyrrha_mapper/exedecomp/binmapper.py | 9 +- src/pyrrha_mapper/intercg/fwmapper.py | 19 ++- src/pyrrha_mapper/types.py | 14 +-- 5 files changed, 136 insertions(+), 108 deletions(-) diff --git a/ci/ida/Dockerfile b/ci/ida/Dockerfile index 02087cb..4c73e86 100644 --- a/ci/ida/Dockerfile +++ b/ci/ida/Dockerfile @@ -14,73 +14,123 @@ # See the License for the specific language governing permissions and # limitations under the License. -FROM docker.io/library/debian:testing-slim -SHELL ["/bin/bash", "-c"] - # ============= How to generate the required data =========================== -# paths can be changed from commandline if needed -# idapro.hexlic: license file, downloaded from your account on hex-rays website -# ida-pro_91.run: executable file, downloaded from your account on hex-rays website -# ida.reg: history file, to be generated manually. Keep in memory that the licence has alredy been accepted. -# 1. Build this docker with an empty ida.reg and launch it. -# 2. Launch idat and accept the license. -# 3. In another terminal, get the id of the current ida docker with `docker ps` -# 4. Run `docker cp ID:/root/.idapro/ida.reg ./` where ID is the id get at the -# previous step, you know have a correct ida.reg. -# 5. Rebuild your image with the correct ida.reg +# All sensitive files are passed exclusively via --mount=type=secret so they +# never appear in any image layer or in `docker history`. +# +# Required files (place them next to this Dockerfile): +# idapro.hexlic : license file, downloaded from your account on hex-rays website +# ida-pro_91.run: executable installer, downloaded from your account on hex-rays website +# +# Build command: +# docker build \ +# --secret id=ida_installer,src=ida-pro_91.run \ +# -t pyrrha-ida . +# +# ida.reg is generated automatically during the build (see stage 1). 
+# +# Run command (licence is mounted at runtime, never stored in the image): +# docker run --rm \ +# --mount type=secret,id=ida_license,src=idapro.hexlic \ +# pyrrha-ida +# +# IDA locates the licence via the HEXRAYS_LICENSE env var which is set to the +# secret path below. # =========================================================================== # ======================== IDA Installation ================================= +# Contains build-only packages and the installer; none of this reaches the +# final image. +FROM docker.io/library/debian:testing-slim AS ida-install ARG IDA_VERSION=91 -ARG IDA_INSTALLER=ida-pro_${IDA_VERSION}.run -ENV IDA_INSTALL_DIR=/opt/ida_${IDA_VERSION} +ARG IDA_INSTALL_DIR=/opt/ida_${IDA_VERSION} +# Build-time dependencies: +# - libxcb-xinerama0 : required by the .run installer's Qt bootstrap +# - xvfb : provides a virtual framebuffer so that idat can run +# headlessly long enough to accept the licence and write +# ida.reg on first launch +# - remaining libs : runtime deps that are also needed at install time RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \ - ca-certificates \ - ccache \ - cmake \ - g++ \ - gcc \ - git \ - libpython3-dev \ - libqt5gui5 \ libfontconfig1 \ libmagic1 \ + libqt5gui5 \ libsecret-1-0 \ - make \ - ninja-build \ + libxcb-xinerama0 \ python3-minimal \ - python3-pip \ - python3-venv \ python-is-python3 \ - unzip \ - xcb-proto \ - wget \ - zlib1g-dev \ - && mkdir -p $IDA_INSTALL_DIR ~/.local/share/applications/ + xvfb \ + && mkdir -p "${IDA_INSTALL_DIR}" ~/.local/share/applications/ \ + && rm -rf /var/lib/apt/lists/* + +# Run the installer in a secret-mount RUN so the .run binary is never +# committed to any layer. The licence is intentionally NOT copied here; +# it will be mounted at runtime only (see run command above). 
+RUN --mount=type=secret,id=ida_installer,target=/tmp/ida_installer.run \ + chmod +x /tmp/ida_installer.run && \ + DEBIAN_FRONTEND=noninteractive /tmp/ida_installer.run \ + --mode unattended \ + --prefix "${IDA_INSTALL_DIR}" -RUN --mount=type=bind,src=${IDA_INSTALLER},target=${IDA_INSTALLER} DEBIAN_FRONTEND=noninteractive apt-get install --yes --reinstall libxcb-xinerama0 && \ - ./${IDA_INSTALLER} --mode unattended --prefix ${IDA_INSTALL_DIR} +# Generate ida.reg by launching idat once inside a virtual framebuffer. +# The -A flag auto-accepts the EULA; idat exits after startup with no project. +# ida.reg is written to root's home by default. +RUN Xvfb :99 -screen 0 1024x768x24 & \ + sleep 2 && \ + DISPLAY=:99 "${IDA_INSTALL_DIR}/idat" -A && \ + sleep 5 && \ + kill %1 || true + +# ======================== IDA Runtime image ================================= +# Only the IDA tree, the licence, ida.reg, and the runtime shared libraries are +# present here. No installer, no xvfb, no build tooling. +FROM docker.io/library/debian:testing-slim + +ARG IDA_VERSION=91 +# Exported so that qbinary/idascript and other tools spawned inside the +# container can locate the IDA installation without extra configuration. +ENV IDA_INSTALL_DIR=/opt/ida_${IDA_VERSION} ENV PATH=${IDA_INSTALL_DIR}:${PATH} +# IDA looks for the licence at the path provided by this variable. +# At runtime, mount the licence as a secret at that exact path: +# docker run --mount type=secret,id=ida_license,src=idapro.hexlic,... +ENV HEXRAYS_LICENSE=/run/secrets/ida_license +# IDADIR is required by ida_domain to locate the IDA installation at runtime. 
+ENV IDADIR=${IDA_INSTALL_DIR} -# ======================== Plugin Installation ============================== - -ARG QUOKKA_VERSION=v0.6.1 -ARG QUOKKA_URL=https://github.com/quarkslab/quokka/releases/download/${QUOKKA_VERSION}/${IDA_VERSION}-quokka_plugin0064.so -ARG BINEXPORT_URL=https://github.com/google/binexport/releases/download/v12-20240417-ghidra_11.0.3/BinExport-Linux.zip -RUN if [[ ${IDA_VERSION} -eq 84 ]]; then \ - wget ${QUOKKA_URL} -O ${IDA_INSTALL_DIR}/plugins/quokka64.so \ - && wget ${BINEXPORT_URL} -O binexport.zip \ - && unzip -j binexport.zip ida/binexport12_ida.so ida/binexport12_ida64.so -d ${IDA_INSTALL_DIR}/plugins/ \ - && rm -f binexport.zip ; \ - else wget ${QUOKKA_URL} -O ${IDA_INSTALL_DIR}/plugins/quokka.so ; fi \ - && apt-get purge --yes wget \ +# Runtime-only shared libraries required by IDA and its Qt layer. +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \ + libfontconfig1 \ + libmagic1 \ + libqt5gui5 \ + libsecret-1-0 \ + python3-minimal \ + python-is-python3 \ && rm -rf /var/lib/apt/lists/* -RUN useradd --create-home -u 1000 -m user && chown -R user:user $IDA_INSTALL_DIR +# Copy only the installed IDA tree from the build stage. +COPY --from=ida-install ${IDA_INSTALL_DIR} ${IDA_INSTALL_DIR} + +# User creation +RUN useradd --create-home -u 1000 -m user && \ + chown -R user:user "${IDA_INSTALL_DIR}" + USER user -RUN $IDA_INSTALL_DIR/idapyswitch -a WORKDIR /home/user + +# Copy ida.reg (generated in stage 1) into the user's IDA config directory. 
+COPY --from=ida-install /root/.idapro/ida.reg /home/user/.idapro/ida.reg + +# virtualenv (automatically launch at runtime) +ENV VIRTUAL_ENV=/home/user/.venv +ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" +RUN python -m venv "${VIRTUAL_ENV}" && \ + echo "source ${VIRTUAL_ENV}/bin/activate" >> /home/user/.bashrc + +RUN "${IDA_INSTALL_DIR}/idapyswitch" -a + +CMD ["/bin/bash"] diff --git a/src/pyrrha_mapper/__main__.py b/src/pyrrha_mapper/__main__.py index 66bbaf2..4b55451 100644 --- a/src/pyrrha_mapper/__main__.py +++ b/src/pyrrha_mapper/__main__.py @@ -26,7 +26,7 @@ from pyrrha_mapper import exedecomp, fs, intercg from pyrrha_mapper.common import FileSystem -from pyrrha_mapper.types import Disassembler, Exporter, ResolveDuplicateOption +from pyrrha_mapper.types import Backend, ResolveDuplicateOption # ------------------------------------------------------------------------------- # Shared option decorators @@ -89,34 +89,17 @@ def wrapper(*args, **kwargs): return decorator -def disassembler_option(f): - """*Add the ``--disassembler`` option.""" +def backend_option(f): + """*Add the ``--backend`` option.""" @click.option( "-b", "--backend", required=False, - type=click.Choice(Disassembler, case_sensitive=False), - default=Disassembler.IDA, + type=click.Choice(Backend, case_sensitive=False), + default=Backend.IDA, show_default=True, - help="Disassembler to use.", - ) - @functools.wraps(f) - def wrapper(*args, **kwargs): - return f(*args, **kwargs) - - return wrapper - - -def exporter_option(f): - """Add the ``--exporter`` option (decorator).""" - @click.option( - "--exporter", - required=False, - type=click.Choice(Exporter, case_sensitive=False), - default=Exporter.NONE, - show_default=True, - help="Binary export format to use for binary analysis.", + help="Backend to use.", ) @functools.wraps(f) def wrapper(*args, **kwargs): @@ -227,6 +210,7 @@ def setup_db(db_path: Path, overwrite_db: bool = True) -> SourcetrailDB: def pyrrha(): # noqa: D103 pass + @pyrrha.command( "fs", 
cls=MapperCommand, @@ -282,30 +266,31 @@ def fs_mapper( ) @jobs_option(max_fraction=0.7) @resolve_duplicates_options -@disassembler_option -@exporter_option +@backend_option @root_directory_argument def fs_call_graph_mapper( debug: bool, db: Path, jobs: int, resolve_duplicates: ResolveDuplicateOption, - backend: Disassembler, - exporter: Exporter, + backend: Backend, root_directory: Path, ): """Map the inter-image call graph of a firmware filesystem.""" setup_logs(debug, db) db_instance = setup_db(db) - if backend not in (Disassembler.IDA, Disassembler.GHIDRA,): + if backend not in ( + Backend.IDA, + Backend.GHIDRA, + ): click.echo("Backend not yet supported") return 1 - + root_directory = root_directory.absolute() try: - intercg_mapper = intercg.InterImageCGMapper(root_directory, db_instance, backend, exporter) + intercg_mapper = intercg.InterImageCGMapper(root_directory, db_instance, backend) fs_object: FileSystem = intercg_mapper.map(jobs, resolve_duplicates) fs_object.write(db_instance.path.with_suffix(intercg_mapper.FS_EXT)) except RuntimeError: @@ -324,8 +309,7 @@ def fs_call_graph_mapper( "Also indexes the decompiled code along with all call cross-references." 
), ) -@disassembler_option -@exporter_option +@backend_option @click.argument( "executable", type=click.Path(exists=False, file_okay=True, dir_okay=False, path_type=Path), @@ -333,8 +317,7 @@ def fs_call_graph_mapper( def fs_exe_decompiled_mapper( debug: bool, db: Path, - backend: Disassembler, - exporter: Exporter, + backend: Backend, executable: Path, ): """Map a single executable with decompiled code.""" @@ -344,11 +327,12 @@ def fs_exe_decompiled_mapper( setup_logs(debug, db) db_instance = setup_db(db) - if backend not in (Disassembler.IDA,): + if backend not in (Backend.IDA,): click.echo(f"Backend {backend.name} not yet supported") return 1 - if exedecomp.map_binary(db_instance, executable, backend, exporter): + # todo: add backend changes + if exedecomp.map_binary(db_instance, executable): logging.info("success.") else: logging.error("failure.") diff --git a/src/pyrrha_mapper/exedecomp/binmapper.py b/src/pyrrha_mapper/exedecomp/binmapper.py index e517c78..e61981c 100644 --- a/src/pyrrha_mapper/exedecomp/binmapper.py +++ b/src/pyrrha_mapper/exedecomp/binmapper.py @@ -192,7 +192,7 @@ def load_decompiled(program: Program, progress: Progress, raise FileNotFoundError("can't find decompilation file (idascript failed)") -def load_program(bin_path: Path, disass: Disassembler, format: ExportFormat) -> Program | None: +def load_program(bin_path: Path, disass: Disassembler = Disassembler.IDA, format: ExportFormat = ExportFormat.QUOKKA) -> Program | None: # First try to find pre-existing exported files if format is AUTO try: return Program.from_binary(bin_path, @@ -265,7 +265,12 @@ def is_thunk_to_import(p: Program, f: Function) -> bool: return False -def map_binary(db: SourcetrailDB, program_path: Path, disass: Disassembler, format: ExportFormat) -> bool: +def map_binary( + db: SourcetrailDB, + program_path: Path, + disass: Disassembler = Disassembler.IDA, + format: ExportFormat = ExportFormat.QUOKKA, +) -> bool: # Load the Quokka file with Progress( 
TextColumn("[progress.description]{task.description}"), diff --git a/src/pyrrha_mapper/intercg/fwmapper.py b/src/pyrrha_mapper/intercg/fwmapper.py index f5c436a..b0e782b 100644 --- a/src/pyrrha_mapper/intercg/fwmapper.py +++ b/src/pyrrha_mapper/intercg/fwmapper.py @@ -35,7 +35,7 @@ from pyrrha_mapper.exceptions import FsMapperError from pyrrha_mapper.fs import FileSystemImportsMapper from pyrrha_mapper.intercg.loader import BinaryParser, GhidraParser, IDAParser -from pyrrha_mapper.types import Disassembler, Exporter, ResolveDuplicateOption +from pyrrha_mapper.types import Backend, ResolveDuplicateOption IGNORE_LIST = ["__gmon_start__"] @@ -51,8 +51,7 @@ def __init__( self, root_directory: Path | str, db: SourcetrailDB | None, - disassembler: Disassembler, - exporter: Exporter, + backend: Backend, ): super(InterImageCGMapper, self).__init__(root_directory, db) # super initialize root_directory, db_interface, fs and _dry_run variables @@ -70,7 +69,7 @@ def __init__( self.exports_to_bins: dict[str, list[Binary]] = {} self.progress: Progress | None = None self.unresolved_callgraph: dict[Path, dict[Symbol, list[str]]] = dict() - self.disassembler, self.exporter = disassembler, exporter + self.backend = backend def _correct_map_result(self, res: Any) -> bool: return ( @@ -91,16 +90,14 @@ def load_binary_args(self) -> dict[str, Any]: Use to optimize multiprocessing. Set here there real values. """ res = super().load_binary_args() - res["disass"] = self.disassembler - res["exporter"] = self.exporter + res["backend"] = self.backend return res @staticmethod def load_binary( root_directory: Path, file_path: Path, - disass: Disassembler = Disassembler.IDA, - exporter: Exporter = Exporter.NONE, + disass: Backend = Backend.IDA, ) -> tuple[Binary, dict[Symbol, list[str]] | None] | str: """Load all the binaries located in the filesystem as Binary objects. @@ -110,14 +107,12 @@ def load_binary( enrich it with InterCG-mapper required data. 
It includes call graphs and some function normalization in case collisions. It modifies the FileSystem object in place. - - :param cache_file: Cache file to load binaries from (if exists) """ try: - if disass == Disassembler.IDA: + if disass == Backend.IDA: ida_parser: BinaryParser = IDAParser(root_directory, file_path) return ida_parser.binary, ida_parser.call_graph - elif disass == Disassembler.GHIDRA: + elif disass == Backend.GHIDRA: ghidra_parser = GhidraParser(root_directory, file_path) return ghidra_parser.binary, ghidra_parser.call_graph else: diff --git a/src/pyrrha_mapper/types.py b/src/pyrrha_mapper/types.py index 4bee410..f645fe6 100644 --- a/src/pyrrha_mapper/types.py +++ b/src/pyrrha_mapper/types.py @@ -18,20 +18,14 @@ from enum import Enum, auto -class Disassembler(Enum): - """Represent a SRE (Software Reverse Engineering tool, a disassembler).""" +class Backend(Enum): + """Represent the backend used for Pyrrha.""" IDA = auto() # doc: IDA Pro disassembler GHIDRA = auto() # doc: GHIDRA disassembler BINARY_NINJA = auto() # doc: Binary Ninja disassembler - - -class Exporter(Enum): - """Represent export file formats used in some of the mappers.""" - - BINEXPORT = auto() # doc: Use Binexport as exporter - NONE = auto() # doc: Use no exporter and interact directly with disassembler - QUOKKA = auto() # doc: Use Quokka as exporter + QUOKKA_IDA = auto() # doc: Use Quokka as exporter of IDA + QUOKKA_GHIDRA = auto() # doc: Use Quokka as exporter of Ghidra class ResolveDuplicateOption(Enum): From 5861a69a865d036f4b85e4d8d3725c2e8a9df73e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Fri, 3 Apr 2026 11:21:21 +0200 Subject: [PATCH 08/62] tests: adapt to new changes of CLI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- tests/conftest.py | 16 ++++++---------- tests/test_cli.py | 12 ++++-------- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git 
a/tests/conftest.py b/tests/conftest.py index b354e3c..aa3193d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,16 +1,12 @@ import pytest -from qbinary.types import ExportFormat, Disassembler + +from pyrrha_mapper.types import Backend + def pytest_addoption(parser): parser.addoption( - "--disassembler", - action="store", - help="disassembler", - choices={x.name.lower() for x in Disassembler}, - ) - parser.addoption( - "--exporter", + "--backend", action="store", - help="exporter", - choices={x.name.lower() for x in ExportFormat}, + help="backend", + choices={x.name.lower() for x in Backend}, ) \ No newline at end of file diff --git a/tests/test_cli.py b/tests/test_cli.py index 8ead6e6..b4cb69b 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -281,10 +281,8 @@ def pyrrha_exec(self, request, tmp_path_factory) -> BaseTestFsMapper.ExecResults ) args = [ self.SUBCOMMAND, - "--disassembler", - f"{request.config.getoption('--disassembler')}", - "--exporter", - f"{request.config.getoption('--exporter')}", + "--backend", + f"{request.config.getoption('--backend')}", "--db", f"{tmp_path}", "-j", @@ -303,10 +301,8 @@ def export_res(self, tmp_path_factory, request) -> BaseTestFsMapper.ExecResults: ) args = [ self.SUBCOMMAND, - "--disassembler", - f"{request.config.getoption('--disassembler')}", - "--exporter", - f"{request.config.getoption('--exporter')}", + "--backend", + f"{request.config.getoption('--backend')}", "--db", f"{tmp_path}", "-j", From f436a25eacd9ba4cf9bb17bddf94d674f16c175a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Fri, 3 Apr 2026 11:55:04 +0200 Subject: [PATCH 09/62] ci: update for new version (ida) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- .gitlab-ci.yml | 58 +------- ci/ida/.dockerignore | 6 + ci/ida/Dockerfile | 90 ++++++++----- ci/ida/Dockerfile.final | 33 +++++ ci/ida/build.sh | 291 ++++++++++++++++++++++++++++++++++++++++ 5 
files changed, 394 insertions(+), 84 deletions(-) create mode 100644 ci/ida/.dockerignore create mode 100644 ci/ida/Dockerfile.final create mode 100755 ci/ida/build.sh diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 0a4a65c..0c97eef 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,22 +1,8 @@ #========================== STEPS USED IN WORKFLOWS ==================================== -.step_python_setup_install: &step_python_setup_install - - echo -e "\e[95m===== Setup Python" - - python --version ; pip --version - - python3 -m venv venv - - source venv/bin/activate - .step_install_pyrrha_test: &step_install_pyrrha_test - echo -e "\e[95m===== Install Pyrrha with test extension" - pip install '.[test]' -.step_configure_disassembler: &step_configure_disassembler - - if [[ ${DISASSEMBLER} == "ida" ]]; then - echo -e "\e[95m===== Configure IDA" && - mkdir -p ~/.idapro/ && - echo $KEY | base64 -d > ~/.idapro/$KEY_NAME && - echo $REG | base64 -d > ~/.idapro/ida.reg && - export IDA_LICENSE=keyfile=$KEY_NAME && - idapyswitch -a ; fi; .step_gen_artifacts: &step_gen_artifacts - echo -e "\e[95m===== Generate artifacts" @@ -36,7 +22,6 @@ test_data_structures: stage: test before_script: - - *step_python_setup_install - *step_install_pyrrha_test script: - *step_run_tests @@ -56,7 +41,6 @@ test_data_structures: .run_pyrrha_test_artifacts: stage: test before_script: - - *step_python_setup_install - *step_install_pyrrha_test script: - *step_gen_artifacts @@ -87,53 +71,25 @@ test_fs: TEST_PATH: tests/test_cli.py::TestFSMapper -.test_fs-cg: +test_fs-cg: extends: - .run_pyrrha_test_artifacts before_script: - !reference [.run_pyrrha_test_artifacts, before_script] - - *step_configure_disassembler image: - name: $CONTAINER_PATH/${DISASSEMBLER}:${VERSION} + name: $CONTAINER_PATH/${BACKEND}:${VERSION} docker: user: user variables: - DB: ${DISASSEMBLER}_${VERSION}_${EXPORTER} + DB: ${BACKEND}_${VERSION} MAPPER: fs-cg - MAPPER_OPTIONS: '--disassembler ${DISASSEMBLER} --exporter 
${EXPORTER}' + MAPPER_OPTIONS: '--backend ${BACKEND}' TEST_COVERAGE_SOURCE: pyrrha_mapper.common.filesystem_mapper,pyrrha_mapper.intercg TEST_PATH: tests/test_cli.py::TestFsCgMapper TEST_SUP_OPTIONS: ${MAPPER_OPTIONS} - -test_fs-cg_ghidra: - extends: - - .test_fs-cg - variables: - DISASSEMBLER: ghidra + HEXRAYS_LICENSE: "${IDA_LICENSE}" parallel: matrix: - - VERSION: 11.1.2 - EXPORTER: binexport + - BACKEND: "ida" + VERSION: [91, 93] -test_fs-cg_ida: - extends: - - .test_fs-cg - variables: - DISASSEMBLER: ida - parallel: - matrix: - - VERSION: 84 - EXPORTER: [quokka, binexport] - - VERSION: 91 - EXPORTER: quokka - rules: - - if: $VERSION == "84" - variables: - KEY: $IDA_KEY - KEY_NAME: ida.key - REG: $IDA84_REG - - if: $VERSION == "91" - variables: - KEY: $LICENSE - KEY_NAME: ida_license.hexlic - REG: $IDA_REG diff --git a/ci/ida/.dockerignore b/ci/ida/.dockerignore new file mode 100644 index 0000000..bf9ed0a --- /dev/null +++ b/ci/ida/.dockerignore @@ -0,0 +1,6 @@ +# Exclude version-specific ida.reg backups; only the current ida.reg +# (copied by the build script before invoking docker build) is needed. +ida_*.reg + +# Exclude licence files — never baked into images, mounted at runtime only. +*.hexlic diff --git a/ci/ida/Dockerfile b/ci/ida/Dockerfile index 4c73e86..f2d91f7 100644 --- a/ci/ida/Dockerfile +++ b/ci/ida/Dockerfile @@ -24,10 +24,12 @@ # # Build command: # docker build \ -# --secret id=ida_installer,src=ida-pro_91.run \ +# --build-arg IDA_VERSION=91 \ +# --build-arg IDA_INSTALLER=ida-pro_91.run \ # -t pyrrha-ida . # -# ida.reg is generated automatically during the build (see stage 1). +# ida.reg is injected by Dockerfile.final after interactive EULA acceptance. +# See build.sh for the full two-phase build procedure. 
# # Run command (licence is mounted at runtime, never stored in the image): # docker run --rm \ @@ -46,46 +48,57 @@ FROM docker.io/library/debian:testing-slim AS ida-install ARG IDA_VERSION=91 ARG IDA_INSTALL_DIR=/opt/ida_${IDA_VERSION} +# Path to the installer .run file on the build host, passed via --build-arg. +# It is bind-mounted into the build container at a fixed target path so that +# the ARG value (which Docker cannot expand inside --mount options) is only +# used inside the shell command where substitution works normally. +ARG IDA_INSTALLER=ida-pro_${IDA_VERSION}.run + # Build-time dependencies: # - libxcb-xinerama0 : required by the .run installer's Qt bootstrap -# - xvfb : provides a virtual framebuffer so that idat can run -# headlessly long enough to accept the licence and write -# ida.reg on first launch -# - remaining libs : runtime deps that are also needed at install time +# - libopengl0 : required by IDA's Qt layer for rendering +# - remaining libs : runtime deps also needed at install time RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \ libfontconfig1 \ libmagic1 \ + libopengl0 \ libqt5gui5 \ libsecret-1-0 \ libxcb-xinerama0 \ python3-minimal \ python-is-python3 \ - xvfb \ - && mkdir -p "${IDA_INSTALL_DIR}" ~/.local/share/applications/ \ - && rm -rf /var/lib/apt/lists/* - -# Run the installer in a secret-mount RUN so the .run binary is never -# committed to any layer. The licence is intentionally NOT copied here; -# it will be mounted at runtime only (see run command above). -RUN --mount=type=secret,id=ida_installer,target=/tmp/ida_installer.run \ + && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* \ + && mkdir -p "${IDA_INSTALL_DIR}" ~/.local/share/applications/ + +# The installer is bind-mounted from the build context so the large .run +# binary is never written to any layer. Docker cannot expand ARGs inside +# --mount options, so src=. 
mounts the whole context and the ARG is used +# only inside the shell command where substitution works normally. +ARG IDA_INSTALLER +RUN --mount=type=bind,src=.,target=/build/context \ + cp "/build/context/${IDA_INSTALLER}" /tmp/ida_installer.run && \ chmod +x /tmp/ida_installer.run && \ DEBIAN_FRONTEND=noninteractive /tmp/ida_installer.run \ --mode unattended \ - --prefix "${IDA_INSTALL_DIR}" - -# Generate ida.reg by launching idat once inside a virtual framebuffer. -# The -A flag auto-accepts the EULA; idat exits after startup with no project. -# ida.reg is written to root's home by default. -RUN Xvfb :99 -screen 0 1024x768x24 & \ - sleep 2 && \ - DISPLAY=:99 "${IDA_INSTALL_DIR}/idat" -A && \ - sleep 5 && \ - kill %1 || true + --prefix "${IDA_INSTALL_DIR}" && \ + rm /tmp/ida_installer.run && \ + # Strip unneeded files from the IDA tree to reduce COPY --from size: + # - documentation + # - desktop integration files + # - uninstaller + rm -rf \ + "${IDA_INSTALL_DIR}/README_python3.txt" \ + "${IDA_INSTALL_DIR}/Uninstall IDA"* \ + "${IDA_INSTALL_DIR}/uninstall"* \ + "${IDA_INSTALL_DIR}/appico.png" \ + "${IDA_INSTALL_DIR}/hvui.png" # ======================== IDA Runtime image ================================= -# Only the IDA tree, the licence, ida.reg, and the runtime shared libraries are -# present here. No installer, no xvfb, no build tooling. +# Only the IDA tree and the runtime shared libraries are present here. +# No installer, no build tooling. +# ida.reg is NOT present at this stage — it is injected by Dockerfile.final +# after the user has accepted the EULA interactively (see build_ida.sh). FROM docker.io/library/debian:testing-slim ARG IDA_VERSION=91 @@ -102,35 +115,46 @@ ENV HEXRAYS_LICENSE=/run/secrets/ida_license ENV IDADIR=${IDA_INSTALL_DIR} # Runtime-only shared libraries required by IDA and its Qt layer. +# apt-get clean and list removal are in the same layer to avoid bloat. 
RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \ libfontconfig1 \ libmagic1 \ + libopengl0 \ + libpython3.13 \ libqt5gui5 \ libsecret-1-0 \ + libxcb-cursor0 \ python3-minimal \ + python3-venv \ python-is-python3 \ - && rm -rf /var/lib/apt/lists/* + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* # Copy only the installed IDA tree from the build stage. COPY --from=ida-install ${IDA_INSTALL_DIR} ${IDA_INSTALL_DIR} -# User creation +# ── User creation ───────────────────────────────────────────────────────────── RUN useradd --create-home -u 1000 -m user && \ chown -R user:user "${IDA_INSTALL_DIR}" USER user WORKDIR /home/user -# Copy ida.reg (generated in stage 1) into the user's IDA config directory. -COPY --from=ida-install /root/.idapro/ida.reg /home/user/.idapro/ida.reg - -# virtualenv (automatically launch at runtime) +# virtualenv (automatically activated at runtime for both interactive and +# non-interactive sessions via ENV PATH and .bashrc respectively) ENV VIRTUAL_ENV=/home/user/.venv ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" RUN python -m venv "${VIRTUAL_ENV}" && \ echo "source ${VIRTUAL_ENV}/bin/activate" >> /home/user/.bashrc -RUN "${IDA_INSTALL_DIR}/idapyswitch" -a +# Register the venv's Python interpreter with IDA so that idascript and +# qbinary can drive it programmatically. +# idapyswitch -a (auto) relies on python3-config which is not installed; +# instead point it directly to the system libpython via --force-path. +# libpython3.13 provides the shared library on Debian testing. 
+RUN libpython="$(find /usr /lib -name 'libpython3*.so*' 2>/dev/null | head -1)" && \ + [ -n "${libpython}" ] || { echo "ERROR: libpython not found"; exit 1; } && \ + "${IDA_INSTALL_DIR}/idapyswitch" --force-path "${libpython}" CMD ["/bin/bash"] diff --git a/ci/ida/Dockerfile.final b/ci/ida/Dockerfile.final new file mode 100644 index 0000000..82e9d1d --- /dev/null +++ b/ci/ida/Dockerfile.final @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023-2025 Quarkslab +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Phase 2 Dockerfile: extends the setup image produced by Dockerfile by +# injecting the ida.reg file generated during the interactive phase 1 run. +# Invoked by build.sh after ida.reg has been extracted from the container. + +ARG IMAGE_NAME=pyrrha-ida +ARG IDA_VERSION=91 + +FROM ${IMAGE_NAME}:${IDA_VERSION}-setup + +# Cache-busting ARG: set to the md5 hash of ida.reg by the build script so +# Docker does not reuse a cached layer for the COPY below. +ARG IDA_REG_HASH + +# Copy ida.reg generated during the interactive phase 1 run. +# The build script copies it into the build context as ida.reg before +# invoking docker build, so it is available as a plain build context file.
+COPY --chown=user:user ida.reg /home/user/.idapro/ida.reg diff --git a/ci/ida/build.sh b/ci/ida/build.sh new file mode 100755 index 0000000..4460081 --- /dev/null +++ b/ci/ida/build.sh @@ -0,0 +1,291 @@ +#!/usr/bin/env bash +# -*- coding: utf-8 -*- + +# Copyright 2023-2025 Quarkslab +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Build one Docker image per requested IDA version. Each version is built from +# its own installer file (ida-pro_VERSION.run) located next to this script. +# Produced images are tagged NAME:VERSION; the numerically greatest version is +# additionally tagged NAME:latest. +# +# IDA 9.x requires interactive EULA acceptance before it writes ida.reg. +# The build therefore proceeds in two phases per version: +# Phase 1 — build a setup image without ida.reg, run it interactively so +# the user can accept the EULA, then extract the resulting ida.reg +# from the stopped container. Skipped if ida_VERSION.reg already +# exists on disk from a prior run. +# Phase 2 — build the final image via Dockerfile.final, which extends the +# setup image and injects ida.reg. +# +# The installer is passed via bind-mount (no size limit, never committed to any +# layer). It must be located next to this script as ida-pro_VERSION.run. +# The licence file (idapro.hexlic) is NEVER baked into any image layer.
+# Pass it at runtime via a read-only bind mount: +# docker run -v /path/to/idapro.hexlic:/run/secrets/ida_license:ro NAME:VERSION + +set -euo pipefail + +# ── Docker command resolution ───────────────────────────────────────────────── + +# Determine whether docker must be run via sudo. A plain `docker info` is +# attempted first; if it fails (e.g. the current user is not in the docker +# group), sudo is prepended for all subsequent docker calls. +if docker info > /dev/null 2>&1; then + DOCKER="docker" +elif sudo docker info > /dev/null 2>&1; then + DOCKER="sudo docker" +else + echo "ERROR: Cannot connect to the Docker daemon (tried both 'docker' and 'sudo docker')." >&2 + exit 1 +fi +readonly DOCKER + +# ── Constants ───────────────────────────────────────────────────────────────── + +readonly SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +readonly DOCKERFILE="${SCRIPT_DIR}/Dockerfile" +readonly DOCKERFILE_FINAL="${SCRIPT_DIR}/Dockerfile.final" +readonly DEFAULT_LICENSE="${SCRIPT_DIR}/idapro.hexlic" +readonly IMAGE_NAME_DEFAULT="pyrrha-ida" + +# ── Helpers ─────────────────────────────────────────────────────────────────── + +usage() { + cat <<EOF +Usage: $(basename "$0") --version VERSION [--version VERSION ...] [OPTIONS] + +Build one Docker image per requested IDA version. Each version is built from +its own installer file (ida-pro_VERSION.run) located next to this script. +Produced images are tagged NAME:VERSION; the numerically greatest version is +additionally tagged NAME:latest. + +IDA 9.x requires interactive EULA acceptance on first launch. The build runs +in two phases: phase 1 launches IDA so you can accept the EULA and extracts +the resulting ida.reg; phase 2 builds the final image with ida.reg injected. +Phase 1 is skipped if ida_VERSION.reg already exists from a prior run. + +The licence file (idapro.hexlic) is NEVER baked into any image layer. +Pass it at runtime via a read-only bind mount: + docker run -v /path/to/idapro.hexlic:/run/secrets/ida_license:ro NAME:VERSION + +The installer is passed via bind-mount and is never committed to any layer.
+ +Options: + -v, --version VERSION IDA version number (e.g. 91). Repeatable. + Installer resolved as ./ida-pro_VERSION.run. + -n, --name NAME Base image name (default: pyrrha-ida). + Images tagged NAME:VERSION, newest also NAME:latest. + -l, --license PATH Path to idapro.hexlic (default: ./idapro.hexlic). + Validated at startup, never passed to docker build. + -h, --help Print this help and exit. + +Examples: + # Build a single version with defaults: + $(basename "$0") --version 91 + + # Build two versions under a custom image name: + $(basename "$0") --version 91 --version 92 --name myorg/ida + + # Build with a licence file stored elsewhere: + $(basename "$0") --version 91 --license /secure/idapro.hexlic +EOF + exit 0 +} + +die() { + echo "ERROR: $*" >&2 + echo >&2 + usage + exit 1 +} + +# ── Argument parsing ────────────────────────────────────────────────────────── + +versions=() +image_name="${IMAGE_NAME_DEFAULT}" +license_path="${DEFAULT_LICENSE}" + +while [[ $# -gt 0 ]]; do + case "$1" in + -v|--version) + [[ -n "${2:-}" ]] || die "--version requires an argument." + versions+=("$2") + shift 2 + ;; + -n|--name) + [[ -n "${2:-}" ]] || die "--name requires an argument." + image_name="$2" + shift 2 + ;; + -l|--license) + [[ -n "${2:-}" ]] || die "--license requires an argument." + license_path="$2" + shift 2 + ;; + -h|--help) + usage + ;; + *) + die "Unknown option: $1" + ;; + esac +done + +[[ ${#versions[@]} -gt 0 ]] || die "At least one --version is required." + +# ── Pre-flight checks ───────────────────────────────────────────────────────── + +[[ -f "${DOCKERFILE}" ]] || die "Dockerfile not found at: ${DOCKERFILE}" +[[ -f "${DOCKERFILE_FINAL}" ]] || die "Dockerfile.final not found at: ${DOCKERFILE_FINAL}" +[[ -f "${license_path}" ]] || die "Licence file not found at: ${license_path}" + +# ── Build loop ──────────────────────────────────────────────────────────────── + +# The latest version is the numerically greatest one; it will also be tagged +# as :latest after its build.
+latest_version="$(printf '%s\n' "${versions[@]}" | sort -n | tail -1)" + +for version in "${versions[@]}"; do + # Each version has its own installer: ida-pro_.run + installer_path="${SCRIPT_DIR}/ida-pro_${version}.run" + + [[ -f "${installer_path}" ]] || \ + die "Installer not found for version ${version}: ${installer_path}" + + # IDA_INSTALLER is a plain filename relative to the build context so that + # Docker bind-mounting the context can resolve it without path doubling. + installer_filename="$(basename "${installer_path}")" + + image_tag="${image_name}:${version}" + tmp_image="${image_name}:${version}-setup" + tmp_container="ida-setup-${version}" + ida_reg_path="${SCRIPT_DIR}/ida_${version}.reg" + + echo "==> Building ${image_tag}" + echo " Installer : ${installer_path}" + echo " Licence : ${license_path}" + echo " ida.reg : ${ida_reg_path} (exists: $([ -f "${ida_reg_path}" ] && echo yes || echo no))" + + # ── Phase 1: build setup image and extract ida.reg ──────────────────── + # IDA 9.x requires interactive EULA acceptance before writing ida.reg. + # Skipped if ida_.reg already exists from a prior run. + if [[ ! -f "${ida_reg_path}" ]]; then + echo "==> [Phase 1] ida.reg not found at ${ida_reg_path}, running setup..." + + echo "==> [Phase 1] Building setup image ${tmp_image}..." + ${DOCKER} build \ + --build-arg "IDA_VERSION=${version}" \ + --build-arg "IDA_INSTALLER=${installer_filename}" \ + --tag "${tmp_image}" \ + --file "${DOCKERFILE}" \ + "${SCRIPT_DIR}" + + echo "==> [Phase 1] Starting temporary container." + echo " Accept the IDA EULA when prompted, then close IDA." + echo " The container will stop automatically afterwards." + + # Remove any leftover container from a previous failed attempt. + ${DOCKER} rm "${tmp_container}" 2>/dev/null || true + + # Find the IDA GUI binary. In IDA 9.x it is simply 'ida' (no suffix); + # older versions used 'ida64'. We match exactly those two names. 
+ ida_binary="$(${DOCKER} run --rm "${tmp_image}" \ + find "/opt/ida_${version}" -maxdepth 1 -type f -executable \ + \( -name 'ida' -o -name 'ida64' \) | head -1)" + [[ -n "${ida_binary}" ]] || \ + die "Could not find IDA GUI binary (ida or ida64) in /opt/ida_${version}." + echo " IDA binary: ${ida_binary}" + + # Allow the root-owned Docker container to connect to the user's X + # display. Revoked immediately after the container stops. + xhost +local:root + + # Do NOT use --rm: we need the stopped container's filesystem to + # extract ida.reg after the user has accepted the EULA and closed IDA. + # Note: docker run does not support --mount type=secret (build-only); + # the licence is passed as a read-only bind mount instead. + # '|| true' prevents set -e from aborting the script when IDA exits + # with a non-zero code (which it does on normal close). + ${DOCKER} run --name "${tmp_container}" \ + -v "${license_path}:/run/secrets/ida_license:ro" \ + -e DISPLAY="${DISPLAY:-:0}" \ + -v /tmp/.X11-unix:/tmp/.X11-unix \ + "${tmp_image}" \ + "${ida_binary}" || true + + # Revoke the X display permission as soon as the container exits. + xhost -local:root + + echo "==> [Phase 1] Container stopped. Extracting ida.reg..." + ${DOCKER} cp "${tmp_container}:/home/user/.idapro/ida.reg" "${ida_reg_path}" || \ + die "docker cp failed — ida.reg not found in container '${tmp_container}'." \ + "Make sure you accepted the EULA and closed IDA before the container exited." + + [[ -f "${ida_reg_path}" ]] || \ + die "ida.reg was not saved to ${ida_reg_path} after docker cp." 
+ + ${DOCKER} rm "${tmp_container}" + ${DOCKER} rmi "${tmp_image}" 2>/dev/null || true + echo "==> [Phase 1] ida.reg saved to: ${ida_reg_path}" + else + echo "==> [Phase 1] Skipped — ida.reg already exists at: ${ida_reg_path}" + fi + + # ── Phase 2: build the final image with ida.reg injected ───────────── + # Uses Dockerfile.final which extends the setup image and only adds + # ida.reg, avoiding the COPY-in-wrong-stage problem of a single Dockerfile. + echo "==> [Phase 2] Building final image ${image_tag}..." + + cp "${ida_reg_path}" "${SCRIPT_DIR}/ida.reg" || \ + die "Failed to copy ${ida_reg_path} into build context." + [[ -f "${SCRIPT_DIR}/ida.reg" ]] || \ + die "ida.reg is missing from build context (${SCRIPT_DIR}/ida.reg)." + + # Pass the md5 hash of ida.reg as a build arg to bust the Docker cache at + # the COPY instruction, preventing reuse of a layer built before the file + # existed. + ida_reg_hash="$(md5sum "${SCRIPT_DIR}/ida.reg" | cut -d' ' -f1)" + + # Phase 2 needs the setup image as its base; build it if it was cleaned up. + if ! ${DOCKER} image inspect "${tmp_image}" > /dev/null 2>&1; then + echo "==> [Phase 2] Setup image not found, rebuilding ${tmp_image}..." + ${DOCKER} build \ + --build-arg "IDA_VERSION=${version}" \ + --build-arg "IDA_INSTALLER=${installer_filename}" \ + --tag "${tmp_image}" \ + --file "${DOCKERFILE}" \ + "${SCRIPT_DIR}" + fi + ${DOCKER} build \ + --build-arg "IMAGE_NAME=${image_name}" \ + --build-arg "IDA_VERSION=${version}" \ + --build-arg "IDA_REG_HASH=${ida_reg_hash}" \ + --tag "${image_tag}" \ + --file "${DOCKERFILE_FINAL}" \ + "${SCRIPT_DIR}" + + rm -f "${SCRIPT_DIR}/ida.reg" + + # Tag the newest version as :latest. + if [[ "${version}" == "${latest_version}" ]]; then + ${DOCKER} tag "${image_tag}" "${image_name}:latest" + echo "==> Also tagged ${image_tag} as ${image_name}:latest" + fi + + echo "==> Successfully built ${image_tag}" +done + +echo "==> Done." 
From 41e45de0592d755c6c5f764e3f04b07c72c9da23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Fri, 3 Apr 2026 11:57:59 +0200 Subject: [PATCH 10/62] [fix] intercg: error in argument renaming MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- src/pyrrha_mapper/intercg/fwmapper.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/pyrrha_mapper/intercg/fwmapper.py b/src/pyrrha_mapper/intercg/fwmapper.py index b0e782b..f4a08b3 100644 --- a/src/pyrrha_mapper/intercg/fwmapper.py +++ b/src/pyrrha_mapper/intercg/fwmapper.py @@ -97,7 +97,7 @@ def load_binary_args(self) -> dict[str, Any]: def load_binary( root_directory: Path, file_path: Path, - disass: Backend = Backend.IDA, + backend: Backend = Backend.IDA, ) -> tuple[Binary, dict[Symbol, list[str]] | None] | str: """Load all the binaries located in the filesystem as Binary objects. @@ -109,14 +109,14 @@ def load_binary( FileSystem object in place. 
""" try: - if disass == Backend.IDA: + if backend == Backend.IDA: ida_parser: BinaryParser = IDAParser(root_directory, file_path) return ida_parser.binary, ida_parser.call_graph - elif disass == Backend.GHIDRA: + elif backend == Backend.GHIDRA: ghidra_parser = GhidraParser(root_directory, file_path) return ghidra_parser.binary, ghidra_parser.call_graph else: - return f" disassembler {disass} is not supported" + return f" disassembler {backend} is not supported" except (FileNotFoundError, FsMapperError, SyntaxError) as e: return f"[binary mapping] {file_path.name}: ERROR: Loading error: {e}" From e3999976542d4b5972da66e07bd93853e5239859 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Fri, 3 Apr 2026 17:41:43 +0200 Subject: [PATCH 11/62] [fix] intercg: missing thunks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- src/pyrrha_mapper/intercg/loader.py | 155 +++++++++++++++++----------- 1 file changed, 96 insertions(+), 59 deletions(-) diff --git a/src/pyrrha_mapper/intercg/loader.py b/src/pyrrha_mapper/intercg/loader.py index 117a1e1..922d22b 100644 --- a/src/pyrrha_mapper/intercg/loader.py +++ b/src/pyrrha_mapper/intercg/loader.py @@ -126,55 +126,78 @@ def __init__(self, root_directory: Path, file_path: Path) -> None: # ------------------------------------------------------------------ # Maps a trampoline name → the canonical name it should forward to. 
trampoline_map: dict[str, str] = {} + to_analyse = program_data + + while len(to_analyse) > 0: + missed_data = dict() + for func_data in to_analyse.values(): + exported = ( + func_data.addr in parser_exports + or func_data.addr + 1 in parser_exports # ARM THUMB + ) - for func_data in program_data.values(): - exported = ( - func_data.addr in parser_exports - or func_data.addr + 1 in parser_exports # ARM THUMB - ) - - if func_data.type in (FuncType.LIBRARY, FuncType.NORMAL) or ( - func_data.type == FuncType.THUNK and (exported or len(func_data.calls) > 1) - ): - call_graph[func_data.symbol] = self._build_calls_list(func_data, program_data) - continue + if func_data.type in (FuncType.LIBRARY, FuncType.NORMAL) or ( + func_data.type == FuncType.THUNK and (exported or len(func_data.calls) > 1) + ): + call_graph[func_data.symbol] = self._build_calls_list(func_data, program_data) + continue - if ( - func_data.type == FuncType.THUNK - and len(func_data.calls) == 1 - and func_data.calls[0] in program_data - ): - callee_data = program_data[func_data.calls[0]] - if callee_data.type == FuncType.IMPORTED: - # Keep the less-decorated name as the canonical one - trampoline_name = func_data.name - destination_name = callee_data.name - if _count_leading_underscores(trampoline_name) > _count_leading_underscores( - destination_name + if func_data.type == FuncType.THUNK and len(func_data.calls) == 1: + if func_data.calls[0] not in program_data: + mangled_name = self._func_mangled_name(func_data.calls[0]) + if mangled_name == "": + logging.warning("Nothing found ") + continue + + func_symbol = Symbol( + name=mangled_name, + demangled_name=self._func_demangled_name(func_data.calls[0]), + is_func=True, + addr=func_data.calls[0], + ) + self._binary.add_function(func_symbol) + func = FuncData( + symbol=func_symbol, + type=self._func_type(func_data.calls[0]), + calls=self._func_children(func_data.calls[0]), + callers=self._func_parents(func_data.calls[0]), + ) + 
missed_data[func_data.calls[0]] = func + callee_data = func + else: + callee_data = program_data[func_data.calls[0]] + if callee_data.type == FuncType.IMPORTED: + # Keep the name of the thunk "strcpy, sprintf" + trampoline_name = func_data.name + destination_name = callee_data.name + # in case of nested functions (starting with _, keep the less nested one) + if _count_leading_underscores(trampoline_name) > _count_leading_underscores( + destination_name + ): + trampoline_name, destination_name = destination_name, trampoline_name + else: # Forward the call to the underlying function name + trampoline_name = func_data.name + destination_name = callee_data.name + # Resolve chains: A→B, B→C becomes A→C + while ( + destination_name in trampoline_map + and trampoline_map[destination_name] != destination_name ): - trampoline_name, destination_name = destination_name, trampoline_name - else: - trampoline_name = func_data.name - destination_name = callee_data.name - - # Resolve chains: A→B, B→C becomes A→C - while ( - destination_name in trampoline_map - and trampoline_map[destination_name] != destination_name - ): - destination_name = trampoline_map[destination_name] - trampoline_map[trampoline_name] = destination_name - for key, val in trampoline_map.items(): - if val == trampoline_name: - trampoline_map[key] = destination_name - - elif func_data.type == FuncType.THUNK and not func_data.calls and func_data.callers: - # Terminal thunk with callers but no callees — keep it - continue + destination_name = trampoline_map[destination_name] + trampoline_map[trampoline_name] = destination_name + for key, val in trampoline_map.items(): + if val == trampoline_name: + trampoline_map[key] = destination_name + + elif func_data.type == FuncType.THUNK and not func_data.calls and func_data.callers: + # Terminal thunk with callers but no callees — keep it + continue - # Remove functions not kept as exported/library/normal - if self._binary.get_function_by_name(func_data.name).addr == 
func_data.addr: - self._binary.remove_function(func_data.name) + # Remove functions not kept as exported/library/normal + if self._binary.get_function_by_name(func_data.name).addr == func_data.addr: + self._binary.remove_function(func_data.name) + to_analyse = missed_data + program_data.update(missed_data) # Apply trampoline substitutions to the final call graph self._call_graph: dict[Symbol, list[str]] = { @@ -410,9 +433,18 @@ def _iter_func_addr(self) -> Iterator[int]: self._ida_cached_func = func yield func.start_ea + def _get_import(self, addr: int) -> str | None: + res = self._ida_db.functions.get_at(addr) + if res: + return res.name + return None + def _func_mangled_name(self, addr: int) -> str: """:return: the raw name of the function at *addr*, or ``sub_``.""" func = self._get_ida_func(addr) + import_info = self._ida_db.imports.get_import_at(addr) + if import_info is not None and import_info.name is not None: + return import_info.name.split("@")[0] if func is not None: name = self._ida_db.functions.get_name(func) if name: @@ -451,21 +483,26 @@ def _func_type(self, addr: int) -> FuncType: if func is None: return FuncType.NORMAL - match self._ida_db.functions.get_flags(func): - case FunctionFlags.LIB: - return FuncType.LIBRARY - case FunctionFlags.THUNK: - callees = list(self._ida_db.functions.get_callees(func)) - if len(callees) == 1: - callee_name = self._ida_db.functions.get_name(callees[0]) - if self._ida_db.imports.exists(callee_name): - return FuncType.IMPORTED - return FuncType.THUNK - case _: - func_name = self._ida_db.functions.get_name(func) - if self._ida_db.imports.exists(func_name): + flags = self._ida_db.functions.get_flags(func) + is_imported = False + + callees = list(self._ida_db.functions.get_callees(func)) + if len(callees) == 0: + if self._ida_db.imports.get_import_at(addr): + is_imported = True + + if is_imported: + return FuncType.IMPORTED + elif FunctionFlags.THUNK in flags: + callees = list(self._ida_db.functions.get_callees(func)) + 
if len(callees) == 1: + callee_name = self._ida_db.functions.get_name(callees[0]) + if self._ida_db.imports.exists(callee_name): return FuncType.IMPORTED - return FuncType.NORMAL + return FuncType.THUNK + elif FunctionFlags.LIB in flags: + return FuncType.LIBRARY + return FuncType.NORMAL # ====================================================================== From 3a622c06d4a7cd585f85e451bba296086d3d603d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 7 Apr 2026 11:03:13 +0200 Subject: [PATCH 12/62] ci: add ghidra support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- .gitlab-ci.yml | 2 + ci/ghidra/Dockerfile | 96 ++++++++++++++-------- ci/ghidra/build.sh | 190 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 253 insertions(+), 35 deletions(-) create mode 100755 ci/ghidra/build.sh diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 0c97eef..f4fd8a3 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -92,4 +92,6 @@ test_fs-cg: matrix: - BACKEND: "ida" VERSION: [91, 93] + - BACKEND: "ghidra" + VERSION: ["12.0.4"] diff --git a/ci/ghidra/Dockerfile b/ci/ghidra/Dockerfile index 69b5785..22c3702 100644 --- a/ci/ghidra/Dockerfile +++ b/ci/ghidra/Dockerfile @@ -14,56 +14,82 @@ # See the License for the specific language governing permissions and # limitations under the License. -FROM openjdk:21-jdk-slim -SHELL ["/bin/bash", "-c"] -# ======================== Ghidra Installation ================================= +# ======================== Ghidra Download and Extraction ============================== +# Use a dedicated stage so that wget, unzip, and the zip archive itself are +# never committed to the final image layer. 
+FROM debian:bookworm-slim AS ghidra-download -ARG GHIDRA_VERSION=11.1.2 -ARG GHIDRA_RELEASE_DATE=20240709 +ARG GHIDRA_VERSION=12.0.4 +ARG GHIDRA_RELEASE_DATE=20260303 +ARG GHIDRA_SHA256=c3b458661d69e26e203d739c0c82d143cc8a4a29d9e571f099c2cf4bda62a120 ARG GHIDRA_URL=https://github.com/NationalSecurityAgency/ghidra/releases/download/Ghidra_${GHIDRA_VERSION}_build/ghidra_${GHIDRA_VERSION}_PUBLIC_${GHIDRA_RELEASE_DATE}.zip -ENV GHIDRA_INSTALL_DIR=/opt/ghidra_${GHIDRA_VERSION}_PUBLIC RUN apt-get update && \ - DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \ + apt-get install --yes --no-install-recommends \ ca-certificates \ - libfreetype6 \ - libmagic1 \ - libpython3-dev \ - python3-minimal \ - python3-pip \ - python3-venv \ - python-is-python3 \ unzip \ wget \ - && rm -rf /var/lib/apt/lists/* + && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* -RUN wget $GHIDRA_URL -O ghidra.zip && unzip ghidra.zip -d /opt/ && rm ghidra.zip +# Download, verify checksum, extract, then discard the archive immediately. +# Also strip files not needed at runtime to reduce COPY --from size. +RUN wget --no-verbose "${GHIDRA_URL}" -O /tmp/ghidra.zip && \ + echo "${GHIDRA_SHA256} /tmp/ghidra.zip" | sha256sum --check --strict && \ + unzip -q /tmp/ghidra.zip -d /opt/ && \ + rm /tmp/ghidra.zip && \ + find /opt/ghidra_${GHIDRA_VERSION}_PUBLIC -name "*.bat" -delete && \ + rm -rf \ + /opt/ghidra_${GHIDRA_VERSION}_PUBLIC/docs \ + /opt/ghidra_${GHIDRA_VERSION}_PUBLIC/Extensions/Eclipse \ + /opt/ghidra_${GHIDRA_VERSION}_PUBLIC/licenses -ENV PATH=${GHIDRA_INSTALL_DIR}:${PATH} +# ======================== Ghidra Installation and Runtime Image ======================= +# eclipse-temurin:21-jre-jammy is the current maintained replacement for the +# deprecated openjdk:21-jdk-slim. The JRE is sufficient for running Ghidra +# headless — the full JDK is not required at runtime. 
+FROM eclipse-temurin:21-jre-jammy -# ======================== Plugin Installation ============================== +ARG GHIDRA_VERSION=12.0.4 + +# Exported so that pyghidra and other tools spawned inside the container can +# locate the Ghidra installation without extra configuration. +ENV GHIDRA_INSTALL_DIR=/opt/ghidra_${GHIDRA_VERSION}_PUBLIC +ENV PATH=${GHIDRA_INSTALL_DIR}:${PATH} -ARG BINEXPORT_URL=https://github.com/google/binexport/archive/refs/heads/main.zip -ARG GRADLE_VERSION=8.14.3 -ARG GRADLE_URL=https://services.gradle.org/distributions/gradle-${GRADLE_VERSION}-bin.zip -ARG GHIDRA_PLUGIN_DIR=/root/.config/ghidra/ghidra_${GHIDRA_VERSION}_PUBLIC/Extensions +# Copy only the extracted Ghidra tree from the download stage — no wget/unzip +# tooling or archive bytes are present in this layer. +COPY --from=ghidra-download /opt/ghidra_${GHIDRA_VERSION}_PUBLIC ${GHIDRA_INSTALL_DIR} -RUN wget ${GRADLE_URL} -O gradle.zip \ - && unzip gradle.zip -d gradle \ - && wget ${BINEXPORT_URL} -O binexport.zip \ - && mkdir -p ${GHIDRA_PLUGIN_DIR} \ - && unzip binexport.zip binexport-main/java/* -d binexport \ - && (cd binexport/binexport-main/java/ && /gradle/gradle-${GRADLE_VERSION}/bin/gradle buildExtension -PGHIDRA_INSTALL_DIR=${GHIDRA_INSTALL_DIR} && unzip dist/ghidra_${GHIDRA_VERSION}_PUBLIC_$( date +%Y%m%d)_BinExport.zip -d ${GHIDRA_PLUGIN_DIR}) \ - && rm -rf gradle.zip gradle binexport.zip binexport \ - && apt-get purge --yes wget unzip && apt --yes autoremove +# Install the minimal runtime dependencies for Ghidra headless + pyghidra and +# for building/installing the pyrrha-mapper Python package. +# All apt artefacts are removed in the same RUN layer to keep layer size down. 
+RUN apt-get update && \ + apt-get install --yes --no-install-recommends \ + libfreetype6 \ + libmagic1 \ + python3-minimal \ + python3-pip \ + python3-venv \ + python-is-python3 \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* -# ======================== USER CREATION ============================== -ARG USER_GHIDRA_PLUGIN_DIR=/home/user/.config/ghidra/ghidra_${GHIDRA_VERSION}_PUBLIC/Extensions +# User creation +RUN useradd --create-home -u 1000 -m user && \ + chown -R user:user "${GHIDRA_INSTALL_DIR}" -RUN useradd --create-home -u 1000 -m user && chown -R user:user $GHIDRA_INSTALL_DIR -RUN mkdir -p ${USER_GHIDRA_PLUGIN_DIR} && mv ${GHIDRA_PLUGIN_DIR}/* ${USER_GHIDRA_PLUGIN_DIR} && chown -R user:user /home/user/.config USER user WORKDIR /home/user -CMD ["/bin/bash"] \ No newline at end of file +# Python virtual environment — activated automatically for both interactive +# shells (.bashrc) and non-interactive processes (ENV PATH). +ENV VIRTUAL_ENV=/home/user/.venv +ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" + +RUN python -m venv "${VIRTUAL_ENV}" && \ + pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir pyrrha-mapper && \ + echo "source ${VIRTUAL_ENV}/bin/activate" >> /home/user/.bashrc + +CMD ["/bin/bash"] diff --git a/ci/ghidra/build.sh b/ci/ghidra/build.sh new file mode 100755 index 0000000..02e6617 --- /dev/null +++ b/ci/ghidra/build.sh @@ -0,0 +1,190 @@ +#!/usr/bin/env bash +# -*- coding: utf-8 -*- + +# Copyright 2023-2025 Quarkslab +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Build a Docker image with Ghidra and pyrrha-mapper. +# +# The Ghidra release is downloaded and verified at build time — no local +# installer file is required. The produced image is tagged NAME:VERSION. +# +# Usage: +# ./build.sh [OPTIONS] +# +# Options: +# -v, --version Ghidra version (default: 12.0.4). +# -d, --date Ghidra release date string (default: 20260303). +# -s, --sha256 Expected SHA-256 of the Ghidra zip (required when +# overriding --version, to ensure integrity). +# -n, --name Base image name (default: pyrrha-ghidra). +# Image tagged NAME:VERSION, also NAME:latest. +# -h, --help Print this help and exit. +# +# Examples: +# # Build with defaults: +# ./build.sh +# +# # Build a specific version: +# ./build.sh --version 12.0.4 --date 20260303 \ +# --sha256 c3b458661d69e26e203d739c0c82d143cc8a4a29d9e571f099c2cf4bda62a120 +# +# # Build under a custom image name: +# ./build.sh --name myorg/ghidra + +set -euo pipefail + +# ── Docker command resolution ───────────────────────────────────────────────── + +# Determine whether docker must be run via sudo. A plain `docker info` is +# attempted first; if it fails (e.g. the current user is not in the docker +# group), sudo is prepended for all subsequent docker calls. +if docker info > /dev/null 2>&1; then + DOCKER="docker" +elif sudo docker info > /dev/null 2>&1; then + DOCKER="sudo docker" +else + echo "ERROR: Cannot connect to the Docker daemon (tried both 'docker' and 'sudo docker')." >&2 + exit 1 +fi +readonly DOCKER + +# ── Constants ───────────────────────────────────────────────────────────────── + +readonly SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +readonly DOCKERFILE="${SCRIPT_DIR}/Dockerfile" +readonly IMAGE_NAME_DEFAULT="pyrrha-ghidra" + +# Default Ghidra release — update these when a new version is published.
+readonly DEFAULT_VERSION="12.0.4" +readonly DEFAULT_DATE="20260303" +readonly DEFAULT_SHA256="c3b458661d69e26e203d739c0c82d143cc8a4a29d9e571f099c2cf4bda62a120" + +# ── Helpers ─────────────────────────────────────────────────────────────────── + +usage() { + cat <: and also :latest. + +Options: + -v, --version Ghidra version (default: ${DEFAULT_VERSION}). + -d, --date Ghidra release date string (default: ${DEFAULT_DATE}). + -s, --sha256 Expected SHA-256 of the Ghidra zip. + Required when overriding --version. + Default: ${DEFAULT_SHA256} + -n, --name Base image name (default: ${IMAGE_NAME_DEFAULT}). + Image tagged : and :latest. + -h, --help Print this help and exit. + +Examples: + # Build with defaults: + $(basename "$0") + + # Build a specific version: + $(basename "$0") --version 12.0.4 --date 20260303 \\ + --sha256 c3b458661d69e26e203d739c0c82d143cc8a4a29d9e571f099c2cf4bda62a120 + + # Build under a custom image name: + $(basename "$0") --name myorg/ghidra +EOF + exit 0 +} + +die() { + echo "ERROR: $*" >&2 + echo >&2 + usage + exit 1 +} + +# ── Argument parsing ────────────────────────────────────────────────────────── + +ghidra_version="${DEFAULT_VERSION}" +ghidra_date="${DEFAULT_DATE}" +ghidra_sha256="${DEFAULT_SHA256}" +image_name="${IMAGE_NAME_DEFAULT}" +version_overridden=false + +while [[ $# -gt 0 ]]; do + case "$1" in + -v|--version) + [[ -n "${2:-}" ]] || die "--version requires an argument." + ghidra_version="$2" + version_overridden=true + shift 2 + ;; + -d|--date) + [[ -n "${2:-}" ]] || die "--date requires an argument." + ghidra_date="$2" + shift 2 + ;; + -s|--sha256) + [[ -n "${2:-}" ]] || die "--sha256 requires an argument." + ghidra_sha256="$2" + shift 2 + ;; + -n|--name) + [[ -n "${2:-}" ]] || die "--name requires an argument." 
+ image_name="$2" + shift 2 + ;; + -h|--help) + usage + ;; + *) + die "Unknown option: $1" + ;; + esac +done + +# If the user overrode --version but not --sha256, warn that the default +# SHA-256 is almost certainly wrong for a different version. +if [[ "${version_overridden}" == true && "${ghidra_sha256}" == "${DEFAULT_SHA256}" ]]; then + die "You overrode --version but not --sha256. " \ + "Please provide the correct SHA-256 for Ghidra ${ghidra_version} via --sha256." +fi + +# ── Pre-flight checks ───────────────────────────────────────────────────────── + +[[ -f "${DOCKERFILE}" ]] || die "Dockerfile not found at: ${DOCKERFILE}" + +# ── Build ───────────────────────────────────────────────────────────────────── + +image_tag="${image_name}:${ghidra_version}" + +echo "==> Building ${image_tag}" +echo " Ghidra version : ${ghidra_version}" +echo " Release date : ${ghidra_date}" +echo " SHA-256 : ${ghidra_sha256}" + +${DOCKER} build \ + --build-arg "GHIDRA_VERSION=${ghidra_version}" \ + --build-arg "GHIDRA_RELEASE_DATE=${ghidra_date}" \ + --build-arg "GHIDRA_SHA256=${ghidra_sha256}" \ + --tag "${image_tag}" \ + --file "${DOCKERFILE}" \ + "${SCRIPT_DIR}" + +${DOCKER} tag "${image_tag}" "${image_name}:latest" + +echo "==> Successfully built ${image_tag}" +echo "==> Also tagged as ${image_name}:latest" +echo "==> Done." 
+ From 93cfd04d2d66feff23e6f278b74bcfccc4f6b7c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 7 Apr 2026 14:09:06 +0200 Subject: [PATCH 13/62] [fix] intercg: fix missing thunk in Ghidra and extend ignore list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- src/pyrrha_mapper/intercg/fwmapper.py | 11 ++++- src/pyrrha_mapper/intercg/loader.py | 67 ++++++++++++++++++++++++--- 2 files changed, 69 insertions(+), 9 deletions(-) diff --git a/src/pyrrha_mapper/intercg/fwmapper.py b/src/pyrrha_mapper/intercg/fwmapper.py index f4a08b3..eb6c7ff 100644 --- a/src/pyrrha_mapper/intercg/fwmapper.py +++ b/src/pyrrha_mapper/intercg/fwmapper.py @@ -37,7 +37,14 @@ from pyrrha_mapper.intercg.loader import BinaryParser, GhidraParser, IDAParser from pyrrha_mapper.types import Backend, ResolveDuplicateOption -IGNORE_LIST = ["__gmon_start__"] +IGNORE_LIST = [ + "__gmon_start__", + "_ITM_deregisterTMCloneTable", + "_ITM_registerTMCloneTable", + "__TMC_END__", + "deregister_tm_clones", + "register_tm_clones", +] NUMBAT_UI_BIN = "NumbatUi" @@ -112,7 +119,7 @@ def load_binary( if backend == Backend.IDA: ida_parser: BinaryParser = IDAParser(root_directory, file_path) return ida_parser.binary, ida_parser.call_graph - elif backend == Backend.GHIDRA: + elif backend == Backend.GHIDRA: ghidra_parser = GhidraParser(root_directory, file_path) return ghidra_parser.binary, ghidra_parser.call_graph else: diff --git a/src/pyrrha_mapper/intercg/loader.py b/src/pyrrha_mapper/intercg/loader.py index 922d22b..58a5929 100644 --- a/src/pyrrha_mapper/intercg/loader.py +++ b/src/pyrrha_mapper/intercg/loader.py @@ -531,6 +531,7 @@ def _initiate_bin_parser(self, root_directory: Path, file_path: Path, image_base self._ghidra_cached_func = None self._ghidra_load_base: int = 0 self._ghidra_monitor = None + self._ghidra_exported_parser_addrs: set[int] = set() full_path = root_directory / file_path 
self._ghidra_project_dir = Path(tempfile.mkdtemp(prefix=f"ghidra_{os.getpid()}_")) @@ -563,7 +564,13 @@ def _initiate_bin_parser(self, root_directory: Path, file_path: Path, image_base program = flat_api.getCurrentProgram() self._ghidra_program = program + # Derive load base from the program itself, not from LIEF's image_base, + # so that _to_ghidra_address / _to_parser_addr are always consistent. self._ghidra_load_base = program.getImageBase().getOffset() + # Build the exported-address set once so _func_type can check it cheaply. + self._ghidra_exported_parser_addrs: set[int] = { + lief_addr - self._binary.image_base for lief_addr in self._binary.exported_funcs_by_addr + } self._ghidra_func_manager = program.getFunctionManager() self._ghidra_symbol_table = program.getSymbolTable() self._ghidra_ext_manager = program.getExternalManager() @@ -605,14 +612,34 @@ def _to_parser_addr(self, ghidra_offset: int) -> int: return ghidra_offset - self._ghidra_load_base def _get_ghidra_func(self, parser_addr: int): - """:return: the Ghidra Function at *parser_addr*, with a single-entry cache.""" + """:return: the Ghidra Function at *parser_addr*, with a single-entry cache. + + Falls back to ``getFunctionContaining`` when ``getFunctionAt`` returns + ``None``, which handles the ARM THUMB case where the parser address + may be offset by one from the real entry point stored by Ghidra. + """ if ( self._ghidra_cached_func is not None and self._to_parser_addr(self._ghidra_cached_func.getEntryPoint().getOffset()) == parser_addr ): return self._ghidra_cached_func - return self._ghidra_func_manager.getFunctionAt(self._to_ghidra_address(parser_addr)) + + ghidra_addr = self._to_ghidra_address(parser_addr) + func = self._ghidra_func_manager.getFunctionAt(ghidra_addr) + if func is None: + # getFunctionContaining handles mid-function addresses and the ARM + # THUMB ±1 offset; only accept the result when the entry point + # matches exactly (after rounding) to avoid false positives. 
+ func = self._ghidra_func_manager.getFunctionContaining(ghidra_addr) + if func is not None: + entry_parser_addr = self._to_parser_addr(func.getEntryPoint().getOffset()) + if abs(entry_parser_addr - parser_addr) > 1: + func = None + + if func is not None: + self._ghidra_cached_func = func + return func # ------------------------------------------------------------------ # BinaryParser interface @@ -620,13 +647,26 @@ def _get_ghidra_func(self, parser_addr: int): def _is_func_start(self, addr: int) -> bool: """:return: True if *addr* (parser space) is a known Ghidra function entry.""" - return self._ghidra_func_manager.getFunctionAt(self._to_ghidra_address(addr)) is not None + return self._get_ghidra_func(addr) is not None def _iter_func_addr(self) -> Iterator[int]: - """Yield parser-space entry-point addresses of every Ghidra function.""" + """Yield parser-space entry-point addresses of every non-external Ghidra function. + + ``getFunctions(True)`` skips functions that live in Ghidra's external + program space (imported stubs resolved to library addresses). Those are + handled separately by the LIEF import tracking in ``BinaryParser``. + """ + seen_addrs: set[int] = set() for func in self._ghidra_func_manager.getFunctions(True): + # Skip external-space functions — they are not mapped in the binary. 
+ if func.isExternal(): + continue self._ghidra_cached_func = func - yield self._to_parser_addr(func.getEntryPoint().getOffset()) + parser_addr = self._to_parser_addr(func.getEntryPoint().getOffset()) + if parser_addr in seen_addrs: + continue + seen_addrs.add(parser_addr) + yield parser_addr def _func_mangled_name(self, addr: int) -> str: """:return: the raw name of the function at *addr*, or ``sub_``.""" @@ -662,6 +702,8 @@ def _func_children(self, addr: int) -> list[int]: seen: set[str] = set() result: list[int] = [] for callee in func.getCalledFunctions(self._ghidra_monitor): + if callee.isExternal(): + continue name = callee.getName() if name in seen: continue @@ -678,6 +720,8 @@ def _func_parents(self, addr: int) -> list[int]: seen: set[str] = set() result: list[int] = [] for caller in func.getCallingFunctions(self._ghidra_monitor): + if caller.isExternal(): + continue name = caller.getName() if name in seen: continue @@ -691,6 +735,12 @@ def _func_type(self, addr: int) -> FuncType: Thunk stubs that resolve to external functions are classified as ``IMPORTED`` so the trampoline resolution in ``BinaryParser`` correctly forwards all callers to the imported symbol name. + + Exception: if the thunk is itself exported (i.e. it appears in the + binary's export table), it must be kept as ``THUNK`` so that + ``BinaryParser`` adds it to the call graph rather than silently + dropping it. ``IMPORTED`` is reserved for non-exported stubs whose + only purpose is to forward calls to an external symbol. """ func = self._get_ghidra_func(addr) if func is None: @@ -700,10 +750,13 @@ def _func_type(self, addr: int) -> FuncType: return FuncType.IMPORTED if func.isThunk(): - # Resolve thunk chain; classify as IMPORTED if it ends at an external + # Resolve thunk chain; classify as IMPORTED only when the thunk is + # not exported — exported thunks must remain visible in the call + # graph so BinaryParser does not drop them. 
thunked = func.getThunkedFunction(True) if thunked is not None and thunked.isExternal(): - return FuncType.IMPORTED + if addr not in self._ghidra_exported_parser_addrs: + return FuncType.IMPORTED return FuncType.THUNK # Heuristic: function in a namespace matching a known external library From debff98568bd405be431869373463e154338d05c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 7 Apr 2026 15:22:51 +0200 Subject: [PATCH 14/62] tests: add possibility to record artifacts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- tests/conftest.py | 54 +++++++++++++++++++++++++++++++++++++++++++++-- tests/test_cli.py | 4 ++++ 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index aa3193d..f21f422 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,12 +1,62 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023-2025 Quarkslab +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Pytest configuration and shared fixtures.""" + +import os +import shutil +from pathlib import Path + import pytest from pyrrha_mapper.types import Backend -def pytest_addoption(parser): +def pytest_addoption(parser: pytest.Parser) -> None: + """Register custom CLI options.""" parser.addoption( "--backend", action="store", help="backend", choices={x.name.lower() for x in Backend}, - ) \ No newline at end of file + ) + + +@pytest.fixture(autouse=True) +def _collect_export_artifacts(request: pytest.FixtureRequest) -> None: + """Copy artifacts produced by export_res to PYTEST_ARTIFACTS_DIR when it is set.""" + artifacts_dir = os.environ.get("PYTEST_ARTIFACTS_DIR") + if not artifacts_dir: + return + # Only act when the test used the export_res fixture. + if "export_res" not in request.fixturenames: + return + + def _copy() -> None: + try: + export_res = request.getfixturevalue("export_res") + except pytest.FixtureLookupError: + return + dest = Path(artifacts_dir) + dest.mkdir(parents=True, exist_ok=True) + for path in [ + export_res.export_path, + export_res.db_path, + export_res.project_path, + ]: + if path.exists(): + shutil.copy2(path, dest / path.name) + + request.addfinalizer(_copy) diff --git a/tests/test_cli.py b/tests/test_cli.py index b4cb69b..c3892c5 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -97,6 +97,10 @@ class ExecResults(NamedTuple): # noqa: D106 res: Result db_path: Path + @property + def project_path(self) -> Path: # noqa: D102 + return self.db_path.with_suffix(".srctrlprj") + @property def export_path(self) -> Path: # noqa: D102 return self.db_path.with_suffix(".json") From 00c7eb0cfe5c816bbf39b4de7b80f4b967950d0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 7 Apr 2026 15:23:15 +0200 Subject: [PATCH 15/62] ci: export artifacts from test, remove artifact step MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- .gitlab-ci.yml | 49 
+++++++++++++------------------------------------ 1 file changed, 13 insertions(+), 36 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f4fd8a3..0684532 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -3,14 +3,6 @@ - echo -e "\e[95m===== Install Pyrrha with test extension" - pip install '.[test]' - -.step_gen_artifacts: &step_gen_artifacts - - echo -e "\e[95m===== Generate artifacts" - - mkdir -p ${ARTIFACTS} - - cp -r tests ${ARTIFACTS} - - (cd ${ARTIFACTS} && pyrrha $MAPPER --db ${DB} --debug tests/test_fw ${MAPPER_OPTIONS}) - - ls ${ARTIFACTS} - .step_run_tests: &step_run_tests - echo -e "\e[95m===== Tests" - coverage run --source=${TEST_COVERAGE_SOURCE} -m pytest --junitxml=report.xml -vvv -x ${TEST_SUP_OPTIONS} ${TEST_PATH} @@ -22,9 +14,13 @@ test_data_structures: stage: test before_script: - - *step_install_pyrrha_test + - echo -e "\e[95m===== Install Pyrrha with test extension" + - pip install '.[test]' script: - - *step_run_tests + - echo -e "\e[95m===== Tests" + - coverage run --source=${TEST_COVERAGE_SOURCE} -m pytest --junitxml=report.xml -vvv -x ${TEST_SUP_OPTIONS} ${TEST_PATH} + - coverage xml + - coverage report image: python:latest variables: TEST_COVERAGE_SOURCE: pyrrha_mapper.common.objects @@ -38,44 +34,25 @@ test_data_structures: path: coverage.xml #========================== MAPPERS TESTS ==================================== -.run_pyrrha_test_artifacts: - stage: test - before_script: - - *step_install_pyrrha_test - script: - - *step_gen_artifacts - - *step_run_tests - coverage: '/(?i)total.*? 
(100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/' +test_fs: + extends: + - test_data_structures artifacts: name: db_$CI_JOB_NAME_SLUG - when: always paths: - - ${ARTIFACTS}/${DB}.srctrldb - - ${ARTIFACTS}/${DB}.srctrlprj - reports: - junit: report.xml - coverage_report: - coverage_format: cobertura - path: coverage.xml - variables: - ARTIFACTS: tmp/artifacts - -test_fs: - extends: - - .run_pyrrha_test_artifacts - image: python:latest + - test_artifacts/ + when: always variables: DB: fs MAPPER: fs TEST_COVERAGE_SOURCE: pyrrha_mapper.common.filesystem_mapper,pyrrha_mapper.fs TEST_PATH: tests/test_cli.py::TestFSMapper + PYTEST_ARTIFACTS_DIR: test_artifacts test_fs-cg: extends: - - .run_pyrrha_test_artifacts - before_script: - - !reference [.run_pyrrha_test_artifacts, before_script] + - test_fs image: name: $CONTAINER_PATH/${BACKEND}:${VERSION} docker: From c23207fcbd6bc4ed18035fc9eb532a57e96c082b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 7 Apr 2026 16:02:11 +0200 Subject: [PATCH 16/62] ci: factorize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- .gitlab-ci.yml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 0684532..1256ccd 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,14 +1,3 @@ -#========================== STEPS USED IN WORKFLOWS ==================================== -.step_install_pyrrha_test: &step_install_pyrrha_test - - echo -e "\e[95m===== Install Pyrrha with test extension" - - pip install '.[test]' - -.step_run_tests: &step_run_tests - - echo -e "\e[95m===== Tests" - - coverage run --source=${TEST_COVERAGE_SOURCE} -m pytest --junitxml=report.xml -vvv -x ${TEST_SUP_OPTIONS} ${TEST_PATH} - - coverage xml - - coverage report - #========================== OBJECTS TESTS ==================================== test_data_structures: From 8f734a70b32f8abe1d08a452b40e5927ff70f426 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Fri, 17 Apr 2026 13:28:29 +0200 Subject: [PATCH 17/62] [fix] ci: Ghidra Dockerfile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- ci/ghidra/Dockerfile | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/ci/ghidra/Dockerfile b/ci/ghidra/Dockerfile index 22c3702..1d45454 100644 --- a/ci/ghidra/Dockerfile +++ b/ci/ghidra/Dockerfile @@ -45,10 +45,10 @@ RUN wget --no-verbose "${GHIDRA_URL}" -O /tmp/ghidra.zip && \ /opt/ghidra_${GHIDRA_VERSION}_PUBLIC/licenses # ======================== Ghidra Installation and Runtime Image ======================= -# eclipse-temurin:21-jre-jammy is the current maintained replacement for the -# deprecated openjdk:21-jdk-slim. The JRE is sufficient for running Ghidra -# headless — the full JDK is not required at runtime. -FROM eclipse-temurin:21-jre-jammy +# eclipse-temurin:21-jdk-jammy is the current maintained replacement for the +# deprecated openjdk:21-jdk-slim. The full JDK is required as Ghidra's +# launcher validates the Java installation and rejects JRE-only setups. +FROM eclipse-temurin:21-jdk-jammy ARG GHIDRA_VERSION=12.0.4 @@ -63,18 +63,27 @@ COPY --from=ghidra-download /opt/ghidra_${GHIDRA_VERSION}_PUBLIC ${GHIDRA_INSTAL # Install the minimal runtime dependencies for Ghidra headless + pyghidra and # for building/installing the pyrrha-mapper Python package. +# Python 3.11 is installed via the deadsnakes PPA as Ubuntu 22.04 (Jammy) +# ships 3.10 by default. # All apt artefacts are removed in the same RUN layer to keep layer size down. 
RUN apt-get update && \ + apt-get install --yes --no-install-recommends \ + software-properties-common \ + && add-apt-repository ppa:deadsnakes/ppa && \ + apt-get update && \ apt-get install --yes --no-install-recommends \ libfreetype6 \ libmagic1 \ - python3-minimal \ + python3.11 \ + python3.11-venv \ python3-pip \ - python3-venv \ - python-is-python3 \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* +# Make python3.11 the default python3 and python. +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \ + update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 + # User creation RUN useradd --create-home -u 1000 -m user && \ chown -R user:user "${GHIDRA_INSTALL_DIR}" @@ -87,9 +96,9 @@ WORKDIR /home/user ENV VIRTUAL_ENV=/home/user/.venv ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" -RUN python -m venv "${VIRTUAL_ENV}" && \ +RUN python3.11 -m venv "${VIRTUAL_ENV}" && \ pip install --no-cache-dir --upgrade pip && \ pip install --no-cache-dir pyrrha-mapper && \ echo "source ${VIRTUAL_ENV}/bin/activate" >> /home/user/.bashrc -CMD ["/bin/bash"] +CMD ["/bin/bash"] \ No newline at end of file From ee9a11985b41d48507ce964d8d4cf1dd48c202c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Fri, 17 Apr 2026 13:28:58 +0200 Subject: [PATCH 18/62] common, intercg: various fixes around adresses and demangled names MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- pyproject.toml | 7 +- src/pyrrha_mapper/common/filesystem_mapper.py | 25 +- src/pyrrha_mapper/common/objects.py | 18 +- src/pyrrha_mapper/fs/imports_mapper.py | 45 +- src/pyrrha_mapper/intercg/fwmapper.py | 216 +++++++- src/pyrrha_mapper/intercg/loader.py | 475 ++++++++++++++---- 6 files changed, 628 insertions(+), 158 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index fef14b0..7ed00b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,11 
+25,10 @@ authors = [ ] readme = "README.md" description = "A mapper collection for firmware analysis" -requires-python = ">=3.10" +requires-python = ">=3.11" license = { text = "Apache License 2.0" } classifiers = [ 'Development Status :: 4 - Beta', - 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', 'Programming Language :: Python :: 3.12', 'Programming Language :: Python :: 3.13', @@ -77,7 +76,7 @@ test = ['pytest', 'pytest-cov'] typechecking = ['mypy'] [tool.setuptools.dynamic] -version = {attr = "pyrrha_mapper.__version__"} +version = {attr = "pyrrha_mapper.__version__"} [tool.mypy] plugins = ['pydantic.mypy'] @@ -107,5 +106,3 @@ convention = "numpy" [tool.ruff.format] line-ending = "auto" docstring-code-format = true - - diff --git a/src/pyrrha_mapper/common/filesystem_mapper.py b/src/pyrrha_mapper/common/filesystem_mapper.py index 62813b1..7ad55ef 100755 --- a/src/pyrrha_mapper/common/filesystem_mapper.py +++ b/src/pyrrha_mapper/common/filesystem_mapper.py @@ -126,7 +126,7 @@ def record_binary_in_db(self, binary: Binary, log_prefix: str = "") -> Binary: if binary.id is None: logging.error(f"{log_prefix}: Record of binary failed.") return binary - # dict demangled_name -> id to check if a demangled name has already been recorded + recorded_symb: dict[str, int] = dict() for symbol in set(binary.iter_exported_symbols()): if symbol.demangled_name in recorded_symb: @@ -135,6 +135,11 @@ def record_binary_in_db(self, binary: Binary, log_prefix: str = "") -> Binary: "common node for these symbols" ) symbol.id = recorded_symb[symbol.demangled_name] + # Also propagate the id to any other symbol registered under + # the same mangled name (e.g. secondary demangled-key entries). 
+ for other in binary.exported_functions.values(): + if other.name == symbol.name and other.id is None: + other.id = symbol.id continue if symbol.is_func: symbol.id = self.db_interface.record_method( @@ -159,6 +164,11 @@ def record_binary_in_db(self, binary: Binary, log_prefix: str = "") -> Binary: try: self.db_interface.record_public_access(symbol.id) recorded_symb[symbol.demangled_name] = symbol.id + # Propagate id to all symbols sharing the same mangled name + # (covers secondary demangled-key registrations). + for other in binary.exported_functions.values(): + if other.name == symbol.name and other.id is None: + other.id = symbol.id except DBException as e: raise PyrrhaError( f"{log_prefix}: Cannot register access to symbol {symbol.demangled_name}: " @@ -166,6 +176,16 @@ def record_binary_in_db(self, binary: Binary, log_prefix: str = "") -> Binary: ) from e for symbol in set(binary.iter_not_exported_functions()): + # Skip if this demangled name was already recorded as an exported + # symbol — same demangled name means same DB node, and calling + # record_private_access on it would violate the UNIQUE constraint. 
+ if symbol.demangled_name in recorded_symb: + logging.debug( + f"{log_prefix}: demangled name {symbol.demangled_name} already recorded " + "as exported, skipping internal registration" + ) + symbol.id = recorded_symb[symbol.demangled_name] + continue symbol.id = self.db_interface.record_method( symbol.demangled_name, parent_id=binary.id, @@ -176,6 +196,7 @@ def record_binary_in_db(self, binary: Binary, log_prefix: str = "") -> Binary: else: try: self.db_interface.record_private_access(symbol.id) + recorded_symb[symbol.demangled_name] = symbol.id except DBException as e: raise PyrrhaError( f"{log_prefix}: Cannot register access to symbol" @@ -183,7 +204,7 @@ def record_binary_in_db(self, binary: Binary, log_prefix: str = "") -> Binary: ) from e return binary - + def record_symlink_in_db(self, sym: Symlink, log_prefix: str = "") -> Symlink: """Record into DB the symlink and its link to its target. diff --git a/src/pyrrha_mapper/common/objects.py b/src/pyrrha_mapper/common/objects.py index 32a3c0a..e0969cb 100644 --- a/src/pyrrha_mapper/common/objects.py +++ b/src/pyrrha_mapper/common/objects.py @@ -102,18 +102,18 @@ class Binary(FileSystemComponent): ) # warning only symbols which are not functions exported_functions: dict[str, Symbol] = Field(default_factory=dict) - # Fields for call graph representation - # functions is both: internal functions + exported functions + # Call graph fields internal_functions: dict[str, Symbol] = Field(default_factory=dict) calls: dict[str, list[Symbol]] = Field(default_factory=dict) - # ELF specific fields + # ELF-specific fields version_requirement: dict[str, list[str]] = Field( default_factory=dict ) # dict(symbol_name, list(requirements)) - # Runtime-only field: virtual address at which the binary is loaded. 
+ # Runtime-only (excluded from serialisation) image_base: int = Field(default=0, exclude=True) + is_relocatable: bool = Field(default=False, exclude=True) @field_validator("internal_functions", "exported_functions", mode="after") @classmethod @@ -176,6 +176,9 @@ def add_exported_symbol(self, symbol: Symbol, symbol_name: str = "") -> None: if symbol.is_func: self.exported_functions[symbol_name] = symbol self.exported_symbols.pop(symbol_name, None) + # Remove from internal_functions if it was previously registered + # there (e.g. LIEF yields the same symbol via .symtab then .dynsym). + self.internal_functions.pop(symbol_name, None) else: self.exported_symbols[symbol_name] = symbol self.exported_functions.pop(symbol_name, None) @@ -423,8 +426,7 @@ def __repr__(self): # noqa: D105 f"bins={len(self.binaries)}, symlinks={len(self.symlinks)})" ) - # ------------------------------ Overload Pydantic methods ------------------------- - # Always export by aliases, set always excluded attributes + # Pydantic overrides: always export by aliases, exclude runtime-only fields. @field_serializer( "binaries", mode="plain", when_used="always", return_type=dict[str | Path, dict] ) @@ -526,9 +528,7 @@ def fs_bin_validate(cls, data: Any, info: ValidationInfo) -> Any: raise ValueError(f"Imported lib '{lib_path}' not listed in binaries") res[bin_path].add_imported_library(res[lib_path_obj]) - # optmimize version by replacing every iteration of the same symbol (same id) - # by one object - # 1. generate dict of symbols by ids + # Deduplicate: replace repeated Symbol instances with the same id by one object. 
symbols_by_ids: dict[int, Symbol] = { s.id: s for bin in res.values() for s in bin.iter_exported_symbols() if s.id is not None } diff --git a/src/pyrrha_mapper/fs/imports_mapper.py b/src/pyrrha_mapper/fs/imports_mapper.py index 48a3809..1cdbea0 100644 --- a/src/pyrrha_mapper/fs/imports_mapper.py +++ b/src/pyrrha_mapper/fs/imports_mapper.py @@ -72,7 +72,6 @@ def load_binary(root_directory: Path, file_path: Path) -> tuple[Binary, Any] | s raise: FsMapperError if cannot load it :return: bin object and additionnal info if needed or a string in case of error """ - # compute absolute path but from root_directory base base = Path(root_directory.anchor) rel_path = base.joinpath(file_path.relative_to(root_directory)) @@ -88,6 +87,7 @@ def load_binary(root_directory: Path, file_path: Path) -> tuple[Binary, Any] | s return f"Lief cannot parse {file_path}" bin_obj.image_base = parsing_res.imagebase + bin_obj.is_relocatable = parsing_res.header.file_type == lief.ELF.Header.FILE_TYPE.REL # parse imported libs for lib in parsing_res.libraries: bin_obj.add_imported_library_name(str(lib)) @@ -97,27 +97,50 @@ def load_binary(root_directory: Path, file_path: Path) -> tuple[Binary, Any] | s # store exported symbols s: lief.ELF.Symbol is_kernel_module = bin_obj.path.suffix == ".ko" + seen_symbol_names: set[str] = set() for s in parsing_res.symbols: + sym_name = str(s.name) if s.imported: - bin_obj.add_imported_symbol_name(str(s.name)) + bin_obj.add_imported_symbol_name(sym_name) elif s.exported or is_kernel_module and s.name: is_func = s.is_function or s.type == lief.ELF.Symbol.TYPE.GNU_IFUNC if not is_func and is_kernel_module: continue - bin_obj.add_exported_symbol( - Symbol( - name=str(s.name), - is_func=is_func, - demangled_name=s.demangled_name, - addr=s.value, - ) + # LIEF may yield the same symbol name from both .symtab + # and .dynsym; only register the first occurrence to avoid + # duplicate DB entries (UNIQUE constraint on node_id). 
+ if sym_name in seen_symbol_names: + continue + seen_symbol_names.add(sym_name) + # Use the mangled name as demangled_name when LIEF's + # demangled_name is identical to the mangled name (i.e. + # demangling was not available or not needed). + lief_demangled = str(s.demangled_name) + demangled = lief_demangled if lief_demangled != sym_name else sym_name + sym = Symbol( + name=sym_name, + is_func=is_func, + demangled_name=demangled, + addr=s.value, ) + # Register under the mangled name as primary key. + # Also register under the demangled name if it differs, + # so that call-graph resolution can match short callee + # strings against exported_functions keys. + bin_obj.add_exported_symbol(sym) + if demangled != sym_name: + bin_obj.add_exported_symbol(sym, symbol_name=demangled) elif s.is_function: + # Skip symbols already registered as exported functions to + # avoid duplicate DB entries. + if sym_name in seen_symbol_names: + continue + seen_symbol_names.add(sym_name) bin_obj.add_function( Symbol( - name=str(s.name), + name=sym_name, is_func=s.is_function, - demangled_name=s.demangled_name, + demangled_name=str(s.demangled_name), addr=s.value, ) ) diff --git a/src/pyrrha_mapper/intercg/fwmapper.py b/src/pyrrha_mapper/intercg/fwmapper.py index eb6c7ff..e3e9a78 100644 --- a/src/pyrrha_mapper/intercg/fwmapper.py +++ b/src/pyrrha_mapper/intercg/fwmapper.py @@ -16,6 +16,7 @@ """InterCGMapper implementation.""" import logging +import re from collections import defaultdict from pathlib import Path from typing import Any @@ -37,14 +38,70 @@ from pyrrha_mapper.intercg.loader import BinaryParser, GhidraParser, IDAParser from pyrrha_mapper.types import Backend, ResolveDuplicateOption -IGNORE_LIST = [ - "__gmon_start__", - "_ITM_deregisterTMCloneTable", - "_ITM_registerTMCloneTable", - "__TMC_END__", - "deregister_tm_clones", - "register_tm_clones", -] +IGNORE_LIST: frozenset[str] = frozenset( + [ + # Linker-injected bookkeeping stubs + "__gmon_start__", + 
"_ITM_deregisterTMCloneTable", + "_ITM_registerTMCloneTable", + "__TMC_END__", + "deregister_tm_clones", + "register_tm_clones", + # ITM runtime helpers + "_ITM_RU1", + "_ITM_addUserCommitAction", + "_ITM_memcpyRnWt", + "_ITM_memcpyRtWn", + # C++ operators (Ghidra partial-demangle form) + "operator new", + "operator new[]", + "operator delete", + "operator delete[]", + "new[]", + "operator==", + "operator!=", + "operator<", + "operator>", + "operator<=", + "operator>=", + "operator=", + "operator+", + "operator-", + "operator*", + "operator/", + "operator[]", + "operator()", + "operator<<", + "operator>>", + "operator+=", + "operator-=", + # GCC exception helpers — never valid cross-binary callees + "__throw_bad_alloc", + "__throw_bad_array_new_length", + "__throw_bad_cast", + "__throw_bad_function_call", + "__throw_future_error", + "__throw_invalid_argument", + "__throw_length_error", + "__throw_logic_error", + "__throw_out_of_range", + "__throw_out_of_range_fmt", + "__throw_overflow_error", + "__throw_range_error", + "__throw_regex_error", + "__throw_runtime_error", + "__throw_system_error", + "__throw_underflow_error", + # C++ ABI internal + "__do_upcast", + ] +) + +# Tool-generated synthetic names (FUN_, _INIT_, _FINI_) that can +# never be resolved as cross-binary callees. +_GHIDRA_SYNTHETIC_NAME_RE: re.Pattern[str] = re.compile( + r"^(?:FUN_[0-9A-Fa-f]+|_INIT_\d+|_FINI_\d+)$" +) NUMBAT_UI_BIN = "NumbatUi" @@ -161,6 +218,60 @@ def _treat_bin_parsing_result(self, path: Path, res: Any): else: logging.warning(f"{log_prefix}: impossible to parse the following result {res.args}") + @staticmethod + def _merge_parser_functions_into_cached_binary( + parser_bin: Binary, cached_bin: Binary, log_prefix: str = "" + ) -> None: + """Merge disassembler-discovered functions from *parser_bin* into *cached_bin*. + + The .fs.json cache only contains LIEF-visible data. 
After a cache + reload the disassembler is re-run to rebuild the call graph, but the + resulting ``Binary`` object (``parser_bin``) is discarded — only the + cached binary (``cached_bin``) stays in ``self.fs``. This means that + any function the disassembler registered that was not already present in + the LIEF binary (e.g. internal functions discovered via ``add_function`` + during ``_combine_program_analysis_binary``) will be absent from + ``cached_bin``. The CG mapping loop then hits + ``not binary.function_exists(f_symb.name)`` for every such function and + silently drops the associated call edges. + + This helper bridges the gap by: + + 1. Registering internal functions present in ``parser_bin`` but absent + from ``cached_bin``. These functions have no DB id (they were never + recorded in Numbat), which is correct — only exported symbols are + recorded. + 2. Registering exported functions present in ``parser_bin`` but absent + from ``cached_bin``. This handles symbols the disassembler promoted + to exports that LIEF did not see. No DB id is assigned. + + The operation is intentionally conservative: it never removes existing + functions from ``cached_bin`` and never overwrites a symbol that already + has an id. + + :param parser_bin: freshly-parsed Binary produced by the disassembler. + :param cached_bin: Binary loaded from the .fs.json cache (has DB ids). + :param log_prefix: prefix for log messages. + """ + # Step 1 — register internal functions discovered only by the disassembler. + for func_name, func_symb in parser_bin.internal_functions.items(): + if not cached_bin.function_exists(func_name): + cached_bin.add_function(func_symb, func_name=func_name) + logging.debug( + f"{log_prefix}: merged internal function '{func_name}' from parser into cache" + ) + + # Step 2 — ensure the cached binary's exported function set is a + # superset of the parser's. 
Symbols exported by the disassembler but + # absent from the cached binary are added so function_exists() succeeds. + # No DB id is assigned — these symbols were not recorded in Numbat. + for func_name, func_symb in parser_bin.exported_functions.items(): + if not cached_bin.exported_function_exists(func_name): + cached_bin.add_exported_symbol(func_symb, symbol_name=func_name) + logging.debug( + f"{log_prefix}: merged exported function '{func_name}' from parser into cache" + ) + def map_binaries_main(self, threads: int, progress: Progress) -> None: """Parse and map binaries of a given directory. @@ -180,9 +291,39 @@ def map_binaries_main(self, threads: int, progress: Progress) -> None: binaries_map = progress.add_task( "[red]Binaries recording", total=len(list(self.fs.iter_binaries())) ) + # The .fs.json cache only serialises LIEF-visible data. The + # disassembler call graph is transient and internal functions + # discovered by the disassembler (absent from LIEF's symbol table) + # are not persisted either. We must therefore: + # 1. Re-run the disassembler for each binary to rebuild + # unresolved_callgraph and recover internal functions. + # 2. Merge those internal functions into the cached Binary + # BEFORE calling record_binary_in_db, so that Numbat receives + # DB ids for them. Without ids, _record_call_ref silently + # drops every call whose caller is an internal function. for binary in self.fs.iter_binaries(): log_prefix = f"[bin mapping] {binary.name}" - # Create the node entry in numbat and create the custom command + if binary.real_path is not None: + res = self.load_binary(file_path=binary.real_path, **self.load_binary_args()) + if isinstance(res, str): + logging.error(f"{log_prefix}: CG reload failed: {res}") + elif self._correct_map_result(res): + parser_bin, call_graph = res + # Merge disassembler functions before recording in DB + # even when call_graph is empty — the binary may still + # expose internal functions needed as call targets. 
+ self._merge_parser_functions_into_cached_binary( + parser_bin, binary, log_prefix + ) + if call_graph is not None: + self.unresolved_callgraph[binary.path] = call_graph + else: + logging.warning(f"{log_prefix}: unexpected result during CG reload") + else: + logging.warning(f"{log_prefix}: no real_path set, skipping CG reload") + + # Record in DB after the merge so internal functions discovered + # by the disassembler are included and receive DB ids. self.record_binary_in_db(binary, log_prefix) if binary.id is not None: self.node_ids[binary.id] = binary @@ -213,14 +354,12 @@ def mapper_main( # Step1: Load FileSystem object and enrich it if needed self.map_binaries_main(threads, progress) self.map_symlinks_main(progress) - self.dry_run_mode = True # (do not record lib imports in numbat db) + self.dry_run_mode = True # do not record lib imports in numbat db self.map_lib_imports_main(progress, resolution_strategy) if self.db_interface is not None: self.dry_run_mode = False - self.progress = progress # need to be able to hide it further down in calls+ - - # Dict of: exported-funs -> [binaries] + self.progress = progress self.exports_to_bins = self.make_export_to_binaries_map() # Iterate again all binaries to create call edges (all numbat_id are created) @@ -234,16 +373,26 @@ def mapper_main( count_res = {True: 0, False: 0} if binary.path in self.unresolved_callgraph: for f_symb, targets in self.unresolved_callgraph[binary.path].items(): - if targets and not binary.function_exists(f_symb.name): + if not binary.function_exists(f_symb.name): + if targets: + logging.error( + f"function {f_symb.name} ({hex(f_symb.addr) if f_symb.addr is not None else None}) not in binary: {binary.name}" + ) + continue + + try: + caller = binary.get_function_by_name(f_symb.name) + except KeyError: logging.error( - f"function {f_symb.name} ({hex(f_symb.addr) if f_symb.addr is not None else None}) not in binary: {binary.name}" + f"{log_prefix}: caller {f_symb.name} not found in binary 
{binary.name}" ) continue + for target in targets: try: res = self._record_one_call( binary, - f_symb, + caller, target, resolution_strategy, unindex_symbols, @@ -293,8 +442,8 @@ def _record_call_ref(self, src: Symbol, dst: Symbol, log_prefix: str = "") -> bo assert self.db_interface is not None if src.id is None or dst.id is None: logging.error( - f"{log_prefix}: Cannot record call ref between {src.name} and " - f"{dst.name}, missing ids ({src.name}: {src.id}, {dst.name}: {dst.id})" + f"{log_prefix}: Cannot record call ref between '{src.name}' and " + f"'{dst.name}', missing ids ({src.name}: {src.id}, {dst.name}: {dst.id})" ) return False self.db_interface.record_ref_call(src.id, dst.id) @@ -360,20 +509,31 @@ def _record_one_call( :return: True if target function was found """ - # local call + # Ghidra emits template arguments in callee names (e.g. "_M_insert"); + # strip them so lookups match the base-name key in exported_functions. + if "<" in callee: + callee = callee[: callee.index("<")] + + # The disassembler may emit versioned symbol names (e.g. "getenv@@GLIBC_2.4"). + # All export/import keys are stored without the version suffix, so strip it. 
+ if "@@" in callee: + callee = callee[: callee.index("@@")] + if binary.function_exists(callee): callee_symb = binary.get_function_by_name(callee) binary.add_call(caller, callee_symb) - return self._record_call_ref(caller, callee_symb) + return self._record_call_ref(caller, callee_symb, f"{log_prefix}: local call") - if callee in IGNORE_LIST: + if callee in IGNORE_LIST or _GHIDRA_SYNTHETIC_NAME_RE.match(callee): return False # already solved import if binary.imported_symbol_exists(callee, is_resolved=True): callee_symb = binary.get_imported_symbol(callee) binary.add_call(caller, callee_symb) - return self._record_call_ref(caller, callee_symb) + return self._record_call_ref( + caller, callee_symb, f"{log_prefix}: already solved import" + ) # solve import from listed imported libraries tmp = self.resolve_symbol_import(binary, callee, resolver, log_prefix) @@ -383,7 +543,9 @@ def _record_one_call( binary.add_imported_library(target_bin) binary.add_imported_symbol(target_symb) binary.add_call(caller, target_symb) - return self._record_call_ref(caller, target_symb) + return self._record_call_ref( + caller, target_symb, f"{log_prefix}: import in listed imported lib" + ) # Get binaries exporting this symbol served_by: list[Binary] = self.exports_to_bins[callee] @@ -409,10 +571,12 @@ def _record_one_call( callee_symb = served_by[0].get_exported_symbol(callee) binary.add_imported_symbol(callee_symb) binary.add_call(caller, callee_symb) - return self._record_call_ref(caller, callee_symb) + return self._record_call_ref(caller, callee_symb, log_prefix) else: # still not resolved - self._record_unindexed_call(caller, callee) + self._record_unindexed_call(caller, callee, log_prefix) if binary.path.suffix != ".ko": unindex_symbols.add(callee) - logging.debug(f"{log_prefix}: no match found for edge {caller.name} -> {callee}") + logging.warning(f"{log_prefix}: no match found for edge {caller.name} -> {callee}") + else: + logging.debug(f"{log_prefix}: no match found for edge 
{caller.name} -> {callee}") return False diff --git a/src/pyrrha_mapper/intercg/loader.py b/src/pyrrha_mapper/intercg/loader.py index 58a5929..59fe817 100644 --- a/src/pyrrha_mapper/intercg/loader.py +++ b/src/pyrrha_mapper/intercg/loader.py @@ -16,6 +16,7 @@ """Load information used by InterCGMapper from the files on the disk.""" import logging +import re from abc import abstractmethod from collections.abc import Iterator from enum import StrEnum @@ -61,7 +62,7 @@ def demangled_name(self) -> str: @property def addr(self) -> int: """:return: address of the function in the Binary""" - assert self.symbol.addr + assert self.symbol.addr is not None return self.symbol.addr @@ -70,6 +71,14 @@ def _count_leading_underscores(name: str) -> int: return len(name) - len(name.lstrip("_.")) +# Tool-generated fallback names (FUN_, sub_, _INIT_, _FINI_). +# A trampoline destination that matches one of these cannot be resolved as a +# cross-binary callee — skip the substitution to preserve the original name. +_SYNTHETIC_FUNC_NAME_RE: re.Pattern[str] = re.compile( + r"^(?:FUN_[0-9A-Fa-f]+|sub_[0-9A-Fa-f]+|_INIT_\d+|_FINI_\d+)$" +) + + class BinaryParser: """Abstract base class that parses a binary and extracts call-graph data. @@ -80,52 +89,52 @@ class BinaryParser: def __init__(self, root_directory: Path, file_path: Path) -> None: self.log_prefix = f"[binary parsing] {file_path.name}" + self._is_relocatable: bool = False self._binary = self._generate_lief_bin(root_directory, file_path) + self._is_relocatable = self._binary.is_relocatable self._initiate_bin_parser(root_directory, file_path, self._binary.image_base) image_base = self._binary.image_base - # Remap LIEF export addresses to parser space once. - # Keys are parser-space addresses; values are lists of LIEF Symbols. + # Remap LIEF export addresses to parser space. 
parser_exports: dict[int, list[Symbol]] = { lief_addr - image_base: symbols for lief_addr, symbols in self._binary.exported_funcs_by_addr.items() } - # ------------------------------------------------------------------ - # Step 1 — combine parser functions with LIEF export metadata - # ------------------------------------------------------------------ + # Step 1 — merge parser functions with LIEF export metadata. program_data: dict[int, FuncData] = self._combine_program_analysis_binary(parser_exports) - # ------------------------------------------------------------------ - # Step 2 — find exported symbols not discovered by the parser and - # add them to the call graph with an empty call list - # ------------------------------------------------------------------ + # Step 2 — add exported symbols not discovered by the parser. + # Skipped for ET_REL: LIEF addresses are section-relative and incompatible + # with the parser address space; Step 1 already matched exports by name. parser_addrs: set[int] = set(self._iter_func_addr()) call_graph: dict[Symbol, list[str]] = {} - for parser_addr, symbols in parser_exports.items(): - if parser_addr in parser_addrs: - continue - canon = self._disambiguate_export(symbols) - # ARM THUMB: parser may use address - 1 (THUMB bit cleared) - if self._is_func_start(parser_addr - 1): - if self._func_mangled_name(parser_addr - 1) in {s.name for s in symbols}: + if not self._is_relocatable: + for parser_addr, symbols in parser_exports.items(): + if parser_addr in parser_addrs: continue - logging.debug( - f"{self.log_prefix}: export {canon.name} @ {parser_addr:#x} " - f"not found in parser output" - ) - call_graph[canon] = [] - if len(symbols) > 1: - for sym in symbols: - self._binary.replace_function(canon, sym, True) - - # ------------------------------------------------------------------ - # Step 3 — build the call graph, resolving thunk trampolines - # ------------------------------------------------------------------ - # Maps a 
trampoline name → the canonical name it should forward to. + canon = self._disambiguate_export(symbols) + # ARM THUMB: parser may use address - 1 (THUMB bit cleared) + if self._is_func_start(parser_addr - 1): + if self._func_mangled_name(parser_addr - 1) in {s.name for s in symbols}: + continue + logging.debug( + f"{self.log_prefix}: export {canon.name} @ {parser_addr:#x} " + f"not found in parser output" + ) + call_graph[canon] = [] + if len(symbols) > 1: + for sym in symbols: + self._binary.replace_function(canon, sym, True) + + # Step 3 — build the call graph, resolving thunk trampolines. trampoline_map: dict[str, str] = {} + # LIEF-confirmed imported names (.dynsym): distinguishes genuine PLT stubs + # (IMPORTED + name in this set) from inlined C++ functions mis-classified + # as external thunks by the disassembler (IMPORTED + name NOT in this set). + lief_imported_names: set[str] = set(self._binary.imported_symbol_names) to_analyse = program_data while len(to_analyse) > 0: @@ -136,8 +145,28 @@ def __init__(self, root_directory: Path, file_path: Path) -> None: or func_data.addr + 1 in parser_exports # ARM THUMB ) - if func_data.type in (FuncType.LIBRARY, FuncType.NORMAL) or ( - func_data.type == FuncType.THUNK and (exported or len(func_data.calls) > 1) + # Keep the function in the call graph when: + # (a) it is a normal/library function, + # (b) it is an exported or multi-callee thunk, OR + # (c) it was classified IMPORTED by the disassembler but its + # name is absent from LIEF's imported-symbol table AND it + # is registered in the binary — the disassembler + # mis-classified an inlined C++ function (e.g. D0Ev + # deleting-destructor, virtual thunks) as an external + # stub. Keeping it lets callers resolve it as a local + # call rather than generating an unresolved-callee error. + # The function_exists guard prevents promoting functions + # that were never registered (e.g. 
genuine C-linkage + # imports whose unmangled name happens to be absent from + # lief_imported_names). + if ( + func_data.type in (FuncType.LIBRARY, FuncType.NORMAL) + or (func_data.type == FuncType.THUNK and (exported or len(func_data.calls) > 1)) + or ( + func_data.type == FuncType.IMPORTED + and func_data.name not in lief_imported_names + and self._binary.function_exists(func_data.name) + ) ): call_graph[func_data.symbol] = self._build_calls_list(func_data, program_data) continue @@ -184,17 +213,41 @@ def __init__(self, root_directory: Path, file_path: Path) -> None: and trampoline_map[destination_name] != destination_name ): destination_name = trampoline_map[destination_name] - trampoline_map[trampoline_name] = destination_name - for key, val in trampoline_map.items(): - if val == trampoline_name: - trampoline_map[key] = destination_name + # Do not record a trampoline substitution when the destination + # is a tool-generated synthetic name (e.g. "FUN_1234" or + # "sub_5678"): the disassembler could not identify the branch + # target, so replacing the original stub name with a synthetic + # placeholder would drop the cross-binary call edge entirely. + # Skipping the substitution leaves the stub name intact so + # fwmapper can still resolve it against exported_functions. + if not _SYNTHETIC_FUNC_NAME_RE.match(destination_name): + trampoline_map[trampoline_name] = destination_name + for key, val in trampoline_map.items(): + if val == trampoline_name: + trampoline_map[key] = destination_name + + # Only remove the thunk stub when it wraps a true external + # (IMPORTED) symbol — i.e. it is a genuine PLT stub. Internal + # forwarding thunks (callee type is NORMAL or another THUNK) + # must stay registered in the binary so their callers can + # resolve them as local calls. 
+ if callee_data.type != FuncType.IMPORTED: + continue elif func_data.type == FuncType.THUNK and not func_data.calls and func_data.callers: # Terminal thunk with callers but no callees — keep it continue - # Remove functions not kept as exported/library/normal - if self._binary.get_function_by_name(func_data.name).addr == func_data.addr: + # Remove functions not kept as exported/library/normal. + # _Z-prefixed names are preserved: a statically linked binary can + # contain a private copy of a C++ symbol also present in the + # dynamic import table — removing it would break intra-binary edges. + if func_data.name.startswith("_Z"): + continue + if ( + self._binary.function_exists(func_data.name) + and self._binary.get_function_by_name(func_data.name).addr == func_data.addr + ): self._binary.remove_function(func_data.name) to_analyse = missed_data program_data.update(missed_data) @@ -282,6 +335,10 @@ def _generate_lief_bin(self, root_directory: Path, file_path: Path) -> Binary: raise FsMapperError(f"{self.log_prefix}: real_path not set (skip)") if not lief_binary.real_path.exists(): raise FsMapperError(f"{self.log_prefix}: executable not found (skip)") + + # ET_REL status (kernel modules, object files) is not computed here: it + # is carried by the returned Binary as ``is_relocatable``, which BinaryParser + # reads to skip address-based export matching for relocatable binaries. return lief_binary def _build_calls_list( @@ -339,10 +396,17 @@ def _combine_program_analysis_binary( for sym in symbols: self._binary.replace_function(func_symbol, sym, True) else: - # Internal function — create a new Symbol in parser space + # Internal function — create a new Symbol in parser space. 
mangled_name = self._func_mangled_name(parser_addr) - # Skip PLT stubs and functions already tracked as imports by LIEF - if mangled_name in imported_names: + # Skip LIEF-imported names except: (a) PLT thunks — must reach + # Step 3 to build trampoline_map; (b) _Z-prefixed names — a + # statically linked binary may contain a private copy of a symbol + # whose mangled name also appears in the dynamic import table. + if ( + mangled_name in imported_names + and not mangled_name.startswith("_Z") + and self._func_type(parser_addr) != FuncType.THUNK + ): continue func_symbol = Symbol( name=mangled_name, @@ -428,8 +492,22 @@ def _is_func_start(self, addr: int) -> bool: return False def _iter_func_addr(self) -> Iterator[int]: - """Yield the entry-point address of every function known to IDA.""" + """Yield the entry-point address of every function known to IDA. + + IDA's function list includes ``FUNC_TAIL`` entries (flag ``0x400``) — + non-contiguous tail chunks that belong to a parent function defined + elsewhere. These are not callable entry points: they share their + mangled name with the parent, making ``add_function`` overwrite + ``internal_functions`` with the chunk address and breaking + ``function_exists`` lookups for the real function. They must be + skipped so only true function starts enter ``program_data``. 
+ """ + from ida_domain.functions import FunctionFlags + for func in self._ida_db.functions.get_all(): + flags = self._ida_db.functions.get_flags(func) + if FunctionFlags.TAIL in flags: + continue self._ida_cached_func = func yield func.start_ea @@ -444,7 +522,7 @@ def _func_mangled_name(self, addr: int) -> str: func = self._get_ida_func(addr) import_info = self._ida_db.imports.get_import_at(addr) if import_info is not None and import_info.name is not None: - return import_info.name.split("@")[0] + return import_info.name if func is not None: name = self._ida_db.functions.get_name(func) if name: @@ -458,11 +536,33 @@ def _func_demangled_name(self, addr: int) -> str: return demangled if demangled is not None else mangled def _func_children(self, addr: int) -> list[int]: - """:return: parser-space addresses of callees of the function at *addr*.""" + """:return: parser-space addresses of callees of the function at *addr*. + + When IDA's ``get_callees`` returns a ``FUNC_TAIL`` chunk (a + non-contiguous piece of a parent function, flag ``0x400``), the chunk + address is not a valid callable entry point and must not enter + ``program_data``. Instead, follow the chunk's own callee list to + obtain the real parent function's ``start_ea`` and use that. This + ensures calls to tail-chunked functions are recorded against the true + entry point that ``_iter_func_addr`` emitted. + """ + from ida_domain.functions import FunctionFlags + func = self._get_ida_func(addr) if func is None: return [] - return [callee.start_ea for callee in self._ida_db.functions.get_callees(func)] + result: list[int] = [] + for callee in self._ida_db.functions.get_callees(func): + flags = self._ida_db.functions.get_flags(callee) + if FunctionFlags.TAIL in flags: + # Resolve to the real parent entry point via the chunk's callees. 
+ parents = list(self._ida_db.functions.get_callees(callee)) + if parents and parents[0].start_ea != callee.start_ea: + result.append(parents[0].start_ea) + # If the chunk calls itself (unresolvable), drop it silently. + else: + result.append(callee.start_ea) + return result def _func_parents(self, addr: int) -> list[int]: """:return: parser-space addresses of callers of the function at *addr*.""" @@ -491,7 +591,7 @@ def _func_type(self, addr: int) -> FuncType: if self._ida_db.imports.get_import_at(addr): is_imported = True - if is_imported: + if is_imported or len(func.name.split("@@")) == 2: # symbols with a specific version: return FuncType.IMPORTED elif FunctionFlags.THUNK in flags: callees = list(self._ida_db.functions.get_callees(func)) @@ -509,16 +609,58 @@ def _func_type(self, addr: int) -> FuncType: # Ghidra backend # ====================================================================== +# Analyzers required for call-graph extraction (function discovery, xrefs, +# thunk resolution, import/export tables, and name demangling). +# Everything else is explicitly disabled to minimise analysis time. 
+_GHIDRA_REQUIRED_ANALYZERS: frozenset[str] = frozenset( + [ + # --- Function discovery --- + "Disassemble Entry Points", + "Function Start Search", + "Function Start Search After Code", + "Non-Returning Functions - Discovered", + "Non-Returning Functions - Known", + # --- Call graph / cross-references --- + "Call Convention ID", + "Call-Fixup Installer", + "Subroutine References", + "Subroutine References - One Time", + # --- Thunk resolution --- + "Thunk Function", + # --- Format-specific import/export tables --- + # ELF + "ELF Scalar Operand References", + "External Entry References", + # PE + "PE Entry Point", + "Windows x86 PE Thunk Functions", + # Mach-O (no extra analyzer needed beyond the loader itself) + # --- Demangling --- + "Demangler GNU", + "Demangler Microsoft", + ] +) + class GhidraParser(BinaryParser): """BinaryParser backed by Ghidra 12.0+ via PyGhidra.""" def _initiate_bin_parser(self, root_directory: Path, file_path: Path, image_base: int = 0): - """Start the JVM, open and fully analyse the binary, and initialise handles.""" + """Start the JVM, open the binary with only the required analyzers, and initialise handles. + + All Ghidra analyzers not listed in ``_GHIDRA_REQUIRED_ANALYZERS`` are + disabled before analysis runs, which significantly reduces analysis time + while preserving full function-discovery and call-graph accuracy. + + Analyzer selection is done via ``program.getOptions("Analyzers")`` and + ``setBoolean``, which is the stable public API across all Ghidra versions + supported by PyGhidra. No internal ``AutoAnalysisManager`` import is + needed. 
+ """ import os import tempfile - import pyghidra + import pyghidra # type: ignore # Initialise all attributes upfront so _close_bin_parser is always safe self._pyghidra_ctx = None @@ -526,54 +668,91 @@ def _initiate_bin_parser(self, root_directory: Path, file_path: Path, image_base self._ghidra_project_dir: Path | None = None self._ghidra_func_manager = None self._ghidra_symbol_table = None - self._ghidra_ext_manager = None self._ghidra_demangler = None self._ghidra_cached_func = None self._ghidra_load_base: int = 0 self._ghidra_monitor = None self._ghidra_exported_parser_addrs: set[int] = set() + self._ghidra_is_relocatable: bool = False + self._ghidra_exported_names: dict[str, Symbol] = {} full_path = root_directory / file_path self._ghidra_project_dir = Path(tempfile.mkdtemp(prefix=f"ghidra_{os.getpid()}_")) # Start the JVM once per worker process (no-op if already running) if not pyghidra.started(): - from pyghidra.launcher import HeadlessPyGhidraLauncher + from pyghidra.launcher import HeadlessPyGhidraLauncher # type: ignore launcher = HeadlessPyGhidraLauncher() launcher.add_vmargs("-Xms512m", "-Xmx2g", "-XX:+UseG1GC") launcher.start() # Ghidra imports must come after JVM start - from ghidra.app.util.demangler.gnu import GnuDemangler - from ghidra.util.task import ConsoleTaskMonitor + from ghidra.app.util.demangler.gnu import GnuDemangler # type: ignore + from ghidra.util.task import ConsoleTaskMonitor # type: ignore self._ghidra_monitor = ConsoleTaskMonitor() - # open_program(analyze=True) runs full blocking analysis and correctly - # populates all cross-references including the call graph. - # Note: open_program is deprecated in PyGhidra 3.0 but is currently - # the only reliable path for complete headless analysis. + # Open without running analysis yet so we can configure the analyzer set. 
self._pyghidra_ctx = pyghidra.open_program( str(full_path), project_location=str(self._ghidra_project_dir), project_name="p", - analyze=True, + analyze=False, ) flat_api = self._pyghidra_ctx.__enter__() program = flat_api.getCurrentProgram() + # Disable every analyzer not in the required set via the stable + # "Analyzers" options block, then trigger analysis through the flat API. + analyzer_options = program.getOptions("Analyzers") + for option_name in analyzer_options.getOptionNames(): + enabled = option_name in _GHIDRA_REQUIRED_ANALYZERS + try: + analyzer_options.setBoolean(option_name, enabled) + except Exception: + # Some option names in the "Analyzers" block are not simple + # booleans (e.g. sub-option strings); skip them silently. + pass + if not enabled: + logging.debug(f"{self.log_prefix}: disabled Ghidra analyzer '{option_name}'") + + # Run analysis with the filtered analyzer set via the stable flat API. + flat_api.analyzeAll(program) + self._ghidra_program = program # Derive load base from the program itself, not from LIEF's image_base, # so that _to_ghidra_address / _to_parser_addr are always consistent. self._ghidra_load_base = program.getImageBase().getOffset() # Build the exported-address set once so _func_type can check it cheaply. - self._ghidra_exported_parser_addrs: set[int] = { + self._ghidra_exported_parser_addrs = { lief_addr - self._binary.image_base for lief_addr in self._binary.exported_funcs_by_addr } self._ghidra_func_manager = program.getFunctionManager() self._ghidra_symbol_table = program.getSymbolTable() - self._ghidra_ext_manager = program.getExternalManager() + + # ET_REL (kernel modules, object files): Ghidra lays sections out in a + # fake address space starting at 0x10000; LIEF reports raw + # section-relative offsets. The two coordinate systems are + # incompatible, so address-based matching is impossible — we must match + # exported symbols by name instead. 
+ # The "relocatable" flag is written by ElfProgramBuilder into the + # program's PROGRAM_INFO options block under the key used by + # RelocationTable.RELOCATABLE_PROP_NAME ("Relocatable"). We read it + # as a boolean under that literal key to avoid importing the internal class. + self._ghidra_is_relocatable = bool( + program.getOptions(program.PROGRAM_INFO).getBoolean("Relocatable", False) + ) + # Name → LIEF Symbol map, populated only for relocatable binaries. + self._ghidra_exported_names = ( + { + sym.name: sym + for symbols in self._binary.exported_funcs_by_addr.values() + for sym in symbols + } + if self._ghidra_is_relocatable + else {} + ) demangler = GnuDemangler() self._ghidra_demangler = demangler if demangler.canDemangle(program) else None @@ -645,6 +824,79 @@ def _get_ghidra_func(self, parser_addr: int): # BinaryParser interface # ------------------------------------------------------------------ + def _combine_program_analysis_binary( + self, + parser_exports: dict[int, list[Symbol]], + ) -> dict[int, FuncData]: + """Override for relocatable binaries (ET_REL, e.g. kernel modules). + + For ``ET_REL`` files Ghidra places sections in a fake address space + (default base ``0x10000``) while LIEF reports raw section-relative + offsets. Address-based matching is therefore impossible and exported + symbols are matched by name instead. + + For non-relocatable binaries the base-class implementation is used + unchanged. + + :param parser_exports: LIEF exports already remapped to parser space. + :return: mapping from parser-space address to FuncData. 
+ """ + if not self._ghidra_is_relocatable: + return super()._combine_program_analysis_binary(parser_exports) + + imported_names: set[str] = set(self._binary.imported_symbol_names) + program_data: dict[int, FuncData] = {} + + for parser_addr in self._iter_func_addr(): + mangled_name = self._func_mangled_name(parser_addr) + + if mangled_name in self._ghidra_exported_names: + # ET_REL: adopt name and demangled name from the LIEF export + # symbol, but use the Ghidra parser-space address so that the + # rest of BinaryParser sees a consistent address space. + # The LIEF address is a raw section-relative offset and must + # not be used as a key anywhere in the resolution logic. + # Use add_exported_symbol so the symbol stays in exported_functions + # (consistent with what load_binary registered via LIEF) and is + # evicted from internal_functions if it was registered there first. + lief_sym = self._ghidra_exported_names[mangled_name] + func_symbol = Symbol( + name=lief_sym.name, + demangled_name=lief_sym.demangled_name, + is_func=True, + addr=parser_addr, + ) + parser_name = self._func_demangled_name(parser_addr) + if parser_name != func_symbol.demangled_name: + logging.debug( + f"{self.log_prefix}: rename {parser_name} → {func_symbol.demangled_name}" + ) + self._binary.add_exported_symbol(func_symbol) + else: + # Internal function — same guard as base class. 
+ if ( + mangled_name in imported_names + and not mangled_name.startswith("_Z") + and self._func_type(parser_addr) != FuncType.THUNK + ): + continue + func_symbol = Symbol( + name=mangled_name, + demangled_name=self._func_demangled_name(parser_addr), + is_func=True, + addr=parser_addr, + ) + self._binary.add_function(func_symbol) + + program_data[parser_addr] = FuncData( + symbol=func_symbol, + type=self._func_type(parser_addr), + calls=self._func_children(parser_addr), + callers=self._func_parents(parser_addr), + ) + + return program_data + def _is_func_start(self, addr: int) -> bool: """:return: True if *addr* (parser space) is a known Ghidra function entry.""" return self._get_ghidra_func(addr) is not None @@ -669,13 +921,36 @@ def _iter_func_addr(self) -> Iterator[int]: yield parser_addr def _func_mangled_name(self, addr: int) -> str: - """:return: the raw name of the function at *addr*, or ``sub_``.""" + """:return: the raw mangled name of the function at *addr*, or ``FUN_{addr:X}``. + + Queries the symbol table directly for symbols whose name starts with + ``_Z`` (Itanium ABI mangled prefix) at the given address, which gives + the raw mangled name before Ghidra's demangler has processed it. + Falls back to ``func.getName()`` for non-C++ functions, rejecting names + that look like partial demangles (operators, destructors, anonymous). + """ func = self._get_ghidra_func(addr) - if func is not None: - name = func.getName() - if name: - return name - return f"sub_{addr:X}" + if func is None: + return f"FUN_{addr:X}" + + # Search all symbols at this address for a mangled (_Z...) name. + ghidra_addr = self._to_ghidra_address(addr) + for sym in self._ghidra_symbol_table.getSymbols(ghidra_addr): + raw = sym.getName() + if raw and raw.startswith("_Z"): + return raw + + # No mangled symbol found — use func.getName() but reject partial + # demangles: operators, destructors, and anonymous constructs.
+ name = func.getName() + if name and not ( + name.startswith("~") + or name.startswith("operator") + or (name.startswith("<") and name.endswith(">")) + ): + return name + + return f"FUN_{addr:X}" def _func_demangled_name(self, addr: int) -> str: """:return: the demangled name, falling back to the mangled name.""" @@ -684,7 +959,13 @@ def _func_demangled_name(self, addr: int) -> str: try: result = self._ghidra_demangler.demangle(mangled, True) if result is not None: - return result.getSignature(False) + # Use getName() to get the bare function name without return + # type or parameter signature, so it matches the short callee + # names used in the call graph (e.g. "basic_string" rather + # than "std::__cxx11::basic_string"). + name = result.getName() + if name: + return name except Exception: pass return mangled @@ -692,23 +973,32 @@ def _func_demangled_name(self, addr: int) -> str: def _func_children(self, addr: int) -> list[int]: """:return: parser-space addresses of callees of the function at *addr*. - External callees are not returned here — they are handled by - ``_func_type`` classifying their PLT thunk stubs as ``FuncType.IMPORTED``. + Uses Ghidra's reference manager to collect raw CALL instruction targets + rather than ``getCalledFunctions()``. ``getCalledFunctions()`` resolves + thunk chains and returns the *external* symbol directly, bypassing the + PLT stub that lives in the binary's address space. By reading raw call + references we obtain the actual branch targets — including PLT thunk + addresses — so the trampoline resolution in ``BinaryParser`` can + correctly classify them as ``IMPORTED`` and forward callers to the + imported symbol name. 
""" func = self._get_ghidra_func(addr) if func is None: return [] - seen: set[str] = set() + listing = self._ghidra_program.getListing() + seen: set[int] = set() result: list[int] = [] - for callee in func.getCalledFunctions(self._ghidra_monitor): - if callee.isExternal(): - continue - name = callee.getName() - if name in seen: - continue - seen.add(name) - result.append(self._to_parser_addr(callee.getEntryPoint().getOffset())) + for cu in listing.getCodeUnits(func.getBody(), True): + for ref in cu.getReferencesFrom(): + if not ref.getReferenceType().isCall(): + continue + target_offset = ref.getToAddress().getOffset() + parser_addr = self._to_parser_addr(target_offset) + if parser_addr in seen: + continue + seen.add(parser_addr) + result.append(parser_addr) return result def _func_parents(self, addr: int) -> list[int]: @@ -730,18 +1020,7 @@ def _func_parents(self, addr: int) -> list[int]: return result def _func_type(self, addr: int) -> FuncType: - """:return: the FuncType of the function at *addr* (parser space). - - Thunk stubs that resolve to external functions are classified as - ``IMPORTED`` so the trampoline resolution in ``BinaryParser`` correctly - forwards all callers to the imported symbol name. - - Exception: if the thunk is itself exported (i.e. it appears in the - binary's export table), it must be kept as ``THUNK`` so that - ``BinaryParser`` adds it to the call graph rather than silently - dropping it. ``IMPORTED`` is reserved for non-exported stubs whose - only purpose is to forward calls to an external symbol. - """ + """:return: the FuncType of the function at *addr* (parser space).""" func = self._get_ghidra_func(addr) if func is None: return FuncType.NORMAL @@ -750,22 +1029,8 @@ def _func_type(self, addr: int) -> FuncType: return FuncType.IMPORTED if func.isThunk(): - # Resolve thunk chain; classify as IMPORTED only when the thunk is - # not exported — exported thunks must remain visible in the call - # graph so BinaryParser does not drop them. 
- thunked = func.getThunkedFunction(True) - if thunked is not None and thunked.isExternal(): - if addr not in self._ghidra_exported_parser_addrs: - return FuncType.IMPORTED + # Always THUNK — the trampoline resolution in Step 3 detects the + # external callee via callee_data.type and collapses the chain. return FuncType.THUNK - # Heuristic: function in a namespace matching a known external library - from ghidra.program.model.symbol import SourceType - - symbol = self._ghidra_symbol_table.getPrimarySymbol(self._to_ghidra_address(addr)) - if symbol is not None and symbol.getSource() == SourceType.ANALYSIS: - namespace = func.getParentNamespace() - if namespace is not None and self._ghidra_ext_manager.contains(namespace.getName(True)): - return FuncType.LIBRARY - return FuncType.NORMAL From 7c5c4c5dd42df8bc6f346c3b523b374c6b97892b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Fri, 17 Apr 2026 15:55:13 +0200 Subject: [PATCH 19/62] ci, install: CI build docker image MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- .gitlab-ci.yml | 25 +++++++++++++++++ Dockerfile | 38 -------------------------- ci/ghidra/build.sh | 2 +- ci/ida/build.sh | 2 +- ci/pyrrha/Dockerfile | 65 ++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 92 insertions(+), 40 deletions(-) delete mode 100644 Dockerfile create mode 100644 ci/pyrrha/Dockerfile diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1256ccd..97cb013 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,3 +1,28 @@ +build_image: + stage: build + image: docker:test_cli + parallel: + matrix: + - BACKEND: "ida" + VERSION: [91, 93] + LATEST: 93 + - BACKEND: "ghidra" + VERSION: ["12.0.4"] + LATEST: "12.0.4" + variables: + DOCKER_IMAGE_NAME: $CI_REGISTRY_IMAGE/pyrrha-$BACKEND + before_script: + - docker login -u "$CI_REGISTRY_USER" -p "$CI_REGISTRY_PASSWORD" $CI_REGISTRY + script: + - docker build --pull -t 
"$DOCKER_IMAGE_NAME:$VERSION" ci/pyrrha/Dockerfile --build-arg DISASS_IMAGE=$CI_REGISTRY_IMAGE/ci/${BACKEND} --build-arg DISASS_IMAGE_VERSION=$VERSION + - docker push "$DOCKER_IMAGE_NAME:$VERSION" + - | + if [[ "$VERSION" == "$LATEST" ]]; then + docker tag "$DOCKER_IMAGE_NAME:$VERSION" "$DOCKER_IMAGE_NAME:latest" + docker push "$DOCKER_IMAGE_NAME:latest" + fi + + #========================== OBJECTS TESTS ==================================== test_data_structures: diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 1eb160f..0000000 --- a/Dockerfile +++ /dev/null @@ -1,38 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2023-2025 Quarkslab -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -FROM docker.io/library/python:3.11-slim -SHELL ["/bin/bash", "-c"] - -ENV PYRRHA_INSTALL_DIR=/tmp/pyrrha_install -ENV PYRRHA_WORKING_DIR=/tmp/pyrrha - -RUN mkdir -p $PYRRHA_INSTALL_DIR - -WORKDIR ${PYRRHA_INSTALL_DIR} - -RUN python3 -m pip install --no-cache-dir -U pip - -COPY src src/ -COPY pyproject.toml ./ -COPY README.md ./ - -RUN python3 -m pip install --no-cache-dir . 
&& \ - rm -rf $PYRRHA_INSTALL_DIR - -WORKDIR ${PYRRHA_WORKING_DIR} - -ENTRYPOINT ["pyrrha"] diff --git a/ci/ghidra/build.sh b/ci/ghidra/build.sh index 02e6617..52c46c8 100755 --- a/ci/ghidra/build.sh +++ b/ci/ghidra/build.sh @@ -64,7 +64,7 @@ readonly DOCKER readonly SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" readonly DOCKERFILE="${SCRIPT_DIR}/Dockerfile" -readonly IMAGE_NAME_DEFAULT="pyrrha-ghidra" +readonly IMAGE_NAME_DEFAULT="ghidra" # Default Ghidra release — update these when a new version is published. readonly DEFAULT_VERSION="12.0.4" diff --git a/ci/ida/build.sh b/ci/ida/build.sh index 4460081..5b23857 100755 --- a/ci/ida/build.sh +++ b/ci/ida/build.sh @@ -58,7 +58,7 @@ readonly SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" readonly DOCKERFILE="${SCRIPT_DIR}/Dockerfile" readonly DOCKERFILE_FINAL="${SCRIPT_DIR}/Dockerfile.final" readonly DEFAULT_LICENSE="${SCRIPT_DIR}/idapro.hexlic" -readonly IMAGE_NAME_DEFAULT="pyrrha-ida" +readonly IMAGE_NAME_DEFAULT="ida" # ── Helpers ─────────────────────────────────────────────────────────────────── diff --git a/ci/pyrrha/Dockerfile b/ci/pyrrha/Dockerfile new file mode 100644 index 0000000..be64e52 --- /dev/null +++ b/ci/pyrrha/Dockerfile @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023-2025 Quarkslab +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# ----------------------------------------------------------------------------- +# Pyrrha image +# +# Layered on top of an image that contains one of the supported disassemblers. +# The image should provide: +# * the disassembler already installed and on PATH (or correct env variable set) +# * a non-root "user" (uid 1000) +# * a Python >= 3.11 venv already on PATH +# +# The base image tag is parameterised so callers can reuse a pre-built image or +# have it built on-demand by scripts/build-docker.sh. +# +# Manual usage: +# docker build -t pyrrha-ghidra:latest ci/ghidra +# docker build -t pyrrha-ida:latest \ +# --build-arg DISASS_IMAGE=ida --build-arg DISASS_IMAGE_VERSION=latest . +# ----------------------------------------------------------------------------- + +ARG GHIDRA_IMAGE=pyrrha-ghidra:latest + +ARG DISASS_IMAGE=ghidra +ARG DISASS_IMAGE_VERSION=latest +ARG DISASS_IMAGE_USER=user + +FROM ${DISASS_IMAGE}:{DISASS_IMAGE_VERSION} +SHELL ["/bin/bash", "-c"] + +ENV PYRRHA_INSTALL_DIR=/tmp/pyrrha_install +ENV PYRRHA_WORKING_DIR=/tmp/pyrrha + +USER ${DISASS_IMAGE_USER} +WORKDIR ${PYRRHA_INSTALL_DIR} + +RUN python3 -m pip install --no-cache-dir -U pip + +COPY --chown=${DISASS_IMAGE_USER}:${DISASS_IMAGE_USER} ../../src src/ +COPY --chown=${DISASS_IMAGE_USER}:${DISASS_IMAGE_USER}user:user ../../pyproject.toml ./ +COPY --chown=${DISASS_IMAGE_USER}:${DISASS_IMAGE_USER}user:user ../../README.md ./ + + +# Overwrite the PyPI install from the base image with the local working copy. +# --force-reinstall guarantees we replace the base layer's version even when +# the local pyproject.toml reports an identical version number. +RUN pip install --no-cache-dir --force-reinstall .
&& \ + rm -rf ${PYRRHA_INSTALL_DIR} + +WORKDIR ${PYRRHA_WORKING_DIR} + +ENTRYPOINT ["pyrrha"] From 702aabad0891d70584a7b2cef8e3df786f5a5ae5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Fri, 17 Apr 2026 15:56:41 +0200 Subject: [PATCH 20/62] [fix] ci: image name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- .gitlab-ci.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 97cb013..e07ad95 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,6 +1,8 @@ build_image: stage: build - image: docker:test_cli + image: docker:cli + services: + - docker:dind parallel: matrix: - BACKEND: "ida" From f08024d1dcfcc8537fb12c22d2ccce595255a37e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Fri, 17 Apr 2026 16:00:39 +0200 Subject: [PATCH 21/62] [fix] ci: secure password MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- .gitlab-ci.yml | 29 ++++++++++++++++++++++------- ci/pyrrha/Dockerfile | 10 +++++----- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e07ad95..9572b60 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,8 +1,8 @@ build_image: stage: build - image: docker:cli - services: - - docker:dind + image: docker:29-dind + tags: + - dind parallel: matrix: - BACKEND: "ida" @@ -13,17 +13,32 @@ build_image: LATEST: "12.0.4" variables: DOCKER_IMAGE_NAME: $CI_REGISTRY_IMAGE/pyrrha-$BACKEND + DOCKER_HOST: unix:///var/run/docker.sock + DOCKER_TLS_CERTDIR: "" before_script: - - docker login -u "$CI_REGISTRY_USER" -p "$CI_REGISTRY_PASSWORD" $CI_REGISTRY + - echo "$CA_CERT" > /usr/local/share/ca-certificates/local-ca.pem + - update-ca-certificates + - | + dockerd-entrypoint.sh --host=unix:///var/run/docker.sock & + for i in $(seq 1 30); do + if docker info >/dev/null 2>&1; then + 
echo "Docker daemon is ready" + break + fi + echo "Waiting for docker daemon... ($i/30)" + sleep 2 + done + - echo "$CI_REGISTRY_PASSWORD" | docker login --username "$CI_REGISTRY_USER" --password-stdin "$CI_REGISTRY" script: - - docker build --pull -t "$DOCKER_IMAGE_NAME:$VERSION" ci/pyrrha/Dockerfile --build-arg DISASS_IMAGE=$CI_REGISTRY_IMAGE/ci/${BACKEND} --build-arg DISASS_IMAGE_VERSION=$VERSION + - docker build --pull -t "$DOCKER_IMAGE_NAME:$VERSION" -f ci/pyrrha/Dockerfile --build-arg DISASS_IMAGE=$CI_REGISTRY_IMAGE/ci/${BACKEND} --build-arg DISASS_IMAGE_VERSION=$VERSION . - docker push "$DOCKER_IMAGE_NAME:$VERSION" - | if [[ "$VERSION" == "$LATEST" ]]; then docker tag "$DOCKER_IMAGE_NAME:$VERSION" "$DOCKER_IMAGE_NAME:latest" docker push "$DOCKER_IMAGE_NAME:latest" fi - + after_script: + - docker logout $CI_REGISTRY #========================== OBJECTS TESTS ==================================== @@ -70,7 +85,7 @@ test_fs-cg: extends: - test_fs image: - name: $CONTAINER_PATH/${BACKEND}:${VERSION} + name: $CI_REGISTRY_IMAGE/ci/${BACKEND}:${VERSION} docker: user: user variables: diff --git a/ci/pyrrha/Dockerfile b/ci/pyrrha/Dockerfile index be64e52..c2277d2 100644 --- a/ci/pyrrha/Dockerfile +++ b/ci/pyrrha/Dockerfile @@ -36,11 +36,11 @@ ARG GHIDRA_IMAGE=pyrrha-ghidra:latest ARG DISASS_IMAGE=ghidra ARG DISASS_IMAGE_VERSION=latest -ARG DISASS_IMAGE_USER=user -FROM ${DISASS_IMAGE}:{DISASS_IMAGE_VERSION} +FROM ${DISASS_IMAGE}:${DISASS_IMAGE_VERSION} SHELL ["/bin/bash", "-c"] +ENV DISASS_IMAGE_USER=user ENV PYRRHA_INSTALL_DIR=/tmp/pyrrha_install ENV PYRRHA_WORKING_DIR=/tmp/pyrrha @@ -49,9 +49,9 @@ WORKDIR ${PYRRHA_INSTALL_DIR} RUN python3 -m pip install --no-cache-dir -U pip -COPY --chown=${DISASS_IMAGE_USER}:${DISASS_IMAGE_USER} ../../src src/ -COPY --chown=${DISASS_IMAGE_USER}:${DISASS_IMAGE_USER}user:user ../../pyproject.toml ./ -COPY --chown=${DISASS_IMAGE_USER}:${DISASS_IMAGE_USER}user:user ../../README.md ./ +COPY --chown=${DISASS_IMAGE_USER}:${DISASS_IMAGE_USER} 
src src/ +COPY --chown=${DISASS_IMAGE_USER}:${DISASS_IMAGE_USER} pyproject.toml ./ +COPY --chown=${DISASS_IMAGE_USER}:${DISASS_IMAGE_USER} README.md ./ # Overwrite the PyPI install from the base image with the local working copy. From 1bfd92d4c0c1f922b87cf32b25530024a4c82d5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 21 Apr 2026 10:10:45 +0200 Subject: [PATCH 22/62] ci: add new floating tags in images built MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- .gitlab-ci.yml | 63 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 6 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9572b60..3109acc 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -3,11 +3,15 @@ build_image: image: docker:29-dind tags: - dind + rules: + - if: '$CI_COMMIT_BRANCH == "main"' + - if: '$CI_COMMIT_BRANCH == "dev"' + - if: '$CI_COMMIT_TAG =~ /^v\d+\.\d+\.\d+$/' parallel: matrix: - BACKEND: "ida" VERSION: [91, 93] - LATEST: 93 + LATEST: 93 - BACKEND: "ghidra" VERSION: ["12.0.4"] LATEST: "12.0.4" @@ -28,14 +32,61 @@ build_image: echo "Waiting for docker daemon... ($i/30)" sleep 2 done - - echo "$CI_REGISTRY_PASSWORD" | docker login --username "$CI_REGISTRY_USER" --password-stdin "$CI_REGISTRY" + - echo "$CI_REGISTRY_PASSWORD" | docker login --username "$CI_REGISTRY_USER" --password-stdin "$CI_REGISTRY" script: - - docker build --pull -t "$DOCKER_IMAGE_NAME:$VERSION" -f ci/pyrrha/Dockerfile --build-arg DISASS_IMAGE=$CI_REGISTRY_IMAGE/ci/${BACKEND} --build-arg DISASS_IMAGE_VERSION=$VERSION . - - docker push "$DOCKER_IMAGE_NAME:$VERSION" + # Resolve the pyrrha version component from the ref that triggered the pipeline. 
+ # - tag v1.2.3 -> "1.2.3" + # - main branch -> "main" + # - dev branch -> "dev" + - | + if [[ -n "$CI_COMMIT_TAG" && "$CI_COMMIT_TAG" =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + PYRRHA_VERSION="${CI_COMMIT_TAG#v}" + REF_KIND="tag" + elif [[ "$CI_COMMIT_BRANCH" == "main" ]]; then + PYRRHA_VERSION="main" + REF_KIND="main" + elif [[ "$CI_COMMIT_BRANCH" == "dev" ]]; then + PYRRHA_VERSION="dev" + REF_KIND="dev" + else + echo "Unexpected ref (branch=$CI_COMMIT_BRANCH, tag=$CI_COMMIT_TAG) — aborting." + exit 1 + fi + echo "Resolved PYRRHA_VERSION=$PYRRHA_VERSION (REF_KIND=$REF_KIND)" + + # Primary image tag: - + - PRIMARY_TAG="${VERSION}-${PYRRHA_VERSION}" + - echo "Building $DOCKER_IMAGE_NAME:$PRIMARY_TAG" + - docker build --pull + -t "$DOCKER_IMAGE_NAME:$PRIMARY_TAG" + -f ci/pyrrha/Dockerfile + --build-arg DISASS_IMAGE=$CI_REGISTRY_IMAGE/ci/${BACKEND} + --build-arg DISASS_IMAGE_VERSION=$VERSION + . + - docker push "$DOCKER_IMAGE_NAME:$PRIMARY_TAG" + + # Additional floating tags, only for the LATEST backend version of this matrix row. 
+ # - main -> `latest` and `` + # - dev -> `latest-dev` + # - tag -> `stable` - | if [[ "$VERSION" == "$LATEST" ]]; then - docker tag "$DOCKER_IMAGE_NAME:$VERSION" "$DOCKER_IMAGE_NAME:latest" - docker push "$DOCKER_IMAGE_NAME:latest" + case "$REF_KIND" in + main) + for t in "latest" "$VERSION"; do + docker tag "$DOCKER_IMAGE_NAME:$PRIMARY_TAG" "$DOCKER_IMAGE_NAME:$t" + docker push "$DOCKER_IMAGE_NAME:$t" + done + ;; + dev) + docker tag "$DOCKER_IMAGE_NAME:$PRIMARY_TAG" "$DOCKER_IMAGE_NAME:latest-dev" + docker push "$DOCKER_IMAGE_NAME:latest-dev" + ;; + tag) + docker tag "$DOCKER_IMAGE_NAME:$PRIMARY_TAG" "$DOCKER_IMAGE_NAME:stable" + docker push "$DOCKER_IMAGE_NAME:stable" + ;; + esac fi after_script: - docker logout $CI_REGISTRY From cd9da4214bc366ae5805cf0c1be5e088cc5383da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 21 Apr 2026 10:10:56 +0200 Subject: [PATCH 23/62] ci: add trigger to internal doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- .gitlab-ci.yml | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 3109acc..46f794a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,3 +1,9 @@ +stages: + - test + - build + - notify + +#======================== BUILD DOCKER IMAGE AND PUSH TO REGISTRY ====================== build_image: stage: build image: docker:29-dind @@ -154,3 +160,43 @@ test_fs-cg: - BACKEND: "ghidra" VERSION: ["12.0.4"] +#========================== TRIGGER INTERNAL DOC UPDATE ================================ +.trigger_docs_base: + stage: notify + trigger: + project: firmware-re/cartography/pyrrha-docs-internal + branch: main + # strategy: depend # flip on to surface downstream failure on pyrrha + # CI. Off by default — a broken docs build shouldn't + # block pyrrha. 
+ variables: + UPSTREAM_PIPELINE_URL: $CI_PIPELINE_URL + +trigger_docs_main: + extends: .trigger_docs_base + variables: + UPSTREAM_REF: "main" + UPSTREAM_SHA: $CI_COMMIT_SHA + rules: + - if: $CI_COMMIT_BRANCH == "main" + when: on_success + +trigger_docs_dev: + extends: .trigger_docs_base + variables: + UPSTREAM_REF: "dev" + UPSTREAM_SHA: $CI_COMMIT_SHA + rules: + - if: $CI_COMMIT_BRANCH == "dev" + when: on_success + +trigger_docs_tag: + extends: .trigger_docs_base + variables: + UPSTREAM_REF: $CI_COMMIT_TAG + UPSTREAM_TAG: $CI_COMMIT_TAG + UPSTREAM_SHA: $CI_COMMIT_SHA + rules: + - if: $CI_COMMIT_TAG =~ /^v\d+\.\d+/ + when: on_success + From ae91da0890f2b2cffe36c5d1c0fa3a22bfc76774 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 21 Apr 2026 10:37:34 +0200 Subject: [PATCH 24/62] [fix]ci: path in trigger job MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 46f794a..4e1efac 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -164,7 +164,7 @@ test_fs-cg: .trigger_docs_base: stage: notify trigger: - project: firmware-re/cartography/pyrrha-docs-internal + project: firmware-re/cartography/pyrrha-internal-documentation branch: main # strategy: depend # flip on to surface downstream failure on pyrrha # CI. 
Off by default — a broken docs build shouldn't From c53587f68e23be1f1357f5aab9fb402ad1c182f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 21 Apr 2026 10:42:27 +0200 Subject: [PATCH 25/62] ci: only trigger build and test when src or tests are changed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- .gitlab-ci.yml | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4e1efac..95f2aff 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -3,6 +3,13 @@ stages: - build - notify +# Paths that should trigger test and build pipelines. +# Anchored here so test and build jobs stay in sync. +.source_paths: &source_paths + - tests/**/* + - src/**/* + - pyproject.toml + #======================== BUILD DOCKER IMAGE AND PUSH TO REGISTRY ====================== build_image: stage: build @@ -10,9 +17,13 @@ build_image: tags: - dind rules: + # Release tags always build, regardless of which files changed. + - if: '$CI_COMMIT_TAG =~ /^v\d+\.\d+\.\d+$/' + # Branch builds only when source, tests, or packaging metadata changed. - if: '$CI_COMMIT_BRANCH == "main"' + changes: *source_paths - if: '$CI_COMMIT_BRANCH == "dev"' - - if: '$CI_COMMIT_TAG =~ /^v\d+\.\d+\.\d+$/' + changes: *source_paths parallel: matrix: - BACKEND: "ida" @@ -101,6 +112,11 @@ build_image: test_data_structures: stage: test + # Only run tests when source, tests, or packaging metadata changed. + # Inherited by test_fs and test_fs-cg via `extends`. 
+ rules: + - if: '$CI_COMMIT_TAG =~ /^v\d+\.\d+\.\d+$/' + - changes: *source_paths before_script: - echo -e "\e[95m===== Install Pyrrha with test extension" - pip install '.[test]' @@ -198,5 +214,4 @@ trigger_docs_tag: UPSTREAM_SHA: $CI_COMMIT_SHA rules: - if: $CI_COMMIT_TAG =~ /^v\d+\.\d+/ - when: on_success - + when: on_success \ No newline at end of file From e0953601e2fbdc77f7b04132e6eb063c650adf26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 21 Apr 2026 17:51:02 +0200 Subject: [PATCH 26/62] doc: add new quick summary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- README.md | 7 +- docs/contributing/dev_mapper.md | 4 +- docs/index.md | 149 ++++++++++++++++++-------------- mkdocs.yml | 10 ++- pyproject.toml | 2 +- 5 files changed, 98 insertions(+), 74 deletions(-) diff --git a/README.md b/README.md index d441e1b..c6a69f0 100644 --- a/README.md +++ b/README.md @@ -33,12 +33,11 @@ path to function. ## Installation -The installation is done in three parts: +The installation is done in two parts: -1. Install mapper external dependencies: IDA dissassembler (with the decompilation option for the `exe-decomp` mapper) and [`Quokka` IDA plugin](https://github.com/quarkslab/quokka/releases). 1. Install `Pyrrha` itself. 1. Install [`NumbatUI`](https://github.com/quarkslab/NumbatUI) (or [`Sourcetrail`](https://github.com/CoatiSoftware/Sourcetrail)) to be able to visualize Pyrrha's results. - +1. _(Optional)_ Install Ghidra or IDA if you want to use `fs-cg` or `exe-decomp` mappers. > [!NOTE] > A quick start installation is available on [Pyrrha documentation](https://quarkslab.github.io/pyrrha/#installation). @@ -46,11 +45,11 @@ The installation is done in three parts: ## Usage The usage workflow is composed of two steps which allow you to separate DB creation and result visualization. + 1. 
Run Pyrrha to obtain NumbatUI compatible files (`*.srctrlprj` for the project file and `*.srctrldb` for the DB file). With the python package, you can just launch the command `pyrrha`. 2. Visualize your results with Sourcetrail/NumbatUI. - > [!NOTE] > The detailed documentation of each mapper is available in the [documentation](https://quarkslab.github.io/pyrrha/mappers/mappers/). diff --git a/docs/contributing/dev_mapper.md b/docs/contributing/dev_mapper.md index f46fd8f..2fd1054 100644 --- a/docs/contributing/dev_mapper.md +++ b/docs/contributing/dev_mapper.md @@ -6,7 +6,7 @@ First develop your mapper. We are using `numbat` to manipulate the db used by so Then, add the required dependencies into `pyproject.toml`. ## Integration into the main program -Once the mapper is ready, it should be integrated into `pyrrha` CLI by adding the corresponding subcommand in the `src/pyrrha_mapper/__main__.py`. The CLI system is handled with [click](https://click.palletsprojects.com) +Once the mapper is ready, it should be integrated into `pyrrha` CLI by adding the corresponding subcommand in the `src/pyrrha_mapper/__main__.py`. The CLI system is handled with [click](https://click.palletsprojects.com). The subcommand corresponds to a function implementing the main of your mapper and some decorators to declare the subcommand name, its options and its arguments. @@ -100,7 +100,7 @@ Finally, you should add a page relative to your mapper inside the documentation. 1. Write your documentation in a markdown file that should be place into the `docs/mappers` folder. !!! tip - We are using `material` theme of the `mkdocs` doc system. It provides a lot of nice features to improve your documentation like this note block. Do not hesitate to take a look at their [documentation](https://squidfunk.github.io/mkdocs-material/reference/)! + We are using `materialx` theme of the `mkdocs` doc system. It provides a lot of nice features to improve your documentation like this note block. 
Do not hesitate to take a look at their [documentation](https://jaywhj.github.io/mkdocs-materialx/)! 2. Add your mapper in mapper lists (in `README.md` and in `docs/mappers/mappers.md`). 3. Complete the `nav` section in the `mkdocs.yml` file to add your file in the site navigation system. diff --git a/docs/index.md b/docs/index.md index c6743eb..07e3913 100644 --- a/docs/index.md +++ b/docs/index.md @@ -8,89 +8,110 @@ ## Installation -The installation is done in three parts: + -1. Install mapper external dependencies: IDA dissassembler (with the decompilation option for the `exe-decomp` mapper) and [`Quokka` IDA plugin](https://github.com/quarkslab/quokka/releases). -1. Install `Pyrrha` itself. -1. Install [`NumbatUI`](https://github.com/quarkslab/NumbatUI) (or [`Sourcetrail`](https://github.com/CoatiSoftware/Sourcetrail)) to be able to visualize Pyrrha's results. - -!!! example "Quick Start" +??? code "Install Visualisation Tool" === "Sourcetrail" - 1. Install Quokka plugin by downloaded the appropriate version from its [release](https://github.com/quarkslab/quokka/releases) page. Then follow the instructions according to your OS. - - 2. Install Sourcetrail and Pyrrha. 
- - === "Linux" - ```bash - SOURCETRAIL_URL='https://github.com/CoatiSoftware/Sourcetrail/releases/download/2021.4.19/Sourcetrail_2021_4_19_Linux_64bit.tar.gz' - CHECKSUM=""f65a401daad8e16f29f7b2ff062a559999b6a8d44606db36cdf803de0cd7816d - EXTRACTION_DIR="/tmp/Sourcetrail_2021_4_19_Linux_64bit" - DOWNLOAD_PATH="$EXTRACTION_PATH.tar.gz" + === "Linux" + ```bash + SOURCETRAIL_URL='https://github.com/CoatiSoftware/Sourcetrail/releases/download/2021.4.19/Sourcetrail_2021_4_19_Linux_64bit.tar.gz' + CHECKSUM="f65a401daad8e16f29f7b2ff062a559999b6a8d44606db36cdf803de0cd7816d" + EXTRACTION_DIR="/tmp/Sourcetrail_2021_4_19_Linux_64bit" + DOWNLOAD_PATH="$EXTRACTION_DIR.tar.gz" - wget $SOURCETRAIL_URL -O $DOWNLOAD_PATH - echo $CHECKSUM $DOWNLOAD_PATH | sha256sum -c + wget $SOURCETRAIL_URL -O $DOWNLOAD_PATH + echo $CHECKSUM $DOWNLOAD_PATH | sha256sum -c - if [ $? == 0 ]; then - echo '==== Install Sourcetrail' - tar xf $DOWNLOAD_PATH -C $EXTRACTION_DIR - sudo $EXTRACTION_DIR/Sourcetrail/install.sh - rm -rf $DOWNLOAD_PATH $EXTRACTION_DIR - fi + if [ $? == 0 ]; then + echo '==== Install Sourcetrail' + tar xf $DOWNLOAD_PATH -C $EXTRACTION_DIR + sudo $EXTRACTION_DIR/Sourcetrail/install.sh + rm -rf $DOWNLOAD_PATH $EXTRACTION_DIR + fi + ``` + === "Windows" - # Install pyrrha - if [ $? == 0 ]; then - echo '==== Install Pyrrha' - pip install pyrrha-mapper - fi - ``` - === "Windows" + Download last Sourcetrail [release](https://github.com/CoatiSoftware/Sourcetrail/releases), unzip it and run the `setup.exe`. - 1. Download last Sourcetrail [release](https://github.com/CoatiSoftware/Sourcetrail/releases), unzip it and run the `setup.exe`. - 2. Install pyrrha: `pip install pyrrha-mapper` + === "MacOS" - === "MacOS" - - 1. Download last Sourcetrail [release](https://github.com/CoatiSoftware/Sourcetrail/releases), and install it following [Sourcetrail documentation](https://github.com/CoatiSoftware/Sourcetrail/releases). - 2.
Install pyrrha: `pip install pyrrha-mapper` + Download last Sourcetrail [release](https://github.com/CoatiSoftware/Sourcetrail/releases), and install it following [Sourcetrail documentation](https://github.com/CoatiSoftware/Sourcetrail/releases). === "NumbatUI (Ubuntu/Debian)" - _Tested only for last Ubuntu/Debian._ - - First install Quokka plugin by downloaded the appropriate version from its [release](https://github.com/quarkslab/quokka/releases) page. + _Tested only for last Ubuntu/Debian._ + + Run the following script that will clone and build `NumbatUI` and install `Pyrrha`. `NumbatUI` executable will be in `numbatui/build/Release/app`. + + ```sh + # Prerequisites for Numbat UI + sudo apt-get update + sudo apt-get install -y \ + cmake \ + git \ + build-essential \ + libboost-filesystem-dev libboost-program-options-dev libboost-system-dev libboost-date-time-dev \ + qt6-svg-dev qt6-base-dev qt6-5compat-dev \ + unzip wget \ + libclang-17-dev clang-17 + + # Clone and Build NumbatUI + git clone https://github.com/quarkslab/NumbatUI.git numbatui + cd numbatui + mkdir -p build/Release + cd build/Release + cmake -DCMAKE_BUILD_TYPE="Release" -DBUILD_CXX_LANGUAGE_PACKAGE=ON -DBUILD_PYTHON_LANGUAGE_PACKAGE=ON ../.. && make NumbatUI -j $(nproc) + ``` + +!!! code "Install Pyrrha" + === ":fontawesome-brands-python: Python Package" + Require a local installation of **IDA Pro 9.1+** and/or **Ghidra 12.0+** except for `fs` mapper. + ```python + # in a virtualenv + pip install pyrrha-mapper + ``` + === ":fontawesome-brands-docker: Docker Image" + Download the docker image from Github Registry, this image is backed by Ghidra. + + ```sh + docker pull ghcr.io/quarkslab/pyrrha:latest + ``` - Then run the following script that will clone and build `NumbatUI` and install `Pyrrha`. `NumbatUI` will in `numbatui/build/Release/app`. 
- - ``` - # Prerequisites for Numbat UI - sudo apt-get update - sudo apt-get install -y \ - cmake \ - git \ - build-essential \ - libboost-filesystem-dev libboost-program-options-dev libboost-system-dev libboost-date-time-dev \ - qt6-svg-dev qt6-base-dev qt6-5compat-dev \ - unzip wget \ - libclang-17-dev clang-17 - - # Clone and Build NumbatUI - git clone https://github.com/quarkslab/NumbatUI.git numbatui - cd numbatui - mkdir -p build/Release - cd build/Release - cmake -DCMAKE_BUILD_TYPE="Release" -DBUILD_CXX_LANGUAGE_PACKAGE=ON -DBUILD_PYTHON_LANGUAGE_PACKAGE=ON ../.. && make NumbatUI -j $(nproc) - - # Install pyrrha - pip install pyrrha-mapper - ``` !!! note Detailed instructions can be found on the [dedicated documentation page](installation.md). --8<-- "README.md:usage" + +!!! code "Run Pyrrha" + === ":fontawesome-brands-python: Python Package" + If your backend is not on `PATH`, indicate its directory using the matching environment variable. + ```sh + export IDADIR=/opt/idapro + export GHIDRA_INSTALL_DIR=/opt/ghidra_12.0.4_PUBLIC + ``` + Run **Pyrrha** to obtain NumbatUI/Sourcetrail compatible files. + ``` + pyrrha MAPPER [OPTIONS] ROOT_DIRECTORY + ``` + + === ":fontawesome-brands-docker: Docker Image" + Run the docker image (backed by Ghidra) from the parent directory of `ROOT_DIRECTORY`. + + ```sh + cd ROOT_DIRECTORY/.. + docker run --rm -t -v $PWD:/tmp/pyrrha ghcr.io/quarkslab/pyrrha:latest MAPPER [OPTIONS] ROOT_DIRECTORY + ``` + +!!! code "Visualize results" + You should have a `*.srctrlprj` file corresponding to the project file and a `*.srctrldb` file for the DB. + Run `NumbatUI` or `Sourcetrail` on the project file. You can now navigate into the results. + + The user interface is described in depth in the [NumbatUI documentation](https://github.com/quarkslab/NumbatUI/blob/main/DOCUMENTATION.md#user-interface).
+ Do not hesitate to take a look at all the possibilities offered by NumbatUI, especially [Custom Trails](https://github.com/quarkslab/NumbatUI/blob/main/DOCUMENTATION.md#custom-trail-dialog). !!! note The detailed documentation of each mapper is available in the [documentation](mappers/mappers.md). diff --git a/mkdocs.yml b/mkdocs.yml index 289cbbd..c461a2a 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -5,10 +5,14 @@ site_author: "Quarkslab" repo_url: "https://github.com/quarkslab/pyrrha" repo_name: "quarkslab/pyrrha" watch: [ mkdocs.yml, README.md, CHANGELOG.md, src/pyrrha_mapper ] -copyright: Copyright © 2023-2025 Quarkslab +copyright: Copyright © 2023-2026 Quarkslab theme: - name: "material" + name: "materialx" + admonition: + code: + icon: octicons/file-code-24 + color: rgba(158, 158, 158, 0.7) palette: # Palette toggle for light mode - media: "(prefers-color-scheme: light)" @@ -30,6 +34,7 @@ theme: features: - content.code.annotate - content.code.copy + - content.tabs.link - footer nav: @@ -42,7 +47,6 @@ nav: - Exe-Decomp: mappers/exe-decomp.md - Contributing: - Mapper Development: contributing/dev_mapper.md - - Disassembler Integration: disassembler.md - Changelog: changelog.md - License: license.md diff --git a/pyproject.toml b/pyproject.toml index 7ed00b8..60965da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,7 +66,7 @@ doc = [ 'mkdocs', 'mkdocs-autorefs', 'mkdocs-glightbox', - 'mkdocs-material[imaging]', + 'mkdocs-materialx[imaging]', 'mkdocs-section-index', 'mike', 'pymdown-extensions' From 19cfebc09a1a59bda00e4493ae3b1d0deb9db782 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 21 Apr 2026 17:53:49 +0200 Subject: [PATCH 27/62] ci: fix documentation of IDA docker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- ci/ida/Dockerfile | 4 +--- ci/ida/build.sh | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/ci/ida/Dockerfile 
b/ci/ida/Dockerfile index f2d91f7..ad8029a 100644 --- a/ci/ida/Dockerfile +++ b/ci/ida/Dockerfile @@ -33,11 +33,9 @@ # # Run command (licence is mounted at runtime, never stored in the image): # docker run --rm \ -# --mount type=secret,id=ida_license,src=idapro.hexlic \ +# --mount type=bind,dst=/home/user/.idapro/ida_license.hexlic,src=idapro.hexlic,ro \ # pyrrha-ida # -# IDA locates the licence via the HEXRAYS_LICENSE env var which is set to the -# secret path below. # =========================================================================== # ======================== IDA Installation ================================= diff --git a/ci/ida/build.sh b/ci/ida/build.sh index 5b23857..a062c2a 100755 --- a/ci/ida/build.sh +++ b/ci/ida/build.sh @@ -32,8 +32,8 @@ # The installer is passed via bind-mount (no size limit, never committed to any # layer). It must be located next to this script as ida-pro_.run. # The licence file (idapro.hexlic) is NEVER baked into any image layer. -# Pass it at runtime via a Docker secret: -# docker run --mount type=secret,id=ida_license,src=idapro.hexlic : +# Pass it at runtime via a Docker bind: +# docker run --mount type=bind,dst=/home/user/.idapro/ida_license.hexlic,src=idapro.hexlic,ro : set -euo pipefail From 85aa5857971ef8462b8d5c288cdfcb795d2a0a25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Fri, 24 Apr 2026 12:41:39 +0200 Subject: [PATCH 28/62] doc: new installation doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- docs/installation.md | 106 ++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 62 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index ba09749..e5c3317 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -1,77 +1,50 @@ # Installation The installation is done in three parts: -- installing `Pyrrha` (as a Python module); -- installing mappers external dependencies
if required; -- installing `NumbatUI` to be able to visualize Pyrrha's results. +- installing **Pyrrha** (as a Python module); +- installing **NumbatUI** to be able to visualize Pyrrha's results. !!! info - It is also possible to visualize results with `Sourcetrail`, it is the base from which `NumbatUI` was forked. The user won't be able to use new features like the renaming of the node or the launch of external programs from Sourcetrail/NumbatUI. + It is also possible to visualize results with **Sourcetrail**, it is the base from which **NumbatUI** was forked. The user won't be able to use new features like the renaming of the nodes. ## Pyrrha Installation -=== "Python Package" - Pyrrha requires a Python version >= 3.10. - It is recommended to install the Python package inside a virtualenv. You can use `pip` to install it. - ```python - pip install pyrrha-mapper +=== ":fontawesome-brands-python: Python Package" + **Pyrrha** relies on a backend (IDA or Ghidra) to generate its results, except for the light mapper `fs`. This installation is not covered here, we consider the following prerequisites: + + - Python **≥ 3.10**. + - A local installation of **IDA Pro 9.1+** and/or **Ghidra 12.0+** — + required by the InterCG mapper. + + Then you can install **Pyrrha** Python package in a virtual environment with `pip`. + ```sh + # Do not forget to activate your virtualenv + pip install pyrrha-mapper ``` If you prefer using sources to install Pyrrha, do the following: - ```commandline + ```sh # Do not forget to activate your virtualenv pip install 'pyrrha @ git+https://github.com/quarkslab/pyrrha' - - # If you prefer, you can manually clone the repository and then install the package - git clone https://github.com/quarkslab/pyrrha - cd pyrrha - pip install '.' ``` -=== "Docker" - `pyrrha` can be used with a docker. It provides Pyrrha, but you still need to install NumbatUI on your system as described in the [NumbatUI Installation](#numbatui-installation) section. 
+=== ":fontawesome-brands-docker: Docker Image" + **Pyrrha** can be used with a docker. It provides **Pyrrha** with a backend (**Ghidra**), but you still need to install NumbatUI on your system as described in the [**NumbatUI** Installation](#numbatui-installation) section. The docker image is directly available from our [Github registry](https://github.com/orgs/quarkslab/packages/container/package/pyrrha). ```commandline cd ROOT_DIRECTORY/.. - docker run --rm -t -v $PWD:/tmp/pyrrha ghcr.io/quarkslab/pyrrha:latest fs [OPTIONS] ROOT_DIRECTORY + docker run --rm -t -v $PWD:/tmp/pyrrha ghcr.io/quarkslab/pyrrha:latest MAPPER [OPTIONS] ROOT_DIRECTORY ``` - !!! warning - The docker image has only be built for the `fs` mapper. + ## Visualizer Installation - -=== "NumbatUI" - NumbatUI should be compiled locally, as explained in its [README](https://github.com/quarkslab/NumbatUI/blob/main/README.md). For the moment it has only be tested on Ubuntu/Debian distributions. - Here are the summarized compilation instructions: - - **Prerequisites** - ```commandline - apt-get update - apt-get install -y \ - cmake \ - git \ - build-essential \ - libboost-filesystem-dev libboost-program-options-dev libboost-system-dev libboost-date-time-dev \ - qt6-svg-dev qt6-base-dev qt6-5compat-dev \ - unzip wget \ - libclang-17-dev clang-17 - ``` - - **Compilation** - ```commandline - git clone https://github.com/quarkslab/NumbatUI.git numbatui - cd numbatui - mkdir -p build/Release - cd build/Release - cmake -DCMAKE_BUILD_TYPE="Release" -DBUILD_CXX_LANGUAGE_PACKAGE=ON -DBUILD_PYTHON_LANGUAGE_PACKAGE=ON ../.. 
&& make NumbatUI -j $(nproc) - ``` -=== "Sourcetrail" +=== "**Sourcetrail**" === "Linux" - ```bash + ```sh SOURCETRAIL_URL='https://github.com/CoatiSoftware/Sourcetrail/releases/download/2021.4.19/Sourcetrail_2021_4_19_Linux_64bit.tar.gz' CHECKSUM=""f65a401daad8e16f29f7b2ff062a559999b6a8d44606db36cdf803de0cd7816d EXTRACTION_DIR="/tmp/Sourcetrail_2021_4_19_Linux_64bit" @@ -86,12 +59,6 @@ The installation is done in three parts: sudo $EXTRACTION_DIR/Sourcetrail/install.sh rm -rf $DOWNLOAD_PATH $EXTRACTION_DIR fi - - # Install pyrrha - if [ $? == 0 ]; then - echo '==== Install Pyrrha' - pip install pyrrha-mapper - fi ``` === "Windows" @@ -101,16 +68,31 @@ The installation is done in three parts: Download last Sourcetrail [release](https://github.com/CoatiSoftware/Sourcetrail/releases), and install it following [Sourcetrail documentation](https://github.com/CoatiSoftware/Sourcetrail/releases). +=== "**NumbatUI**" + **NumbatUI** should be compiled locally, as explained in its [README](https://github.com/quarkslab/NumbatUI/blob/main/README.md). For the moment it has only been tested on Ubuntu/Debian distributions. + Here are the summarized compilation instructions: -## External Dependencies - -The `fs-cg` and the `exec-decomp` mappers require to have a proper installation of [Quokka](https://github.com/quarkslab/quokka) and so of IDA. The `exec-decomp` also requires to have an IDA license with decompiler. - -The Quokka plugin for IDA can directly be downloaded from the [Release page](https://github.com/quarkslab/quokka/releases). The associated Python package is directly installed during Pyrrha Python package installation. - -!!! note - The `fs-cg` and the `exec-decomp` mappers could be used without Quokka and IDA if you already have the cache files for your firmware (`.decompiled` and `.quokka` files). More details in the corresponding mapper documentation.
+ **Prerequisites** + ```sh + apt-get update + apt-get install -y \ + cmake \ + git \ + build-essential \ + libboost-filesystem-dev libboost-program-options-dev libboost-system-dev libboost-date-time-dev \ + qt6-svg-dev qt6-base-dev qt6-5compat-dev \ + unzip wget \ + libclang-17-dev clang-17 + ``` + **Compilation** + ```sh + git clone https://github.com/quarkslab/NumbatUI.git numbatui + cd numbatui + mkdir -p build/Release + cd build/Release + cmake -DCMAKE_BUILD_TYPE="Release" -DBUILD_CXX_LANGUAGE_PACKAGE=ON -DBUILD_PYTHON_LANGUAGE_PACKAGE=ON ../.. && make NumbatUI -j $(nproc) + ``` ## Documentation From 873a01ae024ec4ebafb775500be46b3fd8744251 Mon Sep 17 00:00:00 2001 From: rletang Date: Fri, 24 Apr 2026 14:49:50 +0200 Subject: [PATCH 29/62] add soname support --- src/pyrrha_mapper/common/objects.py | 26 ++++++++++++++++++++++++- src/pyrrha_mapper/fs/imports_mapper.py | 27 +++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/src/pyrrha_mapper/common/objects.py b/src/pyrrha_mapper/common/objects.py index e0969cb..62cc2b7 100644 --- a/src/pyrrha_mapper/common/objects.py +++ b/src/pyrrha_mapper/common/objects.py @@ -107,6 +107,7 @@ class Binary(FileSystemComponent): calls: dict[str, list[Symbol]] = Field(default_factory=dict) # ELF-specific fields + soname: str | None = Field(default=None) # ELF DT_SONAME (e.g. 
"libpthread.so.0") version_requirement: dict[str, list[str]] = Field( default_factory=dict ) # dict(symbol_name, list(requirements)) @@ -419,6 +420,7 @@ class FileSystem(BaseModel): symlinks: dict[Path, Symlink] = Field(default_factory=dict) _binary_names: dict[str, list[Binary]] = PrivateAttr(default_factory=dict, init=False) _symlink_names: dict[str, list[Symlink]] = PrivateAttr(default_factory=dict, init=False) + _soname_to_binaries: dict[str, list[Binary]] = PrivateAttr(default_factory=dict, init=False) def __repr__(self): # noqa: D105 return ( @@ -446,6 +448,7 @@ def fs_bin_serializer(self, v: dict[Path, Binary], info: SerializationInfo) -> A "id": True, "path": True, "name": True, + "soname": True, "imported_symbols": True, "exported_symbols": True, "exported_functions": True, @@ -526,7 +529,11 @@ def fs_bin_validate(cls, data: Any, info: ValidationInfo) -> Any: ) from e if lib_path_obj not in res: raise ValueError(f"Imported lib '{lib_path}' not listed in binaries") - res[bin_path].add_imported_library(res[lib_path_obj]) + # Store under the original import name (the dict key) rather + # than the binary's filename so SONAME keys survive + # serialization round-trips (e.g. "libc.so.6" stays as-is + # instead of being replaced by "libc-2.11.1.so"). + res[bin_path].imported_libraries[name] = res[lib_path_obj] # Deduplicate: replace repeated Symbol instances with the same id by one object. symbols_by_ids: dict[int, Symbol] = { @@ -674,6 +681,15 @@ def _record_component_name(self, fs_object: Binary | Symlink) -> None: names_dict[fs_object.name].append(fs_object) # type: ignore else: names_dict[fs_object.name] = [fs_object] # type: ignore + # Index binaries by their ELF SONAME so that imports referencing the + # SONAME (e.g. "libpthread.so.0") can be resolved even when no symlink + # with that name exists in the firmware filesystem. 
+ if isinstance(fs_object, Binary) and fs_object.soname: + soname = fs_object.soname + if soname in self._soname_to_binaries: + self._soname_to_binaries[soname].append(fs_object) + else: + self._soname_to_binaries[soname] = [fs_object] def _set_object_realpath(self, obj: FileSystemComponent) -> None: obj.real_path = Path(self.root_dir) / ("." + str(obj.path)) @@ -712,6 +728,14 @@ def symlink_name_exists(self, name: str) -> bool: """return: true if the given name is stored in the current FS instance.""" return name in self._symlink_names + def soname_exists(self, soname: str) -> bool: + """return: true if the given SONAME is stored in the current FS instance.""" + return soname in self._soname_to_binaries + + def get_binaries_by_soname(self, soname: str) -> list[Binary]: + """:return: the binaries with the given SONAME.""" + return self._soname_to_binaries[soname] + def get_binaries_by_name(self, name: str) -> list[Binary]: """:return: the binaries with the given path.""" return self._binary_names[name] diff --git a/src/pyrrha_mapper/fs/imports_mapper.py b/src/pyrrha_mapper/fs/imports_mapper.py index 1cdbea0..f013681 100644 --- a/src/pyrrha_mapper/fs/imports_mapper.py +++ b/src/pyrrha_mapper/fs/imports_mapper.py @@ -88,6 +88,13 @@ def load_binary(root_directory: Path, file_path: Path) -> tuple[Binary, Any] | s bin_obj.image_base = parsing_res.imagebase bin_obj.is_relocatable = parsing_res.header.file_type == lief.ELF.Header.FILE_TYPE.REL + # Extract the ELF SONAME if present (shared libraries only). + # This allows resolving imports that reference the SONAME rather + # than the actual filename (e.g. libpthread.so.0 vs libpthread-2.11.1.so). 
+ for dyn_entry in parsing_res.dynamic_entries: + if dyn_entry.tag == lief.ELF.DynamicEntry.TAG.SONAME: + bin_obj.soname = str(dyn_entry.name) + break # parse imported libs for lib in parsing_res.libraries: bin_obj.add_imported_library_name(str(lib)) @@ -315,6 +322,16 @@ def _resolve_lib_import( if dest is None: return self._PartialLibImport(initial_import=sym_obj) return self._SolvedLibImport(initial_import=sym_obj, final_import=dest) + elif self.fs.soname_exists(lib_name): + # The imported name matches the SONAME of a binary whose filename + # differs (e.g. libpthread.so.0 is the SONAME of libpthread-2.11.1.so). + matching_binaries = self.fs.get_binaries_by_soname(lib_name) + lib_obj = self._select_fs_component( + strategy, matching_binaries, log_prefix, lib_name + ) + if lib_obj is None: + return self._FailedLibImport() + return self._SolvedLibImport(initial_import=lib_obj, final_import=lib_obj) else: return self._FailedLibImport() @@ -335,6 +352,7 @@ def map_lib_imports( targeted Binary object in the case of a Symlink) """ log_prefix = f"[lib imports] {binary.path}" + for lib_name in binary.imported_library_names: res = self._resolve_lib_import(lib_name, resolution_strategy, log_prefix) match res: @@ -343,7 +361,14 @@ def map_lib_imports( # resolution, the final target of the symlink is considered to be # imported and not the symlink itself self.record_import_in_db(binary.id, res.initial_import.id, log_prefix) - binary.add_imported_library(res.final_import) + + if lib_name != res.final_import.name: + # SONAME case: store the resolved binary under the + # original import name (the SONAME) rather than the + # binary's filename, to avoid a spurious extra entry. 
+ binary.imported_libraries[lib_name] = res.final_import + else: + binary.add_imported_library(res.final_import) case self._PartialLibImport(): self.record_import_in_db(binary.id, res.initial_import.id, log_prefix) logging.warning( From 37ee69c1cdf74b606f50a9e0263f47dbf92686c3 Mon Sep 17 00:00:00 2001 From: rletang Date: Fri, 24 Apr 2026 15:55:46 +0200 Subject: [PATCH 30/62] Update tests for soname --- tests/test_cli.py | 21 +++++++++++++++++- ...so.1.1 => libcrypto.so.FOR_SONAME_TESTING} | Bin 2 files changed, 20 insertions(+), 1 deletion(-) rename tests/test_fw/lib/{libcrypto.so.1.1 => libcrypto.so.FOR_SONAME_TESTING} (100%) diff --git a/tests/test_cli.py b/tests/test_cli.py index c3892c5..5c58429 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -83,7 +83,8 @@ def SUBCOMMAND(self) -> str: FW_TEST_BIN_PATHS = { FW_TEST_LD, Path("/lib/libc.so.6"), - Path("/lib/libcrypto.so.1.1"), + # Path("/lib/libcrypto.so.1.1"), + Path("/lib/libcrypto.so.FOR_SONAME_TESTING"), Path("/lib/libdl.so.2"), Path("/lib/libpthread.so.0"), Path("/lib/libssl.so.1.1"), @@ -91,6 +92,14 @@ def SUBCOMMAND(self) -> str: } FW_TEST_SYMLINKS_PATHS = {Path("/lib/libssl.so")} + FW_TEST_SONAMES = { + "ld-linux.so.3" : "ld-linux.so.3", + "libcrypto.so.FOR_SONAME_TESTING": "libcrypto.so.1.1", + "libdl.so.2": "libdl.so.2", + "libpthread.so.0": "libpthread.so.0", + "libssl.so.1.1": "libssl.so.1.1" + } + # =============================== INTERNAL STUFFS ================================== class ExecResults(NamedTuple): # noqa: D106 @@ -260,6 +269,16 @@ def test_resolved_imported_symbols(self, bin_path: Path, export_dump: FileSystem "Some imported symbols have not been resolved" ) + @pytest.mark.parametrize("export_res", [1, 16], indirect=True) + @pytest.mark.parametrize( + "bin_path", BaseTestFsMapper.FW_TEST_BIN_PATHS, ids=BaseTestFsMapper._path_id + ) + def test_sonames(self, bin_path: Path, export_dump: FileSystem) -> None: + """Each binary's parsed SONAME matches its expected value.""" + _bin =
export_dump.get_binary_by_path(bin_path) + if _bin.path.name in BaseTestFsMapper.FW_TEST_SONAMES.keys(): + assert BaseTestFsMapper.FW_TEST_SONAMES[_bin.path.name] == _bin.soname, "Some sonames are not matching" + class TestFsCgMapper(BaseTestFsMapper): """Main functional test class for the fs-cg mapper. Tests are done from the CLI.""" diff --git a/tests/test_fw/lib/libcrypto.so.1.1 b/tests/test_fw/lib/libcrypto.so.FOR_SONAME_TESTING similarity index 100% rename from tests/test_fw/lib/libcrypto.so.1.1 rename to tests/test_fw/lib/libcrypto.so.FOR_SONAME_TESTING From fe29885cb3dccf8bf183e1396f6d315a1043e85f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 5 May 2026 15:04:31 +0200 Subject: [PATCH 31/62] all: rework mappers to have backend suppoort implement only in one common place for all mappers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- src/pyrrha_mapper/__main__.py | 2 +- src/pyrrha_mapper/backend/__init__.py | 22 + src/pyrrha_mapper/backend/base.py | 86 ++++ src/pyrrha_mapper/backend/ghidra.py | 385 +++++++++++++++ src/pyrrha_mapper/backend/ida.py | 212 ++++++++ src/pyrrha_mapper/intercg/fwmapper.py | 30 +- src/pyrrha_mapper/intercg/loader.py | 666 +++----------------------- 7 files changed, 775 insertions(+), 628 deletions(-) create mode 100644 src/pyrrha_mapper/backend/__init__.py create mode 100644 src/pyrrha_mapper/backend/base.py create mode 100644 src/pyrrha_mapper/backend/ghidra.py create mode 100644 src/pyrrha_mapper/backend/ida.py diff --git a/src/pyrrha_mapper/__main__.py b/src/pyrrha_mapper/__main__.py index 4b55451..1a2ab90 100644 --- a/src/pyrrha_mapper/__main__.py +++ b/src/pyrrha_mapper/__main__.py @@ -96,7 +96,7 @@ def backend_option(f): "-b", "--backend", required=False, - type=click.Choice(Backend, case_sensitive=False), + type=click.Choice([Backend.IDA, Backend.GHIDRA], case_sensitive=False), default=Backend.IDA, show_default=True, 
help="Backend to use.", diff --git a/src/pyrrha_mapper/backend/__init__.py b/src/pyrrha_mapper/backend/__init__.py new file mode 100644 index 0000000..571d0cd --- /dev/null +++ b/src/pyrrha_mapper/backend/__init__.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023-2025 Quarkslab +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Disassembler backends (IDA, Ghidra) used by mappers.""" + +from .base import Backend +from .ghidra import Ghidra +from .ida import IDA + +__all__ = ["Backend", "Ghidra", "IDA"] diff --git a/src/pyrrha_mapper/backend/base.py b/src/pyrrha_mapper/backend/base.py new file mode 100644 index 0000000..fe5400f --- /dev/null +++ b/src/pyrrha_mapper/backend/base.py @@ -0,0 +1,86 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023-2025 Quarkslab +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+"""Interface for backends used by mappers.""" + +from abc import ABCMeta, abstractmethod +from collections.abc import Iterator +from pathlib import Path + +from pyrrha_mapper.types import FuncType + + +class Backend(object, metaclass=ABCMeta): + """Abstraction of any backend used to run analysis.""" + + def __init__( + self, + bin_path: Path, + root_directory: Path | None, + decompilation: bool = False, + image_base: int = 0, + ) -> None: + """Open the binary parser and run any required analysis.""" + pass + + @abstractmethod + def close(self) -> None: + """Close the binary parser and release all resources.""" + ... + + @abstractmethod + def is_func_start(self, addr: int) -> bool: + """:return: True if *addr* (parser space) is the entry point of a function.""" + ... + + @property + @abstractmethod + def func_addrs(self) -> Iterator[int]: + """Yield the parser-space entry-point address of every known function.""" + + @abstractmethod + def func_mangled_name(self, addr: int) -> str: + """:return:: the raw name of a function at *addr*.""" + ... + + @abstractmethod + def func_demangled_name(self, addr: int) -> str: + """:return: the demangled name, falling back to the mangled name.""" + ... + + @abstractmethod + def func_children(self, addr: int) -> list[int]: + """:return: entry-point addresses of callees of the function at *addr*.""" + ... + + @abstractmethod + def func_parents(self, addr: int) -> list[int]: + """:return: entry-point addresses of callers of the function at *addr*.""" + ... + + @abstractmethod + def func_type(self, addr: int) -> FuncType: + """:return: the FuncType of the function at *addr*. + + Thunk stubs that resolve to external/imported functions must return + ``FuncType.IMPORTED`` so the trampoline resolution in ``__init__`` + correctly forwards callers to the imported symbol name. + """ + ... + + @abstractmethod + def func_decompiled(self, addr: int) -> str: + """:return: decompilation result of the function""" + ... 
diff --git a/src/pyrrha_mapper/backend/ghidra.py b/src/pyrrha_mapper/backend/ghidra.py new file mode 100644 index 0000000..22ebbee --- /dev/null +++ b/src/pyrrha_mapper/backend/ghidra.py @@ -0,0 +1,385 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023-2025 Quarkslab +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Ghidra backend implementation of Backend abstract interface.""" + +import logging +import os +import re +import shutil +import tempfile +from collections.abc import Iterator +from pathlib import Path + +import pyghidra + +from pyrrha_mapper.backend import Backend +from pyrrha_mapper.types import FuncType + +# Analyzers required for call-graph extraction. +# Everything else is explicitly disabled to minimise analysis time. 
+_GHIDRA_REQUIRED_ANALYZERS: frozenset[str] = frozenset( + [ + # --- Function discovery --- + "Disassemble Entry Points", + "Function Start Search", + "Function Start Search After Code", + "Non-Returning Functions - Discovered", + "Non-Returning Functions - Known", + # --- Call graph / cross-references --- + "Call Convention ID", + "Call-Fixup Installer", + "Subroutine References", + "Subroutine References - One Time", + # --- Thunk resolution --- + "Thunk Function", + # --- Format-specific import/export tables --- + # ELF + "ELF Scalar Operand References", + "External Entry References", + # PE + "PE Entry Point", + "Windows x86 PE Thunk Functions", + # Mach-O (no extra analyzer needed beyond the loader itself) + # --- Demangling --- + "Demangler GNU", + "Demangler Microsoft", + ] +) + +# Additional analyzers required when the Ghidra decompiler is used to produce +# pseudocode (i.e. in GhidraLoader but NOT in GhidraParser). +# +# Stack — stack-frame analysis; needed for correct local- +# variable naming in pseudocode (param_N / local_N). +# Stack Variable References — accurate tracking of stack-slot references +# across basic blocks used by the decompiler. +# Shared Return Calls — identifies tail-call / shared-epilogue patterns; +# without it some call edges are absent from the +# decompiled output. +# Data Type Propagation — propagates inferred struct/pointer types through +# the program; without it the decompiler emits +# ``undefined *`` for most pointer arguments, +# making call-site name matching less reliable. +_GHIDRA_DECOMPILER_EXTRA_ANALYZERS: frozenset[str] = frozenset( + [ + "Stack", + "Stack Variable References", + "Shared Return Calls", + "Data Type Propagation", + ] +) + +# Tool-generated fallback names emitted by Ghidra when the real symbol name is +# unknown. Callees matching this pattern cannot be resolved as meaningful +# targets and must be skipped. 
#   FUN_<hex addr>   unnamed Ghidra function
#   _INIT_<n>        ELF .init_array slot
#   _FINI_<n>        ELF .fini_array slot
_GHIDRA_SYNTHETIC_NAME_RE: re.Pattern[str] = re.compile(
    r"^(?:FUN_[0-9A-Fa-f]+|_INIT_\d+|_FINI_\d+)$"
)


class Ghidra(Backend):
    """Ghidra backend driven headlessly through ``pyghidra``.

    Addresses exchanged with callers are in "parser space": the raw Ghidra
    offset minus the program image base (see ``_to_parser_addr``).
    """

    def __init__(
        self,
        bin_path: Path,
        root_directory: Path | None,
        decompilation: bool = False,
        image_base: int = 0,
        timeout: int = 600,
    ) -> None:
        """Open the binary in a temporary Ghidra project and analyse it.

        If any step of the set-up fails, every resource acquired so far
        (program context, temporary project directory) is released before the
        exception is re-raised.

        :param bin_path: path of the binary, joined to *root_directory* when
            the latter is given.
        :param root_directory: optional directory prepended to *bin_path*.
        :param decompilation: when True, also enable the analyzers listed in
            ``_GHIDRA_DECOMPILER_EXTRA_ANALYZERS`` and open a decompiler
            interface.
        :param image_base: image base supplied by the caller. It is only
            stored; the load base used for address conversion is read from
            the Ghidra program itself.
        :param timeout: timeout handed to ``DecompInterface.decompileFunction``
            for each function (seconds).
        """
        self.decompilation_activated = decompilation
        self.image_base = image_base
        self._timeout = timeout

        # Initialise all attributes upfront so close() is always safe, even
        # when set-up aborts half-way.
        self._pyghidra_ctx = None
        self._ghidra_program = None
        self._ghidra_project_dir: Path | None = None
        self._ghidra_func_manager = None
        self._ghidra_symbol_table = None
        self._ghidra_demangler = None
        self._ghidra_cached_func = None
        self._ghidra_load_base: int = 0
        self._ghidra_monitor = None
        self.ifc = None
        self.monitor = None

        try:
            self._open_and_analyze(bin_path, root_directory)
        except Exception:
            # Do not leak the temp project or the open program on failure.
            self.close()
            raise

    def _open_and_analyze(self, bin_path: Path, root_directory: Path | None) -> None:
        """Create the temp project, start the JVM if needed, open and analyse the program."""
        self._ghidra_project_dir = Path(tempfile.mkdtemp(prefix=f"ghidra_{os.getpid()}_"))

        # Start the JVM once per worker process (no-op if already running).
        if not pyghidra.started():
            from pyghidra.launcher import HeadlessPyGhidraLauncher  # type: ignore

            launcher = HeadlessPyGhidraLauncher()
            launcher.add_vmargs("-Xms512m", "-Xmx2g", "-XX:+UseG1GC")
            launcher.start()

        # Ghidra imports must come after JVM start.
        from ghidra.app.decompiler import DecompInterface  # type: ignore
        from ghidra.app.util.demangler.gnu import GnuDemangler  # type: ignore
        from ghidra.util.task import ConsoleTaskMonitor  # type: ignore

        self._ghidra_monitor = ConsoleTaskMonitor()

        # Open without running analysis yet so we can configure the analyser set.
        target = bin_path if root_directory is None else root_directory / bin_path
        ctx = pyghidra.open_program(
            str(target),
            project_location=str(self._ghidra_project_dir),
            project_name="p",
            analyze=False,
        )
        flat_api = ctx.__enter__()
        # Only remember the context once it has really been entered, so that
        # close() never calls __exit__ on a context that was not opened.
        self._pyghidra_ctx = ctx
        program = flat_api.getCurrentProgram()

        # Build the effective analyser set and apply it: every analyzer that
        # is not explicitly wanted gets switched off.
        active_analyzers = _GHIDRA_REQUIRED_ANALYZERS
        if self.decompilation_activated:
            active_analyzers = active_analyzers | _GHIDRA_DECOMPILER_EXTRA_ANALYZERS
        analyzer_options = program.getOptions("Analyzers")
        for option_name in analyzer_options.getOptionNames():
            try:
                analyzer_options.setBoolean(option_name, option_name in active_analyzers)
            except Exception:
                # Some option names are not simple booleans; skip them silently.
                pass

        flat_api.analyzeAll(program)

        self._ghidra_program = program
        # Derive load base from the program itself, not from the caller's
        # image_base, so that _to_ghidra_address / _to_parser_addr are always
        # consistent with each other.
        self._ghidra_load_base = program.getImageBase().getOffset()
        self._ghidra_func_manager = program.getFunctionManager()
        self._ghidra_symbol_table = program.getSymbolTable()

        demangler = GnuDemangler()
        self._ghidra_demangler = demangler if demangler.canDemangle(program) else None

        if self.decompilation_activated:
            self.ifc = DecompInterface()
            self.ifc.openProgram(self._ghidra_program)
            self.monitor = ConsoleTaskMonitor()

    def close(self) -> None:
        """Close the binary parser and release all resources.

        Safe to call more than once and on a partially initialised instance.
        An error raised while disposing the decompiler still propagates, but
        only after the program context and the temporary project directory
        have been released.
        """
        ifc = getattr(self, "ifc", None)
        self.ifc = None
        try:
            if ifc is not None:
                ifc.dispose()
        finally:
            if self._pyghidra_ctx is not None:
                try:
                    self._pyghidra_ctx.__exit__(None, None, None)
                except Exception as exc:
                    # Best effort: the project directory is removed below anyway.
                    logging.debug(f"[Ghidra] error while closing program: {exc}")
                self._pyghidra_ctx = None
            if self._ghidra_project_dir is not None:
                shutil.rmtree(self._ghidra_project_dir, ignore_errors=True)
                self._ghidra_project_dir = None

    def is_func_start(self, addr: int) -> bool:
        """:return: True if *addr* (parser space) is the entry point of a function."""
        return self._get_ghidra_func(addr) is not None

    @property
    def func_addrs(self) -> Iterator[int]:
        """Yield the parser-space entry-point address of every known function.

        External functions are skipped and each address is yielded once. The
        function being yielded is stored in the single-entry cache so that an
        immediately following query on the same address needs no lookup.
        """
        emitted: set[int] = set()
        for func in self._ghidra_func_manager.getFunctions(True):  # type: ignore
            if func.isExternal():
                continue
            parser_addr = self._to_parser_addr(func.getEntryPoint().getOffset())
            if parser_addr in emitted:
                continue
            emitted.add(parser_addr)
            self._ghidra_cached_func = func
            yield parser_addr

    def func_mangled_name(self, addr: int) -> str:
        """Return the raw (mangled) name of the function at *addr*.

        Queries the symbol table for ``_Z``-prefixed (Itanium ABI) symbols
        first, then falls back to ``func.getName()``, rejecting partial
        demangles (names starting with ``~`` or ``operator``, or enclosed in
        ``<...>``). Returns ``FUN_<ADDR>`` (upper-case hex of *addr*) when no
        usable name is found.

        :param addr: function entry-point address in parser space.
        :return: mangled symbol name or ``FUN_<ADDR>``.
        """
        fallback = f"FUN_{addr:X}"
        func = self._get_ghidra_func(addr)
        if func is None:
            return fallback

        ghidra_addr = self._to_ghidra_address(addr)
        for sym in self._ghidra_symbol_table.getSymbols(ghidra_addr):  # type: ignore
            raw = sym.getName()
            if raw and raw.startswith("_Z"):
                return raw

        name = func.getName()
        if not name:
            return fallback
        looks_demangled = name.startswith(("~", "operator")) or (
            name.startswith("<") and name.endswith(">")
        )
        return fallback if looks_demangled else name

    def func_demangled_name(self, addr: int) -> str:
        """Return the demangled name of the function at *addr*.

        Uses ``getName()`` on the object returned by the demangler (bare
        function name without return type or parameter signature). Falls back
        to the mangled name when the demangler is unavailable, fails, or
        returns nothing.

        :param addr: function entry-point address in parser space.
        :return: demangled name, or mangled name if demangling is unavailable.
        """
        mangled = self.func_mangled_name(addr)
        if self._ghidra_demangler is None:
            return mangled
        try:
            result = self._ghidra_demangler.demangle(mangled, True)
        except Exception as exc:
            # Demangling is best effort: keep the mangled name.
            logging.debug(f"[Ghidra] cannot demangle {mangled!r}: {exc}")
            return mangled
        if result is not None:
            name = result.getName()
            if name:
                return name
        return mangled

    def func_children(self, addr: int) -> list[int]:
        """Return the targets of the calls made by the function at *addr*.

        Walks every code unit of the function body and collects the
        destination of each call-type reference, in first-seen order and
        without duplicates. Destinations are converted to parser space; they
        are not checked to be function entry points.

        :param addr: function entry-point address in parser space.
        :return: parser-space addresses of the call targets.
        """
        func = self._get_ghidra_func(addr)
        if func is None:
            return []
        listing = self._ghidra_program.getListing()  # type: ignore
        seen: set[int] = set()
        result: list[int] = []
        for code_unit in listing.getCodeUnits(func.getBody(), True):
            for ref in code_unit.getReferencesFrom():
                if not ref.getReferenceType().isCall():
                    continue
                target = self._to_parser_addr(ref.getToAddress().getOffset())
                if target not in seen:
                    seen.add(target)
                    result.append(target)
        return result

    def func_parents(self, addr: int) -> list[int]:
        """Return the entry points of the callers of the function at *addr*.

        External callers are ignored. Callers are de-duplicated by entry-point
        address (not by name), so distinct functions that happen to share a
        name are all reported.

        :param addr: function entry-point address in parser space.
        :return: parser-space entry-point addresses of the callers.
        """
        func = self._get_ghidra_func(addr)
        if func is None:
            return []
        seen: set[int] = set()
        result: list[int] = []
        for caller in func.getCallingFunctions(self._ghidra_monitor):
            if caller.isExternal():
                continue
            caller_addr = self._to_parser_addr(caller.getEntryPoint().getOffset())
            if caller_addr in seen:
                continue
            seen.add(caller_addr)
            result.append(caller_addr)
        return result

    def func_type(self, addr: int) -> FuncType:
        """:return: the FuncType of the function at *addr*.

        Unknown addresses are reported as ``NORMAL``, external functions as
        ``IMPORTED`` and thunks as ``THUNK``.

        NOTE(review): the interface contract asks that thunk stubs resolving
        to external/imported functions return ``FuncType.IMPORTED`` so the
        trampoline resolution forwards callers to the imported symbol name.
        This implementation reports every thunk as ``THUNK`` without looking
        at the thunked function — confirm this is intended.
        """
        func = self._get_ghidra_func(addr)
        if func is None:
            return FuncType.NORMAL
        if func.isExternal():
            return FuncType.IMPORTED
        if func.isThunk():
            return FuncType.THUNK
        return FuncType.NORMAL

    def func_decompiled(self, addr: int) -> str:
        """Return the C pseudocode of the function at *addr*.

        Requires the backend to have been created with ``decompilation=True``.

        :param addr: function entry-point address in parser space.
        :return: decompiled C code, or an empty string when the function is
            unknown, the decompilation did not complete, or an error occurred.
        """
        assert self.decompilation_activated, "backend opened without decompilation"
        func = self._get_ghidra_func(addr)
        if func is None:
            return ""
        # Report the real entry point in logs (it may differ by one from addr).
        entry_addr = self._to_parser_addr(func.getEntryPoint().getOffset())
        try:
            res = self.ifc.decompileFunction(func, self._timeout, self.monitor)
            if res is None or not res.decompileCompleted():
                return ""
            return str(res.getDecompiledFunction().getC())
        except Exception as exc:
            logging.debug(
                f"[Ghidra] skipping {entry_addr:#x} "
                f"({self.func_mangled_name(entry_addr)!r}): {exc}"
            )
            return ""

    # ------------------------------------------------------------------
    # Shared Ghidra primitives
    # ------------------------------------------------------------------

    def _to_ghidra_address(self, parser_addr: int):
        """Convert a parser-space address to a Ghidra ``Address`` object.

        The load base is added and the sum is wrapped to 64 bits, then
        re-expressed as a signed 64-bit value before being handed to
        ``getAddress`` (presumably because the Java API takes a signed
        ``long`` — confirm).

        :param parser_addr: address in parser space.
        :return: Ghidra ``Address`` object in the default address space.
        """
        abs_addr = (parser_addr + self._ghidra_load_base) & 0xFFFFFFFFFFFFFFFF
        if abs_addr >= 0x8000000000000000:
            abs_addr -= 0x10000000000000000
        address_space = self._ghidra_program.getAddressFactory().getDefaultAddressSpace()  # type: ignore
        return address_space.getAddress(abs_addr)

    def _to_parser_addr(self, ghidra_offset: int) -> int:
        """Convert an absolute Ghidra address offset to parser space.

        :param ghidra_offset: raw offset returned by ``getOffset()``.
        :return: address in parser space.
        """
        return ghidra_offset - self._ghidra_load_base

    def _get_ghidra_func(self, parser_addr: int):
        """Return the Ghidra ``Function`` at *parser_addr*, using a single-entry cache.

        Falls back to ``getFunctionContaining`` when ``getFunctionAt`` returns
        ``None``, handling the ARM THUMB ±1 offset case. Only accepts the
        fallback result when the entry point matches within ±1 byte.

        :param parser_addr: address in parser space.
        :return: Ghidra ``Function``, or ``None`` if not found.
        """
        cached = self._ghidra_cached_func
        if (
            cached is not None
            and self._to_parser_addr(cached.getEntryPoint().getOffset()) == parser_addr
        ):
            return cached

        ghidra_addr = self._to_ghidra_address(parser_addr)
        func = self._ghidra_func_manager.getFunctionAt(ghidra_addr)  # type: ignore
        if func is None:
            func = self._ghidra_func_manager.getFunctionContaining(ghidra_addr)  # type: ignore
            if func is not None:
                entry_parser_addr = self._to_parser_addr(func.getEntryPoint().getOffset())
                if abs(entry_parser_addr - parser_addr) > 1:
                    func = None

        if func is not None:
            self._ghidra_cached_func = func
        return func
# this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""IDA Pro backend implementation of Backend abstract interface."""

from __future__ import annotations

import logging
from collections.abc import Iterator
from pathlib import Path

from ida_domain.database import Database, IdaCommandOptions
from ida_domain.functions import FunctionFlags

from pyrrha_mapper.backend import Backend
from pyrrha_mapper.types import FuncType


class IDA(Backend):
    """IDA Pro backend."""

    def __init__(
        self,
        bin_path: Path,
        root_directory: Path | None,
        decompilation: bool = False,
        image_base: int = 0,
    ) -> None:
        """Open the IDA database of the binary with auto-analysis enabled.

        :param bin_path: path of the binary, joined to *root_directory* when
            the latter is given.
        :param root_directory: optional directory prepended to *bin_path*.
        :param decompilation: stored as ``decompilation_activated``; not
            otherwise used by this class.
        :param image_base: image base supplied by the caller; only stored.
        """
        self.decompilation_activated = decompilation
        self.image_base = image_base
        self._bin_path = bin_path
        self._ida_cached_func = None  # single-entry cache used by _get_ida_func
        target = bin_path if root_directory is None else root_directory / bin_path
        self._ida_db: Database = Database.open(
            str(target),
            args=IdaCommandOptions(auto_analysis=True, new_database=False),
        )

    def close(self) -> None:
        """Close the binary parser and release all resources (database is not saved)."""
        self._ida_db.close(save=False)

    def is_func_start(self, addr: int) -> bool:
        """:return: True if *addr* (parser space) is the entry point of a function.

        An address rejected by IDA as invalid is reported as "not a function
        start" instead of raising.
        """
        from ida_domain.base import InvalidEAError

        try:
            func = self._get_ida_func(addr)
        except InvalidEAError:
            return False
        return func is not None and func.start_ea == addr

    @property
    def func_addrs(self) -> Iterator[int]:
        """Yield the parser-space entry-point address of every known function.

        ``FUNC_TAIL`` chunks are excluded (see ``_ida_funcs``).
        """
        for func in self._ida_funcs:
            yield func.start_ea

    def func_mangled_name(self, addr: int) -> str:
        """Return the raw (mangled) name of the function at *addr*.

        Resolution order:

        1. Import table — preferred for genuine PLT stubs so the name matches
           LIEF's :attr:`~pyrrha_mapper.common.Binary.imported_symbol_names`.
        2. ``get_name`` on the ``func_t`` — covers normal and library functions.
        3. ``sub_<ADDR>`` (upper-case hex of *addr*) fallback when IDA could
           not recover any name.

        :param addr: function entry-point address.
        :return: mangled symbol name or ``sub_<ADDR>``.
        """
        func = self._get_ida_func(addr)
        import_info = self._ida_db.imports.get_import_at(addr)
        if import_info is not None and import_info.name is not None:
            return import_info.name
        if func is not None:
            name = self._ida_db.functions.get_name(func)
            if name:
                return name
        return f"sub_{addr:X}"

    def func_demangled_name(self, addr: int) -> str:
        """:return: the demangled name, falling back to the mangled name."""
        mangled = self.func_mangled_name(addr)
        demangled = self._ida_db.names.demangle_name(mangled)
        return mangled if demangled is None else demangled

    def func_children(self, addr: int) -> list[int]:
        """Return parser-space addresses of callees of the function at *addr*.

        When IDA's ``get_callees`` returns a ``FUNC_TAIL`` chunk, the chunk's
        own callee list is followed one level to obtain the real parent
        ``start_ea``. Unresolvable chunks (no callee, or a first callee that
        is the chunk itself) are dropped silently.

        :param addr: function entry-point address.
        :return: list of callee entry-point addresses.
        """
        func = self._get_ida_func(addr)
        if func is None:
            return []
        functions = self._ida_db.functions
        result: list[int] = []
        for callee in functions.get_callees(func):
            if FunctionFlags.TAIL not in functions.get_flags(callee):
                result.append(callee.start_ea)
                continue
            # Tail chunk: record the function it leads to instead of the chunk.
            parents = list(functions.get_callees(callee))
            if parents and parents[0].start_ea != callee.start_ea:
                result.append(parents[0].start_ea)
        return result

    def func_parents(self, addr: int) -> list[int]:
        """:return: parser-space addresses of callers of the function at *addr*."""
        func = self._get_ida_func(addr)
        if func is None:
            return []
        return [caller.start_ea for caller in self._ida_db.functions.get_callers(func)]

    def func_type(self, addr: int) -> FuncType:
        """:return: the FuncType of the function at *addr*.

        Classification order:

        1. No callees + present in import table → ``IMPORTED`` (bare PLT stub).
        2. Versioned symbol (``name@@VERSION``) → ``IMPORTED``.
        3. ``FUNC_THUNK`` + single callee whose name is a known import →
           ``IMPORTED`` (thunk wrapping an external symbol).
        4. ``FUNC_THUNK`` otherwise → ``THUNK``.
        5. ``FUNC_LIB`` → ``LIBRARY``.
        6. Default → ``NORMAL`` (also used when no function is found).
        """
        func = self._get_ida_func(addr)
        if func is None:
            return FuncType.NORMAL

        functions = self._ida_db.functions
        imports = self._ida_db.imports
        flags = functions.get_flags(func)
        callees = list(functions.get_callees(func))

        if not callees and imports.get_import_at(func.start_ea):
            return FuncType.IMPORTED
        # Exactly one "@@" separator, i.e. ``name@@VERSION``.
        if func.name.count("@@") == 1:
            return FuncType.IMPORTED
        if FunctionFlags.THUNK in flags:
            if len(callees) == 1 and imports.exists(functions.get_name(callees[0])):
                return FuncType.IMPORTED
            return FuncType.THUNK
        if FunctionFlags.LIB in flags:
            return FuncType.LIBRARY
        return FuncType.NORMAL

    def func_decompiled(self, addr: int) -> str:
        """Return the pseudocode of the function at *addr*.

        :param addr: function entry-point address.
        :return: pseudocode lines joined with newlines, or an empty string
            when the function is unknown or IDA raises ``RuntimeError`` while
            producing the pseudocode.
        """
        func = self._get_ida_func(addr)
        if func is None:
            return ""
        try:
            lines = self._ida_db.functions.get_pseudocode(func, remove_tags=True)
        except RuntimeError as exc:
            logging.debug(
                f"[IDA] skipping {func.start_ea:#x} "
                f"({self._ida_db.functions.get_name(func)!r}): {exc}"
            )
            return ""
        # One call handles one function: log it as such, at debug level.
        logging.debug(f"[IDA] decompiled {func.start_ea:#x} from {self._bin_path}")
        return "\n".join(lines)

    # ------------------------------------------------------------------
    # Internal IDA method
    # ------------------------------------------------------------------

    def _get_ida_func(self, addr: int):
        """Return the IDA ``func_t`` at *addr*, using a single-entry cache.

        :param addr: function entry-point address.
        :return: the IDA ``func_t`` object, or ``None`` if not found.
        """
        cached = self._ida_cached_func
        if cached is not None and addr == cached.start_ea:
            return cached
        return self._ida_db.functions.get_at(addr)

    @property
    def _ida_funcs(self) -> Iterator:
        """Yield every non-tail ``func_t`` in the IDA database.

        ``FUNC_TAIL`` entries are non-contiguous chunks that share the parent
        function's name and address space. Yielding them would produce
        duplicate or misleading entries in any downstream mapping, so they are
        filtered out here at the source. Each yielded function is stored in
        the single-entry cache used by ``_get_ida_func``.

        :return: iterator of ``func_t`` objects with ``FUNC_TAIL`` excluded.
        """
        functions = self._ida_db.functions
        for func in functions.get_all():
            if FunctionFlags.TAIL in functions.get_flags(func):
                continue
            self._ida_cached_func = func
            yield func
None: self.record_binary_in_db(binary, log_prefix) if binary.id is not None: self.node_ids[binary.id] = binary - self._record_custom_command(binary, log_prefix) progress.update(binaries_map, advance=1) else: @@ -360,7 +357,7 @@ def mapper_main( self.dry_run_mode = False self.progress = progress - self.exports_to_bins = self.make_export_to_binaries_map() + self.exports_to_bins = self._make_export_to_binaries_map() # Iterate again all binaries to create call edges (all numbat_id are created) cg_map = progress.add_task( @@ -375,8 +372,9 @@ def mapper_main( for f_symb, targets in self.unresolved_callgraph[binary.path].items(): if not binary.function_exists(f_symb.name): if targets: + addr_log = {hex(f_symb.addr) if f_symb.addr is not None else None} logging.error( - f"function {f_symb.name} ({hex(f_symb.addr) if f_symb.addr is not None else None}) not in binary: {binary.name}" + f"function {f_symb.name} ({addr_log}) not in binary: {binary.name}" ) continue @@ -417,20 +415,6 @@ def mapper_main( # return the filesystem object return self.fs - def _record_custom_command(self, binary: Binary, log_prefix: str = "") -> None: - """Add a custom command to call numbat-ui on the underlying Sourcetrail. - - :param binary: binary on which to apply the custom command - """ - if self.dry_run_mode: - return None - assert self.db_interface is not None - cmd = [NUMBAT_UI_BIN, str(binary.real_path) + ".srctrlprj"] - if binary.id is None: - logging.warning(f"{log_prefix}: cannot record command as binary has no id") - else: - self.db_interface.set_custom_command(binary.id, cmd, f"Open in {NUMBAT_UI_BIN}") - def _record_call_ref(self, src: Symbol, dst: Symbol, log_prefix: str = "") -> bool: """Add call reference between two symbols in DB. 
@@ -471,7 +455,7 @@ def _record_unindexed_call(self, src: Symbol, dst: str, log_prefix: str = "") -> return None self.db_interface.record_ref_call(src.id, tgt_id) - def make_export_to_binaries_map(self) -> dict[str, list[Binary]]: + def _make_export_to_binaries_map(self) -> dict[str, list[Binary]]: """Compute dict mapping: exported-funs -> binaries (exporting the function). Indeed multiple binaries can export the same symbol ! diff --git a/src/pyrrha_mapper/intercg/loader.py b/src/pyrrha_mapper/intercg/loader.py index 59fe817..d2b8935 100644 --- a/src/pyrrha_mapper/intercg/loader.py +++ b/src/pyrrha_mapper/intercg/loader.py @@ -17,24 +17,14 @@ import logging import re -from abc import abstractmethod -from collections.abc import Iterator -from enum import StrEnum from pathlib import Path from typing import NamedTuple +from pyrrha_mapper.backend import IDA, Backend, Ghidra from pyrrha_mapper.common import Binary, Symbol from pyrrha_mapper.exceptions import FsMapperError from pyrrha_mapper.fs import FileSystemImportsMapper - - -class FuncType(StrEnum): - """Represent the type of a function.""" - - IMPORTED = "imported" - LIBRARY = "library" - NORMAL = "normal" - THUNK = "thunk" +from pyrrha_mapper.types import FuncType class FuncData(NamedTuple): @@ -79,7 +69,7 @@ def _count_leading_underscores(name: str) -> int: ) -class BinaryParser: +class BinaryParser(Backend): """Abstract base class that parses a binary and extracts call-graph data. Subclasses implement the parser-specific methods (IDA, Ghidra, …). 
@@ -92,7 +82,9 @@ def __init__(self, root_directory: Path, file_path: Path) -> None: self._is_relocatable: bool = False self._binary = self._generate_lief_bin(root_directory, file_path) self._is_relocatable = self._binary.is_relocatable - self._initiate_bin_parser(root_directory, file_path, self._binary.image_base) + super().__init__( + file_path, root_directory, decompilation=False, image_base=self._binary.image_base + ) image_base = self._binary.image_base @@ -108,7 +100,7 @@ def __init__(self, root_directory: Path, file_path: Path) -> None: # Step 2 — add exported symbols not discovered by the parser. # Skipped for ET_REL: LIEF addresses are section-relative and incompatible # with the parser address space; Step 1 already matched exports by name. - parser_addrs: set[int] = set(self._iter_func_addr()) + parser_addrs: set[int] = set(self.func_addrs) call_graph: dict[Symbol, list[str]] = {} if not self._is_relocatable: @@ -117,8 +109,8 @@ def __init__(self, root_directory: Path, file_path: Path) -> None: continue canon = self._disambiguate_export(symbols) # ARM THUMB: parser may use address - 1 (THUMB bit cleared) - if self._is_func_start(parser_addr - 1): - if self._func_mangled_name(parser_addr - 1) in {s.name for s in symbols}: + if self.is_func_start(parser_addr - 1): + if self.func_mangled_name(parser_addr - 1) in {s.name for s in symbols}: continue logging.debug( f"{self.log_prefix}: export {canon.name} @ {parser_addr:#x} " @@ -173,23 +165,23 @@ def __init__(self, root_directory: Path, file_path: Path) -> None: if func_data.type == FuncType.THUNK and len(func_data.calls) == 1: if func_data.calls[0] not in program_data: - mangled_name = self._func_mangled_name(func_data.calls[0]) + mangled_name = self.func_mangled_name(func_data.calls[0]) if mangled_name == "": logging.warning("Nothing found ") continue func_symbol = Symbol( name=mangled_name, - demangled_name=self._func_demangled_name(func_data.calls[0]), + 
demangled_name=self.func_demangled_name(func_data.calls[0]), is_func=True, addr=func_data.calls[0], ) self._binary.add_function(func_symbol) func = FuncData( symbol=func_symbol, - type=self._func_type(func_data.calls[0]), - calls=self._func_children(func_data.calls[0]), - callers=self._func_parents(func_data.calls[0]), + type=self.func_type(func_data.calls[0]), + calls=self.func_children(func_data.calls[0]), + callers=self.func_parents(func_data.calls[0]), ) missed_data[func_data.calls[0]] = func callee_data = func @@ -257,10 +249,10 @@ def __init__(self, root_directory: Path, file_path: Path) -> None: sym: [trampoline_map.get(c, c) for c in callees] for sym, callees in call_graph.items() } - self._close_bin_parser() + self.close() # ------------------------------------------------------------------ - # Useful public properties + # Public properties # ------------------------------------------------------------------ @property @@ -273,51 +265,6 @@ def call_graph(self) -> dict[Symbol, list[str]]: """:return: mapping from each Symbol to its list of callee names.""" return self._call_graph - # ------------------------------------------------------------------ - # Abstract interface — implemented by each parser backend - # ------------------------------------------------------------------ - - @abstractmethod - def _initiate_bin_parser(self, root_directory: Path, file_path: Path, image_base: int = 0): - """Open the binary parser and run any required analysis.""" - - @abstractmethod - def _close_bin_parser(self): - """Close the binary parser and release all resources.""" - - @abstractmethod - def _is_func_start(self, addr: int) -> bool: - """:return: True if *addr* (parser space) is the entry point of a function.""" - - @abstractmethod - def _iter_func_addr(self) -> Iterator[int]: - """Yield the parser-space entry-point address of every known function.""" - - @abstractmethod - def _func_mangled_name(self, addr: int) -> str: - """:return: the raw (mangled) name of the 
function at *addr*""" - - @abstractmethod - def _func_demangled_name(self, addr: int) -> str: - """:return: the demangled name of the function at *addr*""" - - @abstractmethod - def _func_children(self, addr: int) -> list[int]: - """:return: entry-point addresses of callees of the function at *addr*.""" - - @abstractmethod - def _func_parents(self, addr: int) -> list[int]: - """:return: entry-point addresses of callers of the function at *addr*.""" - - @abstractmethod - def _func_type(self, addr: int) -> FuncType: - """:return: the FuncType of the function at *addr*. - - Thunk stubs that resolve to external/imported functions must return - ``FuncType.IMPORTED`` so the trampoline resolution in ``__init__`` - correctly forwards callers to the imported symbol name. - """ - # ------------------------------------------------------------------ # Concrete helpers # ------------------------------------------------------------------ @@ -335,10 +282,6 @@ def _generate_lief_bin(self, root_directory: Path, file_path: Path) -> Binary: raise FsMapperError(f"{self.log_prefix}: real_path not set (skip)") if not lief_binary.real_path.exists(): raise FsMapperError(f"{self.log_prefix}: executable not found (skip)") - - # Detect ET_REL (kernel modules, object files) via LIEF before the - # raw LIEF object is discarded. Stored on the instance so BinaryParser - # can skip address-based export matching for relocatable binaries. return lief_binary def _build_calls_list( @@ -382,12 +325,12 @@ def _combine_program_analysis_binary( imported_names: set[str] = set(self._binary.imported_symbol_names) program_data: dict[int, FuncData] = {} - for parser_addr in self._iter_func_addr(): + for parser_addr in self.func_addrs: if parser_addr in parser_exports or parser_addr + 1 in parser_exports: - # Exported function — adopt the LIEF symbol + # Exported function — adopt the LIEF symbol. 
symbols = parser_exports.get(parser_addr, parser_exports.get(parser_addr + 1, [])) func_symbol = self._disambiguate_export(symbols) - parser_name = self._func_demangled_name(parser_addr) + parser_name = self.func_demangled_name(parser_addr) if parser_name != func_symbol.demangled_name: logging.debug( f"{self.log_prefix}: rename {parser_name} → {func_symbol.demangled_name}" @@ -397,7 +340,7 @@ def _combine_program_analysis_binary( self._binary.replace_function(func_symbol, sym, True) else: # Internal function — create a new Symbol in parser space. - mangled_name = self._func_mangled_name(parser_addr) + mangled_name = self.func_mangled_name(parser_addr) # Skip LIEF-imported names except: (a) PLT thunks — must reach # Step 3 to build trampoline_map; (b) _Z-prefixed names — a # statically linked binary may contain a private copy of a symbol @@ -405,12 +348,12 @@ def _combine_program_analysis_binary( if ( mangled_name in imported_names and not mangled_name.startswith("_Z") - and self._func_type(parser_addr) != FuncType.THUNK + and self.func_type(parser_addr) != FuncType.THUNK ): continue func_symbol = Symbol( name=mangled_name, - demangled_name=self._func_demangled_name(parser_addr), + demangled_name=self.func_demangled_name(parser_addr), is_func=True, addr=parser_addr, ) @@ -418,9 +361,9 @@ def _combine_program_analysis_binary( program_data[parser_addr] = FuncData( symbol=func_symbol, - type=self._func_type(parser_addr), - calls=self._func_children(parser_addr), - callers=self._func_parents(parser_addr), + type=self.func_type(parser_addr), + calls=self.func_children(parser_addr), + callers=self.func_parents(parser_addr), ) return program_data @@ -451,300 +394,32 @@ def _disambiguate_export(self, symbols: list[Symbol]) -> Symbol: return chosen -# ====================================================================== -# IDA Pro backend -# ====================================================================== - +class IDABinaryParser(BinaryParser, IDA): + """Binary parser 
backed by IDA Pro.""" -class IDAParser(BinaryParser): - """BinaryParser implementation using IDA Pro as the analysis backend.""" + pass - def _initiate_bin_parser(self, root_directory: Path, file_path: Path, image_base: int = 0): - """Open the IDA database, running auto-analysis if needed.""" - from ida_domain.database import Database, IdaCommandOptions - self._ida_cached_func = None # single-entry cache used by _get_ida_func - self._ida_db = Database.open( - str(root_directory / file_path), - args=IdaCommandOptions(auto_analysis=True, new_database=False), - ) - - def _close_bin_parser(self): - """Close the IDA database without saving.""" - self._ida_db.close(save=False) - - def _get_ida_func(self, addr: int): - """:return: the IDA function at *addr*, using a single-entry cache.""" - if self._ida_cached_func is not None and addr == self._ida_cached_func.start_ea: - return self._ida_cached_func - return self._ida_db.functions.get_at(addr) - - def _is_func_start(self, addr: int) -> bool: - """:return: True if *addr* is the entry point of a known IDA function.""" - from ida_domain.base import InvalidEAError - - try: - if self._ida_cached_func is not None and addr == self._ida_cached_func.start_ea: - return True - func = self._ida_db.functions.get_at(addr) - return func is not None and func.start_ea == addr - except InvalidEAError: - return False - - def _iter_func_addr(self) -> Iterator[int]: - """Yield the entry-point address of every function known to IDA. - - IDA's function list includes ``FUNC_TAIL`` entries (flag ``0x400``) — - non-contiguous tail chunks that belong to a parent function defined - elsewhere. These are not callable entry points: they share their - mangled name with the parent, making ``add_function`` overwrite - ``internal_functions`` with the chunk address and breaking - ``function_exists`` lookups for the real function. They must be - skipped so only true function starts enter ``program_data``. 
- """ - from ida_domain.functions import FunctionFlags - - for func in self._ida_db.functions.get_all(): - flags = self._ida_db.functions.get_flags(func) - if FunctionFlags.TAIL in flags: - continue - self._ida_cached_func = func - yield func.start_ea - - def _get_import(self, addr: int) -> str | None: - res = self._ida_db.functions.get_at(addr) - if res: - return res.name - return None - - def _func_mangled_name(self, addr: int) -> str: - """:return: the raw name of the function at *addr*, or ``sub_``.""" - func = self._get_ida_func(addr) - import_info = self._ida_db.imports.get_import_at(addr) - if import_info is not None and import_info.name is not None: - return import_info.name - if func is not None: - name = self._ida_db.functions.get_name(func) - if name: - return name - return f"sub_{addr:X}" - - def _func_demangled_name(self, addr: int) -> str: - """:return: the demangled name, falling back to the mangled name.""" - mangled = self._func_mangled_name(addr) - demangled = self._ida_db.names.demangle_name(mangled) - return demangled if demangled is not None else mangled - - def _func_children(self, addr: int) -> list[int]: - """:return: parser-space addresses of callees of the function at *addr*. - - When IDA's ``get_callees`` returns a ``FUNC_TAIL`` chunk (a - non-contiguous piece of a parent function, flag ``0x400``), the chunk - address is not a valid callable entry point and must not enter - ``program_data``. Instead, follow the chunk's own callee list to - obtain the real parent function's ``start_ea`` and use that. This - ensures calls to tail-chunked functions are recorded against the true - entry point that ``_iter_func_addr`` emitted. 
- """ - from ida_domain.functions import FunctionFlags - - func = self._get_ida_func(addr) - if func is None: - return [] - result: list[int] = [] - for callee in self._ida_db.functions.get_callees(func): - flags = self._ida_db.functions.get_flags(callee) - if FunctionFlags.TAIL in flags: - # Resolve to the real parent entry point via the chunk's callees. - parents = list(self._ida_db.functions.get_callees(callee)) - if parents and parents[0].start_ea != callee.start_ea: - result.append(parents[0].start_ea) - # If the chunk calls itself (unresolvable), drop it silently. - else: - result.append(callee.start_ea) - return result - - def _func_parents(self, addr: int) -> list[int]: - """:return: parser-space addresses of callers of the function at *addr*.""" - func = self._get_ida_func(addr) - if func is None: - return [] - return [caller.start_ea for caller in self._ida_db.functions.get_callers(func)] - - def _func_type(self, addr: int) -> FuncType: - """:return: the FuncType of the function at *addr*. - - Thunks whose sole callee is an imported symbol are classified as - ``IMPORTED`` so the trampoline resolution correctly forwards callers. 
- """ - from ida_domain.functions import FunctionFlags - - func = self._get_ida_func(addr) - if func is None: - return FuncType.NORMAL - - flags = self._ida_db.functions.get_flags(func) - is_imported = False - - callees = list(self._ida_db.functions.get_callees(func)) - if len(callees) == 0: - if self._ida_db.imports.get_import_at(addr): - is_imported = True - - if is_imported or len(func.name.split("@@")) == 2: # symbols with a specific version: - return FuncType.IMPORTED - elif FunctionFlags.THUNK in flags: - callees = list(self._ida_db.functions.get_callees(func)) - if len(callees) == 1: - callee_name = self._ida_db.functions.get_name(callees[0]) - if self._ida_db.imports.exists(callee_name): - return FuncType.IMPORTED - return FuncType.THUNK - elif FunctionFlags.LIB in flags: - return FuncType.LIBRARY - return FuncType.NORMAL - - -# ====================================================================== -# Ghidra backend -# ====================================================================== - -# Analyzers required for call-graph extraction (function discovery, xrefs, -# thunk resolution, import/export tables, and name demangling). -# Everything else is explicitly disabled to minimise analysis time. 
-_GHIDRA_REQUIRED_ANALYZERS: frozenset[str] = frozenset( - [ - # --- Function discovery --- - "Disassemble Entry Points", - "Function Start Search", - "Function Start Search After Code", - "Non-Returning Functions - Discovered", - "Non-Returning Functions - Known", - # --- Call graph / cross-references --- - "Call Convention ID", - "Call-Fixup Installer", - "Subroutine References", - "Subroutine References - One Time", - # --- Thunk resolution --- - "Thunk Function", - # --- Format-specific import/export tables --- - # ELF - "ELF Scalar Operand References", - "External Entry References", - # PE - "PE Entry Point", - "Windows x86 PE Thunk Functions", - # Mach-O (no extra analyzer needed beyond the loader itself) - # --- Demangling --- - "Demangler GNU", - "Demangler Microsoft", - ] -) - - -class GhidraParser(BinaryParser): - """BinaryParser backed by Ghidra 12.0+ via PyGhidra.""" - - def _initiate_bin_parser(self, root_directory: Path, file_path: Path, image_base: int = 0): - """Start the JVM, open the binary with only the required analyzers, and initialise handles. +class GhidraBinaryParser(BinaryParser, Ghidra): + """Binary parser backed by Ghidra.""" - All Ghidra analyzers not listed in ``_GHIDRA_REQUIRED_ANALYZERS`` are - disabled before analysis runs, which significantly reduces analysis time - while preserving full function-discovery and call-graph accuracy. + def __init__(self, *args, **kwargs)-> None: + super().__init__(*args, **kwargs) + program = self._ghidra_program - Analyzer selection is done via ``program.getOptions("Analyzers")`` and - ``setBoolean``, which is the stable public API across all Ghidra versions - supported by PyGhidra. No internal ``AutoAnalysisManager`` import is - needed. 
- """ - import os - import tempfile - - import pyghidra # type: ignore - - # Initialise all attributes upfront so _close_bin_parser is always safe - self._pyghidra_ctx = None - self._ghidra_program = None - self._ghidra_project_dir: Path | None = None - self._ghidra_func_manager = None - self._ghidra_symbol_table = None - self._ghidra_demangler = None - self._ghidra_cached_func = None - self._ghidra_load_base: int = 0 - self._ghidra_monitor = None - self._ghidra_exported_parser_addrs: set[int] = set() - self._ghidra_is_relocatable: bool = False - self._ghidra_exported_names: dict[str, Symbol] = {} - - full_path = root_directory / file_path - self._ghidra_project_dir = Path(tempfile.mkdtemp(prefix=f"ghidra_{os.getpid()}_")) - - # Start the JVM once per worker process (no-op if already running) - if not pyghidra.started(): - from pyghidra.launcher import HeadlessPyGhidraLauncher # type: ignore - - launcher = HeadlessPyGhidraLauncher() - launcher.add_vmargs("-Xms512m", "-Xmx2g", "-XX:+UseG1GC") - launcher.start() - - # Ghidra imports must come after JVM start - from ghidra.app.util.demangler.gnu import GnuDemangler # type: ignore - from ghidra.util.task import ConsoleTaskMonitor # type: ignore - - self._ghidra_monitor = ConsoleTaskMonitor() - - # Open without running analysis yet so we can configure the analyzer set. - self._pyghidra_ctx = pyghidra.open_program( - str(full_path), - project_location=str(self._ghidra_project_dir), - project_name="p", - analyze=False, - ) - flat_api = self._pyghidra_ctx.__enter__() - program = flat_api.getCurrentProgram() - - # Disable every analyzer not in the required set via the stable - # "Analyzers" options block, then trigger analysis through the flat API. 
- analyzer_options = program.getOptions("Analyzers") - for option_name in analyzer_options.getOptionNames(): - enabled = option_name in _GHIDRA_REQUIRED_ANALYZERS - try: - analyzer_options.setBoolean(option_name, enabled) - except Exception: - # Some option names in the "Analyzers" block are not simple - # booleans (e.g. sub-option strings); skip them silently. - pass - if not enabled: - logging.debug(f"{self.log_prefix}: disabled Ghidra analyzer '{option_name}'") - - # Run analysis with the filtered analyzer set via the stable flat API. - flat_api.analyzeAll(program) - - self._ghidra_program = program - # Derive load base from the program itself, not from LIEF's image_base, - # so that _to_ghidra_address / _to_parser_addr are always consistent. - self._ghidra_load_base = program.getImageBase().getOffset() - # Build the exported-address set once so _func_type can check it cheaply. - self._ghidra_exported_parser_addrs = { + # Build the exported-address set once so _func_type can check cheaply. + self._ghidra_exported_parser_addrs: set[int] = { lief_addr - self._binary.image_base for lief_addr in self._binary.exported_funcs_by_addr } - self._ghidra_func_manager = program.getFunctionManager() - self._ghidra_symbol_table = program.getSymbolTable() - - # ET_REL (kernel modules, object files): Ghidra lays sections out in a - # fake address space starting at 0x10000; LIEF reports raw - # section-relative offsets. The two coordinate systems are - # incompatible, so address-based matching is impossible — we must match - # exported symbols by name instead. - # The "relocatable" flag is written by ElfProgramBuilder into the - # program's PROGRAM_INFO options block under the key used by - # RelocationTable.RELOCATABLE_PROP_NAME ("Relocatable"). We read it - # directly as a string to avoid importing the internal class. 
- self._ghidra_is_relocatable = bool( - program.getOptions(program.PROGRAM_INFO).getBoolean("Relocatable", False) + + # ET_REL (kernel modules, object files): Ghidra lays sections out at a + # fake base (0x10000); LIEF reports raw section-relative offsets. + # The two coordinate systems are incompatible — match by name instead. + self._ghidra_is_relocatable: bool = bool( + program.getOptions(program.PROGRAM_INFO).getBoolean("Relocatable", False) # type: ignore ) # Name → LIEF Symbol map, populated only for relocatable binaries. - self._ghidra_exported_names = ( + self._ghidra_exported_names: dict = ( { sym.name: sym for symbols in self._binary.exported_funcs_by_addr.values() @@ -754,111 +429,32 @@ def _initiate_bin_parser(self, root_directory: Path, file_path: Path, image_base else {} ) - demangler = GnuDemangler() - self._ghidra_demangler = demangler if demangler.canDemangle(program) else None - - def _close_bin_parser(self): - """Exit the PyGhidra context and delete the temporary project directory.""" - import shutil - - if self._pyghidra_ctx is not None: - try: - self._pyghidra_ctx.__exit__(None, None, None) - except Exception: - pass - if self._ghidra_project_dir is not None: - shutil.rmtree(self._ghidra_project_dir, ignore_errors=True) - - # ------------------------------------------------------------------ - # Internal helpers - # ------------------------------------------------------------------ - - def _to_ghidra_address(self, parser_addr: int): - """Convert a parser-space address to a Ghidra ``Address`` object. - - Adds ``_ghidra_load_base`` to restore the absolute Ghidra address, then - masks to a signed 64-bit integer to satisfy JPype's type requirements. 
- """ - abs_addr = (parser_addr + self._ghidra_load_base) & 0xFFFFFFFFFFFFFFFF - if abs_addr >= 0x8000000000000000: - abs_addr -= 0x10000000000000000 - return ( - self._ghidra_program.getAddressFactory().getDefaultAddressSpace().getAddress(abs_addr) - ) - - def _to_parser_addr(self, ghidra_offset: int) -> int: - """Convert an absolute Ghidra address offset to parser space.""" - return ghidra_offset - self._ghidra_load_base - - def _get_ghidra_func(self, parser_addr: int): - """:return: the Ghidra Function at *parser_addr*, with a single-entry cache. - - Falls back to ``getFunctionContaining`` when ``getFunctionAt`` returns - ``None``, which handles the ARM THUMB case where the parser address - may be offset by one from the real entry point stored by Ghidra. - """ - if ( - self._ghidra_cached_func is not None - and self._to_parser_addr(self._ghidra_cached_func.getEntryPoint().getOffset()) - == parser_addr - ): - return self._ghidra_cached_func - - ghidra_addr = self._to_ghidra_address(parser_addr) - func = self._ghidra_func_manager.getFunctionAt(ghidra_addr) - if func is None: - # getFunctionContaining handles mid-function addresses and the ARM - # THUMB ±1 offset; only accept the result when the entry point - # matches exactly (after rounding) to avoid false positives. 
- func = self._ghidra_func_manager.getFunctionContaining(ghidra_addr) - if func is not None: - entry_parser_addr = self._to_parser_addr(func.getEntryPoint().getOffset()) - if abs(entry_parser_addr - parser_addr) > 1: - func = None - - if func is not None: - self._ghidra_cached_func = func - return func - - # ------------------------------------------------------------------ - # BinaryParser interface - # ------------------------------------------------------------------ - - def _combine_program_analysis_binary( - self, - parser_exports: dict[int, list[Symbol]], - ) -> dict[int, FuncData]: + def _combine_program_analysis_binary(self, parser_exports: dict) -> dict: """Override for relocatable binaries (ET_REL, e.g. kernel modules). For ``ET_REL`` files Ghidra places sections in a fake address space - (default base ``0x10000``) while LIEF reports raw section-relative - offsets. Address-based matching is therefore impossible and exported - symbols are matched by name instead. - - For non-relocatable binaries the base-class implementation is used - unchanged. + while LIEF reports raw section-relative offsets. Address-based + matching is impossible — exported symbols are matched by name instead. + For non-relocatable binaries the base-class implementation is used. :param parser_exports: LIEF exports already remapped to parser space. :return: mapping from parser-space address to FuncData. """ - if not self._ghidra_is_relocatable: + # Only GhidraParser sets _ghidra_is_relocatable; GhidraLoader doesn't + # call BaseParser.__init__ so this method is never reached from there. 
+ if not getattr(self, "_ghidra_is_relocatable", False): return super()._combine_program_analysis_binary(parser_exports) imported_names: set[str] = set(self._binary.imported_symbol_names) program_data: dict[int, FuncData] = {} - for parser_addr in self._iter_func_addr(): - mangled_name = self._func_mangled_name(parser_addr) + for parser_addr in self.func_addrs: + mangled_name = self.func_mangled_name(parser_addr) if mangled_name in self._ghidra_exported_names: - # ET_REL: adopt name and demangled name from the LIEF export - # symbol, but use the Ghidra parser-space address so that the - # rest of BinaryParser sees a consistent address space. - # The LIEF address is a raw section-relative offset and must - # not be used as a key anywhere in the resolution logic. - # Use add_exported_symbol so the symbol stays in exported_functions - # (consistent with what load_binary registered via LIEF) and is - # evicted from internal_functions if it was registered there first. + # ET_REL: adopt name/demangled from the LIEF export symbol but + # use the Ghidra parser-space address so the rest of BaseParser + # sees a consistent address space. lief_sym = self._ghidra_exported_names[mangled_name] func_symbol = Symbol( name=lief_sym.name, @@ -866,23 +462,23 @@ def _combine_program_analysis_binary( is_func=True, addr=parser_addr, ) - parser_name = self._func_demangled_name(parser_addr) + parser_name = self.func_demangled_name(parser_addr) if parser_name != func_symbol.demangled_name: logging.debug( - f"{self.log_prefix}: rename {parser_name} → {func_symbol.demangled_name}" + f"{getattr(self, 'log_prefix', '')}: " + f"rename {parser_name} → {func_symbol.demangled_name}" ) self._binary.add_exported_symbol(func_symbol) else: - # Internal function — same guard as base class. 
if ( mangled_name in imported_names and not mangled_name.startswith("_Z") - and self._func_type(parser_addr) != FuncType.THUNK + and self.func_type(parser_addr) != FuncType.THUNK ): continue func_symbol = Symbol( name=mangled_name, - demangled_name=self._func_demangled_name(parser_addr), + demangled_name=self.func_demangled_name(parser_addr), is_func=True, addr=parser_addr, ) @@ -890,147 +486,9 @@ def _combine_program_analysis_binary( program_data[parser_addr] = FuncData( symbol=func_symbol, - type=self._func_type(parser_addr), - calls=self._func_children(parser_addr), - callers=self._func_parents(parser_addr), + type=self.func_type(parser_addr), + calls=self.func_children(parser_addr), + callers=self.func_parents(parser_addr), ) return program_data - - def _is_func_start(self, addr: int) -> bool: - """:return: True if *addr* (parser space) is a known Ghidra function entry.""" - return self._get_ghidra_func(addr) is not None - - def _iter_func_addr(self) -> Iterator[int]: - """Yield parser-space entry-point addresses of every non-external Ghidra function. - - ``getFunctions(True)`` skips functions that live in Ghidra's external - program space (imported stubs resolved to library addresses). Those are - handled separately by the LIEF import tracking in ``BinaryParser``. - """ - seen_addrs: set[int] = set() - for func in self._ghidra_func_manager.getFunctions(True): - # Skip external-space functions — they are not mapped in the binary. - if func.isExternal(): - continue - self._ghidra_cached_func = func - parser_addr = self._to_parser_addr(func.getEntryPoint().getOffset()) - if parser_addr in seen_addrs: - continue - seen_addrs.add(parser_addr) - yield parser_addr - - def _func_mangled_name(self, addr: int) -> str: - """:return: the raw mangled name of the function at *addr*, or ``sub_``. 
- - Queries the symbol table directly for symbols whose name starts with - ``_Z`` (Itanium ABI mangled prefix) at the given address, which gives - the raw mangled name before Ghidra's demangler has processed it. - Falls back to ``func.getName()`` for non-C++ functions, rejecting names - that look like partial demangles (operators, destructors, anonymous). - """ - func = self._get_ghidra_func(addr) - if func is None: - return f"FUN_{addr:X}" - - # Search all symbols at this address for a mangled (_Z...) name. - ghidra_addr = self._to_ghidra_address(addr) - for sym in self._ghidra_symbol_table.getSymbols(ghidra_addr): - raw = sym.getName() - if raw and raw.startswith("_Z"): - return raw - - # No mangled symbol found — use func.getName() but reject partial - # demangles: operators, destructors, and anonymous constructs. - name = func.getName() - if name and not ( - name.startswith("~") - or name.startswith("operator") - or (name.startswith("<") and name.endswith(">")) - ): - return name - - return f"FUN_{addr:X}" - - def _func_demangled_name(self, addr: int) -> str: - """:return: the demangled name, falling back to the mangled name.""" - mangled = self._func_mangled_name(addr) - if self._ghidra_demangler is not None: - try: - result = self._ghidra_demangler.demangle(mangled, True) - if result is not None: - # Use getName() to get the bare function name without return - # type or parameter signature, so it matches the short callee - # names used in the call graph (e.g. "basic_string" rather - # than "std::__cxx11::basic_string"). - name = result.getName() - if name: - return name - except Exception: - pass - return mangled - - def _func_children(self, addr: int) -> list[int]: - """:return: parser-space addresses of callees of the function at *addr*. - - Uses Ghidra's reference manager to collect raw CALL instruction targets - rather than ``getCalledFunctions()``. 
``getCalledFunctions()`` resolves - thunk chains and returns the *external* symbol directly, bypassing the - PLT stub that lives in the binary's address space. By reading raw call - references we obtain the actual branch targets — including PLT thunk - addresses — so the trampoline resolution in ``BinaryParser`` can - correctly classify them as ``IMPORTED`` and forward callers to the - imported symbol name. - """ - func = self._get_ghidra_func(addr) - if func is None: - return [] - - listing = self._ghidra_program.getListing() - seen: set[int] = set() - result: list[int] = [] - for cu in listing.getCodeUnits(func.getBody(), True): - for ref in cu.getReferencesFrom(): - if not ref.getReferenceType().isCall(): - continue - target_offset = ref.getToAddress().getOffset() - parser_addr = self._to_parser_addr(target_offset) - if parser_addr in seen: - continue - seen.add(parser_addr) - result.append(parser_addr) - return result - - def _func_parents(self, addr: int) -> list[int]: - """:return: parser-space addresses of callers of the function at *addr*.""" - func = self._get_ghidra_func(addr) - if func is None: - return [] - - seen: set[str] = set() - result: list[int] = [] - for caller in func.getCallingFunctions(self._ghidra_monitor): - if caller.isExternal(): - continue - name = caller.getName() - if name in seen: - continue - seen.add(name) - result.append(self._to_parser_addr(caller.getEntryPoint().getOffset())) - return result - - def _func_type(self, addr: int) -> FuncType: - """:return: the FuncType of the function at *addr* (parser space).""" - func = self._get_ghidra_func(addr) - if func is None: - return FuncType.NORMAL - - if func.isExternal(): - return FuncType.IMPORTED - - if func.isThunk(): - # Always THUNK — the trampoline resolution in Step 3 detects the - # external callee via callee_data.type and collapses the chain. 
- return FuncType.THUNK - - return FuncType.NORMAL From de8a8c5f2ed3aad1d8a69f2864ca07ff01b97498 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 5 May 2026 15:04:53 +0200 Subject: [PATCH 32/62] types: common types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- src/pyrrha_mapper/types.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/pyrrha_mapper/types.py b/src/pyrrha_mapper/types.py index f645fe6..86b043a 100644 --- a/src/pyrrha_mapper/types.py +++ b/src/pyrrha_mapper/types.py @@ -15,7 +15,7 @@ # limitations under the License. """Types shared in multiple mappers.""" -from enum import Enum, auto +from enum import Enum, StrEnum, auto class Backend(Enum): @@ -34,3 +34,12 @@ class ResolveDuplicateOption(Enum): IGNORE = 1 # doc: The mapper will let the conflict as unresolved. ARBITRARY = 2 # doc: The mapper will choose a default one. INTERACTIVE = 3 # doc: The user can interactively solve the conflict. 
+ + +class FuncType(StrEnum): + """Represent the type of a function.""" + + IMPORTED = "imported" + LIBRARY = "library" + NORMAL = "normal" + THUNK = "thunk" From fc1c4d801639da040bff1cc15a58319d021843eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 5 May 2026 15:32:16 +0200 Subject: [PATCH 33/62] backend: avoid import sre specific modules when not in use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- src/pyrrha_mapper/backend/ida.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/pyrrha_mapper/backend/ida.py b/src/pyrrha_mapper/backend/ida.py index cccf5e6..a1a4903 100644 --- a/src/pyrrha_mapper/backend/ida.py +++ b/src/pyrrha_mapper/backend/ida.py @@ -21,9 +21,6 @@ from collections.abc import Iterator from pathlib import Path -from ida_domain.database import Database, IdaCommandOptions -from ida_domain.functions import FunctionFlags - from pyrrha_mapper.backend import Backend from pyrrha_mapper.types import FuncType @@ -38,6 +35,7 @@ def __init__( decompilation: bool = False, image_base: int = 0, ) -> None: + from ida_domain.database import Database, IdaCommandOptions self.decompilation_activated = decompilation self.image_base = image_base self._bin_path = bin_path @@ -66,6 +64,7 @@ def is_func_start(self, addr: int) -> bool: @property def func_addrs(self) -> Iterator[int]: """Yield the parser-space entry-point address of every known function.""" + from ida_domain.functions import FunctionFlags for func in self._ida_db.functions.get_all(): if FunctionFlags.TAIL in self._ida_db.functions.get_flags(func): continue @@ -111,6 +110,7 @@ def func_children(self, addr: int) -> list[int]: :param addr: function entry-point address. :return: list of callee entry-point addresses. 
""" + from ida_domain.functions import FunctionFlags func = self._get_ida_func(addr) result: list[int] = [] for callee in self._ida_db.functions.get_callees(func) if func is not None else []: @@ -142,6 +142,7 @@ def func_type(self, addr: int) -> FuncType: 5. ``FUNC_LIB`` → ``LIBRARY``. 6. Default → ``NORMAL``. """ + from ida_domain.functions import FunctionFlags func = self._get_ida_func(addr) if func is None: return FuncType.NORMAL @@ -205,6 +206,7 @@ def _ida_funcs(self) -> Iterator: :return: iterator of ``func_t`` objects with ``FUNC_TAIL`` excluded. """ + from ida_domain.functions import FunctionFlags for func in self._ida_db.functions.get_all(): if FunctionFlags.TAIL in self._ida_db.functions.get_flags(func): continue From ca1b3824f1500eca860e2f649c62bd6921fd88fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 5 May 2026 15:34:48 +0200 Subject: [PATCH 34/62] decomp: full rework to integrate a class object, integration with backend and remove qbinary/quokka MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- src/pyrrha_mapper/__main__.py | 20 +- src/pyrrha_mapper/exedecomp/__init__.py | 4 +- src/pyrrha_mapper/exedecomp/binmapper.py | 362 ------------------- src/pyrrha_mapper/exedecomp/decomp_mapper.py | 341 +++++++++++++++++ src/pyrrha_mapper/exedecomp/decompile.py | 71 ---- 5 files changed, 355 insertions(+), 443 deletions(-) delete mode 100644 src/pyrrha_mapper/exedecomp/binmapper.py create mode 100644 src/pyrrha_mapper/exedecomp/decomp_mapper.py delete mode 100644 src/pyrrha_mapper/exedecomp/decompile.py diff --git a/src/pyrrha_mapper/__main__.py b/src/pyrrha_mapper/__main__.py index 1a2ab90..b9370cd 100644 --- a/src/pyrrha_mapper/__main__.py +++ b/src/pyrrha_mapper/__main__.py @@ -24,7 +24,7 @@ import coloredlogs # type: ignore # no typing used in this library from numbat import SourcetrailDB -from pyrrha_mapper import exedecomp, fs, intercg +from pyrrha_mapper 
import fs, intercg, exedecomp from pyrrha_mapper.common import FileSystem from pyrrha_mapper.types import Backend, ResolveDuplicateOption @@ -301,7 +301,7 @@ def fs_call_graph_mapper( @pyrrha.command( - "exe-decomp", + "decomp", cls=MapperCommand, short_help="Map an executable call graph with its decompiled code.", help=( @@ -321,18 +321,22 @@ def fs_exe_decompiled_mapper( executable: Path, ): """Map a single executable with decompiled code.""" - if db.name == "exe-decomp.srctrldb": + if db.name == "decomp.srctrldb": db = Path(str(executable) + ".srctrldb") setup_logs(debug, db) db_instance = setup_db(db) - if backend not in (Backend.IDA,): - click.echo(f"Backend {backend.name} not yet supported") - return 1 + match backend: + case Backend.IDA: + mapper = exedecomp.IdaDecompilMapper(db_instance, executable) + case Backend.GHIDRA: + mapper = exedecomp.GhidraDecompilMapper(db_instance, executable) + case _: + click.echo(f"Backend {backend.name} not yet supported") + return 1 - # todo: add backend changes - if exedecomp.map_binary(db_instance, executable): + if mapper.map(): logging.info("success.") else: logging.error("failure.") diff --git a/src/pyrrha_mapper/exedecomp/__init__.py b/src/pyrrha_mapper/exedecomp/__init__.py index 50d429b..9b9f617 100644 --- a/src/pyrrha_mapper/exedecomp/__init__.py +++ b/src/pyrrha_mapper/exedecomp/__init__.py @@ -15,6 +15,6 @@ # limitations under the License. 
"""Module for the decomp mapper.""" -from .binmapper import map_binary +from .decomp_mapper import GhidraDecompilMapper, IdaDecompilMapper -__all__ = ["map_binary"] \ No newline at end of file +__all__ = ["GhidraDecompilMapper", "IdaDecompilMapper"] diff --git a/src/pyrrha_mapper/exedecomp/binmapper.py b/src/pyrrha_mapper/exedecomp/binmapper.py deleted file mode 100644 index e61981c..0000000 --- a/src/pyrrha_mapper/exedecomp/binmapper.py +++ /dev/null @@ -1,362 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2023-2025 Quarkslab -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Decompilation code binary mapper.""" - -import logging -import json -from pathlib import Path -from collections import defaultdict -from dataclasses import dataclass -from typing import NamedTuple -from tempfile import NamedTemporaryFile -import hashlib - -# third-party imports -from qbinary import Program, Function, FunctionType -from qbinary.types import Disassembler, ExportFormat, DisassExportNotImplemented, ExportException - - -from numbat import SourcetrailDB -from idascript import IDA -from numbat import SourcetrailDB -from rich.progress import ( - BarColumn, - MofNCompleteColumn, - Progress, - TextColumn, - TimeElapsedColumn, -) - -# local imports -from pyrrha_mapper.exceptions import FsMapperError - -DECOMPILE_SCRIPT = Path(__file__).parent / "decompile.py" - - - -once_check = True - - - -class Location(NamedTuple): - start_line: int - start_col: int - end_line: int - end_col: int - - -@dataclass -class DecompiledFunction: - """Class used to represent a decompiled function.""" - - address: int - name: str # demangled (pp_print) - text: str - location: Location # location of the function name within text - references: dict[ - int, list[Location] - ] # callee_addr -> list(start_line, start_col, end_line, end_col) - numbat_id: int = -1 - - -def normalize_name(name: str) -> str: - """Transform function name.""" - return name.strip("_").strip(".") - - -def find_all_call_references(p:Program, f: Function, source: str, - log_prefix: str = "") -> tuple[Location, dict[int, list[Location]]]: - decl_loc = None - refs: dict[int, list[Location]] = defaultdict(list) # dict: call_addr -> list[Location] - #ppname = lambda name: name.strip("_").strip(".") - - # NOTE: we exclude by design calls that don't have a name, usually these are calls - # to unrecognized function e.g: loc_185CC - call_name_to_addr = {normalize_name(p[c].name): c for c in f.children if p[c].name} - call_addr_to_name = {c: normalize_name(p[c].name) for c in f.children if p[c].name} - - - for idx, 
line in enumerate(source.splitlines()): - # try to find function declaration - if decl_loc is None: - ppname = normalize_name(f.name) - col = line.find(normalize_name(f.name)) - if col != -1: - decl_loc = Location(idx + 1, col + 1, idx + 1, col + len(ppname)) - - # For a given line, this dict keeps the column (index) of all call matched - matches: dict[int, tuple[int, str]] = {} - - # iterate each calls and try to find them in the line - for cname, caddr in call_name_to_addr.items(): - if cname.endswith(")"): # to handle cases of func name with typing of parameter - name = cname.split("(")[0] - else: - name = cname - col = line.find(f"{name}(") - if col != -1: - matches[col] = (caddr, cname) - - # Iterate all matches in a sorted manner to avoid having overlap matches: - # e.g: If a function calls both lxstat() and xstat() for each line we search - # any occurence of this two functions. But if we have a line like: "int c = lxstats()" - # we will match both functions! Thus we sort them by the column index. In that case we - # keep lxstats(). 
- sorted_matches = sorted(list(matches.items()), key=lambda x: x[0]) - cursor = 0 - previous = (0, "") - while sorted_matches: - col, (caddr, cname) = sorted_matches.pop(0) - if col < cursor: # means the match is overlapping a previous match - if col + len(cname) == cursor and previous[1].endswith(cname): - logging.debug(f"{log_prefix}: skip match {cname}, end of the {previous[1]}") - else: - logging.warning( - f"{log_prefix}: skip match {cname} [col {col}] overlap with previous one " - f"{previous[1]} [col: {previous[0]}]" - ) - else: # its okay we add it - refs[caddr].append(Location(idx + 1, col + 1, idx + 1, col + len(cname))) - cursor = col + len(cname) - previous = (col, cname) - - if decl_loc is None: - logging.error(f"{log_prefix}: function declaration not found in source code") - - if not is_thunk_to_import(p, f): # it is normal no to find the call in thunks to imports - for ref in (x for x in call_addr_to_name if x not in refs): - logging.warning(f"{log_prefix}: call to {ref:#08x}: '{call_addr_to_name[ref]}' not found in source code") - - return decl_loc, refs - - -def decompile_program(program: Program) -> None: - """Generate a PROGRAM_NAME.decompiled file which contained the binary decompilee obtained with IDA. - - :param program: Program object of the file to decompiled - :return: path of the created decompiled file. 
- """ - bin_path: str = program.exec_path - assert bin_path, "program.exec_path is not set, can't decompile" - ida = IDA(bin_path, str(DECOMPILE_SCRIPT), [], timeout=600, exit_virtualenv=True) - ida.start() - ida.wait() - - -def load_decompiled(program: Program, progress: Progress, - log_prefix: str = "") -> dict[int, DecompiledFunction]: - decompile_file = Path(f"{program.exec_path}.decompiled") - - if decompile_file.exists(): - logging.info(f"{log_prefix}: load file: {decompile_file}") - data = {int(k): v for k, v in json.loads(decompile_file.read_text()).items()} - final_data: dict[int, DecompiledFunction] = {} - # Iterate the decompiled data to try make references inside - decomp_load = progress.add_task("[deep_pink2]Decompiled binary loading", total=len(data)) - for f_addr, source_text in data.items(): - f: Function = program.get(f_addr) - if f is None: - logging.warning(f"{log_prefix}: function at {f_addr:#08x} referenced " - "in decompiled code not found in exported program") - continue - - decl, refs = find_all_call_references(program, f, source_text, f"{log_prefix} {f.name}") - - assert decl is not None, f"function {f.name} declaration not found in source code" - - final_data[f_addr] = DecompiledFunction( - address=f_addr, name=f.name, text=source_text, location=decl, references=refs - ) - progress.update(decomp_load, advance=1) - - return final_data - else: - logging.info(f"{log_prefix}: extracting decompilation file {decompile_file} (with idascript)") - decompile_program(program) - if decompile_file.exists(): - return load_decompiled(program, progress, log_prefix) # call ourselves again - else: - raise FileNotFoundError("can't find decompilation file (idascript failed)") - - -def load_program(bin_path: Path, disass: Disassembler = Disassembler.IDA, format: ExportFormat = ExportFormat.QUOKKA) -> Program | None: - # First try to find pre-existing exported files if format is AUTO - try: - return Program.from_binary(bin_path, - export_format=format, - 
disassembler=disass, - timeout= 600, # TODO: Receive through command line ? - override=False, # if export exists use it - ) - except DisassExportNotImplemented as e: - logging.error(f"Disassembler {disass} does not support export format {format}: {e}") - except ExportException as e: - logging.error(f"Error while loading binary {bin_path}: {e}") - return None - - -def set_function_color(db: SourcetrailDB, p: Program, fun: Function, f_id: int) -> None: - # Change node color based on its type - if is_thunk_to_import(p, fun): - db.change_node_color(f_id, fill_color="#bee0af", border_color="#395f33") - elif fun.type == FunctionType.thunk: - db.change_node_color(f_id, fill_color="gray") - # elif fun.type == FunctionType.EXTERN: - # db.change_node_color(f_id, fill_color="magenta") - # elif fun.type == FunctionType.IMPORTED: - # db.change_node_color(f_id, fill_color="mediumvioletred") - else: - pass # Normal function let default color - - -def add_source_file( - db: SourcetrailDB, - mangled_name: str, - symbol_id: int, - info: DecompiledFunction, - log_prefix: str = "", -) -> bool: - """:return: True if successfully added source info.text as a source file in DB.""" - with NamedTemporaryFile(mode="wt", delete_on_close=True) as tmp: - tmp.write(info.text) - tmp.flush() # Ensure the file is written before we try to record it - # Record file - file_id = db.record_file(Path(tmp.name), name=mangled_name) - if file_id is None: - return False - db.record_file_language(file_id, "cpp") - tmp.close() - - # Add the function to the file - logging.debug(f"{log_prefix}: add function {mangled_name} to file {file_id}") - info.numbat_id = file_id - # record de symbol declaration - if info.location: - l1, col1, l2, col2 = info.location - db.record_symbol_location(symbol_id, file_id, l1, col1, l2, col2) - else: - logging.warning(f"{log_prefix}: declaration not found in source code") - - return True - - -def is_thunk_to_import(p: Program, f: Function) -> bool: - if f.type == 
FunctionType.thunk: - if len(f.children) == 1: - c = list(f.children)[0] - callee: Function = p[c] - if callee.type == FunctionType.imported: - return True - return False - else: - return False - - -def map_binary( - db: SourcetrailDB, - program_path: Path, - disass: Disassembler = Disassembler.IDA, - format: ExportFormat = ExportFormat.QUOKKA, -) -> bool: - # Load the Quokka file - with Progress( - TextColumn("[progress.description]{task.description}"), - BarColumn(), - MofNCompleteColumn(), - TimeElapsedColumn(), - ) as progress: - # Load the decompilation and quokka files - log_prefix = "[binary loading]" - try: - program = load_program(program_path, disass, format) - if program is None: - logging.error(f"{log_prefix} can't generate exported binary") - return False - except FileNotFoundError as e: - logging.error(f"{log_prefix}: Cannot found {program_path}: {e}") - return False - except FsMapperError as e: - logging.error(f"{log_prefix}: Error during Quokka export generation/loading: {e}") - return False - - # Try loading the decompiled file - try: - decompiled = load_decompiled(program, progress, log_prefix) - except FileNotFoundError as e: - logging.error(f"{log_prefix}: failed to obtain decompiled code: {e}") - return False - - # Compute MD5 hash for URL handler - p_hash = hashlib.md5(Path(program.exec_path).read_bytes()).hexdigest() - - # Index all the functions - f_mapping = {} # f_addr -> numbat_id - func_map = progress.add_task("[orange_red1]Functions analysis", total=len(program)) - for f_addr, f in program.items(): - log_prefix = f"[func analysis] {f.name} ({f.type})" - if f.type == FunctionType.imported: - logging.debug(f"{log_prefix}: extern function, skip") - progress.update(func_map, advance=1) - continue # do not add EXTERN functions - is_imp = is_thunk_to_import(program, f) - f_id = db.record_function(f.name, parent_id=None, is_indexed=not is_imp) - if f_id is None: - logging.error(f"{log_prefix}: error while recording function in db") - 
progress.update(func_map, advance=1) - continue - f_mapping[f_addr] = f_id - - # Change node color based on its type - set_function_color(db, program, f, f_id) - - # Add source code if any - if f_addr in decompiled and not is_imp: - info = decompiled[f_addr] - if not add_source_file(db, f.mangled_name, f_id, info): - logging.warning(f"{log_prefix}: failed to add decompiled code") - elif f_addr not in decompiled and not is_imp: - logging.warning(f"{log_prefix}: function not in decompiled dict") - else: - pass # do not add decompiled code for thunks to imports - - progress.update(func_map, advance=1) - - - # Index the call graph - cg_map = progress.add_task("[orange1]Call Graph Indexing", total=len(program)) - - for f_addr, f in program.items(): - log_prefix = f"[callgraph indexing] {f.name}" - decomp_fun = decompiled.get(f_addr, None) - - for callee in f.children: - try: - callee_id = f_mapping[callee] - db.record_ref_call(f_mapping[f_addr], callee_id) # record the call - - if decomp_fun: # if we have info about the decompiled function - if refs := decomp_fun.references.get(callee): # get the refs associated with callee - for li, coli, le, cole in refs: # iterate them and add them - db.record_reference_location(callee_id, decomp_fun.numbat_id, li, coli, le, cole) - else: - logging.warning(f"{log_prefix} calls {program[callee].name} " - "but not references in DecompiledFunction") - - except KeyError: - pass # ignore call to non recognized functions - - progress.update(cg_map, advance=1) - return True diff --git a/src/pyrrha_mapper/exedecomp/decomp_mapper.py b/src/pyrrha_mapper/exedecomp/decomp_mapper.py new file mode 100644 index 0000000..7d8f4f1 --- /dev/null +++ b/src/pyrrha_mapper/exedecomp/decomp_mapper.py @@ -0,0 +1,341 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023-2025 Quarkslab +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Decompilation code binary mapper.""" + +import logging +from dataclasses import dataclass, field +from pathlib import Path +from tempfile import NamedTemporaryFile +from typing import NamedTuple + +from numbat import SourcetrailDB +from rich.progress import ( + BarColumn, + MofNCompleteColumn, + Progress, + TextColumn, + TimeElapsedColumn, +) + +from pyrrha_mapper.backend import Backend, Ghidra, IDA +from pyrrha_mapper.common import Binary, Symbol +from pyrrha_mapper.types import FuncType + + +class Location(NamedTuple): + """Location inside a text of a word or more.""" + + start_line: int + start_col: int + end_line: int + end_col: int + + +@dataclass +class FuncData: + """Store function data collected by the binary parser. + + All addresses are in **parser space** (the native address space of the + underlying tool — IDA, Ghidra, etc.). 
+ """ + + symbol: Symbol + type: FuncType + calls: list[int] + callers: list[int] + source: str + source_id: int | None = None + declaration: Location | None = None + source_calls_loc: dict[int, list[Location]] = field(default_factory=dict) + + @property + def id(self) -> int | None: + """:return: the associated DB id if any""" + return self.symbol.id + + @id.setter + def id(self, val: int) -> None: + self.symbol.id = val + + @property + def name(self) -> str: + """:return: mangled name of the function""" + return self.symbol.name + + @property + def demangled_name(self) -> str: + """:return: demangled name of the function""" + return self.symbol.demangled_name + + @property + def addr(self) -> int: + """:return: address of the function in the Binary""" + assert self.symbol.addr is not None + return self.symbol.addr + + +def normalize_name(name: str) -> str: + """Transform function name.""" + return name.strip("_").strip(".") + + +class DecompilMapper(Backend): + """Map a single binary's decompiled source and call graph into a Sourcetrail DB.""" + + def __init__( + self, + db: SourcetrailDB, + bin_path: Path, + ) -> None: + self.db_interface = db + super().__init__(bin_path, None, decompilation=True) + self.bin = Binary(path=bin_path) + self.functions: dict[int, FuncData] = dict() + self.source_ids: dict[int, int] = dict() + + def record_function(self, func: FuncData, log_prefix) -> FuncData: + """Record a function into the DB (do not record the associated source). 
+ + :return: updated func data with id + """ + if func.type == FuncType.IMPORTED: + logging.debug(f"{log_prefix}: extern function, skip") + return func # do not add EXTERN functions + f_id = self.db_interface.record_function( + func.demangled_name, + prefix=hex(func.addr) if func.addr is not None else "None", + parent_id=self.bin.id, + ) + if f_id is None: + logging.error(f"{log_prefix}: error while recording function in db") + else: + func.id = f_id + return func + + def index_function(self, addr: int, log_prefix: str) -> None: + """Iterate over all the functions of the binary and extract useful data. + + Record function at the given address (addr) into DB and as member of + self.binary. + """ + func_type = self.func_type(addr) + func_data = FuncData( + symbol=Symbol( + name=self.func_mangled_name(addr), + demangled_name=self.func_demangled_name(addr), + is_func=True, + addr=addr, + ), + type=func_type, + calls=self.func_children(addr), + callers=self.func_parents(addr), + source=self.func_decompiled(addr) if func_type != FuncType.IMPORTED else "", + ) + self.bin.add_function(func_data.symbol) + self.functions[addr] = self.record_function(func_data, log_prefix) + + def record_source(self, func: FuncData, log_prefix: str) -> FuncData: + """Record decompiled version of each function. + + :param func: Func data object to treat + :param log_prefix: string prepended to every log message. 
+ :return: updated func data object + """ + with NamedTemporaryFile(mode="wt", delete_on_close=True) as tmp: + tmp.write(func.source) + tmp.flush() + func.source_id = self.db_interface.record_file(Path(tmp.name), name=func.name) + if func.source_id is None: + return func + self.db_interface.record_file_language(func.source_id, "cpp") + tmp.close() + + logging.debug(f"{log_prefix}: add function {func.name} to file {func.source_id}") + if func.id is not None and func.declaration is not None: + self.db_interface.record_symbol_location(func.id, func.source_id, *func.declaration) + else: + logging.warning(f"{log_prefix}: declaration not found in source code") + + return func + + def index_decompiled(self, addr, log_prefix) -> None: + """Locate the declaration and every call-site inside the source of function at address addr. + + Record the associated source. + :param addr: address of the function to treat + :param log_prefix: string prepended to every log message. + """ + func = self.functions[addr] + + # Build lookup tables for the callees of this function. + # normalize_name strips leading/trailing underscores and dots so that + # e.g. "__memcpy" and "memcpy" both match the same call-site token. + callee_name_to_addr: dict[str, int] = { + normalize_name(self.functions[callee_addr].name): callee_addr + for callee_addr in func.calls + if callee_addr in self.functions and self.functions[callee_addr].name + } + + func_name = normalize_name(func.name) + + for line_index, line_text in enumerate(func.source.splitlines()): + # Lines in Location are 1-based; line_index is 0-based. + line_number = line_index + 1 + + # Try to find the function declaration on this line. + if func.declaration is None: + decl_col = line_text.find(func_name) + if decl_col != -1: + func.declaration = Location( + line_number, + decl_col + 1, + line_number, + decl_col + len(func_name), + ) + + # Scan the line for each callee name, recording the start column of + # every hit. 
The dict key is the column so overlaps are detected + # in the sort pass below. + # key: start_col, value: (callee_addr, callee_name) + hits_by_col: dict[int, tuple[int, str]] = {} + + for callee_name, callee_addr in callee_name_to_addr.items(): + # If the stored name includes a type signature (e.g. "func(int)") + # strip to the bare identifier before searching. + search_token = ( + callee_name.split("(")[0] if callee_name.endswith(")") else callee_name + ) + hit_col = line_text.find(f"{search_token}(") + if hit_col != -1: + hits_by_col[hit_col] = (callee_addr, callee_name) + + # Process hits left-to-right so that a longer earlier match + # (e.g. "lxstat") shadows a shorter later substring (e.g. "xstat"). + # end_of_last_accepted tracks the column just past the last accepted + # match so that substring overlaps are detected. + end_of_last_accepted = 0 + last_accepted_col = 0 + last_accepted_name = "" + + for start_col, (callee_addr, callee_name) in sorted(hits_by_col.items()): + if start_col < end_of_last_accepted: + # This hit starts inside the span of the previous match. + if start_col + len( + callee_name + ) == end_of_last_accepted and last_accepted_name.endswith(callee_name): + # The hit is a suffix of the accepted match — expected, + # not a real overlap (e.g. "stat" at the end of "lxstat"). 
+                        logging.debug(
+                            f"{log_prefix}: skip '{callee_name}' — suffix of '{last_accepted_name}'"
+                        )
+                    else:
+                        logging.warning(
+                            f"{log_prefix}: skip '{callee_name}' [col {start_col}] — "
+                            f"overlaps '{last_accepted_name}' [col {last_accepted_col}]"
+                        )
+                else:
+                    func.source_calls_loc.setdefault(callee_addr, []).append(
+                        Location(
+                            line_number,
+                            start_col + 1,
+                            line_number,
+                            start_col + len(callee_name),
+                        )
+                    )
+                    end_of_last_accepted = start_col + len(callee_name)
+                    last_accepted_col = start_col
+                    last_accepted_name = callee_name
+
+        if func.declaration is None:
+            logging.error(f"{log_prefix}: function declaration not found in source code")
+
+        self.functions[addr] = self.record_source(func, log_prefix)
+
+    def index_call_graph(self, addr: int, log_prefix: str) -> None:
+        """Map the call graph of the function at address addr.
+
+        It also maps the associated references in the source, if any.
+        Record the call graph into the DB.
+        :param addr: address of the function to treat
+        :param log_prefix: string prepended to every log message.
+        """
+        func = self.functions[addr]
+        if func.id is None:
+            logging.warning(f"{log_prefix}: {func.name} is not a registered function, skip")
+            return
+        for child_addr in func.calls:
+            if child_addr not in self.functions:
+                logging.warning(
+                    f"{log_prefix}: Calls to {child_addr:0x} addr from {func.name} "
+                    + "does not match a registered function"
+                )
+                continue
+            child = self.functions[child_addr]
+            if child.id is None:
+                logging.warning(
+                    f"{log_prefix}: cannot record call to {child.name} from {func.name} "
+                    + "missing target id."
+                )
+                continue
+            self.db_interface.record_ref_call(func.id, child.id)
+
+            if func.source == "" or func.source_id is None:
+                continue
+            for location in func.source_calls_loc.get(child_addr, []):  # keyed by callee address
+                self.db_interface.record_reference_location(child.id, func.source_id, *location)
+
+    def map(self) -> None:
+        """Run the successive steps of the mapping."""
+        with Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            MofNCompleteColumn(),
+            TimeElapsedColumn(),
+        ) as progress:
+
+            func_addrs = list(self.func_addrs)
+            func_indexing = progress.add_task(
+                "[red]Function indexing", total=len(func_addrs)
+            )
+            for addr in func_addrs:
+                self.index_function(addr, f"[function indexing] {addr:0x}")
+                progress.update(func_indexing, advance=1)
+
+            decompilee_indexing = progress.add_task(
+                "[orange_red1]Source indexing", total=len(func_addrs)
+            )
+            for addr in self.functions.keys():
+                self.index_decompiled(addr, f"[source indexing] {self.functions[addr].name}")
+                progress.update(decompilee_indexing, advance=1)
+
+            cg_indexing = progress.add_task(
+                "[gold1]Call graph indexing", total=len(func_addrs)
+            )
+            for addr in self.functions.keys():
+                self.index_call_graph(addr, f"[call graph indexing] {self.functions[addr].name}")
+                progress.update(cg_indexing, advance=1)
+
+
+class IdaDecompilMapper(DecompilMapper, IDA):
+    """Decompile Mapper backed by IDA Pro."""
+
+    pass
+
+
+class GhidraDecompilMapper(DecompilMapper, Ghidra):
+    """Decompile Mapper backed by Ghidra."""
+
+    pass
\ No newline at end of file
diff --git a/src/pyrrha_mapper/exedecomp/decompile.py b/src/pyrrha_mapper/exedecomp/decompile.py
deleted file mode 100644
index 4fb8078..0000000
--- a/src/pyrrha_mapper/exedecomp/decompile.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/usr/bin/env python3
-import json
-from pathlib import Path
-import sys
-
-try:
-    import ida_auto
-    import idautils
-    import ida_nalt
-    import ida_pro
-    import ida_hexrays
-    INSIDE_IDA = True
-except
ImportError: - INSIDE_IDA = False - - from idascript import MultiIDA, iter_binary_files, IDA - - -def main_ida(): - ida_auto.auto_wait() - - input_file = ida_nalt.get_input_file_path() - output_file = input_file+".decompiled" - raw_file = input_file+".c" - - # First decompile the whole program - ida_hexrays.clear_cached_cfuncs() - ida_hexrays.decompile_many(raw_file, None, - ida_hexrays.VDRUN_NEWFILE | ida_hexrays.VDRUN_MAYSTOP | ida_hexrays.VDRUN_SILENT) - - funs = {} - - # Then reiterate all functions to get them individually - for fun_ea in idautils.Functions(): - decomp = ida_hexrays.decompile(fun_ea) - if decomp is not None: - funs[fun_ea] = str(decomp) - - with open(output_file, "w") as f: - f.write(json.dumps(funs)) - - ida_pro.qexit(0) - - -def file_iterator(path): - for file in iter_binary_files(path): - ida_i64 = Path(str(file)+".i64") - if ida_i64.exists(): - yield file - - -def main_main(): - """ - Main function called when launched normally - """ - if len(sys.argv) != 2: - print("Usage: decompile_program.py dir/") - sys.exit(1) - - root = sys.argv[1] - - # For each file identified launch many IDA in parrallel this very same script - for (file, retcode) in MultiIDA.map(file_iterator(root), __file__, [], 6): - print(f"Processed {file} [{retcode}]") - - -if __name__ == "__main__": - if INSIDE_IDA: - main_ida() - else: - main_main() From 0dab328878a28d562c5d7dbd1c71cef7c18fe525 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 5 May 2026 15:37:52 +0200 Subject: [PATCH 35/62] doc: adapt decomp documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- README.md | 2 +- docs/mappers/exe-decomp.md | 16 ++++++++-------- docs/mappers/mappers.md | 2 +- mkdocs.yml | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index c6a69f0..020f963 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ The installation is done in 
two parts: 1. Install `Pyrrha` itself. 1. Install [`NumbatUI`](https://github.com/quarkslab/NumbatUI) (or [`Sourcetrail`](https://github.com/CoatiSoftware/Sourcetrail)) to be able to visualize Pyrrha's results. -1. _(Optional)_ Install Ghidra or IDA if you want to use `fs-cg` or `exe-decomp` mappers. +1. _(Optional)_ Install Ghidra or IDA if you want to use `fs-cg` or `decomp` mappers. > [!NOTE] > A quick start installation is available on [Pyrrha documentation](https://quarkslab.github.io/pyrrha/#installation). diff --git a/docs/mappers/exe-decomp.md b/docs/mappers/exe-decomp.md index a27a96d..8add081 100644 --- a/docs/mappers/exe-decomp.md +++ b/docs/mappers/exe-decomp.md @@ -1,4 +1,4 @@ -# `exe-decomp`: Executable Decompilation mapper +# `decomp`: Executable Decompilation mapper ## Introduction @@ -13,16 +13,16 @@ the mapper will: ## Usage ```commandline -Usage: pyrrha exe-decomp [OPTIONS] EXECUTABLE +Usage: pyrrha decomp [OPTIONS] EXECUTABLE - Map a single executable call graph into a numbatui-compatible database.It also index the decompiled code - along with all call cross-references. + Map a single executable call graph into a NumbatUI-compatible database. Also indexes the decompiled code along with + all call cross-references. Options: - -d, --debug Set log level to DEBUG - --db PATH NumbatUI DB file path (.srctrldb). [default: pyrrha.srctrldb] - --disassembler DISASSEMBLER Disassembler to use for disassembly. [default: Disassembler.AUTO] - -h, --help Show this message and exit. + -d, --debug Set log level to DEBUG. + --db PATH NumbatUI DB file path (.srctrldb). [default: decomp.srctrldb] + -b, --backend [ida|ghidra] Backend to use. [default: Backend.IDA] + -h, --help Show this message and exit. Show this message and exit. ``` !!! 
note diff --git a/docs/mappers/mappers.md b/docs/mappers/mappers.md index 8b801bb..56170dd 100644 --- a/docs/mappers/mappers.md +++ b/docs/mappers/mappers.md @@ -5,4 +5,4 @@ Pyrrha provides the following mappers: - [`fs`](fs.md): a filesystem mapper. It maps ELF/PE files, their imports and their exports. Also map symlinks which target ELF files. - [`fs-cg`](fs-cg.md): a filesystem call graph mapper. It maps the whole firmware by interconnecting call graphs of all executables (requires disassembly). -- [`exe-decomp`](exe-decomp.md): Map an executable call graph along with its decompiled code. The mapper will use Sourcetrail source code indexing features to cross-reference calls within the source code. +- [`decomp`](exe-decomp.md): Map an executable call graph along with its decompiled code. The mapper will use Sourcetrail source code indexing features to cross-reference calls within the source code. diff --git a/mkdocs.yml b/mkdocs.yml index c461a2a..60dc132 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -44,7 +44,7 @@ nav: - mappers/mappers.md - Filesystem (Overview): mappers/fs.md - Filesystem (CallGraph): mappers/fs-cg.md - - Exe-Decomp: mappers/exe-decomp.md + - Decompilation Graph: mappers/exe-decomp.md - Contributing: - Mapper Development: contributing/dev_mapper.md - Changelog: changelog.md From 00b35a33bcfb0f64ca5fd7b21a418517aaa3bb5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 5 May 2026 15:38:34 +0200 Subject: [PATCH 36/62] [fix] doc: decomp documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- docs/mappers/exe-decomp.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/mappers/exe-decomp.md b/docs/mappers/exe-decomp.md index 8add081..a73d9a6 100644 --- a/docs/mappers/exe-decomp.md +++ b/docs/mappers/exe-decomp.md @@ -5,8 +5,7 @@ This mapper is not a firmware mapper but an executable mapper. 
It will map its call graph and its decompiled code with cross-references within the source code. In order the mapper will: -* Export the executable (Quokka) to extract its call graph -* Decompile all functions (with Hex-Rays) to dump the whole decompiled code +* Decompile all functions (with Hex-Rays or Ghidra) to dump the whole decompiled code * Index all functions with the associated decompilation * Apply cross-references between functions From fff6d0e20c6296f086fc28d91e51561faf91170f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 5 May 2026 15:39:15 +0200 Subject: [PATCH 37/62] setup: remove useless modules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- pyproject.toml | 2 -- 1 file changed, 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 60965da..5d8aeed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,8 +44,6 @@ dependencies = [ 'numbat>=0.2.6', 'pydantic', 'rich', - # InterCG mapper - "qbinary>=0.0.3", # will also install idascript "ida_domain", "pyghidra" ] From e7f6e1cd7e2d97448d2cf57191b124e3c492171f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 19 May 2026 15:23:52 +0200 Subject: [PATCH 38/62] fs: merge abstract fs mapper and fs import mapper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- src/pyrrha_mapper/common/__init__.py | 3 - src/pyrrha_mapper/common/filesystem_mapper.py | 347 ------------------ src/pyrrha_mapper/fs/imports_mapper.py | 323 +++++++++++++++- 3 files changed, 312 insertions(+), 361 deletions(-) delete mode 100755 src/pyrrha_mapper/common/filesystem_mapper.py diff --git a/src/pyrrha_mapper/common/__init__.py b/src/pyrrha_mapper/common/__init__.py index b4be036..db402be 100644 --- a/src/pyrrha_mapper/common/__init__.py +++ b/src/pyrrha_mapper/common/__init__.py @@ -15,14 +15,11 @@ # limitations under the 
License. """Common objects and functions that can be used for any mapper.""" -from .filesystem_mapper import FileSystemMapper, hide_progress from .objects import Binary, FileSystem, Symbol, Symlink __all__ = [ - "FileSystemMapper", "Binary", "FileSystem", - "hide_progress", "Symbol", "Symlink", ] diff --git a/src/pyrrha_mapper/common/filesystem_mapper.py b/src/pyrrha_mapper/common/filesystem_mapper.py deleted file mode 100755 index 7ad55ef..0000000 --- a/src/pyrrha_mapper/common/filesystem_mapper.py +++ /dev/null @@ -1,347 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2023-2025 Quarkslab -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Base classes for mapping binaries of a filesystem.""" - -import logging -from abc import ABC, abstractmethod -from contextlib import contextmanager -from pathlib import Path -from typing import overload - -from numbat import SourcetrailDB -from numbat.exceptions import DBException -from rich.progress import ( - BarColumn, - MofNCompleteColumn, - Progress, - TextColumn, - TimeElapsedColumn, -) - -from pyrrha_mapper.common.objects import Binary, FileSystem, Symbol, Symlink -from pyrrha_mapper.exceptions import PyrrhaError -from pyrrha_mapper.types import ResolveDuplicateOption - - -@contextmanager -def hide_progress(progress: Progress): - """Context Manager which temporally hide a `rich` progress bar. 
- - Code from https://github.com/Textualize/rich/issues/1535#issuecomment-1745297594 - """ - transient = progress.live.transient # save the old value - progress.live.transient = True - progress.stop() - progress.live.transient = transient # restore the old value - try: - yield - finally: - # make space for the progress to use so it doesn't overwrite any previous lines - print("\n" * (len(progress.tasks) - 2)) - progress.start() - - -class FileSystemMapper(ABC): - """Abstract class which is a base mapper to binaries of a filesystem. - - It maps a filesystem in the following order: - - binaries - - symlinks - - lib imports - - symbol_imports. - To change the behavior of these mapping you can reimplement the - map_* corresponding method. - - Init params - :param root_directory: directory containing the filesystem to map - :param db: interface to the DB - """ - - def __init__(self, root_directory: Path | str, db: SourcetrailDB | None): - self.root_directory = Path(root_directory).resolve().absolute() - self.db_interface = db - self.fs = FileSystem(root_dir=self.root_directory) - self._dry_run = not bool(db) - - @property - def dry_run_mode(self) -> bool: - """Returns whether a Sourcetrail DB as been provided or not. - - If not, only produce the FileSystem object that can also - be used independently. 
- """ - return self._dry_run - - @dry_run_mode.setter - def dry_run_mode(self, value: bool) -> None: - """If True does not record in db.""" - self._dry_run = value - - # ===================== Records in DB (NumbatUI DB) =============================== - - def record_import_in_db( - self, source_id: int | None, dest_id: int | None, log_prefix: str = "" - ) -> None: - """Record in DB the import of dest by source.""" - if self.dry_run_mode: - return None - assert self.db_interface is not None - if source_id is None or dest_id is None: - logging.error(f"{log_prefix}: Cannot record import, src and/or dest are unknown") - else: - self.db_interface.record_ref_import(source_id, dest_id) - - def record_binary_in_db(self, binary: Binary, log_prefix: str = "") -> Binary: - """Record the binary inside the DB as well as its internal symbols. - - Update 'bin_obj.id' with the id of the created object in DB and does the same - thing for its symbol. It will record symbols using their demangled names. - - :warning: do not record calls as well as any links between several binaries - - :param binary: the Binary object to map - :return: the updated object - """ - # If dry run do not store the binary in DB - if self.dry_run_mode: - return binary - - assert self.db_interface is not None - binary.id = self.db_interface.record_class( - binary.name, prefix=f"{binary.path.parent}/", delimiter=":" - ) - if binary.id is None: - logging.error(f"{log_prefix}: Record of binary failed.") - return binary - - recorded_symb: dict[str, int] = dict() - for symbol in set(binary.iter_exported_symbols()): - if symbol.demangled_name in recorded_symb: - logging.debug( - f"{log_prefix}: demangled name {symbol.demangled_name} already in db " - "common node for these symbols" - ) - symbol.id = recorded_symb[symbol.demangled_name] - # Also propagate the id to any other symbol registered under - # the same mangled name (e.g. secondary demangled-key entries). 
- for other in binary.exported_functions.values(): - if other.name == symbol.name and other.id is None: - other.id = symbol.id - continue - if symbol.is_func: - symbol.id = self.db_interface.record_method( - symbol.demangled_name, - parent_id=binary.id, - prefix=hex(symbol.addr) if symbol.addr is not None else "None", - ) - if symbol.id is not None: - self.db_interface.change_node_color( - symbol.id, fill_color="#bee0af", border_color="#395f33" - ) - else: - symbol.id = self.db_interface.record_field( - symbol.demangled_name, - parent_id=binary.id, - prefix=hex(symbol.addr) if symbol.addr is not None else "None", - ) - - if symbol.id is None: - logging.error(f"{log_prefix}: Record of symbol '{symbol.demangled_name}' failed.") - else: - try: - self.db_interface.record_public_access(symbol.id) - recorded_symb[symbol.demangled_name] = symbol.id - # Propagate id to all symbols sharing the same mangled name - # (covers secondary demangled-key registrations). - for other in binary.exported_functions.values(): - if other.name == symbol.name and other.id is None: - other.id = symbol.id - except DBException as e: - raise PyrrhaError( - f"{log_prefix}: Cannot register access to symbol {symbol.demangled_name}: " - f"{e}" - ) from e - - for symbol in set(binary.iter_not_exported_functions()): - # Skip if this demangled name was already recorded as an exported - # symbol — same demangled name means same DB node, and calling - # record_private_access on it would violate the UNIQUE constraint. 
- if symbol.demangled_name in recorded_symb: - logging.debug( - f"{log_prefix}: demangled name {symbol.demangled_name} already recorded " - "as exported, skipping internal registration" - ) - symbol.id = recorded_symb[symbol.demangled_name] - continue - symbol.id = self.db_interface.record_method( - symbol.demangled_name, - parent_id=binary.id, - prefix=hex(symbol.addr) if symbol.addr is not None else "None", - ) - if symbol.id is None: - logging.error(f"{log_prefix}: Record of symbol '{symbol.demangled_name}' failed.") - else: - try: - self.db_interface.record_private_access(symbol.id) - recorded_symb[symbol.demangled_name] = symbol.id - except DBException as e: - raise PyrrhaError( - f"{log_prefix}: Cannot register access to symbol" - f" {symbol.demangled_name}: {e}" - ) from e - - return binary - - def record_symlink_in_db(self, sym: Symlink, log_prefix: str = "") -> Symlink: - """Record into DB the symlink and its link to its target. - - Update 'sym.id' with the id of the created object. - :param sym: symlink object - :return: the updated object - """ - if self.dry_run_mode: - return sym - assert self.db_interface is not None - sym.id = self.db_interface.record_typedef_node( - sym.name, prefix=f"{sym.path.parent}/", delimiter=":" - ) - if sym.id is None: - logging.error(f"{log_prefix}: Record of symlink failed.") - else: - self.record_import_in_db(sym.id, sym.target.id) - return sym - - # =============================== Utils =============================== - - @overload - @staticmethod - def _select_fs_component( - strategy: ResolveDuplicateOption, - matching_objects: list[Binary], - log_prefix: str, - target_name: str, - cache: set[Binary] | None = None, - ) -> Binary | None: ... - - @overload - @staticmethod - def _select_fs_component( - strategy: ResolveDuplicateOption, - matching_objects: list[Symlink], - log_prefix: str, - target_name: str, - cache: set[Symlink] | None = None, - ) -> Symlink | None: ... 
- - @staticmethod - def _select_fs_component( - strategy: ResolveDuplicateOption, - matching_objects: list[Binary] | list[Symlink], - log_prefix: str, - target_name: str, - cache: set[Binary] | set[Symlink] | None = None, - ) -> Binary | Symlink | None: - """Choice of one element of a given list according to the strategy. - - Given a list of objects which match a target, select one or None among - the given list according the strategy given It also logs the choice made - (debug level). If requireds by the strategy, an interaction with the user could - be made. - :param strategy: the resolution strategy - :param matching_objects: a list of FileSystemComponents (NOT empty, not - check by the function) - :param log_prefix: Prefix used at the beginning of each log - :param target_name: Target name, used in logs (and user interaction) - :param resolve_cache: cache of previously selected choices for this target - :return: the selected FileSystemComponent | None if resolution strategy - is IGNORE - """ - if len(matching_objects) > 1 and strategy is ResolveDuplicateOption.IGNORE: - logging.debug( - f"{log_prefix}: several matches for {target_name} but strategy is " - f"{ResolveDuplicateOption.IGNORE.name} so nothing selected" - ) - return None - selected_index = None - selected_bin = None - if len(matching_objects) > 1 and strategy is ResolveDuplicateOption.INTERACTIVE: - for cache_entry in cache or {}: - if cache_entry in matching_objects: # reuse already selected entry - logging.debug( - f"{log_prefix}: manually selected entry to disambiguate {target_name}" - ) - selected_bin = cache_entry - - while ( - selected_bin is None - or selected_index is None - or selected_index < 0 - or selected_index >= len(matching_objects) - ): - print(f"{log_prefix}: several matches for {target_name}, select one\n") - for i in range(len(matching_objects)): - print(f"{i}: {matching_objects[i].path}") - try: - selected_index = int(input()) - except ValueError: - print("Enter a valid number") 
- else: # "arbitrary" option - selected_index = 0 - if selected_bin is None: - selected_bin = matching_objects[selected_index] - return selected_bin - - def commit(self) -> None: - """Commit changes in database.""" - if not self.dry_run_mode and self.db_interface is not None: - self.db_interface.commit() - - # ================================ Main function ================================== - - def map( - self, - threads: int, - resolution_strategy: ResolveDuplicateOption = ResolveDuplicateOption.IGNORE, - ) -> FileSystem: - """Wrap mapper_main with usefull elements for CLI rendering. - - :param threads: number of threads to use - :param resolution_strategy: the chosen option for duplicate import resolution - :return: The FileSystem object filled - """ - with Progress( - TextColumn("[progress.description]{task.description}"), - BarColumn(), - MofNCompleteColumn(), - TimeElapsedColumn(), - ) as progress: - return self.mapper_main(threads, progress, resolution_strategy) - - @abstractmethod - def mapper_main( - self, - threads: int, - progress: Progress, - resolution_strategy: ResolveDuplicateOption = ResolveDuplicateOption.IGNORE, - ) -> FileSystem: - """Main function of the mapper, return the result stored in a FileSytsem. 
- - :param threads: number of threads to use - :param progress: a progress bar ready to be filled - :param resolution_strategy: the chosen option for duplicate import resolution - :return: The FileSystem object filled - """ # noqa: D401 - pass diff --git a/src/pyrrha_mapper/fs/imports_mapper.py b/src/pyrrha_mapper/fs/imports_mapper.py index f013681..4478d11 100644 --- a/src/pyrrha_mapper/fs/imports_mapper.py +++ b/src/pyrrha_mapper/fs/imports_mapper.py @@ -19,27 +19,71 @@ import queue from abc import ABC from collections.abc import Callable +from contextlib import contextmanager from dataclasses import dataclass from functools import partial from multiprocessing import Queue, get_context from pathlib import Path -from typing import Any +from typing import Any, overload import lief from numbat import SourcetrailDB -from rich.progress import Progress - -from pyrrha_mapper.common import Binary, FileSystem, FileSystemMapper, Symbol, Symlink +from numbat.exceptions import DBException +from rich.progress import ( + BarColumn, + MofNCompleteColumn, + Progress, + TextColumn, + TimeElapsedColumn, +) + +from pyrrha_mapper.common import Binary, FileSystem, Symbol, Symlink +from pyrrha_mapper.exceptions import PyrrhaError from pyrrha_mapper.types import ResolveDuplicateOption lief.logging.disable() -class FileSystemImportsMapper(FileSystemMapper): - """Filesystem mapper based on Lief, which computes imports and exports.""" +@contextmanager +def hide_progress(progress: Progress): + """Context Manager which temporally hide a `rich` progress bar. 
+ + Code from https://github.com/Textualize/rich/issues/1535#issuecomment-1745297594 + """ + transient = progress.live.transient # save the old value + progress.live.transient = True + progress.stop() + progress.live.transient = transient # restore the old value + try: + yield + finally: + # make space for the progress to use so it doesn't overwrite any previous lines + print("\n" * (len(progress.tasks) - 2)) + progress.start() + + + +class FileSystemImportsMapper: + """Filesystem mapper based on Lief, which computes imports and exports. + + It maps a filesystem in the following order: + - binaries + - symlinks + - lib imports + - symbol_imports. + To change the behavior of these mapping you can reimplement the + map_* corresponding method. + + Init params + :param root_directory: directory containing the filesystem to map + :param db: interface to the DB + """ def __init__(self, root_directory: Path | str, db: SourcetrailDB | None): - super(FileSystemImportsMapper, self).__init__(root_directory, db) + self.root_directory = Path(root_directory).resolve().absolute() + self.db_interface = db + self.fs = FileSystem(root_dir=self.root_directory) + self._dry_run = not bool(db) if not self.dry_run_mode and self.db_interface is not None: # Setup graph customisation in NumbatUI @@ -57,6 +101,242 @@ def is_binary_supported(p: Path) -> bool: :return: True is the path point on a file """ return p.is_file() and not p.is_symlink() and (lief.is_elf(str(p)) or lief.is_pe(str(p))) + + @property + def dry_run_mode(self) -> bool: + """Returns whether a Sourcetrail DB as been provided or not. + + If not, only produce the FileSystem object that can also + be used independently. 
+ """ + return self._dry_run + + @dry_run_mode.setter + def dry_run_mode(self, value: bool) -> None: + """If True does not record in db.""" + self._dry_run = value + + # ===================== Records in DB (NumbatUI DB) =============================== + + def record_import_in_db( + self, source_id: int | None, dest_id: int | None, log_prefix: str = "" + ) -> None: + """Record in DB the import of dest by source.""" + if self.dry_run_mode: + return None + assert self.db_interface is not None + if source_id is None or dest_id is None: + logging.error(f"{log_prefix}: Cannot record import, src and/or dest are unknown") + else: + self.db_interface.record_ref_import(source_id, dest_id) + + def record_binary_in_db(self, binary: Binary, log_prefix: str = "") -> Binary: + """Record the binary inside the DB as well as its internal symbols. + + Update 'bin_obj.id' with the id of the created object in DB and does the same + thing for its symbol. It will record symbols using their demangled names. + + :warning: do not record calls as well as any links between several binaries + + :param binary: the Binary object to map + :return: the updated object + """ + # If dry run do not store the binary in DB + if self.dry_run_mode: + return binary + + assert self.db_interface is not None + binary.id = self.db_interface.record_class( + binary.name, prefix=f"{binary.path.parent}/", delimiter=":" + ) + if binary.id is None: + logging.error(f"{log_prefix}: Record of binary failed.") + return binary + + recorded_symb: dict[str, int] = dict() + for symbol in set(binary.iter_exported_symbols()): + if symbol.demangled_name in recorded_symb: + logging.debug( + f"{log_prefix}: demangled name {symbol.demangled_name} already in db " + "common node for these symbols" + ) + symbol.id = recorded_symb[symbol.demangled_name] + # Also propagate the id to any other symbol registered under + # the same mangled name (e.g. secondary demangled-key entries). 
+ for other in binary.exported_functions.values(): + if other.name == symbol.name and other.id is None: + other.id = symbol.id + continue + if symbol.is_func: + symbol.id = self.db_interface.record_method( + symbol.demangled_name, + parent_id=binary.id, + prefix=hex(symbol.addr) if symbol.addr is not None else "None", + ) + if symbol.id is not None: + self.db_interface.change_node_color( + symbol.id, fill_color="#bee0af", border_color="#395f33" + ) + else: + symbol.id = self.db_interface.record_field( + symbol.demangled_name, + parent_id=binary.id, + prefix=hex(symbol.addr) if symbol.addr is not None else "None", + ) + + if symbol.id is None: + logging.error(f"{log_prefix}: Record of symbol '{symbol.demangled_name}' failed.") + else: + try: + self.db_interface.record_public_access(symbol.id) + recorded_symb[symbol.demangled_name] = symbol.id + # Propagate id to all symbols sharing the same mangled name + # (covers secondary demangled-key registrations). + for other in binary.exported_functions.values(): + if other.name == symbol.name and other.id is None: + other.id = symbol.id + except DBException as e: + raise PyrrhaError( + f"{log_prefix}: Cannot register access to symbol {symbol.demangled_name}: " + f"{e}" + ) from e + + for symbol in set(binary.iter_not_exported_functions()): + # Skip if this demangled name was already recorded as an exported + # symbol — same demangled name means same DB node, and calling + # record_private_access on it would violate the UNIQUE constraint. 
+ if symbol.demangled_name in recorded_symb: + logging.debug( + f"{log_prefix}: demangled name {symbol.demangled_name} already recorded " + "as exported, skipping internal registration" + ) + symbol.id = recorded_symb[symbol.demangled_name] + continue + symbol.id = self.db_interface.record_method( + symbol.demangled_name, + parent_id=binary.id, + prefix=hex(symbol.addr) if symbol.addr is not None else "None", + ) + if symbol.id is None: + logging.error(f"{log_prefix}: Record of symbol '{symbol.demangled_name}' failed.") + else: + try: + self.db_interface.record_private_access(symbol.id) + recorded_symb[symbol.demangled_name] = symbol.id + except DBException as e: + raise PyrrhaError( + f"{log_prefix}: Cannot register access to symbol" + f" {symbol.demangled_name}: {e}" + ) from e + + return binary + + def record_symlink_in_db(self, sym: Symlink, log_prefix: str = "") -> Symlink: + """Record into DB the symlink and its link to its target. + + Update 'sym.id' with the id of the created object. + :param sym: symlink object + :return: the updated object + """ + if self.dry_run_mode: + return sym + assert self.db_interface is not None + sym.id = self.db_interface.record_typedef_node( + sym.name, prefix=f"{sym.path.parent}/", delimiter=":" + ) + if sym.id is None: + logging.error(f"{log_prefix}: Record of symlink failed.") + else: + self.record_import_in_db(sym.id, sym.target.id) + return sym + + + # =============================== Utils =============================== + + @overload + @staticmethod + def _select_fs_component( + strategy: ResolveDuplicateOption, + matching_objects: list[Binary], + log_prefix: str, + target_name: str, + cache: set[Binary] | None = None, + ) -> Binary | None: ... + + @overload + @staticmethod + def _select_fs_component( + strategy: ResolveDuplicateOption, + matching_objects: list[Symlink], + log_prefix: str, + target_name: str, + cache: set[Symlink] | None = None, + ) -> Symlink | None: ... 
+ + @staticmethod + def _select_fs_component( + strategy: ResolveDuplicateOption, + matching_objects: list[Binary] | list[Symlink], + log_prefix: str, + target_name: str, + cache: set[Binary] | set[Symlink] | None = None, + ) -> Binary | Symlink | None: + """Choice of one element of a given list according to the strategy. + + Given a list of objects which match a target, select one or None among + the given list according the strategy given It also logs the choice made + (debug level). If requireds by the strategy, an interaction with the user could + be made. + :param strategy: the resolution strategy + :param matching_objects: a list of FileSystemComponents (NOT empty, not + check by the function) + :param log_prefix: Prefix used at the beginning of each log + :param target_name: Target name, used in logs (and user interaction) + :param resolve_cache: cache of previously selected choices for this target + :return: the selected FileSystemComponent | None if resolution strategy + is IGNORE + """ + if len(matching_objects) > 1 and strategy is ResolveDuplicateOption.IGNORE: + logging.debug( + f"{log_prefix}: several matches for {target_name} but strategy is " + f"{ResolveDuplicateOption.IGNORE.name} so nothing selected" + ) + return None + selected_index = None + selected_bin = None + if len(matching_objects) > 1 and strategy is ResolveDuplicateOption.INTERACTIVE: + for cache_entry in cache or {}: + if cache_entry in matching_objects: # reuse already selected entry + logging.debug( + f"{log_prefix}: manually selected entry to disambiguate {target_name}" + ) + selected_bin = cache_entry + + while ( + selected_bin is None + or selected_index is None + or selected_index < 0 + or selected_index >= len(matching_objects) + ): + print(f"{log_prefix}: several matches for {target_name}, select one\n") + for i in range(len(matching_objects)): + print(f"{i}: {matching_objects[i].path}") + try: + selected_index = int(input()) + except ValueError: + print("Enter a valid number") 
+ else: # "arbitrary" option + selected_index = 0 + if selected_bin is None: + selected_bin = matching_objects[selected_index] + return selected_bin + + def commit(self) -> None: + """Commit changes in database.""" + if not self.dry_run_mode and self.db_interface is not None: + self.db_interface.commit() + + # =================== Binary parsing ============================== def load_binary_args(self) -> dict[str, Any]: """Return dict of args for load_binary that are always the same for the wholde firmware. @@ -219,6 +499,8 @@ def map_binary(self, bin_object: Binary, additional_res: Any = None) -> None: if not self.dry_run_mode: self.record_binary_in_db(bin_object, f"[binary mapping] {bin_object.name}") + #=============================== Symlinks ================================== + def map_symlink(self, path: Path) -> None: """Given a symlink, resolve it and create the associated objects if needed. @@ -263,6 +545,8 @@ def map_symlink(self, path: Path) -> None: else: logging.warning(f"{log_prefix}: '{target}' does not correspond to a recorded binary") + # =============================== Imports ================================== + @dataclass(frozen=True) class _LibImport(ABC): initial_import: Symlink | Binary | None @@ -326,9 +610,7 @@ def _resolve_lib_import( # The imported name matches the SONAME of a binary whose filename # differs (e.g. libpthread.so.0 is the SONAME of libpthread-2.11.1.so). 
matching_binaries = self.fs.get_binaries_by_soname(lib_name) - lib_obj = self._select_fs_component( - strategy, matching_binaries, log_prefix, lib_name - ) + lib_obj = self._select_fs_component(strategy, matching_binaries, log_prefix, lib_name) if lib_obj is None: return self._FailedLibImport() return self._SolvedLibImport(initial_import=lib_obj, final_import=lib_obj) @@ -361,7 +643,7 @@ def map_lib_imports( # resolution, the final target of the symlink is considered to be # imported and not the symlink itself self.record_import_in_db(binary.id, res.initial_import.id, log_prefix) - + if lib_name != res.final_import.name: # SONAME case: store the resolved binary under the # original import name (the SONAME) rather than the @@ -575,6 +857,25 @@ def map_symbol_imports_main( progress.update(symbol_imports, advance=1) self.commit() + def map( + self, + threads: int, + resolution_strategy: ResolveDuplicateOption = ResolveDuplicateOption.IGNORE, + ) -> FileSystem: + """Wrap mapper_main with usefull elements for CLI rendering. 
+ + :param threads: number of threads to use + :param resolution_strategy: the chosen option for duplicate import resolution + :return: The FileSystem object filled + """ + with Progress( + TextColumn("[progress.description]{task.description}"), + BarColumn(), + MofNCompleteColumn(), + TimeElapsedColumn(), + ) as progress: + return self.mapper_main(threads, progress, resolution_strategy) + def mapper_main( self, threads: int, From 026a0a3e1168f9290666909a00ef5e013e7abe4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 19 May 2026 15:54:26 +0200 Subject: [PATCH 39/62] all: reorganize repo in only two submodules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- src/pyrrha_mapper/__init__.py | 4 +-- src/pyrrha_mapper/__main__.py | 17 ++++++++----- src/pyrrha_mapper/common/__init__.py | 25 ------------------- src/pyrrha_mapper/fs/__init__.py | 20 --------------- src/pyrrha_mapper/intercg/__init__.py | 20 --------------- .../{exedecomp => mappers}/__init__.py | 16 +++++++++--- .../{exedecomp => mappers}/decomp_mapper.py | 4 +-- .../{fs => mappers}/imports_mapper.py | 2 +- .../intercg_bin_loader.py} | 3 +-- .../fwmapper.py => mappers/intercg_mapper.py} | 13 +++++++--- .../{common => mappers}/objects.py | 0 11 files changed, 39 insertions(+), 85 deletions(-) delete mode 100644 src/pyrrha_mapper/common/__init__.py delete mode 100644 src/pyrrha_mapper/fs/__init__.py delete mode 100644 src/pyrrha_mapper/intercg/__init__.py rename src/pyrrha_mapper/{exedecomp => mappers}/__init__.py (59%) rename src/pyrrha_mapper/{exedecomp => mappers}/decomp_mapper.py (99%) rename src/pyrrha_mapper/{fs => mappers}/imports_mapper.py (99%) rename src/pyrrha_mapper/{intercg/loader.py => mappers/intercg_bin_loader.py} (99%) rename src/pyrrha_mapper/{intercg/fwmapper.py => mappers/intercg_mapper.py} (99%) rename src/pyrrha_mapper/{common => mappers}/objects.py (100%) diff --git 
a/src/pyrrha_mapper/__init__.py b/src/pyrrha_mapper/__init__.py index d2092aa..075f0dc 100644 --- a/src/pyrrha_mapper/__init__.py +++ b/src/pyrrha_mapper/__init__.py @@ -16,8 +16,8 @@ """Pyrrha is a mapper collection for firmware analysis.""" -from pyrrha_mapper.common import Binary, FileSystem, FileSystemMapper, Symbol, Symlink +from pyrrha_mapper.mappers import Binary, FileSystem, Symbol, Symlink __version__ = "1.0.1" -__all__ = ["Binary", "FileSystem", "FileSystemMapper", "Symbol", "Symlink"] +__all__ = ["Binary", "FileSystem", "Symbol", "Symlink"] diff --git a/src/pyrrha_mapper/__main__.py b/src/pyrrha_mapper/__main__.py index b9370cd..76ef92b 100644 --- a/src/pyrrha_mapper/__main__.py +++ b/src/pyrrha_mapper/__main__.py @@ -24,8 +24,13 @@ import coloredlogs # type: ignore # no typing used in this library from numbat import SourcetrailDB -from pyrrha_mapper import fs, intercg, exedecomp -from pyrrha_mapper.common import FileSystem +from pyrrha_mapper.mappers import ( + FileSystem, + FileSystemImportsMapper, + GhidraDecompilMapper, + IdaDecompilMapper, + InterImageCGMapper, +) from pyrrha_mapper.types import Backend, ResolveDuplicateOption # ------------------------------------------------------------------------------- @@ -244,7 +249,7 @@ def fs_mapper( db_instance = setup_db(db) root_directory = root_directory.absolute() - filesystem = fs.FileSystemImportsMapper(root_directory, db_instance).map( + filesystem = FileSystemImportsMapper(root_directory, db_instance).map( jobs, resolve_duplicates ) @@ -290,7 +295,7 @@ def fs_call_graph_mapper( root_directory = root_directory.absolute() try: - intercg_mapper = intercg.InterImageCGMapper(root_directory, db_instance, backend) + intercg_mapper = InterImageCGMapper(root_directory, db_instance, backend) fs_object: FileSystem = intercg_mapper.map(jobs, resolve_duplicates) fs_object.write(db_instance.path.with_suffix(intercg_mapper.FS_EXT)) except RuntimeError: @@ -329,9 +334,9 @@ def fs_exe_decompiled_mapper( match 
backend: case Backend.IDA: - mapper = exedecomp.IdaDecompilMapper(db_instance, executable) + mapper = IdaDecompilMapper(db_instance, executable) case Backend.GHIDRA: - mapper = exedecomp.GhidraDecompilMapper(db_instance, executable) + mapper = GhidraDecompilMapper(db_instance, executable) case _: click.echo(f"Backend {backend.name} not yet supported") return 1 diff --git a/src/pyrrha_mapper/common/__init__.py b/src/pyrrha_mapper/common/__init__.py deleted file mode 100644 index db402be..0000000 --- a/src/pyrrha_mapper/common/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2023-2025 Quarkslab -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Common objects and functions that can be used for any mapper.""" - -from .objects import Binary, FileSystem, Symbol, Symlink - -__all__ = [ - "Binary", - "FileSystem", - "Symbol", - "Symlink", -] diff --git a/src/pyrrha_mapper/fs/__init__.py b/src/pyrrha_mapper/fs/__init__.py deleted file mode 100644 index c1cac54..0000000 --- a/src/pyrrha_mapper/fs/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2023-2025 Quarkslab -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Module for the FS mapper.""" - -from .imports_mapper import FileSystemImportsMapper - -__all__ = ["FileSystemImportsMapper"] diff --git a/src/pyrrha_mapper/intercg/__init__.py b/src/pyrrha_mapper/intercg/__init__.py deleted file mode 100644 index da2fca5..0000000 --- a/src/pyrrha_mapper/intercg/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2023-2025 Quarkslab -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Module for the intercg mapper.""" - -from .fwmapper import InterImageCGMapper - -__all__ = ["InterImageCGMapper"] \ No newline at end of file diff --git a/src/pyrrha_mapper/exedecomp/__init__.py b/src/pyrrha_mapper/mappers/__init__.py similarity index 59% rename from src/pyrrha_mapper/exedecomp/__init__.py rename to src/pyrrha_mapper/mappers/__init__.py index 9b9f617..4411c01 100644 --- a/src/pyrrha_mapper/exedecomp/__init__.py +++ b/src/pyrrha_mapper/mappers/__init__.py @@ -13,8 +13,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -"""Module for the decomp mapper.""" - +"""Module for the intercg mapper.""" from .decomp_mapper import GhidraDecompilMapper, IdaDecompilMapper +from .intercg_mapper import InterImageCGMapper +from .imports_mapper import FileSystemImportsMapper, hide_progress +from .objects import Binary, FileSystem, Symbol, Symlink -__all__ = ["GhidraDecompilMapper", "IdaDecompilMapper"] +__all__ = ["IdaDecompilMapper", + "GhidraDecompilMapper", + "InterImageCGMapper", + "FileSystemImportsMapper", + "hide_progress", + "Binary", + "FileSystem", + "Symbol", + "Symlink"] \ No newline at end of file diff --git a/src/pyrrha_mapper/exedecomp/decomp_mapper.py b/src/pyrrha_mapper/mappers/decomp_mapper.py similarity index 99% rename from src/pyrrha_mapper/exedecomp/decomp_mapper.py rename to src/pyrrha_mapper/mappers/decomp_mapper.py index 7d8f4f1..b8f8d60 100644 --- a/src/pyrrha_mapper/exedecomp/decomp_mapper.py +++ b/src/pyrrha_mapper/mappers/decomp_mapper.py @@ -30,8 +30,8 @@ TimeElapsedColumn, ) -from pyrrha_mapper.backend import Backend, Ghidra, IDA -from pyrrha_mapper.common import Binary, Symbol +from pyrrha_mapper.backend import IDA, Backend, Ghidra +from pyrrha_mapper.mappers import Binary, Symbol from pyrrha_mapper.types import FuncType diff --git a/src/pyrrha_mapper/fs/imports_mapper.py b/src/pyrrha_mapper/mappers/imports_mapper.py similarity index 99% rename from src/pyrrha_mapper/fs/imports_mapper.py rename to src/pyrrha_mapper/mappers/imports_mapper.py index 4478d11..89d58fd 100644 --- a/src/pyrrha_mapper/fs/imports_mapper.py +++ b/src/pyrrha_mapper/mappers/imports_mapper.py @@ -37,8 +37,8 @@ TimeElapsedColumn, ) -from pyrrha_mapper.common import Binary, FileSystem, Symbol, Symlink from pyrrha_mapper.exceptions import PyrrhaError +from pyrrha_mapper.mappers import Binary, FileSystem, Symbol, Symlink from pyrrha_mapper.types import ResolveDuplicateOption 
lief.logging.disable() diff --git a/src/pyrrha_mapper/intercg/loader.py b/src/pyrrha_mapper/mappers/intercg_bin_loader.py similarity index 99% rename from src/pyrrha_mapper/intercg/loader.py rename to src/pyrrha_mapper/mappers/intercg_bin_loader.py index d2b8935..26c57ae 100644 --- a/src/pyrrha_mapper/intercg/loader.py +++ b/src/pyrrha_mapper/mappers/intercg_bin_loader.py @@ -21,9 +21,8 @@ from typing import NamedTuple from pyrrha_mapper.backend import IDA, Backend, Ghidra -from pyrrha_mapper.common import Binary, Symbol from pyrrha_mapper.exceptions import FsMapperError -from pyrrha_mapper.fs import FileSystemImportsMapper +from pyrrha_mapper.mappers import Binary, FileSystemImportsMapper, Symbol from pyrrha_mapper.types import FuncType diff --git a/src/pyrrha_mapper/intercg/fwmapper.py b/src/pyrrha_mapper/mappers/intercg_mapper.py similarity index 99% rename from src/pyrrha_mapper/intercg/fwmapper.py rename to src/pyrrha_mapper/mappers/intercg_mapper.py index a68cf2b..b19c079 100644 --- a/src/pyrrha_mapper/intercg/fwmapper.py +++ b/src/pyrrha_mapper/mappers/intercg_mapper.py @@ -25,17 +25,22 @@ from numbat import SourcetrailDB from rich.progress import Progress +from pyrrha_mapper.exceptions import FsMapperError + # local imports -from pyrrha_mapper.common import ( +from pyrrha_mapper.mappers import ( Binary, FileSystem, + FileSystemImportsMapper, Symbol, Symlink, hide_progress, ) -from pyrrha_mapper.exceptions import FsMapperError -from pyrrha_mapper.fs import FileSystemImportsMapper -from pyrrha_mapper.intercg.loader import BinaryParser, GhidraBinaryParser, IDABinaryParser +from pyrrha_mapper.mappers.intercg_bin_loader import ( + BinaryParser, + GhidraBinaryParser, + IDABinaryParser, +) from pyrrha_mapper.types import Backend, ResolveDuplicateOption IGNORE_LIST: frozenset[str] = frozenset( diff --git a/src/pyrrha_mapper/common/objects.py b/src/pyrrha_mapper/mappers/objects.py similarity index 100% rename from src/pyrrha_mapper/common/objects.py rename to 
src/pyrrha_mapper/mappers/objects.py From c2d9e6207e774f567f46673ecc83a9a40ed9e934 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 19 May 2026 15:57:42 +0200 Subject: [PATCH 40/62] [fix] all: missing import MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- src/pyrrha_mapper/mappers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pyrrha_mapper/mappers/__init__.py b/src/pyrrha_mapper/mappers/__init__.py index 4411c01..39dba61 100644 --- a/src/pyrrha_mapper/mappers/__init__.py +++ b/src/pyrrha_mapper/mappers/__init__.py @@ -15,8 +15,8 @@ # limitations under the License. """Module for the intercg mapper.""" from .decomp_mapper import GhidraDecompilMapper, IdaDecompilMapper -from .intercg_mapper import InterImageCGMapper from .imports_mapper import FileSystemImportsMapper, hide_progress +from .intercg_mapper import InterImageCGMapper from .objects import Binary, FileSystem, Symbol, Symlink __all__ = ["IdaDecompilMapper", From d39ce7ceb9d75f7cc8a4212f1670c589ff9a5b45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 19 May 2026 16:05:01 +0200 Subject: [PATCH 41/62] [fix] all: circular imports MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- src/pyrrha_mapper/mappers/decomp_mapper.py | 3 ++- src/pyrrha_mapper/mappers/imports_mapper.py | 3 ++- .../mappers/intercg_bin_loader.py | 4 +++- src/pyrrha_mapper/mappers/intercg_mapper.py | 21 ++++++++++--------- 4 files changed, 18 insertions(+), 13 deletions(-) diff --git a/src/pyrrha_mapper/mappers/decomp_mapper.py b/src/pyrrha_mapper/mappers/decomp_mapper.py index b8f8d60..0638c2d 100644 --- a/src/pyrrha_mapper/mappers/decomp_mapper.py +++ b/src/pyrrha_mapper/mappers/decomp_mapper.py @@ -31,9 +31,10 @@ ) from pyrrha_mapper.backend import IDA, Backend, Ghidra -from pyrrha_mapper.mappers import Binary, 
Symbol from pyrrha_mapper.types import FuncType +from .objects import Binary, Symbol + class Location(NamedTuple): """Location inside a text of a word or more.""" diff --git a/src/pyrrha_mapper/mappers/imports_mapper.py b/src/pyrrha_mapper/mappers/imports_mapper.py index 89d58fd..8ccdfc9 100644 --- a/src/pyrrha_mapper/mappers/imports_mapper.py +++ b/src/pyrrha_mapper/mappers/imports_mapper.py @@ -38,9 +38,10 @@ ) from pyrrha_mapper.exceptions import PyrrhaError -from pyrrha_mapper.mappers import Binary, FileSystem, Symbol, Symlink from pyrrha_mapper.types import ResolveDuplicateOption +from .objects import Binary, FileSystem, Symbol, Symlink + lief.logging.disable() diff --git a/src/pyrrha_mapper/mappers/intercg_bin_loader.py b/src/pyrrha_mapper/mappers/intercg_bin_loader.py index 26c57ae..c9cb4db 100644 --- a/src/pyrrha_mapper/mappers/intercg_bin_loader.py +++ b/src/pyrrha_mapper/mappers/intercg_bin_loader.py @@ -22,9 +22,11 @@ from pyrrha_mapper.backend import IDA, Backend, Ghidra from pyrrha_mapper.exceptions import FsMapperError -from pyrrha_mapper.mappers import Binary, FileSystemImportsMapper, Symbol from pyrrha_mapper.types import FuncType +from .imports_mapper import FileSystemImportsMapper +from .objects import Binary, Symbol + class FuncData(NamedTuple): """Store function data collected by the binary parser. 
diff --git a/src/pyrrha_mapper/mappers/intercg_mapper.py b/src/pyrrha_mapper/mappers/intercg_mapper.py index b19c079..d4dd3b4 100644 --- a/src/pyrrha_mapper/mappers/intercg_mapper.py +++ b/src/pyrrha_mapper/mappers/intercg_mapper.py @@ -26,16 +26,6 @@ from rich.progress import Progress from pyrrha_mapper.exceptions import FsMapperError - -# local imports -from pyrrha_mapper.mappers import ( - Binary, - FileSystem, - FileSystemImportsMapper, - Symbol, - Symlink, - hide_progress, -) from pyrrha_mapper.mappers.intercg_bin_loader import ( BinaryParser, GhidraBinaryParser, @@ -43,6 +33,17 @@ ) from pyrrha_mapper.types import Backend, ResolveDuplicateOption +from .imports_mapper import ( + FileSystemImportsMapper, + hide_progress, +) +from .objects import ( + Binary, + FileSystem, + Symbol, + Symlink, +) + IGNORE_LIST: frozenset[str] = frozenset( [ # Linker-injected bookkeeping stubs From e955e78521c324cba00f45bda1f7eead7defe5cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 19 May 2026 16:16:55 +0200 Subject: [PATCH 42/62] [fix] tests: adapt to new module structure --- tests/test_cli.py | 4 ++-- tests/test_filesystem_objects.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 5c58429..23fb242 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -23,9 +23,9 @@ from click import Command from click.testing import CliRunner, Result +from pyrrha_mapper import FileSystem, Symbol from pyrrha_mapper.__main__ import pyrrha -from pyrrha_mapper.common import FileSystem, Symbol -from pyrrha_mapper.intercg.fwmapper import InterImageCGMapper +from pyrrha_mapper.mappers import InterImageCGMapper def check_click_result(res: Result) -> None: diff --git a/tests/test_filesystem_objects.py b/tests/test_filesystem_objects.py index 5df62dd..3658aab 100644 --- a/tests/test_filesystem_objects.py +++ b/tests/test_filesystem_objects.py @@ -22,8 +22,8 @@ import pytest -from pyrrha_mapper.common 
import Binary, FileSystem, Symbol, Symlink -from pyrrha_mapper.common.objects import TargetType +from pyrrha_mapper import Binary, FileSystem, Symbol, Symlink +from pyrrha_mapper.mappers.objects import TargetType class SerializedFS(NamedTuple): From 8d1195149b2c0a093f8c58813163d76392c1f48e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 19 May 2026 16:36:32 +0200 Subject: [PATCH 43/62] [fix] tests: backend possible values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- tests/conftest.py | 2 +- tests/test_cli.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index f21f422..20f6f7e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -30,7 +30,7 @@ def pytest_addoption(parser: pytest.Parser) -> None: "--backend", action="store", help="backend", - choices={x.name.lower() for x in Backend}, + choices={x.name.lower() for x in [Backend.IDA, Backend.GHIDRA]}, ) diff --git a/tests/test_cli.py b/tests/test_cli.py index 23fb242..527c777 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -30,7 +30,7 @@ def check_click_result(res: Result) -> None: """Raise Assertion error if issue.""" - assert res.exit_code == 0 + assert res.exit_code == 0, res.output assert not res.exception, res.exception for log in res.stderr.splitlines(): assert ( From 33743009df9e3d856160b47d2c2a668e88deabd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 19 May 2026 16:49:02 +0200 Subject: [PATCH 44/62] setup: update pyproject MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5d8aeed..acdb934 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ classifiers = [ dependencies = [ 'click>=8.2.0', 
'coloredlogs', - 'lief>=0.15.0', + 'lief>=0.17.0', 'numbat>=0.2.6', 'pydantic', 'rich', From d5ffccef70b1a0382a5d4ff2ef14689e50e340f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 19 May 2026 16:55:13 +0200 Subject: [PATCH 45/62] fs: change import of lief of place --- src/pyrrha_mapper/mappers/imports_mapper.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/pyrrha_mapper/mappers/imports_mapper.py b/src/pyrrha_mapper/mappers/imports_mapper.py index 8ccdfc9..5f89ba8 100644 --- a/src/pyrrha_mapper/mappers/imports_mapper.py +++ b/src/pyrrha_mapper/mappers/imports_mapper.py @@ -26,7 +26,6 @@ from pathlib import Path from typing import Any, overload -import lief from numbat import SourcetrailDB from numbat.exceptions import DBException from rich.progress import ( @@ -42,8 +41,6 @@ from .objects import Binary, FileSystem, Symbol, Symlink -lief.logging.disable() - @contextmanager def hide_progress(progress: Progress): @@ -63,7 +60,6 @@ def hide_progress(progress: Progress): progress.start() - class FileSystemImportsMapper: """Filesystem mapper based on Lief, which computes imports and exports. @@ -81,6 +77,10 @@ class FileSystemImportsMapper: """ def __init__(self, root_directory: Path | str, db: SourcetrailDB | None): + import lief + + lief.logging.disable() + self.root_directory = Path(root_directory).resolve().absolute() self.db_interface = db self.fs = FileSystem(root_dir=self.root_directory) @@ -101,8 +101,12 @@ def is_binary_supported(p: Path) -> bool: :param p: the path of the file to analyzed :return: True is the path point on a file """ + import lief + + lief.logging.disable() + return p.is_file() and not p.is_symlink() and (lief.is_elf(str(p)) or lief.is_pe(str(p))) - + @property def dry_run_mode(self) -> bool: """Returns whether a Sourcetrail DB as been provided or not. 
@@ -250,7 +254,6 @@ def record_symlink_in_db(self, sym: Symlink, log_prefix: str = "") -> Symlink: else: self.record_import_in_db(sym.id, sym.target.id) return sym - # =============================== Utils =============================== @@ -353,6 +356,9 @@ def load_binary(root_directory: Path, file_path: Path) -> tuple[Binary, Any] | s raise: FsMapperError if cannot load it :return: bin object and additionnal info if needed or a string in case of error """ + import lief + + lief.logging.disable() base = Path(root_directory.anchor) rel_path = base.joinpath(file_path.relative_to(root_directory)) @@ -500,7 +506,7 @@ def map_binary(self, bin_object: Binary, additional_res: Any = None) -> None: if not self.dry_run_mode: self.record_binary_in_db(bin_object, f"[binary mapping] {bin_object.name}") - #=============================== Symlinks ================================== + # =============================== Symlinks ================================== def map_symlink(self, path: Path) -> None: """Given a symlink, resolve it and create the associated objects if needed. From dbf69ac26803224a5a110a700bfb525ba0ecb35f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 19 May 2026 17:12:21 +0200 Subject: [PATCH 46/62] ci: change coverage paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- .gitlab-ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 95f2aff..728df32 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -127,7 +127,7 @@ test_data_structures: - coverage report image: python:latest variables: - TEST_COVERAGE_SOURCE: pyrrha_mapper.common.objects + TEST_COVERAGE_SOURCE: pyrrha_mapper.mappers.objects TEST_PATH: tests/test_filesystem_objects.py coverage: '/(?i)total.*? 
(100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/' artifacts: @@ -149,7 +149,7 @@ test_fs: variables: DB: fs MAPPER: fs - TEST_COVERAGE_SOURCE: pyrrha_mapper.common.filesystem_mapper,pyrrha_mapper.fs + TEST_COVERAGE_SOURCE: pyrrha_mapper.mappers.imports_mapper TEST_PATH: tests/test_cli.py::TestFSMapper PYTEST_ARTIFACTS_DIR: test_artifacts @@ -165,7 +165,7 @@ test_fs-cg: DB: ${BACKEND}_${VERSION} MAPPER: fs-cg MAPPER_OPTIONS: '--backend ${BACKEND}' - TEST_COVERAGE_SOURCE: pyrrha_mapper.common.filesystem_mapper,pyrrha_mapper.intercg + TEST_COVERAGE_SOURCE: pyrrha_mapper.mappers.intercg_bin_loader,pyrrha_mapper.mappers.intercg_mapper TEST_PATH: tests/test_cli.py::TestFsCgMapper TEST_SUP_OPTIONS: ${MAPPER_OPTIONS} HEXRAYS_LICENSE: "${IDA_LICENSE}" From 7b01650bf1df9e39cea8009ec5ecaab91bcd027b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Wed, 3 Jun 2026 17:14:41 +0200 Subject: [PATCH 47/62] ci: new ghidra dockerfile --- .gitlab-ci.yml | 31 +++++++++++++++++++++++- ci/ghidra/Dockerfile | 57 ++++++++++++++++++++++++++++++++------------ 2 files changed, 72 insertions(+), 16 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 728df32..08c8cdd 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -117,7 +117,36 @@ test_data_structures: rules: - if: '$CI_COMMIT_TAG =~ /^v\d+\.\d+\.\d+$/' - changes: *source_paths - before_script: + before_script: + - echo "=== Java diagnostics ===" + - java -version + - echo "JAVA_HOME=$JAVA_HOME" + - cat "${GHIDRA_INSTALL_DIR}/support/launch.properties" + - python3 -c " +import os, jpype, subprocess +print('JAVA_HOME:', os.environ.get('JAVA_HOME', '')) +print('JPype default JVM:', jpype.getDefaultJVMPath()) +# Simulate exactly what pyghidra does +launch_support = '${GHIDRA_INSTALL_DIR}/support/LaunchSupport.jar' +ghidra_dir = '${GHIDRA_INSTALL_DIR}' +cmd = f'java -cp \"{launch_support}\" LaunchSupport \"{ghidra_dir}\" -jdk_home -save' +print('Running:', cmd) +result = subprocess.run(cmd, shell=True, 
capture_output=True, text=True) +print('stdout:', result.stdout) +print('stderr:', result.stderr) +print('returncode:', result.returncode) +if result.returncode == 0: + java_home = result.stdout.strip() + print('LaunchSupport returned JAVA_HOME:', java_home) + os.environ['JAVA_HOME'] = java_home + print('JPype JVM path with that JAVA_HOME:', jpype.getDefaultJVMPath()) + try: + jpype.startJVM(None, convertStrings=True, ignoreUnrecognized=True) + print('JVM started OK, java version:', jpype.java.lang.System.getProperty('java.version')) + jpype.shutdownJVM() + except Exception as e: + print('JVM start FAILED:', e) +" - echo -e "\e[95m===== Install Pyrrha with test extension" - pip install '.[test]' script: diff --git a/ci/ghidra/Dockerfile b/ci/ghidra/Dockerfile index 1d45454..3e4e4c0 100644 --- a/ci/ghidra/Dockerfile +++ b/ci/ghidra/Dockerfile @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - # ======================== Ghidra Download and Extraction ============================== # Use a dedicated stage so that wget, unzip, and the zip archive itself are # never committed to the final image layer. @@ -45,10 +44,22 @@ RUN wget --no-verbose "${GHIDRA_URL}" -O /tmp/ghidra.zip && \ /opt/ghidra_${GHIDRA_VERSION}_PUBLIC/licenses # ======================== Ghidra Installation and Runtime Image ======================= -# eclipse-temurin:21-jdk-jammy is the current maintained replacement for the -# deprecated openjdk:21-jdk-slim. The full JDK is required as Ghidra's -# launcher validates the Java installation and rejects JRE-only setups. -FROM eclipse-temurin:21-jdk-jammy +# eclipse-temurin:21-jdk-noble provides JDK 21, required by Ghidra 12.0.4+. +# JPype1==1.5.2 (hard-pinned by pyghidra) is compatible with JDK 21 on Linux. +# +# Ubuntu 24.04 (Noble) is used instead of 22.04 (Jammy) because Noble ships +# openjdk-21-jdk in its default repos whereas Jammy does not. 
+# +# Problem: eclipse-temurin sets JAVA_HOME=/opt/java/openjdk. pyghidra's launcher +# runs LaunchSupport to discover the JDK home, then sets JAVA_HOME to that result +# before calling jpype.startJVM(None). LaunchSupport may return /opt/java/openjdk +# (the temurin path), which JPype cannot reliably use to locate libjvm.so. +# +# Fix: install the standard Ubuntu openjdk-21-jdk package (predictable path at +# /usr/lib/jvm/java-21-openjdk-amd64) and inject JAVA_HOME_OVERRIDE into Ghidra's +# support/launch.properties. pyghidra reads this file and uses the override path +# directly, bypassing LaunchSupport's JDK search entirely. +FROM eclipse-temurin:21-jdk-noble ARG GHIDRA_VERSION=12.0.4 @@ -57,17 +68,27 @@ ARG GHIDRA_VERSION=12.0.4 ENV GHIDRA_INSTALL_DIR=/opt/ghidra_${GHIDRA_VERSION}_PUBLIC ENV PATH=${GHIDRA_INSTALL_DIR}:${PATH} -# Copy only the extracted Ghidra tree from the download stage — no wget/unzip -# tooling or archive bytes are present in this layer. -COPY --from=ghidra-download /opt/ghidra_${GHIDRA_VERSION}_PUBLIC ${GHIDRA_INSTALL_DIR} +# User creation. +# eclipse-temurin:21-jdk-noble already has a user at UID 1000 ("ubuntu"), +# so we use UID 1001 to avoid conflicts. +# Created here (before COPY) so --chown can reference it directly, avoiding +# a separate chown -R layer that would double Ghidra's disk footprint. +RUN useradd --create-home -u 1001 -m user + +# Copy only the extracted Ghidra tree from the download stage, owned by user +# from the start — no separate chown layer needed. +COPY --chown=user:user --from=ghidra-download /opt/ghidra_${GHIDRA_VERSION}_PUBLIC ${GHIDRA_INSTALL_DIR} # Install the minimal runtime dependencies for Ghidra headless + pyghidra and # for building/installing the pyrrha-mapper Python package. -# Python 3.11 is installed via the deadsnakes PPA as Ubuntu 22.04 (Jammy) -# ships 3.10 by default. +# - openjdk-21-jdk: provides a standard, well-known JDK path that both +# LaunchSupport and JPype handle correctly. 
Available in Ubuntu 24.04 (Noble). +# - Python 3.11 is installed via the deadsnakes PPA as Ubuntu 24.04 (Noble) +# ships 3.12 by default. # All apt artefacts are removed in the same RUN layer to keep layer size down. RUN apt-get update && \ apt-get install --yes --no-install-recommends \ + openjdk-21-jdk \ software-properties-common \ && add-apt-repository ppa:deadsnakes/ppa && \ apt-get update && \ @@ -80,14 +101,21 @@ RUN apt-get update && \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* +# Override JAVA_HOME to the Ubuntu OpenJDK path so that both LaunchSupport and +# JPype use the same predictable JDK installation, overriding the temurin default +# of /opt/java/openjdk which JPype cannot reliably resolve libjvm.so from. +ENV JAVA_HOME=/usr/lib/jvm/java-21-openjdk-amd64 + +# Inject JAVA_HOME_OVERRIDE into Ghidra's launch.properties so pyghidra reads +# it directly and skips LaunchSupport's JDK search entirely. This is the +# supported mechanism (see pyghidra/launcher.py _jvm_args()). +RUN echo "JAVA_HOME_OVERRIDE=/usr/lib/jvm/java-21-openjdk-amd64" \ + >> "${GHIDRA_INSTALL_DIR}/support/launch.properties" + # Make python3.11 the default python3 and python. 
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \ update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 -# User creation -RUN useradd --create-home -u 1000 -m user && \ - chown -R user:user "${GHIDRA_INSTALL_DIR}" - USER user WORKDIR /home/user @@ -98,7 +126,6 @@ ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" RUN python3.11 -m venv "${VIRTUAL_ENV}" && \ pip install --no-cache-dir --upgrade pip && \ - pip install --no-cache-dir pyrrha-mapper && \ echo "source ${VIRTUAL_ENV}/bin/activate" >> /home/user/.bashrc CMD ["/bin/bash"] \ No newline at end of file From 2ef775f2c3c03ad783ad721a8b2fd7102cca2f2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Wed, 3 Jun 2026 17:14:56 +0200 Subject: [PATCH 48/62] ci: update --- .gitlab-ci.yml | 114 ++++++++++++++++++++++++++++++------------------- 1 file changed, 69 insertions(+), 45 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 08c8cdd..dfd0866 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,6 +1,6 @@ stages: - - test - build + - test - notify # Paths that should trigger test and build pipelines. 
@@ -9,6 +9,8 @@ stages: - tests/**/* - src/**/* - pyproject.toml + - .gitlab-ci.yml + - ci/ghidra/* #======================== BUILD DOCKER IMAGE AND PUSH TO REGISTRY ====================== build_image: @@ -30,8 +32,15 @@ build_image: VERSION: [91, 93] LATEST: 93 - BACKEND: "ghidra" - VERSION: ["12.0.4"] - LATEST: "12.0.4" + LATEST: "12.1" + VERSION: "12.0.4" + GHIDRA_SHA256: "c3b458661d69e26e203d739c0c82d143cc8a4a29d9e571f099c2cf4bda62a120" + DATE: "20260303" + - BACKEND: "ghidra" + LATEST: "12.1" + VERSION: "12.1" + GHIDRA_SHA256: "aa5cbcbbf48f41ca185fce900e19592f1ade4cd5994eb6e0ede468dac8a6f302" + DATE: "20260513" variables: DOCKER_IMAGE_NAME: $CI_REGISTRY_IMAGE/pyrrha-$BACKEND DOCKER_HOST: unix:///var/run/docker.sock @@ -50,6 +59,7 @@ build_image: sleep 2 done - echo "$CI_REGISTRY_PASSWORD" | docker login --username "$CI_REGISTRY_USER" --password-stdin "$CI_REGISTRY" + - apk add --no-cache bash script: # Resolve the pyrrha version component from the ref that triggered the pipeline. # - tag v1.2.3 -> "1.2.3" @@ -71,14 +81,24 @@ build_image: fi echo "Resolved PYRRHA_VERSION=$PYRRHA_VERSION (REF_KIND=$REF_KIND)" + - | + if [ "$BACKEND" = "ghidra" ]; then + ci/ghidra/build.sh --version "$VERSION" --date "$DATE" --sha256 "$GHIDRA_SHA256" --name "$CI_REGISTRY_IMAGE/ci/$BACKEND" + fi + - | + if [ "$BACKEND" = "ghidra" ]; then + docker push "$CI_REGISTRY_IMAGE/ci/$BACKEND:$VERSION" + fi + # Primary image tag: - - PRIMARY_TAG="${VERSION}-${PYRRHA_VERSION}" - echo "Building $DOCKER_IMAGE_NAME:$PRIMARY_TAG" - - docker build --pull - -t "$DOCKER_IMAGE_NAME:$PRIMARY_TAG" - -f ci/pyrrha/Dockerfile - --build-arg DISASS_IMAGE=$CI_REGISTRY_IMAGE/ci/${BACKEND} - --build-arg DISASS_IMAGE_VERSION=$VERSION + - | + docker build --pull \ + -t "$DOCKER_IMAGE_NAME:$PRIMARY_TAG" \ + -f ci/pyrrha/Dockerfile \ + --build-arg DISASS_IMAGE=$CI_REGISTRY_IMAGE/ci/${BACKEND} \ + --build-arg DISASS_IMAGE_VERSION=$VERSION \ . 
- docker push "$DOCKER_IMAGE_NAME:$PRIMARY_TAG" @@ -110,7 +130,7 @@ build_image: #========================== OBJECTS TESTS ==================================== -test_data_structures: +test_data_structures: stage: test # Only run tests when source, tests, or packaging metadata changed. # Inherited by test_fs and test_fs-cg via `extends`. @@ -118,35 +138,6 @@ test_data_structures: - if: '$CI_COMMIT_TAG =~ /^v\d+\.\d+\.\d+$/' - changes: *source_paths before_script: - - echo "=== Java diagnostics ===" - - java -version - - echo "JAVA_HOME=$JAVA_HOME" - - cat "${GHIDRA_INSTALL_DIR}/support/launch.properties" - - python3 -c " -import os, jpype, subprocess -print('JAVA_HOME:', os.environ.get('JAVA_HOME', '')) -print('JPype default JVM:', jpype.getDefaultJVMPath()) -# Simulate exactly what pyghidra does -launch_support = '${GHIDRA_INSTALL_DIR}/support/LaunchSupport.jar' -ghidra_dir = '${GHIDRA_INSTALL_DIR}' -cmd = f'java -cp \"{launch_support}\" LaunchSupport \"{ghidra_dir}\" -jdk_home -save' -print('Running:', cmd) -result = subprocess.run(cmd, shell=True, capture_output=True, text=True) -print('stdout:', result.stdout) -print('stderr:', result.stderr) -print('returncode:', result.returncode) -if result.returncode == 0: - java_home = result.stdout.strip() - print('LaunchSupport returned JAVA_HOME:', java_home) - os.environ['JAVA_HOME'] = java_home - print('JPype JVM path with that JAVA_HOME:', jpype.getDefaultJVMPath()) - try: - jpype.startJVM(None, convertStrings=True, ignoreUnrecognized=True) - print('JVM started OK, java version:', jpype.java.lang.System.getProperty('java.version')) - jpype.shutdownJVM() - except Exception as e: - print('JVM start FAILED:', e) -" - echo -e "\e[95m===== Install Pyrrha with test extension" - pip install '.[test]' script: @@ -167,9 +158,9 @@ if result.returncode == 0: path: coverage.xml #========================== MAPPERS TESTS ==================================== -test_fs: +test_fs: extends: - - test_data_structures + - 
test_data_structures artifacts: name: db_$CI_JOB_NAME_SLUG paths: @@ -181,15 +172,21 @@ test_fs: TEST_COVERAGE_SOURCE: pyrrha_mapper.mappers.imports_mapper TEST_PATH: tests/test_cli.py::TestFSMapper PYTEST_ARTIFACTS_DIR: test_artifacts - test_fs-cg: - extends: + extends: - test_fs - image: - name: $CI_REGISTRY_IMAGE/ci/${BACKEND}:${VERSION} + # Pull the image built in this pipeline when on main, dev, or a release tag + # (tagged as -main, -dev, or -). + # For any other branch (e.g. feature branches) no image is built, so we + # fall back to `latest` which always tracks the last successful main build. + image: + name: $CI_REGISTRY_IMAGE/pyrrha-${BACKEND}:${DISASS_IMAGE_TAG} docker: user: user + before_script: + - echo -e "\e[95m===== Install Pyrrha with test extension" + - pip install '.[test]' variables: DB: ${BACKEND}_${VERSION} MAPPER: fs-cg @@ -198,6 +195,33 @@ test_fs-cg: TEST_PATH: tests/test_cli.py::TestFsCgMapper TEST_SUP_OPTIONS: ${MAPPER_OPTIONS} HEXRAYS_LICENSE: "${IDA_LICENSE}" + # Default: fall back to latest (last successful main build). + # Overridden per-ref by the rules below. + DISASS_IMAGE_TAG: "latest" + # Point pyghidra at the JDK as proper job variables so they are present in + # the environment of EVERY process in the job — including the + # `coverage run -m pytest` process and any multiprocessing workers spawned + # by the mapper (which use the "spawn" start method and re-exec Python). + # Shell `export`s in before_script are not guaranteed to reach those + # re-exec'd children, but GitLab job variables always are. + # /opt/java/openjdk is the temurin JDK path, confirmed present in all + # ghidra images. pyghidra reads JAVA_HOME_OVERRIDE first (launcher.py:202), + # bypassing LaunchSupport entirely. + JAVA_HOME: /opt/java/openjdk + JAVA_HOME_OVERRIDE: /opt/java/openjdk + rules: + - if: '$CI_COMMIT_TAG =~ /^v\d+\.\d+\.\d+$/' + variables: + DISASS_IMAGE_TAG: "${VERSION}-${CI_COMMIT_TAG#v}" # strip the leading v, e.g. 
12.0.4-1.2.3 + - if: '$CI_COMMIT_BRANCH == "main"' + changes: *source_paths + variables: + DISASS_IMAGE_TAG: "${VERSION}-main" + - if: '$CI_COMMIT_BRANCH == "dev"' + changes: *source_paths + variables: + DISASS_IMAGE_TAG: "${VERSION}-dev" + - changes: *source_paths # any other branch: use latest parallel: matrix: - BACKEND: "ida" @@ -212,7 +236,7 @@ test_fs-cg: project: firmware-re/cartography/pyrrha-internal-documentation branch: main # strategy: depend # flip on to surface downstream failure on pyrrha - # CI. Off by default — a broken docs build shouldn't + # CI. Off for now — a broken docs build shouldn't # block pyrrha. variables: UPSTREAM_PIPELINE_URL: $CI_PIPELINE_URL From 723ca6165dc3b3ed1e3bd216c50e4fbc6c6ef3e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Wed, 3 Jun 2026 17:17:43 +0200 Subject: [PATCH 49/62] [fix] ci: ghidra docker building script --- ci/ghidra/build.sh | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ci/ghidra/build.sh b/ci/ghidra/build.sh index 52c46c8..7375489 100755 --- a/ci/ghidra/build.sh +++ b/ci/ghidra/build.sh @@ -121,6 +121,7 @@ ghidra_date="${DEFAULT_DATE}" ghidra_sha256="${DEFAULT_SHA256}" image_name="${IMAGE_NAME_DEFAULT}" version_overridden=false +sha256_overridden=false while [[ $# -gt 0 ]]; do case "$1" in @@ -138,6 +139,7 @@ while [[ $# -gt 0 ]]; do -s|--sha256) [[ -n "${2:-}" ]] || die "--sha256 requires an argument." ghidra_sha256="$2" + sha256_overridden=true shift 2 ;; -n|--name) @@ -154,9 +156,9 @@ while [[ $# -gt 0 ]]; do esac done -# If the user overrode --version but not --sha256, warn that the default -# SHA-256 is almost certainly wrong for a different version. -if [[ "${version_overridden}" == true && "${ghidra_sha256}" == "${DEFAULT_SHA256}" ]]; then +# If the user overrode --version but not --sha256, the default SHA-256 is +# almost certainly wrong for a different version. 
+if [[ "${version_overridden}" == true && "${sha256_overridden}" == false ]]; then die "You overrode --version but not --sha256. " \ "Please provide the correct SHA-256 for Ghidra ${ghidra_version} via --sha256." fi @@ -186,5 +188,4 @@ ${DOCKER} tag "${image_tag}" "${image_name}:latest" echo "==> Successfully built ${image_tag}" echo "==> Also tagged as ${image_name}:latest" -echo "==> Done." - +echo "==> Done." \ No newline at end of file From d7195e6c92ddcf462dcc6ebe20bfc14090ef4cbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Wed, 3 Jun 2026 17:19:38 +0200 Subject: [PATCH 50/62] ci: change entrypoint of Pyrrha docker --- ci/pyrrha/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/pyrrha/Dockerfile b/ci/pyrrha/Dockerfile index c2277d2..4788010 100644 --- a/ci/pyrrha/Dockerfile +++ b/ci/pyrrha/Dockerfile @@ -62,4 +62,4 @@ RUN pip install --no-cache-dir --force-reinstall . && \ WORKDIR ${PYRRHA_WORKING_DIR} -ENTRYPOINT ["pyrrha"] +CMD ["pyrrha"] From 0b15f6b3894a037e1ff4143bb5430c2f6796173c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Wed, 3 Jun 2026 16:33:22 +0200 Subject: [PATCH 51/62] [fix] tests: run pyrrha in a subprocess to allow JVM to start MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Eloïse Brocas --- tests/test_cli.py | 75 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 61 insertions(+), 14 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 527c777..20dc8aa 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -33,11 +33,50 @@ def check_click_result(res: Result) -> None: assert res.exit_code == 0, res.output assert not res.exception, res.exception for log in res.stderr.splitlines(): - assert ( - "ERROR" not in log - and "WARNING" not in log - and "CRITICAL" not in log - ), f"Error log: {log}" + assert "ERROR" not in log and "WARNING" not in log and "CRITICAL" not in log, ( + 
f"Error log: {log}" + ) + + +class _SubprocessResult(NamedTuple): + """Mimic the subset of click ``Result`` used by ``check_click_result``. + + Backends that start a JVM (Ghidra via pyghidra/JPype) cannot be launched + reliably with ``CliRunner.invoke``: it runs the command *in-process*, and + starting the JVM inside the already-initialised pytest/coverage process + aborts JVM start-up (surfacing as + ``module '_jpype' has no attribute '_java_lang_Class'``). Running pyrrha + in a fresh subprocess - exactly how it is used in production and in the + standalone CLI - avoids this entirely. + """ + + exit_code: int + output: str + stderr: str + exception: BaseException | None = None + + +def run_pyrrha_subprocess(args: list) -> "_SubprocessResult": + """Run the pyrrha CLI in a separate process and adapt the result. + + :param args: CLI arguments (without the leading ``pyrrha``). + :return: a ``Result``-compatible object accepted by ``check_click_result``. + """ + import subprocess + import sys + + completed = subprocess.run( + [sys.executable, "-m", "pyrrha_mapper", *map(str, args)], + capture_output=True, + text=True, + ) + return _SubprocessResult( + exit_code=completed.returncode, + output=completed.stdout + completed.stderr, + stderr=completed.stderr, + exception=None, + ) + class TestCLI: """Tests to check that the CLI works and display correct messages.""" @@ -93,11 +132,11 @@ def SUBCOMMAND(self) -> str: FW_TEST_SYMLINKS_PATHS = {Path("/lib/libssl.so")} FW_TEST_SONAMES = { - "ld-linux.so.3" : "ld-linux.so.3", + "ld-linux.so.3": "ld-linux.so.3", "libcrypto.so.FOR_SONAME_TESTING": "libcrypto.so.1.1", "libdl.so.2": "libdl.so.2", "libpthread.so.0": "libpthread.so.0", - "libssl.so.1.1": "libssl.so.1.1" + "libssl.so.1.1": "libssl.so.1.1", } # =============================== INTERNAL STUFFS ================================== @@ -277,7 +316,9 @@ def test_sonames(self, bin_path: Path, export_dump: FileSystem) -> None: """Imported symbols correspond to a symbol object.""" 
_bin = export_dump.get_binary_by_path(bin_path) if _bin.path.name in BaseTestFsMapper.FW_TEST_SONAMES.keys(): - assert BaseTestFsMapper.FW_TEST_SONAMES[_bin.path.name] == _bin.soname, "Some sonames are not matching" + assert BaseTestFsMapper.FW_TEST_SONAMES[_bin.path.name] == _bin.soname, ( + "Some sonames are not matching" + ) class TestFsCgMapper(BaseTestFsMapper): @@ -296,8 +337,11 @@ def export_path(self) -> Path: # noqa: D102 @pytest.fixture(scope="class") def pyrrha_exec(self, request, tmp_path_factory) -> BaseTestFsMapper.ExecResults: - """Run pyrrha whith the given thread number and the given db path.""" - runner = CliRunner() + """Run pyrrha whith the given thread number and the given db path. + + Uses a subprocess (not CliRunner) because the Ghidra backend starts a + JVM, which cannot be launched in-process inside pytest. + """ tmp_path = ( tmp_path_factory.mktemp("db", numbered=True) / f"{self.SUBCOMMAND}-{request.param}.srctrldb" @@ -312,12 +356,15 @@ def pyrrha_exec(self, request, tmp_path_factory) -> BaseTestFsMapper.ExecResults request.param, f"{self.FW_TEST_PATH}", ] - return self.ExecResults(res=runner.invoke(self.COMMAND, args), db_path=tmp_path) + return self.ExecResults(res=run_pyrrha_subprocess(args), db_path=tmp_path) @pytest.fixture(scope="class") def export_res(self, tmp_path_factory, request) -> BaseTestFsMapper.ExecResults: - """Run Pyrrha with export activated.""" - runner = CliRunner() + """Run Pyrrha with export activated. + + Uses a subprocess (not CliRunner) because the Ghidra backend starts a + JVM, which cannot be launched in-process inside pytest. 
+ """ tmp_path = ( tmp_path_factory.mktemp("db", numbered=True) / f"{self.SUBCOMMAND}-{request.param}-export.srctrldb" @@ -332,7 +379,7 @@ def export_res(self, tmp_path_factory, request) -> BaseTestFsMapper.ExecResults: request.param, f"{self.FW_TEST_PATH}", ] - return self.ExecResults(res=runner.invoke(self.COMMAND, args), db_path=tmp_path) + return self.ExecResults(res=run_pyrrha_subprocess(args), db_path=tmp_path) # =================================== TESTS ======================================== From df7435411c165b89cdb9e3712b4af7284ef8a86c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 16 Jun 2026 07:37:25 +0000 Subject: [PATCH 52/62] fix(decomp): repair mapper run, call-graph and source-location indexing - map() now returns bool so the CLI reports success/failure correctly - map() runs the right phase per loop: index_function, then index_decompiled, then index_call_graph (all three previously called index_function) - fix log-prefix formatting that applied :0x to a list instead of the addr - record the binary as a class node and set self.bin.id so functions get a valid parent_id (mirrors InterImageCGMapper.record_binary_in_db) - index_call_graph reads source_calls_loc by callee address (the key used at population) instead of the caller address - source_calls_loc is a defaultdict(list) so per-callee appends no longer raise KeyError - fix GhidraDecompilMapper docstring (was 'IDA Pro') --- src/pyrrha_mapper/mappers/decomp_mapper.py | 65 ++++++++++++++-------- 1 file changed, 42 insertions(+), 23 deletions(-) diff --git a/src/pyrrha_mapper/mappers/decomp_mapper.py b/src/pyrrha_mapper/mappers/decomp_mapper.py index 0638c2d..8054bc5 100644 --- a/src/pyrrha_mapper/mappers/decomp_mapper.py +++ b/src/pyrrha_mapper/mappers/decomp_mapper.py @@ -16,6 +16,7 @@ """Decompilation code binary mapper.""" import logging +from collections import defaultdict from dataclasses import dataclass, field from pathlib import Path from tempfile import 
NamedTemporaryFile @@ -60,7 +61,9 @@ class FuncData: source: str source_id: int | None = None declaration: Location | None = None - source_calls_loc: dict[int, list[Location]] = field(default_factory=dict) + # Keyed by callee (parser-space) address; defaultdict so call-site locations + # can be appended without pre-seeding each callee entry. + source_calls_loc: dict[int, list[Location]] = field(default_factory=lambda: defaultdict(list)) @property def id(self) -> int | None: @@ -106,6 +109,9 @@ def __init__( self.bin = Binary(path=bin_path) self.functions: dict[int, FuncData] = dict() self.source_ids: dict[int, int] = dict() + # Display binaries as a dedicated "Binaries" group in NumbatUI, mirroring + # the inter-image call graph mapper so both mappers share a graph shape. + self.db_interface.set_node_type("class", "Binaries", "binary") def record_function(self, func: FuncData, log_prefix) -> FuncData: """Record a function into the DB (do not record the associated source). @@ -293,42 +299,55 @@ def index_call_graph(self, addr, log_prefix) -> None: continue self.db_interface.record_ref_call(func.id, child.id) - if func.source == "" or func.source_calls_loc[addr] == [] or func.source_id is None: + # source_calls_loc is keyed by the *callee* address (see + # index_decompiled), so look up the locations for this child. + child_locations = func.source_calls_loc.get(child_addr, []) + if func.source == "" or child_locations == [] or func.source_id is None: continue - for location in func.source_calls_loc[addr]: + for location in child_locations: self.db_interface.record_reference_location(child.id, func.source_id, *location) - def map(self) -> None: - """Run the successive steps of the mapping.""" - with Progress( - TextColumn("[progress.description]{task.description}"), - BarColumn(), - MofNCompleteColumn(), - TimeElapsedColumn(), - ) as progress: + def map(self) -> bool: + """Run the successive steps of the mapping. 
+ :return: True if the binary node was recorded and indexing ran, else False. + """ + # Record the binary as a class node so functions can be attached to it + # via parent_id. Without this id, record_function would orphan every + # function. Mirrors InterImageCGMapper.record_binary_in_db. + self.bin.id = self.db_interface.record_class( + self.bin.name, prefix=f"{self.bin.path.parent}/", delimiter=":" + ) + if self.bin.id is None: + logging.error(f"[binary indexing] {self.bin.name}: record of binary failed") + return False + + with Progress( + TextColumn("[progress.description]{task.description}"), + BarColumn(), + MofNCompleteColumn(), + TimeElapsedColumn(), + ) as progress: func_addrs = list(self.func_addrs) - func_indexing = progress.add_task( - "[red]Function indexing", total=len(func_addrs) - ) + func_indexing = progress.add_task("[red]Function indexing", total=len(func_addrs)) for addr in func_addrs: - self.index_function(addr, f"[function indexing] {func_addrs:0x}") + self.index_function(addr, f"[function indexing] {addr:#x}") progress.update(func_indexing, advance=1) decompilee_indexing = progress.add_task( - "[orange_red1]Source indexing", total=len(func_addrs) + "[orange_red1]Source indexing", total=len(self.functions) ) for addr in self.functions.keys(): - self.index_function(addr, f"[source indexing] {self.functions[addr].name}") + self.index_decompiled(addr, f"[source indexing] {self.functions[addr].name}") progress.update(decompilee_indexing, advance=1) - cg_indexing = progress.add_task( - "[gold1]Call graph indexing", total=len(func_addrs) - ) + cg_indexing = progress.add_task("[gold1]Call graph indexing", total=len(self.functions)) for addr in self.functions.keys(): - self.index_function(addr, f"[call graph indexing] {self.functions[addr].name}") + self.index_call_graph(addr, f"[call graph indexing] {self.functions[addr].name}") progress.update(cg_indexing, advance=1) + return True + class IdaDecompilMapper(DecompilMapper, IDA): """Decompile Mapper 
backed by IDA Pro.""" @@ -337,6 +356,6 @@ class IdaDecompilMapper(DecompilMapper, IDA): class GhidraDecompilMapper(DecompilMapper, Ghidra): - """Decompile Mapper backed by IDA Pro.""" + """Decompile Mapper backed by Ghidra.""" - pass \ No newline at end of file + pass From ee202b1aa9539e8f44856894a31607d72b66e584 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 16 Jun 2026 07:39:01 +0000 Subject: [PATCH 53/62] feat(decomp): add serialisable export model and CLI export flag Add decomp_objects.py with three pydantic models projecting the decomp mapper's transient structures into a JSON-serialisable form: - ExportedLocation: serialisable mirror of Location - ExportedFunction: serialisable mirror of FuncData, embedding Symbol and exposing id/name/demangled_name/addr as delegating properties - ExportedDecompilation: single-binary container keyed by parser-space address, with int-key serialiser/validator, write/from_json_export and a model_dump_json override, mirroring the FileSystem export style Wire it in: - DecompilMapper.to_export() builds an ExportedDecompilation from a run - the decomp CLI command gains -e/--export, writing .json when set - export the new classes from pyrrha_mapper.mappers --- src/pyrrha_mapper/__main__.py | 16 +- src/pyrrha_mapper/mappers/__init__.py | 4 + src/pyrrha_mapper/mappers/decomp_mapper.py | 9 + src/pyrrha_mapper/mappers/decomp_objects.py | 294 ++++++++++++++++++++ 4 files changed, 320 insertions(+), 3 deletions(-) create mode 100644 src/pyrrha_mapper/mappers/decomp_objects.py diff --git a/src/pyrrha_mapper/__main__.py b/src/pyrrha_mapper/__main__.py index 76ef92b..7bfa423 100644 --- a/src/pyrrha_mapper/__main__.py +++ b/src/pyrrha_mapper/__main__.py @@ -249,9 +249,7 @@ def fs_mapper( db_instance = setup_db(db) root_directory = root_directory.absolute() - filesystem = FileSystemImportsMapper(root_directory, db_instance).map( - jobs, resolve_duplicates - ) + filesystem = FileSystemImportsMapper(root_directory, 
db_instance).map(jobs, resolve_duplicates) if export: filesystem.write(db_instance.path.with_suffix(".json")) @@ -315,6 +313,13 @@ def fs_call_graph_mapper( ), ) @backend_option +@click.option( + "-e", + "--export", + help="Create a JSON export of the resulting decompilation mapping.", + is_flag=True, + default=False, +) @click.argument( "executable", type=click.Path(exists=False, file_okay=True, dir_okay=False, path_type=Path), @@ -323,6 +328,7 @@ def fs_exe_decompiled_mapper( debug: bool, db: Path, backend: Backend, + export: bool, executable: Path, ): """Map a single executable with decompiled code.""" @@ -343,6 +349,10 @@ def fs_exe_decompiled_mapper( if mapper.map(): logging.info("success.") + if export: + export_path = db_instance.path.with_suffix(".json") + mapper.to_export().write(export_path) + logging.info(f"write export into: {export_path}") else: logging.error("failure.") diff --git a/src/pyrrha_mapper/mappers/__init__.py b/src/pyrrha_mapper/mappers/__init__.py index 39dba61..239b7c5 100644 --- a/src/pyrrha_mapper/mappers/__init__.py +++ b/src/pyrrha_mapper/mappers/__init__.py @@ -15,12 +15,16 @@ # limitations under the License. 
"""Module for the intercg mapper.""" from .decomp_mapper import GhidraDecompilMapper, IdaDecompilMapper +from .decomp_objects import ExportedDecompilation, ExportedFunction, ExportedLocation from .imports_mapper import FileSystemImportsMapper, hide_progress from .intercg_mapper import InterImageCGMapper from .objects import Binary, FileSystem, Symbol, Symlink __all__ = ["IdaDecompilMapper", "GhidraDecompilMapper", + "ExportedDecompilation", + "ExportedFunction", + "ExportedLocation", "InterImageCGMapper", "FileSystemImportsMapper", "hide_progress", diff --git a/src/pyrrha_mapper/mappers/decomp_mapper.py b/src/pyrrha_mapper/mappers/decomp_mapper.py index 8054bc5..4a3eeba 100644 --- a/src/pyrrha_mapper/mappers/decomp_mapper.py +++ b/src/pyrrha_mapper/mappers/decomp_mapper.py @@ -34,6 +34,7 @@ from pyrrha_mapper.backend import IDA, Backend, Ghidra from pyrrha_mapper.types import FuncType +from .decomp_objects import ExportedDecompilation from .objects import Binary, Symbol @@ -348,6 +349,14 @@ def map(self) -> bool: return True + def to_export(self) -> ExportedDecompilation: + """Build a serialisable export of the current mapping result. + + :return: an ExportedDecompilation projecting this mapper's binary and + functions into a JSON-serialisable model. + """ + return ExportedDecompilation.from_mapper(self) + class IdaDecompilMapper(DecompilMapper, IDA): """Decompile Mapper backed by IDA Pro.""" diff --git a/src/pyrrha_mapper/mappers/decomp_objects.py b/src/pyrrha_mapper/mappers/decomp_objects.py new file mode 100644 index 0000000..d90ee69 --- /dev/null +++ b/src/pyrrha_mapper/mappers/decomp_objects.py @@ -0,0 +1,294 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023-2025 Quarkslab +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Serialisable export model for the decompilation mapper. + +These pydantic models mirror the transient analysis structures of +``decomp_mapper`` (``Location``, ``FuncData``) but, unlike them, can be dumped +to / loaded from JSON. The mapper keeps using the lightweight dataclass/ +NamedTuple for the hot indexing loop; this module provides the serialisable +projection produced once at the end of a run (see +``ExportedDecompilation.from_mapper``). + +All function addresses are expressed in **parser space** (the native address +space of the underlying tool — IDA, Ghidra, etc.), exactly as in the mapper. +""" + +from __future__ import annotations + +import json +from collections.abc import Iterable +from pathlib import Path +from typing import TYPE_CHECKING, Any, Self + +from pydantic import ( + BaseModel, + Field, + SerializationInfo, + ValidationInfo, + field_serializer, + field_validator, + model_validator, +) + +from pyrrha_mapper.types import FuncType + +from .objects import Symbol + +if TYPE_CHECKING: # pragma: no cover + from .decomp_mapper import DecompilMapper, FuncData, Location + + +class ExportedLocation(BaseModel): + """Serialisable location of a word (or more) inside a decompiled source. + + Mirror of ``decomp_mapper.Location`` (a ``NamedTuple``) that can be dumped + to and loaded from JSON. Lines and columns are 1-based, matching the + convention used by the mapper when it records symbol/reference locations. 
+ """ + + start_line: int + start_col: int + end_line: int + end_col: int + + @classmethod + def from_location(cls, location: Location) -> Self: + """:return: an ExportedLocation built from a mapper Location.""" + return cls( + start_line=location.start_line, + start_col=location.start_col, + end_line=location.end_line, + end_col=location.end_col, + ) + + def as_tuple(self) -> tuple[int, int, int, int]: + """:return: the location as a ``(start_line, start_col, end_line, end_col)`` tuple.""" + return (self.start_line, self.start_col, self.end_line, self.end_col) + + # from https://github.com/pydantic/pydantic/discussions/2910 + def __lt__(self, other): # noqa: D105 + return tuple(self.model_dump().values()) < tuple(other.model_dump().values()) + + def __le__(self, other): # noqa: D105 + return tuple(self.model_dump().values()) <= tuple(other.model_dump().values()) + + def __gt__(self, other): # noqa: D105 + return tuple(self.model_dump().values()) > tuple(other.model_dump().values()) + + def __ge__(self, other): # noqa: D105 + return tuple(self.model_dump().values()) >= tuple(other.model_dump().values()) + + +class ExportedFunction(BaseModel): + """Serialisable view of a single decompiled function. + + Mirror of ``decomp_mapper.FuncData``. The underlying :class:`Symbol` is + embedded directly; ``id``/``name``/``demangled_name``/``addr`` are exposed + as delegating properties so this object offers the same read surface as + ``FuncData``. + """ + + symbol: Symbol + type: FuncType + calls: list[int] = Field(default_factory=list) + callers: list[int] = Field(default_factory=list) + source: str = "" + source_id: int | None = None + declaration: ExportedLocation | None = None + # Keyed by callee (parser-space) address, as in FuncData.source_calls_loc. 
+ source_calls_loc: dict[int, list[ExportedLocation]] = Field(default_factory=dict) + + @field_validator("symbol", mode="after") + @classmethod + def validate_symbol_is_func(cls, value: Symbol) -> Symbol: + """Ensure the embedded symbol is a function.""" + if not value.is_func: + raise ValueError(f"symbol '{value}' cannot back a function as 'is_func' is False") + return value + + @property + def id(self) -> int | None: + """:return: the associated DB id if any.""" + return self.symbol.id + + @id.setter + def id(self, val: int) -> None: + self.symbol.id = val + + @property + def name(self) -> str: + """:return: mangled name of the function.""" + return self.symbol.name + + @property + def demangled_name(self) -> str: + """:return: demangled name of the function.""" + return self.symbol.demangled_name + + @property + def addr(self) -> int: + """:return: address of the function in the binary (parser space).""" + assert self.symbol.addr is not None + return self.symbol.addr + + @classmethod + def from_func_data(cls, func: FuncData) -> Self: + """:return: an ExportedFunction built from a mapper FuncData object.""" + declaration = ( + ExportedLocation.from_location(func.declaration) + if func.declaration is not None + else None + ) + source_calls_loc = { + callee_addr: [ExportedLocation.from_location(loc) for loc in locations] + for callee_addr, locations in func.source_calls_loc.items() + } + return cls( + symbol=func.symbol, + type=func.type, + calls=list(func.calls), + callers=list(func.callers), + source=func.source, + source_id=func.source_id, + declaration=declaration, + source_calls_loc=source_calls_loc, + ) + + def __repr__(self): # noqa: D105 + return f"ExportedFunction('{self.name}')" + + +class ExportedDecompilation(BaseModel): + """Serialisable result of a single ``DecompilMapper`` run. + + It stores the analysed binary identity and the decompiled functions keyed + by their parser-space entry-point address. 
It is based on pydantic so it + can be dumped to a dict/JSON and rebuilt from these dumps. + """ + + path: Path + id: int | None = None + name: str = "" + functions: dict[int, ExportedFunction] = Field(default_factory=dict) + + def model_post_init(self, __context: Any) -> None: + """Enforce object name based on its path.""" + self.name = self.path.name + + # ----------------------------- Serialisation --------------------------------- + + @field_serializer("functions", mode="plain", when_used="always") + def serialize_functions( + self, v: dict[int, ExportedFunction], info: SerializationInfo + ) -> dict[Any, Any]: + """Serialize the address-keyed functions dict. + + JSON object keys must be strings, so integer addresses are stringified + in JSON mode and kept as integers in python mode. + """ + mode = "json" if info.mode_is_json() else "python" + res: dict[Any, Any] = dict() + for addr, func in v.items(): + key = str(addr) if info.mode_is_json() else addr + res[key] = func.model_dump(mode=mode) + return res + + @field_validator("functions", mode="before") + @classmethod + def validate_functions(cls, data: Any, info: ValidationInfo) -> Any: + """Validate a dict dump and turn it into an ``int -> ExportedFunction`` dict. + + Accepts an already-built mapping, a python dump (int keys) or a JSON + dump (string keys); the latter has its keys converted back to int. 
+ """ + if not isinstance(data, dict): + raise ValueError("provided functions data is not a dict") + res: dict[int, ExportedFunction] = dict() + for addr, content in data.items(): + try: + int_addr = int(addr) + except (TypeError, ValueError) as e: + raise ValueError(f"Cannot convert function key '{addr}' into an int: {e}") from e + if isinstance(content, ExportedFunction): + res[int_addr] = content + else: + res[int_addr] = ExportedFunction.model_validate(content) + return res + + @model_validator(mode="after") + def validate_keys_match_addr(self) -> Self: + """Ensure each function is stored under its own address when it has one.""" + for addr, func in self.functions.items(): + if func.symbol.addr is not None and func.symbol.addr != addr: + raise ValueError( + f"function '{func.name}' stored under address {addr} but its symbol " + f"address is {func.symbol.addr}" + ) + return self + + def model_dump_json(self, **args) -> str: + """Override classic pydantic model_dump_json with preselected arguments.""" + return json.dumps(self.model_dump(mode="json", **args)) + + def write(self, export_path: Path) -> None: + """Dump content of the instance into a JSON file.""" + export_path.write_text(self.model_dump_json()) + + @classmethod + def from_json_export(cls, export_path: Path | str) -> Self: + """Create and populate an instance from a JSON file content.""" + export_path = Path(export_path) + return cls.model_validate_json(export_path.read_text()) + + @classmethod + def from_mapper(cls, mapper: DecompilMapper) -> Self: + """:return: an ExportedDecompilation built from a DecompilMapper run.""" + functions = { + addr: ExportedFunction.from_func_data(func) for addr, func in mapper.functions.items() + } + return cls(path=mapper.bin.path, id=mapper.bin.id, functions=functions) + + # ----------------------------- Manipulation helpers --------------------------- + + def add_function(self, func: ExportedFunction) -> None: + """Record a function, keyed by its address. 
Overrides any existing entry.""" + self.functions[func.addr] = func + + def function_exists(self, addr: int) -> bool: + """:return: True if a function exists at the given address.""" + return addr in self.functions + + def function_name_exists(self, name: str) -> bool: + """:return: True if a function with the given (mangled) name exists.""" + return any(func.name == name for func in self.functions.values()) + + def get_function_by_addr(self, addr: int) -> ExportedFunction: + """:return: the function recorded at the given address.""" + return self.functions[addr] + + def get_function_by_name(self, name: str) -> ExportedFunction: + """:return: the first function with the given (mangled) name.""" + for func in self.functions.values(): + if func.name == name: + return func + raise KeyError(name) + + def iter_functions(self) -> Iterable[ExportedFunction]: + """:return: an iterable over the functions of the binary.""" + yield from self.functions.values() + + def __repr__(self): # noqa: D105 + return f"ExportedDecompilation('{self.path}', funcs={len(self.functions)})" From 2e3f0e283e99c9b326e7289b771ea8bad628da17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 16 Jun 2026 07:41:31 +0000 Subject: [PATCH 54/62] test(decomp): cover the export model and the decomp CLI - tests/test_decomp_objects.py: unit tests for ExportedLocation, ExportedFunction and ExportedDecompilation with hand-built fixtures (property delegation, is_func validation, from_func_data/from_mapper conversions, python vs json dump parity, int-key round-trip including the nested source_calls_loc, write/from_json_export round-trip and malformed -dump error paths). 100% line coverage of decomp_objects. - tests/test_cli.py: TestDecompMapper runs the decomp subcommand via a subprocess (Ghidra JVM) on every executable of the test firmware, asserting the JSON export exists, loads as ExportedDecompilation, keys functions by address and carries decompiled source. 
--- tests/test_cli.py | 105 ++++++++++- tests/test_decomp_objects.py | 355 +++++++++++++++++++++++++++++++++++ 2 files changed, 459 insertions(+), 1 deletion(-) create mode 100644 tests/test_decomp_objects.py diff --git a/tests/test_cli.py b/tests/test_cli.py index 20dc8aa..6c2bd53 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -25,7 +25,7 @@ from pyrrha_mapper import FileSystem, Symbol from pyrrha_mapper.__main__ import pyrrha -from pyrrha_mapper.mappers import InterImageCGMapper +from pyrrha_mapper.mappers import ExportedDecompilation, InterImageCGMapper def check_click_result(res: Result) -> None: @@ -408,3 +408,106 @@ def test_resolved_imported_symbols(self, bin_path: Path, export_dump: FileSystem assert target.name in _bin.imported_symbol_names assert _bin.imported_symbol_exists(target.name) assert isinstance(_bin.get_imported_symbol(target.name), Symbol) + + +class TestDecompMapper: + """Functional tests for the decomp mapper. Tests are done from the CLI. + + The decomp mapper runs on a single executable, so each binary of the test + firmware triggers its own ``decomp`` invocation. A subprocess (not + CliRunner) is used because the Ghidra backend starts a JVM, which cannot be + launched in-process inside pytest. + """ + + COMMAND: Command = pyrrha + SUBCOMMAND = "decomp" + + FW_TEST_PATH = Path(__file__).parent / "test_fw" + # Same set of executables as the fs-cg functional tests. 
+ FW_TEST_BIN_PATHS = BaseTestFsMapper.FW_TEST_BIN_PATHS + + class ExecResults(NamedTuple): # noqa: D106 + res: Result + db_path: Path + + @property + def export_path(self) -> Path: # noqa: D102 + return self.db_path.with_suffix(".json") + + @staticmethod + def _path_id(val): + if isinstance(val, Path): + return str(val) + return val + + def _host_path(self, bin_path: Path) -> Path: + """:return: the on-host path of a firmware-relative binary path.""" + return self.FW_TEST_PATH / bin_path.relative_to(bin_path.anchor) + + # =============================== FIXTURES ========================================= + + @pytest.fixture(scope="class") + def export_res(self, tmp_path_factory, request) -> "TestDecompMapper.ExecResults": + """Run the decomp mapper with export activated on a single executable.""" + bin_path: Path = request.param + executable = self._host_path(bin_path) + tmp_path = ( + tmp_path_factory.mktemp("db", numbered=True) + / f"{self.SUBCOMMAND}-{bin_path.name}.srctrldb" + ) + args = [ + self.SUBCOMMAND, + "--backend", + f"{request.config.getoption('--backend')}", + "--db", + f"{tmp_path}", + "--export", + f"{executable}", + ] + return self.ExecResults(res=run_pyrrha_subprocess(args), db_path=tmp_path) + + @pytest.fixture(scope="class") + def export_dump(self, export_res: "TestDecompMapper.ExecResults") -> ExportedDecompilation: + """Load the JSON export into an ExportedDecompilation object.""" + return ExportedDecompilation.from_json_export(export_res.export_path) + + # =================================== TESTS ======================================== + + @pytest.mark.parametrize("export_res", FW_TEST_BIN_PATHS, indirect=True, ids=_path_id) + def test_db_creation(self, export_res: "TestDecompMapper.ExecResults") -> None: + """The NumbatUI DB and project files are generated.""" + check_click_result(export_res.res) + assert export_res.db_path.with_suffix(".srctrldb").exists(), "Missing DB file" + assert export_res.db_path.with_suffix(".srctrlprj").exists(), 
"Missing project file" + + @pytest.mark.parametrize("export_res", FW_TEST_BIN_PATHS, indirect=True, ids=_path_id) + def test_export_creation(self, export_res: "TestDecompMapper.ExecResults") -> None: + """The JSON export file exists.""" + check_click_result(export_res.res) + assert export_res.export_path.exists(), "Export file does not exist" + + @pytest.mark.parametrize("export_res", FW_TEST_BIN_PATHS, indirect=True, ids=_path_id) + def test_export_format(self, export_dump: ExportedDecompilation) -> None: + """The JSON export loads as an ExportedDecompilation object.""" + assert isinstance(export_dump, ExportedDecompilation), "Export cannot be loaded correctly" + + @pytest.mark.parametrize("export_res", FW_TEST_BIN_PATHS, indirect=True, ids=_path_id) + def test_functions_present(self, request, export_dump: ExportedDecompilation) -> None: + """The export records functions and binds them to the analysed binary.""" + bin_path: Path = request.node.callspec.params["export_res"] + assert export_dump.path.name == bin_path.name + assert len(list(export_dump.iter_functions())) > 0, "No function recorded" + + @pytest.mark.parametrize("export_res", FW_TEST_BIN_PATHS, indirect=True, ids=_path_id) + def test_function_addr_keys(self, export_dump: ExportedDecompilation) -> None: + """Every function is stored under its own (parser-space) address.""" + for addr, func in export_dump.functions.items(): + assert func.addr == addr, ( + f"{func.name} stored under {addr:#x} but addr is {func.addr:#x}" + ) + + @pytest.mark.parametrize("export_res", FW_TEST_BIN_PATHS, indirect=True, ids=_path_id) + def test_decompiled_source(self, export_dump: ExportedDecompilation) -> None: + """At least one non-imported function carries decompiled source.""" + with_source = [f for f in export_dump.iter_functions() if f.type != "imported" and f.source] + assert with_source, "No decompiled source recorded for any local function" diff --git a/tests/test_decomp_objects.py b/tests/test_decomp_objects.py new 
file mode 100644 index 0000000..9d20446 --- /dev/null +++ b/tests/test_decomp_objects.py @@ -0,0 +1,355 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023-2025 Quarkslab +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit tests for the decompilation export model (decomp_objects).""" + +import json +from pathlib import Path + +import pytest + +from pyrrha_mapper.mappers.decomp_objects import ( + ExportedDecompilation, + ExportedFunction, + ExportedLocation, +) +from pyrrha_mapper.mappers.objects import Symbol +from pyrrha_mapper.types import FuncType + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def callee_symbol() -> Symbol: + """:return: the symbol used as a callee function.""" + return Symbol(name="bar", demangled_name="bar", is_func=True, id=6, addr=0x2000) + + +@pytest.fixture +def caller_symbol() -> Symbol: + """:return: the symbol used as a caller function.""" + return Symbol(name="foo", demangled_name="foo(int)", is_func=True, id=5, addr=0x1000) + + +@pytest.fixture +def declaration_loc() -> ExportedLocation: + """:return: a location pointing at a function declaration.""" + return ExportedLocation(start_line=1, start_col=6, end_line=1, end_col=9) + + +@pytest.fixture +def call_loc() -> ExportedLocation: + """:return: a location pointing at a call site.""" + return ExportedLocation(start_line=2, start_col=3, 
end_line=2, end_col=6) + + +@pytest.fixture +def callee_func(callee_symbol: Symbol) -> ExportedFunction: + """:return: an ExportedFunction with no calls (a leaf callee).""" + return ExportedFunction(symbol=callee_symbol, type=FuncType.NORMAL, source="void bar(){}") + + +@pytest.fixture +def caller_func( + caller_symbol: Symbol, declaration_loc: ExportedLocation, call_loc: ExportedLocation +) -> ExportedFunction: + """:return: an ExportedFunction that calls bar (addr 0x2000).""" + return ExportedFunction( + symbol=caller_symbol, + type=FuncType.NORMAL, + calls=[0x2000], + callers=[], + source="void foo(int a){\n bar();\n}", + source_id=9, + declaration=declaration_loc, + source_calls_loc={0x2000: [call_loc]}, + ) + + +@pytest.fixture +def imported_func() -> ExportedFunction: + """:return: an imported (extern) ExportedFunction with no source.""" + return ExportedFunction( + symbol=Symbol(name="puts", demangled_name="puts", is_func=True, id=7, addr=0x3000), + type=FuncType.IMPORTED, + ) + + +@pytest.fixture +def example_decomp( + caller_func: ExportedFunction, + callee_func: ExportedFunction, + imported_func: ExportedFunction, +) -> ExportedDecompilation: + """:return: an ExportedDecompilation with caller, callee and an import.""" + return ExportedDecompilation( + path=Path("/bin/example"), + id=1, + functions={ + 0x1000: caller_func, + 0x2000: callee_func, + 0x3000: imported_func, + }, + ) + + +# --------------------------------------------------------------------------- +# ExportedLocation +# --------------------------------------------------------------------------- + + +class TestExportedLocation: + """Tests for the ExportedLocation model.""" + + def test_from_location(self) -> None: + """from_location copies the four coordinates.""" + from pyrrha_mapper.mappers.decomp_mapper import Location + + loc = Location(start_line=3, start_col=4, end_line=3, end_col=10) + exported = ExportedLocation.from_location(loc) + assert exported.as_tuple() == (3, 4, 3, 10) + + def 
test_as_tuple(self, call_loc: ExportedLocation) -> None: + """as_tuple returns the coordinates in declaration order.""" + assert call_loc.as_tuple() == (2, 3, 2, 6) + + def test_ordering(self) -> None: + """Locations are ordered by their dumped tuple.""" + small = ExportedLocation(start_line=1, start_col=1, end_line=1, end_col=2) + big = ExportedLocation(start_line=2, start_col=1, end_line=2, end_col=2) + assert small < big + assert small <= big + assert big > small + assert big >= small + assert small <= small + assert small >= small + + def test_roundtrip(self, call_loc: ExportedLocation) -> None: + """A location survives a JSON round-trip.""" + reloaded = ExportedLocation.model_validate_json(call_loc.model_dump_json()) + assert reloaded == call_loc + + +# --------------------------------------------------------------------------- +# ExportedFunction +# --------------------------------------------------------------------------- + + +class TestExportedFunction: + """Tests for the ExportedFunction model.""" + + def test_property_delegation(self, caller_func: ExportedFunction) -> None: + """id/name/demangled_name/addr delegate to the embedded symbol.""" + assert caller_func.id == 5 + assert caller_func.name == "foo" + assert caller_func.demangled_name == "foo(int)" + assert caller_func.addr == 0x1000 + + def test_id_setter(self, caller_func: ExportedFunction) -> None: + """Setting id updates the embedded symbol.""" + caller_func.id = 42 + assert caller_func.symbol.id == 42 + + def test_repr(self, caller_func: ExportedFunction) -> None: + """The repr uses the mangled name.""" + assert repr(caller_func) == "ExportedFunction('foo')" + + def test_non_func_symbol_rejected(self) -> None: + """A symbol with is_func=False cannot back an ExportedFunction.""" + with pytest.raises(ValueError): + ExportedFunction( + symbol=Symbol(name="data", demangled_name="data", is_func=False, addr=1), + type=FuncType.NORMAL, + ) + + def test_from_func_data(self) -> None: + """from_func_data 
converts a mapper FuncData into an ExportedFunction.""" + from pyrrha_mapper.mappers.decomp_mapper import FuncData, Location + + symbol = Symbol(name="foo", demangled_name="foo", is_func=True, id=5, addr=0x1000) + func = FuncData( + symbol=symbol, + type=FuncType.NORMAL, + calls=[0x2000], + callers=[0x500], + source="void foo(){ bar(); }", + source_id=9, + declaration=Location(1, 6, 1, 9), + ) + func.source_calls_loc[0x2000].append(Location(1, 13, 1, 16)) + + exported = ExportedFunction.from_func_data(func) + assert exported.name == "foo" + assert exported.calls == [0x2000] + assert exported.callers == [0x500] + assert exported.source_id == 9 + assert exported.declaration is not None + assert exported.declaration.as_tuple() == (1, 6, 1, 9) + assert exported.source_calls_loc[0x2000][0].as_tuple() == (1, 13, 1, 16) + + def test_from_func_data_no_declaration(self, callee_symbol: Symbol) -> None: + """from_func_data tolerates a missing declaration.""" + from pyrrha_mapper.mappers.decomp_mapper import FuncData + + func = FuncData( + symbol=callee_symbol, + type=FuncType.NORMAL, + calls=[], + callers=[], + source="", + ) + exported = ExportedFunction.from_func_data(func) + assert exported.declaration is None + assert exported.source_calls_loc == {} + + def test_roundtrip(self, caller_func: ExportedFunction) -> None: + """An ExportedFunction survives a JSON round-trip.""" + reloaded = ExportedFunction.model_validate_json(caller_func.model_dump_json()) + assert reloaded == caller_func + + +# --------------------------------------------------------------------------- +# ExportedDecompilation +# --------------------------------------------------------------------------- + + +class TestExportedDecompilation: + """Tests for the ExportedDecompilation model.""" + + def test_name_from_path(self, example_decomp: ExportedDecompilation) -> None: + """The name is derived from the path.""" + assert example_decomp.name == "example" + + def test_repr(self, example_decomp: 
ExportedDecompilation) -> None: + """The repr reports the path and the function count.""" + assert repr(example_decomp) == "ExportedDecompilation('/bin/example', funcs=3)" + + def test_function_exists(self, example_decomp: ExportedDecompilation) -> None: + """function_exists checks membership by address.""" + assert example_decomp.function_exists(0x1000) + assert not example_decomp.function_exists(0xDEAD) + + def test_function_name_exists(self, example_decomp: ExportedDecompilation) -> None: + """function_name_exists checks membership by mangled name.""" + assert example_decomp.function_name_exists("foo") + assert not example_decomp.function_name_exists("missing") + + def test_get_function_by_addr( + self, example_decomp: ExportedDecompilation, caller_func: ExportedFunction + ) -> None: + """get_function_by_addr retrieves the stored function.""" + assert example_decomp.get_function_by_addr(0x1000) == caller_func + + def test_get_function_by_name( + self, example_decomp: ExportedDecompilation, caller_func: ExportedFunction + ) -> None: + """get_function_by_name retrieves by mangled name and raises otherwise.""" + assert example_decomp.get_function_by_name("foo") == caller_func + with pytest.raises(KeyError): + example_decomp.get_function_by_name("missing") + + def test_add_function(self, callee_func: ExportedFunction) -> None: + """add_function stores a function under its address.""" + decomp = ExportedDecompilation(path=Path("/bin/x")) + decomp.add_function(callee_func) + assert decomp.function_exists(callee_func.addr) + assert decomp.get_function_by_addr(callee_func.addr) == callee_func + + def test_iter_functions(self, example_decomp: ExportedDecompilation) -> None: + """iter_functions yields every stored function.""" + names = sorted(f.name for f in example_decomp.iter_functions()) + assert names == ["bar", "foo", "puts"] + + def test_python_dump_keeps_int_keys(self, example_decomp: ExportedDecompilation) -> None: + """A python-mode dump keeps integer address 
keys.""" + dump = example_decomp.model_dump() + assert set(dump["functions"].keys()) == {0x1000, 0x2000, 0x3000} + + def test_json_dump_stringifies_keys(self, example_decomp: ExportedDecompilation) -> None: + """A JSON-mode dump stringifies the integer address keys.""" + dump_json = json.loads(example_decomp.model_dump_json()) + assert set(dump_json["functions"].keys()) == {"4096", "8192", "12288"} + + def test_roundtrip_equal(self, example_decomp: ExportedDecompilation) -> None: + """A full JSON round-trip preserves equality and int keys.""" + reloaded = ExportedDecompilation.model_validate_json(example_decomp.model_dump_json()) + assert reloaded == example_decomp + assert set(reloaded.functions.keys()) == {0x1000, 0x2000, 0x3000} + # nested int-keyed source_calls_loc is restored too + assert reloaded.functions[0x1000].source_calls_loc[0x2000][0].as_tuple() == (2, 3, 2, 6) + + def test_write_and_from_json_export( + self, example_decomp: ExportedDecompilation, tmp_path: Path + ) -> None: + """Calling write then from_json_export round-trips through a file.""" + export_path = tmp_path / "decomp.json" + example_decomp.write(export_path) + assert json.loads(export_path.read_text()), "exported data cannot be loaded as JSON" + reloaded = ExportedDecompilation.from_json_export(export_path) + assert reloaded == example_decomp + + def test_from_json_export_accepts_str_path( + self, example_decomp: ExportedDecompilation, tmp_path: Path + ) -> None: + """from_json_export also accepts a plain string path.""" + export_path = tmp_path / "decomp.json" + example_decomp.write(export_path) + reloaded = ExportedDecompilation.from_json_export(str(export_path)) + assert reloaded == example_decomp + + def test_validate_rejects_non_dict_functions(self) -> None: + """A non-dict functions payload is rejected.""" + with pytest.raises(ValueError): + ExportedDecompilation.model_validate({"path": "/bin/x", "functions": "nope"}) + + def test_validate_rejects_non_int_key(self, example_decomp: 
ExportedDecompilation) -> None: + """A function key that cannot be coerced to int is rejected.""" + dump = json.loads(example_decomp.model_dump_json()) + dump["functions"]["not_an_int"] = dump["functions"].pop("4096") + with pytest.raises(ValueError): + ExportedDecompilation.model_validate(dump) + + def test_validate_rejects_addr_key_mismatch(self, caller_func: ExportedFunction) -> None: + """A function stored under a key different from its symbol addr is rejected.""" + with pytest.raises(ValueError): + ExportedDecompilation(path=Path("/bin/x"), functions={0x9999: caller_func}) + + def test_from_mapper(self, monkeypatch: pytest.MonkeyPatch) -> None: + """from_mapper projects a DecompilMapper's bin and functions.""" + from pyrrha_mapper.mappers.decomp_mapper import FuncData + from pyrrha_mapper.mappers.objects import Binary + + symbol = Symbol(name="foo", demangled_name="foo", is_func=True, id=5, addr=0x1000) + + class _FakeMapper: + def __init__(self) -> None: + self.bin = Binary(path=Path("/bin/example"), id=1) + self.functions = { + 0x1000: FuncData( + symbol=symbol, + type=FuncType.NORMAL, + calls=[], + callers=[], + source="void foo(){}", + ) + } + + export = ExportedDecompilation.from_mapper(_FakeMapper()) # type: ignore[arg-type] + assert export.path == Path("/bin/example") + assert export.id == 1 + assert export.name == "example" + assert export.get_function_by_addr(0x1000).name == "foo" From feaea542ad536d4ca8ea338520eea53332f63f36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 16 Jun 2026 08:02:58 +0000 Subject: [PATCH 55/62] fix(cli): annotate decomp mapper variable with its base type The decomp command's match block assigned both IdaDecompilMapper and GhidraDecompilMapper to one variable, which mypy rejected as incompatible types. Annotate the variable with the common DecompilMapper base class. 
--- src/pyrrha_mapper/__main__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/pyrrha_mapper/__main__.py b/src/pyrrha_mapper/__main__.py index 7bfa423..0c02421 100644 --- a/src/pyrrha_mapper/__main__.py +++ b/src/pyrrha_mapper/__main__.py @@ -31,6 +31,7 @@ IdaDecompilMapper, InterImageCGMapper, ) +from pyrrha_mapper.mappers.decomp_mapper import DecompilMapper from pyrrha_mapper.types import Backend, ResolveDuplicateOption # ------------------------------------------------------------------------------- @@ -338,6 +339,7 @@ def fs_exe_decompiled_mapper( setup_logs(debug, db) db_instance = setup_db(db) + mapper: DecompilMapper match backend: case Backend.IDA: mapper = IdaDecompilMapper(db_instance, executable) From 72d0163ec0ef1e1cb6f8af1bfc6aac29ee6aa176 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 16 Jun 2026 08:03:50 +0000 Subject: [PATCH 56/62] docs(decomp): document JSON export and expand CHANGELOG - exe-decomp mapper page: document the new -e/--export option and the ExportedDecompilation / ExportedFunction / ExportedLocation structure, with a loadable post-processing example (also drop a duplicated help line) - CHANGELOG: add a curated Unreleased section covering all changes on dev since v1.0.1 (single backend value, qbinary/quokka removal, SONAME support, decomp rework + JSON export, mapper fixes, repo reorganisation and CI), folding in this change set's decomp export feature and fixes --- CHANGELOG.md | 24 ++++++++++++++++++++++++ docs/mappers/exe-decomp.md | 17 ++++++++++++++++- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 582e682..310bd0a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,27 @@ +## Unreleased + +### Features +- All mappers now share a single disassembler `--backend` value (`ida`/`ghidra`) implemented in one common place, replacing the previous per-mapper disassembler/exporter selection. 
+- Remove the `qbinary`/Quokka dependency: `fs-cg` and `decomp` now interact directly with the disassemblers (IDA, Ghidra), so Pyrrha can run on systems without Quokka. +- Add ELF SONAME support: binaries are indexed by their `DT_SONAME` so imports referencing a SONAME resolve even without a matching symlink. +- `decomp` mapper: full rework around a class-based object integrated with the common backend layer. +- `decomp` mapper: add a `-e/--export` option to dump the result as JSON, loadable through the new `ExportedDecompilation` object exposed by Pyrrha. +- Expose `Binary` image base and relocatable information on the internal representation. +- Improve the documentation (installation, quick summary, decomp mapper) and add unit tests for the `decomp` export model plus functional tests for the `decomp` mapper. + +### Fixes +- `intercg` mapper: various fixes around addresses and demangled names, missing Ghidra thunks, extended ignore list, and argument renaming. +- `fs-cg` mapper: avoid an infinite loop in trampoline resolution and add a real timeout to program loading. +- `fs`/`fs-cg` mappers: pass `load_binary` arguments through a `partial` mechanism and improve multiprocessing error handling. +- `decomp` mapper: fix the mapping run (`map` now reports success/failure, runs the decompilation and call-graph indexing phases, and records the binary node so functions get a valid parent). +- `decomp` mapper: fix call-graph source cross-references (call-site locations are looked up by callee address and no longer raise on the first reference). +- `decomp` mapper: fix command-line arguments and improve the decompilation script (correct `NamedTemporaryFile` usage, better IDA decompilation output). +- `cli`: keep an existing suffix in the DB path and annotate the `decomp` mapper variable with its base type to fix a type-checking error. 
+ +### Internal +- Reorganize the repository into two submodules (`backend` and `mappers`) and rework the mappers so backend support lives in a single common place; remove unused modules and the `heimdallr`/disassembly-sync prototype. +- CI: build and test IDA and Ghidra Docker images, export test artifacts, and trigger builds only on relevant changes. + ## v1.0.1—Improve exe-decomp mapper ### Features diff --git a/docs/mappers/exe-decomp.md b/docs/mappers/exe-decomp.md index a73d9a6..1657d76 100644 --- a/docs/mappers/exe-decomp.md +++ b/docs/mappers/exe-decomp.md @@ -21,9 +21,24 @@ Options: -d, --debug Set log level to DEBUG. --db PATH NumbatUI DB file path (.srctrldb). [default: decomp.srctrldb] -b, --backend [ida|ghidra] Backend to use. [default: Backend.IDA] - -h, --help Show this message and exit. Show this message and exit. + -e, --export Create a JSON export of the resulting decompilation mapping. + -h, --help Show this message and exit. ``` +## JSON export + +With the `-e/--export` option, the mapper writes a JSON file next to the database (`.json`) describing the result of the run. It is loaded back into an `ExportedDecompilation` object exposed by Pyrrha, so results can be post-processed without re-running a disassembler: + +```python +from pyrrha_mapper.mappers import ExportedDecompilation + +result = ExportedDecompilation.from_json_export("my_binary.json") +for func in result.iter_functions(): + print(hex(func.addr), func.name, func.type) +``` + +An `ExportedDecompilation` stores the analysed binary identity (`path`, `id`, `name`) and its functions, keyed by their parser-space entry-point address. Each function is an `ExportedFunction` carrying its `Symbol`, its `FuncType`, the addresses it calls and is called by, its decompiled `source`, and the in-source locations of its declaration and call sites (`ExportedLocation`). + !!! 
note This mapper create the Quokka export of the binary as well as a cache version of all the decompiled function of the analyzed binary.. If these files already exist, it loads them without regenerate them. Like that it also allowed to use `pyrrha` in systems without Quokka and/or IDA. From beaeea93f8a5e126b18d51f304fc33cb709ea342 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 16 Jun 2026 08:33:30 +0000 Subject: [PATCH 57/62] ci: run the new decomp tests - test_decomp_objects: unit-test job for the export model, mirroring test_data_structures on a plain python image with a 100% coverage gate on pyrrha_mapper.mappers.decomp_objects - test_decomp: functional job extending test_fs-cg, running tests/test_cli.py::TestDecompMapper across the ida/ghidra backend matrix with coverage over decomp_mapper and decomp_objects --- .gitlab-ci.yml | 18 ++++++++++++++++++ CHANGELOG.md | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index dfd0866..fd7a3f9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -157,6 +157,13 @@ test_data_structures: coverage_format: cobertura path: coverage.xml +test_decomp_objects: + extends: + - test_data_structures + variables: + TEST_COVERAGE_SOURCE: pyrrha_mapper.mappers.decomp_objects + TEST_PATH: tests/test_decomp_objects.py + #========================== MAPPERS TESTS ==================================== test_fs: extends: @@ -229,6 +236,17 @@ test_fs-cg: - BACKEND: "ghidra" VERSION: ["12.0.4"] +test_decomp: + extends: + - test_fs-cg + variables: + DB: decomp_${BACKEND}_${VERSION} + MAPPER: decomp + MAPPER_OPTIONS: '--backend ${BACKEND}' + TEST_COVERAGE_SOURCE: pyrrha_mapper.mappers.decomp_mapper,pyrrha_mapper.mappers.decomp_objects + TEST_PATH: tests/test_cli.py::TestDecompMapper + TEST_SUP_OPTIONS: ${MAPPER_OPTIONS} + #========================== TRIGGER INTERNAL DOC UPDATE ================================ .trigger_docs_base: stage: notify diff --git a/CHANGELOG.md 
b/CHANGELOG.md index 310bd0a..bde45ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,7 +20,7 @@ ### Internal - Reorganize the repository into two submodules (`backend` and `mappers`) and rework the mappers so backend support lives in a single common place; remove unused modules and the `heimdallr`/disassembly-sync prototype. -- CI: build and test IDA and Ghidra Docker images, export test artifacts, and trigger builds only on relevant changes. +- CI: build and test IDA and Ghidra Docker images, run the `decomp` export-model and functional tests, export test artifacts, and trigger builds only on relevant changes. ## v1.0.1—Improve exe-decomp mapper From 9507645aa7bce6e236b68dcfa95efa4619013b61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 16 Jun 2026 08:53:32 +0000 Subject: [PATCH 58/62] fix(ida): use the ida_domain 0.5.0 pseudocode API func_decompiled called get_pseudocode(func, remove_tags=True), but in ida_domain 0.5.0 get_pseudocode(func) takes no remove_tags argument and returns a PseudocodeFunction (not a list of lines), so every decomp run crashed with TypeError once it reached a function body. Call get_pseudocode(func).to_text(remove_tags=True) to get the lines, and catch IdaDomainError (the base of PseudocodeError) instead of RuntimeError, which did not cover decompilation failures. Drop a dead result dict and a misleading log line. --- CHANGELOG.md | 1 + src/pyrrha_mapper/backend/ida.py | 14 ++++++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bde45ab..f9493e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ - `decomp` mapper: fix the mapping run (`map` now reports success/failure, runs the decompilation and call-graph indexing phases, and records the binary node so functions get a valid parent). - `decomp` mapper: fix call-graph source cross-references (call-site locations are looked up by callee address and no longer raise on the first reference). 
- `decomp` mapper: fix command-line arguments and improve the decompilation script (correct `NamedTemporaryFile` usage, better IDA decompilation output). +- `decomp` mapper (IDA backend): use the `ida_domain` 0.5.0 pseudocode API (`get_pseudocode(func).to_text(...)`), fixing a `TypeError` that broke every IDA decompilation run. - `cli`: keep an existing suffix in the DB path and annotate the `decomp` mapper variable with its base type to fix a type-checking error. ### Internal diff --git a/src/pyrrha_mapper/backend/ida.py b/src/pyrrha_mapper/backend/ida.py index a1a4903..a959ed9 100644 --- a/src/pyrrha_mapper/backend/ida.py +++ b/src/pyrrha_mapper/backend/ida.py @@ -36,6 +36,7 @@ def __init__( image_base: int = 0, ) -> None: from ida_domain.database import Database, IdaCommandOptions + self.decompilation_activated = decompilation self.image_base = image_base self._bin_path = bin_path @@ -65,6 +66,7 @@ def is_func_start(self, addr: int) -> bool: def func_addrs(self) -> Iterator[int]: """Yield the parser-space entry-point address of every known function.""" from ida_domain.functions import FunctionFlags + for func in self._ida_db.functions.get_all(): if FunctionFlags.TAIL in self._ida_db.functions.get_flags(func): continue @@ -111,6 +113,7 @@ def func_children(self, addr: int) -> list[int]: :return: list of callee entry-point addresses. """ from ida_domain.functions import FunctionFlags + func = self._get_ida_func(addr) result: list[int] = [] for callee in self._ida_db.functions.get_callees(func) if func is not None else []: @@ -143,6 +146,7 @@ def func_type(self, addr: int) -> FuncType: 6. Default → ``NORMAL``. 
""" from ida_domain.functions import FunctionFlags + func = self._get_ida_func(addr) if func is None: return FuncType.NORMAL @@ -166,19 +170,20 @@ def func_type(self, addr: int) -> FuncType: def func_decompiled(self, addr: int) -> str: """:return: decompilation result of the function""" - result: dict[int, str] = {} + from ida_domain.base import IdaDomainError + func = self._get_ida_func(addr) if func is None: return "" try: - lines = self._ida_db.functions.get_pseudocode(func, remove_tags=True) - except RuntimeError as exc: + pseudocode = self._ida_db.functions.get_pseudocode(func) + lines = pseudocode.to_text(remove_tags=True) + except IdaDomainError as exc: logging.debug( f"[IDA] skipping {func.start_ea:#x} " f"({self._ida_db.functions.get_name(func)!r}): {exc}" ) return "" - logging.info(f"[IDA] decompiled {len(result)} functions from {self._bin_path}") return "\n".join(lines) # ------------------------------------------------------------------ @@ -207,6 +212,7 @@ def _ida_funcs(self) -> Iterator: :return: iterator of ``func_t`` objects with ``FUNC_TAIL`` excluded. """ from ida_domain.functions import FunctionFlags + for func in self._ida_db.functions.get_all(): if FunctionFlags.TAIL in self._ida_db.functions.get_flags(func): continue From 500fb6c95ef922369e5957102d3a9574e32aabd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 16 Jun 2026 08:53:52 +0000 Subject: [PATCH 59/62] fix(tests): give TestDecompMapper.ExecResults a project_path The autouse _collect_export_artifacts fixture in conftest reads export_res.project_path when PYTEST_ARTIFACTS_DIR is set (as it is in CI), but TestDecompMapper.ExecResults only defined export_path and db_path, raising AttributeError at teardown. Add project_path, matching BaseTestFsMapper.ExecResults. 
--- tests/test_cli.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_cli.py b/tests/test_cli.py index 6c2bd53..0f5e2d9 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -430,6 +430,10 @@ class ExecResults(NamedTuple): # noqa: D106 res: Result db_path: Path + @property + def project_path(self) -> Path: # noqa: D102 + return self.db_path.with_suffix(".srctrlprj") + @property def export_path(self) -> Path: # noqa: D102 return self.db_path.with_suffix(".json") From 37740b62b6f72eee4be49ab5e483d839593abb8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 16 Jun 2026 09:17:08 +0000 Subject: [PATCH 60/62] fix(decomp): skip imported functions during source and call-graph indexing Imported functions have no decompiled body (their source is set to "" in index_function) and are never recorded in the DB. Running index_decompiled and index_call_graph over them produced a 'declaration not found' ERROR and 'missing target id' WARNING per imported function, flooding the logs on any real binary. Skip imported functions as source-indexing and call-graph caller targets, and demote the 'cannot record call' message to DEBUG when the callee is imported (an expected condition in a single-binary decompilation, not an error). --- CHANGELOG.md | 1 + src/pyrrha_mapper/mappers/decomp_mapper.py | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f9493e2..586cfbf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ - `decomp` mapper: fix call-graph source cross-references (call-site locations are looked up by callee address and no longer raise on the first reference). - `decomp` mapper: fix command-line arguments and improve the decompilation script (correct `NamedTemporaryFile` usage, better IDA decompilation output). 
- `decomp` mapper (IDA backend): use the `ida_domain` 0.5.0 pseudocode API (`get_pseudocode(func).to_text(...)`), fixing a `TypeError` that broke every IDA decompilation run. +- `decomp` mapper: skip imported functions during source and call-graph indexing (they have no decompiled body), removing spurious per-function error/warning logs. - `cli`: keep an existing suffix in the DB path and annotate the `decomp` mapper variable with its base type to fix a type-checking error. ### Internal diff --git a/src/pyrrha_mapper/mappers/decomp_mapper.py b/src/pyrrha_mapper/mappers/decomp_mapper.py index 4a3eeba..ccf6df8 100644 --- a/src/pyrrha_mapper/mappers/decomp_mapper.py +++ b/src/pyrrha_mapper/mappers/decomp_mapper.py @@ -188,6 +188,12 @@ def index_decompiled(self, addr, log_prefix) -> None: """ func = self.functions[addr] + # Imported functions have no decompiled body (source is set to "" in + # index_function), so there is nothing to locate or record. Skip them + # to avoid spurious "declaration not found" errors. + if func.type == FuncType.IMPORTED: + return + # Build lookup tables for the callees of this function. # normalize_name strips leading/trailing underscores and dots so that # e.g. "__memcpy" and "memcpy" both match the same call-site token. @@ -281,6 +287,10 @@ def index_call_graph(self, addr, log_prefix) -> None: :param log_prefix: string prepended to every log message. """ func = self.functions[addr] + # Imported functions have no body and are not recorded in the DB, so + # they cannot be callers; skip them without warning. + if func.type == FuncType.IMPORTED: + return if func.id is None: logging.warning(f"{log_prefix}: {func.name} is not a registered function, skip") return @@ -293,9 +303,13 @@ def index_call_graph(self, addr, log_prefix) -> None: continue child = self.functions[child_addr] if child.id is None: - logging.warning( + # Imported callees are never recorded in the DB (they have no + # body), so a missing id is expected rather than an error. 
+ level = logging.DEBUG if child.type == FuncType.IMPORTED else logging.WARNING + logging.log( + level, f"{log_prefix}: cannot record call to {child.name} from {func.name} " - + "missing target id." + + "missing target id.", ) continue self.db_interface.record_ref_call(func.id, child.id) From c6d4d27a0c77d7ef96f17b35c8061d15ac43a128 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 16 Jun 2026 09:17:08 +0000 Subject: [PATCH 61/62] test(decomp): tolerate per-function logs in the decomp CLI test The decomp mapper legitimately logs per-function warnings/errors (e.g. a declaration not located in a particular decompiled body) without failing the run. Add check_click_result_allow_logs, which asserts exit code 0 and no exception but does not ban ERROR/WARNING lines (only CRITICAL), and use it in TestDecompMapper. --- tests/test_cli.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 0f5e2d9..0e9df49 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -38,6 +38,20 @@ def check_click_result(res: Result) -> None: ) +def check_click_result_allow_logs(res: Result) -> None: + """Like check_click_result but tolerates per-function ERROR/WARNING logs. + + The decomp mapper legitimately logs warnings/errors for individual + functions (e.g. a declaration not located in some decompiled body); these + do not make the run fail. Only the exit code and absence of an exception + are checked here, plus that no CRITICAL message was emitted. + """ + assert res.exit_code == 0, res.output + assert not res.exception, res.exception + for log in res.stderr.splitlines(): + assert "CRITICAL" not in log, f"Critical log: {log}" + + class _SubprocessResult(NamedTuple): """Mimic the subset of click ``Result`` used by ``check_click_result``. 
@@ -480,14 +494,14 @@ def export_dump(self, export_res: "TestDecompMapper.ExecResults") -> ExportedDec @pytest.mark.parametrize("export_res", FW_TEST_BIN_PATHS, indirect=True, ids=_path_id) def test_db_creation(self, export_res: "TestDecompMapper.ExecResults") -> None: """The NumbatUI DB and project files are generated.""" - check_click_result(export_res.res) + check_click_result_allow_logs(export_res.res) assert export_res.db_path.with_suffix(".srctrldb").exists(), "Missing DB file" assert export_res.db_path.with_suffix(".srctrlprj").exists(), "Missing project file" @pytest.mark.parametrize("export_res", FW_TEST_BIN_PATHS, indirect=True, ids=_path_id) def test_export_creation(self, export_res: "TestDecompMapper.ExecResults") -> None: """The JSON export file exists.""" - check_click_result(export_res.res) + check_click_result_allow_logs(export_res.res) assert export_res.export_path.exists(), "Export file does not exist" @pytest.mark.parametrize("export_res", FW_TEST_BIN_PATHS, indirect=True, ids=_path_id) From 15217ca4fb279308eda7d7c40ec2dc0ae4bb60b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elo=C3=AFse=20Brocas?= Date: Tue, 16 Jun 2026 09:51:59 +0000 Subject: [PATCH 62/62] test(cli): make class-scoped fixtures classmethods pytest 9 deprecates class-scoped fixtures defined as instance methods (they will be removed in pytest 10): the instance a fixture runs on is not the one test methods see. Convert the class-scoped fixtures (pyrrha_exec, export_res, export_dump across the fs, fs-cg and decomp test classes, plus the _host_path helper they call) to classmethods using cls instead of self. The abstract export_res stub stays an instance method since it is always overridden and never run as a fixture. 
--- tests/test_cli.py | 66 ++++++++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 29 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 0e9df49..65ad0a4 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -176,22 +176,23 @@ def _path_id(val): # =============================== FIXTURES ======================================== @pytest.fixture(scope="class") - def pyrrha_exec(self, request, tmp_path_factory) -> ExecResults: + @classmethod + def pyrrha_exec(cls, request, tmp_path_factory) -> ExecResults: """Run pyrrha whith the given thread number and the given db path.""" runner = CliRunner() tmp_path = ( tmp_path_factory.mktemp("db", numbered=True) - / f"{self.SUBCOMMAND}-{request.param}.srctrldb" + / f"{cls.SUBCOMMAND}-{request.param}.srctrldb" ) args = [ - self.SUBCOMMAND, + cls.SUBCOMMAND, "--db", f"{tmp_path}", "-j", request.param, - f"{self.FW_TEST_PATH}", + f"{cls.FW_TEST_PATH}", ] - return self.ExecResults(res=runner.invoke(self.COMMAND, args), db_path=tmp_path) + return cls.ExecResults(res=runner.invoke(cls.COMMAND, args), db_path=tmp_path) @abstractmethod @pytest.fixture(scope="class") @@ -200,7 +201,8 @@ def export_res(self, tmp_path_factory, request) -> ExecResults: ... 
@pytest.fixture(scope="class") - def export_dump(self, export_res: ExecResults) -> FileSystem: + @classmethod + def export_dump(cls, export_res: ExecResults) -> FileSystem: """Load JSON export into a FileSystem object.""" return FileSystem.from_json_export(export_res.export_path) @@ -290,23 +292,24 @@ class TestFSMapper(BaseTestFsMapper): # =============================== FIXTURES ======================================== @pytest.fixture(scope="class") - def export_res(self, tmp_path_factory, request) -> BaseTestFsMapper.ExecResults: + @classmethod + def export_res(cls, tmp_path_factory, request) -> BaseTestFsMapper.ExecResults: """Run Pyrrha with export activated.""" runner = CliRunner() tmp_path = ( tmp_path_factory.mktemp("db", numbered=True) - / f"{self.SUBCOMMAND}-{request.param}-export.srctrldb" + / f"{cls.SUBCOMMAND}-{request.param}-export.srctrldb" ) args = [ - self.SUBCOMMAND, + cls.SUBCOMMAND, "-e", "--db", f"{tmp_path}", "-j", request.param, - f"{self.FW_TEST_PATH}", + f"{cls.FW_TEST_PATH}", ] - return self.ExecResults(res=runner.invoke(self.COMMAND, args), db_path=tmp_path) + return cls.ExecResults(res=runner.invoke(cls.COMMAND, args), db_path=tmp_path) # =================================== TESTS ======================================== @@ -350,7 +353,8 @@ def export_path(self) -> Path: # noqa: D102 # =============================== FIXTURES ========================================= @pytest.fixture(scope="class") - def pyrrha_exec(self, request, tmp_path_factory) -> BaseTestFsMapper.ExecResults: + @classmethod + def pyrrha_exec(cls, request, tmp_path_factory) -> BaseTestFsMapper.ExecResults: """Run pyrrha whith the given thread number and the given db path. 
Uses a subprocess (not CliRunner) because the Ghidra backend starts a @@ -358,22 +362,23 @@ def pyrrha_exec(self, request, tmp_path_factory) -> BaseTestFsMapper.ExecResults """ tmp_path = ( tmp_path_factory.mktemp("db", numbered=True) - / f"{self.SUBCOMMAND}-{request.param}.srctrldb" + / f"{cls.SUBCOMMAND}-{request.param}.srctrldb" ) args = [ - self.SUBCOMMAND, + cls.SUBCOMMAND, "--backend", f"{request.config.getoption('--backend')}", "--db", f"{tmp_path}", "-j", request.param, - f"{self.FW_TEST_PATH}", + f"{cls.FW_TEST_PATH}", ] - return self.ExecResults(res=run_pyrrha_subprocess(args), db_path=tmp_path) + return cls.ExecResults(res=run_pyrrha_subprocess(args), db_path=tmp_path) @pytest.fixture(scope="class") - def export_res(self, tmp_path_factory, request) -> BaseTestFsMapper.ExecResults: + @classmethod + def export_res(cls, tmp_path_factory, request) -> BaseTestFsMapper.ExecResults: """Run Pyrrha with export activated. Uses a subprocess (not CliRunner) because the Ghidra backend starts a @@ -381,19 +386,19 @@ def export_res(self, tmp_path_factory, request) -> BaseTestFsMapper.ExecResults: """ tmp_path = ( tmp_path_factory.mktemp("db", numbered=True) - / f"{self.SUBCOMMAND}-{request.param}-export.srctrldb" + / f"{cls.SUBCOMMAND}-{request.param}-export.srctrldb" ) args = [ - self.SUBCOMMAND, + cls.SUBCOMMAND, "--backend", f"{request.config.getoption('--backend')}", "--db", f"{tmp_path}", "-j", request.param, - f"{self.FW_TEST_PATH}", + f"{cls.FW_TEST_PATH}", ] - return self.ExecResults(res=run_pyrrha_subprocess(args), db_path=tmp_path) + return cls.ExecResults(res=run_pyrrha_subprocess(args), db_path=tmp_path) # =================================== TESTS ======================================== @@ -458,23 +463,25 @@ def _path_id(val): return str(val) return val - def _host_path(self, bin_path: Path) -> Path: + @classmethod + def _host_path(cls, bin_path: Path) -> Path: """:return: the on-host path of a firmware-relative binary path.""" - return self.FW_TEST_PATH / 
bin_path.relative_to(bin_path.anchor) + return cls.FW_TEST_PATH / bin_path.relative_to(bin_path.anchor) # =============================== FIXTURES ========================================= @pytest.fixture(scope="class") - def export_res(self, tmp_path_factory, request) -> "TestDecompMapper.ExecResults": + @classmethod + def export_res(cls, tmp_path_factory, request) -> "TestDecompMapper.ExecResults": """Run the decomp mapper with export activated on a single executable.""" bin_path: Path = request.param - executable = self._host_path(bin_path) + executable = cls._host_path(bin_path) tmp_path = ( tmp_path_factory.mktemp("db", numbered=True) - / f"{self.SUBCOMMAND}-{bin_path.name}.srctrldb" + / f"{cls.SUBCOMMAND}-{bin_path.name}.srctrldb" ) args = [ - self.SUBCOMMAND, + cls.SUBCOMMAND, "--backend", f"{request.config.getoption('--backend')}", "--db", @@ -482,10 +489,11 @@ def export_res(self, tmp_path_factory, request) -> "TestDecompMapper.ExecResults "--export", f"{executable}", ] - return self.ExecResults(res=run_pyrrha_subprocess(args), db_path=tmp_path) + return cls.ExecResults(res=run_pyrrha_subprocess(args), db_path=tmp_path) @pytest.fixture(scope="class") - def export_dump(self, export_res: "TestDecompMapper.ExecResults") -> ExportedDecompilation: + @classmethod + def export_dump(cls, export_res: "TestDecompMapper.ExecResults") -> ExportedDecompilation: """Load the JSON export into an ExportedDecompilation object.""" return ExportedDecompilation.from_json_export(export_res.export_path)