diff --git a/CMakeLists.txt b/CMakeLists.txt index f715c38..e72f62a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -170,6 +170,26 @@ add_executable(test_aggregate_drop_budget tests/unit/flow/test_aggregate_drop_bu target_link_libraries(test_aggregate_drop_budget PRIVATE openpenny) add_test(NAME aggregate_drop_budget COMMAND test_aggregate_drop_budget) +add_executable(test_terminal_snapshot_resolution tests/unit/flow/test_terminal_snapshot_resolution.cpp) +target_link_libraries(test_terminal_snapshot_resolution PRIVATE openpenny) +add_test(NAME terminal_snapshot_resolution COMMAND test_terminal_snapshot_resolution) + +add_executable(test_aggregate_pending_resolution tests/unit/flow/test_aggregate_pending_resolution.cpp) +target_link_libraries(test_aggregate_pending_resolution PRIVATE openpenny) +add_test(NAME aggregate_pending_resolution COMMAND test_aggregate_pending_resolution) + +add_executable(test_aggregate_freeze_at_drop_limit tests/unit/flow/test_aggregate_freeze_at_drop_limit.cpp) +target_link_libraries(test_aggregate_freeze_at_drop_limit PRIVATE openpenny) +add_test(NAME aggregate_freeze_at_drop_limit COMMAND test_aggregate_freeze_at_drop_limit) + +add_executable(test_aggregate_duplicate_fallback tests/unit/flow/test_aggregate_duplicate_fallback.cpp) +target_link_libraries(test_aggregate_duplicate_fallback PRIVATE openpenny) +add_test(NAME aggregate_duplicate_fallback COMMAND test_aggregate_duplicate_fallback) + +add_executable(test_flow_evaluation_phase_gate tests/unit/flow/test_flow_evaluation_phase_gate.cpp) +target_link_libraries(test_flow_evaluation_phase_gate PRIVATE openpenny) +add_test(NAME flow_evaluation_phase_gate COMMAND test_flow_evaluation_phase_gate) + add_executable(test_cli_options tests/unit/cli/test_cli_options.cpp) target_link_libraries(test_cli_options PRIVATE openpenny) add_test(NAME cli_options COMMAND test_cli_options) @@ -178,6 +198,10 @@ add_executable(test_traffic_match tests/unit/net/test_traffic_match.cpp) target_link_libraries(test_traffic_match PRIVATE openpenny) add_test(NAME traffic_match COMMAND test_traffic_match) +add_executable(test_packet_parser tests/unit/net/test_packet_parser.cpp) +target_link_libraries(test_packet_parser PRIVATE openpenny) +add_test(NAME packet_parser COMMAND test_packet_parser) + add_executable(test_control_planner tests/unit/control/test_control_planner.cpp) target_link_libraries(test_control_planner PRIVATE openpenny) add_test(NAME control_planner COMMAND test_control_planner) diff --git a/include/openpenny/agg/FlowKey.h b/include/openpenny/agg/FlowKey.h new file mode 100644 index 0000000..4d3e295 --- /dev/null +++ b/include/openpenny/agg/FlowKey.h @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: BSD-2-Clause + +#pragma once + +#include +#include +#include +#include + +namespace openpenny { + +struct FlowKey { + /** + * @brief Protocol-aware flow tuple in host byte order. + * + * Encodes IPv4 source/destination, L4 ports, and the IPv4 protocol + * number so TCP/UDP traffic with the same addresses/ports do not + * alias to the same key. + */ + std::uint32_t src{0}; + std::uint32_t dst{0}; + std::uint16_t sport{0}; + std::uint16_t dport{0}; + std::uint8_t ip_proto{0}; + + bool operator==(const FlowKey& o) const noexcept { + return src == o.src && + dst == o.dst && + sport == o.sport && + dport == o.dport && + ip_proto == o.ip_proto; + } +}; + +struct FlowKeyHash { + /** + * @brief Mix all FlowKey fields into a single hash using 64-bit avalanching. + */ + std::size_t operator()(const FlowKey& k) const noexcept { + const std::uint64_t addr_pair = + (static_cast(k.src) << 32) | k.dst; + const std::uint64_t ports_proto = + (static_cast(k.sport) << 24) | + (static_cast(k.dport) << 8) | + static_cast(k.ip_proto); + + std::uint64_t v = + addr_pair ^ (ports_proto + 0x9e3779b97f4a7c15ULL + + (addr_pair << 6) + (addr_pair >> 2)); + v ^= (v >> 33); + v *= 0xff51afd7ed558ccdULL; + v ^= (v >> 33); + v *= 0xc4ceb9fe1a85ec53ULL; + v ^= (v >> 33); + return static_cast(v); + } +}; + +template +using FlowMap = std::unordered_map; + +using FlowSet = std::unordered_set; + +} // namespace openpenny diff --git a/include/openpenny/agg/Stats.h b/include/openpenny/agg/Stats.h index e735282..7fe61a8 100644 --- a/include/openpenny/agg/Stats.h +++ b/include/openpenny/agg/Stats.h @@ -5,41 +5,18 @@ * @file Stats.h * @brief Per-flow and aggregated statistics with a striped hash table. */ +#include "openpenny/agg/FlowKey.h" + #include #include #include #include -#include #include #include #include namespace openpenny { -struct FlowKey { - /** - * @brief Tuple identifying a TCP/UDP flow in host byte order. - */ - uint32_t src; uint32_t dst; uint16_t sport; uint16_t dport; - bool operator==(const FlowKey& o) const noexcept { - return src==o.src && dst==o.dst && sport==o.sport && dport==o.dport; - } -}; - -struct FlowKeyHash { - /** - * @brief Mix all FlowKey fields into a single hash using 64-bit avalanching. - */ - size_t operator()(const FlowKey& k) const noexcept { - uint64_t v = (static_cast(k.src) << 32) ^ k.dst; - v ^= (static_cast(k.sport) << 16) ^ k.dport; - v ^= (v >> 33); v *= 0xff51afd7ed558ccdULL; - v ^= (v >> 33); v *= 0xc4ceb9fe1a85ec53ULL; - v ^= (v >> 33); - return static_cast(v); - } -}; - /** * @brief Per-flow counters that mirror the BPF-side stats exposed to users. */ @@ -91,7 +68,7 @@ class FlowTable { private: struct Shard { mutable std::shared_mutex mutex; - std::unordered_map map; + FlowMap map; }; std::vector shards_; FlowKeyHash hash_; diff --git a/include/openpenny/app/core/ActiveTestPipeline.h b/include/openpenny/app/core/ActiveTestPipeline.h index ae736f6..266fb37 100644 --- a/include/openpenny/app/core/ActiveTestPipeline.h +++ b/include/openpenny/app/core/ActiveTestPipeline.h @@ -168,8 +168,17 @@ class ActiveTestPipelineRunner : public IPipelineStrategy { /** Expire idle flows based on configured timeout. */ void expire_idle_flows(const std::chrono::steady_clock::time_point& now); - /** Sweep pending snapshots and expire those past timeout. */ - void sweep_expired_snapshots(const std::chrono::steady_clock::time_point& now); + /** Return true once the aggregate phase has completed and per-flow tests may run. */ + bool individual_flow_evaluation_enabled() const; + + /** Evaluate already-tracked flows once per-flow testing becomes active. */ + void evaluate_individual_flows_if_enabled(); + + /** Complete terminal flows once all pending drop snapshots are resolved. */ + void complete_resolved_terminal_flows(); + + /** Complete a flow and preserve a printable closed-loop summary if applicable. */ + void complete_flow_with_summary(const FlowKey& key, const char* reason); // ------------------------------------------------------------------------- // Member state @@ -232,6 +241,8 @@ class ActiveTestPipelineRunner : public IPipelineStrategy { */ std::size_t total_pkts_forwarded_{0}; std::size_t total_forward_errors_{0}; + std::vector closed_loop_flow_summaries_; + std::vector duplicate_exceeded_flow_summaries_; /** * Last time we logged global stats (prevents log flooding). diff --git a/include/openpenny/app/core/DropCollectorBinding.h b/include/openpenny/app/core/DropCollectorBinding.h index 2e26416..a20dea6 100644 --- a/include/openpenny/app/core/DropCollectorBinding.h +++ b/include/openpenny/app/core/DropCollectorBinding.h @@ -5,34 +5,24 @@ #include "openpenny/app/core/OpenpennyPipelineDriver.h" #include "openpenny/agg/Stats.h" -#include #include -#include - -namespace openpenny::penny { -class FlowEngine; -} +#include +#include namespace openpenny::app { /** - * @brief Maintains FlowEngine -> DropCollector bindings and installs the - * snapshot hook so drop events are mirrored into the shared collector. + * @brief Mirrors per-flow drop snapshots into the shared collector. + * + * New drops are inserted one at a time via upsert(). Snapshot state changes + * that affect a suffix of the per-flow snapshot vector (duplicate/rtx/expire) + * are mirrored via refresh_from() so the collector can rescan the already + * contiguous, append-only snapshot storage directly. */ class DropCollectorBinding { public: static DropCollectorBinding& instance(); - // Ensure the global timer snapshot hook is installed exactly once. - void ensure_snapshot_hook(); - - void bind(penny::FlowEngine* flow, - DropCollectorPtr collector, - const std::string& thread_name, - std::size_t shard_index); - - void unbind(penny::FlowEngine* flow); - void upsert(DropCollectorPtr collector, const std::string& thread_name, std::size_t shard_index, @@ -40,23 +30,23 @@ class DropCollectorBinding { penny::PacketDropId packet_id, const penny::PacketDropSnapshot& snap); -private: - struct BindingContext { - DropCollectorPtr collector; - std::string thread_name; - std::size_t shard_index{0}; - }; + void refresh_from( + DropCollectorPtr collector, + const std::string& thread_name, + std::size_t shard_index, + const FlowKey& key, + const std::vector>& snapshots, + std::size_t start_index); +private: DropCollectorBinding() = default; - BindingContext lookup(penny::FlowEngine* flow) const; - void upsert_locked(const BindingContext& binding, + + void upsert_locked(DropCollector& collector, + DropCollector::Shard& shard, + const std::string& thread_name, const FlowKey& key, penny::PacketDropId packet_id, const penny::PacketDropSnapshot& snap); - - mutable std::mutex mtx_; - std::once_flag hook_once_; - std::unordered_map bindings_; }; } // namespace openpenny::app diff --git a/include/openpenny/app/core/OpenpennyPipelineDriver.h b/include/openpenny/app/core/OpenpennyPipelineDriver.h index e8bca0b..17e7d27 100644 --- a/include/openpenny/app/core/OpenpennyPipelineDriver.h +++ b/include/openpenny/app/core/OpenpennyPipelineDriver.h @@ -3,7 +3,7 @@ #pragma once #include "openpenny/config/Config.h" -#include "openpenny/agg/Stats.h" +#include "openpenny/agg/FlowKey.h" #include "openpenny/egress/PacketSink.h" #include "openpenny/penny/flow/state/PennySnapshot.h" #include "openpenny/penny/flow/state/PacketDropId.h" @@ -124,6 +124,10 @@ struct DropCollector { std::atomic accepting{true}; std::size_t shard_count{1}; + std::size_t snapshot_limit{0}; + std::atomic accepted_snapshot_count{0}; + mutable std::mutex frozen_aggregate_counters_mtx; + std::optional frozen_aggregate_counters; std::array shards{}; std::size_t clamp_shard_index(std::size_t idx) const noexcept { @@ -160,10 +164,13 @@ struct ModeResult { std::size_t flows_tracked_data = 0; bool penny_completed = false; // True when Penny heuristics triggered shutdown. bool aggregates_penny_completed = false; // Flag representing aggregate Penny status. + bool closed_loop_stop_hit = false; // True when the configured min_closed_loop_flows threshold was observed. // Passive-mode gap summary. std::size_t passive_flows_with_open_gaps = 0; std::size_t passive_open_gaps = 0; std::vector passive_gap_summaries; + std::vector closed_loop_flow_summaries; + std::vector duplicate_exceeded_flow_summaries; std::size_t passive_flows_rst = 0; std::size_t passive_flows_syn_only = 0; std::size_t passive_flows_finished = 0; diff --git a/include/openpenny/app/core/PassiveTestPipeline.h b/include/openpenny/app/core/PassiveTestPipeline.h index 64d2df7..26167d4 100644 --- a/include/openpenny/app/core/PassiveTestPipeline.h +++ b/include/openpenny/app/core/PassiveTestPipeline.h @@ -2,7 +2,7 @@ #pragma once -#include "openpenny/agg/Stats.h" +#include "openpenny/agg/FlowKey.h" #include "openpenny/app/core/OpenpennyPipelineDriver.h" #include "openpenny/app/core/PipelineRunner.h" #include "openpenny/config/Config.h" @@ -13,8 +13,6 @@ #include #include #include -#include -#include #include namespace openpenny { @@ -75,17 +73,19 @@ class PassiveTestPipelineRunner : public IPipelineStrategy { void finalize(ModeResult& result) override; private: + void reserve_for_config(); + const Config& cfg_; const PipelineOptions& opts_; FlowMatcher matcher_; net::PacketSourcePtr source_; - std::unordered_map flows_; + FlowMap flows_; std::chrono::steady_clock::time_point start_time_{std::chrono::steady_clock::now()}; std::size_t flows_seen_{0}; std::size_t flows_finished_{0}; std::vector finished_flows_; - std::unordered_map finished_index_; - std::unordered_set finished_keys_; + FlowMap finished_index_; + FlowSet finished_keys_; bool stop_grace_active_{false}; std::chrono::steady_clock::time_point stop_grace_start_{}; bool stop_requested_{false}; diff --git a/include/openpenny/app/core/PerThreadStats.h b/include/openpenny/app/core/PerThreadStats.h index d897da4..b3fe671 100644 --- a/include/openpenny/app/core/PerThreadStats.h +++ b/include/openpenny/app/core/PerThreadStats.h @@ -6,7 +6,7 @@ #include #include -#include "openpenny/agg/Stats.h" // for FlowKey +#include "openpenny/agg/FlowKey.h" #include "openpenny/penny/flow/state/PacketDropId.h" namespace openpenny::app { diff --git a/include/openpenny/app/core/RuntimeSetup.h b/include/openpenny/app/core/RuntimeSetup.h index dc99db9..0b624d4 100644 --- a/include/openpenny/app/core/RuntimeSetup.h +++ b/include/openpenny/app/core/RuntimeSetup.h @@ -18,4 +18,13 @@ const RuntimeSetupSnapshot& current_runtime_setup(); // Mutable view for helpers that need to update status fields. RuntimeSetupSnapshot& runtime_setup_mutable(); +bool current_aggregates_active() noexcept; +void set_current_aggregates_active(bool value) noexcept; + +RuntimeStatus::AggregatesStatus current_aggregates_status() noexcept; +void set_current_aggregates_status(RuntimeStatus::AggregatesStatus status) noexcept; + +bool current_has_aggregate_eval() noexcept; +void set_current_has_aggregate_eval(bool value) noexcept; + } // namespace openpenny diff --git a/include/openpenny/egress/PacketSink.h b/include/openpenny/egress/PacketSink.h index 45d56e8..6f68534 100644 --- a/include/openpenny/egress/PacketSink.h +++ b/include/openpenny/egress/PacketSink.h @@ -142,9 +142,9 @@ class PacketSink { * @brief Emit a parsed packet. Must be thread-safe. * * Returns true on a successful write, false on any error. Transient - * EAGAIN/EWOULDBLOCK are counted as errors==0 (pipeline drops the - * packet) because the pipeline is not responsible for reliable - * delivery -- it's a passive mirror. + * EAGAIN/EWOULDBLOCK still mean the packet was dropped; sinks may count + * those in stats_.errors as backpressure-induced loss so operators can + * distinguish real reinjection congestion from intentional Penny drops. */ virtual bool write(const net::PacketView& packet) = 0; diff --git a/include/openpenny/egress/RawNicSink.h b/include/openpenny/egress/RawNicSink.h index e75bd23..9d5c1dc 100644 --- a/include/openpenny/egress/RawNicSink.h +++ b/include/openpenny/egress/RawNicSink.h @@ -16,6 +16,10 @@ #include "openpenny/egress/PacketSink.h" +#include +#include +#include + namespace openpenny::egress { class RawNicSink : public PacketSink { @@ -30,9 +34,15 @@ class RawNicSink : public PacketSink { EgressKind kind() const noexcept override { return EgressKind::RawNic; } private: + int open_socket_fd(bool resolve_ifindex, bool log_failures); + int thread_fd(); + EgressConfig cfg_{}; int fd_ = -1; int if_index_ = -1; ///< Cached ifindex for sendto(2). + std::mutex fds_mtx_; + std::vector additional_fds_; + std::atomic backpressure_logged_{false}; }; } // namespace openpenny::egress diff --git a/include/openpenny/egress/RawSocketSink.h b/include/openpenny/egress/RawSocketSink.h index 1b9754f..e8427a7 100644 --- a/include/openpenny/egress/RawSocketSink.h +++ b/include/openpenny/egress/RawSocketSink.h @@ -15,6 +15,8 @@ #include "openpenny/egress/PacketSink.h" #include +#include +#include namespace openpenny::egress { @@ -30,8 +32,14 @@ class RawSocketSink : public PacketSink { EgressKind kind() const noexcept override { return EgressKind::RawSocket; } private: + int open_socket_fd(bool log_failures); + int thread_fd(); + EgressConfig cfg_{}; int fd_ = -1; + std::mutex fds_mtx_; + std::vector additional_fds_; + std::atomic backpressure_logged_{false}; /// Latched once we have logged the first EMSGSIZE failure. The kernel /// returns EMSGSIZE for any IP datagram larger than the egress /// interface MTU (raw sockets cannot fragment), and on a busy diff --git a/include/openpenny/egress/TunSink.h b/include/openpenny/egress/TunSink.h index e261cd3..e194966 100644 --- a/include/openpenny/egress/TunSink.h +++ b/include/openpenny/egress/TunSink.h @@ -15,6 +15,7 @@ #include "openpenny/egress/PacketSink.h" +#include #include #include @@ -54,6 +55,7 @@ class TunSink : public PacketSink { /// the `thread_local` cache are lock-free after the first call. std::mutex fds_mtx_; std::vector additional_fds_; + std::atomic backpressure_logged_{false}; }; } // namespace openpenny::egress diff --git a/include/openpenny/net/Packet.h b/include/openpenny/net/Packet.h index 218e5e1..35e7503 100644 --- a/include/openpenny/net/Packet.h +++ b/include/openpenny/net/Packet.h @@ -2,7 +2,7 @@ #pragma once -#include "openpenny/agg/Stats.h" // for FlowKey +#include "openpenny/agg/FlowKey.h" #include "openpenny/dataplane/Session.h" #include "openpenny/penny/flow/state/PacketDropId.h" @@ -104,9 +104,9 @@ struct TcpHeaderView { * All pointers into the packet buffer are valid only during the handler call. */ struct PacketView { - FlowKey flow{}; ///< Flow identifier (5-tuple or 4-tuple depending on source). + FlowKey flow{}; ///< Protocol-aware flow identifier (IPv4 src/dst, L4 ports, IP proto). TcpHeaderView tcp{}; ///< Minimal parsed TCP header subset. - uint8_t ip_proto{0}; ///< IPv4 protocol number (TCP=6, UDP=17, etc.). + uint8_t ip_proto{0}; ///< IPv4 protocol number (TCP=6, UDP=17, etc.); mirrors flow.ip_proto. uint64_t payload_bytes{0}; ///< L4 payload length (0 for pure ACKs or empty payloads). uint64_t timestamp_ns{0}; ///< Packet capture timestamp in nanoseconds. diff --git a/include/openpenny/penny/flow/engine/FlowEngine.h b/include/openpenny/penny/flow/engine/FlowEngine.h index 5d59cff..f328c0a 100644 --- a/include/openpenny/penny/flow/engine/FlowEngine.h +++ b/include/openpenny/penny/flow/engine/FlowEngine.h @@ -44,6 +44,10 @@ class FlowEngine { using DropSnapshotSink = std::function; + using SnapshotRefreshSink = std::function>&, + std::size_t start_index)>; /// High-level decision / outcome for this flow. enum class FlowDecision { @@ -152,10 +156,10 @@ class FlowEngine { // Flow identity // --------------------------------------------------------------------- - /// Attach the 5-tuple (or equivalent) key to this flow. + /// Attach the protocol-aware flow key to this flow. void set_flow_key(const FlowKey& key) noexcept { flow_key_ = key; } - /// Return the flow key (5-tuple) associated with this FlowEngine. + /// Return the protocol-aware flow key associated with this FlowEngine. FlowKey flow_key() const noexcept { return flow_key_; } // --------------------------------------------------------------------- @@ -187,6 +191,9 @@ class FlowEngine { /// Install a sink to receive drop snapshots as they are created. void set_drop_sink(DropSnapshotSink sink); + /// Install a sink to mirror in-place snapshot updates from a given suffix onward. + void set_snapshot_refresh_sink(SnapshotRefreshSink sink); + // --------------------------------------------------------------------- // Sequence interval classification // --------------------------------------------------------------------- @@ -335,6 +342,9 @@ class FlowEngine { /// Mark all pending snapshots as expired (used on shutdown/cleanup). void expire_all_pending_snapshots(); + /// Resolve pending snapshots at teardown using the configured timeout. + void resolve_pending_snapshots(const std::chrono::steady_clock::time_point& now); + private: /** * @brief Compute the final classification decision for this flow based on @@ -342,6 +352,12 @@ class FlowEngine { */ FlowDecision evaluate() const; + /// Mirror snapshot updates affecting [start_index, end) to any external collector. + void publish_snapshot_refresh(std::size_t start_index); + + /// Publish a single-snapshot update when no bulk refresh sink is installed. + void publish_single_snapshot_update(PacketDropId packet_id, std::size_t snapshot_index); + // --------------------------------------------------------------------- // Internal gap bookkeeping structures // --------------------------------------------------------------------- @@ -383,6 +399,7 @@ class FlowEngine { /// Mapping from snapshot packet_id to its index in flow_drop_snapshots_. std::unordered_map flow_snapshot_index_by_id_; DropSnapshotSink drop_sink_{}; + SnapshotRefreshSink snapshot_refresh_sink_{}; /** * @brief Shared liveness flag observed by timer entries. @@ -432,7 +449,7 @@ class FlowEngine { // Flow identity // --------------------------------------------------------------------- - FlowKey flow_key_{}; ///< 5-tuple (or equivalent) identifying this flow. + FlowKey flow_key_{}; ///< Protocol-aware tuple identifying this flow. }; } // namespace openpenny::penny diff --git a/include/openpenny/penny/flow/manager/ThreadFlowManager.h b/include/openpenny/penny/flow/manager/ThreadFlowManager.h index 488f368..27a22ec 100644 --- a/include/openpenny/penny/flow/manager/ThreadFlowManager.h +++ b/include/openpenny/penny/flow/manager/ThreadFlowManager.h @@ -2,14 +2,13 @@ #pragma once +#include "openpenny/agg/FlowKey.h" #include "openpenny/penny/flow/engine/FlowEngine.h" #include "openpenny/penny/flow/state/PennyStats.h" #include "openpenny/net/Packet.h" #include "openpenny/app/core/PerThreadStats.h" #include -#include -#include #include #include #include @@ -125,18 +124,20 @@ class ThreadFlowManager { * @param is_syn True if the first packet carried a SYN flag. * @param ts Timestamp of the first packet (for data timing). * - * @return true if a new flow entry was inserted, false if the flow already existed - * or had been monitored before. + * @return pointer to the new flow entry when inserted, nullptr otherwise. */ - bool add_new_flow(const FlowKey& key, - uint32_t seq, - uint32_t payload_bytes, - bool is_syn, - const std::chrono::steady_clock::time_point& ts); + FlowEngineEntry* add_new_flow(const FlowKey& key, + uint32_t seq, + uint32_t payload_bytes, + bool is_syn, + const std::chrono::steady_clock::time_point& ts); /// Install a sink that receives drop snapshots from all managed FlowEngines. void set_drop_sink(FlowEngine::DropSnapshotSink sink); + /// Install a sink that mirrors in-place snapshot updates from managed FlowEngines. + void set_snapshot_refresh_sink(FlowEngine::SnapshotRefreshSink sink); + /** * @brief Update or create the FlowEngine entry corresponding to a packet. * @@ -254,6 +255,8 @@ class ThreadFlowManager { } private: + void reserve_for_config(const Config::ActiveConfig& cfg); + /** * @brief Count how many flows are currently considered "active". * @@ -279,12 +282,13 @@ class ThreadFlowManager { PennyStats stats_{}; /// Map from flow key to the corresponding FlowEngineEntry for active or tracked flows. - std::unordered_map table_active_flows_; + FlowMap table_active_flows_; /// Set of flow keys that have already been fully processed / completed. - std::unordered_set table_completed_flows_; + FlowSet table_completed_flows_; FlowEngine::DropSnapshotSink drop_sink_{}; + FlowEngine::SnapshotRefreshSink snapshot_refresh_sink_{}; }; } // namespace openpenny::penny diff --git a/include/openpenny/penny/flow/timer/ThreadFlowEventTimer.h b/include/openpenny/penny/flow/timer/ThreadFlowEventTimer.h index 516cbb4..53a2340 100644 --- a/include/openpenny/penny/flow/timer/ThreadFlowEventTimer.h +++ b/include/openpenny/penny/flow/timer/ThreadFlowEventTimer.h @@ -2,19 +2,18 @@ #pragma once -#include "openpenny/agg/Stats.h" // for FlowKey +#include "openpenny/agg/FlowKey.h" #include "openpenny/penny/flow/state/PacketDropId.h" #include #include -#include #include #include +#include #include #include #include #include -#include #include #include #include @@ -30,34 +29,34 @@ class FlowEngine; * * High-level design * ----------------- - * - A single background thread runs timer_loop(). - * - Packet-processing threads never mutate FlowEngine snapshots directly. Instead, they: + * - Each worker thread owns a thread-local manager instance. + * - Packet-processing code never mutates FlowEngine snapshots directly from nested + * helper paths. Instead, it: * * register drops (with deadlines), * * enqueue retransmission / duplicate events. - * - The timer thread: + * - The worker periodically calls drain_callbacks(), which: * * pops expired entries from a min-heap, * * consumes queued events, * * turns them into callbacks, - * * and executes those callbacks itself (without holding the manager mutex). + * * and executes those callbacks on the same worker thread. * * As a result: - * - All snapshot mutations are single-threaded (in the timer thread). - * - The packet path stays lightweight and avoids locking around FlowEngine state. + * - All snapshot mutations stay on the queue worker that owns the flow. + * - We avoid one extra timer thread and the associated context switching per queue. */ class ThreadFlowEventTimerManager { public: /** * @brief Access the thread-local timer manager instance. * - * Each packet-processing thread gets its own manager (and timer thread), - * so queues are isolated. + * Each packet-processing thread gets its own manager, so queues are isolated. */ static ThreadFlowEventTimerManager& instance(); ~ThreadFlowEventTimerManager(); /** - * @brief Start the timer thread with a given drop timeout. + * @brief Initialise the per-thread timer state with a given drop timeout. * * @param timeout_sec Timeout in seconds after which an un-repaired drop snapshot * is considered expired. @@ -65,7 +64,7 @@ class ThreadFlowEventTimerManager { void start(double timeout_sec); /** - * @brief Stop the timer thread and flush internal state. + * @brief Stop and flush internal state. * * Safe to call multiple times; subsequent calls after the first have no effect. */ @@ -94,8 +93,8 @@ class ThreadFlowEventTimerManager { /** * @brief Queue an asynchronous "retransmitted" event from the packet path. * - * The timer thread will later convert this into a callback that updates - * the relevant snapshot in the owning FlowEngine. + * The owning worker thread will later convert this into a callback that + * updates the relevant snapshot in the owning FlowEngine. */ void enqueue_retransmitted(PacketDropId packet_id, FlowEngine* flow); @@ -116,26 +115,10 @@ class ThreadFlowEventTimerManager { void purge_flow(FlowEngine* flow); /** - * @brief Optional manual draining of callbacks. - * - * Historically used when callbacks were executed from the packet-processing - * thread; kept for compatibility. In the current design, the timer thread - * is responsible for draining and executing callbacks via run_callbacks(). + * @brief Drain due expirations and queued events on the current worker thread. */ void drain_callbacks(); - enum class SnapshotEventKind { Expire, Retransmit, Duplicate }; - - /** - * @brief Install a hook invoked after a snapshot event is applied. - * - * The hook runs in the packet-processing thread context when callbacks - * are drained. - */ - static void set_snapshot_hook(std::function hook); - private: // --------------------------------------------------------------------- // Internal helper types @@ -187,7 +170,7 @@ class ThreadFlowEventTimerManager { }; /** - * @brief Event generated by the packet path and consumed by the timer thread. + * @brief Event generated by the packet path and consumed by drain_callbacks(). * * These events are cheap to enqueue in the packet-processing context and * later turned into callbacks against FlowEngine. @@ -206,7 +189,7 @@ class ThreadFlowEventTimerManager { }; /** - * @brief Callback to be executed against FlowEngine by the timer thread. + * @brief Callback to be executed against FlowEngine on the worker thread. * * This is the only place where snapshots and FlowEngine state are mutated. */ @@ -227,25 +210,25 @@ class ThreadFlowEventTimerManager { ThreadFlowEventTimerManager(const ThreadFlowEventTimerManager&) = delete; ThreadFlowEventTimerManager& operator=(const ThreadFlowEventTimerManager&) = delete; - // Main thread loop: waits for timers or events, then processes them. - void timer_loop(); - - // Notify the timer thread that new timers/events are available (mutex_ held). - void wake_locked(); - // Run and clear the callbacks in @p pending, without holding mutex_. void run_callbacks(std::deque& pending); + // Collect all due expirations and queued events into @p pending (mutex_ held). + void collect_ready_callbacks(std::deque& pending, + const std::chrono::steady_clock::time_point& now); + + // Discard cancelled heap entries and refresh the lock-free earliest-deadline hint (mutex_ held). + void refresh_next_deadline_locked(); + // --------------------------------------------------------------------- // Synchronisation / thread state // --------------------------------------------------------------------- std::mutex mutex_; - std::condition_variable cv_; - std::thread thread_; + using DeadlineRep = std::chrono::steady_clock::duration::rep; + static constexpr DeadlineRep kNoDeadline = std::numeric_limits::max(); - bool running_{false}; ///< True once the timer thread has been started. - bool stop_flag_{false}; ///< Set to request shutdown of the timer thread. + bool running_{false}; ///< True once start() has initialised this worker-local manager. double timeout_sec_{0.0}; std::uint64_t next_token_{1}; @@ -260,7 +243,7 @@ class ThreadFlowEventTimerManager { std::unordered_map by_id_; /// Record of flow+packet_id pairs already handled as retransmitted. - std::vector retransmit_seen_; + std::unordered_set retransmit_seen_; /// Map from FlowEngine* to active timer tokens (for bulk purge_flow()). std::unordered_multimap by_flow_; @@ -272,33 +255,20 @@ class ThreadFlowEventTimerManager { // Asynchronous events and callbacks // --------------------------------------------------------------------- - /// Events queued by the packet-processing path for the timer thread. + /// Events queued by the packet-processing path for drain_callbacks(). std::deque events_; /** - * @brief Pending callbacks to execute against FlowEngine. + * @brief Lock-free fast-path size of `events_`. * - * These are built while holding mutex_, but always executed by the timer - * thread via run_callbacks() without the lock, avoiding lock contention - * during snapshot updates. + * This lets drain_callbacks() skip taking mutex_ when there are no queued + * retransmit/duplicate events and no drop deadline has elapsed yet. */ - std::deque callbacks_; + std::atomic queued_event_count_{0}; - /** - * @brief Lock-free fast-path size of `callbacks_`. - * - * Every per-packet poll iteration on every worker calls - * `drain_callbacks()`. With many AF_XDP queue workers in busy-poll - * mode that adds up to millions of mutex acquires per second on - * `mutex_` even when no callbacks are pending. This counter lets - * `drain_callbacks()` skip the lock entirely on the common - * "nothing to drain" path. It is incremented under `mutex_` whenever - * we push to `callbacks_`, and reset to 0 inside `drain_callbacks()` - * after we swap the deque out. - */ - std::atomic pending_callbacks_{0}; + /// Lock-free hint for the earliest outstanding drop deadline. + std::atomic next_deadline_{kNoDeadline}; - static std::function snapshot_hook_; }; } // namespace openpenny::penny diff --git a/src/app/cli/penny_cli.cpp b/src/app/cli/penny_cli.cpp index 5aa4a66..08d270a 100644 --- a/src/app/cli/penny_cli.cpp +++ b/src/app/cli/penny_cli.cpp @@ -862,12 +862,8 @@ int main(int argc, char** argv) { // // End state: Passive pipeline completed (flows=42) if (result.active) { - const auto agg_snapshot = - (result.active->aggregates_snapshot - ? *result.active->aggregates_snapshot - : openpenny::app::aggregate_counters()); - const auto agg_live = openpenny::app::aggregate_counters(); + const auto& agg_snapshot = agg_live; const auto runtime = openpenny::current_runtime_setup(); const bool is_passive = @@ -892,6 +888,15 @@ int main(int argc, char** argv) { result.aggregates_enabled && runtime.aggregates_status != openpenny::RuntimeStatus::AggregatesStatus::PENDING; + const std::uint64_t closed_loop_flows_observed = std::max( + agg_snapshot.flows_closed_loop, + agg_live.flows_closed_loop); + const std::uint64_t closed_loop_flows_found = std::max( + closed_loop_flows_observed, + result.active->closed_loop_flow_summaries.size()); + const std::uint64_t duplicate_exceeded_flows_found = std::max( + agg_snapshot.flows_duplicates_exceeded, + result.active->duplicate_exceeded_flow_summaries.size()); // --- Run --------------------------------------------------------- print_section(std::cout, "Run"); @@ -1020,40 +1025,84 @@ int main(int argc, char** argv) { agg_snapshot.flows_duplicates_exceeded); } - // --- Per-flow detail (passive only, if any) ---------------------- + // --- Per-flow detail --------------------------------------------- if (is_passive && !result.active->passive_gap_summaries.empty()) { print_section(std::cout, "Per-flow detail"); for (const auto& g : result.active->passive_gap_summaries) { std::cout << " " << g << "\n"; } } + if (!is_passive && !result.active->closed_loop_flow_summaries.empty()) { + print_section(std::cout, "Closed-loop flows"); + for (const auto& s : result.active->closed_loop_flow_summaries) { + std::cout << " " << s << "\n"; + } + } + if (!is_passive && !result.active->duplicate_exceeded_flow_summaries.empty()) { + print_section(std::cout, "Duplicate-exceeded flows"); + for (const auto& s : result.active->duplicate_exceeded_flow_summaries) { + std::cout << " " << s << "\n"; + } + } // --- End state --------------------------------------------------- - std::ostringstream end_state; + std::ostringstream end_state_primary; + std::ostringstream end_state_closed_loop_suffix; + std::ostringstream end_state_duplicate_suffix; const char* end_color = ""; + const char* closed_loop_suffix_color = ""; + const char* duplicate_suffix_color = ""; if (!is_passive && agg_done) { - end_state << "Aggregates completed (" << agg_status_str << ")"; + end_state_primary << "Aggregates completed (" << agg_status_str << ")"; + if (closed_loop_flows_found > 0) { + end_state_closed_loop_suffix << ", found " << fmt_count(closed_loop_flows_found) + << " closed-loop flow" + << (closed_loop_flows_found == 1 ? "" : "s"); + closed_loop_suffix_color = kAnsiBlue; + } + if (duplicate_exceeded_flows_found > 0) { + end_state_duplicate_suffix << ", found " + << fmt_count(duplicate_exceeded_flows_found) + << " duplicate-exceeded flow" + << (duplicate_exceeded_flows_found == 1 ? "" : "s"); + duplicate_suffix_color = kAnsiYellow; + } end_color = color_for_agg_status(agg_status_str); } else if (result.active->penny_completed) { if (is_passive) { - end_state << "Passive pipeline completed (flows=" - << result.active->passive_flows_finished << ")"; + end_state_primary << "Passive pipeline completed (flows=" + << result.active->passive_flows_finished << ")"; end_color = kAnsiGreen; } else { - end_state << "Penny heuristics completed"; + end_state_primary << "Penny heuristics completed"; + if (closed_loop_flows_found > 0) { + end_state_closed_loop_suffix << ", found " << fmt_count(closed_loop_flows_found) + << " closed-loop flow" + << (closed_loop_flows_found == 1 ? "" : "s"); + closed_loop_suffix_color = kAnsiBlue; + } + if (duplicate_exceeded_flows_found > 0) { + end_state_duplicate_suffix << ", found " + << fmt_count(duplicate_exceeded_flows_found) + << " duplicate-exceeded flow" + << (duplicate_exceeded_flows_found == 1 ? "" : "s"); + duplicate_suffix_color = kAnsiYellow; + } end_color = kAnsiGreen; } } else if (g_stop_requested != 0) { - end_state << "Stopped via signal (Ctrl+C)"; + end_state_primary << "Stopped via signal (Ctrl+C)"; end_color = kAnsiYellow; } else { - end_state << "Reader/pipeline error (see logs)"; + end_state_primary << "Reader/pipeline error (see logs)"; end_color = kAnsiRed; } std::cout << "\n" << ansi(kAnsiBold) << "End state:" << ansi(kAnsiReset) << " " - << ansi(end_color) << end_state.str() << ansi(kAnsiReset) + << ansi(end_color) << end_state_primary.str() << ansi(kAnsiReset) + << ansi(closed_loop_suffix_color) << end_state_closed_loop_suffix.str() << ansi(kAnsiReset) + << ansi(duplicate_suffix_color) << end_state_duplicate_suffix.str() << ansi(kAnsiReset) << "\n"; } else { // No active result usually means no packets were processed or the @@ -1071,4 +1120,4 @@ int main(int argc, char** argv) { // of the forwarding fd is needed here any more. run_detach_command(); return 0; -} \ No newline at end of file +} diff --git a/src/app/core/AggregatesController.cpp b/src/app/core/AggregatesController.cpp index c5cf349..7b3108d 100644 --- a/src/app/core/AggregatesController.cpp +++ b/src/app/core/AggregatesController.cpp @@ -16,10 +16,8 @@ DropCollector::TimestampRep snapshot_timestamp( return snap.timestamp.time_since_epoch().count(); } -void decorate_snapshot_record(DropSnapshotRecord& record, - const openpenny::app::AggregatedCounters& agg) { - record.counters = agg; - record.snapshot.stats.overwrite_from_aggregates(agg); +bool is_pending_snapshot(const penny::PacketDropSnapshot& snap) noexcept { + return snap.state == penny::SnapshotState::Pending; } void set_runtime_eval_counters(RuntimeStatus& runtime, @@ -46,9 +44,21 @@ void store_aggregate_snapshot_once( if (!snapshot_slot) snapshot_slot = agg; } -std::vector collect_all_drop_snapshots( - const DropCollector& collector, +std::optional collect_frozen_aggregate_counters( + const DropCollector& collector) { + std::lock_guard lock(collector.frozen_aggregate_counters_mtx); + return collector.frozen_aggregate_counters; +} + +penny::PennyStats make_eval_stats_from_aggregates( const openpenny::app::AggregatedCounters& agg) { + penny::PennyStats stats; + stats.overwrite_from_aggregates(agg); + return stats; +} + +std::vector collect_all_drop_snapshots( + const DropCollector& collector) { std::vector out; std::size_t total = 0; for (std::size_t shard_index = 0; shard_index < collector.shard_count; ++shard_index) { @@ -61,15 +71,11 @@ std::vector collect_all_drop_snapshots( std::lock_guard lock(shard.mtx); out.insert(out.end(), shard.snapshots.begin(), shard.snapshots.end()); } - for (auto& record : out) { - decorate_snapshot_record(record, agg); - } return out; } std::optional collect_latest_drop_snapshot( - const DropCollector& collector, - const openpenny::app::AggregatedCounters& agg) { + const DropCollector& collector) { std::size_t best_shard_index = 0; auto best_timestamp = DropCollector::kNoSnapshotTimestamp; for (std::size_t shard_index = 0; shard_index < collector.shard_count; ++shard_index) { @@ -96,7 +102,6 @@ std::optional collect_latest_drop_snapshot( if (latest_index < best_shard.snapshots.size()) { auto record = best_shard.snapshots[latest_index]; if (snapshot_timestamp(record.snapshot) == best_timestamp) { - decorate_snapshot_record(record, agg); return record; } } @@ -119,9 +124,6 @@ std::optional collect_latest_drop_snapshot( latest = *it; } } - if (latest) { - decorate_snapshot_record(*latest, agg); - } return latest; } @@ -141,6 +143,29 @@ CollectorSnapshotSummary summarize_collector_snapshots(const DropCollector& coll return summary; } +CollectorSnapshotSummary summarize_drop_snapshots( + const std::vector& snapshots) { + CollectorSnapshotSummary summary; + summary.snapshot_count = snapshots.size(); + summary.pending_snapshot_count = static_cast(std::count_if( + snapshots.begin(), + snapshots.end(), + [](const DropSnapshotRecord& record) { + return is_pending_snapshot(record.snapshot); + })); + return summary; +} + +bool aggregates_ready_for_evaluation(std::size_t required_drops, + std::size_t snapshot_count, + std::size_t pending_snapshot_count, + std::uint64_t pending_rtx_count) noexcept { + return required_drops > 0 && + snapshot_count >= required_drops && + pending_snapshot_count == 0 && + pending_rtx_count == 0; +} + } // namespace AggregatesController::AggregatesController(const Config& cfg, @@ -159,7 +184,11 @@ AggregatesController::AggregatesController(const Config& cfg, individual_limit_enabled_{opts.mode == PipelineOptions::Mode::Active && cfg.active.stop_after_individual_flows > 0}, min_closed_loop_enabled_{opts.mode == PipelineOptions::Mode::Active && - cfg.active.min_closed_loop_flows > 0} {} + cfg.active.min_closed_loop_flows > 0} { + if (collector_enabled_ && collector_) { + collector_->snapshot_limit = required_drops_; + } +} void AggregatesController::start() { if (collector_enabled_) { @@ -214,8 +243,7 @@ std::optional AggregatesController::aggregat void AggregatesController::populate_drop_snapshots(PipelineSummary& summary) const { if (!collector_) return; - const auto agg = openpenny::app::aggregate_counters(); - auto snaps = collect_all_drop_snapshots(*collector_, agg); + auto snaps = collect_all_drop_snapshots(*collector_); std::sort( snaps.begin(), snaps.end(), @@ -228,14 +256,33 @@ void AggregatesController::populate_drop_snapshots(PipelineSummary& summary) con void AggregatesController::evaluate_pending_if_needed(const Config& cfg, PipelineSummary& summary) { auto& runtime = runtime_setup_mutable(); + const auto snapshot_summary = summarize_drop_snapshots(summary.drop_snapshots); + const auto agg = openpenny::app::aggregate_counters(); + const auto frozen_agg = + collector_ ? collect_frozen_aggregate_counters(*collector_) : std::nullopt; + const auto pending_rtx_count = + frozen_agg ? frozen_agg->pending_retransmissions : agg.pending_retransmissions; + const bool ready = aggregates_ready_for_evaluation( + required_drops_, + snapshot_summary.snapshot_count, + snapshot_summary.pending_snapshot_count, + pending_rtx_count); if (!cfg.active.aggregates_enabled || - runtime.aggregates_status != RuntimeStatus::AggregatesStatus::PENDING || - !aggregates_ready_.load(std::memory_order_relaxed) || + current_aggregates_status() != RuntimeStatus::AggregatesStatus::PENDING || + !ready || summary.drop_snapshots.empty()) { return; } + aggregates_ready_.store(true, std::memory_order_relaxed); + if (frozen_agg) { + store_aggregate_snapshot_once(aggregates_snapshot_, aggregates_snapshot_mtx_, *frozen_agg); + } else { + store_aggregate_snapshot_once(aggregates_snapshot_, aggregates_snapshot_mtx_, agg); + } const auto& latest = summary.drop_snapshots.front(); - const auto& stats = latest.snapshot.stats; + const auto stats = frozen_agg + ? make_eval_stats_from_aggregates(*frozen_agg) + : latest.snapshot.stats; const auto miss_prob = std::clamp( cfg.active.retransmission_miss_probability, 0.0, @@ -245,14 +292,20 @@ void AggregatesController::evaluate_pending_if_needed(const Config& cfg, miss_prob, cfg.active.max_duplicate_fraction); if (eval.decision == penny::FlowEngine::FlowDecision::FINISHED_CLOSED_LOOP) { - runtime.aggregates_status = RuntimeStatus::AggregatesStatus::CLOSED_LOOP; + set_current_aggregates_status(RuntimeStatus::AggregatesStatus::CLOSED_LOOP); } else if (eval.decision == penny::FlowEngine::FlowDecision::FINISHED_NOT_CLOSED_LOOP) { - runtime.aggregates_status = RuntimeStatus::AggregatesStatus::NON_CLOSED_LOOP; + set_current_aggregates_status(RuntimeStatus::AggregatesStatus::NON_CLOSED_LOOP); + } else if (eval.decision == penny::FlowEngine::FlowDecision::FINISHED_DUPLICATE_EXCEEDED) { + set_current_aggregates_status(RuntimeStatus::AggregatesStatus::DUPLICATES_EXCEEDED); } else { - runtime.aggregates_status = RuntimeStatus::AggregatesStatus::DUPLICATES_EXCEEDED; + set_current_aggregates_status(RuntimeStatus::AggregatesStatus::NON_CLOSED_LOOP); + } + set_current_has_aggregate_eval(true); + if (frozen_agg) { + set_runtime_eval_counters(runtime, *frozen_agg); + } else { + set_runtime_eval_counters(runtime, stats); } - runtime.has_aggregate_eval = true; - set_runtime_eval_counters(runtime, stats); collector_completed_.store(true, std::memory_order_relaxed); } @@ -266,16 +319,15 @@ void AggregatesController::collector_loop() { // 2. Evaluate the aggregate stats once. // - bidirectional / closed-loop -> stop the pipeline and // report CLOSED_LOOP. - // - duplicates exceeded -> stop and report - // DUPLICATES_EXCEEDED. - // - anything else (NON_CLOSED_LOOP or no verdict yet) - // -> fall through to step 3. + // - anything else + // (NON_CLOSED_LOOP or DUPLICATES_EXCEEDED) + // -> freeze the aggregate verdict, then switch to the + // separate per-flow phase. // 3. Watch the per-flow CLOSED_LOOP termination tally and stop as // soon as it reaches `min_closed_loop_flows` (defaulting to 2 - // when the operator did not configure it). This is the - // "look for the min flows" path and gives the run a chance - // to upgrade to CLOSED_LOOP via per-flow evidence even when - // the one-shot aggregate eval did not. + // when the operator did not configure it). This is a separate + // per-flow stop condition; it does NOT rewrite the aggregate + // verdict from step 2. auto& runtime = runtime_setup_mutable(); bool aggregate_eval_done = false; bool wait_for_closed_loops = false; @@ -288,39 +340,87 @@ void AggregatesController::collector_loop() { cfg_.active.min_closed_loop_flows > 0 ? cfg_.active.min_closed_loop_flows : static_cast(2); + auto finalize_aggregate_verdict = + [&](RuntimeStatus::AggregatesStatus status, + const std::optional& frozen_agg, + const openpenny::app::AggregatedCounters& agg_now, + const std::optional& stats) { + set_current_aggregates_status(status); + set_current_aggregates_active(false); + set_current_has_aggregate_eval(true); + if (frozen_agg) { + set_runtime_eval_counters(runtime, *frozen_agg); + store_aggregate_snapshot_once( + aggregates_snapshot_, + aggregates_snapshot_mtx_, + *frozen_agg); + } else if (stats) { + set_runtime_eval_counters(runtime, *stats); + store_aggregate_snapshot_once( + aggregates_snapshot_, + aggregates_snapshot_mtx_, + agg_now); + } else { + set_runtime_eval_counters(runtime, agg_now); + store_aggregate_snapshot_once( + aggregates_snapshot_, + aggregates_snapshot_mtx_, + agg_now); + } + }; + auto switch_to_individual_flow_phase = + [&](RuntimeStatus::AggregatesStatus status, + const char* verdict_text, + const std::optional& frozen_agg, + const openpenny::app::AggregatedCounters& agg_now, + const std::optional& stats) { + finalize_aggregate_verdict(status, frozen_agg, agg_now, stats); + aggregate_eval_done = true; + wait_for_closed_loops = true; + TCPLOG_INFO( + "[agg_phase] action=switch_to_individual agg_status=%s drops=%zu " + "next=individual wait_closed_loop_flows=%llu", + verdict_text, + required_drops_, + static_cast(closed_loop_required)); + }; while (!stop_flag_.load(std::memory_order_relaxed)) { if (user_should_stop_ && user_should_stop_()) break; if (wait_for_closed_loops) { auto agg = openpenny::app::aggregate_counters(); if (agg.flows_closed_loop >= closed_loop_required) { TCPLOG_INFO( - "[aggregates_closed_loop] flows_closed_loop=%llu flows_not_closed_loop=%llu flows_finished=%llu", + "[closed_loop_threshold] flows_closed_loop=%llu flows_not_closed_loop=%llu flows_finished=%llu " + "aggregate_status=%d", static_cast(agg.flows_closed_loop), static_cast(agg.flows_not_closed_loop), - static_cast(agg.flows_finished)); - runtime.aggregates_status = RuntimeStatus::AggregatesStatus::CLOSED_LOOP; - runtime.has_aggregate_eval = true; - set_runtime_eval_counters(runtime, agg); + static_cast(agg.flows_finished), + static_cast(current_aggregates_status())); collector_completed_.store(true, std::memory_order_relaxed); - store_aggregate_snapshot_once(aggregates_snapshot_, aggregates_snapshot_mtx_, agg); + closed_loop_stop_hit_.store(true, std::memory_order_relaxed); stop_flag_.store(true, std::memory_order_relaxed); break; } + std::this_thread::sleep_for(25ms); + continue; } bool ready = false; - bool pending = false; - bool pending_rtx = false; std::size_t snapshot_count = 0; std::size_t pending_snapshot_count = 0; std::uint64_t pending_rtx_count = 0; { const auto collector_summary = summarize_collector_snapshots(*collector_); + const auto frozen_agg = collect_frozen_aggregate_counters(*collector_); snapshot_count = collector_summary.snapshot_count; pending_snapshot_count = collector_summary.pending_snapshot_count; - pending = pending_snapshot_count > 0; - pending_rtx_count = openpenny::app::aggregate_counters().pending_retransmissions; - pending_rtx = pending_rtx_count > 0; - ready = snapshot_count >= required_drops_ && !pending && !pending_rtx; + pending_rtx_count = frozen_agg + ? frozen_agg->pending_retransmissions + : openpenny::app::aggregate_counters().pending_retransmissions; + ready = aggregates_ready_for_evaluation( + required_drops_, + snapshot_count, + pending_snapshot_count, + pending_rtx_count); } // Periodic gate diagnostic: when snapshot_count has reached the // required threshold but ready stays false, this line tells the @@ -336,9 +436,8 @@ void AggregatesController::collector_loop() { g_last_gate_log_ns.compare_exchange_strong( last, next, std::memory_order_acq_rel)) { TCPLOG_INFO( - "[aggregates_gate] snapshots=%zu/%zu pending_snapshots=%zu " - "pending_rtx=%llu (waiting for both to reach 0 before " - "evaluating)", + "[agg_wait] drops=%zu/%zu pending_snapshots=%zu pending_rtx=%llu " + "state=waiting", snapshot_count, required_drops_, pending_snapshot_count, @@ -349,41 +448,49 @@ void AggregatesController::collector_loop() { aggregates_ready_.store(true, std::memory_order_relaxed); if (!ready_logged) { TCPLOG_INFO( - "Aggregates have %zu drops ready (required=%zu)", + "[agg_ready] drops=%zu required=%zu", snapshot_count, required_drops_); ready_logged = true; } collector_->accepting.store(false, std::memory_order_relaxed); const auto agg_now = openpenny::app::aggregate_counters(); + const auto frozen_agg = collect_frozen_aggregate_counters(*collector_); + const auto eval_stats = frozen_agg + ? make_eval_stats_from_aggregates(*frozen_agg) + : penny::PennyStats{}; if (cfg_.active.max_duplicate_fraction > 0.0) { - if (agg_now.data_packets > 0) { - const double agg_dup_ratio = static_cast(agg_now.duplicate_packets) / - static_cast(agg_now.data_packets); + const auto dup_data_packets = + frozen_agg ? eval_stats.data_packets() : agg_now.data_packets; + const auto dup_packets = + frozen_agg ? eval_stats.duplicate_packets() : agg_now.duplicate_packets; + if (dup_data_packets > 0) { + const double agg_dup_ratio = static_cast(dup_packets) / + static_cast(dup_data_packets); if (agg_dup_ratio > cfg_.active.max_duplicate_fraction) { - runtime.aggregates_status = RuntimeStatus::AggregatesStatus::DUPLICATES_EXCEEDED; - runtime.aggregates_active = false; - runtime.has_aggregate_eval = true; - set_runtime_eval_counters(runtime, agg_now); - collector_completed_.store(true, std::memory_order_relaxed); - store_aggregate_snapshot_once( - aggregates_snapshot_, - aggregates_snapshot_mtx_, - agg_now); - stop_flag_.store(true, std::memory_order_relaxed); - break; + switch_to_individual_flow_phase( + RuntimeStatus::AggregatesStatus::DUPLICATES_EXCEEDED, + "duplicates_exceeded", + frozen_agg, + agg_now, + std::nullopt); } } } if (!aggregate_eval_done) { aggregate_eval_done = true; - auto latest_snapshot = collect_latest_drop_snapshot(*collector_, agg_now); + auto latest_snapshot = collect_latest_drop_snapshot(*collector_); if (latest_snapshot) { - if (agg_now.pending_retransmissions > 0) { + const auto pending_window_rtx = frozen_agg + ? frozen_agg->pending_retransmissions + : agg_now.pending_retransmissions; + if (pending_window_rtx > 0) { continue; } - auto stats = latest_snapshot->snapshot.stats; + const auto stats = frozen_agg + ? make_eval_stats_from_aggregates(*frozen_agg) + : latest_snapshot->snapshot.stats; const auto miss_prob = std::clamp( cfg_.active.retransmission_miss_probability, 0.0, @@ -399,12 +506,26 @@ void AggregatesController::collector_loop() { miss_prob, cfg_.active.max_duplicate_fraction); const auto packet_id_text = penny::format_packet_drop_id(latest_snapshot->packet_id); - - const auto denom = eval.p_closed + eval.p_not_closed; + const auto* eval_verdict_text = [&]() -> const char* { + switch (eval.decision) { + case penny::FlowEngine::FlowDecision::FINISHED_CLOSED_LOOP: + return "closed_loop"; + case penny::FlowEngine::FlowDecision::FINISHED_NOT_CLOSED_LOOP: + return "not_closed_loop"; + case penny::FlowEngine::FlowDecision::FINISHED_DUPLICATE_EXCEEDED: + return "duplicates_exceeded"; + case penny::FlowEngine::FlowDecision::FINISHED_NO_DECISION: + return "no_decision"; + case penny::FlowEngine::FlowDecision::PENDING: + default: + return "pending"; + } + }(); TCPLOG_INFO( - "[agg_eval] data_pkts=%llu dup_pkts=%llu rtx_pkts=%llu non_rtx_pkts=%llu " - "dup_ratio=%.6f miss_prob=%.6f p_closed=%.6f p_not_closed=%.6f denom=%.6f closed_weight=%.6f decision=%s " + "[agg_eval] verdict=%s data=%llu dup=%llu rtx=%llu non_rtx=%llu " + "dup_ratio=%.6f miss_prob=%.6f p_closed=%.6f p_not_closed=%.6f closed_weight=%.6f " "packet_id=%s thread=%s", + eval_verdict_text, static_cast(stats.data_packets()), static_cast(stats.duplicate_packets()), static_cast(stats.retransmitted_packets()), @@ -413,70 +534,38 @@ void AggregatesController::collector_loop() { miss_prob, eval.p_closed, eval.p_not_closed, - denom, eval.closed_weight, - penny::flow_decision_to_string(eval.decision), packet_id_text.c_str(), latest_snapshot->thread_name.c_str()); if (dup_threshold_hit) { - runtime.aggregates_status = RuntimeStatus::AggregatesStatus::DUPLICATES_EXCEEDED; - runtime.aggregates_active = false; - runtime.has_aggregate_eval = true; - set_runtime_eval_counters(runtime, stats); - collector_completed_.store(true, std::memory_order_relaxed); - store_aggregate_snapshot_once( - aggregates_snapshot_, - aggregates_snapshot_mtx_, - agg_now); - break; + switch_to_individual_flow_phase( + RuntimeStatus::AggregatesStatus::DUPLICATES_EXCEEDED, + "duplicates_exceeded", + frozen_agg, + agg_now, + stats); + continue; } if (eval.decision == penny::FlowEngine::FlowDecision::FINISHED_CLOSED_LOOP) { - runtime.aggregates_status = RuntimeStatus::AggregatesStatus::CLOSED_LOOP; - store_aggregate_snapshot_once( - aggregates_snapshot_, - aggregates_snapshot_mtx_, - agg_now); - runtime.has_aggregate_eval = true; - set_runtime_eval_counters(runtime, stats); - collector_completed_.store(true, std::memory_order_relaxed); - stop_flag_.store(true, std::memory_order_relaxed); - break; - } else if (eval.decision == penny::FlowEngine::FlowDecision::FINISHED_NOT_CLOSED_LOOP) { - runtime.aggregates_status = RuntimeStatus::AggregatesStatus::NON_CLOSED_LOOP; - } - - set_runtime_eval_counters(runtime, stats); - runtime.has_aggregate_eval = true; - - if (cfg_.active.aggregates_enabled && - eval.decision != penny::FlowEngine::FlowDecision::FINISHED_CLOSED_LOOP) { - // Aggregate eval at `required_drops_` drops did not - // produce a bidirectional verdict; switch to - // step 3 of the contract and wait for - // closed_loop_required per-flow CLOSED_LOOP - // terminations before declaring the run done. - runtime.aggregates_active = false; - wait_for_closed_loops = true; - TCPLOG_INFO( - "[agg_eval_fallback] aggregate verdict %s after %zu drops; " - "waiting for %llu closed-loop flow%s before finishing", - penny::flow_decision_to_string(eval.decision), - required_drops_, - static_cast(closed_loop_required), - closed_loop_required == 1 ? "" : "s"); - } else { - store_aggregate_snapshot_once( - aggregates_snapshot_, - aggregates_snapshot_mtx_, - agg_now); + finalize_aggregate_verdict( + RuntimeStatus::AggregatesStatus::CLOSED_LOOP, + frozen_agg, + agg_now, + stats); collector_completed_.store(true, std::memory_order_relaxed); stop_flag_.store(true, std::memory_order_relaxed); break; } + switch_to_individual_flow_phase( + RuntimeStatus::AggregatesStatus::NON_CLOSED_LOOP, + "not_closed_loop", + frozen_agg, + agg_now, + stats); } else { - runtime.aggregates_status = RuntimeStatus::AggregatesStatus::PENDING; + set_current_aggregates_status(RuntimeStatus::AggregatesStatus::PENDING); } } } @@ -488,7 +577,7 @@ void AggregatesController::individual_limit_loop() { using namespace std::chrono_literals; while (!stop_flag_.load(std::memory_order_relaxed)) { if (collector_enabled_ && - runtime_setup_mutable().aggregates_status == RuntimeStatus::AggregatesStatus::PENDING) { + current_aggregates_status() == RuntimeStatus::AggregatesStatus::PENDING) { std::this_thread::sleep_for(100ms); continue; } @@ -518,7 +607,7 @@ void AggregatesController::min_closed_loop_loop() { // and the aggregate eval (if enabled) is not still pending. while (!stop_flag_.load(std::memory_order_relaxed)) { if (collector_enabled_ && - runtime_setup_mutable().aggregates_status == RuntimeStatus::AggregatesStatus::PENDING) { + current_aggregates_status() == RuntimeStatus::AggregatesStatus::PENDING) { std::this_thread::sleep_for(100ms); continue; } @@ -533,14 +622,11 @@ void AggregatesController::min_closed_loop_loop() { static_cast(agg.flows_not_closed_loop), static_cast(agg.flows_rst), static_cast(agg.flows_duplicates_exceeded)); - store_aggregate_snapshot_once(aggregates_snapshot_, aggregates_snapshot_mtx_, agg); - // If the aggregate eval has not produced a verdict yet, mark - // it CLOSED_LOOP since we have collected enough closed-loop - // evidence on its own. auto& runtime = runtime_setup_mutable(); - if (runtime.aggregates_status == RuntimeStatus::AggregatesStatus::PENDING) { - runtime.aggregates_status = RuntimeStatus::AggregatesStatus::CLOSED_LOOP; - runtime.has_aggregate_eval = true; + if (current_aggregates_status() == RuntimeStatus::AggregatesStatus::PENDING) { + store_aggregate_snapshot_once(aggregates_snapshot_, aggregates_snapshot_mtx_, agg); + set_current_aggregates_status(RuntimeStatus::AggregatesStatus::CLOSED_LOOP); + set_current_has_aggregate_eval(true); set_runtime_eval_counters(runtime, agg); } collector_completed_.store(true, std::memory_order_relaxed); diff --git a/src/app/core/DropCollectorBinding.cpp b/src/app/core/DropCollectorBinding.cpp index 1494cda..7ed9e14 100644 --- a/src/app/core/DropCollectorBinding.cpp +++ b/src/app/core/DropCollectorBinding.cpp @@ -2,11 +2,9 @@ #include "openpenny/app/core/DropCollectorBinding.h" -#include "openpenny/penny/flow/engine/FlowEngine.h" -#include "openpenny/penny/flow/timer/ThreadFlowEventTimer.h" +#include "openpenny/app/core/PerThreadStats.h" #include -#include namespace openpenny::app { namespace { @@ -20,57 +18,62 @@ bool is_pending_snapshot(const penny::PacketDropSnapshot& snap) noexcept { return snap.state == penny::SnapshotState::Pending; } -} // namespace - -DropCollectorBinding& DropCollectorBinding::instance() { - static DropCollectorBinding inst; - return inst; +bool try_reserve_snapshot_slot(DropCollector& collector) noexcept { + if (collector.snapshot_limit == 0) { + return true; + } + auto reserved = collector.accepted_snapshot_count.load(std::memory_order_relaxed); + while (reserved < collector.snapshot_limit) { + if (collector.accepted_snapshot_count.compare_exchange_weak( + reserved, + reserved + 1, + std::memory_order_acq_rel, + std::memory_order_relaxed)) { + return true; + } + } + return false; } -void DropCollectorBinding::ensure_snapshot_hook() { - std::call_once(hook_once_, []() { - penny::ThreadFlowEventTimerManager::set_snapshot_hook( - [](penny::FlowEngine* flow, - penny::PacketDropId packet_id, - penny::ThreadFlowEventTimerManager::SnapshotEventKind /*kind*/) { - auto& self = DropCollectorBinding::instance(); - const auto binding = self.lookup(flow); - if (!binding.collector) return; - if (!binding.collector->accepting.load(std::memory_order_relaxed)) return; - - const auto& snaps = flow->drop_snapshots(); - const auto key = flow->flow_key(); - auto& shard = binding.collector->shard_for(binding.shard_index); - - std::lock_guard lock(shard.mtx); - if (!binding.collector->accepting.load(std::memory_order_relaxed)) return; - // Mirror any updated packet drop snapshots from the FlowEngine into - // the shared collector so aggregate decisions see fresh stats. - for (const auto& pair : snaps) { - if (packet_id != 0 && pair.first != packet_id) continue; - self.upsert_locked(binding, key, pair.first, pair.second); - } - }); - }); +void maybe_freeze_aggregate_window(DropCollector& collector, + const openpenny::app::AggregatedCounters& agg) { + if (collector.snapshot_limit == 0 || + collector.accepted_snapshot_count.load(std::memory_order_relaxed) < collector.snapshot_limit) { + return; + } + std::lock_guard lock(collector.frozen_aggregate_counters_mtx); + if (!collector.frozen_aggregate_counters) { + collector.frozen_aggregate_counters = agg; + } } -void DropCollectorBinding::bind(penny::FlowEngine* flow, - DropCollectorPtr collector, - const std::string& thread_name, - std::size_t shard_index) { - if (!flow || !collector) return; - std::lock_guard lock(mtx_); - bindings_[flow] = BindingContext{ - std::move(collector), - thread_name, - shard_index - }; +void apply_frozen_aggregate_transition(DropCollector& collector, + const penny::PacketDropSnapshot& before, + const penny::PacketDropSnapshot& after) { + if (before.state == after.state) { + return; + } + std::lock_guard lock(collector.frozen_aggregate_counters_mtx); + if (!collector.frozen_aggregate_counters) { + return; + } + auto& agg = *collector.frozen_aggregate_counters; + if (before.state == penny::SnapshotState::Pending && + agg.pending_retransmissions > 0) { + --agg.pending_retransmissions; + } + if (after.state == penny::SnapshotState::Retransmitted) { + ++agg.retransmitted_packets; + } else if (after.state == penny::SnapshotState::Expired) { + ++agg.non_retransmitted_packets; + } } -void DropCollectorBinding::unbind(penny::FlowEngine* flow) { - if (!flow) return; - std::lock_guard lock(mtx_); - bindings_.erase(flow); +} // namespace + +DropCollectorBinding& DropCollectorBinding::instance() { + static DropCollectorBinding inst; + return inst; } void DropCollectorBinding::upsert(DropCollectorPtr collector, @@ -84,30 +87,43 @@ void DropCollectorBinding::upsert(DropCollectorPtr collector, auto& shard = collector->shard_for(shard_index); std::lock_guard lock(shard.mtx); if (!collector->accepting.load(std::memory_order_relaxed)) return; - upsert_locked(BindingContext{collector, thread_name, shard_index}, key, packet_id, snap); + upsert_locked(*collector, shard, thread_name, key, packet_id, snap); } -DropCollectorBinding::BindingContext DropCollectorBinding::lookup(penny::FlowEngine* flow) const { - std::lock_guard lock(mtx_); - auto it = bindings_.find(flow); - if (it != bindings_.end()) { - return it->second; +void DropCollectorBinding::refresh_from( + DropCollectorPtr collector, + const std::string& thread_name, + std::size_t shard_index, + const FlowKey& key, + const std::vector>& snapshots, + std::size_t start_index) { + if (!collector) return; + if (!collector->accepting.load(std::memory_order_relaxed)) return; + if (start_index >= snapshots.size()) return; + + auto& shard = collector->shard_for(shard_index); + std::lock_guard lock(shard.mtx); + if (!collector->accepting.load(std::memory_order_relaxed)) return; + + for (std::size_t i = start_index; i < snapshots.size(); ++i) { + const auto& pair = snapshots[i]; + upsert_locked(*collector, shard, thread_name, key, pair.first, pair.second); } - return {}; } -void DropCollectorBinding::upsert_locked(const BindingContext& binding, +void DropCollectorBinding::upsert_locked(DropCollector& collector, + DropCollector::Shard& shard, + const std::string& thread_name, const FlowKey& key, penny::PacketDropId packet_id, const penny::PacketDropSnapshot& snap) { - if (!binding.collector) return; - auto& shard = binding.collector->shard_for(binding.shard_index); auto& snapshots = shard.snapshots; DropCollector::SnapshotKey snapshot_key{key, packet_id}; auto index_it = shard.snapshot_index.find(snapshot_key); if (index_it != shard.snapshot_index.end()) { auto& rec = snapshots[index_it->second]; + const auto previous_snapshot = rec.snapshot; auto pending_count = shard.pending_snapshot_count.load(std::memory_order_relaxed); const bool was_pending = is_pending_snapshot(rec.snapshot); const bool now_pending = is_pending_snapshot(snap); @@ -120,9 +136,14 @@ void DropCollectorBinding::upsert_locked(const BindingContext& binding, } shard.pending_snapshot_count.store(pending_count, std::memory_order_relaxed); } + apply_frozen_aggregate_transition(collector, previous_snapshot, snap); } else { + if (!try_reserve_snapshot_slot(collector)) { + return; + } + const auto agg_now = openpenny::app::aggregate_counters(); const auto idx = snapshots.size(); - snapshots.push_back(DropSnapshotRecord{key, packet_id, snap, {}, binding.thread_name}); + snapshots.push_back(DropSnapshotRecord{key, packet_id, snap, agg_now, thread_name}); shard.snapshot_index.emplace(std::move(snapshot_key), idx); shard.snapshot_count.store(snapshots.size(), std::memory_order_relaxed); if (is_pending_snapshot(snap)) { @@ -137,6 +158,7 @@ void DropCollectorBinding::upsert_locked(const BindingContext& binding, shard.latest_snapshot_index.store(idx, std::memory_order_relaxed); shard.latest_snapshot_timestamp.store(ts, std::memory_order_relaxed); } + maybe_freeze_aggregate_window(collector, agg_now); } } diff --git a/src/app/core/OpenpennyPipelineDriver.cpp b/src/app/core/OpenpennyPipelineDriver.cpp index 0667b01..7768f69 100644 --- a/src/app/core/OpenpennyPipelineDriver.cpp +++ b/src/app/core/OpenpennyPipelineDriver.cpp @@ -205,6 +205,9 @@ PipelineSummary drive_pipeline(const Config& cfg_in, const PipelineOptions& opts TCPLOG_INFO("[openpenny] traffic match: %s", net::describe_traffic_match(opts_local.traffic_match).c_str()); + // Number of queues to process traffic. + const unsigned qcount = std::max(1u, opts_local.queue_count); + // Capture the runtime setup at worker start so observers can inspect it. set_runtime_setup(cfg, opts_local, @@ -219,8 +222,6 @@ PipelineSummary drive_pipeline(const Config& cfg_in, const PipelineOptions& opts auto matcher = [&](const FlowKey& key) { return net::traffic_matches_flow(opts_local.traffic_match, key); }; - // Number of queues to process traffic. - const unsigned qcount = std::max(1u, opts_local.queue_count); // ------------------------------------------------------------------ // One-line startup summary at INFO. With many queues the per-worker @@ -338,6 +339,7 @@ PipelineSummary drive_pipeline(const Config& cfg_in, const PipelineOptions& opts aggregates_controller.join(); const auto agg_counters_now = openpenny::app::aggregate_counters(); bool individual_stop_hit = aggregates_controller.individual_stop_hit(); + bool closed_loop_stop_hit = aggregates_controller.closed_loop_stop_hit(); if (!individual_stop_hit && cfg.active.stop_after_individual_flows > 0 && opts_local.mode == PipelineOptions::Mode::Active && @@ -346,12 +348,18 @@ PipelineSummary drive_pipeline(const Config& cfg_in, const PipelineOptions& opts } if (individual_stop_hit && cfg.active.aggregates_enabled && - runtime_setup_mutable().aggregates_status == RuntimeStatus::AggregatesStatus::PENDING && + current_aggregates_status() == RuntimeStatus::AggregatesStatus::PENDING && aggregates_controller.aggregates_ready()) { - runtime_setup_mutable().aggregates_status = RuntimeStatus::AggregatesStatus::DUPLICATES_EXCEEDED; + set_current_aggregates_status(RuntimeStatus::AggregatesStatus::NON_CLOSED_LOOP); } aggregates_controller.populate_drop_snapshots(summary); aggregates_controller.evaluate_pending_if_needed(cfg, summary); + if (!closed_loop_stop_hit && + opts_local.mode == PipelineOptions::Mode::Active && + cfg.active.min_closed_loop_flows > 0 && + agg_counters_now.flows_closed_loop >= cfg.active.min_closed_loop_flows) { + closed_loop_stop_hit = true; + } // Fold per-thread results into a single aggregated ModeResult. ModeResult aggregate{}; @@ -385,12 +393,26 @@ PipelineSummary drive_pipeline(const Config& cfg_in, const PipelineOptions& opts r->passive_gap_summaries.begin(), r->passive_gap_summaries.end()); } + if (!r->closed_loop_flow_summaries.empty()) { + aggregate.closed_loop_flow_summaries.insert( + aggregate.closed_loop_flow_summaries.end(), + r->closed_loop_flow_summaries.begin(), + r->closed_loop_flow_summaries.end()); + } + if (!r->duplicate_exceeded_flow_summaries.empty()) { + aggregate.duplicate_exceeded_flow_summaries.insert( + aggregate.duplicate_exceeded_flow_summaries.end(), + r->duplicate_exceeded_flow_summaries.begin(), + r->duplicate_exceeded_flow_summaries.end()); + } // Completion flags are combined with logical OR. aggregate.penny_completed = aggregate.penny_completed || r->penny_completed; aggregate.aggregates_penny_completed = aggregate.aggregates_penny_completed || r->aggregates_penny_completed; + aggregate.closed_loop_stop_hit = + aggregate.closed_loop_stop_hit || r->closed_loop_stop_hit; } // Use aggregated counters to avoid undercounting packets processed. aggregate.packets_processed = std::max( @@ -398,16 +420,23 @@ PipelineSummary drive_pipeline(const Config& cfg_in, const PipelineOptions& opts static_cast(agg_counters_now.packets)); if (aggregates_controller.collector_completed()) { const bool agg_done_status = - runtime_setup_mutable().aggregates_status != RuntimeStatus::AggregatesStatus::PENDING; + current_aggregates_status() != RuntimeStatus::AggregatesStatus::PENDING; aggregate.aggregates_penny_completed = agg_done_status; aggregate.penny_completed = agg_done_status; } if (individual_stop_hit) { aggregate.penny_completed = true; } + if (closed_loop_stop_hit) { + aggregate.closed_loop_stop_hit = true; + } if (auto snapshot = aggregates_controller.aggregates_snapshot()) { aggregate.aggregates_snapshot = snapshot; } + std::sort(aggregate.closed_loop_flow_summaries.begin(), + aggregate.closed_loop_flow_summaries.end()); + std::sort(aggregate.duplicate_exceeded_flow_summaries.begin(), + aggregate.duplicate_exceeded_flow_summaries.end()); // Only populate the summary if at least one worker reported results. if (any) { diff --git a/src/app/core/PerThreadStats.cpp b/src/app/core/PerThreadStats.cpp index fa5b934..9c1ee35 100644 --- a/src/app/core/PerThreadStats.cpp +++ b/src/app/core/PerThreadStats.cpp @@ -60,6 +60,9 @@ static std::atomic g_counters_size{1}; void init_thread_counters(std::size_t count) { const auto clamped = std::min(count, kMaxCounters); + for (auto& counter : g_counters) { + counter = {}; + } for (auto& counter : g_drop_budget_counters) { counter.drops.store(0, std::memory_order_relaxed); } diff --git a/src/app/core/RuntimeSetup.cpp b/src/app/core/RuntimeSetup.cpp index 0093c82..6a8095e 100644 --- a/src/app/core/RuntimeSetup.cpp +++ b/src/app/core/RuntimeSetup.cpp @@ -2,9 +2,15 @@ #include "openpenny/app/core/RuntimeSetup.h" +#include + namespace openpenny { namespace { RuntimeSetupSnapshot g_runtime_setup; +std::atomic g_aggregates_active{true}; +std::atomic g_aggregates_status{ + static_cast(RuntimeStatus::AggregatesStatus::PENDING)}; +std::atomic g_has_aggregate_eval{false}; } void set_runtime_setup(const Config& cfg, @@ -15,6 +21,16 @@ void set_runtime_setup(const Config& cfg, g_runtime_setup.options = opts; g_runtime_setup.use_xdp = use_xdp; g_runtime_setup.use_dpdk = use_dpdk; + g_runtime_setup.aggregates_active = true; + g_runtime_setup.testing_finished = false; + g_runtime_setup.aggregates_status = RuntimeStatus::AggregatesStatus::PENDING; + g_runtime_setup.aggregate_eval_counters = {}; + g_runtime_setup.has_aggregate_eval = false; + g_aggregates_active.store(true, std::memory_order_release); + g_aggregates_status.store( + static_cast(RuntimeStatus::AggregatesStatus::PENDING), + std::memory_order_release); + g_has_aggregate_eval.store(false, std::memory_order_release); } const RuntimeSetupSnapshot& current_runtime_setup() { @@ -25,4 +41,32 @@ RuntimeSetupSnapshot& runtime_setup_mutable() { return g_runtime_setup; } +bool current_aggregates_active() noexcept { + return g_aggregates_active.load(std::memory_order_acquire); +} + +void set_current_aggregates_active(bool value) noexcept { + g_runtime_setup.aggregates_active = value; + g_aggregates_active.store(value, std::memory_order_release); +} + +RuntimeStatus::AggregatesStatus current_aggregates_status() noexcept { + return static_cast( + g_aggregates_status.load(std::memory_order_acquire)); +} + +void set_current_aggregates_status(RuntimeStatus::AggregatesStatus status) noexcept { + g_runtime_setup.aggregates_status = status; + g_aggregates_status.store(static_cast(status), std::memory_order_release); +} + +bool current_has_aggregate_eval() noexcept { + return g_has_aggregate_eval.load(std::memory_order_acquire); +} + +void set_current_has_aggregate_eval(bool value) noexcept { + g_runtime_setup.has_aggregate_eval = value; + g_has_aggregate_eval.store(value, std::memory_order_release); +} + } // namespace openpenny diff --git a/src/app/core/active/ActiveTestPipeline.cpp b/src/app/core/active/ActiveTestPipeline.cpp index b0f7348..a36158b 100644 --- a/src/app/core/active/ActiveTestPipeline.cpp +++ b/src/app/core/active/ActiveTestPipeline.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -20,6 +21,7 @@ #include "openpenny/app/core/PipelineRunner.h" #include "openpenny/app/core/PerThreadStats.h" #include "openpenny/app/core/DropCollectorBinding.h" +#include "openpenny/app/core/RuntimeSetup.h" #include "openpenny/log/Log.h" #include "openpenny/penny/flow/engine/FlowEngine.h" #include "openpenny/penny/flow/timer/ThreadFlowEventTimer.h" @@ -30,6 +32,20 @@ namespace openpenny { namespace { thread_local ActiveTestPipelineRunner* tls_runner = nullptr; + +std::string format_closed_loop_flow_summary(const FlowKey& key, + const penny::FlowEngine& flow) { + std::ostringstream summary; + summary << flow_debug_details(key) + << " data=" << flow.data_packets() + << " dropped=" << flow.dropped_packets() + << " rtx=" << flow.retransmitted_packets() + << " non_rtx=" << flow.non_retransmitted_packets() + << " dup=" << flow.duplicate_packets() + << " in_order=" << flow.in_order_packets() + << " out_of_order=" << flow.out_of_order_packets(); + return summary.str(); +} } // namespace // Constructs an active OpenPenny traffic processing pipeline runner. @@ -53,7 +69,6 @@ ActiveTestPipelineRunner::ActiveTestPipelineRunner( std::chrono::duration(cfg.active.flow_idle_timeout_seconds))} // Idle expiry window. { if (drop_collector_) { - app::DropCollectorBinding::instance().ensure_snapshot_hook(); flow_manager_.set_drop_sink( [collector = drop_collector_, name = thread_name_, @@ -68,6 +83,21 @@ ActiveTestPipelineRunner::ActiveTestPipelineRunner( packet_id, snapshot); }); + flow_manager_.set_snapshot_refresh_sink( + [collector = drop_collector_, + name = thread_name_, + shard_index = drop_collector_shard_index_]( + const FlowKey& key, + const std::vector>& snapshots, + std::size_t start_index) { + app::DropCollectorBinding::instance().refresh_from( + collector, + name, + shard_index, + key, + snapshots, + start_index); + }); } } @@ -178,7 +208,8 @@ void ActiveTestPipelineRunner::after_poll( if (idle_timeout_.count() > 0) { expire_idle_flows(now); } - sweep_expired_snapshots(now); + evaluate_individual_flows_if_enabled(); + complete_resolved_terminal_flows(); // Mirrors the post-loop drain in the legacy run() so deferred // expirations aren't stranded between iterations. penny::ThreadFlowEventTimerManager::instance().drain_callbacks(); @@ -187,17 +218,23 @@ void ActiveTestPipelineRunner::after_poll( void ActiveTestPipelineRunner::on_closing() { // Flush any callbacks that arrived after the final poll iteration. penny::ThreadFlowEventTimerManager::instance().drain_callbacks(); - sweep_expired_snapshots(std::chrono::steady_clock::now()); + evaluate_individual_flows_if_enabled(); + complete_resolved_terminal_flows(); } void ActiveTestPipelineRunner::finalize(ModeResult& result) { - // Expire any pending snapshots on remaining flows to ensure expirations are logged/applied. + // Resolve any pending snapshots on remaining flows without bypassing the + // configured retransmission timeout at shutdown. flow_manager_.for_each_flow([](const FlowKey&, penny::FlowEngineEntry& entry) { - entry.flow.expire_all_pending_snapshots(); + entry.flow.resolve_pending_snapshots(std::chrono::steady_clock::now()); }); + evaluate_individual_flows_if_enabled(); + complete_resolved_terminal_flows(); result.packets_forwarded = total_pkts_forwarded_; result.forward_errors = total_forward_errors_; + result.closed_loop_flow_summaries = closed_loop_flow_summaries_; + result.duplicate_exceeded_flow_summaries = duplicate_exceeded_flow_summaries_; } // --------------------------------------------------------------------------- @@ -209,32 +246,105 @@ void ActiveTestPipelineRunner::expire_idle_flows(const std::chrono::steady_clock if (idle_timeout_.count() <= 0) return; auto expired = flow_manager_.collect_idle_flows(now, idle_timeout_); for (const auto& key : expired) { - if (auto* entry = flow_manager_.find(key)) { - app::DropCollectorBinding::instance().unbind(&entry->flow); - } - flow_manager_.complete_flow(key, "idle_timeout"); + complete_flow_with_summary(key, "idle_timeout"); } } -void ActiveTestPipelineRunner::sweep_expired_snapshots(const std::chrono::steady_clock::time_point& now) { - // Expire packet drop snapshots using the configured retransmission timeout (seconds). - const auto retransmission_timeout = std::chrono::duration(cfg_.active.rtt_timeout_factor); - if (retransmission_timeout.count() <= 0.0) return; - flow_manager_.for_each_flow([&](const FlowKey&, penny::FlowEngineEntry& entry) { - const auto& snaps = entry.flow.drop_snapshots(); - for (const auto& pair : snaps) { - if (pair.second.state != penny::SnapshotState::Pending) continue; - if (now - pair.second.timestamp >= retransmission_timeout) { - if (TCPLOG_ENABLED(INFO)) { - const auto packet_id_text = penny::format_packet_drop_id(pair.first); - TCPLOG_INFO("[packet_expired] flow=%s packet_id=%s", - flow_debug_details(entry.flow.flow_key()).c_str(), - packet_id_text.c_str()); +bool ActiveTestPipelineRunner::individual_flow_evaluation_enabled() const { + const bool aggregate_phase_configured = + cfg_.active.aggregates_enabled && + cfg_.active.max_drops_aggregates > 0; + if (!aggregate_phase_configured) { + return true; + } + const auto status = openpenny::current_aggregates_status(); + return status == RuntimeStatus::AggregatesStatus::NON_CLOSED_LOOP || + status == RuntimeStatus::AggregatesStatus::DUPLICATES_EXCEEDED; +} + +void ActiveTestPipelineRunner::evaluate_individual_flows_if_enabled() { + if (!individual_flow_evaluation_enabled()) { + return; + } + + flow_manager_.for_each_flow([&](const FlowKey& key, penny::FlowEngineEntry& entry) { + const bool immutable_terminal_state = + entry.state == penny::FlowTrackingState::INTERRUPTED_RST || + entry.state == penny::FlowTrackingState::INTERRUPTED_DUPLICATE_EXCEEDED || + entry.state == penny::FlowTrackingState::INTERRUPTED_OUT_OF_ORDER_EXCEEDED || + entry.state == penny::FlowTrackingState::FINISHED; + + if (!immutable_terminal_state) { + if (flow_out_of_order_threshold_exceeded(entry.flow)) { + entry.state = penny::FlowTrackingState::INTERRUPTED_OUT_OF_ORDER_EXCEEDED; + if (TCPLOG_ENABLED(DEBUG)) { + const auto flow_tag = flow_debug_details(key); + TCPLOG_DEBUG("Out-of-order threshold exceeded %s", flow_tag.c_str()); } - entry.flow.mark_snapshot_expired(pair.first); + return; } + if (flow_duplicate_threshold_exceeded(entry.flow)) { + entry.state = penny::FlowTrackingState::INTERRUPTED_DUPLICATE_EXCEEDED; + if (TCPLOG_ENABLED(DEBUG)) { + const auto flow_tag = flow_debug_details(key); + TCPLOG_DEBUG("Duplicate threshold exceeded %s", flow_tag.c_str()); + } + return; + } + } + + if (entry.flow.final_decision() == penny::FlowEngine::FlowDecision::PENDING) { + entry.flow.evaluate_if_ready(); + } + + if (entry.state != penny::FlowTrackingState::CONNECTION_CLOSED_FIN && + !immutable_terminal_state && + entry.flow.final_decision() != penny::FlowEngine::FlowDecision::PENDING) { + entry.state = penny::FlowTrackingState::FINISHED; + } + }); +} + +void ActiveTestPipelineRunner::complete_resolved_terminal_flows() { + std::vector completed_keys; + const bool individual_eval_enabled = individual_flow_evaluation_enabled(); + flow_manager_.for_each_flow([&](const FlowKey& key, penny::FlowEngineEntry& entry) { + const bool terminal_state = + entry.state == penny::FlowTrackingState::INTERRUPTED_RST || + entry.state == penny::FlowTrackingState::INTERRUPTED_DUPLICATE_EXCEEDED || + entry.state == penny::FlowTrackingState::INTERRUPTED_OUT_OF_ORDER_EXCEEDED || + entry.state == penny::FlowTrackingState::CONNECTION_CLOSED_FIN || + entry.state == penny::FlowTrackingState::FINISHED; + if (!terminal_state) return; + if (!individual_eval_enabled && + entry.flow.final_decision() == penny::FlowEngine::FlowDecision::PENDING) { + return; } + if (entry.flow.pending_retransmissions() != 0) return; + completed_keys.push_back(key); }); + + for (const auto& key : completed_keys) { + complete_flow_with_summary(key, "terminal_state"); + } +} + +void ActiveTestPipelineRunner::complete_flow_with_summary(const FlowKey& key, const char* reason) { + auto* existing = flow_manager_.find(key); + if (!existing) { + return; + } + existing->flow.resolve_pending_snapshots(std::chrono::steady_clock::now()); + const auto final_decision = existing->flow.final_decision(); + const auto summary = format_closed_loop_flow_summary(key, existing->flow); + if (final_decision == penny::FlowEngine::FlowDecision::FINISHED_CLOSED_LOOP) { + closed_loop_flow_summaries_.push_back(summary); + } + if (existing->state == penny::FlowTrackingState::INTERRUPTED_DUPLICATE_EXCEEDED || + final_decision == penny::FlowEngine::FlowDecision::FINISHED_DUPLICATE_EXCEEDED) { + duplicate_exceeded_flow_summaries_.push_back(summary); + } + flow_manager_.complete_flow(key, reason); } void ActiveTestPipelineRunner::handle_packet(const net::PacketView& packet, @@ -276,45 +386,37 @@ void ActiveTestPipelineRunner::handle_packet(const net::PacketView& packet, penny::FlowEngineEntry* ActiveTestPipelineRunner::admit_or_forward_flow( const net::PacketView& packet, const std::chrono::steady_clock::time_point& now) { + auto* flow_entry = flow_manager_.find(packet.flow); // Skip flows we've already monitored in the past. - if (flow_manager_.was_completed(packet.flow)) { + if (!flow_entry && flow_manager_.was_completed(packet.flow)) { forward_packet(packet); return nullptr; } - const auto monitor_state = flow_manager_.flow_state(packet.flow); - if (monitor_state == penny::FlowTrackingState::NOT_ACTIONABLE && - flow_manager_.is_flow_monitoring_capacity_full()) { + if (!flow_entry && flow_manager_.is_flow_monitoring_capacity_full()) { // Flow is not tracked, and there are no spare monitoring slots. forward_packet(packet); return nullptr; } - if (monitor_state == penny::FlowTrackingState::INTERRUPTED_RST || - monitor_state == penny::FlowTrackingState::INTERRUPTED_DUPLICATE_EXCEEDED || - monitor_state == penny::FlowTrackingState::INTERRUPTED_OUT_OF_ORDER_EXCEEDED || - monitor_state == penny::FlowTrackingState::CONNECTION_CLOSED_FIN || - monitor_state == penny::FlowTrackingState::FINISHED) { - // Mark flow as complete and free the monitoring slot. - if (auto* existing = flow_manager_.find(packet.flow)) { - app::DropCollectorBinding::instance().unbind(&existing->flow); + if (flow_entry && + (flow_entry->state == penny::FlowTrackingState::INTERRUPTED_RST || + flow_entry->state == penny::FlowTrackingState::INTERRUPTED_DUPLICATE_EXCEEDED || + flow_entry->state == penny::FlowTrackingState::INTERRUPTED_OUT_OF_ORDER_EXCEEDED || + flow_entry->state == penny::FlowTrackingState::CONNECTION_CLOSED_FIN || + flow_entry->state == penny::FlowTrackingState::FINISHED)) { + // Terminal flows with unresolved drops stay resident until the + // retransmission gap is filled or the timeout expires. + if (flow_entry->flow.pending_retransmissions() == 0) { + complete_flow_with_summary(packet.flow, "terminal_state"); } - flow_manager_.complete_flow(packet.flow, "terminal_state"); forward_packet(packet); return nullptr; } - // Check whether the packet belongs to one of the flows currently being monitored. - auto* flow_entry = flow_manager_.find(packet.flow); - if (flow_entry) { const auto penny_flow_decision = flow_entry->flow.final_decision(); - if (penny_flow_decision != penny::FlowEngine::FlowDecision::PENDING){ - // From Penny perspective the test for the flow is done. - - - } const bool terminal_state = flow_entry->state == penny::FlowTrackingState::INTERRUPTED_RST || flow_entry->state == penny::FlowTrackingState::INTERRUPTED_DUPLICATE_EXCEEDED || @@ -324,35 +426,25 @@ penny::FlowEngineEntry* ActiveTestPipelineRunner::admit_or_forward_flow( if (!terminal_state && penny_flow_decision != penny::FlowEngine::FlowDecision::PENDING) { flow_entry->state = penny::FlowTrackingState::FINISHED; - app::DropCollectorBinding::instance().unbind(&flow_entry->flow); - flow_manager_.complete_flow(packet.flow, "penny_decision"); + complete_flow_with_summary(packet.flow, "penny_decision"); forward_packet(packet); return nullptr; } } - if (!flow_entry && !flow_manager_.is_flow_monitoring_capacity_full()) { + if (!flow_entry) { try { const bool is_syn = packet.tcp.flags_view().syn; - const bool inserted = flow_manager_.add_new_flow( + flow_entry = flow_manager_.add_new_flow( packet.flow, packet.tcp.seq, static_cast(packet.payload_bytes), is_syn, now); - if (inserted) { - if (drop_collector_) { - if (auto* entry = flow_manager_.find(packet.flow)) { - app::DropCollectorBinding::instance().bind( - &entry->flow, - drop_collector_, - thread_name_, - drop_collector_shard_index_); - } - } + if (flow_entry) { if (TCPLOG_ENABLED(INFO)) { const auto flow_tag = flow_debug_details(packet.flow); - TCPLOG_INFO("[monitor_start] %s flow=%s seq=%" PRIu32 " payload_bytes=%zu", + TCPLOG_INFO("[flow_track] action=start trigger=%s flow=%s seq=%" PRIu32 " payload=%zu", is_syn ? "syn" : "data", flow_tag.c_str(), packet.tcp.seq, @@ -426,7 +518,7 @@ bool ActiveTestPipelineRunner::promote_pending_flow( return false; } -// Fast-path check for RST that marks outstanding drop snapshots as expired. +// Fast-path check for RST that marks outstanding drop snapshots as invalid. void ActiveTestPipelineRunner::handle_rst(penny::FlowEngineEntry& entry, const net::PacketView& packet) { if ((packet.tcp.flags & 0x04) == 0) return; // RST bit not set. @@ -450,7 +542,8 @@ void ActiveTestPipelineRunner::handle_rst(penny::FlowEngineEntry& entry, const n entry.state = penny::FlowTrackingState::INTERRUPTED_RST; } -// Fast-path check for FIN that marks outstanding drop snapshots as expired. +// Fast-path check for FIN. A clean close means any still-missing dropped +// payload was not retransmitted before teardown, so we resolve it immediately. void ActiveTestPipelineRunner::handle_fin(penny::FlowEngineEntry& entry, const net::PacketView& packet) { if ((packet.tcp.flags & 0x01) == 0) return; // FIN bit not set. @@ -460,13 +553,12 @@ void ActiveTestPipelineRunner::handle_fin(penny::FlowEngineEntry& entry, const n for (const auto& snap_pair : snapshots) { const auto& snapshot = snap_pair.second; - // Skip snapshots already decided. if (snapshot.state != penny::SnapshotState::Pending || snapshot.stats.pending_retransmissions() == 0) { continue; } - flow.mark_snapshot_invalid(snap_pair.first); // Treat pending gaps as invalid on close. + flow.mark_snapshot_expired(snap_pair.first); if (flow.pending_retransmissions() == 0) break; } penny::ThreadFlowEventTimerManager::instance().purge_flow(&flow); @@ -525,7 +617,9 @@ void ActiveTestPipelineRunner::handle_data_packet(penny::FlowEngineEntry& entry, end_seq, entry.flow.highest_sequence()); } - const bool ooo_exceeded = flow_out_of_order_threshold_exceeded(entry.flow); + const bool ooo_exceeded = + individual_flow_evaluation_enabled() && + flow_out_of_order_threshold_exceeded(entry.flow); if (ooo_exceeded) { entry.state = penny::FlowTrackingState::INTERRUPTED_OUT_OF_ORDER_EXCEEDED; if (TCPLOG_ENABLED(DEBUG)) { @@ -553,7 +647,9 @@ void ActiveTestPipelineRunner::handle_data_packet(penny::FlowEngineEntry& entry, penny::ThreadFlowEventTimerManager::instance().enqueue_duplicate(&entry.flow, start_seq, packet.payload_bytes); // Logging handled in timer callback. - const bool dup_exceeded = flow_duplicate_threshold_exceeded(entry.flow); + const bool dup_exceeded = + individual_flow_evaluation_enabled() && + flow_duplicate_threshold_exceeded(entry.flow); if (dup_exceeded) { entry.state = penny::FlowTrackingState::INTERRUPTED_DUPLICATE_EXCEEDED; if (TCPLOG_ENABLED(DEBUG)) { @@ -576,7 +672,9 @@ void ActiveTestPipelineRunner::handle_data_packet(penny::FlowEngineEntry& entry, penny::ThreadFlowEventTimerManager::instance().enqueue_duplicate(&entry.flow, start_seq, packet.payload_bytes); // Logging handled in timer callback. - const bool dup_exceeded = flow_duplicate_threshold_exceeded(entry.flow); + const bool dup_exceeded = + individual_flow_evaluation_enabled() && + flow_duplicate_threshold_exceeded(entry.flow); if (dup_exceeded) { entry.state = penny::FlowTrackingState::INTERRUPTED_DUPLICATE_EXCEEDED; if (TCPLOG_ENABLED(DEBUG)) { diff --git a/src/app/core/passive/PassiveTestPipeline.cpp b/src/app/core/passive/PassiveTestPipeline.cpp index 4843516..982a052 100644 --- a/src/app/core/passive/PassiveTestPipeline.cpp +++ b/src/app/core/passive/PassiveTestPipeline.cpp @@ -31,7 +31,21 @@ PassiveTestPipelineRunner::PassiveTestPipelineRunner(const Config& cfg, : cfg_(cfg), opts_(opts), matcher_(std::move(matcher)), - source_(std::move(source)) {} + source_(std::move(source)) { + reserve_for_config(); +} + +void PassiveTestPipelineRunner::reserve_for_config() { + if (cfg_.passive.max_parallel_flows > 0) { + flows_.reserve(cfg_.passive.max_parallel_flows); + } + + if (cfg_.passive.min_number_of_flows_to_finish > 0) { + finished_flows_.reserve(cfg_.passive.min_number_of_flows_to_finish); + finished_index_.reserve(cfg_.passive.min_number_of_flows_to_finish); + finished_keys_.reserve(cfg_.passive.min_number_of_flows_to_finish); + } +} std::optional PassiveTestPipelineRunner::run() { PipelineRunner runner(cfg_, diff --git a/src/app/core/utils/FlowDebug.cpp b/src/app/core/utils/FlowDebug.cpp index ccc4628..4f8686a 100644 --- a/src/app/core/utils/FlowDebug.cpp +++ b/src/app/core/utils/FlowDebug.cpp @@ -6,6 +6,21 @@ namespace openpenny { +namespace { + +std::string proto_label(std::uint8_t proto) { + switch (proto) { + case 6: + return "tcp"; + case 17: + return "udp"; + default: + return std::to_string(static_cast(proto)); + } +} + +} // namespace + std::string to_ipv4_string(uint32_t host_order_ip) { std::ostringstream out; out << ((host_order_ip >> 24) & 0xff) << '.' @@ -18,9 +33,14 @@ std::string to_ipv4_string(uint32_t host_order_ip) { std::string flow_debug_details(const FlowKey& flow) { const auto src_ip = to_ipv4_string(flow.src); const auto dst_ip = to_ipv4_string(flow.dst); + const bool have_proto = flow.ip_proto != 0; std::string tag; - tag.reserve(src_ip.size() + dst_ip.size() + 16); + tag.reserve(src_ip.size() + dst_ip.size() + (have_proto ? 24 : 16)); tag.push_back('{'); + if (have_proto) { + tag.append(proto_label(flow.ip_proto)); + tag.push_back('-'); + } tag.append(src_ip); tag.push_back('-'); tag.append(dst_ip); diff --git a/src/app/worker/penny_worker.cpp b/src/app/worker/penny_worker.cpp index 85aa22c..e82edf4 100644 --- a/src/app/worker/penny_worker.cpp +++ b/src/app/worker/penny_worker.cpp @@ -6,6 +6,7 @@ #include "openpenny/egress/PacketSink.h" #include "openpenny/log/Log.h" +#include #include #include #include @@ -297,9 +298,7 @@ int main(int argc, char** argv) { const uint64_t aggregates_snapshots = aggregates_enabled ? summary.drop_snapshots.size() : 0; openpenny::app::AggregatedCounters agg_snapshot{}; if (is_active_mode) { - agg_snapshot = res.aggregates_snapshot - ? *res.aggregates_snapshot - : openpenny::app::aggregate_counters(); + agg_snapshot = openpenny::app::aggregate_counters(); } std::cout << "aggregates_status=" << aggregates_status_str << "\n"; std::cout << "aggregates_decision_complete=" << (aggregates_done ? 1 : 0) << "\n"; @@ -315,6 +314,12 @@ int main(int argc, char** argv) { std::cout << "aggregate_flows_not_closed_loop=" << agg_snapshot.flows_not_closed_loop << "\n"; std::cout << "aggregate_flows_rst=" << agg_snapshot.flows_rst << "\n"; std::cout << "aggregate_flows_duplicates_exceeded=" << agg_snapshot.flows_duplicates_exceeded << "\n"; + const uint64_t closed_loop_flows_found = std::max( + agg_snapshot.flows_closed_loop, + res.closed_loop_flow_summaries.size()); + const uint64_t duplicate_exceeded_flows_found = std::max( + agg_snapshot.flows_duplicates_exceeded, + res.duplicate_exceeded_flow_summaries.size()); // Emit JSON summary similar to CLI output. nlohmann::json j; j["test_id"] = args.test_id; @@ -357,6 +362,35 @@ int main(int argc, char** argv) { {"rst", agg_snapshot.flows_rst}, {"duplicates_exceeded", agg_snapshot.flows_duplicates_exceeded} }; + j["closed_loop_flows_found"] = closed_loop_flows_found; + j["duplicate_exceeded_flows_found"] = duplicate_exceeded_flows_found; + j["closed_loop_flows"] = nlohmann::json::array(); + for (const auto& line : res.closed_loop_flow_summaries) { + j["closed_loop_flows"].push_back(line); + } + j["duplicate_exceeded_flows"] = nlohmann::json::array(); + for (const auto& line : res.duplicate_exceeded_flow_summaries) { + j["duplicate_exceeded_flows"].push_back(line); + } + std::string end_state; + if (aggregates_done) { + end_state = "Aggregates completed (" + aggregates_status_str + ")"; + } else if (res.penny_completed) { + end_state = is_active_mode + ? "Penny heuristics completed" + : "Passive pipeline completed (flows=" + std::to_string(res.passive_flows_finished) + ")"; + } else { + end_state = "Reader/pipeline error"; + } + if (closed_loop_flows_found > 0) { + end_state += ", found " + std::to_string(closed_loop_flows_found) + " closed-loop flow"; + if (closed_loop_flows_found != 1) end_state += "s"; + } + if (duplicate_exceeded_flows_found > 0) { + end_state += ", found " + std::to_string(duplicate_exceeded_flows_found) + " duplicate-exceeded flow"; + if (duplicate_exceeded_flows_found != 1) end_state += "s"; + } + j["end_state"] = end_state; // Aggregate snapshot counters, if available. if (res.passive_flows_finished > 0 || !res.passive_gap_summaries.empty()) { nlohmann::json passive; diff --git a/src/egress/RawNicSink.cpp b/src/egress/RawNicSink.cpp index 963c4ae..291b935 100644 --- a/src/egress/RawNicSink.cpp +++ b/src/egress/RawNicSink.cpp @@ -33,10 +33,12 @@ RawNicSink::~RawNicSink() { close(); } -bool RawNicSink::open() { +int RawNicSink::open_socket_fd(bool resolve_ifindex, bool log_failures) { if (cfg_.device.empty()) { - TCPLOG_ERROR("RawNicSink: device name is required%s", ""); - return false; + if (log_failures) { + TCPLOG_ERROR("RawNicSink: device name is required%s", ""); + } + return -1; } // SOCK_RAW (not SOCK_DGRAM): we want to forward the original frame @@ -47,60 +49,85 @@ bool RawNicSink::open() { // destination. SOCK_RAW preserves the original L2 verbatim. // // ETH_P_ALL on the protocol so we can write any frame type. - fd_ = ::socket(AF_PACKET, SOCK_RAW | SOCK_NONBLOCK, htons(ETH_P_ALL)); - if (fd_ < 0) { - TCPLOG_ERROR("RawNicSink: socket(AF_PACKET, SOCK_RAW) failed: %s (need CAP_NET_RAW)", - std::strerror(errno)); - return false; + int fd = ::socket(AF_PACKET, SOCK_RAW | SOCK_NONBLOCK, htons(ETH_P_ALL)); + if (fd < 0) { + if (log_failures) { + TCPLOG_ERROR("RawNicSink: socket(AF_PACKET, SOCK_RAW) failed: %s (need CAP_NET_RAW)", + std::strerror(errno)); + } + return -1; } - // Resolve ifindex once so the hot path doesn't need another syscall. - ifreq ifr{}; - std::strncpy(ifr.ifr_name, cfg_.device.c_str(), IFNAMSIZ - 1); - if (::ioctl(fd_, SIOCGIFINDEX, &ifr) != 0) { - const int saved = errno; - TCPLOG_ERROR("RawNicSink: SIOCGIFINDEX('%s') failed: %s", - cfg_.device.c_str(), std::strerror(saved)); - ::close(fd_); - fd_ = -1; - errno = saved; - return false; + if (resolve_ifindex || if_index_ <= 0) { + ifreq ifr{}; + std::strncpy(ifr.ifr_name, cfg_.device.c_str(), IFNAMSIZ - 1); + if (::ioctl(fd, SIOCGIFINDEX, &ifr) != 0) { + const int saved = errno; + if (log_failures) { + TCPLOG_ERROR("RawNicSink: SIOCGIFINDEX('%s') failed: %s", + cfg_.device.c_str(), std::strerror(saved)); + } + ::close(fd); + errno = saved; + return -1; + } + if_index_ = ifr.ifr_ifindex; } - if_index_ = ifr.ifr_ifindex; - // Bind to the interface so sendto(2) without a sockaddr works too, and - // so the kernel drops incoming frames targeted at other ifaces. sockaddr_ll addr{}; addr.sll_family = AF_PACKET; addr.sll_protocol = htons(ETH_P_ALL); addr.sll_ifindex = if_index_; - if (::bind(fd_, reinterpret_cast(&addr), sizeof(addr)) != 0) { + if (::bind(fd, reinterpret_cast(&addr), sizeof(addr)) != 0) { const int saved = errno; - TCPLOG_ERROR("RawNicSink: bind to '%s' (ifindex=%d) failed: %s", - cfg_.device.c_str(), if_index_, std::strerror(saved)); - ::close(fd_); - fd_ = -1; - if_index_ = -1; + if (log_failures) { + TCPLOG_ERROR("RawNicSink: bind to '%s' (ifindex=%d) failed: %s", + cfg_.device.c_str(), if_index_, std::strerror(saved)); + } + ::close(fd); errno = saved; - return false; + return -1; } if (cfg_.raw_nic_bind_device) { - // Redundant with the bind() above on modern kernels, but harmless, - // and it mirrors the IPPROTO_RAW path for consistency. - if (::setsockopt(fd_, SOL_SOCKET, SO_BINDTODEVICE, + if (::setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, cfg_.device.c_str(), cfg_.device.size()) != 0) { TCPLOG_WARN("RawNicSink: SO_BINDTODEVICE('%s') failed: %s", cfg_.device.c_str(), std::strerror(errno)); } } + int sndbuf = 16 * 1024 * 1024; + if (::setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf, sizeof(sndbuf)) != 0) { + TCPLOG_WARN("RawNicSink: SO_SNDBUF(%d) failed: %s", + sndbuf, std::strerror(errno)); + } + + return fd; +} + +bool RawNicSink::open() { + fd_ = open_socket_fd(true, true); + if (fd_ < 0) { + return false; + } + TCPLOG_INFO("RawNicSink: opened (fd=%d, device='%s', ifindex=%d)", fd_, cfg_.device.c_str(), if_index_); return true; } void RawNicSink::close() noexcept { + std::vector to_close; + { + std::lock_guard lock(fds_mtx_); + to_close.swap(additional_fds_); + } + for (int fd : to_close) { + if (fd >= 0) { + ::close(fd); + } + } if (fd_ >= 0) { ::close(fd_); fd_ = -1; @@ -108,8 +135,37 @@ void RawNicSink::close() noexcept { if_index_ = -1; } +int RawNicSink::thread_fd() { + thread_local int t_fd = -1; + thread_local const RawNicSink* t_owner = nullptr; + if (t_owner == this && t_fd >= 0) { + return t_fd; + } + if (fd_ < 0 || if_index_ <= 0) { + t_owner = this; + t_fd = -1; + return t_fd; + } + + int fd = open_socket_fd(false, false); + if (fd < 0) { + t_owner = this; + t_fd = fd_; + return t_fd; + } + + { + std::lock_guard lock(fds_mtx_); + additional_fds_.push_back(fd); + } + t_owner = this; + t_fd = fd; + return t_fd; +} + bool RawNicSink::write(const net::PacketView& packet) { - if (fd_ < 0) { + const int fd = thread_fd(); + if (fd < 0) { return false; } @@ -147,7 +203,7 @@ bool RawNicSink::write(const net::PacketView& packet) { dst.sll_protocol = htons(ETH_P_ALL); dst.sll_ifindex = if_index_; - const ssize_t written = ::sendto(fd_, + const ssize_t written = ::sendto(fd, buf, static_cast(len), 0, @@ -158,12 +214,21 @@ bool RawNicSink::write(const net::PacketView& packet) { return true; } const int err = errno; - if (err != EAGAIN && err != EWOULDBLOCK) { - TCPLOG_WARN("RawNicSink::write (%u bytes) failed on fd=%d (device='%s'): %s", - static_cast(len), fd_, - cfg_.device.c_str(), std::strerror(err)); + if (err == EAGAIN || err == EWOULDBLOCK) { stats_.errors.fetch_add(1, std::memory_order_relaxed); + if (!backpressure_logged_.exchange(true, std::memory_order_relaxed)) { + TCPLOG_WARN( + "RawNicSink: TX backpressure on fd=%d (EAGAIN/EWOULDBLOCK); " + "dropping packets. This can induce real TCP retransmissions at " + "high rates because OpenPenny does not keep a copy-backed TX queue.", + fd); + } + return false; } + TCPLOG_WARN("RawNicSink::write (%u bytes) failed on fd=%d (device='%s'): %s", + static_cast(len), fd, + cfg_.device.c_str(), std::strerror(err)); + stats_.errors.fetch_add(1, std::memory_order_relaxed); return false; } diff --git a/src/egress/RawSocketSink.cpp b/src/egress/RawSocketSink.cpp index 59c7707..2d516d0 100644 --- a/src/egress/RawSocketSink.cpp +++ b/src/egress/RawSocketSink.cpp @@ -28,30 +28,42 @@ RawSocketSink::~RawSocketSink() { close(); } -bool RawSocketSink::open() { - fd_ = ::socket(AF_INET, SOCK_RAW | SOCK_NONBLOCK, IPPROTO_RAW); - if (fd_ < 0) { - TCPLOG_ERROR("RawSocketSink: socket(AF_INET, SOCK_RAW, IPPROTO_RAW) failed: %s", - std::strerror(errno)); - return false; +int RawSocketSink::open_socket_fd(bool log_failures) { + int fd = ::socket(AF_INET, SOCK_RAW | SOCK_NONBLOCK, IPPROTO_RAW); + if (fd < 0) { + if (log_failures) { + TCPLOG_ERROR("RawSocketSink: socket(AF_INET, SOCK_RAW, IPPROTO_RAW) failed: %s", + std::strerror(errno)); + } + return -1; } if (!cfg_.device.empty()) { - if (::setsockopt(fd_, SOL_SOCKET, SO_BINDTODEVICE, + if (::setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, cfg_.device.c_str(), cfg_.device.size()) != 0) { const int saved = errno; TCPLOG_WARN("RawSocketSink: SO_BINDTODEVICE('%s') failed: %s", cfg_.device.c_str(), std::strerror(saved)); - // SO_BINDTODEVICE requires CAP_NET_RAW; treat as non-fatal so - // the sink still works when the operator just hasn't named a - // preferred egress device. } } - // IPPROTO_RAW already implies IP_HDRINCL, but set it explicitly so the - // behaviour is obvious to reviewers tracing packet construction. int one = 1; - (void)::setsockopt(fd_, IPPROTO_IP, IP_HDRINCL, &one, sizeof(one)); + (void)::setsockopt(fd, IPPROTO_IP, IP_HDRINCL, &one, sizeof(one)); + + int sndbuf = 16 * 1024 * 1024; + if (::setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf, sizeof(sndbuf)) != 0) { + TCPLOG_WARN("RawSocketSink: SO_SNDBUF(%d) failed: %s", + sndbuf, std::strerror(errno)); + } + + return fd; +} + +bool RawSocketSink::open() { + fd_ = open_socket_fd(true); + if (fd_ < 0) { + return false; + } TCPLOG_INFO("RawSocketSink: opened (fd=%d, device='%s')", fd_, cfg_.device.c_str()); @@ -59,18 +71,60 @@ bool RawSocketSink::open() { } void RawSocketSink::close() noexcept { + std::vector to_close; + { + std::lock_guard lock(fds_mtx_); + to_close.swap(additional_fds_); + } + for (int fd : to_close) { + if (fd >= 0) { + ::close(fd); + } + } if (fd_ >= 0) { ::close(fd_); fd_ = -1; } } +int RawSocketSink::thread_fd() { + thread_local int t_fd = -1; + thread_local const RawSocketSink* t_owner = nullptr; + if (t_owner == this && t_fd >= 0) { + return t_fd; + } + if (fd_ < 0) { + t_owner = this; + t_fd = -1; + return t_fd; + } + + int fd = open_socket_fd(false); + if (fd < 0) { + t_owner = this; + t_fd = fd_; + return t_fd; + } + + { + std::lock_guard lock(fds_mtx_); + additional_fds_.push_back(fd); + } + t_owner = this; + t_fd = fd; + return t_fd; +} + bool RawSocketSink::write(const net::PacketView& packet) { - if (fd_ < 0 || !packet.layer3_ptr || packet.layer3_length < 20) { + if (!packet.layer3_ptr || packet.layer3_length < 20) { // IPv4 header is at least 20 bytes; anything shorter isn't a // routable datagram and the kernel would reject it anyway. return false; } + const int fd = thread_fd(); + if (fd < 0) { + return false; + } sockaddr_in dst{}; dst.sin_family = AF_INET; @@ -80,7 +134,7 @@ bool RawSocketSink::write(const net::PacketView& packet) { std::memcpy(&dst.sin_addr.s_addr, packet.layer3_ptr + 16, sizeof(dst.sin_addr.s_addr)); - const ssize_t written = ::sendto(fd_, + const ssize_t written = ::sendto(fd, packet.layer3_ptr, static_cast(packet.layer3_length), 0, @@ -92,9 +146,14 @@ bool RawSocketSink::write(const net::PacketView& packet) { } const int err = errno; if (err == EAGAIN || err == EWOULDBLOCK) { - // Transient back-pressure on a non-blocking raw socket; the - // packet is dropped and no error is recorded (the same policy - // the active path uses). + stats_.errors.fetch_add(1, std::memory_order_relaxed); + if (!backpressure_logged_.exchange(true, std::memory_order_relaxed)) { + TCPLOG_WARN( + "RawSocketSink: TX backpressure on fd=%d (EAGAIN/EWOULDBLOCK); " + "dropping packets. This can induce real TCP retransmissions at " + "high rates because OpenPenny does not keep a copy-backed TX queue.", + fd); + } return false; } if (err == EMSGSIZE) { @@ -116,14 +175,14 @@ bool RawSocketSink::write(const net::PacketView& packet) { "the packet size. Further oversized drops will be " "counted silently.", static_cast(packet.layer3_length), - fd_, + fd, cfg_.device.empty() ? "" : cfg_.device.c_str(), static_cast(packet.layer3_length)); } return false; } TCPLOG_WARN("RawSocketSink::write (%u bytes) failed on fd=%d: %s", - static_cast(packet.layer3_length), fd_, + static_cast(packet.layer3_length), fd, std::strerror(err)); stats_.errors.fetch_add(1, std::memory_order_relaxed); return false; diff --git a/src/egress/TunSink.cpp b/src/egress/TunSink.cpp index bac3cbf..996fd15 100644 --- a/src/egress/TunSink.cpp +++ b/src/egress/TunSink.cpp @@ -294,12 +294,21 @@ bool TunSink::write(const net::PacketView& packet) { return true; } const int err = errno; - if (err != EAGAIN && err != EWOULDBLOCK) { - TCPLOG_WARN("TunSink::write (%u bytes) failed on fd=%d: %s", - static_cast(packet.layer3_length), fd, - std::strerror(err)); + if (err == EAGAIN || err == EWOULDBLOCK) { stats_.errors.fetch_add(1, std::memory_order_relaxed); + if (!backpressure_logged_.exchange(true, std::memory_order_relaxed)) { + TCPLOG_WARN( + "TunSink: TX backpressure on fd=%d (EAGAIN/EWOULDBLOCK); " + "dropping packets. This can induce real TCP retransmissions at " + "high rates because OpenPenny does not keep a copy-backed TX queue.", + fd); + } + return false; } + TCPLOG_WARN("TunSink::write (%u bytes) failed on fd=%d: %s", + static_cast(packet.layer3_length), fd, + std::strerror(err)); + stats_.errors.fetch_add(1, std::memory_order_relaxed); return false; } diff --git a/src/grpc/PennyService.cpp b/src/grpc/PennyService.cpp index 46ac615..08e524a 100644 --- a/src/grpc/PennyService.cpp +++ b/src/grpc/PennyService.cpp @@ -1041,8 +1041,15 @@ ::grpc::Status PennyServiceImpl::StartTest(::grpc::ServerContext*, ? (aggregates_decision_complete ? "completed" : "running") : "n/a"; - // Build a JSON summary akin to the CLI output. - nlohmann::json summary; + // Build a JSON summary akin to the CLI output, preserving any + // worker-emitted detail sections that do not have dedicated proto fields. + nlohmann::json summary = nlohmann::json::object(); + if (!response->json_summary().empty()) { + auto parsed = nlohmann::json::parse(response->json_summary(), nullptr, false); + if (parsed.is_object()) { + summary = std::move(parsed); + } + } summary["test_id"] = response->test_id(); summary["status"] = response->status(); summary["packets"] = { diff --git a/src/ingress/af_xdp/XdpReader.cpp b/src/ingress/af_xdp/XdpReader.cpp index cc7c379..0ed8dd0 100644 --- a/src/ingress/af_xdp/XdpReader.cpp +++ b/src/ingress/af_xdp/XdpReader.cpp @@ -53,7 +53,7 @@ static uint64_t now_ns() { struct SharedAttachState { std::mutex mutex; - unsigned refs{0}; + unsigned refs{0}; ///< Workers currently opening or opened on this shared attach state. bool rss_checked{false}; ///< Only the first-opening worker runs the RSS coverage check. #ifdef OPENPENNY_WITH_LIBBPF bool attached{false}; @@ -511,10 +511,10 @@ bool XdpReader::open(const std::string& ifname, unsigned queue) { return false; } - // Serialise the per-interface attach / map-pin dance across worker - // threads so two queue workers on the same NIC can't race when creating - // or pinning the shared BPF objects. - std::lock_guard shared_lock(impl.shared_attach->mutex); + // Serialize queue-worker bring-up against the shared attach state so + // xsks_map publication and live-rule activation happen in a well-defined + // order across every queue. + std::unique_lock shared_lock(impl.shared_attach->mutex); if (impl.tuning.verbose) { TCPLOG_INFO("Attempting AF_XDP reader on %s queue %u", ifname.c_str(), queue); @@ -930,10 +930,17 @@ bool XdpReader::open(const std::string& ifname, unsigned queue) { const bool shared_reader_already_open = impl.shared_attach->refs > 0; bool pins_ok = false; - if (shared_reader_already_open && open_maps_from_pins()) { - pins_ok = true; + if (shared_reader_already_open) { + if (!open_maps_from_pins()) { + TCPLOG_ERROR("Shared AF_XDP maps are unavailable for %s queue %u; " + "ensure bpffs pins remain accessible while using " + "multiple queues.", + ifname.c_str(), queue); + cleanup(); + return false; + } rs.pinned_maps = true; - rs.xdp_flags = impl.shared_attach->xdp_flags; + pins_ok = true; } else if (impl.tuning.reuse_pins && open_maps_from_pins()) { bool stale_pins = false; bpf_map_info conf_info{}; @@ -988,11 +995,6 @@ bool XdpReader::open(const std::string& ifname, unsigned queue) { pins_ok = true; rs.pinned_maps = true; } - } else if (shared_reader_already_open) { - TCPLOG_ERROR("Pinned AF_XDP maps are not available for shared queue startup on %s.", - ifname.c_str()); - cleanup(); - return false; } if (!pins_ok) { @@ -1104,27 +1106,18 @@ bool XdpReader::open(const std::string& ifname, unsigned queue) { // Real match rules are deferred to the last worker. // - // Why: worker setup is serialised through shared_attach->mutex and - // takes ~80-100 ms per worker (UMEM alloc + bind + fill-ring prime). - // With queue_count=63 that's a 5+ second startup window. If worker 0 - // publishes the real rules during ITS open(), the BPF program starts - // redirecting matched packets immediately — but only xsks_map[0] is - // populated, so packets to queues 1..62 hit xsk_miss until each later - // worker registers. We saw this in the wild: after a 9k-packet burst, - // 2946 xsk_hit (queue 0) and 6213 xsk_miss (the rest). + // Why: worker setup is serialized through shared_attach->mutex and can + // take noticeable time per queue (UMEM alloc + bind + fill-ring prime). + // If worker 0 publishes the real rules during its own open(), the BPF + // program starts redirecting matched packets immediately while later + // queues still have no xsks_map entry yet. // // Fix: every worker publishes pass-only-defaults during worker 0's // open (so the program never blackholes), then the LAST worker swaps // to the real rules once every queue has registered its socket. // - // "Last worker" check: we bump refs BEFORE the check so refs reflects - // the total number of workers that have completed setup, including - // this one. With queue_count=N, the worker that observes refs == N - // after its own increment is the last and owns the rule swap. - // - // Transfer ownership of the attach from this reader to the shared - // state first so the program stays attached if this worker closes - // early, then bump refs and -- if we're last -- publish real rules. + // "Last worker" check: refs is bumped after this worker finishes setup, + // so the worker that observes refs == queue_count owns the real-rule swap. if (rs.attached) { impl.shared_attach->attached = true; impl.shared_attach->ifindex = rs.ifindex; diff --git a/src/net/PacketParser.cpp b/src/net/PacketParser.cpp index e889a16..41239ea 100644 --- a/src/net/PacketParser.cpp +++ b/src/net/PacketParser.cpp @@ -170,6 +170,7 @@ bool PacketParser::decode(const uint8_t* frame, std::size_t length, PacketView& view.flow.dst = dst; view.flow.sport = sport; view.flow.dport = dport; + view.flow.ip_proto = proto; view.ip_proto = proto; diff --git a/src/net/TrafficMatch.cpp b/src/net/TrafficMatch.cpp index 8c25bf8..a03858c 100644 --- a/src/net/TrafficMatch.cpp +++ b/src/net/TrafficMatch.cpp @@ -39,9 +39,7 @@ bool ip_matches(std::uint32_t value, const TrafficIpPrefix& prefix) { return (value & prefix.mask_host) == (prefix.prefix_host & prefix.mask_host); } -bool rule_matches_flow(const TrafficMatchRule& rule, const FlowKey& key) { - if (!rule.enabled) return false; - +bool rule_matches_endpoints(const TrafficMatchRule& rule, const FlowKey& key) { if (rule.src_ip && !ip_matches(key.src, *rule.src_ip)) return false; if (rule.dst_ip && !ip_matches(key.dst, *rule.dst_ip)) return false; @@ -51,8 +49,16 @@ bool rule_matches_flow(const TrafficMatchRule& rule, const FlowKey& key) { return true; } +bool rule_matches_flow(const TrafficMatchRule& rule, const FlowKey& key) { + if (!rule.enabled) return false; + if (!rule_matches_endpoints(rule, key)) return false; + if (rule.ip_proto && key.ip_proto != *rule.ip_proto) return false; + return true; +} + bool rule_matches_packet(const TrafficMatchRule& rule, const PacketView& packet) { - if (!rule_matches_flow(rule, packet.flow)) return false; + if (!rule.enabled) return false; + if (!rule_matches_endpoints(rule, packet.flow)) return false; if (rule.ip_proto && packet.ip_proto != *rule.ip_proto) return false; return true; } diff --git a/src/penny/flow/engine/FlowEngine.cpp b/src/penny/flow/engine/FlowEngine.cpp index df645b5..c579495 100644 --- a/src/penny/flow/engine/FlowEngine.cpp +++ b/src/penny/flow/engine/FlowEngine.cpp @@ -4,6 +4,7 @@ #include "openpenny/penny/flow/engine/FlowEvaluation.h" #include "openpenny/app/core/OpenpennyPipelineDriver.h" #include "openpenny/app/core/PerThreadStats.h" +#include "openpenny/app/core/RuntimeSetup.h" #include "openpenny/log/Log.h" #include "openpenny/app/core/utils/FlowDebug.h" @@ -33,6 +34,35 @@ void FlowEngine::set_drop_sink(DropSnapshotSink sink) { drop_sink_ = std::move(sink); } +void FlowEngine::set_snapshot_refresh_sink(SnapshotRefreshSink sink) { + snapshot_refresh_sink_ = std::move(sink); +} + +void FlowEngine::publish_snapshot_refresh(std::size_t start_index) { + if (!snapshot_refresh_sink_) { + return; + } + if (start_index >= flow_drop_snapshots_.size()) { + return; + } + snapshot_refresh_sink_(flow_key_, flow_drop_snapshots_, start_index); +} + +void FlowEngine::publish_single_snapshot_update(PacketDropId packet_id, + std::size_t snapshot_index) { + if (snapshot_refresh_sink_) { + publish_snapshot_refresh(snapshot_index); + return; + } + if (!drop_sink_) { + return; + } + if (snapshot_index >= flow_drop_snapshots_.size()) { + return; + } + drop_sink_(flow_key_, packet_id, flow_drop_snapshots_[snapshot_index].second); +} + void FlowEngine::reset() { ThreadFlowEventTimerManager::instance().purge_flow(this); flow_drops_enforced_ = 0; @@ -215,15 +245,20 @@ void FlowEngine::register_duplicate_snapshot(uint32_t seq) { // Snanpshots are ordered by insertion; once we find the first snapshot whose coverage // includes this seq (highest_seq >= seq), all later snapshots should reflect the duplicate. bool update = false; + std::size_t first_updated_index = flow_drop_snapshots_.size(); for (size_t i = 0; i < flow_drop_snapshots_.size(); ++i) { auto& snap = flow_drop_snapshots_[i].second; if (!update && snap.stats.highest_seq() >= seq) { update = true; + first_updated_index = i; } if (update) { snap.stats.record_duplicate_packet(); } } + if (update) { + publish_snapshot_refresh(first_updated_index); + } } void FlowEngine::evaluate_snapshot_duplicate_threshold() { @@ -273,8 +308,7 @@ bool FlowEngine::drop_packet(uint32_t start, } if (max_drops_in_aggregates > 0) { - const auto& runtime = openpenny::current_runtime_setup(); - if (runtime.aggregates_active && + if (openpenny::current_aggregates_active() && !openpenny::app::try_reserve_aggregate_drop(max_drops_in_aggregates)) { // Best-effort global drop budget has been exhausted. The atomic // per-worker counters may still allow a small overshoot under @@ -319,7 +353,8 @@ bool FlowEngine::drop_packet(uint32_t start, const size_t snapshot_index = flow_drop_snapshots_.size() - 1; flow_snapshot_index_by_id_[packet_id] = snapshot_index; - // Timer thread will later emit a callback (via ThreadFlowEventTimerManager) that we apply on this thread. + // The owning worker thread will later drain this scheduled timeout/event + // via ThreadFlowEventTimerManager and apply the callback inline. ThreadFlowEventTimerManager::instance().register_drop(key, packet_id, snap.timestamp, flow_alive_flag_, this, snapshot_index); // Register the gap in the SEQ space. @@ -328,7 +363,7 @@ bool FlowEngine::drop_packet(uint32_t start, if (TCPLOG_ENABLED(INFO)) { const auto flow_tag = ::openpenny::flow_debug_details(key); TCPLOG_INFO( - "[drop] flow=%s seq_range=%" PRIu32 "-%" PRIu32 " (len=%" PRIu32 ")", + "[drop_event] action=drop flow=%s start_seq=%" PRIu32 " end_seq=%" PRIu32 " len=%" PRIu32, flow_tag.c_str(), start, end, @@ -406,6 +441,8 @@ void FlowEngine::mark_snapshot_retransmitted(PacketDropId packet_id) { // Remove the packet → snapshot mapping; the snapshot is resolved. flow_snapshot_index_by_id_.erase(index_it); + + publish_single_snapshot_update(packet_id, idx); } /** @@ -481,9 +518,7 @@ void FlowEngine::mark_snapshot_expired(PacketDropId packet_id) { // We no longer need to look up this snapshot by packet ID. flow_snapshot_index_by_id_.erase(index_it); - if (drop_sink_) { - drop_sink_(flow_key_, packet_id, snapshot); - } + publish_single_snapshot_update(packet_id, idx); } /** @@ -531,6 +566,8 @@ void FlowEngine::mark_snapshot_invalid(PacketDropId packet_id) { // Adjust flow-wide pending retransmission statistics. flow_stats_.dec_pending_retransmission(); + auto& counters = openpenny::app::current_thread_counters(); + if (counters.pending_retransmissions > 0) counters.pending_retransmissions--; // Ensure snapshots recorded after this one remain statistically consistent. // They may still include this packet as pending, so remove that dependency. @@ -545,6 +582,8 @@ void FlowEngine::mark_snapshot_invalid(PacketDropId packet_id) { // Remove the packet → snapshot index mapping; this snapshot is now resolved. flow_snapshot_index_by_id_.erase(index_it); + + publish_single_snapshot_update(packet_id, idx); } void FlowEngine::expire_all_pending_snapshots() { @@ -560,6 +599,33 @@ void FlowEngine::expire_all_pending_snapshots() { } } +void FlowEngine::resolve_pending_snapshots(const std::chrono::steady_clock::time_point& now) { + std::vector expired_ids; + std::vector invalid_ids; + expired_ids.reserve(flow_drop_snapshots_.size()); + invalid_ids.reserve(flow_drop_snapshots_.size()); + + const bool timeout_enabled = flow_cfg_.rtt_timeout_factor > 0.0; + const auto timeout = std::chrono::duration_cast( + std::chrono::duration(flow_cfg_.rtt_timeout_factor)); + + for (const auto& pair : flow_drop_snapshots_) { + if (pair.second.state != SnapshotState::Pending) continue; + if (timeout_enabled && now - pair.second.timestamp >= timeout) { + expired_ids.push_back(pair.first); + } else { + invalid_ids.push_back(pair.first); + } + } + + for (const auto& id : expired_ids) { + mark_snapshot_expired(id); + } + for (const auto& id : invalid_ids) { + mark_snapshot_invalid(id); + } +} + FlowEngine::FlowDecision FlowEngine::evaluate() const { const auto eval = evaluate_flow_decision( @@ -575,11 +641,27 @@ FlowEngine::FlowDecision FlowEngine::evaluate() const { const double miss_prob = std::clamp(flow_cfg_.retransmission_miss_probability, 0.0, 1.0); const auto flow_tag = flow_debug_details(flow_key_); + const auto* verdict_text = [&]() -> const char* { + switch (eval.decision) { + case FlowDecision::FINISHED_CLOSED_LOOP: + return "closed_loop"; + case FlowDecision::FINISHED_NOT_CLOSED_LOOP: + return "not_closed_loop"; + case FlowDecision::FINISHED_DUPLICATE_EXCEEDED: + return "duplicates_exceeded"; + case FlowDecision::FINISHED_NO_DECISION: + return "no_decision"; + case FlowDecision::PENDING: + default: + return "pending"; + } + }(); TCPLOG_INFO( - "[flow_eval] flow=%s data_pkts=%llu dup_pkts=%llu rtx_pkts=%llu non_rtx_pkts=%llu " - "dup_ratio=%.6f miss_prob=%.6f p_closed=%.6f p_not_closed=%.6f denom=%.6f closed_weight=%.6f", + "[flow_eval] flow=%s verdict=%s data=%llu dup=%llu rtx=%llu non_rtx=%llu " + "dup_ratio=%.6f miss_prob=%.6f p_closed=%.6f p_not_closed=%.6f closed_weight=%.6f", flow_tag.c_str(), + verdict_text, static_cast(data_pkts), static_cast(dup_pkts), static_cast(retransmitted), @@ -588,7 +670,6 @@ FlowEngine::FlowDecision FlowEngine::evaluate() const { miss_prob, eval.p_closed, eval.p_not_closed, - eval.p_closed + eval.p_not_closed, eval.closed_weight); } @@ -611,6 +692,16 @@ void FlowEngine::evaluate_if_ready() { return; // Decision already made; keep it. } + const bool aggregate_phase_configured = + flow_cfg_.aggregates_enabled && + flow_cfg_.max_drops_aggregates > 0; + const auto aggregates_status = openpenny::current_aggregates_status(); + if (aggregate_phase_configured && + aggregates_status != RuntimeStatus::AggregatesStatus::NON_CLOSED_LOOP && + aggregates_status != RuntimeStatus::AggregatesStatus::DUPLICATES_EXCEEDED) { + return; + } + // Do not evaluate if we have not observed any data packets; the classifier // requires data-bearing evidence. if (flow_stats_.data_packets() == 0) { diff --git a/src/penny/flow/manager/ThreadFlowManager.cpp b/src/penny/flow/manager/ThreadFlowManager.cpp index c5703a2..417e8f8 100644 --- a/src/penny/flow/manager/ThreadFlowManager.cpp +++ b/src/penny/flow/manager/ThreadFlowManager.cpp @@ -9,16 +9,27 @@ namespace openpenny::penny { ThreadFlowManager::ThreadFlowManager() = default; -ThreadFlowManager::ThreadFlowManager(const Config::ActiveConfig& cfg) : table_cfg_(cfg) {} +ThreadFlowManager::ThreadFlowManager(const Config::ActiveConfig& cfg) : table_cfg_(cfg) { + reserve_for_config(cfg); +} void ThreadFlowManager::configure(const Config::ActiveConfig& cfg) { table_cfg_ = cfg; + reserve_for_config(cfg); for (auto& [_, entry] : table_active_flows_) { entry.flow.configure(table_cfg_); entry.flow.set_drop_sink(drop_sink_); + entry.flow.set_snapshot_refresh_sink(snapshot_refresh_sink_); } } +void ThreadFlowManager::reserve_for_config(const Config::ActiveConfig& cfg) { + if (cfg.max_tracked_flows == 0) return; + + table_active_flows_.reserve(cfg.max_tracked_flows); + table_completed_flows_.reserve(cfg.max_tracked_flows); +} + void ThreadFlowManager::set_drop_sink(FlowEngine::DropSnapshotSink sink) { drop_sink_ = std::move(sink); for (auto& [_, entry] : table_active_flows_) { @@ -26,28 +37,35 @@ void ThreadFlowManager::set_drop_sink(FlowEngine::DropSnapshotSink sink) { } } -bool ThreadFlowManager::add_new_flow(const FlowKey& key, - uint32_t seq, - uint32_t payload_bytes, - bool is_syn, - const std::chrono::steady_clock::time_point& ts) { - +void ThreadFlowManager::set_snapshot_refresh_sink(FlowEngine::SnapshotRefreshSink sink) { + snapshot_refresh_sink_ = std::move(sink); + for (auto& [_, entry] : table_active_flows_) { + entry.flow.set_snapshot_refresh_sink(snapshot_refresh_sink_); + } +} + +FlowEngineEntry* ThreadFlowManager::add_new_flow(const FlowKey& key, + uint32_t seq, + uint32_t payload_bytes, + bool is_syn, + const std::chrono::steady_clock::time_point& ts) { // Ignore ACK packets with no payload when deciding whether to start monitoring a new flow. if (payload_bytes == 0 && !is_syn) { - return false; + return nullptr; } // try_emplace: insert a new entry if the key is absent, otherwise return the existing one without extra copies. auto [it, inserted] = table_active_flows_.try_emplace(key); auto& entry = it->second; if (!inserted) { - return false; + return nullptr; } auto& counters = openpenny::app::current_thread_counters(); counters.flows_monitored++; counters.active_flows++; entry.flow.configure(table_cfg_); // apply current config for counters/thresholds entry.flow.set_drop_sink(drop_sink_); + entry.flow.set_snapshot_refresh_sink(snapshot_refresh_sink_); entry.flow.set_flow_key(key); // stash identifiers once entry.last_seen = ts; entry.first_seen = ts; @@ -61,7 +79,7 @@ bool ThreadFlowManager::add_new_flow(const FlowKey& key, (void)end_seq; // end_seq retained for potential future use } entry.flow.record_packet(); // count the first packet - return true; + return &entry; } void ThreadFlowManager::track_packet(const ::openpenny::net::PacketView& packet, @@ -71,20 +89,20 @@ void ThreadFlowManager::track_packet(const ::openpenny::net::PacketView& packet, const auto now = ts; auto it = table_active_flows_.find(packet.flow); + FlowEngineEntry* new_entry = nullptr; if (it == table_active_flows_.end()) { if (max_flows != 0 && active_flow_count(max_flows) >= max_flows) { return; } - add_new_flow(packet.flow, - packet.tcp.seq, - static_cast(packet.payload_bytes), - is_syn, - now); - it = table_active_flows_.find(packet.flow); + new_entry = add_new_flow(packet.flow, + packet.tcp.seq, + static_cast(packet.payload_bytes), + is_syn, + now); + if (!new_entry) return; } - if (it == table_active_flows_.end()) return; - auto& entry = it->second; + auto& entry = (it != table_active_flows_.end()) ? it->second : *new_entry; auto& flow = entry.flow; entry.last_seen = now; // Flow starts in PENDING_SEEN_DATA when we first see payload without SYN. @@ -156,16 +174,16 @@ bool ThreadFlowManager::complete_flow(const FlowKey& key, const char* reason) { const auto* test_status_text = [] (FlowEngine::FlowDecision status) -> const char* { switch (status) { case FlowEngine::FlowDecision::FINISHED_CLOSED_LOOP: - return "FINISHED_CLOSED_LOOP"; + return "closed_loop"; case FlowEngine::FlowDecision::FINISHED_NOT_CLOSED_LOOP: - return "FINISHED_NOT_CLOSED_LOOP"; + return "not_closed_loop"; case FlowEngine::FlowDecision::FINISHED_DUPLICATE_EXCEEDED: - return "FINISHED_DUPLICATE_EXCEEDED"; + return "duplicates_exceeded"; case FlowEngine::FlowDecision::FINISHED_NO_DECISION: - return "FINISHED_NO_DECISION"; + return "no_decision"; case FlowEngine::FlowDecision::PENDING: default: - return "PENDING"; + return "pending"; } }(flow.final_decision()); @@ -173,9 +191,9 @@ bool ThreadFlowManager::complete_flow(const FlowKey& key, const char* reason) { const auto flow_tag = flow_debug_details(key); TCPLOG_INFO( - "[flow_complete] reason=%s tcp_state=%s test_status=%s flow=%s " - "data_pkts=%llu dup_pkts=%llu in_order_pkts=%llu out_of_order_pkts=%llu " - "rtx_pkts=%llu non_rtx_pkts=%llu pending_rtx_pkts=%llu", + "[flow_result] stage=complete reason=%s tcp_state=%s verdict=%s flow=%s " + "data=%llu dup=%llu in_order=%llu out_of_order=%llu " + "rtx=%llu non_rtx=%llu pending_rtx=%llu", reason ? reason : "completed", tcp_state_text, test_status_text, @@ -189,8 +207,8 @@ bool ThreadFlowManager::complete_flow(const FlowKey& key, const char* reason) { static_cast(flow.pending_retransmissions())); } - // Expire any remaining pending snapshots before tearing down the flow. - entry.flow.expire_all_pending_snapshots(); + // Resolve any remaining pending snapshots before tearing down the flow. + entry.flow.resolve_pending_snapshots(std::chrono::steady_clock::now()); table_completed_flows_.insert(it->first); table_active_flows_.erase(it); @@ -214,7 +232,9 @@ bool ThreadFlowManager::complete_flow(const FlowKey& key, const char* reason) { counters.flows_not_closed_loop++; break; case FlowEngine::FlowDecision::FINISHED_DUPLICATE_EXCEEDED: - counters.flows_duplicates_exceeded++; + if (entry.state != FlowTrackingState::INTERRUPTED_DUPLICATE_EXCEEDED) { + counters.flows_duplicates_exceeded++; + } break; default: break; diff --git a/src/penny/flow/timer/ThreadFlowEventTimer.cpp b/src/penny/flow/timer/ThreadFlowEventTimer.cpp index 7fef6d7..c02408b 100644 --- a/src/penny/flow/timer/ThreadFlowEventTimer.cpp +++ b/src/penny/flow/timer/ThreadFlowEventTimer.cpp @@ -8,8 +8,8 @@ * Design principles: * 1. Expirations are prioritised to ensure snapshots age out promptly. * 2. Flow mutation never happens while holding internal locks. - * 3. All callbacks execute in the timer thread itself to avoid - * cross-thread data races. + * 3. All callbacks execute on the owning worker thread when it drains + * this manager, avoiding per-queue helper-thread context switches. * 4. Cancelled events are garbage collected lazily using a token heap. */ @@ -32,11 +32,8 @@ ThreadFlowEventTimerManager& ThreadFlowEventTimerManager::instance() { return mgr; } -std::function - ThreadFlowEventTimerManager::snapshot_hook_{}; - ThreadFlowEventTimerManager::~ThreadFlowEventTimerManager() { - stop(); // Ensure the timer thread is terminated cleanly. + stop(); // Ensure the worker-local timer state is flushed cleanly. } // ----------------------------------------------------------------------------- @@ -46,40 +43,25 @@ ThreadFlowEventTimerManager::~ThreadFlowEventTimerManager() { void ThreadFlowEventTimerManager::start(double timeout_sec) { std::lock_guard lock(mutex_); timeout_sec_ = timeout_sec; - - if (running_) return; // Prevent multiple timer threads from starting. - - stop_flag_ = false; + if (running_) return; running_ = true; - thread_ = std::thread(&ThreadFlowEventTimerManager::timer_loop, this); // Spawn background timer loop. + next_deadline_.store(kNoDeadline, std::memory_order_release); + queued_event_count_.store(0, std::memory_order_release); } void ThreadFlowEventTimerManager::stop() { - { - std::lock_guard lock(mutex_); - if (!running_) return; // No action needed if thread is not running. - stop_flag_ = true; - } - - cv_.notify_all(); // Wake sleeping thread so it can terminate. - - if (thread_.joinable()) { - thread_.join(); // Wait for graceful thread shutdown. - } - - // Reset all internal state after stopping. - { - std::lock_guard lock(mutex_); - running_ = false; - heap_ = {}; - by_id_.clear(); - by_flow_.clear(); - cancelled_.clear(); - retransmit_seen_.clear(); - events_.clear(); - callbacks_.clear(); - next_token_ = 1; - } + std::lock_guard lock(mutex_); + if (!running_) return; + running_ = false; + heap_ = {}; + by_id_.clear(); + by_flow_.clear(); + cancelled_.clear(); + retransmit_seen_.clear(); + events_.clear(); + queued_event_count_.store(0, std::memory_order_release); + next_deadline_.store(kNoDeadline, std::memory_order_release); + next_token_ = 1; } // ----------------------------------------------------------------------------- @@ -111,8 +93,11 @@ void ThreadFlowEventTimerManager::register_drop(const ::openpenny::FlowKey& key, heap_.push(e); // Add to min-heap ordered by nearest expiry first. by_id_[PacketKey{flow, packet_id}] = e; // Register lookup by (flow, packet_id). by_flow_.emplace(flow, e.token); // Track token association to flow. - - wake_locked(); // Wake timer thread to re-evaluate scheduling. + const auto deadline = e.deadline.time_since_epoch().count(); + const auto current = next_deadline_.load(std::memory_order_relaxed); + if (deadline < current) { + next_deadline_.store(deadline, std::memory_order_release); + } } void ThreadFlowEventTimerManager::enqueue_retransmitted(PacketDropId packet_id, FlowEngine* flow) { @@ -121,8 +106,7 @@ void ThreadFlowEventTimerManager::enqueue_retransmitted(PacketDropId packet_id, // Queue retransmission event for later servicing. events_.push_back(Event{Event::Kind::Retransmit, packet_id, flow, 0}); - - wake_locked(); // Wake timer loop. + queued_event_count_.store(events_.size(), std::memory_order_release); } void ThreadFlowEventTimerManager::enqueue_duplicate(FlowEngine* flow, std::uint32_t seq, std::uint32_t payload) { @@ -131,8 +115,7 @@ void ThreadFlowEventTimerManager::enqueue_duplicate(FlowEngine* flow, std::uint3 // Queue duplicate detection event for later servicing. events_.push_back(Event{Event::Kind::Duplicate, {}, flow, seq, payload}); - - wake_locked(); // Wake timer loop. + queued_event_count_.store(events_.size(), std::memory_order_release); } // ----------------------------------------------------------------------------- @@ -150,28 +133,19 @@ void ThreadFlowEventTimerManager::purge_flow(FlowEngine* flow) { } by_flow_.erase(flow); // Remove all tokens referencing flow. - retransmit_seen_.erase( - std::remove_if(retransmit_seen_.begin(), - retransmit_seen_.end(), - [flow](const PacketKey& k) { return k.flow == flow; }), - retransmit_seen_.end() - ); - - // Remove pending callbacks that reference the purged flow. - callbacks_.erase( - std::remove_if(callbacks_.begin(), callbacks_.end(), - [flow](const Callback& cb) { return cb.flow == flow; }), - callbacks_.end() - ); - // Resync the lock-free counter with the post-erase deque size so the - // drain_callbacks() fast path doesn't keep firing on stale entries. - pending_callbacks_.store(callbacks_.size(), std::memory_order_release); - - wake_locked(); // Wake timer loop to apply purge. -} - -void ThreadFlowEventTimerManager::wake_locked() { - cv_.notify_all(); // Wake timer thread (called while holding mutex_). + for (auto it = retransmit_seen_.begin(); it != retransmit_seen_.end();) { + if (it->flow == flow) { + it = retransmit_seen_.erase(it); + } else { + ++it; + } + } + events_.erase( + std::remove_if(events_.begin(), events_.end(), + [flow](const Event& ev) { return ev.flow == flow; }), + events_.end()); + queued_event_count_.store(events_.size(), std::memory_order_release); + refresh_next_deadline_locked(); } // ----------------------------------------------------------------------------- @@ -185,56 +159,52 @@ void ThreadFlowEventTimerManager::run_callbacks(std::deque& pending) { // Dispatch callback by type (snapshot mutation). if (cb.kind == Callback::Kind::Expire) { cb.flow->mark_snapshot_expired(cb.packet_id); - if (snapshot_hook_) snapshot_hook_(cb.flow, cb.packet_id, SnapshotEventKind::Expire); } else if (cb.kind == Callback::Kind::Retransmit) { cb.flow->mark_snapshot_retransmitted(cb.packet_id); - if (snapshot_hook_) snapshot_hook_(cb.flow, cb.packet_id, SnapshotEventKind::Retransmit); } else if (cb.kind == Callback::Kind::Duplicate) { cb.flow->register_duplicate_snapshot(cb.seq); cb.flow->evaluate_snapshot_duplicate_threshold(); - if (snapshot_hook_) snapshot_hook_(cb.flow, 0, SnapshotEventKind::Duplicate); } cb.flow->evaluate_if_ready(); // Re-check whether the flow now satisfies its scheduling thresholds. } } -// ----------------------------------------------------------------------------- -// Timer loop (long running background scheduling thread) -// ----------------------------------------------------------------------------- +void ThreadFlowEventTimerManager::refresh_next_deadline_locked() { + while (!heap_.empty() && cancelled_.count(heap_.top().token)) { + cancelled_.erase(heap_.top().token); + heap_.pop(); + } -void ThreadFlowEventTimerManager::timer_loop() { - std::unique_lock lock(mutex_); + if (heap_.empty()) { + next_deadline_.store(kNoDeadline, std::memory_order_release); + } else { + next_deadline_.store( + heap_.top().deadline.time_since_epoch().count(), + std::memory_order_release); + } +} +void ThreadFlowEventTimerManager::collect_ready_callbacks( + std::deque& pending, + const std::chrono::steady_clock::time_point& now) { while (true) { - if (stop_flag_) break; // Stop signal received. - - const auto now = std::chrono::steady_clock::now(); - - // Remove stale cancelled entries at the top of the heap. - while (!heap_.empty() && cancelled_.count(heap_.top().token)) { - cancelled_.erase(heap_.top().token); - heap_.pop(); - } - + refresh_next_deadline_locked(); bool processed_item = false; - // 1) Process the next expiry if it is due. if (!heap_.empty() && now >= heap_.top().deadline) { auto entry = heap_.top(); heap_.pop(); - // Remove entry from lookup maps if not already invalidated. auto id_it = by_id_.find(PacketKey{entry.flow, entry.packet_id}); if (id_it != by_id_.end() && id_it->second.token == entry.token) { by_id_.erase(id_it); } - // Remove only the token that matches this entry for the given flow. auto range = by_flow_.equal_range(entry.flow); - for (auto it = range.first; it != range.second; ) { + for (auto it = range.first; it != range.second;) { if (it->second == entry.token) { it = by_flow_.erase(it); break; @@ -243,47 +213,34 @@ void ThreadFlowEventTimerManager::timer_loop() { } } - // Ensure we only schedule snapshot mutation if the flow is still alive. if (auto alive = entry.flow_alive.lock(); alive && *alive && entry.flow) { if (TCPLOG_ENABLED(INFO)) { const auto packet_id_text = format_packet_drop_id(entry.packet_id); TCPLOG_INFO("[packet_expired] flow=%s packet_id=%s token=%" PRIu64, - flow_debug_details(entry.flow->flow_key()).c_str(), - packet_id_text.c_str(), - entry.token - ); + flow_debug_details(entry.flow->flow_key()).c_str(), + packet_id_text.c_str(), + entry.token); } - - // Schedule expiration callback for lock-free handling. - callbacks_.push_back(Callback{ - Callback::Kind::Expire, entry.packet_id, entry.flow, 0 - }); - pending_callbacks_.fetch_add(1, std::memory_order_release); + pending.push_back( + Callback{Callback::Kind::Expire, entry.packet_id, entry.flow, 0}); } processed_item = true; - } - - // 2) If no expiration was ready, service one queued event. - else if (!events_.empty()) { + } else if (!events_.empty()) { auto ev = events_.front(); events_.pop_front(); + queued_event_count_.store(events_.size(), std::memory_order_release); if (ev.kind == Event::Kind::Retransmit && ev.flow) { auto it = by_id_.find(PacketKey{ev.flow, ev.packet_id}); if (it != by_id_.end()) { const auto token = it->second.token; - - // Skip duplicate retransmit handling for the same flow/packet_id. const PacketKey key{ev.flow, ev.packet_id}; - if (std::find(retransmit_seen_.begin(), retransmit_seen_.end(), key) != retransmit_seen_.end()) { + const auto [_, inserted] = retransmit_seen_.insert(key); + if (!inserted) { processed_item = true; continue; } - retransmit_seen_.push_back(key); - - // If we've already cancelled this token (due to an earlier - // retransmit event), skip duplicate handling/logging. if (cancelled_.find(token) != cancelled_.end()) { processed_item = true; continue; @@ -293,88 +250,55 @@ void ThreadFlowEventTimerManager::timer_loop() { if (TCPLOG_ENABLED(INFO)) { const auto packet_id_text = format_packet_drop_id(ev.packet_id); - TCPLOG_INFO("[packet_retransmitted] flow=%s packet_id=%s seq=%" PRIu32, + TCPLOG_INFO( + "[drop_event] action=retransmitted flow=%s packet_id=%s seq=%" PRIu32, flow_debug_details(ev.flow->flow_key()).c_str(), packet_id_text.c_str(), - ev.seq - ); + ev.seq); } - callbacks_.push_back(Callback{ - Callback::Kind::Retransmit, ev.packet_id, it->second.flow, 0 - }); - pending_callbacks_.fetch_add(1, std::memory_order_release); + pending.push_back( + Callback{Callback::Kind::Retransmit, ev.packet_id, it->second.flow, 0}); } - } - else if (ev.kind == Event::Kind::Duplicate && ev.flow) { + } else if (ev.kind == Event::Kind::Duplicate && ev.flow) { if (TCPLOG_ENABLED(DEBUG)) { TCPLOG_DEBUG("[duplicate_detected] flow=%s seq=%" PRIu32 " payload=%u", - flow_debug_details(ev.flow->flow_key()).c_str(), - ev.seq, - ev.payload); + flow_debug_details(ev.flow->flow_key()).c_str(), + ev.seq, + ev.payload); } - - callbacks_.push_back(Callback{ - Callback::Kind::Duplicate, {}, ev.flow, ev.seq - }); - pending_callbacks_.fetch_add(1, std::memory_order_release); + pending.push_back(Callback{Callback::Kind::Duplicate, {}, ev.flow, ev.seq}); } processed_item = true; } - // 2.5) Run callbacks immediately if any were produced. - if (processed_item && !callbacks_.empty()) { - std::deque pending; - pending.swap(callbacks_); // Extract callbacks without copying. - - lock.unlock(); - run_callbacks(pending); // Execute snapshot mutations in lock-free mode. - lock.lock(); - - continue; // Re-evaluate loop state after callback execution. - } - - if (processed_item) continue; - - // 3) No action needed right now: sleep until the next expiry or event wake. - if (!heap_.empty() && timeout_sec_ > 0.0) { - cv_.wait_until(lock, heap_.top().deadline, [&] { - return stop_flag_ || !events_.empty(); - }); - } else { - cv_.wait(lock, [&] { - return stop_flag_ || !events_.empty() || - (!heap_.empty() && timeout_sec_ > 0.0); - }); + if (!processed_item) { + refresh_next_deadline_locked(); + return; } } } void ThreadFlowEventTimerManager::drain_callbacks() { - // Lock-free fast path. drain_callbacks() is called from every worker's - // before_poll() — i.e. potentially millions of times per second across - // busy-polling AF_XDP workers. Acquiring mutex_ on every call serialises - // the hot path on a single global lock; with many workers this becomes - // the dominant bottleneck. Skip the lock entirely when no callbacks - // are queued, which is the overwhelming common case. - if (pending_callbacks_.load(std::memory_order_acquire) == 0) { - return; + const auto now = std::chrono::steady_clock::now(); + if (queued_event_count_.load(std::memory_order_acquire) == 0) { + const auto next_deadline = next_deadline_.load(std::memory_order_acquire); + if (next_deadline == kNoDeadline || + now.time_since_epoch().count() < next_deadline) { + return; + } } std::deque pending; { std::lock_guard lock(mutex_); - pending.swap(callbacks_); - pending_callbacks_.store(0, std::memory_order_release); + if (!running_) { + return; + } + collect_ready_callbacks(pending, now); } run_callbacks(pending); } -void ThreadFlowEventTimerManager::set_snapshot_hook(std::function hook) { - snapshot_hook_ = std::move(hook); -} - } // namespace openpenny::penny diff --git a/tests/unit/flow/test_aggregate_duplicate_fallback.cpp b/tests/unit/flow/test_aggregate_duplicate_fallback.cpp new file mode 100644 index 0000000..2449e60 --- /dev/null +++ b/tests/unit/flow/test_aggregate_duplicate_fallback.cpp @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: BSD-2-Clause + +#include "openpenny/app/core/AggregatesController.h" +#include "openpenny/app/core/DropCollectorBinding.h" +#include "openpenny/app/core/PerThreadStats.h" +#include "openpenny/app/core/RuntimeSetup.h" +#include "openpenny/config/Config.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace { + +openpenny::FlowKey make_key() { + openpenny::FlowKey key{}; + key.src = 0x0a000011; + key.dst = 0x0a000012; + key.sport = 2222; + key.dport = 5201; + key.ip_proto = 6; + return key; +} + +openpenny::penny::PacketDropSnapshot make_duplicate_exceeded_snapshot() { + openpenny::penny::PacketDropSnapshot snap{}; + snap.timestamp = std::chrono::steady_clock::now(); + snap.state = openpenny::penny::SnapshotState::Expired; + for (int i = 0; i < 10; ++i) { + snap.stats.record_data_packet(); + snap.stats.record_droppable_packet(); + } + for (int i = 0; i < 2; ++i) { + snap.stats.record_duplicate_packet(); + } + snap.stats.record_drop(); + snap.stats.inc_non_retransmitted(); + return snap; +} + +} // namespace + +int main() { + openpenny::app::init_thread_counters(1); + openpenny::app::set_thread_counter_index(0); + + openpenny::Config cfg; + cfg.active.aggregates_enabled = true; + cfg.active.max_drops_aggregates = 1; + cfg.active.max_duplicate_fraction = 0.1; + cfg.active.retransmission_miss_probability = 0.0; + cfg.active.min_closed_loop_flows = 0; + + openpenny::PipelineOptions opts{}; + opts.mode = openpenny::PipelineOptions::Mode::Active; + + openpenny::set_runtime_setup(cfg, opts, false, false); + auto& runtime = openpenny::runtime_setup_mutable(); + runtime.aggregates_status = openpenny::RuntimeStatus::AggregatesStatus::PENDING; + runtime.aggregate_eval_counters = {}; + runtime.has_aggregate_eval = false; + runtime.aggregates_active = true; + + std::atomic stop_flag{false}; + auto collector = std::make_shared(1); + openpenny::AggregatesController controller( + cfg, + opts, + collector, + stop_flag, + std::function{}); + controller.start(); + + auto& counters = openpenny::app::current_thread_counters(); + counters.droppable_packets = 10; + counters.data_packets = 10; + counters.duplicate_packets = 2; + counters.dropped_packets = 1; + counters.non_retransmitted_packets = 1; + counters.pending_retransmissions = 0; + + openpenny::app::DropCollectorBinding::instance().upsert( + collector, + "worker-0", + 0, + make_key(), + openpenny::penny::make_packet_drop_id(2000, 100), + make_duplicate_exceeded_snapshot()); + + const auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(1); + while (runtime.aggregates_status == openpenny::RuntimeStatus::AggregatesStatus::PENDING && + std::chrono::steady_clock::now() < deadline) { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + + assert(runtime.aggregates_status == + openpenny::RuntimeStatus::AggregatesStatus::DUPLICATES_EXCEEDED); + assert(runtime.has_aggregate_eval); + assert(runtime.aggregate_eval_counters.data_packets == 10); + assert(runtime.aggregate_eval_counters.duplicate_packets == 2); + assert(!runtime.aggregates_active); + assert(!collector->accepting.load(std::memory_order_relaxed)); + assert(!controller.collector_completed()); + assert(!stop_flag.load(std::memory_order_relaxed)); + + stop_flag.store(true, std::memory_order_relaxed); + controller.join(); + return 0; +} diff --git a/tests/unit/flow/test_aggregate_freeze_at_drop_limit.cpp b/tests/unit/flow/test_aggregate_freeze_at_drop_limit.cpp new file mode 100644 index 0000000..f3e5e37 --- /dev/null +++ b/tests/unit/flow/test_aggregate_freeze_at_drop_limit.cpp @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: BSD-2-Clause + +#include "openpenny/app/core/AggregatesController.h" +#include "openpenny/app/core/DropCollectorBinding.h" +#include "openpenny/app/core/PerThreadStats.h" +#include "openpenny/app/core/RuntimeSetup.h" +#include "openpenny/config/Config.h" + +#include +#include +#include +#include +#include +#include + +namespace { + +openpenny::FlowKey make_key(std::uint16_t sport) { + openpenny::FlowKey key{}; + key.src = 0x0a000001; + key.dst = 0x0a000002; + key.sport = sport; + key.dport = 5201; + key.ip_proto = 6; + return key; +} + +openpenny::penny::PacketDropSnapshot make_pending_snapshot() { + openpenny::penny::PacketDropSnapshot snap{}; + snap.timestamp = std::chrono::steady_clock::now(); + snap.state = openpenny::penny::SnapshotState::Pending; + snap.stats.record_data_packet(); + snap.stats.record_droppable_packet(); + snap.stats.record_drop(); + snap.stats.inc_pending_retransmission(); + return snap; +} + +} // namespace + +int main() { + openpenny::app::init_thread_counters(1); + openpenny::app::set_thread_counter_index(0); + + openpenny::Config cfg; + cfg.active.aggregates_enabled = true; + cfg.active.max_drops_aggregates = 1; + cfg.active.max_duplicate_fraction = 1.0; + cfg.active.retransmission_miss_probability = 0.0; + + openpenny::PipelineOptions opts{}; + opts.mode = openpenny::PipelineOptions::Mode::Active; + + openpenny::set_runtime_setup(cfg, opts, false, false); + auto& runtime = openpenny::runtime_setup_mutable(); + runtime.aggregates_status = openpenny::RuntimeStatus::AggregatesStatus::PENDING; + runtime.aggregate_eval_counters = {}; + runtime.has_aggregate_eval = false; + runtime.aggregates_active = true; + + std::atomic stop_flag{false}; + auto collector = std::make_shared(1); + openpenny::AggregatesController controller( + cfg, + opts, + collector, + stop_flag, + std::function{}); + + auto& counters = openpenny::app::current_thread_counters(); + counters.droppable_packets = 10; + counters.data_packets = 10; + counters.dropped_packets = 1; + counters.pending_retransmissions = 1; + + const auto first_key = make_key(40001); + const auto first_id = openpenny::penny::make_packet_drop_id(1000, 100); + auto first_snapshot = make_pending_snapshot(); + openpenny::app::DropCollectorBinding::instance().upsert( + collector, + "worker-0", + 0, + first_key, + first_id, + first_snapshot); + + counters.droppable_packets = 100; + counters.data_packets = 100; + counters.dropped_packets = 2; + counters.pending_retransmissions = 2; + + const auto second_key = make_key(40002); + const auto second_id = openpenny::penny::make_packet_drop_id(2000, 100); + auto second_snapshot = make_pending_snapshot(); + openpenny::app::DropCollectorBinding::instance().upsert( + collector, + "worker-0", + 0, + second_key, + second_id, + second_snapshot); + + assert(collector->accepted_snapshot_count.load(std::memory_order_relaxed) == 1); + + counters.pending_retransmissions = 0; + counters.non_retransmitted_packets = 50; + first_snapshot.state = openpenny::penny::SnapshotState::Expired; + first_snapshot.stats.dec_pending_retransmission(); + first_snapshot.stats.inc_non_retransmitted(); + openpenny::app::DropCollectorBinding::instance().upsert( + collector, + "worker-0", + 0, + first_key, + first_id, + first_snapshot); + + openpenny::PipelineSummary summary; + controller.populate_drop_snapshots(summary); + assert(summary.drop_snapshots.size() == 1); + + controller.evaluate_pending_if_needed(cfg, summary); + + assert(runtime.aggregates_status == + openpenny::RuntimeStatus::AggregatesStatus::NON_CLOSED_LOOP); + assert(runtime.has_aggregate_eval); + assert(runtime.aggregate_eval_counters.data_packets == 10); + assert(runtime.aggregate_eval_counters.duplicate_packets == 0); + assert(runtime.aggregate_eval_counters.retransmitted_packets == 0); + assert(runtime.aggregate_eval_counters.non_retransmitted_packets == 1); + + return 0; +} diff --git a/tests/unit/flow/test_aggregate_pending_resolution.cpp b/tests/unit/flow/test_aggregate_pending_resolution.cpp new file mode 100644 index 0000000..e676e7e --- /dev/null +++ b/tests/unit/flow/test_aggregate_pending_resolution.cpp @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: BSD-2-Clause + +#include "openpenny/app/core/AggregatesController.h" +#include "openpenny/app/core/PerThreadStats.h" +#include "openpenny/app/core/RuntimeSetup.h" +#include "openpenny/config/Config.h" + +#include +#include +#include + +namespace { + +openpenny::DropSnapshotRecord make_expired_snapshot_record() { + openpenny::DropSnapshotRecord record{}; + record.key.src = 0x0a000001; + record.key.dst = 0x0a000002; + record.key.sport = 1111; + record.key.dport = 5201; + record.key.ip_proto = 6; + record.packet_id = openpenny::penny::make_packet_drop_id(1000, 100); + record.snapshot.timestamp = std::chrono::steady_clock::now(); + record.snapshot.state = openpenny::penny::SnapshotState::Expired; + for (int i = 0; i < 5; ++i) { + record.snapshot.stats.record_data_packet(); + record.snapshot.stats.record_droppable_packet(); + } + record.snapshot.stats.record_drop(); + record.snapshot.stats.inc_non_retransmitted(); + return record; +} + +openpenny::DropSnapshotRecord make_invalid_snapshot_record() { + auto record = make_expired_snapshot_record(); + record.snapshot.state = openpenny::penny::SnapshotState::Invalid; + record.snapshot.stats = {}; + for (int i = 0; i < 5; ++i) { + record.snapshot.stats.record_data_packet(); + record.snapshot.stats.record_droppable_packet(); + } + record.snapshot.stats.record_drop(); + return record; +} + +openpenny::DropSnapshotRecord make_duplicate_exceeded_snapshot_record() { + openpenny::DropSnapshotRecord record{}; + record.key.src = 0x0a000011; + record.key.dst = 0x0a000012; + record.key.sport = 2222; + record.key.dport = 5201; + record.key.ip_proto = 6; + record.packet_id = openpenny::penny::make_packet_drop_id(2000, 100); + record.snapshot.timestamp = std::chrono::steady_clock::now(); + record.snapshot.state = openpenny::penny::SnapshotState::Expired; + for (int i = 0; i < 10; ++i) { + record.snapshot.stats.record_data_packet(); + record.snapshot.stats.record_droppable_packet(); + } + for (int i = 0; i < 2; ++i) { + record.snapshot.stats.record_duplicate_packet(); + } + record.snapshot.stats.record_drop(); + record.snapshot.stats.inc_non_retransmitted(); + return record; +} + +} // namespace + +int main() { + openpenny::app::init_thread_counters(1); + openpenny::app::set_thread_counter_index(0); + + openpenny::Config cfg; + cfg.active.aggregates_enabled = true; + cfg.active.max_drops_aggregates = 1; + cfg.active.max_duplicate_fraction = 1.0; + cfg.active.retransmission_miss_probability = 0.0; + + openpenny::PipelineOptions opts{}; + opts.mode = openpenny::PipelineOptions::Mode::Active; + + openpenny::set_runtime_setup(cfg, opts, false, false); + auto& runtime = openpenny::runtime_setup_mutable(); + openpenny::set_current_aggregates_status( + openpenny::RuntimeStatus::AggregatesStatus::PENDING); + runtime.aggregate_eval_counters = {}; + openpenny::set_current_has_aggregate_eval(false); + openpenny::set_current_aggregates_active(true); + + std::atomic stop_flag{false}; + auto collector = std::make_shared(1); + openpenny::AggregatesController controller( + cfg, + opts, + collector, + stop_flag, + std::function{}); + + openpenny::PipelineSummary summary; + summary.drop_snapshots.push_back(make_expired_snapshot_record()); + + controller.evaluate_pending_if_needed(cfg, summary); + + assert(runtime.aggregates_status == + openpenny::RuntimeStatus::AggregatesStatus::NON_CLOSED_LOOP); + assert(runtime.has_aggregate_eval); + assert(controller.aggregates_ready()); + assert(controller.collector_completed()); + + openpenny::set_current_aggregates_status( + openpenny::RuntimeStatus::AggregatesStatus::PENDING); + runtime.aggregate_eval_counters = {}; + openpenny::set_current_has_aggregate_eval(false); + openpenny::set_current_aggregates_active(true); + + openpenny::PipelineSummary invalid_summary; + invalid_summary.drop_snapshots.push_back(make_invalid_snapshot_record()); + + controller.evaluate_pending_if_needed(cfg, invalid_summary); + + assert(runtime.aggregates_status == + openpenny::RuntimeStatus::AggregatesStatus::NON_CLOSED_LOOP); + assert(runtime.has_aggregate_eval); + + openpenny::set_current_aggregates_status( + openpenny::RuntimeStatus::AggregatesStatus::PENDING); + runtime.aggregate_eval_counters = {}; + openpenny::set_current_has_aggregate_eval(false); + openpenny::set_current_aggregates_active(true); + cfg.active.max_duplicate_fraction = 0.1; + + openpenny::PipelineSummary duplicate_summary; + duplicate_summary.drop_snapshots.push_back(make_duplicate_exceeded_snapshot_record()); + + controller.evaluate_pending_if_needed(cfg, duplicate_summary); + + assert(runtime.aggregates_status == + openpenny::RuntimeStatus::AggregatesStatus::DUPLICATES_EXCEEDED); + assert(runtime.has_aggregate_eval); + assert(runtime.aggregate_eval_counters.data_packets == 10); + assert(runtime.aggregate_eval_counters.duplicate_packets == 2); + + return 0; +} diff --git a/tests/unit/flow/test_drop_snapshot_updates.cpp b/tests/unit/flow/test_drop_snapshot_updates.cpp index c27d27c..0e7131d 100644 --- a/tests/unit/flow/test_drop_snapshot_updates.cpp +++ b/tests/unit/flow/test_drop_snapshot_updates.cpp @@ -22,9 +22,10 @@ // // Synchronization caveat: // `FlowEngine::register_filled_gaps()` enqueues a Retransmit event on -// the global `ThreadFlowEventTimerManager`; the actual mutation happens -// on the timer thread. The test polls until the mutation is observed -// so we don't race against the background thread. +// the thread-local `ThreadFlowEventTimerManager`; the actual mutation +// is applied when the worker drains callbacks. The test polls and +// drives that drain explicitly so the assertions observe the updated +// snapshots deterministically. #include "openpenny/config/Config.h" #include "openpenny/penny/flow/engine/FlowEngine.h" @@ -40,15 +41,16 @@ using namespace std::chrono; namespace { // Wait up to `timeout` for `predicate()` to become true. Used to -// synchronise the test thread with the FlowEngine timer thread, which -// processes Retransmit events asynchronously. +// synchronise the test thread with the cooperative timer manager. template bool wait_for(Predicate predicate, milliseconds timeout = milliseconds{2000}) { const auto deadline = steady_clock::now() + timeout; while (steady_clock::now() < deadline) { + openpenny::penny::ThreadFlowEventTimerManager::instance().drain_callbacks(); if (predicate()) return true; std::this_thread::sleep_for(milliseconds{5}); } + openpenny::penny::ThreadFlowEventTimerManager::instance().drain_callbacks(); return predicate(); } @@ -63,11 +65,11 @@ int main() { // is deterministic regardless of the random number generator state. cfg.active.drop_probability = 1.0; // Long retransmission timeout. With `now = steady_clock::now()`, the - // deadline = `now + 60s` lies far in the future so the timer-manager - // background thread's expiry path never runs during this test — - // only the explicit `register_filled_gaps()` events do. (The test's - // assertions break if `mark_snapshot_expired` runs concurrently and - // decrements pending on entries we haven't filled yet.) + // deadline = `now + 60s` lies far in the future so the cooperative + // expiry path never runs during this test — only the explicit + // `register_filled_gaps()` events do. (The test's assertions break + // if `mark_snapshot_expired` also runs and decrements pending on + // entries we haven't filled yet.) cfg.active.rtt_timeout_factor = 60.0; openpenny::penny::FlowEngine flow(cfg.active); @@ -120,7 +122,7 @@ int main() { // Phase 2: drop1 is retransmitted (gap filled by a later packet) // ---------------------------------------------------------------- // register_filled_gaps() queues a Retransmit event on the timer - // manager. The timer thread picks it up and calls + // manager. drain_callbacks() then applies // mark_snapshot_retransmitted on this thread's FlowEngine, which: // - decrements flow_stats_.pending_retransmissions by 1, // - increments flow_stats_.retransmitted_packets by 1, @@ -130,8 +132,7 @@ int main() { // decrementing its frozen pending count. flow.register_filled_gaps(std::vector{drop1_id}); - // Wait for the timer thread to process the event before asserting. - // Without this, the assertions race against the background thread. + // Drain the queued retransmit event before asserting. assert(wait_for([&] { return flow.retransmitted_packets() == 1; })); // Phase 2 verification: flow-wide counters diff --git a/tests/unit/flow/test_drop_timer.cpp b/tests/unit/flow/test_drop_timer.cpp index df9f6b6..e7b98b0 100644 --- a/tests/unit/flow/test_drop_timer.cpp +++ b/tests/unit/flow/test_drop_timer.cpp @@ -4,6 +4,7 @@ #include "openpenny/penny/flow/timer/ThreadFlowEventTimer.h" #include "openpenny/penny/flow/state/PennySnapshot.h" #include "openpenny/penny/flow/engine/FlowEngine.h" +#include "openpenny/app/core/PerThreadStats.h" #include "openpenny/net/Packet.h" #include @@ -84,6 +85,40 @@ int main() { assert(flow.non_retransmitted_packets() == 0); } + // Timer callbacks must publish into the same per-thread counter shard as the + // worker that owns the flow; otherwise multi-queue aggregate pending_rtx can + // stay stuck forever. + openpenny::penny::ThreadFlowEventTimerManager::instance().stop(); + openpenny::app::init_thread_counters(2); + openpenny::app::set_thread_counter_index(1); + { + openpenny::Config cfg; + cfg.active.drop_probability = 1.0; + cfg.active.rtt_timeout_factor = 0.05; + + openpenny::penny::FlowEngine flow(cfg.active); + openpenny::FlowKey key{}; + const auto now = std::chrono::steady_clock::now(); + const auto packet_id = openpenny::penny::make_packet_drop_id(3000, 100); + + flow.record_data(3000, now); + const bool dropped = flow.drop_packet(3000, 3100, packet_id, key, now); + assert(dropped); + assert(openpenny::app::aggregate_counters().pending_retransmissions == 1); + + sleep_for_ms(80); + openpenny::penny::ThreadFlowEventTimerManager::instance().drain_callbacks(); + + const auto counters = openpenny::app::thread_counters(); + assert(counters.size() >= 2); + assert(counters[0].pending_retransmissions == 0); + assert(counters[0].non_retransmitted_packets == 0); + assert(counters[1].pending_retransmissions == 0); + assert(counters[1].non_retransmitted_packets == 1); + assert(openpenny::app::aggregate_counters().pending_retransmissions == 0); + assert(openpenny::app::aggregate_counters().non_retransmitted_packets == 1); + } + // Clean shutdown for other tests. openpenny::penny::ThreadFlowEventTimerManager::instance().stop(); return 0; diff --git a/tests/unit/flow/test_flow_evaluation_phase_gate.cpp b/tests/unit/flow/test_flow_evaluation_phase_gate.cpp new file mode 100644 index 0000000..8259dd3 --- /dev/null +++ b/tests/unit/flow/test_flow_evaluation_phase_gate.cpp @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: BSD-2-Clause + +#include "openpenny/app/core/PerThreadStats.h" +#include "openpenny/app/core/RuntimeSetup.h" +#include "openpenny/config/Config.h" +#include "openpenny/penny/flow/engine/FlowEngine.h" + +#include + +int main() { + openpenny::app::init_thread_counters(1); + openpenny::app::set_thread_counter_index(0); + + openpenny::Config cfg; + cfg.active.aggregates_enabled = true; + cfg.active.max_drops_aggregates = 1; + cfg.active.max_duplicate_fraction = 0.5; + + openpenny::PipelineOptions opts{}; + opts.mode = openpenny::PipelineOptions::Mode::Active; + + openpenny::set_runtime_setup(cfg, opts, false, false); + openpenny::set_current_aggregates_status( + openpenny::RuntimeStatus::AggregatesStatus::PENDING); + openpenny::set_current_aggregates_active(true); + + openpenny::penny::FlowEngine flow(cfg.active); + flow.record_data_packet(); + flow.record_duplicate_packet(); + + flow.evaluate_if_ready(); + assert(flow.final_decision() == + openpenny::penny::FlowEngine::FlowDecision::PENDING); + + openpenny::set_current_aggregates_status( + openpenny::RuntimeStatus::AggregatesStatus::CLOSED_LOOP); + openpenny::set_current_aggregates_active(false); + + flow.evaluate_if_ready(); + assert(flow.final_decision() == + openpenny::penny::FlowEngine::FlowDecision::PENDING); + + openpenny::set_current_aggregates_status( + openpenny::RuntimeStatus::AggregatesStatus::NON_CLOSED_LOOP); + openpenny::set_current_aggregates_active(false); + + flow.evaluate_if_ready(); + assert(flow.final_decision() == + openpenny::penny::FlowEngine::FlowDecision::FINISHED_DUPLICATE_EXCEEDED); + + return 0; +} diff --git a/tests/unit/flow/test_gap_management.cpp b/tests/unit/flow/test_gap_management.cpp index cafa2e9..2a9d31a 100644 --- a/tests/unit/flow/test_gap_management.cpp +++ b/tests/unit/flow/test_gap_management.cpp @@ -29,7 +29,7 @@ int main() { cfg.active.rtt_timeout_factor = 3.0; openpenny::penny::ThreadFlowManager table(cfg.active); - openpenny::FlowKey flow{10, 20, 1111, 2222}; + openpenny::FlowKey flow{10, 20, 1111, 2222, 6}; auto now = steady_clock::time_point{}; // Register a gap representing a dropped packet. diff --git a/tests/unit/flow/test_initial_flow_monitoring.cpp b/tests/unit/flow/test_initial_flow_monitoring.cpp index 7cbbe0a..346d964 100644 --- a/tests/unit/flow/test_initial_flow_monitoring.cpp +++ b/tests/unit/flow/test_initial_flow_monitoring.cpp @@ -24,7 +24,7 @@ namespace net = openpenny::net; auto now = steady_clock::time_point{}; // Case 1: Flow starts with SYN. - openpenny::FlowKey flow_syn{1, 2, 1000, 2000}; + openpenny::FlowKey flow_syn{1, 2, 1000, 2000, 6}; net::PacketView syn_pkt{}; syn_pkt.flow = flow_syn; syn_pkt.tcp.seq = 100; @@ -47,7 +47,7 @@ namespace net = openpenny::net; auto& syn_entry_data = *syn_entry_data_ptr; // Case 2: Flow starts with data (no SYN yet). - openpenny::FlowKey flow_data{3, 4, 3000, 4000}; + openpenny::FlowKey flow_data{3, 4, 3000, 4000, 6}; auto t0 = steady_clock::time_point{}; net::PacketView data_pkt0{}; data_pkt0.flow = flow_data; @@ -81,7 +81,7 @@ namespace net = openpenny::net; assert(data_entry3.flow.highest_sequence() == 60); // Case 3: Flow receives SYN after data-first start. - openpenny::FlowKey flow_data_then_syn{5, 6, 1234, 4321}; + openpenny::FlowKey flow_data_then_syn{5, 6, 1234, 4321, 6}; auto td0 = steady_clock::time_point{}; net::PacketView first_data_pkt{}; first_data_pkt.flow = flow_data_then_syn; diff --git a/tests/unit/flow/test_terminal_snapshot_resolution.cpp b/tests/unit/flow/test_terminal_snapshot_resolution.cpp new file mode 100644 index 0000000..3a70eff --- /dev/null +++ b/tests/unit/flow/test_terminal_snapshot_resolution.cpp @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: BSD-2-Clause + +#include "openpenny/config/Config.h" +#include "openpenny/app/core/PerThreadStats.h" +#include "openpenny/penny/flow/engine/FlowEngine.h" +#include "openpenny/penny/flow/timer/ThreadFlowEventTimer.h" + +#include +#include +#include +#include + +namespace { + +openpenny::FlowKey make_flow_key(std::uint16_t sport) { + openpenny::FlowKey key{}; + key.src = 0x0a000001; + key.dst = 0x0a000002; + key.sport = sport; + key.dport = 5201; + key.ip_proto = 6; + return key; +} + +} // namespace + +int main() { + using Clock = std::chrono::steady_clock; + + openpenny::penny::ThreadFlowEventTimerManager::instance().stop(); + openpenny::app::init_thread_counters(1); + openpenny::app::set_thread_counter_index(0); + + openpenny::Config cfg; + cfg.active.drop_probability = 1.0; + cfg.active.rtt_timeout_factor = 60.0; + + { + openpenny::penny::FlowEngine flow(cfg.active); + std::vector observed_states; + flow.set_flow_key(make_flow_key(1111)); + flow.set_drop_sink([&observed_states](const openpenny::FlowKey&, + openpenny::penny::PacketDropId, + const openpenny::penny::PacketDropSnapshot& snapshot) { + observed_states.push_back(snapshot.state); + }); + const auto drop_time = Clock::now(); + const auto key = make_flow_key(1111); + const auto packet_id = openpenny::penny::make_packet_drop_id(1000, 100); + + flow.record_data(1000, drop_time); + assert(flow.drop_packet(1000, 1100, packet_id, key, drop_time)); + assert(openpenny::app::aggregate_counters().pending_retransmissions == 1); + + // Generic teardown before the timeout should NOT mark the drop expired. + flow.resolve_pending_snapshots(drop_time + std::chrono::seconds(1)); + + assert(flow.pending_retransmissions() == 0); + assert(flow.non_retransmitted_packets() == 0); + assert(openpenny::app::aggregate_counters().pending_retransmissions == 0); + assert(flow.drop_snapshots().size() == 1); + assert(flow.drop_snapshots().front().second.state == + openpenny::penny::SnapshotState::Invalid); + assert(observed_states.size() == 2); + assert(observed_states.front() == openpenny::penny::SnapshotState::Pending); + assert(observed_states.back() == openpenny::penny::SnapshotState::Invalid); + } + + { + openpenny::penny::FlowEngine flow(cfg.active); + flow.set_flow_key(make_flow_key(1112)); + const auto drop_time = Clock::now(); + const auto key = make_flow_key(1112); + const auto packet_id = openpenny::penny::make_packet_drop_id(1500, 100); + + flow.record_data(1500, drop_time); + assert(flow.drop_packet(1500, 1600, packet_id, key, drop_time)); + + // FIN semantics are immediate: outstanding drops become non-retransmitted. + flow.mark_snapshot_expired(packet_id); + + assert(flow.pending_retransmissions() == 0); + assert(flow.non_retransmitted_packets() == 1); + assert(flow.drop_snapshots().size() == 1); + assert(flow.drop_snapshots().front().second.state == + openpenny::penny::SnapshotState::Expired); + } + + { + openpenny::penny::FlowEngine flow(cfg.active); + std::vector observed_states; + flow.set_flow_key(make_flow_key(1113)); + flow.set_drop_sink([&observed_states](const openpenny::FlowKey&, + openpenny::penny::PacketDropId, + const openpenny::penny::PacketDropSnapshot& snapshot) { + observed_states.push_back(snapshot.state); + }); + const auto drop_time = Clock::now(); + const auto key = make_flow_key(1113); + const auto packet_id = openpenny::penny::make_packet_drop_id(2000, 100); + + flow.record_data(2000, drop_time); + assert(flow.drop_packet(2000, 2100, packet_id, key, drop_time)); + assert(openpenny::app::aggregate_counters().pending_retransmissions == 1); + + // Once the timeout has elapsed, teardown should promote to Expired. + flow.resolve_pending_snapshots(drop_time + std::chrono::seconds(61)); + + assert(flow.pending_retransmissions() == 0); + assert(flow.non_retransmitted_packets() == 1); + assert(openpenny::app::aggregate_counters().pending_retransmissions == 0); + assert(flow.drop_snapshots().size() == 1); + assert(flow.drop_snapshots().front().second.state == + openpenny::penny::SnapshotState::Expired); + assert(observed_states.size() == 2); + assert(observed_states.front() == openpenny::penny::SnapshotState::Pending); + assert(observed_states.back() == openpenny::penny::SnapshotState::Expired); + } + + openpenny::penny::ThreadFlowEventTimerManager::instance().stop(); + return 0; +} diff --git a/tests/unit/net/test_packet_parser.cpp b/tests/unit/net/test_packet_parser.cpp index a74be2f..343186d 100644 --- a/tests/unit/net/test_packet_parser.cpp +++ b/tests/unit/net/test_packet_parser.cpp @@ -73,6 +73,7 @@ void assert_decodes(const std::vector& frame) { assert(packet.flow.dst == 0xc0a82902u); assert(packet.flow.sport == 40000); assert(packet.flow.dport == 5201); + assert(packet.flow.ip_proto == 6); assert(packet.ip_proto == 6); } diff --git a/tests/unit/net/test_traffic_match.cpp b/tests/unit/net/test_traffic_match.cpp index f8b52dd..4ed40c4 100644 --- a/tests/unit/net/test_traffic_match.cpp +++ b/tests/unit/net/test_traffic_match.cpp @@ -17,6 +17,7 @@ int main() { matching.dst = 0xc0000201u; matching.sport = 12345; matching.dport = 443; + matching.ip_proto = 6; openpenny::FlowKey non_matching = matching; non_matching.src = 0x0a020203u; @@ -57,12 +58,18 @@ int main() { cfg.rules.clear(); cfg.rules.push_back(tcp_https); + assert(openpenny::net::traffic_matches_flow(cfg, matching)); + auto wrong_proto = matching; + wrong_proto.ip_proto = 17; + assert(!openpenny::net::traffic_matches_flow(cfg, wrong_proto)); + openpenny::net::PacketView packet{}; packet.flow = matching; packet.ip_proto = 6; assert(openpenny::net::traffic_matches_packet(cfg, packet)); packet.ip_proto = 17; + packet.flow.ip_proto = 17; assert(!openpenny::net::traffic_matches_packet(cfg, packet)); cfg.default_action = openpenny::net::TrafficRuleAction::RedirectToUserspace;