From caef9fbdec279e99c0689e4fad80a4d41179f083 Mon Sep 17 00:00:00 2001 From: AmbitiousK <1724260766@qq.com> Date: Tue, 12 May 2026 14:00:03 -0400 Subject: [PATCH 1/4] Add Highway overlapping community detection algorithm --- cdlib/algorithms/internal/Highway.py | 800 ++++++++++++++++++ cdlib/algorithms/overlapping_partition.py | 113 +++ .../cd_algorithms/node_clustering.rst | 1 + 3 files changed, 914 insertions(+) create mode 100644 cdlib/algorithms/internal/Highway.py diff --git a/cdlib/algorithms/internal/Highway.py b/cdlib/algorithms/internal/Highway.py new file mode 100644 index 0000000..3d3758e --- /dev/null +++ b/cdlib/algorithms/internal/Highway.py @@ -0,0 +1,800 @@ +""" +Pure-Python reference implementation of the Highway overlapping community +detection algorithm for CDlib. + +This module is intended to live at: + + cdlib/algorithms/internal/Highway.py + +The public CDlib wrapper should be added in: + + cdlib/algorithms/overlapping_partition.py + +and should call: + + highway_nx(G, ...) + +The implementation below keeps only algorithmic logic. It intentionally removes +the C++ command-line interface, file I/O, tracing CSV writers, manifest writers, +timers, and filesystem dependencies. + +The output format follows CDlib internal implementations such as SLPA_nx.py: +a list of communities, where each community is a list of original NetworkX nodes. +""" + +from __future__ import annotations + +from collections import defaultdict +from dataclasses import dataclass +from math import exp, log, sqrt +from typing import Dict, Hashable, Iterable, List, MutableMapping, Sequence, Tuple + +import networkx as nx + + +Node = Hashable +MembershipRow = List[Tuple[int, float]] + + +@dataclass +class HighwayBuildConfig: + """Configuration for sparse Highway backbone construction.""" + + top_r: int = 3 + mod_jaccard_alpha: float = 0.70 + ensure_min1_per_node: bool = True + symmetrize: bool = True + + +@dataclass +class PropConfig: + """Configuration for anchor-membership propagation.""" + + top_r: int = 3 + T: int = 10 + damping: float = 0.90 + eta_leak: float = 0.0 + tau: float = 0.85 + eps: float = 1e-12 + + +@dataclass +class LocalRefineConfig: + """ + Lightweight Python version of the anchor-preserving pattern refinement. + + This is intentionally conservative for CDlib: it keeps the same high-level + idea as the C++ implementation but avoids trace files and implementation + details that are mainly for experiments. + """ + + enable_pattern_refinement: bool = True + confidence_self_fraction_weight: float = 0.50 + confidence_low_entropy_weight: float = 0.50 + pattern_confidence_floor: float = 0.05 + pattern_confidence_ceiling: float = 0.95 + update_strength: float = 0.45 + node_mode_power: float = 2.00 + pattern_target_mix: float = 0.65 + target_sharpen_gamma: float = 1.00 + min_abs_mass_to_keep: float = 1e-12 + renormalize: bool = True + + +def _safe_int(value: int, lo: int = 1) -> int: + return max(lo, int(value)) + + +def _clamp(value: float, lo: float, hi: float) -> float: + if lo > hi: + lo, hi = hi, lo + return max(lo, min(hi, float(value))) + + +def _normalize(dist: MutableMapping[int, float]) -> Dict[int, float]: + total = sum(v for v in dist.values() if v > 0.0) + if total <= 0.0: + return {} + return {k: float(v) / total for k, v in dist.items() if v > 0.0} + + +def _sharpen(dist: MutableMapping[int, float], gamma: float) -> Dict[int, float]: + gamma = max(1.0, float(gamma)) + if not dist: + return {} + if abs(gamma - 1.0) <= 1e-12: + return _normalize(dist) + return _normalize({k: max(0.0, v) ** gamma for k, v in dist.items()}) + + +def _topk_items(dist: MutableMapping[int, float], k: int) -> MembershipRow: + k = _safe_int(k) + items = [(cid, float(val)) for cid, val in dist.items() if cid >= 0 and val > 0.0] + items.sort(key=lambda x: (-x[1], x[0])) + return items[:k] + + +def _stable_softmax(items: MembershipRow, tau: float, eps: float = 1e-12) -> MembershipRow: + if not items: + return [] + + tau = max(float(tau), eps) + max_value = max(v for _, v in items) + + exps = [] + total = 0.0 + for cid, value in items: + ev = exp((value - max_value) / tau) + exps.append((cid, ev)) + total += ev + + if total <= eps: + mass = 1.0 / len(items) + return [(cid, mass) for cid, _ in items] + + return [(cid, ev / total) for cid, ev in exps] + + +def _relabel_graph_to_contiguous( + g: nx.Graph, +) -> Tuple[List[Node], Dict[Node, int], List[List[int]]]: + """ + Convert a NetworkX graph with arbitrary node labels into contiguous ids. + + Returns: + nodes: original labels by contiguous id + node_to_id: original label -> contiguous id + adj: sorted adjacency lists using contiguous ids + """ + + if g.is_directed(): + g = g.to_undirected() + + nodes = list(g.nodes()) + node_to_id = {node: i for i, node in enumerate(nodes)} + n = len(nodes) + + adj_sets = [set() for _ in range(n)] + for u, v in g.edges(): + if u == v: + continue + if u not in node_to_id or v not in node_to_id: + continue + a = node_to_id[u] + b = node_to_id[v] + if a == b: + continue + adj_sets[a].add(b) + adj_sets[b].add(a) + + adj = [sorted(neigh) for neigh in adj_sets] + return nodes, node_to_id, adj + + +def _degrees(adj: Sequence[Sequence[int]]) -> List[int]: + return [len(neigh) for neigh in adj] + + +def _intersection_size_sorted(a: Sequence[int], b: Sequence[int]) -> int: + i = 0 + j = 0 + count = 0 + + while i < len(a) and j < len(b): + if a[i] == b[j]: + count += 1 + i += 1 + j += 1 + elif a[i] < b[j]: + i += 1 + else: + j += 1 + + return count + + +def _jaccard_score(adj: Sequence[Sequence[int]], deg: Sequence[int], u: int, v: int) -> float: + du = max(0, deg[u]) + dv = max(0, deg[v]) + if du == 0 and dv == 0: + return 0.0 + + inter = _intersection_size_sorted(adj[u], adj[v]) + union = du + dv - inter + if union <= 0: + return 0.0 + + return float(inter) / float(union) + + +def _build_highway_adjacency( + full_adj: Sequence[Sequence[int]], + cfg: HighwayBuildConfig, +) -> List[List[int]]: + """ + Build sparse Highway backbone using the mod-jaccard hybrid score. + + For each source node v, retain the top-r neighbors according to: + + score(u, v) = alpha * s_mod(u, v) + (1 - alpha) * s_jac(u, v) + + where s_mod(u, v) = 1 - deg(u) deg(v) / (2m) for observed edges. + """ + + n = len(full_adj) + if n == 0: + return [] + + deg = _degrees(full_adj) + m_undirected = max(1.0, 0.5 * sum(deg)) + r = _safe_int(cfg.top_r) + alpha = _clamp(cfg.mod_jaccard_alpha, 0.0, 1.0) + + picked: List[List[int]] = [[] for _ in range(n)] + + for v in range(n): + neigh = list(full_adj[v]) + if not neigh: + continue + + scored = [] + dv = float(max(0, deg[v])) + for u in neigh: + du = float(max(0, deg[u])) + modularity_score = 1.0 - (du * dv) / (2.0 * m_undirected) + jaccard_score = _jaccard_score(full_adj, deg, u, v) + hybrid_score = alpha * modularity_score + (1.0 - alpha) * jaccard_score + scored.append((u, hybrid_score)) + + scored.sort(key=lambda x: (-x[1], x[0])) + chosen = [u for u, _ in scored[: min(r, len(scored))]] + + if cfg.ensure_min1_per_node and not chosen and scored: + chosen = [scored[0][0]] + + picked[v] = chosen + + if not cfg.symmetrize: + return [sorted(set(neigh)) for neigh in picked] + + undirected_edges = set() + for v, neigh in enumerate(picked): + for u in neigh: + if u < 0 or u >= n or u == v: + continue + a = min(u, v) + b = max(u, v) + undirected_edges.add((a, b)) + + h_sets = [set() for _ in range(n)] + for a, b in undirected_edges: + h_sets[a].add(b) + h_sets[b].add(a) + + return [sorted(neigh) for neigh in h_sets] + + +def _select_anchors_greedy_dedup( + full_adj: Sequence[Sequence[int]], + max_anchors: int, +) -> List[int]: + """ + Greedy degree-cover anchor selection. + + Nodes are sorted by degree descending. A node becomes an anchor only if it + has not already been covered by a previously selected anchor or one of that + anchor's full-graph neighbors. + """ + + n = len(full_adj) + if n <= 0: + return [] + + max_anchors = _safe_int(max_anchors) + deg = _degrees(full_adj) + order = list(range(n)) + order.sort(key=lambda u: (-deg[u], u)) + + covered = [False] * n + anchors: List[int] = [] + + for u in order: + if len(anchors) >= max_anchors: + break + if covered[u]: + continue + + anchors.append(u) + covered[u] = True + + for v in full_adj[u]: + if 0 <= v < n: + covered[v] = True + + if not anchors: + anchors.append(order[0]) + + return anchors + + +def _initial_state(n: int, r: int, anchors: Sequence[int]) -> List[MembershipRow]: + r = _safe_int(r) + state: List[MembershipRow] = [[] for _ in range(n)] + + for cid, v in enumerate(anchors): + if 0 <= v < n: + state[v] = [(cid, 1.0)] + + return state + + +def _propagate_anchor_memberships( + highway_adj: Sequence[Sequence[int]], + full_adj: Sequence[Sequence[int]], + anchors: Sequence[int], + cfg: PropConfig, +) -> List[MembershipRow]: + """ + Neighbor-only top-r anchor-membership propagation. + + A node is updated from its neighbors' previous anchor memberships. The node's + own previous state is not directly reinforced during the main aggregation. + """ + + n = len(highway_adj) + r = _safe_int(cfg.top_r) + k = len(anchors) + + state = _initial_state(n, r, anchors) + if n <= 0 or k <= 0: + return state + + deg_h = [max(1, d) for d in _degrees(highway_adj)] + deg_f = [max(1, d) for d in _degrees(full_adj)] + + T = max(0, int(cfg.T)) + damping = _clamp(cfg.damping, 0.0, 1.0) + eta_leak = max(0.0, float(cfg.eta_leak)) + + for _ in range(T): + next_state: List[MembershipRow] = [[] for _ in range(n)] + + for v in range(n): + acc: Dict[int, float] = defaultdict(float) + + for u in highway_adj[v]: + if u < 0 or u >= n: + continue + weight = 1.0 / sqrt(float(deg_h[u] * deg_h[v])) + + for cid, prob in state[u]: + if cid >= 0 and prob > 0.0: + acc[cid] += prob * weight + + if eta_leak > 0.0: + for u in full_adj[v]: + if u < 0 or u >= n: + continue + weight = eta_leak / sqrt(float(deg_f[u] * deg_f[v])) + + for cid, prob in state[u]: + if cid >= 0 and prob > 0.0: + acc[cid] += prob * weight + + if not acc: + next_state[v] = list(state[v]) + continue + + top = _topk_items(acc, min(r, k)) + next_state[v] = _stable_softmax(top, cfg.tau, cfg.eps) + + if damping < 1.0: + mixed_state: List[MembershipRow] = [[] for _ in range(n)] + for v in range(n): + mixed: Dict[int, float] = defaultdict(float) + + for cid, prob in next_state[v]: + mixed[cid] += damping * prob + + for cid, prob in state[v]: + mixed[cid] += (1.0 - damping) * prob + + top = _topk_items(mixed, min(r, k)) + mixed_state[v] = _normalize(dict(top)) + mixed_state[v] = _topk_items(mixed_state[v], min(r, k)) + + state = mixed_state + else: + state = next_state + + return state + + +def _support_key(row: MembershipRow) -> Tuple[int, ...]: + ids = sorted({cid for cid, prob in row if cid >= 0 and prob > 0.0}) + return tuple(ids) + + +def _normalized_entropy(counts: MutableMapping[int, int]) -> float: + positive = [v for v in counts.values() if v > 0] + total = float(sum(positive)) + k = len(positive) + + if total <= 0.0 or k <= 1: + return 0.0 + + h = 0.0 + for count in positive: + p = float(count) / total + h -= p * log(p + 1e-12) + + return h / log(float(k)) + + +def _same_pattern_neighbor_ratio( + adj: Sequence[Sequence[int]], + node_pid: Sequence[int], + v: int, +) -> float: + if v < 0 or v >= len(adj) or v >= len(node_pid): + return 0.0 + + pid = node_pid[v] + if pid < 0: + return 0.0 + + total = 0 + same = 0 + + for u in adj[v]: + if u < 0 or u >= len(node_pid): + continue + qid = node_pid[u] + if qid < 0: + continue + total += 1 + if qid == pid: + same += 1 + + if total <= 0: + return 0.0 + + return float(same) / float(total) + + +def _neighbor_anchor_consensus( + state: Sequence[MembershipRow], + adj: Sequence[Sequence[int]], + v: int, +) -> Dict[int, float]: + dist: Dict[int, float] = defaultdict(float) + n = len(state) + + if v < 0 or v >= len(adj) or v >= n: + return {} + + for u in adj[v]: + if u < 0 or u >= n: + continue + for cid, prob in state[u]: + if cid >= 0 and prob > 0.0: + dist[cid] += prob + + return _normalize(dist) + + +def _refine_anchor_preserving_patterns( + state: Sequence[MembershipRow], + full_adj: Sequence[Sequence[int]], + backbone_adj: Sequence[Sequence[int]], + cfg: LocalRefineConfig, + top_r: int, +) -> List[MembershipRow]: + """ + First Python version of the pattern-neighborhood refinement. + + The refinement is anchor-preserving: it only updates distributions over + anchor ids. It does not create new labels unrelated to the selected anchors. + """ + + if not cfg.enable_pattern_refinement: + return [list(row) for row in state] + + n = len(state) + r = _safe_int(top_r) + + if n <= 0: + return [] + + pattern_to_pid: Dict[Tuple[int, ...], int] = {} + patterns: List[Dict[str, object]] = [] + node_pid = [-1] * n + + for v, row in enumerate(state): + key = _support_key(row) + if key not in pattern_to_pid: + pattern_to_pid[key] = len(patterns) + patterns.append( + { + "key": key, + "nodes": [], + "internal_edges": 0, + "external_edges": 0, + "ext_counts": defaultdict(int), + "target": defaultdict(float), + } + ) + + pid = pattern_to_pid[key] + node_pid[v] = pid + patterns[pid]["nodes"].append(v) + + if not patterns: + return [list(row) for row in state] + + # Estimate pattern structure and target anchor distributions. + for u in range(n): + pu = node_pid[u] + if pu < 0: + continue + + for v in full_adj[u]: + if v < 0 or v >= n: + continue + pv = node_pid[v] + if pv < 0: + continue + + if pu == pv: + patterns[pu]["internal_edges"] += 1 + else: + patterns[pu]["external_edges"] += 1 + patterns[pu]["ext_counts"][pv] += 1 + + for cid, prob in state[u]: + if cid >= 0 and prob > 0.0: + patterns[pu]["target"][cid] += prob + + pattern_confidence = [0.0] * len(patterns) + pattern_target: List[Dict[int, float]] = [] + + for pid, pat in enumerate(patterns): + internal_edges = int(pat["internal_edges"]) + external_edges = int(pat["external_edges"]) + total_edges = internal_edges + external_edges + + self_fraction = 0.0 if total_edges <= 0 else float(internal_edges) / total_edges + neighbor_entropy = _normalized_entropy(pat["ext_counts"]) + + w_self = max(0.0, cfg.confidence_self_fraction_weight) + w_entropy = max(0.0, cfg.confidence_low_entropy_weight) + w_sum = w_self + w_entropy + + if w_sum <= 1e-20: + conf = 0.0 + else: + conf = ( + w_self * _clamp(self_fraction, 0.0, 1.0) + + w_entropy * _clamp(1.0 - neighbor_entropy, 0.0, 1.0) + ) / w_sum + + conf = _clamp(conf, cfg.pattern_confidence_floor, cfg.pattern_confidence_ceiling) + pattern_confidence[pid] = conf + + target = _normalize(pat["target"]) + target = _sharpen(target, cfg.target_sharpen_gamma) + pattern_target.append(target) + + refined: List[MembershipRow] = [list(row) for row in state] + + for v in range(n): + pid = node_pid[v] + if pid < 0: + continue + + own = _normalize(dict(state[v])) + if not own: + continue + + neigh = _neighbor_anchor_consensus(state, backbone_adj, v) + target = pattern_target[pid] + + local_same = _same_pattern_neighbor_ratio(backbone_adj, node_pid, v) + local_mode = _clamp(local_same, 0.0, 1.0) ** max(1.0, cfg.node_mode_power) + + consensus: Dict[int, float] = defaultdict(float) + for cid, prob in target.items(): + consensus[cid] += cfg.pattern_target_mix * prob + + for cid, prob in neigh.items(): + consensus[cid] += (1.0 - cfg.pattern_target_mix) * prob + + consensus = _normalize(consensus) + if not consensus: + continue + + strength = _clamp(cfg.update_strength * pattern_confidence[pid] * local_mode, 0.0, 1.0) + + updated: Dict[int, float] = defaultdict(float) + for cid, prob in own.items(): + updated[cid] += (1.0 - strength) * prob + + for cid, prob in consensus.items(): + updated[cid] += strength * prob + + items = [ + (cid, prob) + for cid, prob in updated.items() + if cid >= 0 and prob > cfg.min_abs_mass_to_keep + ] + items.sort(key=lambda x: (-x[1], x[0])) + items = items[:r] + + if cfg.renormalize: + refined[v] = _topk_items(_normalize(dict(items)), r) + else: + refined[v] = items + + return refined + + +def _decode_state_labels( + state: Sequence[MembershipRow], + theta: float, + max_memberships: int, +) -> List[List[int]]: + """ + Decode node-level anchor labels. + + This follows the C++ main decoding behavior: keep labels with probability + above theta; if none survive, keep the top label as a fallback. + """ + + keep_max = _safe_int(max_memberships) + theta = float(theta) + memberships: List[List[int]] = [] + + for row in state: + items = [(cid, prob) for cid, prob in row if cid >= 0 and prob > 0.0] + items.sort(key=lambda x: (-x[1], x[0])) + + labels = [] + for cid, prob in items: + if len(labels) >= keep_max: + break + if prob >= theta: + labels.append(cid) + + if not labels and items: + labels.append(items[0][0]) + + memberships.append(labels) + + return memberships + + +def _build_communities_from_memberships( + node_memberships: Sequence[Sequence[int]], + nodes: Sequence[Node], + min_community_size: int = 1, +) -> List[List[Node]]: + """ + Convert node-level anchor memberships into CDlib-style communities. + + This function intentionally keeps duplicate, nested, and graph-scale + communities. Highway's decoding stage treats each surviving anchor label as + an output community. Therefore, if two anchor labels induce the same node + set, both are preserved instead of being merged in post-processing. + + Only communities smaller than min_community_size are removed. + """ + + comm_to_nodes: Dict[int, List[Node]] = defaultdict(list) + + for vid, labels in enumerate(node_memberships): + for cid in labels: + comm_to_nodes[cid].append(nodes[vid]) + + communities = [] + min_community_size = _safe_int(min_community_size) + + for cid in sorted(comm_to_nodes): + comm = comm_to_nodes[cid] + if len(comm) >= min_community_size: + communities.append(comm) + + return communities + + +def highway_nx( + G: nx.Graph, + highway_top_r: int = 3, + mod_jaccard_alpha: float = 0.70, + ensure_min1_per_node: bool = True, + symmetrize: bool = True, + max_anchors: int | None = None, + prop_top_r: int = 3, + prop_T: int = 10, + prop_damping: float = 0.90, + prop_eta_leak: float = 0.0, + prop_tau: float = 0.85, + enable_pattern_refinement: bool = True, + local_update_strength: float = 0.45, + local_pattern_target_mix: float = 0.65, + local_node_mode_power: float = 2.00, + decode_theta: float = 0.30, + max_memberships: int = 3, + min_community_size: int = 1, +) -> List[List[Node]]: + """ + Run the Highway overlapping community detection algorithm on a NetworkX graph. + + Parameters are named to match the current C++ implementation where possible. + + Returns: + A list of overlapping communities, where each community is represented + as a list of original NetworkX node labels. + """ + + if G is None: + raise ValueError("G must be a NetworkX graph.") + + nodes, _, full_adj = _relabel_graph_to_contiguous(G) + n = len(nodes) + + if n == 0: + return [] + + hcfg = HighwayBuildConfig( + top_r=highway_top_r, + mod_jaccard_alpha=mod_jaccard_alpha, + ensure_min1_per_node=ensure_min1_per_node, + symmetrize=symmetrize, + ) + pcfg = PropConfig( + top_r=prop_top_r, + T=prop_T, + damping=prop_damping, + eta_leak=prop_eta_leak, + tau=prop_tau, + ) + rcfg = LocalRefineConfig( + enable_pattern_refinement=enable_pattern_refinement, + update_strength=local_update_strength, + pattern_target_mix=local_pattern_target_mix, + node_mode_power=local_node_mode_power, + ) + + highway_adj = _build_highway_adjacency(full_adj, hcfg) + + if max_anchors is None or max_anchors < 0: + max_anchors = max(8, min(30, n // 5)) + + anchors = _select_anchors_greedy_dedup(full_adj, max_anchors) + + state = _propagate_anchor_memberships( + highway_adj=highway_adj, + full_adj=full_adj, + anchors=anchors, + cfg=pcfg, + ) + + state_refined = _refine_anchor_preserving_patterns( + state=state, + full_adj=full_adj, + backbone_adj=highway_adj, + cfg=rcfg, + top_r=prop_top_r, + ) + + node_memberships = _decode_state_labels( + state=state_refined, + theta=decode_theta, + max_memberships=max_memberships, + ) + + return _build_communities_from_memberships( + node_memberships=node_memberships, + nodes=nodes, + min_community_size=min_community_size, + ) diff --git a/cdlib/algorithms/overlapping_partition.py b/cdlib/algorithms/overlapping_partition.py index 8a1be26..f9b9cd0 100644 --- a/cdlib/algorithms/overlapping_partition.py +++ b/cdlib/algorithms/overlapping_partition.py @@ -31,6 +31,7 @@ endntm_find_overlap_cluster, endntm_evalFuction, ) +from cdlib.algorithms.internal.Highway import highway_nx from cdlib.prompt_utils import report_missing_packages import warnings @@ -116,6 +117,7 @@ "coach", "graph_entropy", "ebgc", + "highway", ] @@ -2100,3 +2102,114 @@ def ebgc( return NodeClustering( clustering, g_original, "ebgc", method_parameters={}, overlap=True ) + +def highway( + g_original: object, + highway_top_r: int = 3, + mod_jaccard_alpha: float = 0.70, + ensure_min1_per_node: bool = True, + symmetrize: bool = True, + max_anchors: int = None, + prop_top_r: int = 3, + prop_T: int = 10, + prop_damping: float = 0.90, + prop_eta_leak: float = 0.0, + prop_tau: float = 0.85, + enable_pattern_refinement: bool = True, + local_update_strength: float = 0.45, + local_pattern_target_mix: float = 0.65, + local_node_mode_power: float = 2.00, + decode_theta: float = 0.30, + max_memberships: int = 3, + min_community_size: int = 1, +) -> NodeClustering: + """ + Highway is a scalable overlapping community detection algorithm for + networked data. It builds a sparse structural backbone, selects anchor + nodes, propagates anchor memberships, and decodes overlapping communities + from the propagated membership patterns. + + **Supported Graph Types** + + ========== ======== ======== + Undirected Directed Weighted + ========== ======== ======== + Yes No No + ========== ======== ======== + + :param g_original: a networkx/igraph object. + :param highway_top_r: number of retained backbone neighbors per node. + :param mod_jaccard_alpha: balance between modularity-inspired edge score and Jaccard edge score. + :param ensure_min1_per_node: whether to retain at least one backbone edge for non-isolated nodes. + :param symmetrize: whether to symmetrize the retained directed backbone edges. + :param max_anchors: maximum number of selected anchors. If None, uses max(8, min(30, n // 5)). + :param prop_top_r: number of memberships retained during propagation. + :param prop_T: number of propagation iterations. + :param prop_damping: damping factor used during propagation. + :param prop_eta_leak: optional leakage from the full graph during propagation. + :param prop_tau: softmax temperature used during propagation. + :param enable_pattern_refinement: whether to use anchor-preserving pattern refinement. + :param local_update_strength: strength of local pattern refinement. + :param local_pattern_target_mix: mixture weight between pattern target and neighbor consensus. + :param local_node_mode_power: exponent controlling node-level pattern confidence. + :param decode_theta: membership decoding threshold. + :param max_memberships: maximum memberships retained per node. + :param min_community_size: minimum size of returned communities. + :return: NodeClustering object. + + :Example: + + >>> from cdlib import algorithms + >>> import networkx as nx + >>> G = nx.karate_club_graph() + >>> coms = algorithms.highway(G) + """ + + g = convert_graph_formats(g_original, nx.Graph) + + coms = highway_nx( + g, + highway_top_r=highway_top_r, + mod_jaccard_alpha=mod_jaccard_alpha, + ensure_min1_per_node=ensure_min1_per_node, + symmetrize=symmetrize, + max_anchors=max_anchors, + prop_top_r=prop_top_r, + prop_T=prop_T, + prop_damping=prop_damping, + prop_eta_leak=prop_eta_leak, + prop_tau=prop_tau, + enable_pattern_refinement=enable_pattern_refinement, + local_update_strength=local_update_strength, + local_pattern_target_mix=local_pattern_target_mix, + local_node_mode_power=local_node_mode_power, + decode_theta=decode_theta, + max_memberships=max_memberships, + min_community_size=min_community_size, + ) + + return NodeClustering( + coms, + g_original, + "Highway", + method_parameters={ + "highway_top_r": highway_top_r, + "mod_jaccard_alpha": mod_jaccard_alpha, + "ensure_min1_per_node": ensure_min1_per_node, + "symmetrize": symmetrize, + "max_anchors": max_anchors, + "prop_top_r": prop_top_r, + "prop_T": prop_T, + "prop_damping": prop_damping, + "prop_eta_leak": prop_eta_leak, + "prop_tau": prop_tau, + "enable_pattern_refinement": enable_pattern_refinement, + "local_update_strength": local_update_strength, + "local_pattern_target_mix": local_pattern_target_mix, + "local_node_mode_power": local_node_mode_power, + "decode_theta": decode_theta, + "max_memberships": max_memberships, + "min_community_size": min_community_size, + }, + overlap=True, + ) \ No newline at end of file diff --git a/docs/reference/cd_algorithms/node_clustering.rst b/docs/reference/cd_algorithms/node_clustering.rst index 19eb41e..66c8a43 100644 --- a/docs/reference/cd_algorithms/node_clustering.rst +++ b/docs/reference/cd_algorithms/node_clustering.rst @@ -103,6 +103,7 @@ As a result, methods in this subclass return a ``NodeClustering`` object instanc slpa walkscan wCommunity + highway ^^^^^^^^^^^^^^^^^ From 096cac0691f08b1aeb981e971000b85176d1eba0 Mon Sep 17 00:00:00 2001 From: AmbitiousK <1724260766@qq.com> Date: Wed, 20 May 2026 22:57:07 -0400 Subject: [PATCH 2/4] Add Highway overlapping community detection algorithm --- cdlib/algorithms/internal/Highway.py | 2 +- cdlib/test/test_community_discovery_models.py | 23 +++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/cdlib/algorithms/internal/Highway.py b/cdlib/algorithms/internal/Highway.py index 3d3758e..c7f0a97 100644 --- a/cdlib/algorithms/internal/Highway.py +++ b/cdlib/algorithms/internal/Highway.py @@ -18,7 +18,7 @@ the C++ command-line interface, file I/O, tracing CSV writers, manifest writers, timers, and filesystem dependencies. -The output format follows CDlib internal implementations such as SLPA_nx.py: +The output format designs as follows: a list of communities, where each community is a list of original NetworkX nodes. """ diff --git a/cdlib/test/test_community_discovery_models.py b/cdlib/test/test_community_discovery_models.py index f4c1411..2328011 100644 --- a/cdlib/test/test_community_discovery_models.py +++ b/cdlib/test/test_community_discovery_models.py @@ -1063,3 +1063,26 @@ def test_bayan(self): if len(coms.communities) > 0: self.assertEqual(type(coms.communities[0]), list) self.assertEqual(type(coms.communities[0][0]), str) + + def test_highway(self): + g = get_string_graph() + coms = algorithms.highway(g) + + self.assertEqual(type(coms.communities), list) + self.assertTrue(coms.overlap) + + if len(coms.communities) > 0: + self.assertEqual(type(coms.communities[0]), list) + if len(coms.communities[0]) > 0: + self.assertEqual(type(coms.communities[0][0]), str) + + g = nx.karate_club_graph() + coms = algorithms.highway(g) + + self.assertEqual(type(coms.communities), list) + self.assertTrue(coms.overlap) + + if len(coms.communities) > 0: + self.assertEqual(type(coms.communities[0]), list) + if len(coms.communities[0]) > 0: + self.assertEqual(type(coms.communities[0][0]), int) \ No newline at end of file From 9b54db3cbf1dc92505a9e5640ed3f5fb2d156d51 Mon Sep 17 00:00:00 2001 From: AmbitiousK <1724260766@qq.com> Date: Thu, 21 May 2026 13:26:51 -0400 Subject: [PATCH 3/4] Add Highway C++ backend integration --- cdlib/algorithms/internal/Highway.py | 984 +++++------------- .../internal/highway_cpp/CMakeLists.txt | 34 + .../internal/highway_cpp/include/anchors.hpp | 24 + .../internal/highway_cpp/include/config.hpp | 26 + .../internal/highway_cpp/include/csr.hpp | 18 + .../internal/highway_cpp/include/decode.hpp | 14 + .../internal/highway_cpp/include/graph_io.hpp | 26 + .../internal/highway_cpp/include/highway.hpp | 33 + .../highway_cpp/include/propagation.hpp | 22 + .../highway_cpp/include/refinement.hpp | 64 ++ .../internal/highway_cpp/include/utils.hpp | 69 ++ .../internal/highway_cpp/src/anchors.cpp | 85 ++ .../internal/highway_cpp/src/csr.cpp | 55 + .../internal/highway_cpp/src/decode.cpp | 46 + .../internal/highway_cpp/src/graph_io.cpp | 71 ++ .../internal/highway_cpp/src/highway.cpp | 246 +++++ .../internal/highway_cpp/src/main.cpp | 430 ++++++++ .../internal/highway_cpp/src/propagation.cpp | 217 ++++ .../internal/highway_cpp/src/refinement.cpp | 427 ++++++++ cdlib/algorithms/overlapping_partition.py | 100 +- 20 files changed, 2252 insertions(+), 739 deletions(-) create mode 100644 cdlib/algorithms/internal/highway_cpp/CMakeLists.txt create mode 100644 cdlib/algorithms/internal/highway_cpp/include/anchors.hpp create mode 100644 cdlib/algorithms/internal/highway_cpp/include/config.hpp create mode 100644 cdlib/algorithms/internal/highway_cpp/include/csr.hpp create mode 100644 cdlib/algorithms/internal/highway_cpp/include/decode.hpp create mode 100644 cdlib/algorithms/internal/highway_cpp/include/graph_io.hpp create mode 100644 cdlib/algorithms/internal/highway_cpp/include/highway.hpp create mode 100644 cdlib/algorithms/internal/highway_cpp/include/propagation.hpp create mode 100644 cdlib/algorithms/internal/highway_cpp/include/refinement.hpp create mode 100644 cdlib/algorithms/internal/highway_cpp/include/utils.hpp create mode 100644 cdlib/algorithms/internal/highway_cpp/src/anchors.cpp create mode 100644 cdlib/algorithms/internal/highway_cpp/src/csr.cpp create mode 100644 cdlib/algorithms/internal/highway_cpp/src/decode.cpp create mode 100644 cdlib/algorithms/internal/highway_cpp/src/graph_io.cpp create mode 100644 cdlib/algorithms/internal/highway_cpp/src/highway.cpp create mode 100644 cdlib/algorithms/internal/highway_cpp/src/main.cpp create mode 100644 cdlib/algorithms/internal/highway_cpp/src/propagation.cpp create mode 100644 cdlib/algorithms/internal/highway_cpp/src/refinement.cpp diff --git a/cdlib/algorithms/internal/Highway.py b/cdlib/algorithms/internal/Highway.py index c7f0a97..e583bb0 100644 --- a/cdlib/algorithms/internal/Highway.py +++ b/cdlib/algorithms/internal/Highway.py @@ -1,709 +1,146 @@ """ -Pure-Python reference implementation of the Highway overlapping community -detection algorithm for CDlib. +C++ backend wrapper for the Highway overlapping community detection algorithm. This module is intended to live at: cdlib/algorithms/internal/Highway.py -The public CDlib wrapper should be added in: +The compiled C++ executable is expected at: + + cdlib/algorithms/internal/highway_cpp/build/highway + +The public CDlib wrapper in: cdlib/algorithms/overlapping_partition.py -and should call: +should call: highway_nx(G, ...) -The implementation below keeps only algorithmic logic. It intentionally removes -the C++ command-line interface, file I/O, tracing CSV writers, manifest writers, -timers, and filesystem dependencies. - -The output format designs as follows: -a list of communities, where each community is a list of original NetworkX nodes. +This module keeps the CDlib-facing API in Python, but the actual Highway +algorithm is executed by the optimized C++ backend. """ from __future__ import annotations -from collections import defaultdict -from dataclasses import dataclass -from math import exp, log, sqrt -from typing import Dict, Hashable, Iterable, List, MutableMapping, Sequence, Tuple +import json +import os +import subprocess +import tempfile +from pathlib import Path +from typing import Dict, Hashable, List, Sequence, Tuple import networkx as nx Node = Hashable -MembershipRow = List[Tuple[int, float]] - - -@dataclass -class HighwayBuildConfig: - """Configuration for sparse Highway backbone construction.""" - - top_r: int = 3 - mod_jaccard_alpha: float = 0.70 - ensure_min1_per_node: bool = True - symmetrize: bool = True -@dataclass -class PropConfig: - """Configuration for anchor-membership propagation.""" - - top_r: int = 3 - T: int = 10 - damping: float = 0.90 - eta_leak: float = 0.0 - tau: float = 0.85 - eps: float = 1e-12 - - -@dataclass -class LocalRefineConfig: +def _default_highway_cpp_binary() -> Path: """ - Lightweight Python version of the anchor-preserving pattern refinement. + Return the default path of the compiled Highway C++ executable. - This is intentionally conservative for CDlib: it keeps the same high-level - idea as the C++ implementation but avoids trace files and implementation - details that are mainly for experiments. + Expected layout: + cdlib/algorithms/internal/highway_cpp/build/highway """ + return Path(__file__).resolve().parent / "highway_cpp" / "build" / "highway" - enable_pattern_refinement: bool = True - confidence_self_fraction_weight: float = 0.50 - confidence_low_entropy_weight: float = 0.50 - pattern_confidence_floor: float = 0.05 - pattern_confidence_ceiling: float = 0.95 - update_strength: float = 0.45 - node_mode_power: float = 2.00 - pattern_target_mix: float = 0.65 - target_sharpen_gamma: float = 1.00 - min_abs_mass_to_keep: float = 1e-12 - renormalize: bool = True - - -def _safe_int(value: int, lo: int = 1) -> int: - return max(lo, int(value)) - - -def _clamp(value: float, lo: float, hi: float) -> float: - if lo > hi: - lo, hi = hi, lo - return max(lo, min(hi, float(value))) - - -def _normalize(dist: MutableMapping[int, float]) -> Dict[int, float]: - total = sum(v for v in dist.values() if v > 0.0) - if total <= 0.0: - return {} - return {k: float(v) / total for k, v in dist.items() if v > 0.0} - - -def _sharpen(dist: MutableMapping[int, float], gamma: float) -> Dict[int, float]: - gamma = max(1.0, float(gamma)) - if not dist: - return {} - if abs(gamma - 1.0) <= 1e-12: - return _normalize(dist) - return _normalize({k: max(0.0, v) ** gamma for k, v in dist.items()}) - - -def _topk_items(dist: MutableMapping[int, float], k: int) -> MembershipRow: - k = _safe_int(k) - items = [(cid, float(val)) for cid, val in dist.items() if cid >= 0 and val > 0.0] - items.sort(key=lambda x: (-x[1], x[0])) - return items[:k] - - -def _stable_softmax(items: MembershipRow, tau: float, eps: float = 1e-12) -> MembershipRow: - if not items: - return [] - - tau = max(float(tau), eps) - max_value = max(v for _, v in items) - - exps = [] - total = 0.0 - for cid, value in items: - ev = exp((value - max_value) / tau) - exps.append((cid, ev)) - total += ev - if total <= eps: - mass = 1.0 / len(items) - return [(cid, mass) for cid, _ in items] - - return [(cid, ev / total) for cid, ev in exps] - - -def _relabel_graph_to_contiguous( - g: nx.Graph, -) -> Tuple[List[Node], Dict[Node, int], List[List[int]]]: +def _write_networkx_edgelist_for_cpp( + G: nx.Graph, + path: Path, +) -> Tuple[List[Node], Dict[Node, int]]: """ - Convert a NetworkX graph with arbitrary node labels into contiguous ids. + Write a NetworkX graph as a contiguous integer edge list for the C++ backend. Returns: - nodes: original labels by contiguous id - node_to_id: original label -> contiguous id - adj: sorted adjacency lists using contiguous ids + nodes: original NetworkX node labels indexed by contiguous integer id. + node_to_id: mapping from original NetworkX node label to contiguous id. """ + if G.is_directed(): + G = G.to_undirected() - if g.is_directed(): - g = g.to_undirected() - - nodes = list(g.nodes()) + nodes = list(G.nodes()) node_to_id = {node: i for i, node in enumerate(nodes)} - n = len(nodes) - - adj_sets = [set() for _ in range(n)] - for u, v in g.edges(): - if u == v: - continue - if u not in node_to_id or v not in node_to_id: - continue - a = node_to_id[u] - b = node_to_id[v] - if a == b: - continue - adj_sets[a].add(b) - adj_sets[b].add(a) - - adj = [sorted(neigh) for neigh in adj_sets] - return nodes, node_to_id, adj - - -def _degrees(adj: Sequence[Sequence[int]]) -> List[int]: - return [len(neigh) for neigh in adj] - - -def _intersection_size_sorted(a: Sequence[int], b: Sequence[int]) -> int: - i = 0 - j = 0 - count = 0 - - while i < len(a) and j < len(b): - if a[i] == b[j]: - count += 1 - i += 1 - j += 1 - elif a[i] < b[j]: - i += 1 - else: - j += 1 - return count - - -def _jaccard_score(adj: Sequence[Sequence[int]], deg: Sequence[int], u: int, v: int) -> float: - du = max(0, deg[u]) - dv = max(0, deg[v]) - if du == 0 and dv == 0: - return 0.0 - - inter = _intersection_size_sorted(adj[u], adj[v]) - union = du + dv - inter - if union <= 0: - return 0.0 - - return float(inter) / float(union) - - -def _build_highway_adjacency( - full_adj: Sequence[Sequence[int]], - cfg: HighwayBuildConfig, -) -> List[List[int]]: - """ - Build sparse Highway backbone using the mod-jaccard hybrid score. - - For each source node v, retain the top-r neighbors according to: - - score(u, v) = alpha * s_mod(u, v) + (1 - alpha) * s_jac(u, v) - - where s_mod(u, v) = 1 - deg(u) deg(v) / (2m) for observed edges. - """ - - n = len(full_adj) - if n == 0: - return [] - - deg = _degrees(full_adj) - m_undirected = max(1.0, 0.5 * sum(deg)) - r = _safe_int(cfg.top_r) - alpha = _clamp(cfg.mod_jaccard_alpha, 0.0, 1.0) - - picked: List[List[int]] = [[] for _ in range(n)] - - for v in range(n): - neigh = list(full_adj[v]) - if not neigh: - continue - - scored = [] - dv = float(max(0, deg[v])) - for u in neigh: - du = float(max(0, deg[u])) - modularity_score = 1.0 - (du * dv) / (2.0 * m_undirected) - jaccard_score = _jaccard_score(full_adj, deg, u, v) - hybrid_score = alpha * modularity_score + (1.0 - alpha) * jaccard_score - scored.append((u, hybrid_score)) - - scored.sort(key=lambda x: (-x[1], x[0])) - chosen = [u for u, _ in scored[: min(r, len(scored))]] - - if cfg.ensure_min1_per_node and not chosen and scored: - chosen = [scored[0][0]] - - picked[v] = chosen - - if not cfg.symmetrize: - return [sorted(set(neigh)) for neigh in picked] - - undirected_edges = set() - for v, neigh in enumerate(picked): - for u in neigh: - if u < 0 or u >= n or u == v: + with path.open("w", encoding="utf-8") as f: + for u, v in G.edges(): + if u == v: continue - a = min(u, v) - b = max(u, v) - undirected_edges.add((a, b)) - - h_sets = [set() for _ in range(n)] - for a, b in undirected_edges: - h_sets[a].add(b) - h_sets[b].add(a) - - return [sorted(neigh) for neigh in h_sets] - - -def _select_anchors_greedy_dedup( - full_adj: Sequence[Sequence[int]], - max_anchors: int, -) -> List[int]: - """ - Greedy degree-cover anchor selection. - - Nodes are sorted by degree descending. A node becomes an anchor only if it - has not already been covered by a previously selected anchor or one of that - anchor's full-graph neighbors. - """ - - n = len(full_adj) - if n <= 0: - return [] - - max_anchors = _safe_int(max_anchors) - deg = _degrees(full_adj) - order = list(range(n)) - order.sort(key=lambda u: (-deg[u], u)) - - covered = [False] * n - anchors: List[int] = [] - - for u in order: - if len(anchors) >= max_anchors: - break - if covered[u]: - continue + f.write(f"{node_to_id[u]} {node_to_id[v]}\n") - anchors.append(u) - covered[u] = True + return nodes, node_to_id - for v in full_adj[u]: - if 0 <= v < n: - covered[v] = True - - if not anchors: - anchors.append(order[0]) - - return anchors - - -def _initial_state(n: int, r: int, anchors: Sequence[int]) -> List[MembershipRow]: - r = _safe_int(r) - state: List[MembershipRow] = [[] for _ in range(n)] - - for cid, v in enumerate(anchors): - if 0 <= v < n: - state[v] = [(cid, 1.0)] - - return state - - -def _propagate_anchor_memberships( - highway_adj: Sequence[Sequence[int]], - full_adj: Sequence[Sequence[int]], - anchors: Sequence[int], - cfg: PropConfig, -) -> List[MembershipRow]: - """ - Neighbor-only top-r anchor-membership propagation. - - A node is updated from its neighbors' previous anchor memberships. The node's - own previous state is not directly reinforced during the main aggregation. - """ - - n = len(highway_adj) - r = _safe_int(cfg.top_r) - k = len(anchors) - - state = _initial_state(n, r, anchors) - if n <= 0 or k <= 0: - return state - - deg_h = [max(1, d) for d in _degrees(highway_adj)] - deg_f = [max(1, d) for d in _degrees(full_adj)] - - T = max(0, int(cfg.T)) - damping = _clamp(cfg.damping, 0.0, 1.0) - eta_leak = max(0.0, float(cfg.eta_leak)) - - for _ in range(T): - next_state: List[MembershipRow] = [[] for _ in range(n)] - - for v in range(n): - acc: Dict[int, float] = defaultdict(float) - - for u in highway_adj[v]: - if u < 0 or u >= n: - continue - weight = 1.0 / sqrt(float(deg_h[u] * deg_h[v])) - - for cid, prob in state[u]: - if cid >= 0 and prob > 0.0: - acc[cid] += prob * weight - - if eta_leak > 0.0: - for u in full_adj[v]: - if u < 0 or u >= n: - continue - weight = eta_leak / sqrt(float(deg_f[u] * deg_f[v])) - - for cid, prob in state[u]: - if cid >= 0 and prob > 0.0: - acc[cid] += prob * weight - - if not acc: - next_state[v] = list(state[v]) - continue - top = _topk_items(acc, min(r, k)) - next_state[v] = _stable_softmax(top, cfg.tau, cfg.eps) - - if damping < 1.0: - mixed_state: List[MembershipRow] = [[] for _ in range(n)] - for v in range(n): - mixed: Dict[int, float] = defaultdict(float) - - for cid, prob in next_state[v]: - mixed[cid] += damping * prob - - for cid, prob in state[v]: - mixed[cid] += (1.0 - damping) * prob - - top = _topk_items(mixed, min(r, k)) - mixed_state[v] = _normalize(dict(top)) - mixed_state[v] = _topk_items(mixed_state[v], min(r, k)) - - state = mixed_state - else: - state = next_state - - return state - - -def _support_key(row: MembershipRow) -> Tuple[int, ...]: - ids = sorted({cid for cid, prob in row if cid >= 0 and prob > 0.0}) - return tuple(ids) - - -def _normalized_entropy(counts: MutableMapping[int, int]) -> float: - positive = [v for v in counts.values() if v > 0] - total = float(sum(positive)) - k = len(positive) - - if total <= 0.0 or k <= 1: - return 0.0 - - h = 0.0 - for count in positive: - p = float(count) / total - h -= p * log(p + 1e-12) - - return h / log(float(k)) - - -def _same_pattern_neighbor_ratio( - adj: Sequence[Sequence[int]], - node_pid: Sequence[int], - v: int, -) -> float: - if v < 0 or v >= len(adj) or v >= len(node_pid): - return 0.0 - - pid = node_pid[v] - if pid < 0: - return 0.0 - - total = 0 - same = 0 - - for u in adj[v]: - if u < 0 or u >= len(node_pid): - continue - qid = node_pid[u] - if qid < 0: - continue - total += 1 - if qid == pid: - same += 1 - - if total <= 0: - return 0.0 - - return float(same) / float(total) - - -def _neighbor_anchor_consensus( - state: Sequence[MembershipRow], - adj: Sequence[Sequence[int]], - v: int, -) -> Dict[int, float]: - dist: Dict[int, float] = defaultdict(float) - n = len(state) - - if v < 0 or v >= len(adj) or v >= n: - return {} - - for u in adj[v]: - if u < 0 or u >= n: - continue - for cid, prob in state[u]: - if cid >= 0 and prob > 0.0: - dist[cid] += prob - - return _normalize(dist) - - -def _refine_anchor_preserving_patterns( - state: Sequence[MembershipRow], - full_adj: Sequence[Sequence[int]], - backbone_adj: Sequence[Sequence[int]], - cfg: LocalRefineConfig, - top_r: int, -) -> List[MembershipRow]: +def _read_cpp_communities_json( + path: Path, + nodes: Sequence[Node], + min_community_size: int = 1, +) -> List[List[Node]]: """ - First Python version of the pattern-neighborhood refinement. + Read communities from the C++ communities.json output. - The refinement is anchor-preserving: it only updates distributions over - anchor ids. It does not create new labels unrelated to the selected anchors. + The C++ output uses contiguous integer node ids. This function maps them + back to original NetworkX node labels. """ - - if not cfg.enable_pattern_refinement: - return [list(row) for row in state] - - n = len(state) - r = _safe_int(top_r) - - if n <= 0: + if not path.exists(): return [] - pattern_to_pid: Dict[Tuple[int, ...], int] = {} - patterns: List[Dict[str, object]] = [] - node_pid = [-1] * n - - for v, row in enumerate(state): - key = _support_key(row) - if key not in pattern_to_pid: - pattern_to_pid[key] = len(patterns) - patterns.append( - { - "key": key, - "nodes": [], - "internal_edges": 0, - "external_edges": 0, - "ext_counts": defaultdict(int), - "target": defaultdict(float), - } - ) - - pid = pattern_to_pid[key] - node_pid[v] = pid - patterns[pid]["nodes"].append(v) - - if not patterns: - return [list(row) for row in state] - - # Estimate pattern structure and target anchor distributions. - for u in range(n): - pu = node_pid[u] - if pu < 0: - continue - - for v in full_adj[u]: - if v < 0 or v >= n: - continue - pv = node_pid[v] - if pv < 0: - continue - - if pu == pv: - patterns[pu]["internal_edges"] += 1 - else: - patterns[pu]["external_edges"] += 1 - patterns[pu]["ext_counts"][pv] += 1 - - for cid, prob in state[u]: - if cid >= 0 and prob > 0.0: - patterns[pu]["target"][cid] += prob - - pattern_confidence = [0.0] * len(patterns) - pattern_target: List[Dict[int, float]] = [] - - for pid, pat in enumerate(patterns): - internal_edges = int(pat["internal_edges"]) - external_edges = int(pat["external_edges"]) - total_edges = internal_edges + external_edges - - self_fraction = 0.0 if total_edges <= 0 else float(internal_edges) / total_edges - neighbor_entropy = _normalized_entropy(pat["ext_counts"]) - - w_self = max(0.0, cfg.confidence_self_fraction_weight) - w_entropy = max(0.0, cfg.confidence_low_entropy_weight) - w_sum = w_self + w_entropy + min_community_size = max(1, int(min_community_size)) - if w_sum <= 1e-20: - conf = 0.0 - else: - conf = ( - w_self * _clamp(self_fraction, 0.0, 1.0) - + w_entropy * _clamp(1.0 - neighbor_entropy, 0.0, 1.0) - ) / w_sum + with path.open("r", encoding="utf-8") as f: + raw_communities = json.load(f) - conf = _clamp(conf, cfg.pattern_confidence_floor, cfg.pattern_confidence_ceiling) - pattern_confidence[pid] = conf + communities: List[List[Node]] = [] - target = _normalize(pat["target"]) - target = _sharpen(target, cfg.target_sharpen_gamma) - pattern_target.append(target) + for raw_comm in raw_communities: + comm = [] - refined: List[MembershipRow] = [list(row) for row in state] + for vid in raw_comm: + vid = int(vid) + if 0 <= vid < len(nodes): + comm.append(nodes[vid]) - for v in range(n): - pid = node_pid[v] - if pid < 0: - continue - - own = _normalize(dict(state[v])) - if not own: - continue - - neigh = _neighbor_anchor_consensus(state, backbone_adj, v) - target = pattern_target[pid] - - local_same = _same_pattern_neighbor_ratio(backbone_adj, node_pid, v) - local_mode = _clamp(local_same, 0.0, 1.0) ** max(1.0, cfg.node_mode_power) - - consensus: Dict[int, float] = defaultdict(float) - for cid, prob in target.items(): - consensus[cid] += cfg.pattern_target_mix * prob - - for cid, prob in neigh.items(): - consensus[cid] += (1.0 - cfg.pattern_target_mix) * prob - - consensus = _normalize(consensus) - if not consensus: - continue - - strength = _clamp(cfg.update_strength * pattern_confidence[pid] * local_mode, 0.0, 1.0) - - updated: Dict[int, float] = defaultdict(float) - for cid, prob in own.items(): - updated[cid] += (1.0 - strength) * prob - - for cid, prob in consensus.items(): - updated[cid] += strength * prob - - items = [ - (cid, prob) - for cid, prob in updated.items() - if cid >= 0 and prob > cfg.min_abs_mass_to_keep - ] - items.sort(key=lambda x: (-x[1], x[0])) - items = items[:r] - - if cfg.renormalize: - refined[v] = _topk_items(_normalize(dict(items)), r) - else: - refined[v] = items - - return refined - - -def _decode_state_labels( - state: Sequence[MembershipRow], - theta: float, - max_memberships: int, -) -> List[List[int]]: - """ - Decode node-level anchor labels. - - This follows the C++ main decoding behavior: keep labels with probability - above theta; if none survive, keep the top label as a fallback. - """ - - keep_max = _safe_int(max_memberships) - theta = float(theta) - memberships: List[List[int]] = [] - - for row in state: - items = [(cid, prob) for cid, prob in row if cid >= 0 and prob > 0.0] - items.sort(key=lambda x: (-x[1], x[0])) - - labels = [] - for cid, prob in items: - if len(labels) >= keep_max: - break - if prob >= theta: - labels.append(cid) - - if not labels and items: - labels.append(items[0][0]) - - memberships.append(labels) + if len(comm) >= min_community_size: + communities.append(comm) - return memberships + return communities -def _build_communities_from_memberships( - node_memberships: Sequence[Sequence[int]], - nodes: Sequence[Node], - min_community_size: int = 1, +def _remove_exact_duplicate_communities( + communities: List[List[Node]], + deduplicate_communities: bool = True, ) -> List[List[Node]]: """ - Convert node-level anchor memberships into CDlib-style communities. + Remove exactly duplicated communities. - This function intentionally keeps duplicate, nested, and graph-scale - communities. Highway's decoding stage treats each surviving anchor label as - an output community. Therefore, if two anchor labels induce the same node - set, both are preserved instead of being merged in post-processing. + Two communities are treated as duplicates if they contain the same node set, + regardless of node order. - Only communities smaller than min_community_size are removed. - """ + Args: + communities: + List of communities. + deduplicate_communities: + If True, remove exact duplicate communities. + If False, return communities unchanged. - comm_to_nodes: Dict[int, List[Node]] = defaultdict(list) + Returns: + A list of communities, optionally deduplicated. + """ + if not deduplicate_communities: + return communities - for vid, labels in enumerate(node_memberships): - for cid in labels: - comm_to_nodes[cid].append(nodes[vid]) + seen = set() + deduped: List[List[Node]] = [] - communities = [] - min_community_size = _safe_int(min_community_size) + for comm in communities: + key = tuple(sorted(comm, key=lambda x: str(x))) - for cid in sorted(comm_to_nodes): - comm = comm_to_nodes[cid] - if len(comm) >= min_community_size: - communities.append(comm) + if key not in seen: + seen.add(key) + deduped.append(comm) - return communities + return deduped def highway_nx( @@ -719,82 +156,209 @@ def highway_nx( prop_eta_leak: float = 0.0, prop_tau: float = 0.85, enable_pattern_refinement: bool = True, - local_update_strength: float = 0.45, - local_pattern_target_mix: float = 0.65, - local_node_mode_power: float = 2.00, + local_confidence_self_fraction_weight: float = 0.85, + local_confidence_low_entropy_weight: float = 0.15, + local_pattern_confidence_floor: float = 0.05, + local_pattern_confidence_ceiling: float = 1.00, + local_update_strength: float = 0.50, + local_node_mode_power: float = 1.50, + local_pattern_target_mix: float = 0.75, + local_target_sharpen_gamma: float = 1.20, + local_min_abs_mass_to_keep: float = 1e-8, + local_renormalize: bool = True, decode_theta: float = 0.30, max_memberships: int = 3, min_community_size: int = 1, + deduplicate_communities: bool = True, + cpp_binary: str | None = None, ) -> List[List[Node]]: """ - Run the Highway overlapping community detection algorithm on a NetworkX graph. - - Parameters are named to match the current C++ implementation where possible. + Run Highway using the optimized C++ executable. + + Args: + G: + Input NetworkX graph. + highway_top_r: + Number of top highway edges retained per node. + mod_jaccard_alpha: + Mixture weight between modularity-style score and Jaccard score. + ensure_min1_per_node: + Whether to keep at least one edge per node when possible. + symmetrize: + Whether to symmetrize the retained highway edges. + max_anchors: + Optional maximum number of anchors. + prop_top_r: + Number of propagated memberships retained per node. + prop_T: + Number of propagation iterations. + prop_damping: + Propagation damping factor. + prop_eta_leak: + Leakage parameter in propagation. + prop_tau: + Propagation threshold or temperature parameter. + enable_pattern_refinement: + Whether to enable local pattern refinement. + local_confidence_self_fraction_weight: + Weight for self-fraction confidence. + local_confidence_low_entropy_weight: + Weight for low-entropy confidence. + local_pattern_confidence_floor: + Minimum pattern confidence. + local_pattern_confidence_ceiling: + Maximum pattern confidence. + local_update_strength: + Strength of local refinement update. + local_node_mode_power: + Power parameter for local node mode. + local_pattern_target_mix: + Mixture weight for pattern target. + local_target_sharpen_gamma: + Sharpening parameter for local target. + local_min_abs_mass_to_keep: + Minimum absolute mass to keep during refinement. + local_renormalize: + Whether to renormalize local memberships. + decode_theta: + Decoding threshold. + max_memberships: + Maximum number of memberships per node. + min_community_size: + Minimum size of returned communities. + deduplicate_communities: + If True, remove exact duplicate communities before returning. + If False, preserve duplicate communities from the backend output. + cpp_binary: + Optional custom path to the Highway C++ binary. Returns: - A list of overlapping communities, where each community is represented - as a list of original NetworkX node labels. + A list of overlapping communities. Each community is represented as a + list of original NetworkX node labels. """ - if G is None: raise ValueError("G must be a NetworkX graph.") - nodes, _, full_adj = _relabel_graph_to_contiguous(G) - n = len(nodes) + if G.is_directed(): + G = G.to_undirected() + + nodes = list(G.nodes()) - if n == 0: + if len(nodes) == 0: return [] - hcfg = HighwayBuildConfig( - top_r=highway_top_r, - mod_jaccard_alpha=mod_jaccard_alpha, - ensure_min1_per_node=ensure_min1_per_node, - symmetrize=symmetrize, - ) - pcfg = PropConfig( - top_r=prop_top_r, - T=prop_T, - damping=prop_damping, - eta_leak=prop_eta_leak, - tau=prop_tau, - ) - rcfg = LocalRefineConfig( - enable_pattern_refinement=enable_pattern_refinement, - update_strength=local_update_strength, - pattern_target_mix=local_pattern_target_mix, - node_mode_power=local_node_mode_power, - ) - - highway_adj = _build_highway_adjacency(full_adj, hcfg) - - if max_anchors is None or max_anchors < 0: - max_anchors = max(8, min(30, n // 5)) - - anchors = _select_anchors_greedy_dedup(full_adj, max_anchors) - - state = _propagate_anchor_memberships( - highway_adj=highway_adj, - full_adj=full_adj, - anchors=anchors, - cfg=pcfg, - ) - - state_refined = _refine_anchor_preserving_patterns( - state=state, - full_adj=full_adj, - backbone_adj=highway_adj, - cfg=rcfg, - top_r=prop_top_r, - ) - - node_memberships = _decode_state_labels( - state=state_refined, - theta=decode_theta, - max_memberships=max_memberships, - ) - - return _build_communities_from_memberships( - node_memberships=node_memberships, - nodes=nodes, - min_community_size=min_community_size, - ) + if G.number_of_edges() == 0: + communities = [[node] for node in nodes] + return _remove_exact_duplicate_communities( + communities, + deduplicate_communities=deduplicate_communities, + ) + + binary = Path(cpp_binary) if cpp_binary is not None else _default_highway_cpp_binary() + + if not binary.exists(): + raise FileNotFoundError( + f"Highway C++ binary not found: {binary}. " + "Please compile the C++ backend first." + ) + + if not os.access(binary, os.X_OK): + raise PermissionError( + f"Highway C++ binary is not executable: {binary}. " + f"Run: chmod +x {binary}" + ) + + with tempfile.TemporaryDirectory(prefix="highway_cpp_") as tmpdir_name: + tmpdir = Path(tmpdir_name) + input_path = tmpdir / "graph.edgelist" + communities_json_path = tmpdir / "communities.json" + + nodes, _ = _write_networkx_edgelist_for_cpp(G, input_path) + + cmd = [ + str(binary), + "--input", str(input_path), + + "--highway_top_r", str(highway_top_r), + "--ensure_min1", "1" if ensure_min1_per_node else "0", + "--symmetrize", "1" if symmetrize else "0", + "--mod_jaccard_alpha", str(mod_jaccard_alpha), + + "--prop_top_r", str(prop_top_r), + "--prop_T", str(prop_T), + "--prop_damping", str(prop_damping), + "--prop_eta_leak", str(prop_eta_leak), + "--prop_tau", str(prop_tau), + + "--local_enable_pattern_refinement", + "1" if enable_pattern_refinement else "0", + "--local_confidence_self_fraction_weight", + str(local_confidence_self_fraction_weight), + "--local_confidence_low_entropy_weight", + str(local_confidence_low_entropy_weight), + "--local_pattern_confidence_floor", + str(local_pattern_confidence_floor), + "--local_pattern_confidence_ceiling", + str(local_pattern_confidence_ceiling), + "--local_update_strength", + str(local_update_strength), + "--local_node_mode_power", + str(local_node_mode_power), + "--local_pattern_target_mix", + str(local_pattern_target_mix), + "--local_target_sharpen_gamma", + str(local_target_sharpen_gamma), + "--local_min_abs_mass_to_keep", + str(local_min_abs_mass_to_keep), + "--local_renormalize", + "1" if local_renormalize else "0", + + "--decode_theta", str(decode_theta), + "--max_memberships", str(max_memberships), + ] + + if max_anchors is not None: + cmd.extend(["--max_anchors", str(max_anchors)]) + + completed = subprocess.run( + cmd, + check=False, + capture_output=True, + text=True, + cwd=str(tmpdir), + ) + + if completed.returncode != 0: + raise RuntimeError( + "Highway C++ backend failed.\n" + f"Command: {' '.join(cmd)}\n" + f"Working directory: {tmpdir}\n" + f"stdout:\n{completed.stdout}\n" + f"stderr:\n{completed.stderr}" + ) + + communities = _read_cpp_communities_json( + communities_json_path, + nodes, + min_community_size=min_community_size, + ) + + if not communities: + produced_files = "\n".join( + str(p) for p in tmpdir.rglob("*") if p.is_file() + ) + + raise RuntimeError( + "Highway C++ backend finished, but no communities were parsed.\n" + f"Command: {' '.join(cmd)}\n" + f"Working directory: {tmpdir}\n" + f"Expected file: {communities_json_path}\n" + f"Produced files:\n{produced_files}\n" + f"stdout:\n{completed.stdout}\n" + f"stderr:\n{completed.stderr}" + ) + + return _remove_exact_duplicate_communities( + communities, + deduplicate_communities=deduplicate_communities, + ) \ No newline at end of file diff --git a/cdlib/algorithms/internal/highway_cpp/CMakeLists.txt b/cdlib/algorithms/internal/highway_cpp/CMakeLists.txt new file mode 100644 index 0000000..83d159a --- /dev/null +++ b/cdlib/algorithms/internal/highway_cpp/CMakeLists.txt @@ -0,0 +1,34 @@ +cmake_minimum_required(VERSION 3.16) +project(highway LANGUAGES CXX) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +find_package(OpenMP) + +add_executable(highway + src/main.cpp + src/graph_io.cpp + src/csr.cpp + src/highway.cpp + src/anchors.cpp + src/propagation.cpp + src/refinement.cpp + src/decode.cpp +) + +target_include_directories(highway PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/include +) + +if(OpenMP_CXX_FOUND) + target_link_libraries(highway PRIVATE OpenMP::OpenMP_CXX) + target_compile_definitions(highway PRIVATE HIGHWAY_USE_OPENMP) +endif() + +if(MSVC) + target_compile_options(highway PRIVATE /O2) +else() + target_compile_options(highway PRIVATE -O3) +endif() \ No newline at end of file diff --git a/cdlib/algorithms/internal/highway_cpp/include/anchors.hpp b/cdlib/algorithms/internal/highway_cpp/include/anchors.hpp new file mode 100644 index 0000000..9f65286 --- /dev/null +++ b/cdlib/algorithms/internal/highway_cpp/include/anchors.hpp @@ -0,0 +1,24 @@ +#pragma once +#include "csr.hpp" +#include + +namespace highway { + +struct AnchorSelectionStep { + int step = -1; + int anchor = -1; // contig id + int degree = 0; // in the graph passed into selector + int newly_covered = 0; // how many uncovered nodes became covered due to this anchor + int total_covered = 0; // cumulative covered nodes after this step +}; + +// Original API kept for compatibility. +std::vector select_anchors_greedy_dedup(const CSR& full, int max_anchors); + +// New explainable API. +std::vector select_anchors_greedy_dedup_with_trace( + const CSR& full, + int max_anchors, + std::vector& trace); + +} // namespace highway \ No newline at end of file diff --git a/cdlib/algorithms/internal/highway_cpp/include/config.hpp b/cdlib/algorithms/internal/highway_cpp/include/config.hpp new file mode 100644 index 0000000..287ed19 --- /dev/null +++ b/cdlib/algorithms/internal/highway_cpp/include/config.hpp @@ -0,0 +1,26 @@ +#pragma once +#include + +namespace highway { + +struct HighwayBuildConfig { + int top_r = 3; + bool ensure_min1_per_node = true; + bool symmetrize = true; + + // Hybrid backbone score: + // score(u,v) = alpha * modularity_score(u,v) + (1 - alpha) * jaccard(u,v) + // alpha in [0, 1]. Larger alpha -> more modularity-driven. + double mod_jaccard_alpha = 0.70; +}; + +struct PropConfig { + int top_r = 3; + int T = 15; + double damping = 0.9; + double eta_leak = 0.0; + double tau = 0.01; + double eps = 1e-12; +}; + +} // namespace highway \ No newline at end of file diff --git a/cdlib/algorithms/internal/highway_cpp/include/csr.hpp b/cdlib/algorithms/internal/highway_cpp/include/csr.hpp new file mode 100644 index 0000000..e3de300 --- /dev/null +++ b/cdlib/algorithms/internal/highway_cpp/include/csr.hpp @@ -0,0 +1,18 @@ +#pragma once +#include + +namespace highway { + +struct CSR { + int n = 0; + std::vector indptr; // size n+1 + std::vector indices; // size m (two-directed) +}; + +// Build CSR from two-directed edges (src/dst) in contiguous space [0..n-1] +CSR build_csr_from_edges(int n, const std::vector& src, const std::vector& dst); + +// Degrees from CSR +std::vector degrees_from_csr(const CSR& csr); + +} // namespace highway \ No newline at end of file diff --git a/cdlib/algorithms/internal/highway_cpp/include/decode.hpp b/cdlib/algorithms/internal/highway_cpp/include/decode.hpp new file mode 100644 index 0000000..f74b783 --- /dev/null +++ b/cdlib/algorithms/internal/highway_cpp/include/decode.hpp @@ -0,0 +1,14 @@ +#pragma once +#include "propagation.hpp" +#include + +namespace highway { + +// node -> list of anchor-node-id (contig space) +std::vector> decode_overlapping_communities( + const TopRState& st, + const std::vector& anchors, + float theta = 0.30f, + int max_memberships = 3); + +} // namespace highway \ No newline at end of file diff --git a/cdlib/algorithms/internal/highway_cpp/include/graph_io.hpp b/cdlib/algorithms/internal/highway_cpp/include/graph_io.hpp new file mode 100644 index 0000000..7829ac1 --- /dev/null +++ b/cdlib/algorithms/internal/highway_cpp/include/graph_io.hpp @@ -0,0 +1,26 @@ +#pragma once +#include +#include +#include +#include +#include + +namespace highway { + +struct EdgeList { + // undirected edge list stored as two-directed (u->v and v->u) + int n = 0; // number of nodes in contiguous space [0..n-1] + std::vector src; + std::vector dst; + + // mapping: contig_id -> original_id (for writing outputs) + std::vector inv_map; +}; + +// Read edge list file: each line "u v" (original ids can be int64) +EdgeList read_edgelist_to_undirected_two_directed_contig(const std::string& path); + +// If you already have edges in memory (original ids), convert to contig + two-directed +EdgeList to_undirected_two_directed_contig(const std::vector>& edges); + +} // namespace highway \ No newline at end of file diff --git a/cdlib/algorithms/internal/highway_cpp/include/highway.hpp b/cdlib/algorithms/internal/highway_cpp/include/highway.hpp new file mode 100644 index 0000000..b7697ff --- /dev/null +++ b/cdlib/algorithms/internal/highway_cpp/include/highway.hpp @@ -0,0 +1,33 @@ +#pragma once +#include "config.hpp" +#include "csr.hpp" +#include + +namespace highway { + +struct HighwayEdgeDecision { + int src = -1; // contig id + int dst = -1; // contig id + double modularity_score = 0; // 1 - d_u d_v / (2m) + double jaccard_score = 0; + double hybrid_score = 0; + bool kept_by_src_topr = false; // selected when processing src + bool kept_final = false; // after symmetrize/final retention +}; + +// Original API kept for compatibility. +void build_highway_edges( + const CSR& full, + const HighwayBuildConfig& cfg, + std::vector& out_src, + std::vector& out_dst); + +// New explainable API. +void build_highway_edges_with_trace( + const CSR& full, + const HighwayBuildConfig& cfg, + std::vector& out_src, + std::vector& out_dst, + std::vector& trace); + +} // namespace highway \ No newline at end of file diff --git a/cdlib/algorithms/internal/highway_cpp/include/propagation.hpp b/cdlib/algorithms/internal/highway_cpp/include/propagation.hpp new file mode 100644 index 0000000..d2ca3b9 --- /dev/null +++ b/cdlib/algorithms/internal/highway_cpp/include/propagation.hpp @@ -0,0 +1,22 @@ +#pragma once +#include "config.hpp" +#include "csr.hpp" +#include + +namespace highway { + +struct TopRState { + int n = 0; + int r = 0; + int k = 0; + std::vector idx; // size n*r, -1 if empty + std::vector val; // size n*r +}; + +TopRState propagate_other_assignment_topr_cpu( + const CSR& highway_csr, + const CSR& full_csr, + const std::vector& anchors, // contig node ids + const PropConfig& cfg); + +} // namespace highway \ No newline at end of file diff --git a/cdlib/algorithms/internal/highway_cpp/include/refinement.hpp b/cdlib/algorithms/internal/highway_cpp/include/refinement.hpp new file mode 100644 index 0000000..6eb45b8 --- /dev/null +++ b/cdlib/algorithms/internal/highway_cpp/include/refinement.hpp @@ -0,0 +1,64 @@ +#pragma once + +#include "csr.hpp" +#include "propagation.hpp" + +namespace highway { + +// Anchor-preserving pattern decoding configuration. +// +// Design principle: +// 1) Extract support-set patterns from propagated anchor-indexed memberships. +// 2) Use each pattern only as a calibration unit, not as a new community id. +// 3) Build a pattern-level target distribution in the original anchor space. +// 4) Blend this target with local backbone-neighbor consensus. +// 5) Apply conservative shrinkage: +// alpha_new = (1 - lambda_v) * alpha_old + lambda_v * q_v +// +// Therefore, the returned TopRState preserves the original anchor-indexed +// community ids. This avoids the previous pattern-id reassignment problem. +struct LocalRefineConfig { + // Enable/disable anchor-preserving pattern decoding. + bool enable_pattern_refinement = true; + + // Pattern-confidence weights. + // confidence(P) is computed from: + // self_fraction(P): internal cohesion of the support pattern; + // 1 - neighbor_entropy(P): concentration of neighboring patterns. + // The weighted score is normalized by the total positive weight. + float confidence_self_fraction_weight = 0.75f; + float confidence_low_entropy_weight = 0.25f; + + // Bounds for pattern confidence. + float pattern_confidence_floor = 0.00f; + float pattern_confidence_ceiling = 1.00f; + + // Maximum shrinkage/update strength. + // Actual lambda_v is: + // update_strength * confidence(P_v) * same_pattern_ratio(v)^node_mode_power + float update_strength = 0.45f; + float node_mode_power = 2.00f; + + // Blend between pattern target and node-neighbor target: + // q_v = pattern_target_mix * q_pattern + + // (1 - pattern_target_mix) * q_neighbor + float pattern_target_mix = 0.65f; + + // Optional sharpening applied to pattern-level and final target distributions. + // gamma=1 keeps the distribution unchanged; gamma>1 makes it more peaked. + float target_sharpen_gamma = 1.00f; + + // Output sparsification. + bool renormalize = true; + float min_abs_mass_to_keep = 1e-8f; +}; + +// Anchor-preserving pattern decoding refinement. +// The returned TopRState keeps the original anchor ids as community ids. +TopRState refine_uncertain_nodes_set_search_cpu( + const TopRState& in, + const CSR& full_csr, + const CSR& backbone_csr, + const LocalRefineConfig& cfg); + +} // namespace highway diff --git a/cdlib/algorithms/internal/highway_cpp/include/utils.hpp b/cdlib/algorithms/internal/highway_cpp/include/utils.hpp new file mode 100644 index 0000000..6c99b53 --- /dev/null +++ b/cdlib/algorithms/internal/highway_cpp/include/utils.hpp @@ -0,0 +1,69 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace highway { + +inline bool is_finite(double x) { + return std::isfinite(x); +} + +inline void stable_softmax_inplace(std::vector& x, float tau, float eps = 1e-12f) { + if (x.empty()) return; + if (tau <= 0.0f) tau = 1e-6f; + + float mx = -std::numeric_limits::infinity(); + for (float v : x) mx = std::max(mx, v / tau); + + double sum = 0.0; + for (float& v : x) { + float z = (v / tau) - mx; + float e = std::exp(z); + v = e; + sum += e; + } + if (sum <= eps) { + // fallback uniform + float u = 1.0f / std::max(1, x.size()); + for (float& v : x) v = u; + return; + } + float inv = static_cast(1.0 / sum); + for (float& v : x) v *= inv; +} + +template +inline void topk_indices_values( + const std::vector& a, + int k, + std::vector& out_idx, + std::vector& out_val) { + out_idx.clear(); + out_val.clear(); + if (k <= 0 || a.empty()) return; + k = std::min(k, static_cast(a.size())); + + std::vector idx(a.size()); + std::iota(idx.begin(), idx.end(), 0); + + std::nth_element(idx.begin(), idx.begin() + (k - 1), idx.end(), + [&](int i, int j) { return a[i] > a[j]; }); + idx.resize(k); + + std::sort(idx.begin(), idx.end(), + [&](int i, int j) { return a[i] > a[j]; }); + + out_idx = idx; + out_val.reserve(k); + for (int i : out_idx) out_val.push_back(a[i]); +} + +} // namespace highway \ No newline at end of file diff --git a/cdlib/algorithms/internal/highway_cpp/src/anchors.cpp b/cdlib/algorithms/internal/highway_cpp/src/anchors.cpp new file mode 100644 index 0000000..88308d8 --- /dev/null +++ b/cdlib/algorithms/internal/highway_cpp/src/anchors.cpp @@ -0,0 +1,85 @@ +#include "anchors.hpp" +#include "csr.hpp" +#include +#include + +namespace highway { + +std::vector select_anchors_greedy_dedup_with_trace( + const CSR& full, + int max_anchors, + std::vector& trace) { + trace.clear(); + + const int n = full.n; + max_anchors = std::max(1, max_anchors); + if (n <= 0) return {}; + + std::vector deg = degrees_from_csr(full); + std::vector order(n); + std::iota(order.begin(), order.end(), 0); + std::sort(order.begin(), order.end(), [&](int a, int b) { + if (deg[a] != deg[b]) return deg[a] > deg[b]; + return a < b; + }); + + std::vector covered(n, 0); + std::vector anchors; + anchors.reserve(std::min(max_anchors, n)); + + int total_covered = 0; + int step_id = 0; + + for (int u : order) { + if ((int)anchors.size() >= max_anchors) break; + if (covered[u]) continue; + + anchors.push_back(u); + + int newly = 0; + if (!covered[u]) { + covered[u] = 1; + ++newly; + ++total_covered; + } + + int s0 = full.indptr[u]; + int s1 = full.indptr[u + 1]; + for (int p = s0; p < s1; ++p) { + int v = full.indices[p]; + if (v >= 0 && v < n && !covered[v]) { + covered[v] = 1; + ++newly; + ++total_covered; + } + } + + AnchorSelectionStep rec; + rec.step = step_id++; + rec.anchor = u; + rec.degree = deg[u]; + rec.newly_covered = newly; + rec.total_covered = total_covered; + trace.push_back(rec); + } + + if (anchors.empty()) { + anchors.push_back(order[0]); + AnchorSelectionStep rec; + rec.step = 0; + rec.anchor = order[0]; + rec.degree = deg[order[0]]; + rec.newly_covered = 1; + rec.total_covered = 1; + trace.push_back(rec); + } + + return anchors; +} + +std::vector select_anchors_greedy_dedup(const CSR& full, int max_anchors) { + std::vector ignored; + return select_anchors_greedy_dedup_with_trace(full, max_anchors, ignored); +} + +} // namespace highway \ No newline at end of file diff --git a/cdlib/algorithms/internal/highway_cpp/src/csr.cpp b/cdlib/algorithms/internal/highway_cpp/src/csr.cpp new file mode 100644 index 0000000..bf7ab8a --- /dev/null +++ b/cdlib/algorithms/internal/highway_cpp/src/csr.cpp @@ -0,0 +1,55 @@ +#include "csr.hpp" +#include +#include +#include + +namespace highway { + +CSR build_csr_from_edges(int n, const std::vector& src, const std::vector& dst) { + if (src.size() != dst.size()) throw std::runtime_error("src/dst size mismatch"); + CSR csr; + csr.n = n; + const size_t m = src.size(); + + csr.indptr.assign(n + 1, 0); + + // count + for (size_t i = 0; i < m; ++i) { + int u = src[i]; + if (u < 0 || u >= n) throw std::runtime_error("src out of range"); + csr.indptr[u + 1] += 1; + } + + // prefix sum + for (int i = 0; i < n; ++i) csr.indptr[i + 1] += csr.indptr[i]; + + csr.indices.assign(m, -1); + std::vector cur = csr.indptr; // copy + + // fill + for (size_t i = 0; i < m; ++i) { + int u = src[i]; + int v = dst[i]; + int p = cur[u]++; + csr.indices[p] = v; + } + + // sort neighbor lists (helps deterministic behavior) + for (int u = 0; u < n; ++u) { + int s0 = csr.indptr[u]; + int s1 = csr.indptr[u + 1]; + std::sort(csr.indices.begin() + s0, csr.indices.begin() + s1); + } + + return csr; +} + +std::vector degrees_from_csr(const CSR& csr) { + std::vector deg(csr.n, 0); + for (int u = 0; u < csr.n; ++u) { + deg[u] = csr.indptr[u + 1] - csr.indptr[u]; + } + return deg; +} + +} // namespace highway \ No newline at end of file diff --git a/cdlib/algorithms/internal/highway_cpp/src/decode.cpp b/cdlib/algorithms/internal/highway_cpp/src/decode.cpp new file mode 100644 index 0000000..3522ceb --- /dev/null +++ b/cdlib/algorithms/internal/highway_cpp/src/decode.cpp @@ -0,0 +1,46 @@ +#include "decode.hpp" +#include +#include + +namespace highway { + +std::vector> decode_overlapping_communities( + const TopRState& st, + const std::vector& anchors, + float theta, + int max_memberships) { + + max_memberships = std::max(1, max_memberships); + const int n = st.n; + const int r = st.r; + + std::vector> out; + out.reserve(n); + + for (int v = 0; v < n; ++v) { + std::vector> pairs; + pairs.reserve(r); + + for (int j = 0; j < r; ++j) { + int cid = st.idx[v * r + j]; + float pv = st.val[v * r + j]; + if (cid >= 0 && pv >= theta) { + int anchor_node = anchors[cid]; + pairs.emplace_back(anchor_node, pv); + } + } + + std::sort(pairs.begin(), pairs.end(), + [](auto& a, auto& b) { return a.second > b.second; }); + + std::vector labs; + for (int i = 0; i < (int)pairs.size() && i < max_memberships; ++i) { + labs.push_back(pairs[i].first); + } + out.push_back(std::move(labs)); + } + + return out; +} + +} // namespace highway \ No newline at end of file diff --git a/cdlib/algorithms/internal/highway_cpp/src/graph_io.cpp b/cdlib/algorithms/internal/highway_cpp/src/graph_io.cpp new file mode 100644 index 0000000..8e3699a --- /dev/null +++ b/cdlib/algorithms/internal/highway_cpp/src/graph_io.cpp @@ -0,0 +1,71 @@ +#include "graph_io.hpp" +#include +#include +#include + +namespace highway { + +static EdgeList build_from_edges(const std::vector>& edges) { + EdgeList out; + std::unordered_map map; + map.reserve(edges.size() * 2); + + auto get_id = [&](long long x) -> int { + auto it = map.find(x); + if (it != map.end()) return it->second; + int nid = static_cast(map.size()); + map.emplace(x, nid); + return nid; + }; + + // first pass: assign ids + for (auto [u, v] : edges) { + (void)get_id(u); + (void)get_id(v); + } + out.n = static_cast(map.size()); + out.inv_map.resize(out.n); + + // invert map + for (auto& kv : map) { + out.inv_map[kv.second] = kv.first; + } + + // build two-directed + out.src.reserve(edges.size() * 2); + out.dst.reserve(edges.size() * 2); + for (auto [u0, v0] : edges) { + int u = map[u0]; + int v = map[v0]; + if (u == v) continue; // drop self-loop (keep consistent w/ most pipelines) + out.src.push_back(u); out.dst.push_back(v); + out.src.push_back(v); out.dst.push_back(u); + } + + return out; +} + +EdgeList read_edgelist_to_undirected_two_directed_contig(const std::string& path) { + std::ifstream fin(path); + if (!fin) throw std::runtime_error("Failed to open file: " + path); + + std::vector> edges; + edges.reserve(1 << 20); + + std::string line; + while (std::getline(fin, line)) { + if (line.empty()) continue; + if (line[0] == '#') continue; + std::istringstream iss(line); + long long u, v; + if (!(iss >> u >> v)) continue; + edges.emplace_back(u, v); + } + return build_from_edges(edges); +} + +EdgeList to_undirected_two_directed_contig(const std::vector>& edges) { + return build_from_edges(edges); +} + +} // namespace highway \ No newline at end of file diff --git a/cdlib/algorithms/internal/highway_cpp/src/highway.cpp b/cdlib/algorithms/internal/highway_cpp/src/highway.cpp new file mode 100644 index 0000000..beeec40 --- /dev/null +++ b/cdlib/algorithms/internal/highway_cpp/src/highway.cpp @@ -0,0 +1,246 @@ +#include "highway.hpp" +#include "utils.hpp" +#include +#include +#include +#include +#include +#include +#include + +namespace highway { + +static inline int intersection_size_sorted( + const CSR& csr, + int u, + int v) { + int pu = csr.indptr[u]; + int pv = csr.indptr[v]; + const int eu = csr.indptr[u + 1]; + const int ev = csr.indptr[v + 1]; + + int inter = 0; + while (pu < eu && pv < ev) { + const int a = csr.indices[pu]; + const int b = csr.indices[pv]; + if (a == b) { + ++inter; + ++pu; + ++pv; + } else if (a < b) { + ++pu; + } else { + ++pv; + } + } + return inter; +} + +static inline double jaccard_score_from_csr( + const CSR& csr, + const std::vector& deg, + int u, + int v) { + const int du = std::max(0, deg[u]); + const int dv = std::max(0, deg[v]); + if (du == 0 && dv == 0) return 0.0; + + const int inter = intersection_size_sorted(csr, u, v); + const int uni = du + dv - inter; + if (uni <= 0) return 0.0; + return static_cast(inter) / static_cast(uni); +} + +void build_highway_edges_with_trace( + const CSR& full, + const HighwayBuildConfig& cfg, + std::vector& out_src, + std::vector& out_dst, + std::vector& trace) { + + const int n = full.n; + out_src.clear(); + out_dst.clear(); + trace.clear(); + + if (n <= 0) return; + + const std::vector deg_i = degrees_from_csr(full); + + // full stores an undirected graph as two-directed edges + const long long m_two_directed = static_cast(full.indices.size()); + const double m_undirected = std::max(1.0, 0.5 * static_cast(m_two_directed)); + + const int r = std::max(1, cfg.top_r); + const double alpha = std::max(0.0, std::min(1.0, cfg.mod_jaccard_alpha)); + + std::vector> picked(n); + std::vector> local_trace(n); + + #if defined(HIGHWAY_USE_OPENMP) + #pragma omp parallel for schedule(dynamic, 256) + #endif + for (int v = 0; v < n; ++v) { + const int s0 = full.indptr[v]; + const int s1 = full.indptr[v + 1]; + const int dv_i = s1 - s0; + if (dv_i <= 0) continue; + + std::vector neigh; + neigh.reserve(dv_i); + for (int p = s0; p < s1; ++p) { + neigh.push_back(full.indices[p]); + } + + std::vector scores(neigh.size(), -std::numeric_limits::infinity()); + std::vector modularity_scores(neigh.size(), 0.0); + std::vector jaccard_scores(neigh.size(), 0.0); + + for (size_t i = 0; i < neigh.size(); ++i) { + const int u = neigh[i]; + if (u < 0 || u >= n) continue; + + const double du = static_cast(std::max(0, deg_i[u])); + const double dv = static_cast(std::max(0, deg_i[v])); + + const double modularity_score = 1.0 - (du * dv) / (2.0 * m_undirected); + const double jaccard_score = jaccard_score_from_csr(full, deg_i, u, v); + const double score = + alpha * modularity_score + + (1.0 - alpha) * jaccard_score; + + modularity_scores[i] = modularity_score; + jaccard_scores[i] = jaccard_score; + scores[i] = score; + } + + std::vector topi; + std::vector topv; + topk_indices_values( + scores, + std::min(r, static_cast(scores.size())), + topi, + topv); + + std::vector chosen_mask(neigh.size(), 0); + for (int ii : topi) { + if (ii >= 0 && ii < static_cast(neigh.size())) chosen_mask[ii] = 1; + } + + std::vector chosen; + chosen.reserve(topi.size()); + for (int ii : topi) { + if (ii >= 0 && ii < static_cast(neigh.size())) { + chosen.push_back(neigh[ii]); + } + } + + if (cfg.ensure_min1_per_node && chosen.empty() && !neigh.empty()) { + int best_idx = 0; + double best_score = scores[0]; + for (int i = 1; i < static_cast(scores.size()); ++i) { + if (scores[i] > best_score) { + best_score = scores[i]; + best_idx = i; + } + } + chosen.push_back(neigh[best_idx]); + chosen_mask[best_idx] = 1; + } + + std::vector my_trace; + my_trace.reserve(neigh.size()); + for (size_t i = 0; i < neigh.size(); ++i) { + HighwayEdgeDecision rec; + rec.src = v; + rec.dst = neigh[i]; + rec.modularity_score = modularity_scores[i]; + rec.jaccard_score = jaccard_scores[i]; + rec.hybrid_score = scores[i]; + rec.kept_by_src_topr = (chosen_mask[i] != 0); + rec.kept_final = false; // finalized later + my_trace.push_back(rec); + } + + picked[v] = std::move(chosen); + local_trace[v] = std::move(my_trace); + } + + std::unordered_set final_kept_dir; + auto make_key = [&](int a, int b) -> long long { + return static_cast(a) * static_cast(n) + static_cast(b); + }; + + if (cfg.symmetrize) { + std::unordered_set dir; + dir.reserve(static_cast(n) * static_cast(r) * 2); + + for (int v = 0; v < n; ++v) { + for (int u : picked[v]) { + dir.insert(make_key(v, u)); + } + } + + std::unordered_set und_seen; + und_seen.reserve(dir.size()); + + out_src.reserve(dir.size() * 2); + out_dst.reserve(dir.size() * 2); + + for (int v = 0; v < n; ++v) { + for (int u : picked[v]) { + if (u < 0 || u >= n || u == v) continue; + + const bool keep = dir.count(make_key(v, u)) || dir.count(make_key(u, v)); + if (!keep) continue; + + const int a = std::min(u, v); + const int b = std::max(u, v); + const long long und_key = make_key(a, b); + + if (und_seen.insert(und_key).second) { + out_src.push_back(a); + out_dst.push_back(b); + out_src.push_back(b); + out_dst.push_back(a); + + final_kept_dir.insert(make_key(a, b)); + final_kept_dir.insert(make_key(b, a)); + } + } + } + } else { + size_t est = 0; + for (int v = 0; v < n; ++v) est += picked[v].size(); + + out_src.reserve(est); + out_dst.reserve(est); + + for (int v = 0; v < n; ++v) { + for (int u : picked[v]) { + if (u < 0 || u >= n || u == v) continue; + out_src.push_back(v); + out_dst.push_back(u); + final_kept_dir.insert(make_key(v, u)); + } + } + } + + for (int v = 0; v < n; ++v) { + for (auto& rec : local_trace[v]) { + rec.kept_final = final_kept_dir.count(make_key(rec.src, rec.dst)) > 0; + trace.push_back(rec); + } + } +} + +void build_highway_edges( + const CSR& full, + const HighwayBuildConfig& cfg, + std::vector& out_src, + std::vector& out_dst) { + std::vector ignored; + build_highway_edges_with_trace(full, cfg, out_src, out_dst, ignored); +} + +} // namespace highway \ No newline at end of file diff --git a/cdlib/algorithms/internal/highway_cpp/src/main.cpp b/cdlib/algorithms/internal/highway_cpp/src/main.cpp new file mode 100644 index 0000000..a4b8167 --- /dev/null +++ b/cdlib/algorithms/internal/highway_cpp/src/main.cpp @@ -0,0 +1,430 @@ +#include "anchors.hpp" +#include "config.hpp" +#include "csr.hpp" +#include "graph_io.hpp" +#include "highway.hpp" +#include "propagation.hpp" +#include "refinement.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline double now_sec() { + using clock = std::chrono::high_resolution_clock; + return std::chrono::duration(clock::now().time_since_epoch()).count(); +} + +static highway::TopRState make_init_state( + int n, + int r, + const std::vector& anchors) { + highway::TopRState st; + st.n = n; + st.r = r; + st.k = static_cast(anchors.size()); + st.idx.assign(n * r, -1); + st.val.assign(n * r, 0.0f); + + for (int cid = 0; cid < static_cast(anchors.size()); ++cid) { + int v = anchors[cid]; + if (v < 0 || v >= n) continue; + st.idx[v * r + 0] = cid; + st.val[v * r + 0] = 1.0f; + } + + return st; +} + +static std::vector> decode_state_labels( + const highway::TopRState& st, + float theta, + int max_memberships) { + const int keep_max = std::max(1, max_memberships); + std::vector> out(st.n); + + for (int v = 0; v < st.n; ++v) { + std::vector> items; + items.reserve(st.r); + + for (int j = 0; j < st.r; ++j) { + int cid = st.idx[v * st.r + j]; + float pv = st.val[v * st.r + j]; + + if (cid >= 0 && pv > 0.0f) { + items.emplace_back(cid, pv); + } + } + + std::sort(items.begin(), items.end(), [](const auto& a, const auto& b) { + if (a.second != b.second) return a.second > b.second; + return a.first < b.first; + }); + + for (const auto& kv : items) { + if (static_cast(out[v].size()) >= keep_max) break; + if (kv.second >= theta) out[v].push_back(kv.first); + } + + if (out[v].empty() && !items.empty()) { + out[v].push_back(items.front().first); + } + } + + return out; +} + +static long long count_undirected_edges_from_csr(const highway::CSR& csr) { + long long m = 0; + + for (int u = 0; u < csr.n; ++u) { + int s0 = csr.indptr[u]; + int s1 = csr.indptr[u + 1]; + + for (int p = s0; p < s1; ++p) { + int v = csr.indices[p]; + if (u < v) ++m; + } + } + + return m; +} + +static long long count_total_memberships( + const std::vector>& node_memberships) { + long long total = 0; + + for (const auto& labs : node_memberships) { + total += static_cast(labs.size()); + } + + return total; +} + +static std::vector> build_communities_from_node_memberships( + const std::vector>& node_memberships) { + std::map> comm_to_nodes; + + for (int v = 0; v < static_cast(node_memberships.size()); ++v) { + for (int lab : node_memberships[v]) { + if (lab >= 0) { + comm_to_nodes[lab].push_back(v); + } + } + } + + std::vector> communities; + communities.reserve(comm_to_nodes.size()); + + for (auto& kv : comm_to_nodes) { + if (!kv.second.empty()) { + communities.push_back(std::move(kv.second)); + } + } + + return communities; +} + +static void write_communities_json( + const std::string& path, + const std::vector& inv_map, + const std::vector>& communities_contig) { + std::ofstream fout(path); + + if (!fout) { + throw std::runtime_error("Cannot write communities file: " + path); + } + + fout << "[\n"; + + for (int ci = 0; ci < static_cast(communities_contig.size()); ++ci) { + fout << " ["; + + const auto& comm = communities_contig[ci]; + bool first = true; + + for (int i = 0; i < static_cast(comm.size()); ++i) { + int v = comm[i]; + + if (v < 0 || v >= static_cast(inv_map.size())) { + continue; + } + + if (!first) { + fout << ", "; + } + + fout << inv_map[v]; + first = false; + } + + fout << "]"; + + if (ci + 1 < static_cast(communities_contig.size())) { + fout << ","; + } + + fout << "\n"; + } + + fout << "]\n"; +} + +int main(int argc, char** argv) { + std::string input; + + highway::HighwayBuildConfig hcfg; + int max_anchors = -1; + + highway::PropConfig pcfg; + pcfg.top_r = 3; + pcfg.T = 10; + pcfg.damping = 0.9; + pcfg.eta_leak = 0.0; + pcfg.tau = 0.85; + + highway::LocalRefineConfig lrcfg; + + float decode_theta = 0.30f; + int max_memberships = 3; + + for (int i = 1; i < argc; ++i) { + std::string a = argv[i]; + + auto need = [&](const std::string& key) { + if (i + 1 >= argc) { + throw std::runtime_error("Missing value for " + key); + } + return std::string(argv[++i]); + }; + + if (a == "--input") { + input = need(a); + } + + else if (a == "--highway_top_r") { + hcfg.top_r = std::stoi(need(a)); + } + else if (a == "--ensure_min1") { + hcfg.ensure_min1_per_node = (need(a) == "1"); + } + else if (a == "--symmetrize") { + hcfg.symmetrize = (need(a) == "1"); + } + else if (a == "--mod_jaccard_alpha") { + hcfg.mod_jaccard_alpha = std::stod(need(a)); + } + + else if (a == "--max_anchors") { + max_anchors = std::stoi(need(a)); + } + + else if (a == "--prop_top_r") { + pcfg.top_r = std::stoi(need(a)); + } + else if (a == "--prop_T") { + pcfg.T = std::stoi(need(a)); + } + else if (a == "--prop_damping") { + pcfg.damping = std::stod(need(a)); + } + else if (a == "--prop_eta_leak") { + pcfg.eta_leak = std::stod(need(a)); + } + else if (a == "--prop_tau") { + pcfg.tau = std::stod(need(a)); + } + + else if (a == "--local_enable_pattern_refinement") { + lrcfg.enable_pattern_refinement = (need(a) == "1"); + } + else if (a == "--local_confidence_self_fraction_weight") { + lrcfg.confidence_self_fraction_weight = std::stof(need(a)); + } + else if (a == "--local_confidence_low_entropy_weight") { + lrcfg.confidence_low_entropy_weight = std::stof(need(a)); + } + else if (a == "--local_pattern_confidence_floor") { + lrcfg.pattern_confidence_floor = std::stof(need(a)); + } + else if (a == "--local_pattern_confidence_ceiling") { + lrcfg.pattern_confidence_ceiling = std::stof(need(a)); + } + else if (a == "--local_update_strength") { + lrcfg.update_strength = std::stof(need(a)); + } + else if (a == "--local_node_mode_power") { + lrcfg.node_mode_power = std::stof(need(a)); + } + else if (a == "--local_pattern_target_mix") { + lrcfg.pattern_target_mix = std::stof(need(a)); + } + else if (a == "--local_target_sharpen_gamma") { + lrcfg.target_sharpen_gamma = std::stof(need(a)); + } + else if (a == "--local_min_abs_mass_to_keep") { + lrcfg.min_abs_mass_to_keep = std::stof(need(a)); + } + else if (a == "--local_renormalize") { + lrcfg.renormalize = (need(a) == "1"); + } + + else if (a == "--decode_theta") { + decode_theta = std::stof(need(a)); + } + else if (a == "--max_memberships") { + max_memberships = std::stoi(need(a)); + } + + else if (a == "--help") { + std::cout + << "Usage: highway_algorithm_only --input graph.edgelist\n" + << "\n" + << "Core Highway parameters:\n" + << " --highway_top_r 3\n" + << " --ensure_min1 1\n" + << " --symmetrize 1\n" + << " --mod_jaccard_alpha 0.70\n" + << " --max_anchors \n" + << "\n" + << "Propagation parameters:\n" + << " --prop_top_r 3\n" + << " --prop_T 10\n" + << " --prop_damping 0.9\n" + << " --prop_eta_leak 0\n" + << " --prop_tau 0.85\n" + << "\n" + << "Anchor-preserving pattern decoding parameters:\n" + << " --local_enable_pattern_refinement 1\n" + << " --local_confidence_self_fraction_weight 0.85\n" + << " --local_confidence_low_entropy_weight 0.15\n" + << " --local_pattern_confidence_floor 0.05\n" + << " --local_pattern_confidence_ceiling 1.00\n" + << " --local_update_strength 0.50\n" + << " --local_node_mode_power 1.50\n" + << " --local_pattern_target_mix 0.75\n" + << " --local_target_sharpen_gamma 1.20\n" + << " --local_min_abs_mass_to_keep 1e-8\n" + << " --local_renormalize 1\n" + << "\n" + << "Final decode parameters:\n" + << " --decode_theta 0.30\n" + << " --max_memberships 3\n"; + return 0; + } + + else { + throw std::runtime_error("Unknown argument: " + a); + } + } + + if (input.empty()) { + std::cerr << "Error: --input is required. Use --help.\n"; + return 1; + } + + std::unordered_map timers; + const double t_global0 = now_sec(); + + try { + double t0 = now_sec(); + highway::EdgeList el = + highway::read_edgelist_to_undirected_two_directed_contig(input); + timers["Read+Relabel"] = now_sec() - t0; + + const int n = el.n; + if (n <= 0) { + std::cerr << "Empty graph.\n"; + return 0; + } + + t0 = now_sec(); + highway::CSR full = highway::build_csr_from_edges(n, el.src, el.dst); + timers["Full_CSR"] = now_sec() - t0; + + t0 = now_sec(); + std::vector hsrc; + std::vector hdst; + highway::build_highway_edges(full, hcfg, hsrc, hdst); + highway::CSR hcsr = highway::build_csr_from_edges(n, hsrc, hdst); + timers["Highway_build"] = now_sec() - t0; + + t0 = now_sec(); + if (max_anchors < 0) { + max_anchors = std::max(8, std::min(30, n / 5)); + } + std::vector anchors = + highway::select_anchors_greedy_dedup(full, max_anchors); + timers["Anchor_select"] = now_sec() - t0; + + t0 = now_sec(); + highway::TopRState st_init = + make_init_state(n, std::max(1, pcfg.top_r), anchors); + timers["State_init"] = now_sec() - t0; + + t0 = now_sec(); + highway::TopRState st = + highway::propagate_other_assignment_topr_cpu(hcsr, full, anchors, pcfg); + timers["Propagation"] = now_sec() - t0; + + t0 = now_sec(); + highway::TopRState st_ref = + highway::refine_uncertain_nodes_set_search_cpu(st, full, hcsr, lrcfg); + timers["Anchor_preserving_pattern_decoding"] = now_sec() - t0; + + t0 = now_sec(); + auto node_memberships_contig = + decode_state_labels(st_ref, decode_theta, max_memberships); + timers["Decode"] = now_sec() - t0; + + t0 = now_sec(); + auto communities_contig = + build_communities_from_node_memberships(node_memberships_contig); + write_communities_json("communities.json", el.inv_map, communities_contig); + timers["WriteCommunitiesJson"] = now_sec() - t0; + + timers["TOTAL"] = now_sec() - t_global0; + + const long long m_undirected = count_undirected_edges_from_csr(full); + const long long m_backbone_undirected = count_undirected_edges_from_csr(hcsr); + const long long total_memberships = + count_total_memberships(node_memberships_contig); + + std::cout << "Summary:\n"; + std::cout << " n=" << n << "\n"; + std::cout << " m_undirected=" << m_undirected << "\n"; + std::cout << " m_backbone_undirected=" << m_backbone_undirected << "\n"; + std::cout << " backbone_ratio=" + << std::fixed << std::setprecision(6) + << static_cast(m_backbone_undirected) / + std::max(1.0, static_cast(m_undirected)) + << "\n"; + std::cout << " anchors=" << anchors.size() << "\n"; + std::cout << " total_memberships=" << total_memberships << "\n"; + std::cout << " avg_memberships_per_node=" + << std::fixed << std::setprecision(6) + << static_cast(total_memberships) / + std::max(1.0, static_cast(n)) + << "\n"; + + std::cout << "Timers:\n"; + for (const auto& kv : timers) { + std::cout << " " << kv.first << " = " + << std::fixed << std::setprecision(6) + << kv.second << "s\n"; + } + } catch (const std::exception& e) { + std::cerr << "Fatal: " << e.what() << "\n"; + return 1; + } + + return 0; +} \ No newline at end of file diff --git a/cdlib/algorithms/internal/highway_cpp/src/propagation.cpp b/cdlib/algorithms/internal/highway_cpp/src/propagation.cpp new file mode 100644 index 0000000..0ad0884 --- /dev/null +++ b/cdlib/algorithms/internal/highway_cpp/src/propagation.cpp @@ -0,0 +1,217 @@ +#include "propagation.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace highway { + +static inline int at(const std::vector& a, int n, int r, int v, int j) { + return a[v * r + j]; +} +static inline float atf(const std::vector& a, int n, int r, int v, int j) { + return a[v * r + j]; +} +static inline void set(std::vector& a, int n, int r, int v, int j, int x) { + a[v * r + j] = x; +} +static inline void setf(std::vector& a, int n, int r, int v, int j, float x) { + a[v * r + j] = x; +} + +TopRState propagate_other_assignment_topr_cpu( + const CSR& highway_csr, + const CSR& full_csr, + const std::vector& anchors, + const PropConfig& cfg) { + + TopRState st; + const int n = highway_csr.n; + const int r = std::max(1, cfg.top_r); + const int k = (int)anchors.size(); + st.n = n; st.r = r; st.k = k; + + st.idx.assign(n * r, -1); + st.val.assign(n * r, 0.0f); + + if (n <= 0 || k <= 0) return st; + + // init anchors: community ids aligned with anchors list + for (int cid = 0; cid < k; ++cid) { + int v = anchors[cid]; + if (v < 0 || v >= n) continue; + set(st.idx, n, r, v, 0, cid); + setf(st.val, n, r, v, 0, 1.0f); + } + + // degrees + std::vector deg_h_i = degrees_from_csr(highway_csr); + std::vector deg_f_i = degrees_from_csr(full_csr); + std::vector deg_h(n, 1.0), deg_f(n, 1.0); + for (int i = 0; i < n; ++i) { + deg_h[i] = std::max(1, deg_h_i[i]); + deg_f[i] = std::max(1, deg_f_i[i]); + } + + const int T = std::max(0, cfg.T); + const double damping = cfg.damping; + const double eta_leak = cfg.eta_leak; + const float tau = (float)cfg.tau; + const float eps = (float)cfg.eps; + + std::vector idx_new(n * r, -1); + std::vector val_new(n * r, 0.0f); + + std::vector acc_k; acc_k.reserve(k); + + for (int it = 0; it < T; ++it) { + std::fill(idx_new.begin(), idx_new.end(), -1); + std::fill(val_new.begin(), val_new.end(), 0.0f); + + // ---- per node update ---- + #if defined(HIGHWAY_USE_OPENMP) + #pragma omp parallel + #endif + { + std::vector acc_local; + acc_local.resize(k, 0.0f); + + #if defined(HIGHWAY_USE_OPENMP) + #pragma omp for schedule(dynamic, 128) + #endif + for (int v = 0; v < n; ++v) { + std::fill(acc_local.begin(), acc_local.end(), 0.0f); + + // highway neighbors + int s0 = highway_csr.indptr[v]; + int s1 = highway_csr.indptr[v + 1]; + if (s1 > s0) { + for (int p = s0; p < s1; ++p) { + int u = highway_csr.indices[p]; + if (u < 0 || u >= n) continue; + double w = 1.0 / std::sqrt(deg_h[u] * deg_h[v]); + + for (int j = 0; j < r; ++j) { + int cid = at(st.idx, n, r, u, j); + float pv = atf(st.val, n, r, u, j); + if (cid >= 0 && pv > 0.0f) { + acc_local[cid] += (float)(pv * w); + } + } + } + } + + // optional leak from full graph + if (eta_leak > 0.0) { + int f0 = full_csr.indptr[v]; + int f1 = full_csr.indptr[v + 1]; + if (f1 > f0) { + for (int p = f0; p < f1; ++p) { + int u = full_csr.indices[p]; + if (u < 0 || u >= n) continue; + double w = eta_leak / std::sqrt(deg_f[u] * deg_f[v]); + for (int j = 0; j < r; ++j) { + int cid = at(st.idx, n, r, u, j); + float pv = atf(st.val, n, r, u, j); + if (cid >= 0 && pv > 0.0f) { + acc_local[cid] += (float)(pv * w); + } + } + } + } + } + + // fallback copy + bool any_pos = false; + for (int c = 0; c < k; ++c) { + if (acc_local[c] > 0.0f) { any_pos = true; break; } + } + if (!any_pos) { + for (int j = 0; j < r; ++j) { + set(idx_new, n, r, v, j, at(st.idx, n, r, v, j)); + setf(val_new, n, r, v, j, atf(st.val, n, r, v, j)); + } + continue; + } + + // top-r + softmax + std::vector topi; + std::vector topv; + topk_indices_values(acc_local, std::min(r, k), topi, topv); + + stable_softmax_inplace(topv, tau, eps); + + for (int j = 0; j < r; ++j) { + set(idx_new, n, r, v, j, -1); + setf(val_new, n, r, v, j, 0.0f); + } + int rr = (int)topi.size(); + for (int j = 0; j < rr; ++j) { + set(idx_new, n, r, v, j, topi[j]); + setf(val_new, n, r, v, j, topv[j]); + } + } + } + + // ---- damping mix (same as python: scatter into dense acc_k then top-r+softmax) ---- + if (damping < 1.0) { + const float d = (float)damping; + const float one_minus = 1.0f - d; + + #if defined(HIGHWAY_USE_OPENMP) + #pragma omp parallel + #endif + { + std::vector acc_local; + acc_local.resize(k, 0.0f); + + #if defined(HIGHWAY_USE_OPENMP) + #pragma omp for schedule(dynamic, 128) + #endif + for (int v = 0; v < n; ++v) { + float sum_new = 0.0f; + for (int j = 0; j < r; ++j) sum_new += atf(val_new, n, r, v, j); + if (sum_new <= 0.0f) continue; + + std::fill(acc_local.begin(), acc_local.end(), 0.0f); + + // add new + for (int j = 0; j < r; ++j) { + int cid = at(idx_new, n, r, v, j); + float pv = atf(val_new, n, r, v, j); + if (cid >= 0 && pv > 0.0f) acc_local[cid] += d * pv; + } + // add old + for (int j = 0; j < r; ++j) { + int cid = at(st.idx, n, r, v, j); + float pv = atf(st.val, n, r, v, j); + if (cid >= 0 && pv > 0.0f) acc_local[cid] += one_minus * pv; + } + + std::vector topi; + std::vector topv; + topk_indices_values(acc_local, std::min(r, k), topi, topv); + stable_softmax_inplace(topv, tau, eps); + + for (int j = 0; j < r; ++j) { + set(idx_new, n, r, v, j, -1); + setf(val_new, n, r, v, j, 0.0f); + } + int rr = (int)topi.size(); + for (int j = 0; j < rr; ++j) { + set(idx_new, n, r, v, j, topi[j]); + setf(val_new, n, r, v, j, topv[j]); + } + } + } + } + + // swap into state + st.idx.swap(idx_new); + st.val.swap(val_new); + } + + return st; +} + +} // namespace highway \ No newline at end of file diff --git a/cdlib/algorithms/internal/highway_cpp/src/refinement.cpp b/cdlib/algorithms/internal/highway_cpp/src/refinement.cpp new file mode 100644 index 0000000..0bc94c9 --- /dev/null +++ b/cdlib/algorithms/internal/highway_cpp/src/refinement.cpp @@ -0,0 +1,427 @@ +#include "refinement.hpp" + +#include +#include +#include +#include +#include + +namespace highway { + +static inline int idx_at(const TopRState& s, int v, int j) { + return s.idx[v * s.r + j]; +} + +static inline float val_at(const TopRState& s, int v, int j) { + return s.val[v * s.r + j]; +} + +static inline float clamp01(float x) { + return std::max(0.0f, std::min(1.0f, x)); +} + +static inline float clamp_range(float x, float lo, float hi) { + if (lo > hi) std::swap(lo, hi); + return std::max(lo, std::min(hi, x)); +} + +struct PatternKey { + std::vector ids; + bool operator==(const PatternKey& o) const { return ids == o.ids; } +}; + +struct PatternHash { + std::size_t operator()(PatternKey const& k) const noexcept { + std::size_t h = 1469598103934665603ULL; + for (int x : k.ids) { + h ^= static_cast(x) + + 0x9e3779b97f4a7c15ULL + + (h << 6) + + (h >> 2); + } + return h; + } +}; + +struct PatternInfo { + PatternKey key; + std::vector nodes; + + int internal_edges = 0; + int external_edges = 0; + std::unordered_map ext_counts; + + float self_fraction = 0.0f; + float neighbor_entropy = 0.0f; + float confidence = 0.0f; + + // Anchor-indexed target distribution, not pattern-id distribution. + std::unordered_map target; +}; + +static PatternKey support_key_of_node(const TopRState& st, int v) { + PatternKey key; + key.ids.reserve(st.r); + + for (int j = 0; j < st.r; ++j) { + const int cid = idx_at(st, v, j); + const float pv = val_at(st, v, j); + if (cid >= 0 && pv > 0.0f) key.ids.push_back(cid); + } + + std::sort(key.ids.begin(), key.ids.end()); + key.ids.erase(std::unique(key.ids.begin(), key.ids.end()), key.ids.end()); + return key; +} + +static float normalized_entropy_counts(const std::unordered_map& counts) { + double total = 0.0; + int k = 0; + + for (const auto& kv : counts) { + if (kv.second > 0) { + total += kv.second; + ++k; + } + } + + if (total <= 0.0 || k <= 1) return 0.0f; + + double h = 0.0; + for (const auto& kv : counts) { + if (kv.second <= 0) continue; + const double p = static_cast(kv.second) / total; + h -= p * std::log(p + 1e-12); + } + + return static_cast(h / std::log(static_cast(k))); +} + +static void normalize_distribution(std::unordered_map& dist) { + double sum = 0.0; + for (const auto& kv : dist) { + if (kv.second > 0.0f) sum += kv.second; + } + + if (sum <= 1e-20) return; + + for (auto& kv : dist) { + if (kv.second > 0.0f) kv.second = static_cast(kv.second / sum); + else kv.second = 0.0f; + } +} + +static void sharpen_distribution(std::unordered_map& dist, float gamma) { + gamma = std::max(1.0f, gamma); + + if (std::abs(gamma - 1.0f) <= 1e-6f) { + normalize_distribution(dist); + return; + } + + for (auto& kv : dist) { + kv.second = std::pow(std::max(0.0f, kv.second), gamma); + } + normalize_distribution(dist); +} + +static float compute_pattern_confidence( + const PatternInfo& p, + const LocalRefineConfig& cfg) { + + const float w_self = std::max(0.0f, cfg.confidence_self_fraction_weight); + const float w_entropy = std::max(0.0f, cfg.confidence_low_entropy_weight); + const float w_sum = w_self + w_entropy; + + float score = 0.0f; + if (w_sum > 1e-20f) { + score = + w_self * clamp01(p.self_fraction) + + w_entropy * clamp01(1.0f - p.neighbor_entropy); + score /= w_sum; + } + + const float lo = clamp01(cfg.pattern_confidence_floor); + const float hi = clamp01(cfg.pattern_confidence_ceiling); + return clamp_range(score, lo, hi); +} + +static float same_pattern_neighbor_ratio( + const CSR& csr, + const std::vector& node_pid, + int v) { + + if (v < 0 || v >= csr.n || v >= static_cast(node_pid.size())) return 0.0f; + + const int pid = node_pid[v]; + if (pid < 0) return 0.0f; + + int deg = 0; + int same = 0; + + const int s0 = csr.indptr[v]; + const int s1 = csr.indptr[v + 1]; + + for (int p = s0; p < s1; ++p) { + const int u = csr.indices[p]; + if (u < 0 || u >= static_cast(node_pid.size())) continue; + + const int qid = node_pid[u]; + if (qid < 0) continue; + + ++deg; + if (qid == pid) ++same; + } + + if (deg <= 0) return 0.0f; + return static_cast(same) / static_cast(deg); +} + +static std::unordered_map node_neighbor_anchor_consensus( + const TopRState& st, + const CSR& csr, + int v) { + + std::unordered_map dist; + if (v < 0 || v >= csr.n || v >= st.n) return dist; + + const int s0 = csr.indptr[v]; + const int s1 = csr.indptr[v + 1]; + + for (int p = s0; p < s1; ++p) { + const int u = csr.indices[p]; + if (u < 0 || u >= st.n) continue; + + for (int j = 0; j < st.r; ++j) { + const int cid = idx_at(st, u, j); + const float pv = val_at(st, u, j); + if (cid >= 0 && pv > 0.0f) dist[cid] += pv; + } + } + + normalize_distribution(dist); + return dist; +} + +static void write_topr_from_distribution( + TopRState& out, + int v, + const std::unordered_map& dist, + float min_abs_mass_to_keep, + bool renormalize) { + + std::vector> items; + items.reserve(dist.size()); + + for (const auto& kv : dist) { + if (kv.first >= 0 && kv.second > min_abs_mass_to_keep) { + items.push_back(kv); + } + } + + if (items.empty()) return; + + std::sort(items.begin(), items.end(), [](const auto& a, const auto& b) { + if (a.second != b.second) return a.second > b.second; + return a.first < b.first; + }); + + const int keep = std::min(out.r, static_cast(items.size())); + + for (int j = 0; j < out.r; ++j) { + out.idx[v * out.r + j] = -1; + out.val[v * out.r + j] = 0.0f; + } + + float kept_sum = 0.0f; + for (int j = 0; j < keep; ++j) { + out.idx[v * out.r + j] = items[j].first; + out.val[v * out.r + j] = items[j].second; + kept_sum += items[j].second; + } + + if (renormalize && kept_sum > 1e-20f) { + for (int j = 0; j < keep; ++j) { + out.val[v * out.r + j] /= kept_sum; + } + } +} + +TopRState refine_uncertain_nodes_set_search_cpu( + const TopRState& in, + const CSR& full_csr, + const CSR& backbone_csr, + const LocalRefineConfig& cfg) { + + if (!cfg.enable_pattern_refinement) return in; + + const int n = in.n; + const int r = in.r; + if (n <= 0 || r <= 0) return in; + + // ------------------------------------------------------------ + // 1. Extract support-set patterns from propagated memberships. + // ------------------------------------------------------------ + std::unordered_map key_to_pid; + key_to_pid.reserve(static_cast(n) * 2U); + + std::vector patterns; + std::vector node_pid(n, -1); + + for (int v = 0; v < n; ++v) { + PatternKey key = support_key_of_node(in, v); + + auto it = key_to_pid.find(key); + int pid = -1; + + if (it == key_to_pid.end()) { + pid = static_cast(patterns.size()); + key_to_pid.emplace(key, pid); + + PatternInfo info; + info.key = std::move(key); + patterns.push_back(std::move(info)); + } else { + pid = it->second; + } + + node_pid[v] = pid; + patterns[pid].nodes.push_back(v); + } + + if (patterns.empty()) return in; + + // ------------------------------------------------------------ + // 2. Estimate pattern structural confidence on the full graph. + // ------------------------------------------------------------ + for (int u = 0; u < full_csr.n && u < n; ++u) { + const int pu = node_pid[u]; + if (pu < 0) continue; + + const int s0 = full_csr.indptr[u]; + const int s1 = full_csr.indptr[u + 1]; + + for (int p = s0; p < s1; ++p) { + const int v = full_csr.indices[p]; + if (v <= u || v < 0 || v >= n) continue; // undirected once + + const int pv = node_pid[v]; + if (pv < 0) continue; + + if (pu == pv) { + patterns[pu].internal_edges += 1; + } else { + patterns[pu].external_edges += 1; + patterns[pv].external_edges += 1; + patterns[pu].ext_counts[pv] += 1; + patterns[pv].ext_counts[pu] += 1; + } + } + } + + for (auto& p : patterns) { + const int incident_units = 2 * p.internal_edges + p.external_edges; + if (incident_units > 0) { + p.self_fraction = static_cast(2 * p.internal_edges) / + static_cast(incident_units); + } + + p.neighbor_entropy = normalized_entropy_counts(p.ext_counts); + p.confidence = compute_pattern_confidence(p, cfg); + } + + // ------------------------------------------------------------ + // 3. Build anchor-space pattern targets by averaging propagated memberships. + // ------------------------------------------------------------ + for (auto& p : patterns) { + for (int v : p.nodes) { + for (int j = 0; j < r; ++j) { + const int cid = idx_at(in, v, j); + const float pv = val_at(in, v, j); + if (cid >= 0 && pv > 0.0f) p.target[cid] += pv; + } + } + + normalize_distribution(p.target); + sharpen_distribution(p.target, cfg.target_sharpen_gamma); + } + + // ------------------------------------------------------------ + // 4. Anchor-preserving soft decoding. + // + // Pattern is only a calibration unit. The output remains in the original + // anchor community space: + // q_v = mix * q_pattern + (1 - mix) * q_neighbor + // lambda_v = update_strength * confidence(P_v) * same_pattern_ratio(v)^gamma + // alpha_new = (1 - lambda_v) * alpha_old + lambda_v * q_v + // ------------------------------------------------------------ + TopRState out = in; + out.k = in.k; + + const CSR& local_csr = (backbone_csr.n == n ? backbone_csr : full_csr); + const float mix = clamp01(cfg.pattern_target_mix); + const float update_strength = clamp01(cfg.update_strength); + const float mode_power = std::max(0.0f, cfg.node_mode_power); + + for (int v = 0; v < n; ++v) { + const int pid = node_pid[v]; + if (pid < 0 || pid >= static_cast(patterns.size())) continue; + + const PatternInfo& p = patterns[pid]; + if (p.target.empty()) continue; + + std::unordered_map q; + + for (const auto& kv : p.target) { + if (kv.first >= 0 && kv.second > 0.0f) q[kv.first] += mix * kv.second; + } + + std::unordered_map q_neighbor = + node_neighbor_anchor_consensus(in, local_csr, v); + + for (const auto& kv : q_neighbor) { + if (kv.first >= 0 && kv.second > 0.0f) { + q[kv.first] += (1.0f - mix) * kv.second; + } + } + + normalize_distribution(q); + if (q.empty()) continue; + + const float same_ratio = same_pattern_neighbor_ratio(local_csr, node_pid, v); + const float node_factor = std::pow(clamp01(same_ratio), mode_power); + const float lambda_v = clamp01(update_strength * clamp01(p.confidence) * node_factor); + + if (lambda_v <= 1e-7f) continue; + + std::unordered_map blended; + + for (int j = 0; j < r; ++j) { + const int cid = idx_at(in, v, j); + const float pv = val_at(in, v, j); + if (cid >= 0 && pv > 0.0f) { + blended[cid] += (1.0f - lambda_v) * pv; + } + } + + for (const auto& kv : q) { + if (kv.first >= 0 && kv.second > 0.0f) { + blended[kv.first] += lambda_v * kv.second; + } + } + + normalize_distribution(blended); + sharpen_distribution(blended, cfg.target_sharpen_gamma); + + write_topr_from_distribution( + out, + v, + blended, + cfg.min_abs_mass_to_keep, + cfg.renormalize); + } + + return out; +} + +} // namespace highway diff --git a/cdlib/algorithms/overlapping_partition.py b/cdlib/algorithms/overlapping_partition.py index f9b9cd0..6ffc9e7 100644 --- a/cdlib/algorithms/overlapping_partition.py +++ b/cdlib/algorithms/overlapping_partition.py @@ -2103,6 +2103,7 @@ def ebgc( clustering, g_original, "ebgc", method_parameters={}, overlap=True ) + def highway( g_original: object, highway_top_r: int = 3, @@ -2116,18 +2117,27 @@ def highway( prop_eta_leak: float = 0.0, prop_tau: float = 0.85, enable_pattern_refinement: bool = True, - local_update_strength: float = 0.45, - local_pattern_target_mix: float = 0.65, - local_node_mode_power: float = 2.00, + local_confidence_self_fraction_weight: float = 0.85, + local_confidence_low_entropy_weight: float = 0.15, + local_pattern_confidence_floor: float = 0.05, + local_pattern_confidence_ceiling: float = 1.00, + local_update_strength: float = 0.50, + local_node_mode_power: float = 1.50, + local_pattern_target_mix: float = 0.75, + local_target_sharpen_gamma: float = 1.20, + local_min_abs_mass_to_keep: float = 1e-8, + local_renormalize: bool = True, decode_theta: float = 0.30, max_memberships: int = 3, min_community_size: int = 1, + deduplicate_communities: bool = True, + cpp_binary: str = None, ) -> NodeClustering: """ - Highway is a scalable overlapping community detection algorithm for - networked data. It builds a sparse structural backbone, selects anchor - nodes, propagates anchor memberships, and decodes overlapping communities - from the propagated membership patterns. + Highway is an overlapping community detection algorithm based on sparse + structurally informative backbones and anchor-membership propagation. + + This CDlib wrapper calls the optimized C++ implementation of Highway. **Supported Graph Types** @@ -2137,25 +2147,34 @@ def highway( Yes No No ========== ======== ======== - :param g_original: a networkx/igraph object. - :param highway_top_r: number of retained backbone neighbors per node. - :param mod_jaccard_alpha: balance between modularity-inspired edge score and Jaccard edge score. - :param ensure_min1_per_node: whether to retain at least one backbone edge for non-isolated nodes. - :param symmetrize: whether to symmetrize the retained directed backbone edges. - :param max_anchors: maximum number of selected anchors. If None, uses max(8, min(30, n // 5)). - :param prop_top_r: number of memberships retained during propagation. - :param prop_T: number of propagation iterations. - :param prop_damping: damping factor used during propagation. - :param prop_eta_leak: optional leakage from the full graph during propagation. - :param prop_tau: softmax temperature used during propagation. - :param enable_pattern_refinement: whether to use anchor-preserving pattern refinement. - :param local_update_strength: strength of local pattern refinement. - :param local_pattern_target_mix: mixture weight between pattern target and neighbor consensus. - :param local_node_mode_power: exponent controlling node-level pattern confidence. - :param decode_theta: membership decoding threshold. - :param max_memberships: maximum memberships retained per node. - :param min_community_size: minimum size of returned communities. - :return: NodeClustering object. + :param g_original: a networkx/igraph object + :param highway_top_r: number of retained neighbors per node in the sparse backbone + :param mod_jaccard_alpha: mixing weight between modularity-based and Jaccard-based edge scores + :param ensure_min1_per_node: whether to keep at least one edge for each non-isolated node + :param symmetrize: whether to symmetrize the sparse backbone + :param max_anchors: maximum number of selected anchors + :param prop_top_r: number of retained anchor memberships per node + :param prop_T: number of propagation iterations + :param prop_damping: damping factor used in anchor-membership propagation + :param prop_eta_leak: optional leakage weight from the full graph + :param prop_tau: softmax temperature for propagation + :param enable_pattern_refinement: whether to enable anchor-preserving pattern decoding + :param local_confidence_self_fraction_weight: self-fraction weight in pattern confidence + :param local_confidence_low_entropy_weight: low-entropy weight in pattern confidence + :param local_pattern_confidence_floor: minimum pattern confidence + :param local_pattern_confidence_ceiling: maximum pattern confidence + :param local_update_strength: local decoding update strength + :param local_node_mode_power: local mode exponent + :param local_pattern_target_mix: pattern/local target mixing parameter + :param local_target_sharpen_gamma: target sharpening exponent + :param local_min_abs_mass_to_keep: minimum membership mass to keep + :param local_renormalize: whether to renormalize local refined memberships + :param decode_theta: threshold for decoding node memberships + :param max_memberships: maximum number of memberships retained per node + :param min_community_size: minimum size of returned communities + :param deduplicate_communities: whether to remove exact duplicate communities before returning + :param cpp_binary: optional custom path to the compiled Highway C++ executable + :return: NodeClustering object :Example: @@ -2163,12 +2182,16 @@ def highway( >>> import networkx as nx >>> G = nx.karate_club_graph() >>> coms = algorithms.highway(G) + + To preserve exact duplicate communities from the backend output: + + >>> coms = algorithms.highway(G, deduplicate_communities=False) """ g = convert_graph_formats(g_original, nx.Graph) coms = highway_nx( - g, + G=g, highway_top_r=highway_top_r, mod_jaccard_alpha=mod_jaccard_alpha, ensure_min1_per_node=ensure_min1_per_node, @@ -2180,12 +2203,21 @@ def highway( prop_eta_leak=prop_eta_leak, prop_tau=prop_tau, enable_pattern_refinement=enable_pattern_refinement, + local_confidence_self_fraction_weight=local_confidence_self_fraction_weight, + local_confidence_low_entropy_weight=local_confidence_low_entropy_weight, + local_pattern_confidence_floor=local_pattern_confidence_floor, + local_pattern_confidence_ceiling=local_pattern_confidence_ceiling, local_update_strength=local_update_strength, - local_pattern_target_mix=local_pattern_target_mix, local_node_mode_power=local_node_mode_power, + local_pattern_target_mix=local_pattern_target_mix, + local_target_sharpen_gamma=local_target_sharpen_gamma, + local_min_abs_mass_to_keep=local_min_abs_mass_to_keep, + local_renormalize=local_renormalize, decode_theta=decode_theta, max_memberships=max_memberships, min_community_size=min_community_size, + deduplicate_communities=deduplicate_communities, + cpp_binary=cpp_binary, ) return NodeClustering( @@ -2204,12 +2236,22 @@ def highway( "prop_eta_leak": prop_eta_leak, "prop_tau": prop_tau, "enable_pattern_refinement": enable_pattern_refinement, + "local_confidence_self_fraction_weight": local_confidence_self_fraction_weight, + "local_confidence_low_entropy_weight": local_confidence_low_entropy_weight, + "local_pattern_confidence_floor": local_pattern_confidence_floor, + "local_pattern_confidence_ceiling": local_pattern_confidence_ceiling, "local_update_strength": local_update_strength, - "local_pattern_target_mix": local_pattern_target_mix, "local_node_mode_power": local_node_mode_power, + "local_pattern_target_mix": local_pattern_target_mix, + "local_target_sharpen_gamma": local_target_sharpen_gamma, + "local_min_abs_mass_to_keep": local_min_abs_mass_to_keep, + "local_renormalize": local_renormalize, "decode_theta": decode_theta, "max_memberships": max_memberships, "min_community_size": min_community_size, + "deduplicate_communities": deduplicate_communities, + "cpp_binary": cpp_binary, + "backend": "cpp", }, overlap=True, ) \ No newline at end of file From 34538c1112295c348158aa2b14bfb378c6b0b759 Mon Sep 17 00:00:00 2001 From: AmbitiousK <1724260766@qq.com> Date: Fri, 22 May 2026 16:01:42 -0400 Subject: [PATCH 4/4] Add Highway C++ backend integration --- cdlib/algorithms/internal/Highway.py | 70 +++++++++++------------ cdlib/algorithms/overlapping_partition.py | 12 ++-- 2 files changed, 39 insertions(+), 43 deletions(-) diff --git a/cdlib/algorithms/internal/Highway.py b/cdlib/algorithms/internal/Highway.py index e583bb0..6a3c713 100644 --- a/cdlib/algorithms/internal/Highway.py +++ b/cdlib/algorithms/internal/Highway.py @@ -17,7 +17,7 @@ highway_nx(G, ...) -This module keeps the CDlib-facing API in Python, but the actual Highway +This module keeps the CDlib-facing API in Python, while the actual Highway algorithm is executed by the optimized C++ backend. """ @@ -116,16 +116,6 @@ def _remove_exact_duplicate_communities( Two communities are treated as duplicates if they contain the same node set, regardless of node order. - - Args: - communities: - List of communities. - deduplicate_communities: - If True, remove exact duplicate communities. - If False, return communities unchanged. - - Returns: - A list of communities, optionally deduplicated. """ if not deduplicate_communities: return communities @@ -170,10 +160,13 @@ def highway_nx( max_memberships: int = 3, min_community_size: int = 1, deduplicate_communities: bool = True, - cpp_binary: str | None = None, ) -> List[List[Node]]: """ - Run Highway using the optimized C++ executable. + Run Highway on a NetworkX graph. + + This function exposes the same Python-facing API as the pure-Python Highway + implementation. In this branch, the computation is delegated internally to + the compiled C++ executable. Args: G: @@ -197,9 +190,9 @@ def highway_nx( prop_eta_leak: Leakage parameter in propagation. prop_tau: - Propagation threshold or temperature parameter. + Softmax temperature for propagation. enable_pattern_refinement: - Whether to enable local pattern refinement. + Whether to enable anchor-preserving pattern refinement. local_confidence_self_fraction_weight: Weight for self-fraction confidence. local_confidence_low_entropy_weight: @@ -217,7 +210,7 @@ def highway_nx( local_target_sharpen_gamma: Sharpening parameter for local target. local_min_abs_mass_to_keep: - Minimum absolute mass to keep during refinement. + Minimum absolute membership mass to keep. local_renormalize: Whether to renormalize local memberships. decode_theta: @@ -228,9 +221,6 @@ def highway_nx( Minimum size of returned communities. deduplicate_communities: If True, remove exact duplicate communities before returning. - If False, preserve duplicate communities from the backend output. - cpp_binary: - Optional custom path to the Highway C++ binary. Returns: A list of overlapping communities. Each community is represented as a @@ -254,7 +244,7 @@ def highway_nx( deduplicate_communities=deduplicate_communities, ) - binary = Path(cpp_binary) if cpp_binary is not None else _default_highway_cpp_binary() + binary = _default_highway_cpp_binary() if not binary.exists(): raise FileNotFoundError( @@ -277,19 +267,26 @@ def highway_nx( cmd = [ str(binary), - "--input", str(input_path), - - "--highway_top_r", str(highway_top_r), - "--ensure_min1", "1" if ensure_min1_per_node else "0", - "--symmetrize", "1" if symmetrize else "0", - "--mod_jaccard_alpha", str(mod_jaccard_alpha), - - "--prop_top_r", str(prop_top_r), - "--prop_T", str(prop_T), - "--prop_damping", str(prop_damping), - "--prop_eta_leak", str(prop_eta_leak), - "--prop_tau", str(prop_tau), - + "--input", + str(input_path), + "--highway_top_r", + str(highway_top_r), + "--ensure_min1", + "1" if ensure_min1_per_node else "0", + "--symmetrize", + "1" if symmetrize else "0", + "--mod_jaccard_alpha", + str(mod_jaccard_alpha), + "--prop_top_r", + str(prop_top_r), + "--prop_T", + str(prop_T), + "--prop_damping", + str(prop_damping), + "--prop_eta_leak", + str(prop_eta_leak), + "--prop_tau", + str(prop_tau), "--local_enable_pattern_refinement", "1" if enable_pattern_refinement else "0", "--local_confidence_self_fraction_weight", @@ -312,9 +309,10 @@ def highway_nx( str(local_min_abs_mass_to_keep), "--local_renormalize", "1" if local_renormalize else "0", - - "--decode_theta", str(decode_theta), - "--max_memberships", str(max_memberships), + "--decode_theta", + str(decode_theta), + "--max_memberships", + str(max_memberships), ] if max_anchors is not None: diff --git a/cdlib/algorithms/overlapping_partition.py b/cdlib/algorithms/overlapping_partition.py index 6ffc9e7..b8736b0 100644 --- a/cdlib/algorithms/overlapping_partition.py +++ b/cdlib/algorithms/overlapping_partition.py @@ -2131,13 +2131,15 @@ def highway( max_memberships: int = 3, min_community_size: int = 1, deduplicate_communities: bool = True, - cpp_binary: str = None, ) -> NodeClustering: """ Highway is an overlapping community detection algorithm based on sparse structurally informative backbones and anchor-membership propagation. - This CDlib wrapper calls the optimized C++ implementation of Highway. + The algorithm first builds a sparse backbone that keeps structurally + informative edges, then selects representative anchor nodes, propagates + anchor-indexed memberships over the backbone, and decodes the resulting + memberships into overlapping communities. **Supported Graph Types** @@ -2173,7 +2175,6 @@ def highway( :param max_memberships: maximum number of memberships retained per node :param min_community_size: minimum size of returned communities :param deduplicate_communities: whether to remove exact duplicate communities before returning - :param cpp_binary: optional custom path to the compiled Highway C++ executable :return: NodeClustering object :Example: @@ -2183,7 +2184,7 @@ def highway( >>> G = nx.karate_club_graph() >>> coms = algorithms.highway(G) - To preserve exact duplicate communities from the backend output: + To preserve exact duplicate communities from the algorithm output: >>> coms = algorithms.highway(G, deduplicate_communities=False) """ @@ -2217,7 +2218,6 @@ def highway( max_memberships=max_memberships, min_community_size=min_community_size, deduplicate_communities=deduplicate_communities, - cpp_binary=cpp_binary, ) return NodeClustering( @@ -2250,8 +2250,6 @@ def highway( "max_memberships": max_memberships, "min_community_size": min_community_size, "deduplicate_communities": deduplicate_communities, - "cpp_binary": cpp_binary, - "backend": "cpp", }, overlap=True, ) \ No newline at end of file