diff --git a/autorca_core/__init__.py b/autorca_core/__init__.py index 75cc8d1..7d1cbc7 100644 --- a/autorca_core/__init__.py +++ b/autorca_core/__init__.py @@ -13,6 +13,7 @@ from autorca_core.model.graph import Service, Dependency, IncidentNode from autorca_core.reasoning.loop import run_rca, RCARunResult from autorca_core.logging import configure_logging, get_logger +from autorca_core.config import ThresholdConfig __all__ = [ "Event", @@ -26,4 +27,5 @@ "RCARunResult", "configure_logging", "get_logger", + "ThresholdConfig", ] diff --git a/autorca_core/config.py b/autorca_core/config.py new file mode 100644 index 0000000..54d6eb6 --- /dev/null +++ b/autorca_core/config.py @@ -0,0 +1,126 @@ +""" +Configuration module for AutoRCA-Core. + +Provides configurable thresholds and settings for anomaly detection and RCA analysis. +""" + +import os +from dataclasses import dataclass, field +from typing import Optional + + +@dataclass +class ThresholdConfig: + """ + Configurable thresholds for anomaly detection. + + These thresholds control when incidents are detected from observability data. + Different environments have different baselines, so these can be tuned accordingly. 
+ + Attributes: + # Error detection + error_spike_count: Minimum number of errors to consider a spike + error_spike_window_seconds: Time window for error spike detection + + # Latency detection + latency_spike_ms: Latency threshold in milliseconds + latency_spike_count: Minimum number of high-latency samples + + # Resource detection + resource_exhaustion_percent: Resource usage percentage threshold + resource_exhaustion_count: Minimum number of high-usage samples + + # Correlation windows + change_correlation_seconds: Time window for correlating changes with incidents + """ + + # Error detection + error_spike_count: int = 3 + error_spike_window_seconds: int = 300 # 5 minutes + + # Latency detection + latency_spike_ms: float = 1000.0 + latency_spike_count: int = 2 + + # Resource detection + resource_exhaustion_percent: float = 90.0 + resource_exhaustion_count: int = 2 + + # Correlation windows + change_correlation_seconds: int = 600 # 10 minutes + + @classmethod + def from_env(cls) -> "ThresholdConfig": + """ + Load thresholds from environment variables. 
+ + Environment variables: + AUTORCA_ERROR_SPIKE_COUNT: Error spike count threshold + AUTORCA_ERROR_SPIKE_WINDOW: Error spike window in seconds + AUTORCA_LATENCY_SPIKE_MS: Latency spike threshold in ms + AUTORCA_LATENCY_SPIKE_COUNT: Latency spike count threshold + AUTORCA_RESOURCE_EXHAUSTION_PERCENT: Resource exhaustion percentage + AUTORCA_RESOURCE_EXHAUSTION_COUNT: Resource exhaustion count threshold + AUTORCA_CHANGE_CORRELATION_SECONDS: Change correlation window in seconds + + Returns: + ThresholdConfig instance with values from environment + """ + return cls( + error_spike_count=int(os.getenv("AUTORCA_ERROR_SPIKE_COUNT", 3)), + error_spike_window_seconds=int(os.getenv("AUTORCA_ERROR_SPIKE_WINDOW", 300)), + latency_spike_ms=float(os.getenv("AUTORCA_LATENCY_SPIKE_MS", 1000.0)), + latency_spike_count=int(os.getenv("AUTORCA_LATENCY_SPIKE_COUNT", 2)), + resource_exhaustion_percent=float(os.getenv("AUTORCA_RESOURCE_EXHAUSTION_PERCENT", 90.0)), + resource_exhaustion_count=int(os.getenv("AUTORCA_RESOURCE_EXHAUSTION_COUNT", 2)), + change_correlation_seconds=int(os.getenv("AUTORCA_CHANGE_CORRELATION_SECONDS", 600)), + ) + + @classmethod + def strict(cls) -> "ThresholdConfig": + """ + Create a strict configuration (more sensitive to anomalies). + + Returns: + ThresholdConfig with strict thresholds + """ + return cls( + error_spike_count=2, + error_spike_window_seconds=180, # 3 minutes + latency_spike_ms=500.0, + latency_spike_count=1, + resource_exhaustion_percent=80.0, + resource_exhaustion_count=1, + change_correlation_seconds=900, # 15 minutes + ) + + @classmethod + def relaxed(cls) -> "ThresholdConfig": + """ + Create a relaxed configuration (less sensitive to anomalies). 
+ + Returns: + ThresholdConfig with relaxed thresholds + """ + return cls( + error_spike_count=5, + error_spike_window_seconds=600, # 10 minutes + latency_spike_ms=2000.0, + latency_spike_count=3, + resource_exhaustion_percent=95.0, + resource_exhaustion_count=3, + change_correlation_seconds=300, # 5 minutes + ) + + def to_dict(self) -> dict: + """Serialize to dictionary.""" + return { + "error_spike_count": self.error_spike_count, + "error_spike_window_seconds": self.error_spike_window_seconds, + "latency_spike_ms": self.latency_spike_ms, + "latency_spike_count": self.latency_spike_count, + "resource_exhaustion_percent": self.resource_exhaustion_percent, + "resource_exhaustion_count": self.resource_exhaustion_count, + "change_correlation_seconds": self.change_correlation_seconds, + } + diff --git a/autorca_core/graph_engine/builder.py b/autorca_core/graph_engine/builder.py index c4bdf6f..9cd922b 100644 --- a/autorca_core/graph_engine/builder.py +++ b/autorca_core/graph_engine/builder.py @@ -4,7 +4,7 @@ Builds a ServiceGraph from logs, metrics, traces, and config changes. 
""" -from typing import List, Dict, Set, Tuple +from typing import List, Dict, Set, Tuple, Optional from datetime import datetime, timedelta from collections import defaultdict @@ -17,6 +17,7 @@ IncidentNode, IncidentType, ) +from autorca_core.config import ThresholdConfig class GraphBuilder: @@ -29,9 +30,10 @@ class GraphBuilder: - Incident nodes (detected anomalies/errors) """ - def __init__(self): + def __init__(self, thresholds: Optional[ThresholdConfig] = None): self.graph = ServiceGraph() self._service_metadata: Dict[str, Dict] = defaultdict(dict) + self.thresholds = thresholds or ThresholdConfig() def add_logs(self, logs: List[LogEvent]) -> None: """ @@ -154,17 +156,17 @@ def _detect_error_spikes(self, error_logs: List[LogEvent]) -> None: for log in error_logs: by_service[log.service].append(log) - # Detect spikes (simple threshold: 3+ errors) + # Detect spikes using configurable thresholds for service, service_errors in by_service.items(): - if len(service_errors) >= 3: + if len(service_errors) >= self.thresholds.error_spike_count: # Sort by timestamp service_errors.sort(key=lambda e: e.timestamp) first_error = service_errors[0] last_error = service_errors[-1] - # If errors span less than 5 minutes, it's a spike + # If errors span less than threshold window, it's a spike time_span = (last_error.timestamp - first_error.timestamp).total_seconds() - if time_span <= 300: # 5 minutes + if time_span <= self.thresholds.error_spike_window_seconds: evidence = [f"Error: {e.message}" for e in service_errors[:5]] # Show first 5 self.graph.add_incident(IncidentNode( service=service, @@ -193,11 +195,11 @@ def _detect_metric_anomalies(self, service: str, metrics: List[MetricPoint]) -> # Sort by timestamp metric_points.sort(key=lambda m: m.timestamp) - # Simple threshold-based detection + # Simple threshold-based detection using configurable thresholds if 'latency' in metric_name.lower() or 'duration' in metric_name.lower(): - # Detect latency spike (value > 1000ms) - 
high_latency = [m for m in metric_points if m.value > 1000] - if len(high_latency) >= 2: + # Detect latency spike using configured threshold + high_latency = [m for m in metric_points if m.value > self.thresholds.latency_spike_ms] + if len(high_latency) >= self.thresholds.latency_spike_count: self.graph.add_incident(IncidentNode( service=service, incident_type=IncidentType.LATENCY_SPIKE, @@ -208,9 +210,9 @@ def _detect_metric_anomalies(self, service: str, metrics: List[MetricPoint]) -> )) elif 'cpu' in metric_name.lower() or 'memory' in metric_name.lower(): - # Detect resource exhaustion (value > 90%) - high_usage = [m for m in metric_points if m.value > 90] - if len(high_usage) >= 2: + # Detect resource exhaustion using configured threshold + high_usage = [m for m in metric_points if m.value > self.thresholds.resource_exhaustion_percent] + if len(high_usage) >= self.thresholds.resource_exhaustion_count: self.graph.add_incident(IncidentNode( service=service, incident_type=IncidentType.RESOURCE_EXHAUSTION, @@ -247,6 +249,7 @@ def build_service_graph( metrics: List[MetricPoint] = None, traces: List[Span] = None, configs: List[ConfigChange] = None, + thresholds: Optional[ThresholdConfig] = None, ) -> ServiceGraph: """ Convenience function to build a ServiceGraph from observability data. 
@@ -256,11 +259,12 @@ def build_service_graph( metrics: Metric data points traces: Trace spans configs: Config/deployment changes + thresholds: Optional threshold configuration for anomaly detection Returns: Constructed ServiceGraph """ - builder = GraphBuilder() + builder = GraphBuilder(thresholds=thresholds) if logs: builder.add_logs(logs) diff --git a/autorca_core/reasoning/loop.py b/autorca_core/reasoning/loop.py index 1336522..61bc231 100644 --- a/autorca_core/reasoning/loop.py +++ b/autorca_core/reasoning/loop.py @@ -17,6 +17,7 @@ from autorca_core.reasoning.rules import apply_rules, RootCauseCandidate from autorca_core.reasoning.llm import LLMInterface, DummyLLM from autorca_core.logging import get_logger +from autorca_core.config import ThresholdConfig logger = get_logger(__name__) @@ -75,6 +76,7 @@ def run_rca( primary_symptom: str, data_sources: DataSourcesConfig, llm: Optional[LLMInterface] = None, + thresholds: Optional[ThresholdConfig] = None, ) -> RCARunResult: """ Run root cause analysis. 
@@ -92,6 +94,7 @@
         primary_symptom: Description of the symptom (e.g., "Checkout API 500 errors")
         data_sources: Configuration for data sources
         llm: Optional LLM interface for enhanced analysis
+        thresholds: Optional threshold configuration for anomaly detection
 
     Returns:
         RCARunResult with root cause candidates and analysis
@@ -139,14 +142,14 @@
     logger.info(f"  Loaded {len(configs)} config changes")
 
     # Step 2: Build service graph
     logger.info("Building service graph...")
-    graph = build_service_graph(logs=logs, metrics=metrics, traces=traces, configs=configs)
+    graph = build_service_graph(logs=logs, metrics=metrics, traces=traces, configs=configs, thresholds=thresholds)
     logger.info(f"  Graph: {len(graph.services)} services, {len(graph.dependencies)} dependencies, {len(graph.incidents)} incidents")
 
     # Step 3: Run rule-based analysis
     logger.info("Applying RCA rules...")
-    candidates = apply_rules(graph)
+    candidates = apply_rules(graph, thresholds=thresholds)
     logger.info(f"  Identified {len(candidates)} root cause candidates")
 
     # Step 4: Generate summary using LLM
     logger.info("Generating RCA summary...")
@@ -197,6 +200,7 @@
     configs_path: Optional[str] = None,
     primary_symptom: str = "Unknown incident",
     window_minutes: int = 60,
+    thresholds: Optional[ThresholdConfig] = None,
 ) -> RCARunResult:
     """
     Convenience function to run RCA from file paths.
@@ -210,6 +225,7 @@ def run_rca_from_files( configs_path: Path to configs file/directory primary_symptom: Description of the symptom window_minutes: Size of the analysis window in minutes + thresholds: Optional threshold configuration for anomaly detection Returns: RCARunResult @@ -245,4 +261,4 @@ def run_rca_from_files( configs_dir=configs_path, ) - return run_rca((time_from, time_to), primary_symptom, sources) + return run_rca((time_from, time_to), primary_symptom, sources, thresholds=thresholds) diff --git a/autorca_core/reasoning/rules.py b/autorca_core/reasoning/rules.py index 713faf4..4d4763a 100644 --- a/autorca_core/reasoning/rules.py +++ b/autorca_core/reasoning/rules.py @@ -4,12 +4,13 @@ Simple, deterministic rules for identifying root causes without requiring an LLM. """ -from typing import List, Dict, Set +from typing import List, Dict, Set, Optional from dataclasses import dataclass from datetime import timedelta from autorca_core.model.graph import ServiceGraph, IncidentNode, IncidentType from autorca_core.graph_engine.queries import GraphQueries, CausalChain +from autorca_core.config import ThresholdConfig @dataclass @@ -44,7 +45,7 @@ def to_dict(self) -> Dict: } -def apply_rules(graph: ServiceGraph) -> List[RootCauseCandidate]: +def apply_rules(graph: ServiceGraph, thresholds: Optional[ThresholdConfig] = None) -> List[RootCauseCandidate]: """ Apply rule-based heuristics to identify root cause candidates. 
@@ -57,15 +58,19 @@ def apply_rules(graph: ServiceGraph) -> List[RootCauseCandidate]: Args: graph: ServiceGraph with incidents and dependencies + thresholds: Optional threshold configuration for correlation windows Returns: List of RootCauseCandidate objects, sorted by confidence (highest first) """ + if thresholds is None: + thresholds = ThresholdConfig() + candidates: List[RootCauseCandidate] = [] queries = GraphQueries(graph) # Rule 1: Recent deployments/config changes - candidates.extend(_rule_recent_changes(graph, queries)) + candidates.extend(_rule_recent_changes(graph, queries, thresholds)) # Rule 2: Resource exhaustion candidates.extend(_rule_resource_exhaustion(graph, queries)) @@ -85,7 +90,7 @@ def apply_rules(graph: ServiceGraph) -> List[RootCauseCandidate]: return candidates -def _rule_recent_changes(graph: ServiceGraph, queries: GraphQueries) -> List[RootCauseCandidate]: +def _rule_recent_changes(graph: ServiceGraph, queries: GraphQueries, thresholds: ThresholdConfig) -> List[RootCauseCandidate]: """ Rule: Services with recent deployments or config changes are strong root cause candidates. """ @@ -111,11 +116,11 @@ def _rule_recent_changes(graph: ServiceGraph, queries: GraphQueries) -> List[Roo if i.incident_type not in (IncidentType.DEPLOYMENT, IncidentType.CONFIG_CHANGE) ] - # Check if other incidents occurred shortly after the change + # Check if other incidents occurred shortly after the change using configurable threshold for change in change_incidents: nearby_incidents = [ i for i in other_incidents - if abs((i.timestamp - change.timestamp).total_seconds()) < 600 # Within 10 minutes + if abs((i.timestamp - change.timestamp).total_seconds()) < thresholds.change_correlation_seconds ] if nearby_incidents: