feat(alerts): Phase 13 — Operational Alert Engine

AlertRule + AlertInstance models (b041alerts migration), 8 pre-seeded system rules (high_risk x2, stale_technique, coverage_regression, low_coverage, expiry_wave, new_technique, orphan_spike), evaluation engine with per-rule cooldown, full alert lifecycle (acknowledge/resolve/dismiss), custom rule CRUD, and summary endpoint. Rules seeded at app startup. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 15:25:55 +02:00
parent d81fc04b8f
commit d4b147da7c
8 changed files with 1387 additions and 0 deletions
--- a/backend/app/services/operational_alert_service.py
+++ b/backend/app/services/operational_alert_service.py
@@ -0,0 +1,530 @@
+"""Phase 13: Operational Alert service — rule evaluation engine + CRUD."""
+
+from __future__ import annotations
+
+import logging
+import time
+from datetime import datetime, timedelta
+from typing import List, Optional
+from uuid import UUID
+
+from sqlalchemy.orm import Session
+
+from app.domain.errors import EntityNotFoundError, BusinessRuleViolation
+from app.models.operational_alert import (
+    AlertInstance, AlertRule, AlertRuleType, AlertSeverity, AlertStatus,
+)
+from app.models.technique import Technique
+from app.models.risk_intelligence import TechniqueRiskProfile
+from app.models.ownership_queue import RevalidationQueueItem, QueueStatus
+from app.models.ownership_queue import TechniqueOwnership
+from app.models.executive_dashboard import PostureSnapshot
+from app.models.enums import TechniqueStatus
+
+log = logging.getLogger(__name__)
+
+# ── Pre-configured system rules (seeded at startup) ───────────────────────────
+
+SYSTEM_RULES = [
+    {
+        "name":           "Critical Risk Techniques",
+        "description":    "Fires when 3 or more techniques reach critical risk level (score ≥ 75).",
+        "rule_type":      AlertRuleType.high_risk.value,
+        "severity":       AlertSeverity.critical.value,
+        "is_system":      True,
+        "config":         {"min_risk_score": 75.0, "min_count": 3},
+        "cooldown_hours": 24,
+    },
+    {
+        "name":           "High-Risk Technique Spike",
+        "description":    "Fires when 10 or more techniques reach high risk (score ≥ 50).",
+        "rule_type":      AlertRuleType.high_risk.value,
+        "severity":       AlertSeverity.high.value,
+        "is_system":      True,
+        "config":         {"min_risk_score": 50.0, "min_count": 10},
+        "cooldown_hours": 24,
+    },
+    {
+        "name":           "Stale Technique Detection",
+        "description":    "Fires when 5+ validated techniques have not been reviewed in 30+ days.",
+        "rule_type":      AlertRuleType.stale_technique.value,
+        "severity":       AlertSeverity.medium.value,
+        "is_system":      True,
+        "config":         {"days_stale": 30, "min_count": 5},
+        "cooldown_hours": 48,
+    },
+    {
+        "name":           "Coverage Regression",
+        "description":    "Fires when coverage drops by 5 or more percentage points between daily snapshots.",
+        "rule_type":      AlertRuleType.coverage_regression.value,
+        "severity":       AlertSeverity.high.value,
+        "is_system":      True,
+        "config":         {"min_drop_pct": 5.0},
+        "cooldown_hours": 12,
+    },
+    {
+        "name":           "Low Coverage Warning",
+        "description":    "Fires when overall coverage falls below 30%.",
+        "rule_type":      AlertRuleType.low_coverage.value,
+        "severity":       AlertSeverity.medium.value,
+        "is_system":      True,
+        "config":         {"max_coverage_pct": 30.0},
+        "cooldown_hours": 72,
+    },
+    {
+        "name":           "Revalidation Queue Backlog",
+        "description":    "Fires when 15+ techniques are waiting in the revalidation queue.",
+        "rule_type":      AlertRuleType.expiry_wave.value,
+        "severity":       AlertSeverity.medium.value,
+        "is_system":      True,
+        "config":         {"min_pending_count": 15},
+        "cooldown_hours": 24,
+    },
+    {
+        "name":           "New MITRE Techniques Detected",
+        "description":    "Fires when new ATT&CK techniques are added in the last 7 days.",
+        "rule_type":      AlertRuleType.new_technique.value,
+        "severity":       AlertSeverity.info.value,
+        "is_system":      True,
+        "config":         {"lookback_days": 7, "min_count": 1},
+        "cooldown_hours": 168,  # once a week
+    },
+    {
+        "name":           "Orphan Technique Spike",
+        "description":    "Fires when 20+ techniques have no assigned owner.",
+        "rule_type":      AlertRuleType.orphan_spike.value,
+        "severity":       AlertSeverity.low.value,
+        "is_system":      True,
+        "config":         {"min_orphan_count": 20},
+        "cooldown_hours": 48,
+    },
+]
+
+
+def seed_system_rules(db: Session) -> int:
+    """Ensure all system rules exist (idempotent). Returns count created."""
+    created = 0
+    for rule_def in SYSTEM_RULES:
+        exists = db.query(AlertRule).filter(
+            AlertRule.name      == rule_def["name"],
+            AlertRule.is_system == True,
+        ).first()
+        if not exists:
+            rule = AlertRule(**rule_def)
+            db.add(rule)
+            created += 1
+    if created:
+        db.commit()
+    return created
+
+
+# ── Rule evaluators (one per AlertRuleType) ───────────────────────────────────
+
+def _eval_high_risk(db: Session, rule: AlertRule) -> Optional[dict]:
+    min_score = float(rule.config.get("min_risk_score", 75.0))
+    min_count = int(rule.config.get("min_count", 1))
+
+    profiles = db.query(TechniqueRiskProfile).filter(
+        TechniqueRiskProfile.risk_score >= min_score,
+    ).all()
+    count = len(profiles)
+    if count < min_count:
+        return None
+
+    top = sorted(profiles, key=lambda p: p.risk_score, reverse=True)[:5]
+    return {
+        "title":   f"{count} technique(s) with risk score ≥ {min_score:.0f}",
+        "message": (
+            f"{count} technique(s) have reached risk score ≥ {min_score:.0f}. "
+            f"Top: {', '.join(str(p.technique_id)[:8] + '…' for p in top[:3])}."
+        ),
+        "details": {
+            "count":     count,
+            "threshold": min_score,
+            "top_ids":   [str(p.technique_id) for p in top],
+            "top_scores": [p.risk_score for p in top],
+        },
+    }
+
+
+def _eval_stale_technique(db: Session, rule: AlertRule) -> Optional[dict]:
+    days_stale = int(rule.config.get("days_stale", 30))
+    min_count  = int(rule.config.get("min_count", 1))
+    cutoff     = datetime.utcnow() - timedelta(days=days_stale)
+
+    stale = db.query(Technique).filter(
+        Technique.status_global == TechniqueStatus.validated,
+        Technique.last_review_date < cutoff,
+    ).all()
+    count = len(stale)
+    if count < min_count:
+        return None
+
+    return {
+        "title":   f"{count} validated technique(s) stale for {days_stale}+ days",
+        "message": (
+            f"{count} technique(s) have been validated but not reviewed in over "
+            f"{days_stale} days. Re-validate to maintain confidence."
+        ),
+        "details": {
+            "count":       count,
+            "days_stale":  days_stale,
+            "example_ids": [str(t.id) for t in stale[:10]],
+        },
+    }
+
+
+def _eval_coverage_regression(db: Session, rule: AlertRule) -> Optional[dict]:
+    min_drop = float(rule.config.get("min_drop_pct", 5.0))
+
+    snaps = (
+        db.query(PostureSnapshot)
+        .order_by(PostureSnapshot.snapshot_date.desc())
+        .limit(2)
+        .all()
+    )
+    if len(snaps) < 2:
+        return None
+
+    latest, previous = snaps[0], snaps[1]
+    drop = previous.coverage_pct - latest.coverage_pct
+    if drop < min_drop:
+        return None
+
+    return {
+        "title":   f"Coverage dropped {drop:.1f}% ({previous.coverage_pct:.1f}% → {latest.coverage_pct:.1f}%)",
+        "message": (
+            f"Overall coverage fell by {drop:.1f} percentage points "
+            f"between {previous.snapshot_date} and {latest.snapshot_date}. "
+            f"Investigate recent technique status changes."
+        ),
+        "details": {
+            "previous_pct":  previous.coverage_pct,
+            "current_pct":   latest.coverage_pct,
+            "drop_pct":      round(drop, 2),
+            "previous_date": str(previous.snapshot_date),
+            "current_date":  str(latest.snapshot_date),
+        },
+    }
+
+
+def _eval_low_coverage(db: Session, rule: AlertRule) -> Optional[dict]:
+    max_pct = float(rule.config.get("max_coverage_pct", 30.0))
+    techniques = db.query(Technique).all()
+    total = len(techniques)
+    if total == 0:
+        return None
+
+    validated = sum(1 for t in techniques if t.status_global == TechniqueStatus.validated)
+    partial   = sum(1 for t in techniques if t.status_global == TechniqueStatus.partial)
+    coverage  = (validated + partial * 0.5) / total * 100.0
+
+    if coverage > max_pct:
+        return None
+
+    return {
+        "title":   f"Coverage is critically low: {coverage:.1f}%",
+        "message": (
+            f"Current detection coverage is {coverage:.1f}%, below the minimum "
+            f"threshold of {max_pct:.0f}%. Prioritise coverage improvements."
+        ),
+        "details": {
+            "coverage_pct": round(coverage, 2),
+            "threshold":    max_pct,
+            "validated":    validated,
+            "partial":      partial,
+            "total":        total,
+        },
+    }
+
+
+def _eval_expiry_wave(db: Session, rule: AlertRule) -> Optional[dict]:
+    min_pending = int(rule.config.get("min_pending_count", 15))
+
+    pending_count = db.query(RevalidationQueueItem).filter(
+        RevalidationQueueItem.status.in_([
+            QueueStatus.pending, QueueStatus.in_progress,
+        ]),
+    ).count()
+
+    if pending_count < min_pending:
+        return None
+
+    return {
+        "title":   f"Revalidation queue backlog: {pending_count} items pending",
+        "message": (
+            f"{pending_count} technique(s) are waiting in the revalidation queue "
+            f"(threshold: {min_pending}). Assign analysts to clear the backlog."
+        ),
+        "details": {
+            "pending_count": pending_count,
+            "threshold":     min_pending,
+        },
+    }
+
+
+def _eval_new_technique(db: Session, rule: AlertRule) -> Optional[dict]:
+    lookback_days = int(rule.config.get("lookback_days", 7))
+    min_count     = int(rule.config.get("min_count", 1))
+    cutoff        = datetime.utcnow() - timedelta(days=lookback_days)
+
+    new_techs = db.query(Technique).filter(
+        Technique.mitre_last_modified >= cutoff,
+    ).all()
+    count = len(new_techs)
+    if count < min_count:
+        return None
+
+    return {
+        "title":   f"{count} new/updated MITRE technique(s) in last {lookback_days} days",
+        "message": (
+            f"{count} ATT&CK technique(s) have been added or updated in the last "
+            f"{lookback_days} days. Review and assign coverage."
+        ),
+        "details": {
+            "count":        count,
+            "lookback_days": lookback_days,
+            "technique_ids": [str(t.id) for t in new_techs[:20]],
+            "mitre_ids":    [t.mitre_id for t in new_techs[:20]],
+        },
+    }
+
+
+def _eval_orphan_spike(db: Session, rule: AlertRule) -> Optional[dict]:
+    min_orphans = int(rule.config.get("min_orphan_count", 20))
+
+    total  = db.query(Technique).count()
+    owned  = db.query(TechniqueOwnership).filter(
+        TechniqueOwnership.owner_id.isnot(None),
+    ).count()
+    orphans = max(total - owned, 0)
+
+    if orphans < min_orphans:
+        return None
+
+    return {
+        "title":   f"{orphans} unowned techniques detected",
+        "message": (
+            f"{orphans} out of {total} technique(s) have no assigned owner. "
+            f"Assign ownership to ensure accountability."
+        ),
+        "details": {
+            "orphan_count": orphans,
+            "total":        total,
+            "threshold":    min_orphans,
+        },
+    }
+
+
+_EVALUATORS = {
+    AlertRuleType.high_risk.value:           _eval_high_risk,
+    AlertRuleType.stale_technique.value:     _eval_stale_technique,
+    AlertRuleType.coverage_regression.value: _eval_coverage_regression,
+    AlertRuleType.low_coverage.value:        _eval_low_coverage,
+    AlertRuleType.expiry_wave.value:         _eval_expiry_wave,
+    AlertRuleType.new_technique.value:       _eval_new_technique,
+    AlertRuleType.orphan_spike.value:        _eval_orphan_spike,
+}
+
+
+# ── Core evaluation engine ────────────────────────────────────────────────────
+
+def _in_cooldown(rule: AlertRule) -> bool:
+    if rule.last_fired_at is None:
+        return False
+    if rule.cooldown_hours <= 0:
+        return False
+    return datetime.utcnow() < rule.last_fired_at + timedelta(hours=rule.cooldown_hours)
+
+
+def evaluate_all_rules(db: Session) -> dict:
+    """Evaluate every enabled rule; create AlertInstances for those that fire."""
+    t0 = time.monotonic()
+    rules = db.query(AlertRule).filter(AlertRule.is_enabled == True).all()
+
+    fired: List[AlertInstance] = []
+    for rule in rules:
+        if _in_cooldown(rule):
+            continue
+        evaluator = _EVALUATORS.get(rule.rule_type)
+        if not evaluator:
+            continue
+        try:
+            result = evaluator(db, rule)
+        except Exception:
+            log.exception("Error evaluating rule %s (%s)", rule.id, rule.name)
+            continue
+
+        if result is None:
+            continue  # condition not met
+
+        instance = AlertInstance(
+            rule_id   = rule.id,
+            rule_name = rule.name,
+            rule_type = rule.rule_type,
+            severity  = rule.severity,
+            title     = result["title"],
+            message   = result["message"],
+            details   = result.get("details"),
+            status    = AlertStatus.open.value,
+        )
+        db.add(instance)
+        rule.last_fired_at = datetime.utcnow()
+        fired.append(instance)
+
+    db.commit()
+    for inst in fired:
+        db.refresh(inst)
+
+    return {
+        "rules_evaluated":  len(rules),
+        "alerts_fired":     len(fired),
+        "alerts":           fired,
+        "duration_seconds": round(time.monotonic() - t0, 3),
+    }
+
+
+# ── AlertRule CRUD ────────────────────────────────────────────────────────────
+
+def list_rules(
+    db: Session,
+    rule_type:    Optional[str] = None,
+    include_disabled: bool      = False,
+) -> List[AlertRule]:
+    q = db.query(AlertRule)
+    if rule_type:
+        q = q.filter(AlertRule.rule_type == rule_type)
+    if not include_disabled:
+        q = q.filter(AlertRule.is_enabled == True)
+    return q.order_by(AlertRule.created_at.asc()).all()
+
+
+def get_rule(db: Session, rule_id: UUID) -> AlertRule:
+    rule = db.query(AlertRule).filter(AlertRule.id == rule_id).first()
+    if not rule:
+        raise EntityNotFoundError("AlertRule", str(rule_id))
+    return rule
+
+
+def create_rule(db: Session, created_by: UUID, **kwargs) -> AlertRule:
+    kwargs["is_system"] = False
+    kwargs["created_by"] = created_by
+    rule = AlertRule(**kwargs)
+    db.add(rule)
+    db.commit()
+    db.refresh(rule)
+    return rule
+
+
+def update_rule(db: Session, rule_id: UUID, **kwargs) -> AlertRule:
+    rule = get_rule(db, rule_id)
+    for k, v in kwargs.items():
+        if v is not None:
+            setattr(rule, k, v)
+    db.commit()
+    db.refresh(rule)
+    return rule
+
+
+def delete_rule(db: Session, rule_id: UUID) -> None:
+    rule = get_rule(db, rule_id)
+    if rule.is_system:
+        raise BusinessRuleViolation("System rules cannot be deleted. Disable them instead.")
+    db.delete(rule)
+    db.commit()
+
+
+# ── AlertInstance CRUD ────────────────────────────────────────────────────────
+
+def list_instances(
+    db: Session,
+    status:    Optional[str] = None,
+    severity:  Optional[str] = None,
+    rule_type: Optional[str] = None,
+    limit:     int = 50,
+    offset:    int = 0,
+) -> List[AlertInstance]:
+    q = db.query(AlertInstance)
+    if status:
+        q = q.filter(AlertInstance.status == status)
+    if severity:
+        q = q.filter(AlertInstance.severity == severity)
+    if rule_type:
+        q = q.filter(AlertInstance.rule_type == rule_type)
+    return q.order_by(AlertInstance.created_at.desc()).offset(offset).limit(limit).all()
+
+
+def get_instance(db: Session, instance_id: UUID) -> AlertInstance:
+    inst = db.query(AlertInstance).filter(AlertInstance.id == instance_id).first()
+    if not inst:
+        raise EntityNotFoundError("AlertInstance", str(instance_id))
+    return inst
+
+
+def _transition(
+    db: Session,
+    instance_id: UUID,
+    new_status: str,
+    user_id: Optional[UUID] = None,
+) -> AlertInstance:
+    inst = get_instance(db, instance_id)
+    inst.status = new_status
+    if new_status == AlertStatus.acknowledged.value:
+        inst.acknowledged_by = user_id
+        inst.acknowledged_at = datetime.utcnow()
+    elif new_status == AlertStatus.resolved.value:
+        inst.resolved_at = datetime.utcnow()
+    db.commit()
+    db.refresh(inst)
+    return inst
+
+
+def acknowledge(db: Session, instance_id: UUID, user_id: UUID) -> AlertInstance:
+    inst = get_instance(db, instance_id)
+    if inst.status != AlertStatus.open.value:
+        raise BusinessRuleViolation(f"Cannot acknowledge alert in status '{inst.status}'.")
+    return _transition(db, instance_id, AlertStatus.acknowledged.value, user_id)
+
+
+def resolve(db: Session, instance_id: UUID, user_id: UUID) -> AlertInstance:
+    inst = get_instance(db, instance_id)
+    if inst.status == AlertStatus.resolved.value:
+        raise BusinessRuleViolation("Alert is already resolved.")
+    return _transition(db, instance_id, AlertStatus.resolved.value, user_id)
+
+
+def dismiss(db: Session, instance_id: UUID, user_id: UUID) -> AlertInstance:
+    inst = get_instance(db, instance_id)
+    if inst.status in (AlertStatus.resolved.value, AlertStatus.dismissed.value):
+        raise BusinessRuleViolation(f"Cannot dismiss alert in status '{inst.status}'.")
+    return _transition(db, instance_id, AlertStatus.dismissed.value, user_id)
+
+
+def get_summary(db: Session) -> dict:
+    instances = db.query(AlertInstance).all()
+
+    by_status   = {s.value: 0 for s in AlertStatus}
+    by_severity = {s.value: 0 for s in AlertSeverity}
+    by_type     = {}
+
+    for i in instances:
+        by_status[i.status]     = by_status.get(i.status, 0) + 1
+        by_severity[i.severity] = by_severity.get(i.severity, 0) + 1
+        by_type[i.rule_type]    = by_type.get(i.rule_type, 0) + 1
+
+    recent = (
+        db.query(AlertInstance)
+        .filter(AlertInstance.status == AlertStatus.open.value)
+        .order_by(AlertInstance.created_at.desc())
+        .limit(5)
+        .all()
+    )
+
+    return {
+        "total_open":         by_status.get(AlertStatus.open.value, 0),
+        "total_acknowledged": by_status.get(AlertStatus.acknowledged.value, 0),
+        "total_resolved":     by_status.get(AlertStatus.resolved.value, 0),
+        "by_severity":        by_severity,
+        "by_rule_type":       by_type,
+        "recent_alerts":      recent,
+    }