feat(alerts): Phase 13 — Operational Alert Engine
Some checks failed
Aegis CI / lint-and-test (push) Has been cancelled
Some checks failed
Aegis CI / lint-and-test (push) Has been cancelled
AlertRule + AlertInstance models (b041alerts migration), 8 pre-seeded system rules (high_risk x2, stale_technique, coverage_regression, low_coverage, expiry_wave, new_technique, orphan_spike), evaluation engine with per-rule cooldown, full alert lifecycle (acknowledge/resolve/dismiss), custom rule CRUD, and summary endpoint. Rules seeded at app startup. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -46,6 +46,7 @@ from app.routers import risk_intelligence as risk_router
|
||||
from app.routers import executive_dashboard as dashboard_router
|
||||
from app.routers import api_keys as api_keys_router
|
||||
from app.routers import sso as sso_router
|
||||
from app.routers import operational_alerts as alerts_router
|
||||
from app.domain.errors import DomainError
|
||||
from app.middleware.error_handler import domain_exception_handler
|
||||
from app.middleware.request_context import RequestContextMiddleware
|
||||
@@ -76,6 +77,15 @@ async def lifespan(app: FastAPI):
|
||||
pass
|
||||
finally:
|
||||
db.close()
|
||||
# Seed operational alert system rules
|
||||
db2 = SessionLocal()
|
||||
try:
|
||||
from app.services.operational_alert_service import seed_system_rules
|
||||
seed_system_rules(db2)
|
||||
except Exception:
|
||||
pass
|
||||
finally:
|
||||
db2.close()
|
||||
yield
|
||||
# Graceful shutdown of the background scheduler
|
||||
scheduler.shutdown(wait=False)
|
||||
@@ -151,6 +161,7 @@ app.include_router(risk_router.router, prefix="/api/v1")
|
||||
app.include_router(dashboard_router.router, prefix="/api/v1")
|
||||
app.include_router(api_keys_router.router, prefix="/api/v1")
|
||||
app.include_router(sso_router.router, prefix="/api/v1")
|
||||
app.include_router(alerts_router.router, prefix="/api/v1")
|
||||
|
||||
|
||||
@app.get("/health", include_in_schema=False)
|
||||
|
||||
@@ -43,6 +43,7 @@ from app.models.risk_intelligence import TechniqueRiskProfile
|
||||
from app.models.executive_dashboard import PostureSnapshot
|
||||
from app.models.api_key import ApiKey
|
||||
from app.models.sso_config import SsoConfig
|
||||
from app.models.operational_alert import AlertRule, AlertInstance
|
||||
|
||||
__all__ = [
|
||||
"User", "Technique", "Test", "TestTemplate", "Evidence",
|
||||
@@ -69,4 +70,6 @@ __all__ = [
|
||||
"PostureSnapshot",
|
||||
"ApiKey",
|
||||
"SsoConfig",
|
||||
"AlertRule",
|
||||
"AlertInstance",
|
||||
]
|
||||
|
||||
144
backend/app/models/operational_alert.py
Normal file
144
backend/app/models/operational_alert.py
Normal file
@@ -0,0 +1,144 @@
|
||||
"""Phase 13: Operational Alerts — AlertRule and AlertInstance models."""
|
||||
|
||||
import enum
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
from sqlalchemy import (
|
||||
Boolean, Column, DateTime, ForeignKey,
|
||||
Index, Integer, String, Text,
|
||||
)
|
||||
from sqlalchemy.dialects.postgresql import JSONB, UUID
|
||||
from sqlalchemy.orm import relationship
|
||||
|
||||
from app.database import Base
|
||||
|
||||
|
||||
# ── Enumerations ──────────────────────────────────────────────────────────────
|
||||
|
||||
class AlertSeverity(str, enum.Enum):
    """Severity labels for alert rules and instances.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value.
    """

    # Listed from "critical" down to "info".
    critical = "critical"
    high = "high"
    medium = "medium"
    low = "low"
    info = "info"
|
||||
|
||||
|
||||
class AlertStatus(str, enum.Enum):
    """Lifecycle states of an alert instance.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value.
    """

    open = "open"
    acknowledged = "acknowledged"
    resolved = "resolved"
    dismissed = "dismissed"
|
||||
|
||||
|
||||
class AlertRuleType(str, enum.Enum):
    """Kinds of condition an alert rule can check.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value.
    """

    # risk_score >= threshold
    high_risk = "high_risk"
    # not validated in N days
    stale_technique = "stale_technique"
    # coverage_pct dropped
    coverage_regression = "coverage_regression"
    # coverage below min
    low_coverage = "low_coverage"
    # many pending queue items
    expiry_wave = "expiry_wave"
    # new MITRE techniques added
    new_technique = "new_technique"
    # many unowned techniques
    orphan_spike = "orphan_spike"
    # future extension placeholder
    custom = "custom"
|
||||
|
||||
|
||||
# ── AlertRule ─────────────────────────────────────────────────────────────────
|
||||
|
||||
class AlertRule(Base):
    """
    Defines a condition that, when satisfied, fires an AlertInstance.

    System rules (is_system=True) are seeded at startup and cannot be deleted.
    Custom rules (is_system=False) can be created by admins.

    Deleting a rule does not delete the AlertInstance rows it produced: the
    ORM nulls their ``rule_id`` (matching the ``SET NULL`` foreign key on
    ``alert_instances.rule_id``), so alert history survives rule deletion.
    """

    __tablename__ = "alert_rules"

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    name = Column(String(300), nullable=False)
    description = Column(Text, nullable=True)
    rule_type = Column(String(50), nullable=False)
    severity = Column(String(20), nullable=False, default=AlertSeverity.medium.value)
    is_enabled = Column(Boolean, nullable=False, default=True)
    is_system = Column(Boolean, nullable=False, default=False)  # seeded, not deletable

    # Rule-specific thresholds/config (varies by rule_type).
    # A callable default gives every new row its own dict instead of sharing
    # one module-level ``{}`` object between all instances.
    config = Column(JSONB, nullable=False, default=dict)

    # Delivery
    notify_in_app = Column(Boolean, nullable=False, default=True)
    notify_webhook = Column(Boolean, nullable=False, default=False)
    webhook_id = Column(
        UUID(as_uuid=True),
        ForeignKey("webhook_configs.id", ondelete="SET NULL"),
        nullable=True,
    )

    # Cooldown — don't re-fire within N hours of last firing
    cooldown_hours = Column(Integer, nullable=False, default=24)

    # Meta
    created_by = Column(
        UUID(as_uuid=True),
        ForeignKey("users.id", ondelete="SET NULL"),
        nullable=True,
    )
    created_at = Column(DateTime, default=datetime.utcnow)  # naive UTC timestamp
    last_fired_at = Column(DateTime, nullable=True)

    creator = relationship("User", foreign_keys=[created_by])
    # No delete / delete-orphan cascade here: with the default cascade the ORM
    # sets AlertInstance.rule_id to NULL when a rule is deleted, which keeps
    # the fired alerts (and their denormalised rule fields) as history.
    instances = relationship("AlertInstance", back_populates="rule")

    __table_args__ = (
        Index("ix_alert_rules_type", "rule_type"),
        Index("ix_alert_rules_enabled", "is_enabled"),
    )
|
||||
|
||||
|
||||
# ── AlertInstance ─────────────────────────────────────────────────────────────
|
||||
|
||||
class AlertInstance(Base):
    """
    One recorded firing of an AlertRule.

    Transitions: open → acknowledged → resolved
                 open → dismissed
    """

    __tablename__ = "alert_instances"

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)

    # Link back to the originating rule; nullable with SET NULL so the row
    # can outlive the rule.
    rule_id = Column(
        UUID(as_uuid=True),
        ForeignKey("alert_rules.id", ondelete="SET NULL"),
        nullable=True,
    )

    # Denormalised fields kept for history even after rule deletion
    rule_name = Column(String(300), nullable=False)
    rule_type = Column(String(50), nullable=False)
    severity = Column(String(20), nullable=False)

    # Human-readable content of the alert
    title = Column(String(500), nullable=False)
    message = Column(Text, nullable=False)
    details = Column(JSONB, nullable=True)  # structured context

    # Lifecycle tracking
    status = Column(String(20), nullable=False, default=AlertStatus.open.value)
    acknowledged_by = Column(
        UUID(as_uuid=True),
        ForeignKey("users.id", ondelete="SET NULL"),
        nullable=True,
    )
    acknowledged_at = Column(DateTime, nullable=True)
    resolved_at = Column(DateTime, nullable=True)
    created_at = Column(DateTime, default=datetime.utcnow)  # naive UTC timestamp

    rule = relationship("AlertRule", back_populates="instances")
    acknowledger = relationship("User", foreign_keys=[acknowledged_by])

    __table_args__ = (
        Index("ix_alert_instances_rule_id", "rule_id"),
        Index("ix_alert_instances_status", "status"),
        Index("ix_alert_instances_severity", "severity"),
        Index("ix_alert_instances_created", "created_at"),
    )
|
||||
191
backend/app/routers/operational_alerts.py
Normal file
191
backend/app/routers/operational_alerts.py
Normal file
@@ -0,0 +1,191 @@
|
||||
"""Phase 13: Operational Alerts router."""
|
||||
|
||||
from typing import List, Optional
|
||||
from uuid import UUID
|
||||
|
||||
from fastapi import APIRouter, Depends, Query
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.database import get_db
|
||||
from app.dependencies.auth import get_current_user, require_any_role
|
||||
from app.models.user import User
|
||||
from app.schemas.operational_alert_schema import (
|
||||
AlertRuleCreate, AlertRuleOut, AlertRuleUpdate,
|
||||
AlertInstanceOut, EvaluationResult, AlertSummary,
|
||||
)
|
||||
import app.services.operational_alert_service as svc
|
||||
|
||||
router = APIRouter(prefix="/alerts", tags=["Operational Alerts"])
|
||||
|
||||
|
||||
# ── Evaluation ────────────────────────────────────────────────────────────────
|
||||
|
||||
@router.post("/evaluate", response_model=EvaluationResult, status_code=202)
def evaluate_rules(
    db: Session = Depends(get_db),
    user=Depends(require_any_role("admin", "red_lead", "blue_lead")),
):
    """
    Run the alert evaluation engine against all enabled rules.

    Fires AlertInstances for rules whose conditions are met and are not in cooldown.
    Admin / leads only.

    Returns the service's counters plus the fired alerts converted to
    ``AlertInstanceOut``.
    """
    outcome = svc.evaluate_all_rules(db)
    # Convert ORM rows to response schemas before building the envelope.
    fired_alerts = [AlertInstanceOut.model_validate(row) for row in outcome["alerts"]]
    return EvaluationResult(
        rules_evaluated=outcome["rules_evaluated"],
        alerts_fired=outcome["alerts_fired"],
        alerts=fired_alerts,
        duration_seconds=outcome["duration_seconds"],
    )
|
||||
|
||||
|
||||
# ── Alert instances ───────────────────────────────────────────────────────────
|
||||
|
||||
@router.get("", response_model=List[AlertInstanceOut])
def list_alerts(
    status: Optional[str] = Query(None),
    severity: Optional[str] = Query(None),
    rule_type: Optional[str] = Query(None),
    limit: int = Query(50, ge=1, le=200),
    offset: int = Query(0, ge=0),
    db: Session = Depends(get_db),
    user=Depends(get_current_user),
):
    """List alert instances with optional filters.

    ``status``, ``severity`` and ``rule_type`` are passed to the service
    unchanged; ``limit`` (1-200, default 50) and ``offset`` (>= 0) page the
    result.
    """
    filters = {"status": status, "severity": severity, "rule_type": rule_type}
    return svc.list_instances(db, limit=limit, offset=offset, **filters)
|
||||
|
||||
|
||||
@router.get("/summary", response_model=AlertSummary)
def alert_summary(
    db: Session = Depends(get_db),
    user=Depends(get_current_user),
):
    """Aggregate counts by status, severity, and rule type.

    The recent alerts returned by the service are converted to
    ``AlertInstanceOut`` before being placed in the response.
    """
    summary = svc.get_summary(db)
    recent = [AlertInstanceOut.model_validate(row) for row in summary["recent_alerts"]]
    return AlertSummary(
        total_open=summary["total_open"],
        total_acknowledged=summary["total_acknowledged"],
        total_resolved=summary["total_resolved"],
        by_severity=summary["by_severity"],
        by_rule_type=summary["by_rule_type"],
        recent_alerts=recent,
    )
|
||||
|
||||
|
||||
@router.get("/{alert_id}", response_model=AlertInstanceOut)
def get_alert(
    alert_id: UUID,
    db: Session = Depends(get_db),
    user=Depends(get_current_user),
):
    """Get a single alert instance by its UUID (any authenticated user)."""
    instance = svc.get_instance(db, alert_id)
    return instance
|
||||
|
||||
|
||||
@router.post("/{alert_id}/acknowledge", response_model=AlertInstanceOut)
def acknowledge_alert(
    alert_id: UUID,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Acknowledge an open alert.

    The calling user's id is passed to the service as the acknowledger.
    """
    acting_user_id = current_user.id
    return svc.acknowledge(db, alert_id, acting_user_id)
|
||||
|
||||
|
||||
@router.post("/{alert_id}/resolve", response_model=AlertInstanceOut)
def resolve_alert(
    alert_id: UUID,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Mark an alert as resolved.

    The calling user's id is forwarded to the service together with the
    alert id.
    """
    acting_user_id = current_user.id
    return svc.resolve(db, alert_id, acting_user_id)
|
||||
|
||||
|
||||
@router.post("/{alert_id}/dismiss", response_model=AlertInstanceOut)
def dismiss_alert(
    alert_id: UUID,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Dismiss an alert (won't re-fire until cooldown resets).

    The calling user's id is forwarded to the service together with the
    alert id.
    """
    acting_user_id = current_user.id
    return svc.dismiss(db, alert_id, acting_user_id)
|
||||
|
||||
|
||||
# ── Alert rules ───────────────────────────────────────────────────────────────
|
||||
|
||||
@router.get("/rules/list", response_model=List[AlertRuleOut])
def list_rules(
    rule_type: Optional[str] = Query(None),
    include_disabled: bool = Query(False),
    db: Session = Depends(get_db),
    user=Depends(get_current_user),
):
    """List alert rules (all users can read; admins/leads manage them).

    ``rule_type`` optionally narrows the listing; ``include_disabled``
    defaults to False. Both are passed to the service unchanged.
    """
    rules = svc.list_rules(
        db,
        rule_type=rule_type,
        include_disabled=include_disabled,
    )
    return rules
|
||||
|
||||
|
||||
@router.post("/rules", response_model=AlertRuleOut, status_code=201)
def create_rule(
    body: AlertRuleCreate,
    db: Session = Depends(get_db),
    current_user: User = Depends(require_any_role("admin", "red_lead", "blue_lead")),
):
    """Create a custom alert rule.

    Every field of the request body is forwarded to the service, and the
    calling user's id is recorded as ``created_by``.
    """
    # Fields copied one-to-one from the request body into service kwargs.
    forwarded_fields = (
        "name",
        "description",
        "rule_type",
        "severity",
        "config",
        "notify_in_app",
        "notify_webhook",
        "webhook_id",
        "cooldown_hours",
    )
    rule_kwargs = {field: getattr(body, field) for field in forwarded_fields}
    return svc.create_rule(db, created_by=current_user.id, **rule_kwargs)
|
||||
|
||||
|
||||
@router.get("/rules/{rule_id}", response_model=AlertRuleOut)
def get_rule(
    rule_id: UUID,
    db: Session = Depends(get_db),
    user=Depends(get_current_user),
):
    """Get a single alert rule by its UUID (any authenticated user)."""
    rule = svc.get_rule(db, rule_id)
    return rule
|
||||
|
||||
|
||||
@router.patch("/rules/{rule_id}", response_model=AlertRuleOut)
def update_rule(
    rule_id: UUID,
    body: AlertRuleUpdate,
    db: Session = Depends(get_db),
    user=Depends(require_any_role("admin", "red_lead", "blue_lead")),
):
    """Update an alert rule (enable/disable, thresholds, cooldown).

    All body fields are forwarded to the service, including those left at
    their ``None`` default.
    """
    # Fields copied one-to-one from the request body into service kwargs.
    forwarded_fields = (
        "name",
        "description",
        "severity",
        "is_enabled",
        "config",
        "notify_in_app",
        "notify_webhook",
        "webhook_id",
        "cooldown_hours",
    )
    changes = {field: getattr(body, field) for field in forwarded_fields}
    return svc.update_rule(db, rule_id, **changes)
|
||||
|
||||
|
||||
@router.delete("/rules/{rule_id}", status_code=204)
def delete_rule(
    rule_id: UUID,
    db: Session = Depends(get_db),
    user=Depends(require_any_role("admin")),
):
    """Delete a custom alert rule (system rules cannot be deleted).

    Admin only. Responds with 204 and no body.
    """
    svc.delete_rule(db, rule_id)
    return None
|
||||
124
backend/app/schemas/operational_alert_schema.py
Normal file
124
backend/app/schemas/operational_alert_schema.py
Normal file
@@ -0,0 +1,124 @@
|
||||
"""Phase 13: Operational Alerts — Pydantic schemas."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Optional
|
||||
from uuid import UUID
|
||||
|
||||
from pydantic import BaseModel, Field, field_validator
|
||||
|
||||
from app.models.operational_alert import AlertRuleType, AlertSeverity, AlertStatus
|
||||
|
||||
# Allowed string values, derived from the model enums so the schemas and the
# ORM layer share one definition.
VALID_SEVERITIES = set(member.value for member in AlertSeverity)
VALID_STATUSES = set(member.value for member in AlertStatus)
VALID_RULE_TYPES = set(member.value for member in AlertRuleType)
|
||||
|
||||
|
||||
# ── AlertRule schemas ─────────────────────────────────────────────────────────
|
||||
|
||||
class AlertRuleCreate(BaseModel):
    """Request body for creating an alert rule.

    ``rule_type`` and ``severity`` must be one of the values in
    ``VALID_RULE_TYPES`` / ``VALID_SEVERITIES``; ``cooldown_hours`` is
    limited to 0-8760.
    """

    name: str = Field(..., min_length=1, max_length=300)
    description: Optional[str] = None
    rule_type: str
    severity: str = "medium"
    config: Dict[str, Any] = Field(default_factory=dict)
    notify_in_app: bool = True
    notify_webhook: bool = False
    webhook_id: Optional[UUID] = None
    cooldown_hours: int = Field(24, ge=0, le=8760)

    @field_validator("rule_type")
    @classmethod
    def validate_rule_type(cls, v: str) -> str:
        """Accept only known rule types; raise ValueError otherwise."""
        if v in VALID_RULE_TYPES:
            return v
        raise ValueError(f"Invalid rule_type. Valid: {VALID_RULE_TYPES}")

    @field_validator("severity")
    @classmethod
    def validate_severity(cls, v: str) -> str:
        """Accept only known severities; raise ValueError otherwise."""
        if v in VALID_SEVERITIES:
            return v
        raise ValueError(f"Invalid severity. Valid: {VALID_SEVERITIES}")
|
||||
|
||||
|
||||
class AlertRuleUpdate(BaseModel):
    """Request body for partially updating an alert rule.

    Every field is optional and defaults to ``None``. ``rule_type`` is not
    part of this schema.
    """

    name: Optional[str] = Field(None, min_length=1, max_length=300)
    description: Optional[str] = None
    severity: Optional[str] = None
    is_enabled: Optional[bool] = None
    config: Optional[Dict[str, Any]] = None
    notify_in_app: Optional[bool] = None
    notify_webhook: Optional[bool] = None
    webhook_id: Optional[UUID] = None
    cooldown_hours: Optional[int] = Field(None, ge=0, le=8760)

    @field_validator("severity")
    @classmethod
    def validate_severity(cls, v: Optional[str]) -> Optional[str]:
        """Allow ``None`` or a known severity; raise ValueError otherwise."""
        if v is None or v in VALID_SEVERITIES:
            return v
        raise ValueError(f"Invalid severity. Valid: {VALID_SEVERITIES}")
|
||||
|
||||
|
||||
class AlertRuleOut(BaseModel):
    """Response schema for an alert rule, populated from ORM attributes."""

    # Pydantic v2 configuration. Replaces the nested ``class Config``, which
    # is deprecated in v2 and emits a deprecation warning at class creation.
    # ``from_attributes`` lets ``model_validate`` read values off ORM objects.
    model_config = {"from_attributes": True}

    id: UUID
    name: str
    description: Optional[str] = None
    rule_type: str
    severity: str
    is_enabled: bool
    is_system: bool
    config: Dict[str, Any]
    notify_in_app: bool
    notify_webhook: bool
    webhook_id: Optional[UUID] = None
    cooldown_hours: int
    created_by: Optional[UUID] = None
    created_at: Optional[datetime] = None
    last_fired_at: Optional[datetime] = None
|
||||
|
||||
|
||||
# ── AlertInstance schemas ─────────────────────────────────────────────────────
|
||||
|
||||
class AlertInstanceOut(BaseModel):
    """Response schema for a fired alert, populated from ORM attributes."""

    # Pydantic v2 configuration. Replaces the nested ``class Config``, which
    # is deprecated in v2 and emits a deprecation warning at class creation.
    # ``from_attributes`` lets ``model_validate`` read values off ORM objects.
    model_config = {"from_attributes": True}

    id: UUID
    rule_id: Optional[UUID] = None
    rule_name: str
    rule_type: str
    severity: str
    title: str
    message: str
    details: Optional[Dict[str, Any]] = None
    status: str
    acknowledged_by: Optional[UUID] = None
    acknowledged_at: Optional[datetime] = None
    resolved_at: Optional[datetime] = None
    created_at: Optional[datetime] = None
|
||||
|
||||
|
||||
# ── Evaluation result ─────────────────────────────────────────────────────────
|
||||
|
||||
class EvaluationResult(BaseModel):
    """Response envelope for one run of the evaluation endpoint."""

    # Counters reported by the evaluation service.
    rules_evaluated: int
    alerts_fired: int
    # Alerts created during the run; empty list when omitted.
    alerts: List[AlertInstanceOut] = Field(default_factory=list)
    # Duration of the run in seconds.
    duration_seconds: float
|
||||
|
||||
|
||||
# ── Summary ───────────────────────────────────────────────────────────────────
|
||||
|
||||
class AlertSummary(BaseModel):
    """Response schema for the alert summary endpoint."""

    # Totals per lifecycle status.
    total_open: int
    total_acknowledged: int
    total_resolved: int
    # Counts keyed by severity value and by rule type value.
    by_severity: Dict[str, int]
    by_rule_type: Dict[str, int]
    # Recent alerts supplied by the service; empty list when omitted.
    recent_alerts: List[AlertInstanceOut] = Field(default_factory=list)
|
||||
530
backend/app/services/operational_alert_service.py
Normal file
530
backend/app/services/operational_alert_service.py
Normal file
@@ -0,0 +1,530 @@
|
||||
"""Phase 13: Operational Alert service — rule evaluation engine + CRUD."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from typing import List, Optional
|
||||
from uuid import UUID
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.domain.errors import EntityNotFoundError, BusinessRuleViolation
|
||||
from app.models.operational_alert import (
|
||||
AlertInstance, AlertRule, AlertRuleType, AlertSeverity, AlertStatus,
|
||||
)
|
||||
from app.models.technique import Technique
|
||||
from app.models.risk_intelligence import TechniqueRiskProfile
|
||||
from app.models.ownership_queue import RevalidationQueueItem, QueueStatus
|
||||
from app.models.ownership_queue import TechniqueOwnership
|
||||
from app.models.executive_dashboard import PostureSnapshot
|
||||
from app.models.enums import TechniqueStatus
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# ── Pre-configured system rules (seeded at startup) ───────────────────────────
|
||||
|
||||
# Built-in rule definitions. Each dict is passed as keyword arguments to
# AlertRule(...) by seed_system_rules(), so the keys must be AlertRule column
# names. The "config" keys are the ones read by the matching _eval_* function.
SYSTEM_RULES = [
    # --- high_risk (critical tier) ---
    {
        "name": "Critical Risk Techniques",
        "description": "Fires when 3 or more techniques reach critical risk level (score ≥ 75).",
        "rule_type": AlertRuleType.high_risk.value,
        "severity": AlertSeverity.critical.value,
        "is_system": True,
        "config": {"min_risk_score": 75.0, "min_count": 3},
        "cooldown_hours": 24,
    },
    # --- high_risk (high tier) ---
    {
        "name": "High-Risk Technique Spike",
        "description": "Fires when 10 or more techniques reach high risk (score ≥ 50).",
        "rule_type": AlertRuleType.high_risk.value,
        "severity": AlertSeverity.high.value,
        "is_system": True,
        "config": {"min_risk_score": 50.0, "min_count": 10},
        "cooldown_hours": 24,
    },
    # --- stale_technique ---
    {
        "name": "Stale Technique Detection",
        "description": "Fires when 5+ validated techniques have not been reviewed in 30+ days.",
        "rule_type": AlertRuleType.stale_technique.value,
        "severity": AlertSeverity.medium.value,
        "is_system": True,
        "config": {"days_stale": 30, "min_count": 5},
        "cooldown_hours": 48,
    },
    # --- coverage_regression ---
    {
        "name": "Coverage Regression",
        "description": "Fires when coverage drops by 5 or more percentage points between daily snapshots.",
        "rule_type": AlertRuleType.coverage_regression.value,
        "severity": AlertSeverity.high.value,
        "is_system": True,
        "config": {"min_drop_pct": 5.0},
        "cooldown_hours": 12,
    },
    # --- low_coverage ---
    {
        "name": "Low Coverage Warning",
        "description": "Fires when overall coverage falls below 30%.",
        "rule_type": AlertRuleType.low_coverage.value,
        "severity": AlertSeverity.medium.value,
        "is_system": True,
        "config": {"max_coverage_pct": 30.0},
        "cooldown_hours": 72,
    },
    # --- expiry_wave ---
    {
        "name": "Revalidation Queue Backlog",
        "description": "Fires when 15+ techniques are waiting in the revalidation queue.",
        "rule_type": AlertRuleType.expiry_wave.value,
        "severity": AlertSeverity.medium.value,
        "is_system": True,
        "config": {"min_pending_count": 15},
        "cooldown_hours": 24,
    },
    # --- new_technique ---
    {
        "name": "New MITRE Techniques Detected",
        "description": "Fires when new ATT&CK techniques are added in the last 7 days.",
        "rule_type": AlertRuleType.new_technique.value,
        "severity": AlertSeverity.info.value,
        "is_system": True,
        "config": {"lookback_days": 7, "min_count": 1},
        "cooldown_hours": 168,  # once a week
    },
    # --- orphan_spike ---
    {
        "name": "Orphan Technique Spike",
        "description": "Fires when 20+ techniques have no assigned owner.",
        "rule_type": AlertRuleType.orphan_spike.value,
        "severity": AlertSeverity.low.value,
        "is_system": True,
        "config": {"min_orphan_count": 20},
        "cooldown_hours": 48,
    },
]
|
||||
|
||||
|
||||
def seed_system_rules(db: Session) -> int:
    """Insert any system rule from SYSTEM_RULES that is not yet stored.

    A rule counts as present when a row with the same name and
    ``is_system`` set already exists, so repeated calls add nothing new.
    Commits only when at least one rule was added.

    Returns:
        The number of rules created by this call.
    """
    added = 0
    for definition in SYSTEM_RULES:
        already_there = (
            db.query(AlertRule)
            .filter(
                AlertRule.name == definition["name"],
                AlertRule.is_system == True,  # noqa: E712 - SQL expression, not a Python bool test
            )
            .first()
        )
        if already_there:
            continue
        db.add(AlertRule(**definition))
        added += 1

    if added:
        db.commit()
    return added
|
||||
|
||||
|
||||
# ── Rule evaluators (one per AlertRuleType) ───────────────────────────────────
|
||||
|
||||
def _eval_high_risk(db: Session, rule: AlertRule) -> Optional[dict]:
    """Check how many risk profiles meet or exceed the configured score.

    Reads ``min_risk_score`` (default 75.0) and ``min_count`` (default 1)
    from ``rule.config``.

    Returns:
        None when fewer than ``min_count`` profiles match; otherwise a dict
        with ``title``, ``message`` and ``details`` describing the five
        highest-scoring matches.
    """
    threshold = float(rule.config.get("min_risk_score", 75.0))
    required = int(rule.config.get("min_count", 1))

    matching = (
        db.query(TechniqueRiskProfile)
        .filter(TechniqueRiskProfile.risk_score >= threshold)
        .all()
    )
    count = len(matching)
    if count < required:
        return None

    # Highest scores first; the message previews three ids, details keep five.
    matching.sort(key=lambda profile: profile.risk_score, reverse=True)
    top = matching[:5]
    preview = ', '.join(str(p.technique_id)[:8] + '…' for p in top[:3])

    title = f"{count} technique(s) with risk score ≥ {threshold:.0f}"
    message = (
        f"{count} technique(s) have reached risk score ≥ {threshold:.0f}. "
        f"Top: {preview}."
    )
    details = {
        "count": count,
        "threshold": threshold,
        "top_ids": [str(p.technique_id) for p in top],
        "top_scores": [p.risk_score for p in top],
    }
    return {"title": title, "message": message, "details": details}
|
||||
|
||||
|
||||
def _eval_stale_technique(db: Session, rule: AlertRule) -> Optional[dict]:
    """Check for validated techniques whose last review is older than the cutoff.

    Reads ``days_stale`` (default 30) and ``min_count`` (default 1) from
    ``rule.config``. The cutoff is the current naive UTC time minus
    ``days_stale`` days.

    Returns:
        None when fewer than ``min_count`` techniques match; otherwise a
        dict with ``title``, ``message`` and ``details`` (up to ten example
        ids).
    """
    config = rule.config
    days_stale = int(config.get("days_stale", 30))
    required = int(config.get("min_count", 1))
    cutoff = datetime.utcnow() - timedelta(days=days_stale)

    # NOTE(review): rows with a NULL last_review_date do not satisfy "< cutoff"
    # in SQL, so never-reviewed techniques are presumably excluded — confirm
    # that is intended.
    stale_rows = (
        db.query(Technique)
        .filter(
            Technique.status_global == TechniqueStatus.validated,
            Technique.last_review_date < cutoff,
        )
        .all()
    )
    count = len(stale_rows)
    if count < required:
        return None

    title = f"{count} validated technique(s) stale for {days_stale}+ days"
    message = (
        f"{count} technique(s) have been validated but not reviewed in over "
        f"{days_stale} days. Re-validate to maintain confidence."
    )
    details = {
        "count": count,
        "days_stale": days_stale,
        "example_ids": [str(tech.id) for tech in stale_rows[:10]],
    }
    return {"title": title, "message": message, "details": details}
|
||||
|
||||
|
||||
def _eval_coverage_regression(db: Session, rule: AlertRule) -> Optional[dict]:
    """Compare the two most recent posture snapshots for a coverage drop.

    Reads ``min_drop_pct`` (default 5.0) from ``rule.config``.

    Returns:
        None when fewer than two snapshots exist or the drop is smaller
        than ``min_drop_pct``; otherwise a dict with ``title``, ``message``
        and ``details`` describing the drop.
    """
    min_drop = float(rule.config.get("min_drop_pct", 5.0))

    newest_two = (
        db.query(PostureSnapshot)
        .order_by(PostureSnapshot.snapshot_date.desc())
        .limit(2)
        .all()
    )
    if len(newest_two) < 2:
        return None

    # Query is newest-first, so index 0 is the latest snapshot.
    latest, previous = newest_two
    before_pct = previous.coverage_pct
    after_pct = latest.coverage_pct
    drop = before_pct - after_pct
    if drop < min_drop:
        return None

    title = f"Coverage dropped {drop:.1f}% ({before_pct:.1f}% → {after_pct:.1f}%)"
    message = (
        f"Overall coverage fell by {drop:.1f} percentage points "
        f"between {previous.snapshot_date} and {latest.snapshot_date}. "
        f"Investigate recent technique status changes."
    )
    details = {
        "previous_pct": before_pct,
        "current_pct": after_pct,
        "drop_pct": round(drop, 2),
        "previous_date": str(previous.snapshot_date),
        "current_date": str(latest.snapshot_date),
    }
    return {"title": title, "message": message, "details": details}
|
||||
|
||||
|
||||
def _eval_low_coverage(db: Session, rule: AlertRule) -> Optional[dict]:
    """Check whether overall coverage has fallen below the configured floor.

    Coverage is computed over all techniques as
    ``(validated + 0.5 * partial) / total * 100``.
    Reads ``max_coverage_pct`` (default 30.0) from ``rule.config``.

    Returns:
        None when there are no techniques or coverage is at or above the
        threshold; otherwise a dict with ``title``, ``message`` and
        ``details``.
    """
    max_pct = float(rule.config.get("max_coverage_pct", 30.0))
    techniques = db.query(Technique).all()
    total = len(techniques)
    if total == 0:
        return None  # nothing to measure; also avoids division by zero

    validated = sum(1 for t in techniques if t.status_global == TechniqueStatus.validated)
    partial = sum(1 for t in techniques if t.status_global == TechniqueStatus.partial)
    coverage = (validated + partial * 0.5) / total * 100.0

    # Fire only when coverage is strictly below the threshold. The alert text
    # states coverage is "below" the threshold, which would be false at
    # exactly the threshold value.
    if coverage >= max_pct:
        return None

    return {
        "title": f"Coverage is critically low: {coverage:.1f}%",
        "message": (
            f"Current detection coverage is {coverage:.1f}%, below the minimum "
            f"threshold of {max_pct:.0f}%. Prioritise coverage improvements."
        ),
        "details": {
            "coverage_pct": round(coverage, 2),
            "threshold": max_pct,
            "validated": validated,
            "partial": partial,
            "total": total,
        },
    }
|
||||
|
||||
|
||||
def _eval_expiry_wave(db: Session, rule: AlertRule) -> Optional[dict]:
    """Check the size of the revalidation queue backlog.

    Counts queue items whose status is pending or in progress and compares
    the count with ``min_pending_count`` (default 15) from ``rule.config``.

    Returns:
        None when the backlog is smaller than the threshold; otherwise a
        dict with ``title``, ``message`` and ``details``.
    """
    threshold = int(rule.config.get("min_pending_count", 15))

    open_states = [QueueStatus.pending, QueueStatus.in_progress]
    backlog = (
        db.query(RevalidationQueueItem)
        .filter(RevalidationQueueItem.status.in_(open_states))
        .count()
    )
    if backlog < threshold:
        return None

    title = f"Revalidation queue backlog: {backlog} items pending"
    message = (
        f"{backlog} technique(s) are waiting in the revalidation queue "
        f"(threshold: {threshold}). Assign analysts to clear the backlog."
    )
    details = {"pending_count": backlog, "threshold": threshold}
    return {"title": title, "message": message, "details": details}
|
||||
|
||||
|
||||
def _eval_new_technique(db: Session, rule: AlertRule) -> Optional[dict]:
    """Check for techniques whose MITRE modification date is recent.

    Reads ``lookback_days`` (default 7) and ``min_count`` (default 1) from
    ``rule.config``. A technique matches when ``mitre_last_modified`` is on
    or after the current naive UTC time minus ``lookback_days`` days.

    Returns:
        None when fewer than ``min_count`` techniques match; otherwise a
        dict with ``title``, ``message`` and ``details`` (ids capped at 20).
    """
    config = rule.config
    lookback_days = int(config.get("lookback_days", 7))
    required = int(config.get("min_count", 1))
    cutoff = datetime.utcnow() - timedelta(days=lookback_days)

    recent = (
        db.query(Technique)
        .filter(Technique.mitre_last_modified >= cutoff)
        .all()
    )
    count = len(recent)
    if count < required:
        return None

    # Details list at most twenty techniques; ``count`` still reports them all.
    sample = recent[:20]
    title = f"{count} new/updated MITRE technique(s) in last {lookback_days} days"
    message = (
        f"{count} ATT&CK technique(s) have been added or updated in the last "
        f"{lookback_days} days. Review and assign coverage."
    )
    details = {
        "count": count,
        "lookback_days": lookback_days,
        "technique_ids": [str(tech.id) for tech in sample],
        "mitre_ids": [tech.mitre_id for tech in sample],
    }
    return {"title": title, "message": message, "details": details}
|
||||
|
||||
|
||||
def _eval_orphan_spike(db: Session, rule: AlertRule) -> Optional[dict]:
    """Check how many techniques appear to have no owner.

    The orphan count is the number of techniques minus the number of
    ownership rows with a non-null ``owner_id``, floored at zero. It is
    compared with ``min_orphan_count`` (default 20) from ``rule.config``.

    Returns:
        None when the orphan count is below the threshold; otherwise a dict
        with ``title``, ``message`` and ``details``.
    """
    threshold = int(rule.config.get("min_orphan_count", 20))

    total = db.query(Technique).count()
    # NOTE(review): this counts ownership rows, not distinct techniques. If a
    # technique can have more than one ownership row, orphans are
    # under-reported — verify against the TechniqueOwnership schema.
    owned = (
        db.query(TechniqueOwnership)
        .filter(TechniqueOwnership.owner_id.isnot(None))
        .count()
    )
    orphans = max(total - owned, 0)
    if orphans < threshold:
        return None

    title = f"{orphans} unowned techniques detected"
    message = (
        f"{orphans} out of {total} technique(s) have no assigned owner. "
        f"Assign ownership to ensure accountability."
    )
    details = {
        "orphan_count": orphans,
        "total": total,
        "threshold": threshold,
    }
    return {"title": title, "message": message, "details": details}
|
||||
|
||||
|
||||
# Dispatch table: rule_type string -> evaluator function. A rule type with no
# entry here (e.g. "custom") is skipped by evaluate_all_rules().
_EVALUATORS = {
    kind.value: evaluator
    for kind, evaluator in (
        (AlertRuleType.high_risk, _eval_high_risk),
        (AlertRuleType.stale_technique, _eval_stale_technique),
        (AlertRuleType.coverage_regression, _eval_coverage_regression),
        (AlertRuleType.low_coverage, _eval_low_coverage),
        (AlertRuleType.expiry_wave, _eval_expiry_wave),
        (AlertRuleType.new_technique, _eval_new_technique),
        (AlertRuleType.orphan_spike, _eval_orphan_spike),
    )
}
|
||||
|
||||
|
||||
# ── Core evaluation engine ────────────────────────────────────────────────────
|
||||
|
||||
def _in_cooldown(rule: AlertRule) -> bool:
|
||||
if rule.last_fired_at is None:
|
||||
return False
|
||||
if rule.cooldown_hours <= 0:
|
||||
return False
|
||||
return datetime.utcnow() < rule.last_fired_at + timedelta(hours=rule.cooldown_hours)
|
||||
|
||||
|
||||
def evaluate_all_rules(db: Session) -> dict:
    """Evaluate every enabled rule; create AlertInstances for those that fire.

    For each enabled ``AlertRule`` that is not in cooldown and has a
    registered evaluator, the evaluator is called. A non-None result becomes
    a new open ``AlertInstance`` and stamps ``rule.last_fired_at``.

    Each fired alert is committed immediately, and the session is rolled back
    when an evaluator raises. This keeps one failing rule from leaving the
    session in a failed transaction state for the remaining rules, and the
    rollback cannot discard alerts created by earlier rules.

    Returns:
        A dict with ``rules_evaluated`` (number of enabled rules loaded,
        including those skipped for cooldown), ``alerts_fired``, ``alerts``
        (the created instances) and ``duration_seconds``.
    """
    t0 = time.monotonic()
    rules = db.query(AlertRule).filter(AlertRule.is_enabled == True).all()  # noqa: E712

    fired: List[AlertInstance] = []
    for rule in rules:
        if _in_cooldown(rule):
            continue
        evaluator = _EVALUATORS.get(rule.rule_type)
        if not evaluator:
            continue

        # Read the identifiers before running the evaluator so the error log
        # below never has to touch the database on a broken transaction.
        rule_id, rule_name = rule.id, rule.name
        try:
            result = evaluator(db, rule)
        except Exception:
            log.exception("Error evaluating rule %s (%s)", rule_id, rule_name)
            # Reset the session; a failed statement can otherwise poison
            # every later query and the commit.
            db.rollback()
            continue

        if result is None:
            continue  # condition not met

        instance = AlertInstance(
            rule_id=rule.id,
            rule_name=rule.name,
            rule_type=rule.rule_type,
            severity=rule.severity,
            title=result["title"],
            message=result["message"],
            details=result.get("details"),
            status=AlertStatus.open.value,
        )
        db.add(instance)
        rule.last_fired_at = datetime.utcnow()
        # Persist right away so a later rollback cannot drop this alert.
        db.commit()
        fired.append(instance)

    # Reload the instances so their attributes are populated for the caller.
    for inst in fired:
        db.refresh(inst)

    return {
        "rules_evaluated": len(rules),
        "alerts_fired": len(fired),
        "alerts": fired,
        "duration_seconds": round(time.monotonic() - t0, 3),
    }
|
||||
|
||||
|
||||
# ── AlertRule CRUD ────────────────────────────────────────────────────────────
|
||||
|
||||
def list_rules(
    db: Session,
    rule_type: Optional[str] = None,
    include_disabled: bool = False,
) -> List[AlertRule]:
    """Return alert rules ordered by creation time, oldest first.

    Args:
        db: Database session to query.
        rule_type: When truthy, only rules of this type are returned.
        include_disabled: When False (default), disabled rules are excluded.
    """
    conditions = []
    if rule_type:
        conditions.append(AlertRule.rule_type == rule_type)
    if not include_disabled:
        conditions.append(AlertRule.is_enabled == True)  # noqa: E712

    query = db.query(AlertRule).filter(*conditions)
    return query.order_by(AlertRule.created_at.asc()).all()
|
||||
|
||||
|
||||
def get_rule(db: Session, rule_id: UUID) -> AlertRule:
    """Fetch a single alert rule by primary key.

    Raises:
        EntityNotFoundError: if no rule with ``rule_id`` exists.
    """
    found = db.query(AlertRule).filter(AlertRule.id == rule_id).first()
    if found:
        return found
    raise EntityNotFoundError("AlertRule", str(rule_id))
|
||||
|
||||
|
||||
def create_rule(db: Session, created_by: UUID, **kwargs) -> AlertRule:
    """Create and persist a custom (non-system) alert rule.

    ``is_system`` is always forced to False and ``created_by`` is always set
    from the argument, overriding any values supplied in ``kwargs``. The
    remaining keyword arguments are passed to the ``AlertRule`` constructor.

    Returns:
        The newly committed and refreshed rule.
    """
    fields = {**kwargs, "is_system": False, "created_by": created_by}
    new_rule = AlertRule(**fields)
    db.add(new_rule)
    db.commit()
    db.refresh(new_rule)
    return new_rule
|
||||
|
||||
|
||||
def update_rule(db: Session, rule_id: UUID, **kwargs) -> AlertRule:
    """Apply a partial update to an existing alert rule.

    Only keyword arguments whose value is not None are written to the rule;
    a None value means "leave unchanged", so a field cannot be cleared to
    None through this function.

    Raises:
        EntityNotFoundError: if the rule does not exist (via ``get_rule``).
    """
    target = get_rule(db, rule_id)
    changes = {field: value for field, value in kwargs.items() if value is not None}
    for field, value in changes.items():
        setattr(target, field, value)
    db.commit()
    db.refresh(target)
    return target
|
||||
|
||||
|
||||
def delete_rule(db: Session, rule_id: UUID) -> None:
    """Delete a custom alert rule.

    Raises:
        EntityNotFoundError: if the rule does not exist (via ``get_rule``).
        BusinessRuleViolation: if the rule is a system rule; those can only
            be disabled, not deleted.
    """
    target = get_rule(db, rule_id)
    if not target.is_system:
        db.delete(target)
        db.commit()
        return
    raise BusinessRuleViolation("System rules cannot be deleted. Disable them instead.")
|
||||
|
||||
|
||||
# ── AlertInstance CRUD ────────────────────────────────────────────────────────
|
||||
|
||||
def list_instances(
    db: Session,
    status: Optional[str] = None,
    severity: Optional[str] = None,
    rule_type: Optional[str] = None,
    limit: int = 50,
    offset: int = 0,
) -> List[AlertInstance]:
    """Return a page of alert instances, newest first.

    Each of ``status``, ``severity`` and ``rule_type`` adds an equality
    filter when truthy. ``offset`` and ``limit`` are applied after ordering
    by ``created_at`` descending.
    """
    optional_filters = (
        (AlertInstance.status, status),
        (AlertInstance.severity, severity),
        (AlertInstance.rule_type, rule_type),
    )

    query = db.query(AlertInstance)
    for column, wanted in optional_filters:
        if wanted:
            query = query.filter(column == wanted)

    ordered = query.order_by(AlertInstance.created_at.desc())
    return ordered.offset(offset).limit(limit).all()
|
||||
|
||||
|
||||
def get_instance(db: Session, instance_id: UUID) -> AlertInstance:
    """Fetch a single alert instance by primary key.

    Raises:
        EntityNotFoundError: if no instance with ``instance_id`` exists.
    """
    found = db.query(AlertInstance).filter(AlertInstance.id == instance_id).first()
    if found:
        return found
    raise EntityNotFoundError("AlertInstance", str(instance_id))
|
||||
|
||||
|
||||
def _transition(
    db: Session,
    instance_id: UUID,
    new_status: str,
    user_id: Optional[UUID] = None,
) -> AlertInstance:
    """Move an alert instance to ``new_status`` and persist the change.

    Bookkeeping depends on the target status:
    acknowledged -> records ``acknowledged_by`` and ``acknowledged_at``;
    resolved     -> records ``resolved_at``;
    any other    -> only the status itself is changed.

    No validation of the transition is done here; callers check that the
    current status allows it.

    Raises:
        EntityNotFoundError: if the instance does not exist.
    """
    alert = get_instance(db, instance_id)
    alert.status = new_status

    is_ack = new_status == AlertStatus.acknowledged.value
    if is_ack:
        alert.acknowledged_by = user_id
        alert.acknowledged_at = datetime.utcnow()
    if not is_ack and new_status == AlertStatus.resolved.value:
        alert.resolved_at = datetime.utcnow()

    db.commit()
    db.refresh(alert)
    return alert
|
||||
|
||||
|
||||
def acknowledge(db: Session, instance_id: UUID, user_id: UUID) -> AlertInstance:
    """Acknowledge an open alert on behalf of ``user_id``.

    Raises:
        EntityNotFoundError: if the alert does not exist.
        BusinessRuleViolation: if the alert is not currently open.
    """
    current = get_instance(db, instance_id).status
    if current == AlertStatus.open.value:
        return _transition(db, instance_id, AlertStatus.acknowledged.value, user_id)
    raise BusinessRuleViolation(f"Cannot acknowledge alert in status '{current}'.")
|
||||
|
||||
|
||||
def resolve(db: Session, instance_id: UUID, user_id: UUID) -> AlertInstance:
    """Mark an alert as resolved.

    Any status other than "resolved" may be resolved.

    Raises:
        EntityNotFoundError: if the alert does not exist.
        BusinessRuleViolation: if the alert is already resolved.
    """
    resolved = AlertStatus.resolved.value
    already_resolved = get_instance(db, instance_id).status == resolved
    if already_resolved:
        raise BusinessRuleViolation("Alert is already resolved.")
    return _transition(db, instance_id, resolved, user_id)
|
||||
|
||||
|
||||
def dismiss(db: Session, instance_id: UUID, user_id: UUID) -> AlertInstance:
    """Dismiss an alert that is neither resolved nor already dismissed.

    Raises:
        EntityNotFoundError: if the alert does not exist.
        BusinessRuleViolation: if the alert is resolved or already dismissed.
    """
    closed_states = (AlertStatus.resolved.value, AlertStatus.dismissed.value)
    current = get_instance(db, instance_id).status
    if current in closed_states:
        raise BusinessRuleViolation(f"Cannot dismiss alert in status '{current}'.")
    return _transition(db, instance_id, AlertStatus.dismissed.value, user_id)
|
||||
|
||||
|
||||
def get_summary(db: Session) -> dict:
    """Build the alert dashboard summary.

    Counts are computed in the database with a single GROUP BY over
    (status, severity, rule_type) instead of loading every AlertInstance row
    into memory, so the cost no longer grows with the size of the alert
    history.

    Returns:
        A dict with ``total_open``, ``total_acknowledged``,
        ``total_resolved``, ``by_severity`` (pre-seeded with every
        AlertSeverity value), ``by_rule_type`` (only types that have alerts)
        and ``recent_alerts`` (the five newest open alerts).
    """
    # Local import so this block does not depend on the file's import header.
    from sqlalchemy import func

    by_status = {s.value: 0 for s in AlertStatus}
    by_severity = {s.value: 0 for s in AlertSeverity}
    by_type: dict = {}

    grouped = (
        db.query(
            AlertInstance.status,
            AlertInstance.severity,
            AlertInstance.rule_type,
            func.count(AlertInstance.id),
        )
        .group_by(
            AlertInstance.status,
            AlertInstance.severity,
            AlertInstance.rule_type,
        )
        .all()
    )
    # Fold each (status, severity, rule_type) bucket into the three
    # one-dimensional counters.
    for status, severity, rule_type, bucket_size in grouped:
        by_status[status] = by_status.get(status, 0) + bucket_size
        by_severity[severity] = by_severity.get(severity, 0) + bucket_size
        by_type[rule_type] = by_type.get(rule_type, 0) + bucket_size

    recent = (
        db.query(AlertInstance)
        .filter(AlertInstance.status == AlertStatus.open.value)
        .order_by(AlertInstance.created_at.desc())
        .limit(5)
        .all()
    )

    return {
        "total_open": by_status.get(AlertStatus.open.value, 0),
        "total_acknowledged": by_status.get(AlertStatus.acknowledged.value, 0),
        "total_resolved": by_status.get(AlertStatus.resolved.value, 0),
        "by_severity": by_severity,
        "by_rule_type": by_type,
        "recent_alerts": recent,
    }
|
||||
Reference in New Issue
Block a user