refactor(detection-rules): extract query/business logic to detection_rule_service, router is thin HTTP adapter

This commit is contained in:
2026-02-19 17:39:31 +01:00
parent d305db8794
commit 560fc0c9f0
7 changed files with 5853 additions and 282 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -1,31 +1,32 @@
"""Detection rules endpoints — listing, filtering, and template association. """Detection rules endpoints — listing, filtering, and template association.
Thin HTTP adapter: delegates all query and business logic to detection_rule_service.
Provides endpoints for browsing detection rules, querying rules by technique, Provides endpoints for browsing detection rules, querying rules by technique,
and managing the template ↔ detection rule associations. and managing the template ↔ detection rule associations.
""" """
import logging
import uuid import uuid
from typing import Optional from typing import Optional
from datetime import datetime
from fastapi import APIRouter, Depends, HTTPException, Query from fastapi import APIRouter, Depends, Query
from pydantic import BaseModel from pydantic import BaseModel
from sqlalchemy import func
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from app.database import get_db from app.database import get_db
from app.dependencies.auth import get_current_user, require_role, require_any_role from app.dependencies.auth import get_current_user, require_role, require_any_role
from app.models.user import User from app.models.user import User
from app.models.detection_rule import DetectionRule from app.services.detection_rule_service import (
from app.models.test_template import TestTemplate list_rules,
from app.models.test_template_detection_rule import TestTemplateDetectionRule get_rules_for_template,
from app.models.test_detection_result import TestDetectionResult auto_associate_rules,
get_rules_for_test,
evaluate_rule,
)
# --------------------------------------------------------------------------- # ── Pydantic schemas for request validation ────────────────────────────
# Pydantic schemas for request validation
# ---------------------------------------------------------------------------
class DetectionRuleEvaluate(BaseModel): class DetectionRuleEvaluate(BaseModel):
"""Payload for evaluating a detection rule against a test.""" """Payload for evaluating a detection rule against a test."""
@@ -34,14 +35,12 @@ class DetectionRuleEvaluate(BaseModel):
triggered: Optional[bool] = None triggered: Optional[bool] = None
notes: Optional[str] = None notes: Optional[str] = None
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/detection-rules", tags=["detection-rules"]) router = APIRouter(prefix="/detection-rules", tags=["detection-rules"])
# --------------------------------------------------------------------------- # ── GET /detection-rules — List with filters ───────────────────────────
# GET /detection-rules — List with filters
# ---------------------------------------------------------------------------
@router.get("") @router.get("")
def list_detection_rules( def list_detection_rules(
@@ -55,54 +54,19 @@ def list_detection_rules(
current_user: User = Depends(get_current_user), current_user: User = Depends(get_current_user),
): ):
"""List detection rules with optional filters and pagination.""" """List detection rules with optional filters and pagination."""
query = db.query(DetectionRule).filter(DetectionRule.is_active == True) # noqa: E712 return list_rules(
db,
if technique: technique=technique,
query = query.filter(DetectionRule.mitre_technique_id == technique) source=source,
severity=severity,
if source: search=search,
query = query.filter(DetectionRule.source == source) offset=offset,
limit=limit,
if severity: )
query = query.filter(DetectionRule.severity == severity)
if search:
from app.utils import escape_like
pattern = f"%{escape_like(search)}%"
query = query.filter(
DetectionRule.title.ilike(pattern)
| DetectionRule.description.ilike(pattern)
)
total = query.count()
items = query.order_by(DetectionRule.mitre_technique_id, DetectionRule.title).offset(offset).limit(limit).all()
return {
"total": total,
"offset": offset,
"limit": limit,
"items": [
{
"id": str(r.id),
"mitre_technique_id": r.mitre_technique_id,
"title": r.title,
"description": r.description,
"source": r.source,
"source_url": r.source_url,
"rule_format": r.rule_format,
"severity": r.severity,
"platforms": r.platforms or [],
"log_sources": r.log_sources,
"is_active": r.is_active,
}
for r in items
],
}
# --------------------------------------------------------------------------- # ── GET /detection-rules/for-template/{template_id} ────────────────────
# GET /test-templates/{id}/detection-rules — Rules for a template
# ---------------------------------------------------------------------------
@router.get("/for-template/{template_id}") @router.get("/for-template/{template_id}")
def get_detection_rules_for_template( def get_detection_rules_for_template(
@@ -111,46 +75,11 @@ def get_detection_rules_for_template(
current_user: User = Depends(get_current_user), current_user: User = Depends(get_current_user),
): ):
"""Get detection rules associated with a test template.""" """Get detection rules associated with a test template."""
template = db.query(TestTemplate).filter(TestTemplate.id == template_id).first() return get_rules_for_template(db, template_id)
if not template:
raise HTTPException(status_code=404, detail="Test template not found")
associations = (
db.query(TestTemplateDetectionRule)
.filter(TestTemplateDetectionRule.test_template_id == template_id)
.all()
)
rules = []
for assoc in associations:
r = assoc.detection_rule
rules.append({
"id": str(r.id),
"mitre_technique_id": r.mitre_technique_id,
"title": r.title,
"description": r.description,
"source": r.source,
"source_url": r.source_url,
"rule_content": r.rule_content,
"rule_format": r.rule_format,
"severity": r.severity,
"platforms": r.platforms or [],
"log_sources": r.log_sources,
"is_primary": assoc.is_primary,
})
return {
"template_id": str(template.id),
"template_name": template.name,
"mitre_technique_id": template.mitre_technique_id,
"rules": rules,
"total": len(rules),
}
# --------------------------------------------------------------------------- # ── POST /detection-rules/auto-associate ────────────────────────────────
# POST /detection-rules/auto-associate — Auto-link templates ↔ rules
# ---------------------------------------------------------------------------
@router.post("/auto-associate") @router.post("/auto-associate")
def auto_associate_detection_rules( def auto_associate_detection_rules(
@@ -163,60 +92,11 @@ def auto_associate_detection_rules(
technique and create associations. Rules with severity >= high are marked technique and create associations. Rules with severity >= high are marked
as primary. as primary.
""" """
templates = db.query(TestTemplate).filter(TestTemplate.is_active == True).all() # noqa: E712 return auto_associate_rules(db)
rules = db.query(DetectionRule).filter(DetectionRule.is_active == True).all() # noqa: E712
# Index rules by technique
rules_by_technique: dict[str, list] = {}
for rule in rules:
tid = rule.mitre_technique_id
if tid not in rules_by_technique:
rules_by_technique[tid] = []
rules_by_technique[tid].append(rule)
created = 0
skipped = 0
high_severities = {"high", "critical"}
for template in templates:
matching_rules = rules_by_technique.get(template.mitre_technique_id, [])
for rule in matching_rules:
# Check if association already exists
existing = (
db.query(TestTemplateDetectionRule)
.filter(
TestTemplateDetectionRule.test_template_id == template.id,
TestTemplateDetectionRule.detection_rule_id == rule.id,
)
.first()
)
if existing:
skipped += 1
continue
is_primary = (rule.severity or "").lower() in high_severities
assoc = TestTemplateDetectionRule(
test_template_id=template.id,
detection_rule_id=rule.id,
is_primary=is_primary,
)
db.add(assoc)
created += 1
db.commit()
total = db.query(TestTemplateDetectionRule).count()
return {
"created": created,
"skipped": skipped,
"total_associations": total,
}
# --------------------------------------------------------------------------- # ── GET /detection-rules/for-test/{test_id} ──────────────────────────────
# GET /detection-rules/for-test/{test_id} — Rules + results for a test
# ---------------------------------------------------------------------------
@router.get("/for-test/{test_id}") @router.get("/for-test/{test_id}")
def get_detection_rules_for_test( def get_detection_rules_for_test(
@@ -229,83 +109,11 @@ def get_detection_rules_for_test(
Finds rules by matching the test's technique_id to detection rules, Finds rules by matching the test's technique_id to detection rules,
and returns any existing evaluation results. and returns any existing evaluation results.
""" """
from app.models.test import Test return get_rules_for_test(db, test_id)
from app.models.technique import Technique
test = db.query(Test).filter(Test.id == test_id).first()
if not test:
raise HTTPException(status_code=404, detail="Test not found")
technique = db.query(Technique).filter(Technique.id == test.technique_id).first()
if not technique:
raise HTTPException(status_code=404, detail="Technique not found")
# Get detection rules for this technique
rules = (
db.query(DetectionRule)
.filter(
DetectionRule.mitre_technique_id == technique.mitre_id,
DetectionRule.is_active == True, # noqa: E712
)
.order_by(DetectionRule.severity.desc(), DetectionRule.title)
.all()
)
# Get existing results for this test
existing_results = (
db.query(TestDetectionResult)
.filter(TestDetectionResult.test_id == test_id)
.all()
)
results_map = {str(r.detection_rule_id): r for r in existing_results}
items = []
triggered_count = 0
evaluated_count = 0
for rule in rules:
result = results_map.get(str(rule.id))
triggered = result.triggered if result else None
notes = result.notes if result else None
evaluated_at = result.evaluated_at.isoformat() if result and result.evaluated_at else None
if triggered is not None:
evaluated_count += 1
if triggered:
triggered_count += 1
items.append({
"id": str(rule.id),
"mitre_technique_id": rule.mitre_technique_id,
"title": rule.title,
"description": rule.description,
"source": rule.source,
"source_url": rule.source_url,
"rule_content": rule.rule_content,
"rule_format": rule.rule_format,
"severity": rule.severity,
"platforms": rule.platforms or [],
"log_sources": rule.log_sources,
"triggered": triggered,
"notes": notes,
"evaluated_at": evaluated_at,
"result_id": str(result.id) if result else None,
})
return {
"test_id": str(test.id),
"mitre_technique_id": technique.mitre_id,
"rules": items,
"total": len(items),
"evaluated": evaluated_count,
"triggered": triggered_count,
"detection_rate": round(triggered_count / evaluated_count * 100, 1) if evaluated_count > 0 else 0,
}
# --------------------------------------------------------------------------- # ── POST /detection-rules/evaluate ──────────────────────────────────────
# POST /detection-rules/evaluate — Save detection result for a rule
# ---------------------------------------------------------------------------
@router.post("/evaluate") @router.post("/evaluate")
def evaluate_detection_rule( def evaluate_detection_rule(
@@ -314,60 +122,11 @@ def evaluate_detection_rule(
current_user: User = Depends(require_any_role("blue_tech", "blue_lead")), current_user: User = Depends(require_any_role("blue_tech", "blue_lead")),
): ):
"""Save or update the evaluation result for a detection rule on a test.""" """Save or update the evaluation result for a detection rule on a test."""
test_id = payload.test_id return evaluate_rule(
detection_rule_id = payload.detection_rule_id db,
triggered = payload.triggered test_id=payload.test_id,
notes = payload.notes detection_rule_id=payload.detection_rule_id,
triggered=payload.triggered,
# Check test exists notes=payload.notes,
from app.models.test import Test evaluator_id=current_user.id,
test = db.query(Test).filter(Test.id == test_id).first()
if not test:
raise HTTPException(status_code=404, detail="Test not found")
# Check rule exists
rule = db.query(DetectionRule).filter(DetectionRule.id == detection_rule_id).first()
if not rule:
raise HTTPException(status_code=404, detail="Detection rule not found")
# Upsert result
existing = (
db.query(TestDetectionResult)
.filter(
TestDetectionResult.test_id == test_id,
TestDetectionResult.detection_rule_id == detection_rule_id,
)
.first()
) )
if existing:
existing.triggered = triggered
existing.notes = notes
existing.evaluated_by = current_user.id
existing.evaluated_at = datetime.utcnow()
db.commit()
db.refresh(existing)
return {
"id": str(existing.id),
"triggered": existing.triggered,
"notes": existing.notes,
"evaluated_at": existing.evaluated_at.isoformat() if existing.evaluated_at else None,
}
else:
result = TestDetectionResult(
test_id=test_id,
detection_rule_id=detection_rule_id,
triggered=triggered,
notes=notes,
evaluated_by=current_user.id,
evaluated_at=datetime.utcnow(),
)
db.add(result)
db.commit()
db.refresh(result)
return {
"id": str(result.id),
"triggered": result.triggered,
"notes": result.notes,
"evaluated_at": result.evaluated_at.isoformat() if result.evaluated_at else None,
}

View File

@@ -0,0 +1,319 @@
"""Detection rule data service.
Extracts query and business logic from the detection_rules router so
that the router remains a thin HTTP adapter.
This module is framework-agnostic: no FastAPI imports.
"""
from __future__ import annotations
from datetime import datetime
from typing import Any
from sqlalchemy.orm import Session
from app.domain.errors import EntityNotFoundError
from app.models.detection_rule import DetectionRule
from app.models.test import Test
from app.models.test_template import TestTemplate
from app.models.test_template_detection_rule import TestTemplateDetectionRule
from app.models.test_detection_result import TestDetectionResult
from app.models.technique import Technique
from app.utils import escape_like
# ── Public service functions ──────────────────────────────────────────
def list_rules(
    db: Session,
    *,
    technique: str | None = None,
    source: str | None = None,
    severity: str | None = None,
    search: str | None = None,
    offset: int = 0,
    limit: int = 50,
) -> dict[str, Any]:
    """Return one page of active detection rules matching the given filters.

    ``technique``, ``source`` and ``severity`` are each applied as an exact
    match when truthy. ``search`` is escaped with ``escape_like`` and matched
    with ``ilike`` as a substring of either the title or the description.

    Returns a dict with ``total`` (row count before pagination is applied),
    the echoed ``offset`` and ``limit``, and ``items`` (serialised rules
    ordered by MITRE technique id, then title).
    """
    base = db.query(DetectionRule).filter(DetectionRule.is_active == True)  # noqa: E712

    # Exact-match filters: each one is skipped when its value is empty/None.
    exact_matches = (
        (DetectionRule.mitre_technique_id, technique),
        (DetectionRule.source, source),
        (DetectionRule.severity, severity),
    )
    for column, wanted in exact_matches:
        if wanted:
            base = base.filter(column == wanted)

    if search:
        like = f"%{escape_like(search)}%"
        in_title = DetectionRule.title.ilike(like)
        in_description = DetectionRule.description.ilike(like)
        base = base.filter(in_title | in_description)

    # Count first so "total" reflects every match, not just the current page.
    match_count = base.count()
    ordered = base.order_by(DetectionRule.mitre_technique_id, DetectionRule.title)
    page = ordered.offset(offset).limit(limit).all()

    serialised = []
    for rule in page:
        serialised.append(
            {
                "id": str(rule.id),
                "mitre_technique_id": rule.mitre_technique_id,
                "title": rule.title,
                "description": rule.description,
                "source": rule.source,
                "source_url": rule.source_url,
                "rule_format": rule.rule_format,
                "severity": rule.severity,
                "platforms": rule.platforms or [],
                "log_sources": rule.log_sources,
                "is_active": rule.is_active,
            }
        )

    return {
        "total": match_count,
        "offset": offset,
        "limit": limit,
        "items": serialised,
    }
def get_rules_for_template(db: Session, template_id: str) -> dict[str, Any]:
    """Return the detection rules linked to one test template.

    The result carries the template's id, name and MITRE technique id, the
    serialised rules (each with the association's ``is_primary`` flag), and
    the number of rules under ``total``.

    Raises EntityNotFoundError if no template has the given id.
    """
    template = db.query(TestTemplate).filter(TestTemplate.id == template_id).first()
    if not template:
        raise EntityNotFoundError("Test template", template_id)

    links = (
        db.query(TestTemplateDetectionRule)
        .filter(TestTemplateDetectionRule.test_template_id == template_id)
        .all()
    )

    def _describe(link: Any) -> dict[str, Any]:
        # The rule is reached through the association's relationship attribute.
        rule = link.detection_rule
        return {
            "id": str(rule.id),
            "mitre_technique_id": rule.mitre_technique_id,
            "title": rule.title,
            "description": rule.description,
            "source": rule.source,
            "source_url": rule.source_url,
            "rule_content": rule.rule_content,
            "rule_format": rule.rule_format,
            "severity": rule.severity,
            "platforms": rule.platforms or [],
            "log_sources": rule.log_sources,
            "is_primary": link.is_primary,
        }

    rule_payloads = [_describe(link) for link in links]

    return {
        "template_id": str(template.id),
        "template_name": template.name,
        "mitre_technique_id": template.mitre_technique_id,
        "rules": rule_payloads,
        "total": len(rule_payloads),
    }
def auto_associate_rules(db: Session) -> dict[str, Any]:
    """Auto-associate test templates with detection rules by MITRE technique ID.

    For each active template, every active detection rule with the same
    ``mitre_technique_id`` is linked to it unless that (template, rule) pair
    is already associated. Rules whose severity is "high" or "critical"
    (case-insensitive) are marked as primary.

    Commits the session internally.

    Returns a dict with ``created`` (new associations added), ``skipped``
    (pairs that were already associated) and ``total_associations`` (row
    count of the association table after the commit).
    """
    templates = db.query(TestTemplate).filter(TestTemplate.is_active == True).all()  # noqa: E712
    rules = db.query(DetectionRule).filter(DetectionRule.is_active == True).all()  # noqa: E712

    # Group rules by technique so each template needs only a dict lookup.
    rules_by_technique: dict[str, list] = {}
    for rule in rules:
        rules_by_technique.setdefault(rule.mitre_technique_id, []).append(rule)

    # Load every existing (template, rule) pair once, instead of issuing one
    # SELECT per candidate pair inside the nested loop below.
    linked_pairs = {
        (linked_template_id, linked_rule_id)
        for linked_template_id, linked_rule_id in db.query(
            TestTemplateDetectionRule.test_template_id,
            TestTemplateDetectionRule.detection_rule_id,
        ).all()
    }

    created = 0
    skipped = 0
    high_severities = {"high", "critical"}

    for template in templates:
        for rule in rules_by_technique.get(template.mitre_technique_id, ()):
            pair = (template.id, rule.id)
            if pair in linked_pairs:
                skipped += 1
                continue

            db.add(
                TestTemplateDetectionRule(
                    test_template_id=template.id,
                    detection_rule_id=rule.id,
                    is_primary=(rule.severity or "").lower() in high_severities,
                )
            )
            # Remember the pair so it is never queued twice within this run.
            linked_pairs.add(pair)
            created += 1

    db.commit()

    total = db.query(TestTemplateDetectionRule).count()
    return {
        "created": created,
        "skipped": skipped,
        "total_associations": total,
    }
def get_rules_for_test(db: Session, test_id: str) -> dict[str, Any]:
    """Return the active detection rules for a test's technique, with results.

    Rules are selected by matching ``DetectionRule.mitre_technique_id`` against
    the ``mitre_id`` of the test's technique. Each rule is paired with the
    stored evaluation result for this test, if one exists. The response also
    reports how many rules were evaluated, how many triggered, and the
    detection rate as a percentage rounded to one decimal (0 when nothing has
    been evaluated).

    Raises EntityNotFoundError if the test or its technique does not exist.
    """
    test = db.query(Test).filter(Test.id == test_id).first()
    if not test:
        raise EntityNotFoundError("Test", str(test_id))

    technique = db.query(Technique).filter(Technique.id == test.technique_id).first()
    if not technique:
        raise EntityNotFoundError("Technique", str(test.technique_id))

    active_rules = (
        db.query(DetectionRule)
        .filter(
            DetectionRule.mitre_technique_id == technique.mitre_id,
            DetectionRule.is_active == True,  # noqa: E712
        )
        .order_by(DetectionRule.severity.desc(), DetectionRule.title)
        .all()
    )

    stored_results = (
        db.query(TestDetectionResult)
        .filter(TestDetectionResult.test_id == test_id)
        .all()
    )
    # Keyed by stringified rule id so lookups line up with str(rule.id) below.
    result_by_rule_id = {}
    for stored in stored_results:
        result_by_rule_id[str(stored.detection_rule_id)] = stored

    rule_payloads = []
    evaluated_count = 0
    triggered_count = 0

    for rule in active_rules:
        outcome = result_by_rule_id.get(str(rule.id))
        if outcome:
            triggered = outcome.triggered
            notes = outcome.notes
            evaluated_at = outcome.evaluated_at.isoformat() if outcome.evaluated_at else None
            result_id = str(outcome.id)
        else:
            triggered = None
            notes = None
            evaluated_at = None
            result_id = None

        # A result whose "triggered" is None counts as not yet evaluated.
        if triggered is not None:
            evaluated_count += 1
            if triggered:
                triggered_count += 1

        rule_payloads.append({
            "id": str(rule.id),
            "mitre_technique_id": rule.mitre_technique_id,
            "title": rule.title,
            "description": rule.description,
            "source": rule.source,
            "source_url": rule.source_url,
            "rule_content": rule.rule_content,
            "rule_format": rule.rule_format,
            "severity": rule.severity,
            "platforms": rule.platforms or [],
            "log_sources": rule.log_sources,
            "triggered": triggered,
            "notes": notes,
            "evaluated_at": evaluated_at,
            "result_id": result_id,
        })

    if evaluated_count > 0:
        detection_rate = round(triggered_count / evaluated_count * 100, 1)
    else:
        detection_rate = 0

    return {
        "test_id": str(test.id),
        "mitre_technique_id": technique.mitre_id,
        "rules": rule_payloads,
        "total": len(rule_payloads),
        "evaluated": evaluated_count,
        "triggered": triggered_count,
        "detection_rate": detection_rate,
    }
def evaluate_rule(
    db: Session,
    *,
    test_id: Any,
    detection_rule_id: Any,
    triggered: bool | None,
    notes: str | None,
    evaluator_id: Any,
) -> dict[str, Any]:
    """Create or overwrite the evaluation of one detection rule for one test.

    If a result already exists for the (test, rule) pair its fields are
    overwritten; otherwise a new result row is added. In both cases the
    evaluator id and the current ``datetime.utcnow()`` are recorded, and the
    session is committed and the row refreshed before returning.

    Returns a dict with the result's ``id``, ``triggered``, ``notes`` and
    ISO-formatted ``evaluated_at`` (None when no timestamp is set).

    Raises EntityNotFoundError if the test or detection rule does not exist.
    """
    test = db.query(Test).filter(Test.id == test_id).first()
    if not test:
        raise EntityNotFoundError("Test", str(test_id))

    rule = db.query(DetectionRule).filter(DetectionRule.id == detection_rule_id).first()
    if not rule:
        raise EntityNotFoundError("Detection rule", str(detection_rule_id))

    record = (
        db.query(TestDetectionResult)
        .filter(
            TestDetectionResult.test_id == test_id,
            TestDetectionResult.detection_rule_id == detection_rule_id,
        )
        .first()
    )

    if record:
        # Update path: overwrite the previous evaluation in place.
        record.triggered = triggered
        record.notes = notes
        record.evaluated_by = evaluator_id
        record.evaluated_at = datetime.utcnow()
    else:
        # Insert path: first evaluation of this rule for this test.
        record = TestDetectionResult(
            test_id=test_id,
            detection_rule_id=detection_rule_id,
            triggered=triggered,
            notes=notes,
            evaluated_by=evaluator_id,
            evaluated_at=datetime.utcnow(),
        )
        db.add(record)

    db.commit()
    db.refresh(record)

    stamp = record.evaluated_at.isoformat() if record.evaluated_at else None
    return {
        "id": str(record.id),
        "triggered": record.triggered,
        "notes": record.notes,
        "evaluated_at": stamp,
    }

394
docs/ADR.md Normal file
View File

@@ -0,0 +1,394 @@
# Aegis — Architecture Decision Records (ADR)
> **Date:** February 11, 2026
> **Status:** All decisions are **Accepted** and currently in effect.
---
## Index
| ADR | Title | Status |
|-----|-------|--------|
| [ADR-001](#adr-001-fastapi-as-backend-framework) | FastAPI as Backend Framework | Accepted |
| [ADR-002](#adr-002-postgresql-with-jsonb-as-primary-database) | PostgreSQL with JSONB as Primary Database | Accepted |
| [ADR-003](#adr-003-minio-for-evidence-storage) | MinIO for Evidence Storage | Accepted |
| [ADR-004](#adr-004-docker-compose-for-deployment) | Docker Compose for Deployment | Accepted |
| [ADR-005](#adr-005-modular-monolith-over-microservices) | Modular Monolith over Microservices | Accepted |
| [ADR-006](#adr-006-apscheduler-in-process-over-external-job-system) | APScheduler In-Process over External Job System | Accepted |
---
## ADR-001: FastAPI as Backend Framework
**Date:** Project inception
**Status:** Accepted
### Context
Aegis is an internal security platform for managing MITRE ATT&CK coverage through Red/Blue team validation workflows. The backend must:
- Expose a REST API consumed by a React SPA (21 pages, 80+ endpoints).
- Handle CRUD operations for 18+ domain entities with complex filtering and joins.
- Support file uploads (evidence) and streaming downloads (CSV/JSON exports).
- Integrate with external APIs (MITRE TAXII 2.0, GitHub REST, D3FEND REST).
- Enforce RBAC authorization across 6 roles.
- Be developed and maintained by a small team requiring fast iteration.
- Run in a containerized environment with Python as the team's primary language.
### Decision
We chose **FastAPI** as the backend framework, served by **Uvicorn** (ASGI).
Key factors:
- **Automatic OpenAPI/Swagger** generation from type hints reduces documentation burden for 80+ endpoints.
- **Pydantic integration** provides request/response validation with zero boilerplate, critical for a schema-heavy domain (test workflows, scoring payloads, compliance data).
- **`Depends()` system** provides clean dependency injection for auth, DB sessions, and role checks without a third-party DI container.
- **Async-capable** but allows synchronous route handlers, which matters because SQLAlchemy (sync) is the ORM and all external data imports are CPU/IO-bound synchronous operations.
- **Performance** is sufficient for an internal tool (< 100 concurrent users) without needing Go/Rust-level throughput.
- **Python ecosystem** gives direct access to `taxii2-client`, `pySigma`, `boto3`, `PyYAML`, and `toml` — all required for the 8 external data source integrations.
### Consequences
**Positive:**
- Swagger UI available in development (`/docs`) for rapid API exploration and testing.
- Pydantic schemas act as living documentation for the API contract.
- `Depends()` chain for `get_db` → `get_current_user` → `require_role()` is concise and composable.
- `python-jose` + `passlib` integrate naturally for JWT/bcrypt auth.
- SlowAPI integrates directly with FastAPI for rate limiting.
**Negative:**
- The `Depends()` system encourages passing `db: Session` directly into route handlers, which has led to routers containing raw SQLAlchemy queries instead of delegating to a service/repository layer (see ADR analysis — 11 of 21 routers query the DB directly).
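- Synchronous (`def`) route handlers run in FastAPI's threadpool, so long operations (MITRE sync ZIP downloads can take 30+ seconds) tie up a worker thread for their full duration — and would block the event loop outright if called from an `async def` handler; mitigated by Nginx proxy timeout of 300s.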
- No built-in background task system beyond `BackgroundTasks` (which is request-scoped), requiring APScheduler for scheduled jobs (see ADR-006).
**Risks:**
- FastAPI's ease of putting logic in route handlers has contributed to "fat controllers" — this is a developer discipline issue, not a framework limitation.
### Alternatives Considered
| Alternative | Reason Rejected |
|------------|-----------------|
| **Django + DRF** | Heavier ORM opinions, admin panel unnecessary, slower startup. Django's ORM lacks SQLAlchemy's flexibility with JSONB and complex joins. |
| **Flask + Flask-RESTful** | No built-in validation, no auto-generated OpenAPI, manual Swagger setup. Would require marshmallow or similar for schema validation. |
| **Go (Gin/Echo)** | Team's primary expertise is Python. The 8 data source integrations rely heavily on Python libraries (pySigma, taxii2-client, PyYAML). |
| **NestJS (Node.js)** | Would split the team across two runtimes. Python libraries for STIX/TAXII and Sigma rule parsing have no mature Node.js equivalents. |
---
## ADR-002: PostgreSQL with JSONB as Primary Database
**Date:** Project inception
**Status:** Accepted
### Context
Aegis manages a complex relational domain: techniques have tests, tests belong to campaigns, threat actors map to techniques, compliance controls map to techniques, detection rules map to techniques and tests. This is a deeply relational model with 18+ tables and many-to-many relationships.
However, several entities also carry semi-structured data that varies by source:
- **Audit logs** — `details` field contains arbitrary action metadata (different structure per action type).
- **Threat actors** — `aliases`, `target_sectors`, `target_regions`, `references` are variable-length arrays/objects from STIX 2.0 bundles.
- **Detection rules** — `platforms` (array), `log_sources` (object with varying keys like `product`, `service`, `category`).
- **Data sources** — `last_sync_stats` (object with import-specific counters), `config` (source-specific configuration).
- **Techniques** — `platforms` (array of OS names from ATT&CK).
- **Campaigns** — `tags` (user-defined array).
This data is imported from external sources with varying schemas (STIX JSON, Sigma YAML, Elastic TOML) and must be stored without rigid column definitions.
### Decision
We chose **PostgreSQL 15** as the primary database, using its native **JSONB** column type for semi-structured fields alongside traditional relational columns for the core domain.
The schema is managed by **Alembic** (18 migration versions) with **SQLAlchemy** ORM using `sqlalchemy.dialects.postgresql.JSONB`.
### Consequences
**Positive:**
- Relational integrity enforced with foreign keys for the core domain (test → technique, campaign → test, evidence → test, etc.).
- JSONB columns store variable-structure data without schema migrations when external sources change their format.
- JSONB supports GIN indexing for efficient containment queries (`@>` operator) on arrays like `platforms` and `target_sectors`.
- Single database to operate — no need for a separate document store.
- PostgreSQL's mature ecosystem: `pg_dump` for backups, `pg_isready` for health checks, extensive monitoring tooling.
- SQLAlchemy's `JSONB` type allows Python dict/list access with full query support.
**Negative:**
- JSONB fields bypass ORM-level validation — the schema for `details`, `config`, `references` etc. is only enforced by application code (Pydantic schemas on input), not by the database.
- Complex queries mixing relational joins with JSONB containment can be harder to optimize and debug.
- No GIN indexes are currently defined in migrations for JSONB columns, meaning array containment queries may perform full scans on large datasets.
- JSONB fields in audit logs make structured querying across action types difficult (e.g., "find all audit entries where details.old_state = 'draft'").
**Risks:**
- As JSONB usage grows, the boundary between "should be a column" and "should be JSONB" can blur. Currently well-contained to arrays and metadata fields.
### Alternatives Considered
| Alternative | Reason Rejected |
|------------|-----------------|
| **PostgreSQL without JSONB** | Would require separate junction tables for every array field (technique_platforms, actor_aliases, actor_sectors, etc.), adding 10+ tables for data that is always read as a whole array. |
| **MongoDB** | The core domain is deeply relational (techniques ↔ tests ↔ campaigns ↔ threat actors). Modeling this in MongoDB would require denormalization, embedded documents, or manual reference integrity — trading JSONB flexibility for relational integrity loss. |
| **PostgreSQL + MongoDB (dual)** | Operational complexity of two database systems is unjustified for the current JSONB usage (~12 columns across 6 tables). |
| **MySQL 8 with JSON** | PostgreSQL's JSONB is binary-indexed and faster for containment queries. MySQL's JSON type is text-based with function-based indexing. PostgreSQL also has superior support for UUID primary keys (native type vs BINARY(16)). |
---
## ADR-003: MinIO for Evidence Storage
**Date:** Project inception
**Status:** Accepted
### Context
The Red/Blue team validation workflow requires both teams to upload evidence files (screenshots, log files, PCAPs, documents) to support their test findings. Requirements:
- Files range from small screenshots (KB) to large PCAPs (hundreds of MB).
- Files must be associated with specific tests and teams (red/blue).
- Files must be downloadable by authorized users via the browser.
- Storage must be independent from the application database (no BLOBs in PostgreSQL).
- The platform is deployed on-premise via Docker Compose — cloud-native S3 is not available.
- The upload/download API must be simple and well-supported in Python.
### Decision
We chose **MinIO** as an S3-compatible object storage system, accessed via **boto3** (AWS S3 SDK for Python).
Implementation details:
- A single `evidence` bucket is auto-created on backend startup (`ensure_bucket_exists()`).
- Files are uploaded with `put_object()` using a generated UUID-based key.
- Downloads use presigned URLs (`generate_presigned_url()`) with 1-hour expiration.
- The MinIO client is a module-level singleton in `storage.py`.
- Evidence metadata (filename, MIME type, size, team, test association) is stored in PostgreSQL; only the binary content lives in MinIO.
### Consequences
**Positive:**
- S3-compatible API means zero code changes if migrating to AWS S3, GCS, or any S3-compatible service.
- boto3 is the most mature and well-documented S3 client library in Python.
- Presigned URLs offload download bandwidth from the backend — the browser fetches directly from MinIO.
- Binary data stays out of PostgreSQL, keeping the database lean and backups fast.
- MinIO runs as a single Docker container with a persistent volume — simple to deploy and back up.
- MinIO Console (port 9001) provides a web UI for administrators to inspect stored files.
**Negative:**
- Presigned URLs currently point to `minio:9000` (Docker internal hostname), which is not accessible from the browser in production without additional Nginx configuration or a public MinIO endpoint.
- No file virus scanning or content validation before storage.
- No lifecycle policies configured (no automatic deletion of old evidence).
- The module-level singleton client means the MinIO connection configuration cannot be changed at runtime (acceptable for the current deployment model).
**Risks:**
- If the MinIO container is lost and the volume is not backed up, all evidence files are permanently lost. Evidence metadata in PostgreSQL would then reference non-existent files.
### Alternatives Considered
| Alternative | Reason Rejected |
|------------|-----------------|
| **PostgreSQL BYTEA/BLOB** | Storing binary files in the database bloats backups, degrades query performance, and makes streaming large files complex. PostgreSQL is not designed as a file store. |
| **Local filesystem** | Not portable across container restarts without host volume mounts. No presigned URL support, requiring the backend to proxy all downloads. No built-in replication or management UI. |
| **AWS S3** | Requires cloud account and internet connectivity. The platform is designed for on-premise deployment where external cloud services may not be permitted. |
| **SeaweedFS** | Less mature ecosystem, smaller community. The S3-compatible layer is less complete than MinIO's. boto3 compatibility is not guaranteed. |
---
## ADR-004: Docker Compose for Deployment
**Date:** Project inception
**Status:** Accepted
### Context
Aegis is a multi-component platform deployed on-premise within organizations' security environments:
- 4 services: Frontend (Nginx), Backend (Uvicorn), PostgreSQL, MinIO.
- Target environments range from a single server to small clusters.
- Security teams typically have Docker available but may not have Kubernetes.
- The platform must be installable by a security engineer (not necessarily a DevOps specialist).
- Both development and production environments should use the same orchestration approach for consistency.
### Decision
We chose **Docker Compose** as the deployment and orchestration tool, with two compose files:
- `docker-compose.yml` — Development: source volumes mounted, dev servers, exposed ports.
- `docker-compose.prod.yml` — Production: multi-stage builds, Nginx serving static assets, only frontend port exposed, `SECRET_KEY` required.
Supporting infrastructure:
- `scripts/install.sh` — Interactive production installer that generates secrets, prompts for configuration, writes `.env`, and runs `docker compose up -d --build`.
- `scripts/init.sh` — Development setup that waits for services, runs migrations, and seeds data.
- All services connected via an `aegis-network` bridge network.
- Named volumes for PostgreSQL and MinIO data persistence.
- Health checks on PostgreSQL (`pg_isready`) and backend (`/health`).
- Service dependency ordering: backend waits for `postgres: service_healthy` and `minio: service_started`.
### Consequences
**Positive:**
- Single-command deployment: `docker compose -f docker-compose.prod.yml up -d --build`.
- The `install.sh` wizard makes production setup accessible to non-DevOps personnel.
- Consistent environments between development and production (same containers, same network topology).
- Named volumes survive container rebuilds — data persists across upgrades.
- No external dependencies beyond Docker and Docker Compose.
- Multi-stage Dockerfile for frontend produces a minimal Nginx image (~25MB) from a full Node.js build stage.
- Non-root user (`appuser`, UID 1001) in backend Dockerfile follows container security best practices.
**Negative:**
- No built-in horizontal scaling — running multiple backend instances requires manual Nginx upstream configuration and a shared token blacklist (currently in-memory).
- No rolling deployments — `docker compose up -d --build` causes brief downtime during image rebuilds.
- No built-in secrets management — secrets are in `.env` files on the host filesystem.
- No container orchestration beyond restart policies (`restart: always`).
- No centralized logging — each container logs to its own stdout/stderr.
**Risks:**
- Single point of failure: if the host machine goes down, all services go down.
- No automated backup strategy — `pg_dump` is documented but not automated.
### Alternatives Considered
| Alternative | Reason Rejected |
|------------|-----------------|
| **Kubernetes (k8s)** | Significantly higher operational complexity. Requires a cluster, kubectl expertise, Helm charts or manifests, ingress controllers, PVCs. Overkill for a single-server deployment targeting security teams. |
| **Docker Swarm** | Adds orchestration complexity with minimal benefit over Compose for < 5 services. The project does not need multi-node scheduling or service mesh. Swarm's future is uncertain compared to Compose V2. |
| **Bare metal / systemd** | Loses containerization benefits (isolation, reproducibility, dependency management). Would require manual installation of Python, Node.js, PostgreSQL, MinIO on each target system. |
| **Ansible + Docker** | Adds a configuration management layer that is unnecessary for a 4-service application. Could be valuable in the future for multi-server deployments but is premature now. |
---
## ADR-005: Modular Monolith over Microservices
**Date:** Project inception
**Status:** Accepted
### Context
Aegis has distinct functional domains that could theoretically be separate services:
- **Test Workflow** — Red/Blue validation state machine, evidence management.
- **Coverage Analytics** — Scoring engine, heatmaps, metrics, reports.
- **Data Import** — 8 external source integrations (MITRE, Sigma, Elastic, CALDERA, etc.).
- **Campaign Management** — Campaign lifecycle, scheduling, threat actor generation.
- **Compliance** — Framework mappings, gap analysis, control tracking.
- **User/Auth** — Authentication, RBAC, audit logging.
However:
- These domains share the same database and have tight data dependencies (e.g., scoring reads tests, techniques, detection rules, and D3FEND mappings in a single calculation).
- The development team is small.
- The deployment target is single-server Docker Compose.
- Latency between services would complicate the scoring engine (which aggregates across 5+ tables).
### Decision
We chose a **modular monolith** architecture: a single deployable backend process organized into internal modules (routers, services, models) rather than separate microservices.
Module boundaries:
- **Routers** (21 files) — HTTP endpoint definitions grouped by domain.
- **Services** (20 files) — Business logic grouped by capability (workflow, scoring, notifications, imports).
- **Models** (18 files) — ORM entities grouped by domain concept.
- **Schemas** (10 files) — Pydantic DTOs grouped by domain concept.
All modules share a single database, a single process, and a single deployment artifact.
### Consequences
**Positive:**
- No network overhead between domains — scoring can join 5+ tables in a single SQL query.
- Single deployment artifact simplifies CI/CD, monitoring, and debugging.
- Shared database means ACID transactions across domains (e.g., creating a test + logging the audit entry + sending a notification in one commit).
- No service discovery, API gateways, circuit breakers, or distributed tracing needed.
- Faster development iteration — change any module, rebuild one container.
**Negative:**
- All domains scale together — cannot scale the data import workers independently from the API.
- A bug in one module (e.g., a memory leak in scoring) can crash the entire application.
- Module boundaries are not enforced at the language level — routers currently import services and models freely across domains (e.g., `heatmap.py` imports 6 models from different domains).
- The monolith has grown to 21 routers and 20 services without explicit boundary enforcement, leading to "fat controllers" and cross-cutting concerns.
**Risks:**
- Without explicit module boundaries (enforced by code structure or linting rules), the modular monolith can degrade into a traditional monolith where everything depends on everything.
- The Clean Architecture refactor proposed in `ARCHITECTURAL_ANALYSIS.md` would restore module boundaries via the domain/application/infrastructure/presentation layers.
### Alternatives Considered
| Alternative | Reason Rejected |
|------------|-----------------|
| **Microservices** | The 8 data source integrations would each become a service, requiring inter-service communication for writing to the shared technique/rule tables. Scoring would need to call 3-4 services to gather data, adding latency and failure modes. Operational overhead (8+ containers, service mesh, distributed tracing) unjustified for a small team and single-server deployment. |
| **Microservices with shared DB** | Anti-pattern. Multiple services sharing a database lose the main benefit of microservices (independent deployment and schema evolution) while keeping the operational complexity. |
| **Modular monolith with enforced boundaries** | Not rejected, but deferred: this is the recommended evolution (see `ARCHITECTURAL_ANALYSIS.md`). The current implementation has module structure but no boundary enforcement. Adding domain-layer interfaces (Protocol/ABC), a repository pattern, and import linting rules would achieve this without a microservices migration. |
---
## ADR-006: APScheduler In-Process over External Job System
**Date:** Project inception
**Status:** Accepted
### Context
Aegis requires periodic background tasks:
| Task | Frequency | Duration | Description |
|------|-----------|----------|-------------|
| MITRE ATT&CK sync | Every 24 hours | 30-120 seconds | Download STIX/TAXII feed, upsert ~700 techniques |
| Intel scan | Every 7 days | 10-60 seconds | Scan threat intelligence sources |
| Notification cleanup | Every 24 hours | < 5 seconds | Delete read notifications older than 90 days |
| Coverage snapshot | Weekly (Sunday 00:00) | 5-30 seconds | Capture point-in-time coverage state across all techniques |
| Recurring campaigns | Every 24 hours | < 10 seconds | Check and spawn due recurring test campaigns |
Requirements:
- Jobs must access the same database as the API.
- Jobs must not block API request handling.
- No additional infrastructure should be required beyond what Docker Compose already provides.
- Job failure should not crash the API server.
- Jobs do not need distributed execution (single-server deployment).
### Decision
We chose **APScheduler** (`BackgroundScheduler`) running as an in-process thread within the FastAPI application.
Implementation details:
- The scheduler is started during FastAPI's `lifespan` startup event and shut down on application exit.
- Each job function creates its own `SessionLocal()` instance, independent from request-scoped sessions.
- All jobs use try/except/finally to ensure sessions are closed even on failure.
- Jobs are registered with `replace_existing=True` to handle server restarts cleanly.
- The scheduler is a module-level singleton in `jobs/mitre_sync_job.py`.
### Consequences
**Positive:**
- Zero additional infrastructure — no message broker, no worker containers, no job database.
- Jobs share the same Python process, so they can import services directly (`sync_mitre`, `scan_intel`, `create_snapshot`, etc.) without serialization or RPC.
- Simple debugging — job logs appear in the same stdout as API logs.
- Session isolation per job prevents interference with request-scoped transactions.
- `replace_existing=True` prevents duplicate job registrations on hot reload.
**Negative:**
- **No persistence:** If the server crashes mid-job, the job state is lost. There is no retry mechanism — the job simply runs again at the next scheduled interval.
- **No distributed execution:** Cannot run jobs on a separate worker node. If the API is under heavy load, jobs compete for the same CPU and memory.
- **No dead letter queue:** Failed jobs are logged but not queued for retry. A failed MITRE sync silently waits 24 hours before trying again.
- **No job history:** There is no record of when jobs last ran, how long they took, or whether they succeeded — only log lines.
- **Single-instance constraint:** If multiple backend instances are running (horizontal scaling), each instance runs its own scheduler, causing duplicate job execution (double MITRE sync, double snapshots, etc.).
- **No manual trigger via scheduler:** Admin-triggered syncs go through the API endpoints (`/api/v1/system/*`), bypassing the scheduler entirely. There are effectively two paths to the same operations.
**Risks:**
- The single-instance constraint is the most significant risk. If Aegis scales horizontally, APScheduler must be replaced or augmented with a distributed lock (e.g., PostgreSQL advisory locks or Redis-based locking).
### Alternatives Considered
| Alternative | Reason Rejected |
|------------|-----------------|
| **Celery + Redis/RabbitMQ** | Requires an additional broker container (Redis or RabbitMQ), a separate worker process, and Celery configuration. Significant operational overhead for 5 periodic tasks that each run for < 2 minutes. Would be justified if job volume grows or horizontal scaling is needed. |
| **Dramatiq + Redis** | Similar to Celery but lighter. Still requires a Redis container and a separate worker process. Same operational overhead concern. |
| **Cron jobs (host-level)** | Would require the host to have cron configured and scripts that call API endpoints or run Python commands inside the container. Breaks the "single Docker Compose" deployment model. Not portable. |
| **PostgreSQL `pg_cron`** | Runs inside the database, limited to SQL operations. Cannot execute Python logic (downloading ZIPs, parsing YAML, upserting with business rules). Would require stored procedures or external triggers. |
| **Kubernetes CronJobs** | Requires Kubernetes. Not applicable to the Docker Compose deployment model (see ADR-004). |
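| **APScheduler with JobStore (PostgreSQL)** | APScheduler supports persistent job stores, which would add job persistence across restarts. Note that in APScheduler 3.x a job store must not be shared between multiple schedulers, so solving the single-instance problem also requires a distributed lock (e.g., PostgreSQL advisory locks) or a scheduler version that supports shared stores. This is a viable evolution path — same library, minimal code change. **Recommended as the first upgrade when horizontal scaling is needed.** |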
---
## ADR Evolution Path
The following table summarizes when each decision should be revisited:
| ADR | Revisit When | Likely Evolution |
|-----|-------------|-----------------|
| ADR-001 (FastAPI) | Stable — no change needed | Add structured logging, OpenTelemetry tracing |
| ADR-002 (PostgreSQL + JSONB) | JSONB query performance degrades | Add GIN indexes on JSONB columns, evaluate moving high-query fields to dedicated columns |
| ADR-003 (MinIO) | Cloud deployment required | Swap boto3 endpoint to AWS S3 / GCS (zero code change) |
| ADR-004 (Docker Compose) | Multi-server deployment needed | Migrate to Kubernetes with Helm charts, or add Ansible playbooks |
| ADR-005 (Modular Monolith) | Team grows > 5 developers, or domains need independent scaling | Enforce boundaries first (Clean Architecture refactor), then extract high-traffic domains as services if needed |
| ADR-006 (APScheduler) | Horizontal scaling required, or jobs need retry/history | Add APScheduler PostgreSQL JobStore first; migrate to Celery if job complexity grows significantly |

View File

@@ -0,0 +1,228 @@
# Aegis — C4 Container Diagram (Level 2)
> **Author:** Architecture review
> **Date:** February 11, 2026
> **Notation:** C4 Model — Level 2 (Container Diagram)
---
## Diagram
```mermaid
C4Container
title Aegis — Container Diagram (C4 Level 2)
%% ─── Actors ─────────────────────────────────────────────────────
Person(security_team, "Security Team", "Red/Blue Technicians, Red/Blue Leads, Viewers — interact with the platform via browser")
Person(admin, "Administrator", "Manages users, triggers data syncs, configures scoring weights, reviews audit logs")
%% ─── System Boundary: Aegis Platform ────────────────────────────
Container_Boundary(aegis, "Aegis Platform") {
Container(frontend, "Frontend SPA", "React 19, TypeScript, Vite, Tailwind CSS, Nginx", "Single-page application served by Nginx in production. Provides dashboards, ATT&CK heatmaps, test workflows, campaign management, compliance views, and report exports. Proxies /api/ requests to backend.")
Container(backend, "Backend API", "Python 3.11, FastAPI, Uvicorn, SQLAlchemy", "REST API serving 21 router modules under /api/v1. Handles authentication (JWT + HttpOnly cookies), RBAC authorization (6 roles), Red/Blue test workflows, scoring engine, heatmap generation, report building, and CRUD for all domain entities. Rate-limited with SlowAPI.")
Container(scheduler, "Background Scheduler", "APScheduler (in-process)", "Runs inside the backend process as a BackgroundScheduler thread. Executes 5 periodic jobs: MITRE ATT&CK sync (24h), intel scan (7d), notification cleanup (24h), weekly coverage snapshot (Sundays 00:00), recurring campaigns check (24h). Each job manages its own DB session.")
ContainerDb(postgres, "PostgreSQL 15", "PostgreSQL, Alpine", "Primary relational data store. Holds techniques, tests, users, campaigns, threat actors, detection rules, compliance mappings, audit logs, notifications, coverage snapshots, and scoring configuration. Schema managed by Alembic migrations (18 versions).")
ContainerDb(minio, "MinIO", "MinIO (S3-compatible), Alpine", "Object storage for Red/Blue team evidence files (screenshots, logs, PCAPs, documents). Stores files in the 'evidence' bucket. Backend generates presigned URLs for secure direct downloads.")
}
%% ─── External Systems ───────────────────────────────────────────
System_Ext(mitre_taxii, "MITRE ATT&CK TAXII Server", "STIX/TAXII 2.0 feed providing Enterprise ATT&CK techniques and tactics catalog")
System_Ext(mitre_cti, "MITRE CTI GitHub", "STIX 2.0 bundles: ATT&CK techniques (fallback), threat actors (intrusion-sets), actor-technique relationships")
System_Ext(d3fend, "MITRE D3FEND API", "REST API providing defensive techniques and ATT&CK-to-D3FEND countermeasure mappings")
System_Ext(atomic, "Atomic Red Team", "GitHub repository with 1500+ atomic test YAML files mapped to ATT&CK techniques")
System_Ext(sigma, "SigmaHQ", "GitHub repository with Sigma detection rules in YAML, tagged with ATT&CK technique IDs")
System_Ext(elastic, "Elastic Detection Rules", "GitHub repository with Elastic SIEM rules in TOML format with MITRE threat mappings")
System_Ext(caldera, "MITRE CALDERA", "GitHub repository with CALDERA abilities in YAML, organized by tactic")
System_Ext(lolbas, "LOLBAS / GTFOBins", "GitHub repositories for Living Off The Land binaries (Windows) and GTFOBins (Linux)")
%% ─── Planned Systems ────────────────────────────────────────────
System_Ext(github_actions, "GitHub Actions (Planned)", "Future CI/CD: lint, type check, pytest, Docker build, deploy")
System_Ext(artifactory, "Artifactory (Planned)", "Future artifact repository for Docker images and versioned build artifacts")
%% ─── Relationships: Users → Containers ──────────────────────────
Rel(security_team, frontend, "Uses", "HTTPS / Browser")
Rel(admin, frontend, "Uses", "HTTPS / Browser")
%% ─── Relationships: Frontend → Backend ──────────────────────────
Rel(frontend, backend, "Proxies API requests to", "HTTP (Nginx reverse proxy to backend:8000/api/)")
%% ─── Relationships: Backend → Data Stores ───────────────────────
Rel(backend, postgres, "Reads/writes domain data", "TCP/5432, SQLAlchemy ORM")
Rel(backend, minio, "Uploads/downloads evidence files", "HTTP/9000, boto3 S3 API")
%% ─── Relationships: Scheduler → Data Stores ─────────────────────
Rel(scheduler, postgres, "Reads/writes via own sessions", "TCP/5432, SQLAlchemy")
%% ─── Relationships: Backend/Scheduler → External Sources ────────
Rel(scheduler, mitre_taxii, "Syncs techniques every 24h", "TAXII 2.0 / HTTPS")
Rel(backend, mitre_cti, "Imports threat actors + fallback sync", "HTTPS, ZIP download")
Rel(backend, d3fend, "Imports D3FEND techniques and mappings", "REST API / HTTPS")
Rel(backend, atomic, "Imports atomic test templates", "HTTPS, ZIP ~40MB")
Rel(backend, sigma, "Imports Sigma detection rules", "HTTPS, ZIP download")
Rel(backend, elastic, "Imports Elastic detection rules", "HTTPS, ZIP download")
Rel(backend, caldera, "Imports CALDERA abilities", "HTTPS, ZIP download")
Rel(backend, lolbas, "Imports LOLBAS and GTFOBins", "HTTPS, ZIP download")
%% ─── Relationships: Planned ─────────────────────────────────────
Rel(github_actions, backend, "Builds, tests, deploys (planned)", "HTTPS")
Rel(github_actions, frontend, "Builds, deploys (planned)", "HTTPS")
Rel(github_actions, artifactory, "Pushes Docker images (planned)", "HTTPS")
UpdateLayoutConfig($c4ShapeInRow="3", $c4BoundaryInRow="1")
```
---
## Container Responsibilities
### Frontend SPA
| Attribute | Detail |
|-----------|--------|
| **Technology** | React 19, TypeScript 5.9, Vite 7.3, Tailwind CSS 4, React Router 7 |
| **Runtime (Dev)** | Node 20 + Vite dev server on port 5173 |
| **Runtime (Prod)** | Nginx Alpine serving static build artifacts on port 80 |
| **State Management** | AuthContext (React Context) + TanStack React Query for server state |
| **API Communication** | Axios client with `withCredentials: true` (HttpOnly JWT cookie) |
| **Security** | CSP headers, X-Frame-Options: DENY, X-Content-Type-Options: nosniff, gzip compression |
| **Responsibilities** | Render UI (21 pages, 30+ components), route protection by role, lazy loading, API proxy via Nginx `/api/` → `backend:8000/api/` |
### Backend API
| Attribute | Detail |
|-----------|--------|
| **Technology** | Python 3.11, FastAPI, Uvicorn, SQLAlchemy, Alembic, Pydantic |
| **Runtime** | Uvicorn ASGI server on port 8000 (behind Nginx proxy) |
| **API Surface** | 21 routers, 80+ endpoints under `/api/v1` |
| **Auth** | JWT (HS256) in HttpOnly cookie, bcrypt passwords, in-memory token blacklist |
| **RBAC** | 6 roles: admin, red_tech, blue_tech, red_lead, blue_lead, viewer |
| **Rate Limiting** | SlowAPI (5 req/min on login) |
| **Error Handling** | Global handlers for ValidationError → 400, SQLAlchemyError → 500, Exception → 500 |
| **Responsibilities** | All business logic, test workflow state machine, scoring engine, heatmap generation, report building, CRUD, data import orchestration, audit logging |
### Background Scheduler
| Attribute | Detail |
|-----------|--------|
| **Technology** | APScheduler `BackgroundScheduler` (runs in-process within backend) |
| **Lifecycle** | Starts on FastAPI lifespan startup, shuts down on app shutdown |
| **Session Model** | Each job creates and closes its own `SessionLocal()` instance |
| **Registered Jobs** | See table below |
| Job | Trigger | Frequency | Action |
|-----|---------|-----------|--------|
| `mitre_sync` | Interval | Every 24 hours | Syncs ATT&CK techniques via TAXII 2.0 (fallback: GitHub ZIP) |
| `intel_scan` | Interval | Every 7 days | Scans threat intelligence sources for new indicators |
| `notification_cleanup` | Interval | Every 24 hours | Deletes read notifications older than 90 days |
| `weekly_snapshot` | Cron | Sundays at 00:00 | Creates coverage snapshot, cleans up old ones (keeps last 52) |
| `recurring_campaigns` | Interval | Every 24 hours | Checks and spawns due recurring test campaigns |
### PostgreSQL 15
| Attribute | Detail |
|-----------|--------|
| **Image** | `postgres:15-alpine` |
| **Database** | `attackdb` |
| **Schema Management** | Alembic with 18 migration versions |
| **Connection** | `postgresql://user:pass@postgres:5432/attackdb` via SQLAlchemy |
| **Volumes** | Named volume `aegis_postgres_data_prod` for persistence |
| **Health Check** | `pg_isready` every 5 seconds |
| **Data Stored** | Techniques (ATT&CK), tests (Red/Blue workflow), users, campaigns, threat actors, detection rules (Sigma/Elastic), D3FEND mappings, compliance frameworks, audit logs, notifications, coverage snapshots, scoring config, intel items, data sources, evidence metadata |
### MinIO (S3-compatible)
| Attribute | Detail |
|-----------|--------|
| **Image** | `minio/minio:latest` |
| **Ports** | 9000 (S3 API), 9001 (admin console) |
| **Bucket** | `evidence` (auto-created on backend startup) |
| **Access** | Via boto3 S3 API from backend |
| **Volumes** | Named volume `aegis_minio_data_prod` for persistence |
| **Responsibilities** | Store Red/Blue team evidence files (screenshots, logs, PCAPs). Backend generates time-limited presigned URLs for secure browser downloads. |
### GitHub Actions (Planned)
| Attribute | Detail |
|-----------|--------|
| **Status** | Not yet implemented — no `.github/workflows/` directory exists |
| **Planned Scope** | Lint (ruff/flake8), type check (mypy), unit/integration tests (pytest), Docker image build, deploy to staging/production |
| **Integration** | Would trigger on push/PR to main branch |
| **Artifact Flow** | Build Docker images → push to Artifactory → deploy via compose |
### Artifactory (Planned)
| Attribute | Detail |
|-----------|--------|
| **Status** | Not yet implemented — no integration code exists |
| **Planned Scope** | Docker image registry for versioned backend/frontend images |
| **Integration** | Receive images from GitHub Actions CI pipeline, serve to production deploy |
---
## Network Topology
```
Internet
│ HTTPS (:80 / :443)
┌─────────────────┐
│ Frontend │
│ Nginx + React │
│ :80 │
└────────┬────────┘
┌────────────┼────────────────────────────────┐
│ │ aegis-network (bridge) │
│ │ /api/ proxy │
│ ▼ │
│ ┌─────────────────┐ │
│ │ Backend API │◄── Scheduler │
│ │ FastAPI/Uvicorn │ (in-process thread) │
│ │ :8000 │ │
│ └───┬─────────┬──┘ │
│ │ │ │
│ │ │ │
│ ▼ ▼ │
│ ┌─────────┐ ┌───────┐ │
│ │PostgreSQL│ │ MinIO │ │
│ │ :5432 │ │ :9000 │ │
│ └─────────┘ └───────┘ │
│ │
└──────────────────────────────────────────────┘
│ HTTPS (outbound only)
External Data Sources
(MITRE, SigmaHQ, Elastic, etc.)
```
---
## Data Flow Summary
| Flow | Path | Protocol | Notes |
|------|------|----------|-------|
| User → UI | Browser → Nginx | HTTPS | Static SPA assets, gzip compressed, 1-year cache for static files |
| UI → API | Nginx → Uvicorn | HTTP (internal) | Reverse proxy with 300s timeout for long sync operations |
| API → DB | Uvicorn → PostgreSQL | TCP/5432 | SQLAlchemy ORM, request-scoped sessions via `get_db()` |
| API → Storage | Uvicorn → MinIO | HTTP/9000 | boto3 S3 API, presigned URLs for downloads |
| Scheduler → DB | APScheduler thread → PostgreSQL | TCP/5432 | Independent sessions per job, created/closed in try/finally |
| Scheduler → External | APScheduler thread → MITRE TAXII | HTTPS | Scheduled sync every 24h, fallback to GitHub ZIP |
| Admin → External | API on-demand → GitHub repos | HTTPS | ZIP download triggered by admin via `/api/v1/system/*` endpoints |
| Health Check | Docker → Backend `/health` | HTTP (internal) | Restricted to private IPs via Nginx `allow/deny` directives |

140
docs/C4_CONTEXT_DIAGRAM.md Normal file
View File

@@ -0,0 +1,140 @@
# Aegis — C4 Context Diagram (Level 1)
> **Author:** Architecture review
> **Date:** February 11, 2026
> **Notation:** C4 Model — Level 1 (System Context)
---
## Diagram
```mermaid
C4Context
title Aegis — System Context Diagram (C4 Level 1)
%% ─── Actors (People) ────────────────────────────────────────────
Person(red_tech, "Red Team Technician", "Executes offensive tests, submits evidence, creates tests from templates")
Person(blue_tech, "Blue Team Technician", "Evaluates detection results, submits blue evidence, documents findings")
Person(red_lead, "Red Team Lead", "Validates red team results, manages campaigns, reviews test outcomes")
Person(blue_lead, "Blue Team Lead", "Validates blue team results, manages remediation, reviews detection gaps")
Person(admin, "Administrator", "Manages users, triggers data syncs, configures scoring, oversees platform")
Person(viewer, "Viewer", "Read-only access to dashboards, reports, heatmaps, and compliance status")
%% ─── Core System ────────────────────────────────────────────────
System(aegis, "Aegis Platform", "MITRE ATT&CK coverage management platform. Orchestrates Red/Blue team validation workflows, tracks technique coverage, generates heatmaps, compliance reports, and organizational scoring.")
%% ─── Internal Infrastructure (Owned / Deployed) ─────────────────
SystemDb(postgres, "PostgreSQL 15", "Primary data store. Stores techniques, tests, users, campaigns, threat actors, compliance mappings, audit logs, scoring config, and snapshots.")
SystemDb(minio, "MinIO (S3-compatible)", "Object storage for Red/Blue team evidence files (screenshots, logs, PCAPs). Serves presigned download URLs.")
%% ─── External Data Sources (Consumed) ───────────────────────────
System_Ext(mitre_taxii, "MITRE ATT&CK TAXII Server", "STIX/TAXII 2.0 feed providing Enterprise ATT&CK techniques and tactics. Primary source for technique catalog sync.")
System_Ext(mitre_cti, "MITRE CTI GitHub Repository", "STIX 2.0 bundles for ATT&CK techniques (fallback), intrusion-sets (threat actors), and actor-technique relationships.")
System_Ext(d3fend, "MITRE D3FEND API", "Public REST API providing defensive techniques and ATT&CK-to-D3FEND mappings for countermeasure coverage.")
System_Ext(atomic, "Atomic Red Team (GitHub)", "Repository of atomic tests mapped to ATT&CK techniques. Downloaded as ZIP, parsed from YAML atomics.")
System_Ext(sigma, "SigmaHQ (GitHub)", "Repository of Sigma detection rules in YAML format. Parsed for ATT&CK tags and imported as detection rules.")
System_Ext(elastic, "Elastic Detection Rules (GitHub)", "Repository of Elastic SIEM rules in TOML format. Parsed for MITRE threat mappings and imported as detection rules.")
System_Ext(caldera, "MITRE CALDERA (GitHub)", "Repository of CALDERA abilities. YAML files parsed from data/abilities/ and imported as test templates.")
System_Ext(lolbas, "LOLBAS Project (GitHub)", "Living Off The Land Binaries and Scripts. YAML-based catalog imported as test templates mapped to ATT&CK techniques.")
System_Ext(gtfobins, "GTFOBins (GitHub)", "Unix binaries exploitation reference. Markdown with YAML front-matter parsed and mapped to ATT&CK techniques.")
%% ─── Planned Systems (Not Yet Integrated) ──────────────────────
System_Ext(github_ent, "GitHub Enterprise (Planned)", "Future CI/CD pipeline integration for automated linting, type checking, test execution, and deployment workflows.")
System_Ext(artifactory, "Artifactory (Planned)", "Future artifact repository for storing Docker images, build artifacts, and versioned releases.")
%% ─── Relationships: Users → Aegis ───────────────────────────────
Rel(red_tech, aegis, "Creates and executes tests, uploads red evidence, uses test catalog", "HTTPS")
Rel(blue_tech, aegis, "Evaluates detections, uploads blue evidence, reviews detection rules", "HTTPS")
Rel(red_lead, aegis, "Validates red results, manages campaigns, reviews threat actor coverage", "HTTPS")
Rel(blue_lead, aegis, "Validates blue results, tracks remediation, reviews compliance", "HTTPS")
Rel(admin, aegis, "Manages users, triggers syncs, configures scoring weights, views audit logs", "HTTPS")
Rel(viewer, aegis, "Views dashboards, heatmaps, reports, and compliance status", "HTTPS")
%% ─── Relationships: Aegis → Infrastructure ──────────────────────
Rel(aegis, postgres, "Reads/writes all domain data", "TCP/5432, SQLAlchemy")
Rel(aegis, minio, "Uploads/downloads evidence files, generates presigned URLs", "HTTP/9000, boto3 S3 API")
%% ─── Relationships: Aegis → External Sources ────────────────────
Rel(aegis, mitre_taxii, "Syncs ATT&CK techniques every 24h", "TAXII 2.0 / HTTPS")
Rel(aegis, mitre_cti, "Fallback technique sync + threat actor import", "HTTPS, ZIP download")
Rel(aegis, d3fend, "Imports defensive techniques and ATT&CK mappings", "REST API / HTTPS")
Rel(aegis, atomic, "Imports Atomic Red Team test templates", "HTTPS, ZIP download")
Rel(aegis, sigma, "Imports Sigma detection rules with ATT&CK tags", "HTTPS, ZIP download")
Rel(aegis, elastic, "Imports Elastic SIEM detection rules", "HTTPS, ZIP download")
Rel(aegis, caldera, "Imports CALDERA abilities as test templates", "HTTPS, ZIP download")
Rel(aegis, lolbas, "Imports LOLBAS binaries as test templates", "HTTPS, ZIP download")
Rel(aegis, gtfobins, "Imports GTFOBins as test templates", "HTTPS, ZIP download")
%% ─── Relationships: Aegis → Planned ─────────────────────────────
Rel(aegis, github_ent, "CI/CD pipelines (planned)", "HTTPS")
Rel(aegis, artifactory, "Artifact storage (planned)", "HTTPS")
UpdateLayoutConfig($c4ShapeInRow="3", $c4BoundaryInRow="1")
```
---
## Diagram Notes
### Actor Roles
| Role | Access Level | Primary Actions |
|------|-------------|-----------------|
| **Red Team Technician** | Standard | Create tests, execute attacks, upload red evidence, use test catalog |
| **Blue Team Technician** | Standard | Evaluate detections, upload blue evidence, review detection rules |
| **Red Team Lead** | Elevated | Validate red results, manage campaigns, review threat actor coverage |
| **Blue Team Lead** | Elevated | Validate blue results, track remediation, review compliance |
| **Administrator** | Full | User management, trigger data syncs, scoring config, audit logs |
| **Viewer** | Read-only | View dashboards, heatmaps, reports, compliance status |
### External Data Source Details
| Source | Protocol | Frequency | Data Imported |
|--------|----------|-----------|---------------|
| MITRE ATT&CK TAXII | STIX/TAXII 2.0 | Every 24 hours (scheduled) | Enterprise techniques and tactics |
| MITRE CTI GitHub | HTTPS (ZIP) | Fallback + on-demand | Techniques, threat actors (intrusion-sets), actor-technique relationships |
| MITRE D3FEND | REST API | On-demand (admin trigger) | Defensive techniques, ATT&CK-to-D3FEND mappings |
| Atomic Red Team | HTTPS (ZIP ~40MB) | On-demand (admin trigger) | Test templates from `atomics/T*/T*.yaml` |
| SigmaHQ | HTTPS (ZIP) | On-demand (admin trigger) | Sigma detection rules with ATT&CK tags |
| Elastic Detection Rules | HTTPS (ZIP) | On-demand (admin trigger) | Elastic SIEM rules in TOML with MITRE mappings |
| MITRE CALDERA | HTTPS (ZIP) | On-demand (admin trigger) | Abilities from `data/abilities/{tactic}/*.yml` |
| LOLBAS Project | HTTPS (ZIP) | On-demand (admin trigger) | Living Off The Land binaries/scripts |
| GTFOBins | HTTPS (ZIP) | On-demand (admin trigger) | Unix binary exploitation references |
### Planned Integrations (Not Yet Implemented)
| System | Purpose | Status |
|--------|---------|--------|
| **GitHub Enterprise** | CI/CD pipelines for automated lint, type check, tests, and deployment | Planned — no `.github/workflows` exist yet |
| **Artifactory** | Docker image and build artifact repository | Planned — no integration code exists yet |
### Infrastructure Boundary
```
┌─────────────────────────────────────────────┐
│ Docker Compose Network │
│ │
│ ┌──────────┐ ┌──────────┐ ┌───────────┐ │
│ │ Frontend │ │ Backend │ │ PostgreSQL│ │
│ │ (Nginx) │ │ (Uvicorn)│ │ 15 │ │
│ │ :80 │ │ :8000 │ │ :5432 │ │
│ └──────────┘ └──────────┘ └───────────┘ │
│ ┌───────────┐ │
│ │ MinIO │ │
│ │ :9000/9001│ │
│ └───────────┘ │
└─────────────────────────────────────────────┘
▲ │
│ HTTPS │ HTTPS (outbound)
│ ▼
Users External Sources
```

View File

@@ -0,0 +1,291 @@
# Aegis — Technology Justification
> **Document type:** Architecture Board Submission
> **Author:** Platform Architecture Team
> **Date:** February 11, 2026
> **Classification:** Internal
> **Status:** Approved
---
## 1. Purpose
This document provides a formal justification for the technology selections made in the Aegis platform. Each technology choice is evaluated against the project's operational requirements, organizational constraints, security posture, and long-term sustainability. This document is intended for review by the Architecture Board and serves as the authoritative reference for technology governance.
---
## 2. Project Context
Aegis is an internal security operations platform that manages MITRE ATT&CK technique coverage through structured Red Team / Blue Team validation workflows. The platform integrates with 9 external threat intelligence and detection rule sources, enforces role-based access for 6 distinct user roles, and provides coverage analytics including heatmaps, scoring, compliance mapping, and executive reporting.
### Operational Requirements
| Requirement | Detail |
|------------|--------|
| **Deployment model** | On-premise, single-server, air-gap compatible |
| **User base** | 10–100 concurrent security analysts and leads |
| **Data model** | 18+ relational entities with many-to-many relationships and semi-structured metadata |
| **External integrations** | 9 data sources (MITRE TAXII 2.0, GitHub REST, D3FEND REST, Sigma YAML, Elastic TOML, CALDERA YAML, LOLBAS YAML, GTFOBins Markdown, STIX 2.0 JSON) |
| **File storage** | Binary evidence files (screenshots, logs, PCAPs) ranging from KB to hundreds of MB |
| **Scheduled operations** | 5 periodic background jobs (24h–7d cycles) |
| **Security** | RBAC, JWT authentication, audit logging, evidence chain of custody |
### Organizational Constraints
| Constraint | Detail |
|-----------|--------|
| **Team expertise** | Primary competency in Python and TypeScript |
| **Target operators** | Security engineers, not DevOps specialists |
| **Infrastructure** | Docker available; Kubernetes not guaranteed |
| **Network** | Outbound HTTPS required for data source sync; inbound limited to platform UI |
| **Budget** | Open-source preference; no commercial license dependencies for core platform |
---
## 3. Backend Framework: FastAPI
### Selection: FastAPI 0.x (latest stable) with Uvicorn ASGI server
### Justification
FastAPI was selected as the backend framework based on four primary evaluation criteria: API development velocity, ecosystem compatibility, runtime performance, and developer experience.
**API Development Velocity.** Aegis exposes 80+ REST endpoints across 21 domain modules. FastAPI's automatic OpenAPI specification generation from Python type annotations eliminates the need for separate API documentation tooling. Pydantic integration provides request and response validation at the framework level, reducing boilerplate code for schema enforcement. The `Depends()` dependency injection system enables composable middleware chains for authentication, authorization, and database session management without requiring a third-party DI container.
**Ecosystem Compatibility.** The platform's 9 external data source integrations depend on Python-specific libraries with no mature equivalents in other ecosystems:
| Library | Purpose | Ecosystem |
|---------|---------|-----------|
| `taxii2-client` | STIX/TAXII 2.0 protocol | Python only |
| `pySigma` | Sigma rule parsing and transformation | Python only |
| `PyYAML` | YAML parsing (Atomic Red Team, CALDERA, LOLBAS) | Python preferred |
| `toml` | TOML parsing (Elastic detection rules) | Python preferred |
| `boto3` | S3-compatible storage API (MinIO) | Python preferred |
| `defusedxml` | Secure XML processing | Python preferred |
Selecting a non-Python backend would require reimplementing or wrapping these libraries, introducing significant engineering risk.
**Runtime Performance.** FastAPI's ASGI foundation provides asynchronous request handling capability. While the current implementation uses synchronous route handlers (due to SQLAlchemy's synchronous session model), the framework does not impose a performance ceiling for the target user base (10–100 concurrent users). Benchmark data from independent testing consistently places FastAPI among the highest-performing Python web frameworks.
**Developer Experience.** Interactive Swagger UI (`/docs`) and ReDoc (`/redoc`) are available in non-production environments, accelerating API exploration and frontend integration. These documentation endpoints are automatically disabled in production to reduce attack surface.
### Alternatives Evaluated
| Framework | Evaluation Summary | Disposition |
|-----------|-------------------|-------------|
| Django + Django REST Framework | Mature and feature-rich, but introduces heavier ORM opinions, an unnecessary admin panel, and slower cold-start times. Django's ORM lacks SQLAlchemy's flexibility for JSONB column handling and complex join patterns required by the scoring engine. | Rejected |
| Flask + Flask-RESTful | Lightweight but lacks built-in request validation, automatic OpenAPI generation, and dependency injection. Would require additional libraries (marshmallow, flask-apispec) to achieve parity with FastAPI's built-in capabilities. | Rejected |
| Go (Gin / Echo) | Superior raw throughput, but the team's primary expertise is Python. The 9 data source integrations depend on Python libraries with no Go equivalents. The development velocity loss would outweigh performance gains for a 10–100 user internal platform. | Rejected |
| NestJS (Node.js / TypeScript) | Would unify frontend and backend language, but splits runtime expertise. No mature Node.js equivalents for STIX/TAXII and Sigma rule parsing. The Python data science and security tooling ecosystem is substantially deeper. | Rejected |
---
## 4. Primary Database: PostgreSQL 15
### Selection: PostgreSQL 15 (Alpine) with SQLAlchemy ORM and Alembic migrations
### Justification
PostgreSQL was selected as the primary relational data store based on three requirements: relational integrity for a complex domain model, semi-structured data support for external source metadata, and operational maturity for on-premise deployment.
**Relational Integrity.** The Aegis data model comprises 18+ entities with deep relational dependencies: techniques relate to tests, tests belong to campaigns, campaigns map to threat actors, threat actors link to techniques, compliance controls map to techniques, and detection rules associate with both techniques and test templates. This graph of many-to-many relationships demands foreign key enforcement, transactional consistency, and efficient join operations — core strengths of a relational database.
**Semi-Structured Data (JSONB).** Several entities carry metadata with variable structure imported from external sources (STIX 2.0, Sigma YAML, Elastic TOML). PostgreSQL's native JSONB column type stores this data in a binary-indexed format that supports containment queries and GIN indexing, eliminating the need for a separate document store. Current JSONB usage is contained to 12 columns across 6 tables:
| Entity | JSONB Fields | Content |
|--------|-------------|---------|
| Technique | `platforms` | OS platform array from ATT&CK |
| Threat Actor | `aliases`, `target_sectors`, `target_regions`, `references` | STIX 2.0 metadata |
| Detection Rule | `platforms`, `log_sources` | Rule targeting metadata |
| Data Source | `last_sync_stats`, `config` | Import statistics and source-specific configuration |
| Campaign | `tags` | User-defined classification |
| Audit Log | `details` | Action-specific metadata (variable per action type) |
**Operational Maturity.** PostgreSQL 15 provides built-in health checking (`pg_isready`), mature backup tooling (`pg_dump`/`pg_restore`), extensive monitoring capabilities, and a 25+ year track record of production reliability. The Alpine-based Docker image is approximately 80MB, suitable for on-premise deployments with limited resources.
**Schema Management.** Alembic provides version-controlled database migrations (18 versions to date), enabling reproducible schema evolution and rollback capability.
### Alternatives Evaluated
| Database | Evaluation Summary | Disposition |
|----------|-------------------|-------------|
| MongoDB | The core domain is deeply relational. Modeling technique-test-campaign-actor relationships in MongoDB would require denormalization or manual reference integrity, trading the JSONB advantage for relational integrity loss. | Rejected |
| MySQL 8 (JSON) | PostgreSQL's JSONB is binary-indexed and faster for containment queries than MySQL's text-based JSON type. PostgreSQL also provides native UUID support (vs. BINARY(16) in MySQL), which aligns with the platform's UUID-based primary keys. | Rejected |
| PostgreSQL + MongoDB (dual) | The operational complexity of maintaining two database systems is unjustified for 12 JSONB columns. A dual-database architecture would also complicate transactional consistency across relational and document data. | Rejected |
---
## 5. Object Storage: MinIO
### Selection: MinIO (S3-compatible) accessed via boto3 (AWS S3 SDK for Python)
### Justification
MinIO was selected as the evidence storage system based on three requirements: S3 API compatibility for portability, on-premise deployment capability, and separation of binary data from the relational database.
**S3 API Compatibility.** MinIO implements the Amazon S3 API specification, accessed via the industry-standard `boto3` client library. This provides a zero-code-change migration path to AWS S3, Google Cloud Storage (via S3-compatible mode), or any other S3-compatible storage service should the deployment model change from on-premise to cloud. The storage interface (`upload_file`, `get_presigned_url`, `ensure_bucket_exists`) is a thin abstraction layer that is storage-backend agnostic.
**On-Premise Deployment.** The platform is designed for deployment within organizational security environments where external cloud storage services may not be permitted due to data classification or regulatory requirements. MinIO runs as a single Docker container with persistent volume storage, requiring no external dependencies or network egress for storage operations.
**Binary Data Separation.** Evidence files (screenshots, packet captures, log extracts) range from kilobytes to hundreds of megabytes. Storing binary data in PostgreSQL (BYTEA columns) would degrade database backup performance, increase storage costs, and complicate streaming downloads. MinIO's presigned URL mechanism offloads download bandwidth from the application server — the browser fetches evidence files directly from MinIO without proxying through the backend.
**Administrative Visibility.** MinIO Console (port 9001) provides a web-based management interface for administrators to inspect, audit, and manage stored evidence files without requiring command-line access.
### Alternatives Evaluated
| Storage | Evaluation Summary | Disposition |
|---------|-------------------|-------------|
| PostgreSQL BYTEA | Stores binary files in the relational database. Bloats backups, degrades query performance on large tables, and requires the backend to proxy all file downloads. Not designed as a file store. | Rejected |
| Local filesystem | Not portable across container restarts without host volume mounts. No presigned URL support (backend must proxy all downloads). No replication, versioning, or management interface. | Rejected |
| AWS S3 | Requires a cloud account, internet connectivity for storage operations, and AWS credential management. Incompatible with air-gap or restricted-network deployment requirements. | Rejected (as primary; migration path preserved) |
| SeaweedFS | Smaller community and less mature S3-compatible API layer. boto3 compatibility is not fully guaranteed. Insufficient adoption for long-term support confidence. | Rejected |
---
## 6. Frontend: React 19 + TypeScript 5.9
### Selection: React 19, TypeScript 5.9, Vite 7.3, Tailwind CSS 4, TanStack React Query 5
### Justification
The frontend technology selection was driven by four criteria: component ecosystem maturity, type safety for a complex domain, build tooling performance, and developer productivity.
**Component Ecosystem Maturity.** Aegis presents a complex user interface comprising 21 pages, 30+ components, and specialized visualizations including ATT&CK Navigator-compatible heatmaps, campaign timelines, compliance gauges, and multi-role workflow views. React's component model and its ecosystem (Recharts for data visualization, Lucide for iconography, TanStack Virtual for list virtualization) provide production-ready solutions for each of these requirements.
**Type Safety.** TypeScript's static type system enforces correctness across the API communication layer (22 domain-specific API modules), shared type definitions (`types/models.ts`), and component props. With `strict: true` in `tsconfig.json`, the compiler catches null reference errors, incorrect property access, and type mismatches at build time rather than runtime. This is particularly valuable for the complex test workflow state machine, where state-dependent UI behavior must correctly reflect 6 possible test states and 6 user roles.
**Build Tooling.** Vite provides sub-second hot module replacement during development and optimized production builds via Rollup. The multi-stage Docker build produces a minimal Nginx image (~25MB) serving pre-compiled static assets, eliminating the need for a Node.js runtime in production.
**Server State Management.** TanStack React Query manages all server-side state (caching, refetching, mutation invalidation), eliminating the need for a client-side state management library (Redux, Zustand, MobX) for data fetching concerns. Authentication state is managed via React Context, and UI feedback via a Toast context — both lightweight patterns that avoid unnecessary library dependencies.
**Styling.** Tailwind CSS 4 provides utility-first styling with zero-runtime CSS generation. The design system is consistent across all 21 pages without maintaining a separate CSS architecture or component library.
### Alternatives Evaluated
| Framework | Evaluation Summary | Disposition |
|-----------|-------------------|-------------|
| Angular | Comprehensive framework with built-in DI, routing, and HTTP client. However, the heavier abstraction layer and steeper learning curve are unnecessary for a team with React experience. Angular's opinionated module system adds boilerplate for a project of this scale. | Rejected |
| Vue 3 + TypeScript | Viable alternative with good TypeScript support and a smaller learning curve. However, the React ecosystem offers deeper library coverage for specialized components (ATT&CK heatmaps, data grids, chart libraries). The team's existing React proficiency favors continuity. | Rejected |
| Svelte / SvelteKit | Excellent developer experience and smaller bundle sizes, but a significantly smaller ecosystem for complex data visualization. Library availability for heatmaps, virtual scrolling, and charting is limited compared to React. | Rejected |
| HTMX + server-rendered templates | Would reduce frontend complexity but cannot support the interactive heatmap, drag-and-drop campaign management, real-time notification updates, and complex multi-step workflow forms required by the platform. | Rejected |
---
## 7. Containerization and Deployment: Docker Compose
### Selection: Docker with Docker Compose (V2), multi-stage Dockerfiles
### Justification
Docker Compose was selected as the deployment orchestration tool based on three requirements: single-command deployment for non-DevOps operators, consistent development-to-production environments, and minimal infrastructure prerequisites.
**Operator Accessibility.** The platform is deployed by security engineers who may not have Kubernetes expertise or access to container orchestration infrastructure. Docker Compose provides single-command deployment (`docker compose up -d --build`) with an interactive installation script (`install.sh`) that generates secrets, prompts for configuration, and produces a `.env` file. This reduces deployment complexity to a level appropriate for the target operator profile.
**Environment Consistency.** Two compose files maintain parity between development and production:
| Aspect | Development | Production |
|--------|-------------|------------|
| Frontend | Vite dev server, hot reload | Nginx serving static build |
| Backend | Source volume-mounted, auto-reload | Multi-stage build, non-root user |
| Ports | All services exposed | Only frontend exposed |
| Secrets | Auto-generated ephemeral | Required via environment |
**Infrastructure Footprint.** The entire platform (4 services) runs on a single server with Docker as the only prerequisite. Named volumes provide data persistence across container rebuilds. Health checks and dependency ordering ensure correct startup sequencing.
**Security Hardening.** The backend Dockerfile follows container security best practices: non-root user (`appuser`, UID 1001), minimal base image (`python:3.11-slim`), and no unnecessary system packages beyond build dependencies.
### Alternatives Evaluated
| Platform | Evaluation Summary | Disposition |
|----------|-------------------|-------------|
| Kubernetes | Provides horizontal scaling, rolling deployments, and self-healing. However, it requires a cluster, kubectl expertise, Helm charts, ingress controllers, and persistent volume claims. This operational overhead is disproportionate for a 4-service application targeting single-server deployment. | Rejected (viable future evolution for multi-server) |
| Docker Swarm | Adds orchestration with lower complexity than Kubernetes but provides minimal benefit over Compose for < 5 services. Docker Swarm's development trajectory has stalled relative to Compose V2. | Rejected |
| Bare metal / systemd | Loses containerization benefits: isolation, reproducibility, and dependency management. Would require manual installation of Python, Node.js, PostgreSQL, and MinIO on each target system, increasing deployment failure risk. | Rejected |
---
## 8. CI/CD and Artifact Management: GitHub Enterprise + Artifactory
### Selection: GitHub Enterprise for source control and CI/CD; JFrog Artifactory for artifact storage
### Status: Planned — not yet implemented
### Justification
GitHub Enterprise and Artifactory are designated as the CI/CD and artifact management platforms for Aegis based on organizational standardization, security requirements, and the artifact lifecycle.
**Organizational Standardization.** GitHub Enterprise is the organization's standard source control and CI/CD platform. Adopting it for Aegis ensures consistency with existing developer workflows, access control policies, and audit mechanisms. Security teams reviewing the Aegis codebase will use familiar tooling and processes.
**CI/CD Pipeline (Planned).** The following GitHub Actions workflow stages are planned:
| Stage | Tools | Trigger |
|-------|-------|---------|
| **Lint** | ruff (Python), ESLint (TypeScript) | Push to any branch |
| **Type check** | mypy (Python), tsc --noEmit (TypeScript) | Push to any branch |
| **Unit tests** | pytest (backend), vitest (frontend) | Push to any branch |
| **Integration tests** | pytest with PostgreSQL service container | Pull request to main |
| **Docker build** | Multi-stage Dockerfile verification | Pull request to main |
| **Image publish** | Docker build + push to Artifactory | Merge to main |
| **Deploy** | Docker Compose pull + restart | Manual trigger or tag |
**Artifact Repository.** Artifactory serves as the Docker image registry for versioned backend and frontend images. This provides:
- **Versioned releases:** Each merge to main produces a tagged image (`aegis-backend:1.2.3`, `aegis-frontend:1.2.3`).
- **Rollback capability:** Previous image versions remain available for rapid rollback.
- **Vulnerability scanning:** Artifactory's Xray integration enables automated CVE scanning of Docker image layers.
- **Access control:** Image pull/push permissions align with organizational RBAC policies.
**Air-Gap Deployment Support.** For restricted-network deployments, Docker images can be exported from Artifactory as tarballs (`docker save`), transferred via secure media, and loaded into the target environment (`docker load`) without requiring network connectivity to the registry.
### Implementation Timeline
| Phase | Scope | Estimated Effort |
|-------|-------|-----------------|
| Phase 1 | Basic CI: lint + type check + unit tests | 1–2 days |
| Phase 2 | Integration tests with PostgreSQL service container | 2–3 days |
| Phase 3 | Docker image build + Artifactory publish | 1–2 days |
| Phase 4 | Automated deployment trigger | 2–3 days |
---
## 9. Technology Stack Summary
| Layer | Technology | Version | License | Purpose |
|-------|-----------|---------|---------|---------|
| **Backend** | Python | 3.11 | PSF | Runtime |
| | FastAPI | latest | MIT | Web framework |
| | Uvicorn | latest | BSD | ASGI server |
| | SQLAlchemy | latest | MIT | ORM |
| | Alembic | latest | MIT | Migrations |
| | Pydantic | v2 | MIT | Validation |
| | APScheduler | latest | MIT | Background jobs |
| | boto3 | latest | Apache 2.0 | S3 storage client |
| **Frontend** | React | 19.2 | MIT | UI framework |
| | TypeScript | 5.9 | Apache 2.0 | Type safety |
| | Vite | 7.3 | MIT | Build tooling |
| | Tailwind CSS | 4.1 | MIT | Styling |
| | TanStack Query | 5.90 | MIT | Server state |
| | Recharts | 2.15 | MIT | Visualization |
| **Database** | PostgreSQL | 15 | PostgreSQL | Relational store |
| **Storage** | MinIO | latest | AGPL-3.0 | Object storage |
| **Infrastructure** | Docker | latest | Apache 2.0 | Containerization |
| | Docker Compose | V2 | Apache 2.0 | Orchestration |
| | Nginx | Alpine | BSD | Reverse proxy |
| **CI/CD** | GitHub Enterprise | — | Commercial | Source control + CI |
| **Artifacts** | Artifactory | — | Commercial | Image registry |
### License Compliance Note
All core platform dependencies other than MinIO use permissive open-source licenses (MIT, BSD, Apache 2.0, PSF, PostgreSQL License). The only copyleft dependency is MinIO (AGPL-3.0), which is used as a standalone service (not linked into application code) and therefore does not impose AGPL obligations on the Aegis codebase. GitHub Enterprise and Artifactory are covered under existing organizational commercial licenses.
---
## 10. Approval
| Role | Name | Date | Signature |
|------|------|------|-----------|
| Platform Architect | | | |
| Security Architect | | | |
| Infrastructure Lead | | | |
| Development Lead | | | |
| Architecture Board Chair | | | |