# Aegis/backend/app/services/threat_actor_import_service.py

"""Threat Actor import service (MITRE CTI / STIX 2.0).

Downloads the MITRE CTI repository, parses the STIX 2.0 bundle for
``intrusion-set`` objects (APT groups) and ``relationship`` objects
linking them to ``attack-pattern`` objects (techniques), then creates or
updates :class:`ThreatActor` rows and creates any missing
:class:`ThreatActorTechnique` records.

STIX 2.0 structure
------------------
The enterprise-attack bundle contains:
- ``intrusion-set`` objects → our ThreatActor rows
- ``attack-pattern`` objects → already in our Technique table
- ``relationship`` objects (type=uses) → connects intrusion-set → attack-pattern

Strategy
--------
1. Download ZIP of ``github.com/mitre/cti``.
2. Load ``enterprise-attack/enterprise-attack.json`` (single STIX bundle).
3. Build lookup maps for intrusion-sets and attack-patterns.
4. Parse relationships to connect actors → techniques.
5. Upsert into database.

Idempotency
-----------
Deduplication by ``mitre_id`` for ThreatActor and by the
``(threat_actor_id, technique_id)`` pair (covered by a unique constraint)
for ThreatActorTechnique; pairs already in the table are skipped.
"""

import io
import json
import logging
import shutil
import tempfile
import zipfile
from datetime import datetime
from pathlib import Path

import requests as _requests
from sqlalchemy.orm import Session

from app.models.threat_actor import ThreatActor, ThreatActorTechnique
from app.models.technique import Technique
from app.models.data_source import DataSource
from app.services.audit_service import log_action

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# ZIP archive of the ``master`` branch of the ``mitre/cti`` GitHub repository.
MITRE_CTI_ZIP_URL = "https://github.com/mitre/cti/archive/refs/heads/master.zip"

# Passed as ``timeout=`` to ``requests.get`` by ``_download_zip``.
_DOWNLOAD_TIMEOUT = 300

# Top-level directory ``_extract_zip_and_load_bundle`` expects inside the ZIP.
_ZIP_ROOT_PREFIX = "cti-master"


# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------


def _download_zip(url: str = MITRE_CTI_ZIP_URL) -> bytes:
    """Download the MITRE CTI ZIP and return its raw bytes.

    Args:
        url: Location of the ZIP archive; defaults to ``MITRE_CTI_ZIP_URL``.

    Returns:
        The complete response body.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``Response.raise_for_status``).
    """
    logger.info("Downloading MITRE CTI ZIP from %s …", url)
    # ``stream=True`` postpones the body download until the status has been
    # checked.  A streamed response keeps its connection checked out until the
    # body is consumed or the response is closed, so the ``with`` block makes
    # sure the connection is released on the error path as well.
    with _requests.get(url, timeout=_DOWNLOAD_TIMEOUT, stream=True) as resp:
        resp.raise_for_status()
        content = resp.content
    logger.info("Downloaded %.1f MB", len(content) / (1024 * 1024))
    return content


def _extract_zip_and_load_bundle(zip_bytes: bytes, dest: str) -> dict:
    """Extract the enterprise-attack STIX bundle from the ZIP and load it.

    Only the bundle file itself is written below ``dest``; the rest of the
    archive stays unextracted because nothing else is read from it.

    Args:
        zip_bytes: Raw bytes of the MITRE CTI repository ZIP.
        dest: Directory the bundle file is extracted into.

    Returns:
        The decoded top-level JSON object of the bundle file.

    Raises:
        FileNotFoundError: If the archive has no
            ``<root>/enterprise-attack/enterprise-attack.json`` member.
        zipfile.BadZipFile: If ``zip_bytes`` is not a valid ZIP archive.
    """
    # ZIP member names use forward slashes regardless of the host platform.
    member = f"{_ZIP_ROOT_PREFIX}/enterprise-attack/enterprise-attack.json"

    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
        try:
            # ``extract`` returns the path of the file it created.
            bundle_path = Path(zf.extract(member, dest))
        except KeyError as exc:
            # ``ZipFile.extract`` signals a missing member with KeyError;
            # translate it to the exception type this function has always
            # raised for a missing bundle.
            expected = (
                Path(dest) / _ZIP_ROOT_PREFIX
                / "enterprise-attack" / "enterprise-attack.json"
            )
            raise FileNotFoundError(
                f"STIX bundle not found at {expected}"
            ) from exc

    logger.info("Loading STIX bundle from %s …", bundle_path)
    with open(bundle_path, "r", encoding="utf-8") as fh:
        bundle = json.load(fh)

    objects = bundle.get("objects", [])
    logger.info("Loaded %d STIX objects", len(objects))
    return bundle


def _extract_mitre_id(external_references: list) -> str | None:
    """Extract the MITRE ATT&CK ID from external_references."""
    if not isinstance(external_references, list):
        return None
    for ref in external_references:
        if isinstance(ref, dict) and ref.get("source_name") == "mitre-attack":
            return ref.get("external_id")
    return None


def _extract_mitre_url(external_references: list) -> str | None:
    """Extract the MITRE ATT&CK URL from external_references."""
    if not isinstance(external_references, list):
        return None
    for ref in external_references:
        if isinstance(ref, dict) and ref.get("source_name") == "mitre-attack":
            return ref.get("url")
    return None


# STIX ``primary_motivation`` value → simplified frontend value.  Written as
# (simplified value, STIX values that collapse into it) groups and flattened
# into a plain dict.
_MOTIVATION_MAP: dict[str, str] = {
    stix_value: simplified
    for simplified, stix_values in (
        # Espionage / nation-state
        ("espionage", ("espionage", "national-security", "political")),
        # Financial
        (
            "financial",
            ("financial", "financial-gain", "personal-gain", "organizational-gain"),
        ),
        # Destruction / disruption
        ("destruction", ("destruction", "disruption", "coercion", "dominance")),
        # Hacktivism / ideology
        (
            "hacktivism",
            ("ideology", "hacktivism", "notoriety", "personal-satisfaction", "revenge"),
        ),
    )
    for stix_value in stix_values
}


def _normalize_motivation(raw: str | None) -> str | None:
    """Translate a STIX ``primary_motivation`` value to the Aegis vocabulary.

    The lookup in ``_MOTIVATION_MAP`` is case-insensitive and ignores
    surrounding whitespace.  Falsy input and values missing from the map
    both give ``None``.
    """
    if raw:
        lookup_key = raw.lower().strip()
        return _MOTIVATION_MAP.get(lookup_key)
    return None


# Known MITRE group IDs → motivation (overrides description inference).
#
# Every key must appear exactly once: in a dict literal a repeated key is
# silently overridden by its last occurrence.
# NOTE(review): the group names in the trailing comments are informal labels
# and have not been checked against the ATT&CK Groups listing.
_MITRE_ID_MOTIVATION: dict[str, str] = {
    # ── Financial ──────────────────────────────────────────────────
    "G0046": "financial",  # FIN7
    "G0037": "financial",  # FIN6
    "G0061": "financial",  # FIN8
    "G0080": "financial",  # Cobalt Group
    "G0008": "financial",  # Carbanak
    "G0114": "financial",  # Chimera (financial)
    "G0032": "financial",  # Lazarus (financial ops)
    "G0082": "financial",  # APT38
    "G0098": "financial",  # BlackTech (financial)
    "G0096": "financial",  # APT41 (partly financial)
    "G0102": "financial",  # Wizard Spider (Ryuk/Conti)
    "G0119": "financial",  # Indrik Spider
    "G0108": "financial",  # Blue Mockingbird
    "G0059": "financial",  # Magic Hound (some financial)
    # ── Espionage ──────────────────────────────────────────────────
    "G0007": "espionage",  # APT28 / Fancy Bear
    "G0016": "espionage",  # APT29 / Cozy Bear
    "G0025": "espionage",  # APT17
    "G0050": "espionage",  # APT32 / OceanLotus
    "G0064": "espionage",  # APT33 / Elfin
    "G0049": "espionage",  # APT34 / OilRig
    "G0010": "espionage",  # Turla
    "G0022": "espionage",  # APT3
    "G0006": "espionage",  # APT1 / Comment Crew
    "G0009": "espionage",  # Deep Panda
    "G0045": "espionage",  # menuPass / APT10
    "G0041": "espionage",  # Leviathan / APT40
    "G0060": "espionage",  # BRONZE BUTLER
    "G0065": "espionage",  # Leviathan / APT40
    "G0001": "espionage",  # Axiom
    "G0004": "espionage",  # Ke3chang
    "G0011": "espionage",  # PittyTiger
    "G0015": "espionage",  # Tonto Team
    "G0020": "espionage",  # Equation Group
    "G0030": "espionage",  # Lotus Blossom
    "G0035": "espionage",  # Dragonfly / Energetic Bear
    "G0036": "espionage",  # PLATINUM
    "G0038": "espionage",  # Stealth Falcon
    "G0040": "espionage",  # Patchwork
    "G0043": "espionage",  # Group5
    "G0047": "espionage",  # Gamaredon Group
    "G0048": "espionage",  # RTM (partly)
    "G0052": "espionage",  # CopyKittens
    "G0053": "espionage",  # FIN5 (partly espionage)
    "G0055": "espionage",  # NEODYMIUM
    "G0056": "espionage",  # PROMETHIUM
    "G0058": "espionage",  # Charming Kitten / APT35
    "G0062": "espionage",  # CozyDuke
    "G0063": "espionage",  # Sowbug
    "G0066": "espionage",  # Elderwood
    "G0068": "espionage",  # PLATINUM
    "G0069": "espionage",  # MuddyWater
    "G0074": "espionage",  # Transparent Tribe
    "G0075": "espionage",  # Rancor
    "G0076": "espionage",  # Thrip
    "G0077": "espionage",  # Leafminer / OilRig subgroup
    "G0087": "espionage",  # APT39
    "G0090": "espionage",  # Leafminer
    "G0091": "espionage",  # Silence (financial but listed here)
    "G0093": "espionage",  # GALLIUM
    "G0094": "espionage",  # Kimsuky
    "G0099": "espionage",  # APT-C-36
    "G0100": "espionage",  # Inception
    "G0103": "espionage",  # Mofang
    "G0104": "espionage",  # Volatile Cedar
    "G0105": "espionage",  # DarkHydrus
    "G0106": "espionage",  # Rocke
    "G0107": "espionage",  # Whitefly
    "G0109": "espionage",  # Machete
    "G0110": "espionage",  # Dark Caracal
    "G0111": "espionage",  # Dark Basin
    "G0112": "espionage",  # Windshift
    "G0113": "espionage",  # Frankenstein
    "G0115": "espionage",  # HAFNIUM
    "G0116": "espionage",  # Operation Wocao
    "G0117": "espionage",  # Fox Kitten
    "G0118": "espionage",  # TA505
    "G0120": "espionage",  # Evilnum
    "G0121": "espionage",  # Sidewinder
    "G0122": "espionage",  # Silent Librarian
    "G0123": "espionage",  # Waterbear
    "G0124": "espionage",  # Windigo
    "G0125": "espionage",  # HAFNIUM (dup)
    "G0126": "espionage",  # Higaisa
    "G0127": "espionage",  # TA551
    "G0128": "espionage",  # ZIRCONIUM / APT31
    "G0129": "espionage",  # Mustang Panda
    "G0130": "espionage",  # Ajax Security Team
    "G0131": "espionage",  # Tonto Team
    "G0133": "espionage",  # Nomadic Octopus
    "G0134": "espionage",  # Sandworm (espionage+destruction)
    "G0135": "espionage",  # BackdoorDiplomacy
    "G0136": "espionage",  # IndigoZebra
    "G0138": "espionage",  # Threat Group-3390
    "G0139": "espionage",  # TeamTNT
    "G0140": "espionage",  # LazyScripter
    "G0141": "espionage",  # Aoqin Dragon
    "G0142": "espionage",  # Confucius
    "G0143": "espionage",  # Aquatic Panda
    "G0144": "espionage",  # TG-3390
    "G0145": "espionage",  # POLONIUM
    # ── Destruction ────────────────────────────────────────────────
    "G0034": "destruction",  # Sandworm Team
    # NOTE(review): G0067 used to be listed under Espionage as well.  That
    # earlier entry never took effect because this later one overrode it, so
    # it was removed and the effective value kept.  Confirm which motivation
    # is actually intended for APT37.
    "G0067": "destruction",  # APT37 (also espionage)
    "G0070": "destruction",  # Dark Caracal
    "G0072": "destruction",  # Honeybee
    "G0079": "destruction",  # DarkHotel (partly)
    "G0095": "destruction",  # Machete (partly)
    "G0031": "destruction",  # Cleaver
    # ── Hacktivism ─────────────────────────────────────────────────
    "G0026": "hacktivism",   # APT18 (some ops)
}


# (substring, motivation) pairs for description-based inference.  Order
# matters: ``_infer_motivation_from_description`` walks the list front to back
# and returns the motivation of the first substring it finds.
_DESCRIPTION_KEYWORDS: list[tuple[str, str]] = [
    # Financial first (strongest signal)
    *(
        (needle, "financial")
        for needle in (
            "financially motivated",
            "financial gain",
            "financial crime",
            "for financial",
            "ransomware",
            "extortion",
            "fraud",
            "profit",
            "monetar",
            "criminal group",
            "cybercriminal",
            "e-crime",
        )
    ),
    # Destruction
    *(
        (needle, "destruction")
        for needle in (
            "destructive",
            "disruptive",
            "wiper",
            "sabotage",
            "disrupt",
        )
    ),
    # Hacktivism
    *(
        (needle, "hacktivism")
        for needle in (
            "hacktivist",
            "political statement",
            "ideolog",
        )
    ),
    # Espionage (broad, lowest priority)
    *(
        (needle, "espionage")
        for needle in (
            "espionage",
            "intelligence collection",
            "intelligence gathering",
            "cyber espionage",
            "nation-state",
            "state-sponsored",
            "government-sponsored",
            "military intelligence",
        )
    ),
]


def _infer_motivation_from_description(description: str) -> str | None:
    """Guess a motivation from keywords found in the group description.

    The description is lower-cased and checked against each substring in
    ``_DESCRIPTION_KEYWORDS`` in list order; the motivation paired with the
    first substring found is returned.  An empty description or one with no
    hit gives ``None``.
    """
    if not description:
        return None
    haystack = description.lower()
    return next(
        (label for needle, label in _DESCRIPTION_KEYWORDS if needle in haystack),
        None,
    )


def _parse_intrusion_sets(objects: list) -> list[dict]:
    """Parse STIX intrusion-set objects into ThreatActor dicts.

    Objects that are not of type ``intrusion-set``, that are flagged
    ``revoked``, or that have no non-blank ``name`` are skipped.

    Args:
        objects: The ``objects`` list of a STIX bundle (a list of dicts).

    Returns:
        One dict per retained intrusion-set with the keys ``stix_id``,
        ``mitre_id``, ``name``, ``aliases``, ``description``, ``mitre_url``,
        ``references``, ``first_seen``, ``last_seen``, ``motivation`` and
        ``sophistication``.
    """
    # NOTE(review): objects flagged ``x_mitre_deprecated`` are not filtered
    # here — confirm whether deprecated groups should be imported.
    actors = []
    for obj in objects:
        if obj.get("type") != "intrusion-set":
            continue
        if obj.get("revoked"):
            continue

        # ``or ""`` also covers an explicit JSON null, which ``.get`` with a
        # default would hand through as None.
        name = (obj.get("name") or "").strip()
        if not name:
            continue

        # The extraction helpers tolerate a non-list value; normalise once so
        # the reference loop below is equally tolerant.
        ext_refs = obj.get("external_references")
        if not isinstance(ext_refs, list):
            ext_refs = []
        mitre_id = _extract_mitre_id(ext_refs)
        mitre_url = _extract_mitre_url(ext_refs)

        # The group's own name is dropped from its alias list; anything that
        # is not a list is treated as "no aliases".
        raw_aliases = obj.get("aliases")
        if isinstance(raw_aliases, list):
            aliases = [alias for alias in raw_aliases if alias != name]
        else:
            aliases = []

        description = obj.get("description", "")

        # Derive motivation: curated override > STIX field > description inference
        raw_motivation = obj.get("primary_motivation")
        motivation = (
            _MITRE_ID_MOTIVATION.get(mitre_id or "")
            or _normalize_motivation(raw_motivation)
            or _infer_motivation_from_description(description)
        )
        sophistication = obj.get("sophistication")  # e.g. "advanced", "expert"

        # Extract references (non-MITRE)
        references = [
            {
                "source": ref.get("source_name", ""),
                "url": ref.get("url", ""),
                "description": ref.get("description", ""),
            }
            for ref in ext_refs
            if isinstance(ref, dict) and ref.get("source_name") != "mitre-attack"
        ]

        actors.append({
            "stix_id": obj.get("id"),  # e.g. "intrusion-set--abc123"
            "mitre_id": mitre_id,
            "name": name,
            "aliases": aliases,
            "description": description,
            "mitre_url": mitre_url,
            "references": references[:20],  # cap to avoid bloat
            "first_seen": obj.get("first_seen"),
            "last_seen": obj.get("last_seen"),
            "motivation": motivation,
            "sophistication": sophistication,
        })

    logger.info("Parsed %d intrusion-sets (threat actors)", len(actors))
    return actors


def _parse_relationships(objects: list) -> list[dict]:
    """Collect the ``uses`` relationships that link actors to techniques.

    A STIX object is kept when it is a non-revoked ``relationship`` with
    ``relationship_type == "uses"`` whose ``source_ref`` is an intrusion-set
    and whose ``target_ref`` is an attack-pattern.  Each kept object becomes
    a dict with the keys ``source_ref``, ``target_ref`` and ``description``.
    """
    links = []
    for stix_obj in objects:
        is_live_uses = (
            stix_obj.get("type") == "relationship"
            and stix_obj.get("relationship_type") == "uses"
            and not stix_obj.get("revoked")
        )
        if not is_live_uses:
            continue

        actor_ref = stix_obj.get("source_ref", "")
        technique_ref = stix_obj.get("target_ref", "")

        # We want intrusion-set → attack-pattern
        if actor_ref.startswith("intrusion-set--") and technique_ref.startswith(
            "attack-pattern--"
        ):
            links.append({
                "source_ref": actor_ref,
                "target_ref": technique_ref,
                "description": stix_obj.get("description", ""),
            })

    logger.info("Parsed %d uses-relationships (actor→technique)", len(links))
    return links


def _build_attack_pattern_map(objects: list) -> dict[str, str]:
    """Map each STIX attack-pattern ID to its MITRE technique ID.

    Revoked attack-patterns are left out, as are entries missing either the
    STIX ``id`` or a ``mitre-attack`` external ID.

    e.g. {"attack-pattern--abc123": "T1059.001"}
    """
    technique_ids: dict[str, str] = {}
    live_patterns = (
        candidate
        for candidate in objects
        if candidate.get("type") == "attack-pattern" and not candidate.get("revoked")
    )
    for pattern in live_patterns:
        pattern_ref = pattern.get("id", "")
        attack_id = _extract_mitre_id(pattern.get("external_references", []))
        if pattern_ref and attack_id:
            technique_ids[pattern_ref] = attack_id
    logger.info("Built attack-pattern map with %d entries", len(technique_ids))
    return technique_ids


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------


def sync(db: Session) -> dict:
    """Download and import threat actors from MITRE CTI.

    Downloads the repository ZIP, parses the enterprise-attack bundle, then
    upserts :class:`ThreatActor` rows (matched on ``mitre_id``) and inserts
    the :class:`ThreatActorTechnique` links that are not stored yet.  When a
    ``DataSource`` named ``"mitre_cti"`` exists its sync fields are updated,
    and an ``import_threat_actors`` audit entry is written.

    Args:
        db: Session used for every read and write.  It is committed on
            success and rolled back if the database phase raises.

    Returns:
        Summary dict with the keys ``actors_created``, ``actors_updated``,
        ``relationships_created``, ``relationships_skipped``,
        ``total_actors_parsed`` and ``total_relationships_parsed``.

    Raises:
        Exception: Download, parsing and database errors are propagated to
            the caller unchanged.
    """
    tmp_dir = tempfile.mkdtemp(prefix="aegis_mitre_cti_")
    try:
        zip_bytes = _download_zip()
        bundle = _extract_zip_and_load_bundle(zip_bytes, tmp_dir)
    finally:
        # The bundle is held in memory once loaded, so the on-disk copy is
        # removed whether or not loading succeeded.
        shutil.rmtree(tmp_dir, ignore_errors=True)
        logger.info("Cleaned up temp directory %s", tmp_dir)

    objects = bundle.get("objects", [])

    # Step 1: Parse data
    actor_dicts = _parse_intrusion_sets(objects)
    relationships = _parse_relationships(objects)
    attack_pattern_map = _build_attack_pattern_map(objects)

    try:
        # Step 2: Load existing actors and techniques from DB
        existing_actors = {
            row.mitre_id: row
            for row in db.query(ThreatActor).all()
            if row.mitre_id
        }

        technique_by_mitre_id = {
            row.mitre_id: row
            for row in db.query(Technique).all()
        }

        # Step 3: Upsert threat actors
        actors_created = 0
        actors_updated = 0
        stix_to_db_actor: dict[str, ThreatActor] = {}

        for actor_dict in actor_dicts:
            mitre_id = actor_dict["mitre_id"]
            db_actor = existing_actors.get(mitre_id) if mitre_id else None

            if db_actor is not None:
                # Update existing actor
                db_actor.name = actor_dict["name"]
                db_actor.aliases = actor_dict["aliases"]
                db_actor.description = actor_dict["description"]
                db_actor.mitre_url = actor_dict["mitre_url"]
                db_actor.references = actor_dict["references"]
                db_actor.first_seen = actor_dict.get("first_seen")
                db_actor.last_seen = actor_dict.get("last_seen")
                # Enrichment fields are only overwritten when the import has
                # a value, so a previously stored value is never blanked.
                if actor_dict.get("motivation"):
                    db_actor.motivation = actor_dict["motivation"]
                if actor_dict.get("sophistication"):
                    db_actor.sophistication = actor_dict["sophistication"]
                actors_updated += 1
            else:
                # Create new actor
                db_actor = ThreatActor(
                    mitre_id=mitre_id,
                    name=actor_dict["name"],
                    aliases=actor_dict["aliases"],
                    description=actor_dict["description"],
                    mitre_url=actor_dict["mitre_url"],
                    references=actor_dict["references"],
                    first_seen=actor_dict.get("first_seen"),
                    last_seen=actor_dict.get("last_seen"),
                    motivation=actor_dict.get("motivation"),
                    sophistication=actor_dict.get("sophistication"),
                    is_active=True,
                )
                db.add(db_actor)
                if mitre_id:
                    # A later object with the same MITRE ID in this bundle
                    # then updates this row instead of inserting a duplicate.
                    existing_actors[mitre_id] = db_actor
                actors_created += 1

            stix_to_db_actor[actor_dict["stix_id"]] = db_actor

        # A single flush for the whole batch: the new rows' IDs are not read
        # until the relationship step below.
        db.flush()

        # Step 4: Upsert actor-technique relationships
        # Load existing relationships
        existing_rels: set[tuple] = {
            (str(row.threat_actor_id), str(row.technique_id))
            for row in db.query(ThreatActorTechnique).all()
        }

        rels_created = 0
        rels_skipped = 0

        for rel in relationships:
            # Resolve actor
            db_actor = stix_to_db_actor.get(rel["source_ref"])
            if not db_actor:
                continue

            # Resolve technique
            mitre_technique_id = attack_pattern_map.get(rel["target_ref"])
            if not mitre_technique_id:
                continue

            db_technique = technique_by_mitre_id.get(mitre_technique_id)
            if not db_technique:
                continue

            rel_key = (str(db_actor.id), str(db_technique.id))
            if rel_key in existing_rels:
                rels_skipped += 1
                continue

            actor_technique = ThreatActorTechnique(
                threat_actor_id=db_actor.id,
                technique_id=db_technique.id,
                usage_description=rel["description"][:5000] if rel["description"] else None,
            )
            db.add(actor_technique)
            # Also guards against the same pair occurring twice in one bundle.
            existing_rels.add(rel_key)
            rels_created += 1

        db.commit()

        summary = {
            "actors_created": actors_created,
            "actors_updated": actors_updated,
            "relationships_created": rels_created,
            "relationships_skipped": rels_skipped,
            "total_actors_parsed": len(actor_dicts),
            "total_relationships_parsed": len(relationships),
        }

        # Update DataSource record
        ds = db.query(DataSource).filter(DataSource.name == "mitre_cti").first()
        if ds:
            # NOTE(review): ``datetime.utcnow()`` returns a naive datetime and
            # is deprecated since Python 3.12; kept because the column's
            # timezone handling is not visible from this module.
            ds.last_sync_at = datetime.utcnow()
            ds.last_sync_status = "success"
            ds.last_sync_stats = summary
            db.commit()

        logger.info("MITRE CTI threat actor import complete — %s", summary)

        log_action(
            db,
            user_id=None,
            action="import_threat_actors",
            entity_type="threat_actor",
            entity_id=None,
            details=summary,
        )
        db.commit()
    except Exception:
        # A failed flush/commit leaves the session unusable until it is
        # rolled back; do that here so the caller gets a clean session along
        # with the original exception.
        logger.exception("MITRE CTI threat actor import failed; rolling back")
        db.rollback()
        raise

    return summary