"""MITRE CALDERA abilities import service. Downloads the CALDERA repository ZIP from GitHub, parses the ability YAML files under ``data/abilities/{tactic}/``, and creates :class:`TestTemplate` records in the database. Strategy -------- 1. Download the CALDERA repo as a ZIP. 2. Extract into a temporary directory. 3. Walk ``data/abilities/{tactic}/*.yml`` files. 4. For each ability: extract name, description, technique ID, platforms, and executor commands. 5. Create TestTemplate rows keyed by the ability's ``id`` field. 6. Clean up. Idempotency ----------- Running the import twice does **not** create duplicates. Existing templates are identified by ``source = "caldera"`` + ``atomic_test_id`` (the CALDERA ability ``id``). """ # Import io import io # Import logging import logging # Import shutil import shutil # Import tempfile import tempfile # Import zipfile import zipfile # Import datetime from datetime from datetime import datetime # Import Path from pathlib from pathlib import Path # Import requests import requests as _requests # Import yaml import yaml # Import Session from sqlalchemy.orm from sqlalchemy.orm import Session # Import DataSource from app.models.data_source from app.models.data_source import DataSource from app.models.technique import Technique # Import TestTemplate from app.models.test_template from app.models.test_template import TestTemplate from app.services.audit_service import log_action # Assign logger = logging.getLogger(__name__) logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Constants # --------------------------------------------------------------------------- CALDERA_ZIP_URL = ( # Literal argument value "https://github.com/mitre/stockpile" # Literal argument value "/archive/refs/heads/master.zip" ) # Assign _DOWNLOAD_TIMEOUT = 300 _DOWNLOAD_TIMEOUT = 300 # Assign _ZIP_ROOT_PREFIX = "stockpile-master" _ZIP_ROOT_PREFIX = "stockpile-master" # 
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------

def _download_zip(url: str = CALDERA_ZIP_URL) -> bytes:
    """Download the CALDERA ZIP and return its raw bytes.

    Args:
        url: Location of the archive; defaults to ``CALDERA_ZIP_URL``.

    Returns:
        The complete response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    logger.info("Downloading CALDERA ZIP from %s …", url)
    resp = _requests.get(url, timeout=_DOWNLOAD_TIMEOUT, stream=True)
    try:
        resp.raise_for_status()
        content = resp.content
    finally:
        # With stream=True the connection is only handed back to the pool once
        # the body is fully consumed or the response is closed; close it
        # explicitly so an error status does not leave it dangling.
        resp.close()
    logger.info("Downloaded %.1f MB", len(content) / (1024 * 1024))
    return content


def _extract_zip(zip_bytes: bytes, dest: str) -> Path:
    """Extract *zip_bytes* into *dest* and return the abilities directory.

    Every archive member is checked before anything is written, so an archive
    containing a path that escapes *dest* is rejected as a whole.

    Args:
        zip_bytes: Raw bytes of the ZIP archive.
        dest: Directory to extract into.

    Returns:
        ``<dest>/<_ZIP_ROOT_PREFIX>/data/abilities``.

    Raises:
        ValueError: If a member would resolve outside *dest* (Zip Slip).
        FileNotFoundError: If the abilities directory is missing after
            extraction.
    """
    dest_path = Path(dest).resolve()
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
        for member in zf.infolist():
            target = (dest_path / member.filename).resolve()
            if not target.is_relative_to(dest_path):
                raise ValueError(
                    f"Zip Slip detected — '{member.filename}' resolves outside target directory"
                )
        zf.extractall(dest)

    abilities_dir = Path(dest) / _ZIP_ROOT_PREFIX / "data" / "abilities"
    if not abilities_dir.is_dir():
        raise FileNotFoundError(
            f"Expected abilities directory not found at {abilities_dir}"
        )
    return abilities_dir


def _extract_commands(platforms_dict: dict) -> str:
    """Collect executor commands from a CALDERA ``platforms`` mapping.

    The structure is typically::

        platforms:
          windows:
            psh:
              command: "whoami"
          linux:
            sh:
              command: "id"

    Args:
        platforms_dict: The ability's ``platforms`` value.  Anything that is
            not a dict yields an empty string.

    Returns:
        One ``[os/executor]`` header followed by the command per executor,
        with entries separated by a blank line.
    """
    if not isinstance(platforms_dict, dict):
        return ""

    sections = []
    for os_name, executors in platforms_dict.items():
        if not isinstance(executors, dict):
            continue
        for executor_name, executor_data in executors.items():
            if isinstance(executor_data, dict):
                cmd = executor_data.get("command", "")
                if cmd:
                    sections.append(f"[{os_name}/{executor_name}]\n{cmd}")
            elif isinstance(executor_data, str):
                # Shorthand form: the executor maps straight to the command.
                sections.append(f"[{os_name}/{executor_name}]\n{executor_data}")
    return "\n\n".join(sections)


def _extract_platforms(platforms_dict: dict) -> str:
    """Return the recognised platform names as a comma-separated string.

    Only ``windows``, ``linux``, ``darwin`` and ``macos`` are kept (compared
    case-insensitively); ``darwin`` is reported as ``macos``.  First-seen
    order is preserved and duplicates are dropped.

    Args:
        platforms_dict: The ability's ``platforms`` value.  Anything that is
            not a dict yields an empty string.
    """
    if not isinstance(platforms_dict, dict):
        return ""

    canonical = {
        "windows": "windows",
        "linux": "linux",
        "darwin": "macos",
        "macos": "macos",
    }
    platform_names = []
    for os_name in platforms_dict:
        normalized = canonical.get(str(os_name).lower().strip())
        if normalized is not None and normalized not in platform_names:
            platform_names.append(normalized)
    return ", ".join(platform_names)


def _as_text(value) -> str:
    """Coerce a YAML scalar to a stripped string; ``None`` becomes ``""``."""
    if value is None:
        return ""
    return str(value).strip()


def _ability_to_record(data: dict) -> "dict | None":
    """Convert one parsed ability mapping into TestTemplate field values.

    Args:
        data: A single ability as loaded from YAML.

    Returns:
        A dict of template fields, or ``None`` when the ability has no ``id``
        or no technique ``attack_id`` starting with ``T``.
    """
    ability_id = data.get("id", "")
    if not ability_id:
        return None

    technique = data.get("technique", {})
    attack_id = technique.get("attack_id", "") if isinstance(technique, dict) else ""
    if not attack_id:
        return None
    attack_id = str(attack_id).strip().upper()
    if not attack_id.startswith("T"):
        return None

    # A key that is present but empty loads as None (and unquoted numbers as
    # int), so coerce instead of calling .strip() on the raw value.
    name = _as_text(data.get("name"))
    description = _as_text(data.get("description"))
    tactic = _as_text(data.get("tactic"))

    platforms_dict = data.get("platforms", {})
    commands = _extract_commands(platforms_dict)
    platform_str = _extract_platforms(platforms_dict)

    # Executor names (psh, sh, ...) across all platforms, de-duplicated.
    executors = set()
    if isinstance(platforms_dict, dict):
        for os_executors in platforms_dict.values():
            if isinstance(os_executors, dict):
                # str() keeps sorting/joining safe if a key is not a string.
                executors.update(str(key) for key in os_executors)
    executor_str = ", ".join(sorted(executors)) if executors else None

    return {
        "mitre_technique_id": attack_id,
        "name": f"CALDERA: {name}"[:500] if name else f"CALDERA ability {ability_id}"[:500],
        "description": f"{description}\n\nTactic: {tactic}".strip()[:2000] if description else None,
        "source": "caldera",
        "platform": platform_str,
        "tool_suggested": executor_str,
        "attack_procedure": commands[:4000] if commands else None,
        "atomic_test_id": f"caldera:{ability_id}",
        "source_url": f"https://github.com/mitre/stockpile/tree/master/data/abilities/{tactic}",
    }


def _parse_abilities(abilities_dir: Path) -> list[dict]:
    """Walk the abilities directory and parse every ``*.yml`` file.

    Files that cannot be read or parsed are logged and skipped so a single
    bad file does not abort the import.

    Args:
        abilities_dir: Root directory searched recursively for YAML files.

    Returns:
        A flat list of dicts, each representing one ability.
    """
    results: list[dict] = []

    yaml_files = sorted(abilities_dir.rglob("*.yml"))
    logger.info("Found %d ability YAML files", len(yaml_files))

    for yaml_path in yaml_files:
        try:
            with open(yaml_path, "r", encoding="utf-8") as fh:
                documents = list(yaml.safe_load_all(fh))
        except Exception as exc:
            # Deliberately best-effort, but visible: a skipped file means
            # abilities silently missing from the import.
            logger.warning("Failed to parse %s: %s", yaml_path, exc)
            continue

        # Stockpile YAML files may contain YAML lists of abilities
        # (e.g. [- id: ..., - id: ...]) or single-document dicts.
        for document in documents:
            if isinstance(document, dict):
                candidates = [document]
            elif isinstance(document, list):
                candidates = [entry for entry in document if isinstance(entry, dict)]
            else:
                continue

            for ability in candidates:
                record = _ability_to_record(ability)
                if record is not None:
                    results.append(record)

    logger.info("Parsed %d CALDERA abilities total", len(results))
    return results


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

def sync(db: Session) -> dict:
    """Download and import CALDERA abilities as TestTemplates.

    Abilities whose ``atomic_test_id`` already exists with
    ``source == "caldera"`` are skipped.  Techniques that receive at least
    one new template are flagged ``review_required``.  If a ``DataSource``
    named ``"caldera"`` exists, its last-sync fields are updated, and the run
    is recorded through ``log_action``.

    Args:
        db: Database session used for all queries and commits.

    Returns:
        A summary dict with ``created``, ``skipped_existing`` and
        ``total_parsed``.
    """
    tmp_dir = tempfile.mkdtemp(prefix="aegis_caldera_")
    try:
        zip_bytes = _download_zip()
        abilities_dir = _extract_zip(zip_bytes, tmp_dir)
        parsed = _parse_abilities(abilities_dir)
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)
        logger.info("Cleaned up temp directory %s", tmp_dir)

    # Pre-load the IDs already stored so the loop needs no per-item query.
    existing_ids: set[str] = {
        row[0]
        for row in db.query(TestTemplate.atomic_test_id)
        .filter(TestTemplate.source == "caldera")
        .filter(TestTemplate.atomic_test_id.isnot(None))
        .all()
    }

    created = 0
    skipped = 0
    new_technique_ids: set[str] = set()

    for item in parsed:
        if item["atomic_test_id"] in existing_ids:
            skipped += 1
            continue

        template = TestTemplate(
            mitre_technique_id=item["mitre_technique_id"],
            name=item["name"],
            description=item["description"],
            source=item["source"],
            source_url=item["source_url"],
            attack_procedure=item["attack_procedure"],
            platform=item["platform"],
            tool_suggested=item["tool_suggested"],
            atomic_test_id=item["atomic_test_id"],
            is_active=True,
        )
        db.add(template)
        # Also guards against the same ability ID appearing twice in one run.
        existing_ids.add(item["atomic_test_id"])
        new_technique_ids.add(item["mitre_technique_id"])
        created += 1

    if new_technique_ids:
        # A sorted list keeps the generated IN (...) clause deterministic.
        db.query(Technique).filter(
            Technique.mitre_id.in_(sorted(new_technique_ids))
        ).update({"review_required": True}, synchronize_session=False)

    db.commit()

    summary = {
        "created": created,
        "skipped_existing": skipped,
        "total_parsed": len(parsed),
    }

    # Update the DataSource record, if one has been registered.
    ds = db.query(DataSource).filter(DataSource.name == "caldera").first()
    if ds:
        # Naive UTC timestamp.  datetime.utcnow() is deprecated since Python
        # 3.12; it is kept here so the stored value stays timezone-naive.
        ds.last_sync_at = datetime.utcnow()
        ds.last_sync_status = "success"
        ds.last_sync_stats = summary
        db.commit()

    logger.info("CALDERA import complete — %s", summary)
    log_action(db, user_id=None, action="import_caldera",
               entity_type="test_template", entity_id=None, details=summary)
    db.commit()
    return summary