"""Sigma Rules import service.

Downloads the SigmaHQ repository ZIP from GitHub, parses every YAML rule file
under ``rules/``, extracts MITRE ATT&CK tags, and creates
:class:`DetectionRule` records in the database.

Strategy
--------
1. Download the full SigmaHQ repo as a ZIP archive.
2. Extract in a temporary directory.
3. Walk all ``.yml`` files under ``rules/``.
4. Parse each YAML file — extract title, description, logsource, detection
   tags, severity (``level``), and the raw YAML content.
5. Filter: only import rules that have at least one ``attack.tXXXX`` tag.
6. Create / skip ``DetectionRule`` rows keyed by ``(source, source_id)``.
7. Clean up the temporary directory.

Idempotency
-----------
Running the import twice does **not** create duplicates.  Existing rules are
identified by ``source = "sigma"`` + ``source_id`` (relative file path) and
simply skipped.
"""

import io
import logging
import re
import shutil
import tempfile
import zipfile
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

import requests as _requests
import yaml
from sqlalchemy.orm import Session

from app.models.data_source import DataSource
# NOTE(review): ``DetectionRule`` is used by ``sync`` but was never imported.
# The module path below is inferred from the sibling model imports
# (``app.models.data_source``, ``app.models.technique``) — confirm it.
from app.models.detection_rule import DetectionRule
from app.models.technique import Technique
from app.services.audit_service import log_action

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

SIGMA_ZIP_URL = (
    "https://github.com/SigmaHQ/sigma/archive/refs/heads/master.zip"
)
# Timeout handed to ``requests.get`` for the archive download.
_DOWNLOAD_TIMEOUT = 300
# Top-level directory expected inside the downloaded archive.
_ZIP_ROOT_PREFIX = "sigma-master"

# Safety limits for ZIP extraction — prevent zip-bomb DoS
_MAX_UNCOMPRESSED_SIZE = 500 * 1024 * 1024  # 500 MB
_MAX_ENTRIES = 50_000

# Regex to extract MITRE ATT&CK technique IDs from Sigma tags
# e.g. "attack.t1059.001" → "T1059.001"
_ATTACK_TAG_RE = re.compile(r"attack\.(t\d{4}(?:\.\d{3})?)", re.IGNORECASE)

# Sigma severity levels accepted as-is; any other ``level`` maps to ``None``.
_SEVERITY_MAP = {
    "informational": "informational",
    "low": "low",
    "medium": "medium",
    "high": "high",
    "critical": "critical",
}


# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------

def _download_zip(url: str = SIGMA_ZIP_URL) -> bytes:
    """Download the SigmaHQ ZIP and return raw bytes.

    Parameters
    ----------
    url : str
        Archive URL; defaults to :data:`SIGMA_ZIP_URL`.

    Returns
    -------
    bytes
        The complete response body.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    logger.info("Downloading SigmaHQ ZIP from %s …", url)
    # ``stream=True`` defers the body download until ``.content`` is read, so
    # an error status is detected before the payload is fetched.
    resp = _requests.get(url, timeout=_DOWNLOAD_TIMEOUT, stream=True)
    try:
        resp.raise_for_status()
        content = resp.content
    finally:
        # A streamed response keeps its connection until it is consumed or
        # closed; close explicitly so the error path does not leak it.
        resp.close()
    logger.info("Downloaded %.1f MB", len(content) / (1024 * 1024))
    return content


def _safe_extract_zip(zip_bytes: bytes, dest: str) -> None:
    """Extract *zip_bytes* into *dest* with Zip Slip and Zip Bomb protection.

    All checks run before anything is written to disk.

    Raises
    ------
    ValueError
        If any member tries to escape the target directory (path traversal /
        Zip Slip) or if the archive exceeds the safety limits
        (:data:`_MAX_ENTRIES`, :data:`_MAX_UNCOMPRESSED_SIZE`).
    """
    dest_path = Path(dest).resolve()

    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
        entries = zf.infolist()

        if len(entries) > _MAX_ENTRIES:
            raise ValueError(
                f"ZIP archive contains {len(entries)} entries "
                f"(limit: {_MAX_ENTRIES}) — possible zip bomb"
            )

        # Sum of the uncompressed sizes recorded for each entry.
        total_size = sum(info.file_size for info in entries)
        if total_size > _MAX_UNCOMPRESSED_SIZE:
            raise ValueError(
                f"ZIP uncompressed size {total_size / (1024 * 1024):.0f} MB "
                f"exceeds limit of "
                f"{_MAX_UNCOMPRESSED_SIZE / (1024 * 1024):.0f} MB"
            )

        for member in entries:
            # Resolve the would-be output path and make sure it stays below
            # the destination directory.
            target = (dest_path / member.filename).resolve()
            if not target.is_relative_to(dest_path):
                raise ValueError(
                    f"Zip Slip detected — member '{member.filename}' "
                    f"resolves outside target directory"
                )

        zf.extractall(dest)


def _extract_zip(zip_bytes: bytes, dest: str) -> Path:
    """Extract *zip_bytes* into *dest* and return the path to rules/ dir.

    Raises
    ------
    ValueError
        Propagated from :func:`_safe_extract_zip`.
    FileNotFoundError
        If ``<dest>/sigma-master/rules`` does not exist after extraction.
    """
    _safe_extract_zip(zip_bytes, dest)
    rules_dir = Path(dest) / _ZIP_ROOT_PREFIX / "rules"
    if not rules_dir.is_dir():
        raise FileNotFoundError(
            f"Expected rules directory not found at {rules_dir}"
        )
    return rules_dir


def _extract_attack_tags(tags: list) -> list[str]:
    """Extract MITRE technique IDs from Sigma tag list.

    IDs are upper-cased, de-duplicated, and returned in order of first
    appearance.  The order is deterministic on purpose: ``sync`` stores only
    the first technique of each rule file.

    Example input:  ["attack.defense_evasion", "attack.t1059.001", "cve.2021.44228"]
    Example output: ["T1059.001"]
    """
    # A dict keeps insertion order, giving an ordered de-duplication.
    seen: dict[str, None] = {}
    for tag in tags:
        match = _ATTACK_TAG_RE.match(str(tag).strip())
        if match:
            seen.setdefault(match.group(1).upper(), None)
    return list(seen)


def _false_positive_rate(falsepositives: object) -> str:
    """Bucket a rule's ``falsepositives`` list by its length.

    More than three entries → ``"high"``, more than one → ``"medium"``,
    anything else (including a non-list value) → ``"low"``.
    """
    count = len(falsepositives) if isinstance(falsepositives, list) else 0
    if count > 3:
        return "high"
    if count > 1:
        return "medium"
    return "low"


def _parse_sigma_rules(rules_dir: Path) -> list[dict]:
    """Walk the rules directory and parse all Sigma YAML files.

    Files that cannot be read or parsed, that are not a YAML mapping, that
    have no title, or that carry no ``attack.tXXXX`` tag are skipped.

    Returns a flat list of dicts, one per (rule, technique) combination.
    A single Sigma rule tagged with N techniques produces N entries.
    """
    results: list[dict] = []
    yaml_files = sorted(rules_dir.rglob("*.yml"))
    logger.info("Found %d YAML files to parse", len(yaml_files))

    for yaml_path in yaml_files:
        # Read once: the text is parsed and also stored verbatim.
        try:
            raw_content = yaml_path.read_text(encoding="utf-8")
            data = yaml.safe_load(raw_content)
        except Exception as exc:  # best effort: one bad file must not abort
            logger.debug("Failed to parse %s: %s", yaml_path, exc)
            continue

        if not isinstance(data, dict):
            continue

        # ``title:`` may be empty (None) or a non-string scalar in YAML.
        title = str(data.get("title") or "").strip()
        if not title:
            continue

        # Extract ATT&CK technique IDs from tags
        tags = data.get("tags", [])
        if not isinstance(tags, list):
            continue
        technique_ids = _extract_attack_tags(tags)
        if not technique_ids:
            continue  # Skip rules without ATT&CK mapping

        description = data.get("description", "")
        severity = _SEVERITY_MAP.get(str(data.get("level", "")).lower())

        logsource = data.get("logsource", {})
        if not isinstance(logsource, dict):
            logsource = {}

        # Path relative to the archive root, e.g. "rules/<...>/<file>.yml".
        relative_path = str(yaml_path.relative_to(rules_dir.parent))
        # Backslashes (Windows path separators) become URL slashes.
        source_url = (
            "https://github.com/SigmaHQ/sigma/blob/master/"
            + relative_path.replace("\\", "/")
        )

        # Fields shared by every (rule, technique) entry of this file.
        shared = {
            "title": title[:500],
            "description": str(description)[:2000] if description else None,
            "source_id": relative_path,
            "source_url": source_url,
            "rule_content": raw_content,
            "severity": severity,
            "log_sources": logsource if logsource else None,
            "false_positive_rate": _false_positive_rate(
                data.get("falsepositives", [])
            ),
            "platforms": _platforms_from_logsource(logsource),
        }

        # Create one entry per technique
        for tech_id in technique_ids:
            results.append({"mitre_technique_id": tech_id, **shared})

    logger.info("Parsed %d (rule, technique) pairs total", len(results))
    return results


def _platforms_from_logsource(logsource: dict) -> Optional[list[str]]:
    """Infer platform list from Sigma logsource.

    Looks for ``windows`` / ``linux`` / ``macos`` as substrings of the
    lower-cased ``product`` and ``service`` values; a ``sysmon`` service also
    implies ``windows``.  Returns ``None`` when nothing matches.
    """
    product = str(logsource.get("product", "")).lower()
    service = str(logsource.get("service", "")).lower()

    platforms = [
        name
        for name in ("windows", "linux", "macos")
        if name in product or name in service
    ]
    # Sysmon → Windows
    if "sysmon" in service and "windows" not in platforms:
        platforms.append("windows")

    return platforms or None


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

def sync(db: Session) -> dict:
    """Download and import Sigma detection rules.

    Parameters
    ----------
    db : Session
        Active SQLAlchemy database session.

    Returns
    -------
    dict
        Summary with ``created``, ``skipped_existing``, ``total_parsed``.
    """
    tmp_dir = tempfile.mkdtemp(prefix="aegis_sigma_")
    try:
        zip_bytes = _download_zip()
        rules_dir = _extract_zip(zip_bytes, tmp_dir)
        parsed_rules = _parse_sigma_rules(rules_dir)
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)
        logger.info("Cleaned up temp directory %s", tmp_dir)

    # Pre-load existing source_ids for dedup
    existing_ids: set[str] = {
        row[0]
        for row in db.query(DetectionRule.source_id)
        .filter(DetectionRule.source == "sigma")
        .filter(DetectionRule.source_id.isnot(None))
        .all()
    }

    created = 0
    skipped = 0
    new_technique_ids: set[str] = set()

    for item in parsed_rules:
        # Deduplicate by source_id: one rule file may map to multiple
        # techniques, but we skip insertion if this source_id was already
        # imported.  Only the first (rule, technique) pair of a new file is
        # therefore stored.
        # NOTE(review): the remaining pairs of a file created in this run are
        # also counted in ``skipped`` — confirm that is the intended meaning
        # of ``skipped_existing``.
        if item["source_id"] in existing_ids:
            skipped += 1
            continue

        rule = DetectionRule(
            mitre_technique_id=item["mitre_technique_id"],
            title=item["title"],
            description=item["description"],
            source="sigma",
            source_id=item["source_id"],
            source_url=item["source_url"],
            rule_content=item["rule_content"],
            rule_format="sigma_yaml",
            severity=item["severity"],
            platforms=item["platforms"],
            log_sources=item["log_sources"],
            false_positive_rate=item["false_positive_rate"],
            is_active=True,
        )
        db.add(rule)
        existing_ids.add(item["source_id"])
        new_technique_ids.add(item["mitre_technique_id"])
        created += 1

    # Flag techniques that received new rules for review
    if new_technique_ids:
        db.query(Technique).filter(
            Technique.mitre_id.in_(new_technique_ids)
        ).update({"review_required": True}, synchronize_session=False)

    db.commit()

    summary = {
        "created": created,
        "skipped_existing": skipped,
        "total_parsed": len(parsed_rules),
    }

    # Update DataSource record
    ds = db.query(DataSource).filter(DataSource.name == "sigma").first()
    if ds:
        # Naive UTC timestamp: same value ``datetime.utcnow()`` produced,
        # without the deprecated call.
        ds.last_sync_at = datetime.now(timezone.utc).replace(tzinfo=None)
        ds.last_sync_status = "success"
        ds.last_sync_stats = summary
        db.commit()

    logger.info("Sigma import complete — %s", summary)

    log_action(
        db,
        user_id=None,
        action="import_sigma_rules",
        entity_type="detection_rule",
        entity_id=None,
        details=summary,
    )
    db.commit()

    return summary