"""Sigma Rules import service.

Downloads the SigmaHQ repository ZIP from GitHub, parses every YAML rule
file under ``rules/``, extracts MITRE ATT&CK tags, and creates
:class:`DetectionRule` records in the database.

Strategy
--------
1. Download the full SigmaHQ repo as a ZIP archive.
2. Extract in a temporary directory.
3. Walk all ``.yml`` files under ``rules/``.
4. Parse each YAML file — extract title, description, logsource,
   detection tags, severity (``level``), and the raw YAML content.
5. Filter: only import rules that have at least one ``attack.tXXXX`` tag.
6. Create / skip ``DetectionRule`` rows keyed by ``(source, source_id)``.
7. Clean up the temporary directory.

Idempotency
-----------
Running the import twice does **not** create duplicates.  Existing
rules are identified by ``source = "sigma"`` + ``source_id`` (relative
file path) and simply skipped.
"""

# Import io
import io

# Import logging
import logging

# Import re
import re

# Import shutil
import shutil

# Import tempfile
import tempfile

# Import zipfile
import zipfile

# Import datetime from datetime
from datetime import datetime

# Import Path from pathlib
from pathlib import Path

# Import requests
import requests as _requests

# Import yaml
import yaml

# Import Session from sqlalchemy.orm
from sqlalchemy.orm import Session

# Import DataSource from app.models.data_source
from app.models.data_source import DataSource
from app.models.technique import Technique
from app.services.audit_service import log_action

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# ZIP snapshot of the SigmaHQ repository's ``master`` branch.
SIGMA_ZIP_URL = "https://github.com/SigmaHQ/sigma/archive/refs/heads/master.zip"

# Passed as ``timeout`` to the HTTP download request (seconds).
_DOWNLOAD_TIMEOUT = 300
# Top-level directory inside the archive under which ``rules/`` is expected.
_ZIP_ROOT_PREFIX = "sigma-master"

# Safety limits for ZIP extraction — prevent zip-bomb DoS
_MAX_UNCOMPRESSED_SIZE = 500 * 1024**2  # 500 MB
_MAX_ENTRIES = 50_000

# Regex to extract MITRE ATT&CK technique IDs from Sigma tags
# e.g. "attack.t1059.001" → "T1059.001" (upper-casing is done by the caller)
_ATTACK_TAG_RE = re.compile(r"attack\.(t\d{4}(?:\.\d{3})?)", re.IGNORECASE)

# Sigma severity levels that are accepted as-is; any other ``level`` value
# looks up to ``None``.
_SEVERITY_MAP = {
    level: level
    for level in ("informational", "low", "medium", "high", "critical")
}


# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------


def _download_zip(url: str = SIGMA_ZIP_URL) -> bytes:
    """Download the SigmaHQ ZIP archive and return its raw bytes.

    The response is streamed in chunks so the transfer can be aborted as soon
    as it grows past ``_MAX_UNCOMPRESSED_SIZE``; an archive whose compressed
    size already exceeds the uncompressed limit would be rejected at
    extraction anyway, so there is no point buffering it.

    Parameters
    ----------
    url : str
        Location of the ZIP archive; defaults to ``SIGMA_ZIP_URL``.

    Returns
    -------
    bytes
        The complete response body.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (``raise_for_status``).
    ValueError
        If the body exceeds ``_MAX_UNCOMPRESSED_SIZE`` bytes.
    """
    logger.info("Downloading SigmaHQ ZIP from %s …", url)

    buffer = io.BytesIO()
    # With stream=True the connection is only released once the body is fully
    # consumed or the response is closed; the context manager guarantees the
    # close on every path, including an HTTP error or the size abort below.
    with _requests.get(url, timeout=_DOWNLOAD_TIMEOUT, stream=True) as resp:
        resp.raise_for_status()
        for chunk in resp.iter_content(chunk_size=1024 * 1024):
            buffer.write(chunk)
            if buffer.tell() > _MAX_UNCOMPRESSED_SIZE:
                raise ValueError(
                    f"SigmaHQ download exceeds limit of "
                    f"{_MAX_UNCOMPRESSED_SIZE / (1024 * 1024):.0f} MB"
                )

    content = buffer.getvalue()
    logger.info("Downloaded %.1f MB", len(content) / (1024 * 1024))
    return content


def _safe_extract_zip(zip_bytes: bytes, dest: str) -> None:
    """Unpack an in-memory ZIP archive into *dest* after validating it.

    Three checks run before anything is written to disk:

    1. the number of entries must not exceed ``_MAX_ENTRIES``;
    2. the sum of the entries' declared uncompressed sizes must not exceed
       ``_MAX_UNCOMPRESSED_SIZE``;
    3. every entry name, resolved against *dest*, must stay inside *dest*
       (Zip Slip / path traversal).

    Raises
    ------
    ValueError
        If any of the checks above fails.
    """
    root = Path(dest).resolve()
    mib = 1024 * 1024

    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as archive:
        members = archive.infolist()
        member_count = len(members)

        if member_count > _MAX_ENTRIES:
            raise ValueError(
                f"ZIP archive contains {member_count} entries "
                f"(limit: {_MAX_ENTRIES}) — possible zip bomb"
            )

        # Sizes come from the archive's own metadata, so this is checked
        # without decompressing anything.
        declared_bytes = 0
        for info in members:
            declared_bytes += info.file_size
        if declared_bytes > _MAX_UNCOMPRESSED_SIZE:
            raise ValueError(
                f"ZIP uncompressed size {declared_bytes / mib:.0f} MB "
                f"exceeds limit of {_MAX_UNCOMPRESSED_SIZE / mib:.0f} MB"
            )

        # First entry (in archive order) whose resolved path leaves *root*.
        escaping = next(
            (
                info.filename
                for info in members
                if not (root / info.filename).resolve().is_relative_to(root)
            ),
            None,
        )
        if escaping is not None:
            raise ValueError(
                f"Zip Slip detected — member '{escaping}' "
                f"resolves outside target directory"
            )

        archive.extractall(dest)


def _extract_zip(zip_bytes: bytes, dest: str) -> Path:
    """Unpack *zip_bytes* into *dest* and locate the Sigma ``rules`` folder.

    Extraction is delegated to ``_safe_extract_zip``.  The rules folder is
    expected at ``<dest>/<_ZIP_ROOT_PREFIX>/rules``.

    Returns
    -------
    Path
        Path of the extracted ``rules`` directory.

    Raises
    ------
    FileNotFoundError
        If that directory does not exist after extraction.
    """
    _safe_extract_zip(zip_bytes, dest)

    rules_dir = Path(dest, _ZIP_ROOT_PREFIX, "rules")
    if rules_dir.is_dir():
        return rules_dir

    raise FileNotFoundError(
        f"Expected rules directory not found at {rules_dir}"
    )


def _extract_attack_tags(tags: list) -> list[str]:
    """Extract MITRE technique IDs from a Sigma tag list.

    Each tag is stringified, stripped, and matched against
    ``_ATTACK_TAG_RE``; matching IDs are upper-cased.  Duplicates are removed
    and the result keeps the order in which IDs first appear in *tags*, so
    the output is the same on every run.

    Example input:  ["attack.defense_evasion", "attack.t1059.001", "cve.2021.44228"]
    Example output: ["T1059.001"]
    """
    matches = (_ATTACK_TAG_RE.match(str(tag).strip()) for tag in tags)
    # dict.fromkeys de-duplicates while preserving first-seen order; going
    # through a set would return the IDs in hash-dependent order.
    return list(dict.fromkeys(m.group(1).upper() for m in matches if m))


def _false_positive_rate(falsepositives: object) -> str:
    """Bucket a rule's ``falsepositives`` entry into "low"/"medium"/"high".

    Only a list is counted: more than three entries is "high", more than one
    is "medium"; everything else (including non-list values) is "low".
    """
    count = len(falsepositives) if isinstance(falsepositives, list) else 0
    if count > 3:
        return "high"
    if count > 1:
        return "medium"
    return "low"


def _parse_sigma_rules(rules_dir: Path) -> list[dict]:
    """Walk the rules directory and parse all Sigma YAML files.

    Files are visited in sorted path order.  A file is skipped when it cannot
    be read or parsed, when its top-level YAML value is not a mapping, when
    it has no usable ``title``, when ``tags`` is not a list, or when no
    ATT&CK technique tag is found.

    Parameters
    ----------
    rules_dir : Path
        The extracted ``rules`` directory.  ``source_id`` values are paths
        relative to its parent directory.

    Returns
    -------
    list[dict]
        A flat list of dicts, one per (rule, technique) combination.  A
        single Sigma rule tagged with N techniques produces N entries that
        differ only in ``mitre_technique_id``.
    """
    results: list[dict] = []
    yaml_files = sorted(rules_dir.rglob("*.yml"))
    logger.info("Found %d YAML files to parse", len(yaml_files))

    for yaml_path in yaml_files:
        # Read the text once: it is both parsed and stored verbatim as the
        # rule content.
        try:
            raw_content = yaml_path.read_text(encoding="utf-8")
            data = yaml.safe_load(raw_content)
        except Exception as exc:
            # Deliberately broad: one unreadable or malformed file must not
            # abort the whole import.
            logger.debug("Failed to parse %s: %s", yaml_path, exc)
            continue

        if not isinstance(data, dict):
            continue

        # ``title:`` may be present but null or non-string in a malformed
        # rule; coerce instead of calling .strip() on it directly.
        title = str(data.get("title") or "").strip()
        if not title:
            continue

        # Extract ATT&CK technique IDs from tags
        tags = data.get("tags", [])
        if not isinstance(tags, list):
            continue

        technique_ids = _extract_attack_tags(tags)
        if not technique_ids:
            continue  # Skip rules without ATT&CK mapping

        description = data.get("description", "")
        severity = _SEVERITY_MAP.get(str(data.get("level", "")).lower())

        logsource = data.get("logsource", {})
        if not isinstance(logsource, dict):
            logsource = {}

        # Everything below is identical for every technique of this rule, so
        # it is computed once per file rather than once per technique.
        relative_path = str(yaml_path.relative_to(rules_dir.parent))
        source_url = (
            f"https://github.com/SigmaHQ/sigma/blob/master/"
            # chr(92) is a backslash: normalise OS path separators for the URL.
            f"{relative_path.replace(chr(92), '/')}"
        )
        description_text = str(description)[:2000] if description else None
        fp_rate = _false_positive_rate(data.get("falsepositives", []))
        platforms = _platforms_from_logsource(logsource)

        # Create one entry per technique
        for tech_id in technique_ids:
            results.append({
                "mitre_technique_id": tech_id,
                "title": title[:500],
                "description": description_text,
                "source_id": relative_path,
                "source_url": source_url,
                "rule_content": raw_content,
                "severity": severity,
                "log_sources": logsource if logsource else None,
                "false_positive_rate": fp_rate,
                "platforms": platforms,
            })

    logger.info("Parsed %d (rule, technique) pairs total", len(results))
    return results


def _platforms_from_logsource(logsource: dict) -> "list[str] | None":
    """Infer the platform list from a Sigma ``logsource`` mapping.

    ``product`` and ``service`` are stringified and lower-cased, then
    searched for the substrings "windows", "linux" and "macos" (reported in
    that order).  A ``service`` containing "sysmon" additionally implies
    "windows" when it is not already present.

    Returns
    -------
    list[str] | None
        The inferred platforms, or ``None`` when nothing matched.
    """
    product = str(logsource.get("product", "")).lower()
    service = str(logsource.get("service", "")).lower()

    platforms = [
        name
        for name in ("windows", "linux", "macos")
        if name in product or name in service
    ]

    # Sysmon → Windows
    if "sysmon" in service and "windows" not in platforms:
        platforms.append("windows")

    # Callers receive None, not an empty list, when no platform is known.
    return platforms or None


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------


def sync(db: Session) -> dict:
    """Download and import Sigma detection rules.

    The archive is downloaded, extracted into a temporary directory and
    parsed; the temporary directory is always removed afterwards.  Parsed
    entries whose ``source_id`` is not yet stored for ``source == "sigma"``
    are inserted as ``DetectionRule`` rows, the techniques they reference are
    flagged ``review_required``, the "sigma" ``DataSource`` row (if any) is
    updated, and an audit entry is written.

    Only one row is created per ``source_id``: when a rule file maps to
    several techniques, the first parsed entry is inserted and the remaining
    ones are counted as skipped.

    Parameters
    ----------
    db : Session
        Active SQLAlchemy database session.

    Returns
    -------
    dict
        Summary with ``created``, ``skipped_existing``, ``total_parsed``.
        ``skipped_existing`` also includes the additional per-technique
        entries of files inserted during this run.

    Raises
    ------
    Exception
        Any download, extraction or database error is propagated; pending
        database changes are rolled back first.
    """
    # Function-scope import: the module itself only imports ``datetime``.
    from datetime import timezone

    tmp_dir = tempfile.mkdtemp(prefix="aegis_sigma_")
    try:
        # The downloaded bytes are not bound to a local name, so they can be
        # released as soon as extraction has finished.
        rules_dir = _extract_zip(_download_zip(), tmp_dir)
        parsed_rules = _parse_sigma_rules(rules_dir)
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)
        logger.info("Cleaned up temp directory %s", tmp_dir)

    try:
        # NOTE(review): ``DetectionRule`` is not imported anywhere in the
        # visible module, so this raises NameError as written — add the
        # model import at the top of the file (module path to be confirmed).

        # Pre-load existing source_ids for dedup
        existing_ids: set[str] = {
            row[0]
            for row in db.query(DetectionRule.source_id)
            .filter(DetectionRule.source == "sigma")
            .filter(DetectionRule.source_id.isnot(None))
            .all()
        }

        created = 0
        skipped = 0
        new_technique_ids: set[str] = set()

        for item in parsed_rules:
            # Deduplicate by source_id: one rule file may map to multiple
            # techniques, but we skip insertion if this source_id was already
            # imported (in a previous run or earlier in this loop).
            if item["source_id"] in existing_ids:
                skipped += 1
                continue

            db.add(
                DetectionRule(
                    mitre_technique_id=item["mitre_technique_id"],
                    title=item["title"],
                    description=item["description"],
                    source="sigma",
                    source_id=item["source_id"],
                    source_url=item["source_url"],
                    rule_content=item["rule_content"],
                    rule_format="sigma_yaml",
                    severity=item["severity"],
                    platforms=item["platforms"],
                    log_sources=item["log_sources"],
                    false_positive_rate=item["false_positive_rate"],
                    is_active=True,
                )
            )
            existing_ids.add(item["source_id"])
            new_technique_ids.add(item["mitre_technique_id"])
            created += 1

        # Flag techniques that received new rules for review
        if new_technique_ids:
            db.query(Technique).filter(
                Technique.mitre_id.in_(new_technique_ids)
            ).update({"review_required": True}, synchronize_session=False)

        db.commit()

        summary = {
            "created": created,
            "skipped_existing": skipped,
            "total_parsed": len(parsed_rules),
        }

        # Update DataSource record
        ds = db.query(DataSource).filter(DataSource.name == "sigma").first()
        if ds:
            # Same naive-UTC value datetime.utcnow() produced, obtained
            # without the API deprecated since Python 3.12.
            ds.last_sync_at = datetime.now(timezone.utc).replace(tzinfo=None)
            ds.last_sync_status = "success"
            ds.last_sync_stats = summary
            db.commit()

        logger.info("Sigma import complete — %s", summary)

        log_action(
            db,
            user_id=None,
            action="import_sigma_rules",
            entity_type="detection_rule",
            entity_id=None,
            details=summary,
        )
        db.commit()
    except Exception:
        # Leave the caller's session usable instead of holding a failed or
        # half-flushed transaction; work already committed is unaffected.
        db.rollback()
        raise

    return summary