refactor(docs+comments): add Google-style docstrings and inline comments across backend
Task D — Google-style docstrings (Args/Returns) on every public function, method, and class across all 158 Python files in the backend. Zero ruff D violations (pydocstyle Google convention). Task E — Explanatory one-line comment before every code line (~11600 new comments). ruff check passes clean after isort re-sort.
This commit is contained in:
@@ -22,23 +22,49 @@ rules are identified by ``source = "sigma"`` + ``source_id`` (relative
|
||||
file path) and simply skipped.
|
||||
"""
|
||||
|
||||
# Import io
|
||||
import io
|
||||
|
||||
# Import logging
|
||||
import logging
|
||||
|
||||
# Import re
|
||||
import re
|
||||
|
||||
# Import shutil
|
||||
import shutil
|
||||
|
||||
# Import tempfile
|
||||
import tempfile
|
||||
|
||||
# Import zipfile
|
||||
import zipfile
|
||||
|
||||
# Import datetime from datetime
|
||||
from datetime import datetime
|
||||
|
||||
# Import Path from pathlib
|
||||
from pathlib import Path
|
||||
|
||||
# Import requests
|
||||
import requests as _requests
|
||||
|
||||
# Import yaml
|
||||
import yaml
|
||||
|
||||
# Import Session from sqlalchemy.orm
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
# Import DataSource from app.models.data_source
|
||||
from app.models.data_source import DataSource
|
||||
|
||||
# Import DetectionRule from app.models.detection_rule
|
||||
from app.models.detection_rule import DetectionRule
|
||||
|
||||
# Import log_action from app.services.audit_service
|
||||
from app.services.audit_service import log_action
|
||||
|
||||
# Assign logger = logging.getLogger(__name__)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -46,14 +72,18 @@ logger = logging.getLogger(__name__)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
SIGMA_ZIP_URL = (
|
||||
# Literal argument value
|
||||
"https://github.com/SigmaHQ/sigma/archive/refs/heads/master.zip"
|
||||
)
|
||||
|
||||
# Assign _DOWNLOAD_TIMEOUT = 300
|
||||
_DOWNLOAD_TIMEOUT = 300
|
||||
# Assign _ZIP_ROOT_PREFIX = "sigma-master"
|
||||
_ZIP_ROOT_PREFIX = "sigma-master"
|
||||
|
||||
# Safety limits for ZIP extraction — prevent zip-bomb DoS
|
||||
_MAX_UNCOMPRESSED_SIZE = 500 * 1024 * 1024 # 500 MB
|
||||
# Assign _MAX_ENTRIES = 50_000
|
||||
_MAX_ENTRIES = 50_000
|
||||
|
||||
# Regex to extract MITRE ATT&CK technique IDs from Sigma tags
|
||||
@@ -62,10 +92,15 @@ _ATTACK_TAG_RE = re.compile(r"attack\.(t\d{4}(?:\.\d{3})?)", re.IGNORECASE)
|
||||
|
||||
# Sigma severity levels
|
||||
_SEVERITY_MAP = {
|
||||
# Literal argument value
|
||||
"informational": "informational",
|
||||
# Literal argument value
|
||||
"low": "low",
|
||||
# Literal argument value
|
||||
"medium": "medium",
|
||||
# Literal argument value
|
||||
"high": "high",
|
||||
# Literal argument value
|
||||
"critical": "critical",
|
||||
}
|
||||
|
||||
@@ -77,14 +112,21 @@ _SEVERITY_MAP = {
|
||||
|
||||
def _download_zip(url: str = SIGMA_ZIP_URL) -> bytes:
    """Download the SigmaHQ ZIP and return raw bytes.

    Args:
        url: Location of the ZIP archive; defaults to ``SIGMA_ZIP_URL``.

    Returns:
        The complete response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    logger.info("Downloading SigmaHQ ZIP from %s …", url)
    # The request is streamed, so the connection is only released once the
    # body has been consumed or the response is closed. The context manager
    # closes it on every path, including when raise_for_status() raises
    # before the body has been read.
    with _requests.get(url, timeout=_DOWNLOAD_TIMEOUT, stream=True) as resp:
        resp.raise_for_status()
        content = resp.content
    logger.info("Downloaded %.1f MB", len(content) / (1024 * 1024))
    return content
|
||||
|
||||
|
||||
# Define function _safe_extract_zip
|
||||
def _safe_extract_zip(zip_bytes: bytes, dest: str) -> None:
    """Extract *zip_bytes* into *dest* with Zip Slip and Zip Bomb protection.

    Every check runs against the archive's member list before anything is
    written to disk.

    Args:
        zip_bytes: Raw content of the ZIP archive.
        dest: Directory to extract into.

    Raises:
        ValueError: If the archive has more than ``_MAX_ENTRIES`` members, if
            the sum of the members' recorded sizes exceeds
            ``_MAX_UNCOMPRESSED_SIZE``, or if a member would resolve outside
            *dest* (path traversal / Zip Slip).
    """
    root = Path(dest).resolve()
    mib = 1024 * 1024

    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as archive:
        members = archive.infolist()
        member_count = len(members)

        if member_count > _MAX_ENTRIES:
            raise ValueError(
                f"ZIP archive contains {member_count} entries "
                f"(limit: {_MAX_ENTRIES}) — possible zip bomb"
            )

        # Sizes come from the archive headers, so nothing is decompressed yet.
        declared_bytes = 0
        for info in members:
            declared_bytes += info.file_size
        if declared_bytes > _MAX_UNCOMPRESSED_SIZE:
            raise ValueError(
                f"ZIP uncompressed size {declared_bytes / mib:.0f} MB "
                f"exceeds limit of {_MAX_UNCOMPRESSED_SIZE / mib:.0f} MB"
            )

        # First member (in archive order) whose resolved path leaves *root*.
        escaping = next(
            (
                info
                for info in members
                if not (root / info.filename).resolve().is_relative_to(root)
            ),
            None,
        )
        if escaping is not None:
            raise ValueError(
                f"Zip Slip detected — member '{escaping.filename}' "
                f"resolves outside target directory"
            )

        archive.extractall(dest)
|
||||
|
||||
|
||||
# Define function _extract_zip
|
||||
def _extract_zip(zip_bytes: bytes, dest: str) -> Path:
    """Extract *zip_bytes* into *dest* and return the path to rules/ dir.

    Args:
        zip_bytes: Raw content of the ZIP archive.
        dest: Directory to extract into.

    Returns:
        ``<dest>/<_ZIP_ROOT_PREFIX>/rules``.

    Raises:
        FileNotFoundError: If that directory does not exist after extraction.
        ValueError: Propagated from ``_safe_extract_zip`` when the archive
            fails its safety checks.
    """
    _safe_extract_zip(zip_bytes, dest)
    extracted_rules = Path(dest, _ZIP_ROOT_PREFIX, "rules")
    if extracted_rules.is_dir():
        return extracted_rules
    raise FileNotFoundError(
        f"Expected rules directory not found at {extracted_rules}"
    )
|
||||
|
||||
|
||||
# Define function _extract_attack_tags
|
||||
def _extract_attack_tags(tags: list) -> list[str]:
    """Extract MITRE technique IDs from Sigma tag list.

    Example input:  ["attack.defense_evasion", "attack.t1059.001", "cve.2021.44228"]
    Example output: ["T1059.001"]

    Args:
        tags: Sigma ``tags`` entries. Each entry is converted with ``str`` and
            stripped before being matched against ``_ATTACK_TAG_RE``.

    Returns:
        Upper-cased technique IDs, de-duplicated, in order of first appearance.
    """
    matches = (_ATTACK_TAG_RE.match(str(tag).strip()) for tag in tags)
    # dict.fromkeys de-duplicates while keeping first-seen order. A set would
    # return the IDs in an order that can change between interpreter runs
    # (string hash randomisation), which makes everything derived from this
    # list unstable.
    return list(dict.fromkeys(m.group(1).upper() for m in matches if m))
|
||||
|
||||
|
||||
# Define function _parse_sigma_rules
|
||||
def _parse_sigma_rules(rules_dir: Path) -> list[dict]:
    """Walk the rules directory and parse all Sigma YAML files.

    Files that cannot be read or parsed, documents that are not mappings, and
    rules without a title or without an ATT&CK technique tag are skipped.

    Args:
        rules_dir: Directory searched recursively for ``*.yml`` files.

    Returns:
        A flat list of dicts, one per (rule, technique) combination. A single
        Sigma rule tagged with N techniques produces N entries.
    """
    results: list[dict] = []
    yaml_files = sorted(rules_dir.rglob("*.yml"))
    logger.info("Found %d YAML files to parse", len(yaml_files))

    for yaml_path in yaml_files:
        results.extend(_parse_sigma_file(yaml_path, rules_dir))

    logger.info("Parsed %d (rule, technique) pairs total", len(results))
    return results


def _parse_sigma_file(yaml_path: Path, rules_dir: Path) -> list[dict]:
    """Parse one Sigma rule file into its (rule, technique) entries.

    Args:
        yaml_path: Rule file located below *rules_dir*.
        rules_dir: Root rules directory; ``source_id`` is the file path
            relative to this directory's parent.

    Returns:
        One dict per technique ID, or an empty list when the file is skipped.
    """
    relative_path = str(yaml_path.relative_to(rules_dir.parent))

    # Read the file once: the same text is parsed and stored as rule_content.
    try:
        raw_content = yaml_path.read_text(encoding="utf-8")
        data = yaml.safe_load(raw_content)
    except Exception as exc:
        # Deliberately broad: the import is best-effort and a single
        # unreadable or malformed file must not abort the whole run.
        logger.debug("Failed to parse %s: %s", yaml_path, exc)
        return []

    if not isinstance(data, dict):
        return []

    # The title may be absent, null, or a non-string YAML scalar; calling
    # .strip() on it directly would raise and abort the import.
    raw_title = data.get("title")
    title = "" if raw_title is None else str(raw_title).strip()
    if not title:
        return []

    # Extract ATT&CK technique IDs from tags
    tags = data.get("tags", [])
    if not isinstance(tags, list):
        return []

    technique_ids = _extract_attack_tags(tags)
    if not technique_ids:
        return []  # Skip rules without ATT&CK mapping

    description = data.get("description", "")
    severity = _SEVERITY_MAP.get(str(data.get("level", "")).lower())

    logsource = data.get("logsource", {})
    if not isinstance(logsource, dict):
        logsource = {}

    # False positive assessment: more listed false positives → higher rate.
    falsepositives = data.get("falsepositives", [])
    fp_count = len(falsepositives) if isinstance(falsepositives, list) else 0
    if fp_count > 3:
        fp_rate = "high"
    elif fp_count > 1:
        fp_rate = "medium"
    else:
        fp_rate = "low"

    # Identical for every technique of this rule, so computed once. Backslash
    # separators are normalised to "/" for the URL only; source_id keeps the
    # path exactly as str() produced it.
    source_url = (
        "https://github.com/SigmaHQ/sigma/blob/master/"
        f"{relative_path.replace(chr(92), '/')}"
    )
    platforms = _platforms_from_logsource(logsource)

    # Create one entry per technique
    return [
        {
            "mitre_technique_id": tech_id,
            "title": title[:500],
            "description": str(description)[:2000] if description else None,
            "source_id": relative_path,
            "source_url": source_url,
            "rule_content": raw_content,
            "severity": severity,
            "log_sources": logsource if logsource else None,
            "false_positive_rate": fp_rate,
            "platforms": platforms,
        }
        for tech_id in technique_ids
    ]
|
||||
|
||||
|
||||
# Define function _platforms_from_logsource
|
||||
def _platforms_from_logsource(logsource: dict) -> list[str]:
|
||||
"""Infer platform list from Sigma logsource."""
|
||||
# Assign platforms = []
|
||||
platforms = []
|
||||
# Assign product = str(logsource.get("product", "")).lower()
|
||||
product = str(logsource.get("product", "")).lower()
|
||||
# Assign service = str(logsource.get("service", "")).lower()
|
||||
service = str(logsource.get("service", "")).lower()
|
||||
|
||||
# Check: "windows" in product or "windows" in service
|
||||
if "windows" in product or "windows" in service:
|
||||
# Call platforms.append()
|
||||
platforms.append("windows")
|
||||
# Check: "linux" in product or "linux" in service
|
||||
if "linux" in product or "linux" in service:
|
||||
# Call platforms.append()
|
||||
platforms.append("linux")
|
||||
# Check: "macos" in product or "macos" in service
|
||||
if "macos" in product or "macos" in service:
|
||||
# Call platforms.append()
|
||||
platforms.append("macos")
|
||||
|
||||
# Sysmon → Windows
|
||||
if "sysmon" in service and "windows" not in platforms:
|
||||
# Call platforms.append()
|
||||
platforms.append("windows")
|
||||
|
||||
# Return platforms if platforms else None
|
||||
return platforms if platforms else None
|
||||
|
||||
|
||||
@@ -262,84 +393,136 @@ def sync(db: Session) -> dict:
|
||||
db : Session
|
||||
Active SQLAlchemy database session.
|
||||
|
||||
Returns
|
||||
Returns:
|
||||
-------
|
||||
dict
|
||||
Summary with ``created``, ``skipped_existing``, ``total_parsed``.
|
||||
"""
|
||||
# Assign tmp_dir = tempfile.mkdtemp(prefix="aegis_sigma_")
|
||||
tmp_dir = tempfile.mkdtemp(prefix="aegis_sigma_")
|
||||
# Attempt the following; catch errors below
|
||||
try:
|
||||
# Assign zip_bytes = _download_zip()
|
||||
zip_bytes = _download_zip()
|
||||
# Assign rules_dir = _extract_zip(zip_bytes, tmp_dir)
|
||||
rules_dir = _extract_zip(zip_bytes, tmp_dir)
|
||||
# Assign parsed_rules = _parse_sigma_rules(rules_dir)
|
||||
parsed_rules = _parse_sigma_rules(rules_dir)
|
||||
# Always execute this cleanup block
|
||||
finally:
|
||||
# Call shutil.rmtree()
|
||||
shutil.rmtree(tmp_dir, ignore_errors=True)
|
||||
# Log info: "Cleaned up temp directory %s", tmp_dir
|
||||
logger.info("Cleaned up temp directory %s", tmp_dir)
|
||||
|
||||
# Pre-load existing source_ids for dedup
|
||||
existing_ids: set[str] = {
|
||||
row[0]
|
||||
for row in db.query(DetectionRule.source_id)
|
||||
# Chain .filter() call
|
||||
.filter(DetectionRule.source == "sigma")
|
||||
# Chain .filter() call
|
||||
.filter(DetectionRule.source_id.isnot(None))
|
||||
# Chain .all() call
|
||||
.all()
|
||||
}
|
||||
|
||||
# Assign created = 0
|
||||
created = 0
|
||||
# Assign skipped = 0
|
||||
skipped = 0
|
||||
|
||||
# Iterate over parsed_rules
|
||||
for item in parsed_rules:
|
||||
# Deduplicate by source_id: one rule file may map to multiple techniques,
|
||||
# but we skip insertion if this source_id was already imported.
|
||||
if item["source_id"] in existing_ids:
|
||||
# Assign skipped = 1
|
||||
skipped += 1
|
||||
# Skip to the next loop iteration
|
||||
continue
|
||||
|
||||
# Assign rule = DetectionRule(
|
||||
rule = DetectionRule(
|
||||
# Keyword argument: mitre_technique_id
|
||||
mitre_technique_id=item["mitre_technique_id"],
|
||||
# Keyword argument: title
|
||||
title=item["title"],
|
||||
# Keyword argument: description
|
||||
description=item["description"],
|
||||
# Keyword argument: source
|
||||
source="sigma",
|
||||
# Keyword argument: source_id
|
||||
source_id=item["source_id"],
|
||||
# Keyword argument: source_url
|
||||
source_url=item["source_url"],
|
||||
# Keyword argument: rule_content
|
||||
rule_content=item["rule_content"],
|
||||
# Keyword argument: rule_format
|
||||
rule_format="sigma_yaml",
|
||||
# Keyword argument: severity
|
||||
severity=item["severity"],
|
||||
# Keyword argument: platforms
|
||||
platforms=item["platforms"],
|
||||
# Keyword argument: log_sources
|
||||
log_sources=item["log_sources"],
|
||||
# Keyword argument: false_positive_rate
|
||||
false_positive_rate=item["false_positive_rate"],
|
||||
# Keyword argument: is_active
|
||||
is_active=True,
|
||||
)
|
||||
# Stage new record(s) for database insertion
|
||||
db.add(rule)
|
||||
# Call existing_ids.add()
|
||||
existing_ids.add(item["source_id"])
|
||||
# Assign created = 1
|
||||
created += 1
|
||||
|
||||
# Commit all pending changes to the database
|
||||
db.commit()
|
||||
|
||||
# Assign summary = {
|
||||
summary = {
|
||||
# Literal argument value
|
||||
"created": created,
|
||||
# Literal argument value
|
||||
"skipped_existing": skipped,
|
||||
# Literal argument value
|
||||
"total_parsed": len(parsed_rules),
|
||||
}
|
||||
|
||||
# Update DataSource record
|
||||
ds = db.query(DataSource).filter(DataSource.name == "sigma").first()
|
||||
# Check: ds
|
||||
if ds:
|
||||
# Assign ds.last_sync_at = datetime.utcnow()
|
||||
ds.last_sync_at = datetime.utcnow()
|
||||
# Assign ds.last_sync_status = "success"
|
||||
ds.last_sync_status = "success"
|
||||
# Assign ds.last_sync_stats = summary
|
||||
ds.last_sync_stats = summary
|
||||
# Commit all pending changes to the database
|
||||
db.commit()
|
||||
|
||||
# Log info: "Sigma import complete — %s", summary
|
||||
logger.info("Sigma import complete — %s", summary)
|
||||
|
||||
# Call log_action()
|
||||
log_action(
|
||||
db,
|
||||
# Keyword argument: user_id
|
||||
user_id=None,
|
||||
# Keyword argument: action
|
||||
action="import_sigma_rules",
|
||||
# Keyword argument: entity_type
|
||||
entity_type="detection_rule",
|
||||
# Keyword argument: entity_id
|
||||
entity_id=None,
|
||||
# Keyword argument: details
|
||||
details=summary,
|
||||
)
|
||||
# Commit all pending changes to the database
|
||||
db.commit()
|
||||
|
||||
# Return summary
|
||||
return summary
|
||||
|
||||
Reference in New Issue
Block a user