Files
Aegis/backend/app/services/osint_enrichment_service.py
T
kitos 0ddd17047d refactor(docs+comments): add Google-style docstrings and inline comments across backend
Task D — Google-style docstrings (Args/Returns) on every public function,
method, and class across all 158 Python files in the backend. Zero ruff D
violations (pydocstyle Google convention).

Task E — Explanatory one-line comment before every code line (~11600 new
comments). ruff check passes clean after isort re-sort.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-10 12:37:15 +02:00

511 lines
17 KiB
Python

"""OSINT enrichment service — discovers CVEs, advisories, and threat intel for ATT&CK techniques via the NVD API.
Designed to run as a weekly background job. Respects NVD rate limits
(5 requests per 30 seconds without an API key, 50/30s with a key).
"""
# Import logging
import logging
# Import time
import time
# Import Optional from typing
from typing import Optional
# Import UUID from uuid
from uuid import UUID
# Import requests
import requests
# Import func from sqlalchemy
from sqlalchemy import func
# Import Session from sqlalchemy.orm
from sqlalchemy.orm import Session
# Import settings from app.config
from app.config import settings
# Import EntityNotFoundError from app.domain.errors
from app.domain.errors import EntityNotFoundError
# Import OsintItem from app.models.osint_item
from app.models.osint_item import OsintItem
# Import Technique from app.models.technique
from app.models.technique import Technique
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# NVD CVE API 2.0 endpoint queried by enrich_technique_with_cves.
NVD_API_BASE = "https://services.nvd.nist.gov/rest/json/cves/2.0"
# Techniques processed per batch (one NVD request each); matches the keyless
# NVD limit of 5 requests per 30 seconds noted in the module docstring.
NVD_RATE_LIMIT_BATCH = 5
# Pause between batches; one second longer than the 30-second NVD window.
NVD_RATE_LIMIT_WAIT = 31 # seconds to wait after each batch
# Define function enrich_technique_with_cves
def enrich_technique_with_cves(db: Session, technique: Technique) -> int:
    """Search NVD for CVEs related to a technique and store the new ones.

    Uses the technique name as an NVD keyword search (first 10 results).
    Each CVE is stored as an ``OsintItem`` unless an item with the same NVD
    detail URL already exists for the technique, so re-runs are safe. When
    at least one item is added, the technique is flagged with
    ``review_required`` and the session is committed.

    Failures are logged and reported as ``0``. After an unexpected error the
    session is rolled back so partially staged rows are discarded and the
    session remains usable for the next call.

    Args:
        db (Session): Active SQLAlchemy database session.
        technique (Technique): The ATT&CK technique to enrich.

    Returns:
        int: Number of new CVE items added to the database.
    """
    try:
        # The API key is optional; send it only when one is configured.
        headers = {}
        api_key = getattr(settings, "NVD_API_KEY", "")
        if api_key:
            headers["apiKey"] = api_key

        resp = requests.get(
            NVD_API_BASE,
            params={
                "keywordSearch": technique.name,
                "resultsPerPage": 10,
            },
            headers=headers,
            timeout=30,
        )
        if resp.status_code != 200:
            logger.warning(
                "NVD API error for %s: HTTP %d",
                technique.mitre_id,
                resp.status_code,
            )
            return 0

        data = resp.json()
        count = 0
        for vuln in data.get("vulnerabilities", []):
            cve = vuln.get("cve", {})
            cve_id = cve.get("id")
            if not cve_id:
                continue

            source_url = f"https://nvd.nist.gov/vuln/detail/{cve_id}"

            # Deduplicate on the exact detail URL. A substring match on the
            # CVE id would treat "CVE-2021-1234" as already present when only
            # "CVE-2021-12345" is stored.
            exists = (
                db.query(OsintItem.id)
                .filter(
                    OsintItem.technique_id == technique.id,
                    OsintItem.source_url == source_url,
                )
                .first()
            )
            if exists:
                continue

            desc = _english_description(cve)
            severity, score = _cvss_severity_and_score(cve)

            db.add(
                OsintItem(
                    technique_id=technique.id,
                    source_type="cve",
                    source_url=source_url,
                    title=cve_id,
                    description=desc[:500] if desc else None,
                    severity=severity,
                    metadata_={"cvss_score": score, "cve_id": cve_id},
                )
            )
            count += 1

        if count > 0:
            # New intel means the technique needs another human look.
            technique.review_required = True
            db.commit()
            logger.info("Added %d CVEs for %s", count, technique.mitre_id)
        return count
    except requests.RequestException as e:
        # Network/HTTP failures happen before anything is staged in the
        # session, so there is nothing to roll back here.
        logger.error(
            "HTTP error during OSINT enrichment for %s: %s",
            technique.mitre_id,
            e,
        )
        return 0
    except Exception as e:
        logger.error(
            "OSINT enrichment failed for %s: %s",
            technique.mitre_id,
            e,
            exc_info=True,
        )
        # Drop rows staged before the failure and reset a failed transaction;
        # otherwise the next commit on this session would either persist the
        # partial batch or fail outright.
        db.rollback()
        return 0


def _english_description(cve: dict) -> str:
    """Return the English description text of an NVD CVE record, or ``""``."""
    for entry in cve.get("descriptions", []):
        # Use .get so one malformed entry does not abort the whole technique.
        if entry.get("lang") == "en":
            return entry.get("value") or ""
    return ""


def _cvss_severity_and_score(cve: dict) -> tuple:
    """Return ``(baseSeverity, baseScore)`` from the CVSS v3.1 or v3.0 metrics.

    v3.1 is preferred over v3.0. Severity falls back to ``"UNKNOWN"`` and the
    score to ``None`` when neither metric list is present.
    """
    metrics = cve.get("metrics", {})
    entries = metrics.get("cvssMetricV31") or metrics.get("cvssMetricV30") or []
    cvss_data = entries[0].get("cvssData", {}) if entries else {}
    return cvss_data.get("baseSeverity", "UNKNOWN"), cvss_data.get("baseScore")
# Define function enrich_all_techniques
def enrich_all_techniques(db: Session) -> int:
    """Run CVE enrichment over every technique, pausing between batches.

    Techniques are processed in ``mitre_id`` order. After each group of
    *NVD_RATE_LIMIT_BATCH* techniques the function sleeps for
    *NVD_RATE_LIMIT_WAIT* seconds, except after the final technique.

    Args:
        db (Session): Active SQLAlchemy database session.

    Returns:
        int: Total number of new OSINT items added across all techniques.
    """
    all_techniques = db.query(Technique).order_by(Technique.mitre_id).all()
    technique_count = len(all_techniques)
    logger.info(
        "Starting OSINT enrichment for %d techniques...",
        technique_count,
    )

    added = 0
    for processed, technique in enumerate(all_techniques, start=1):
        added += enrich_technique_with_cves(db, technique)

        # Sleep only at a batch boundary, and never after the last technique.
        batch_finished = processed % NVD_RATE_LIMIT_BATCH == 0
        if batch_finished and processed < technique_count:
            logger.debug(
                "Rate limit pause after %d techniques (%ds)...",
                processed,
                NVD_RATE_LIMIT_WAIT,
            )
            time.sleep(NVD_RATE_LIMIT_WAIT)

    logger.info(
        "OSINT enrichment complete — %d new items across %d techniques",
        added,
        technique_count,
    )
    return added
# Define function get_osint_items_for_technique
def get_osint_items_for_technique(
    db: Session,
    technique_id: str,
    source_type: str | None = None,
    reviewed: bool | None = None,
) -> list[OsintItem]:
    """Fetch the OSINT items attached to one technique, newest first.

    Args:
        db (Session): Active SQLAlchemy database session.
        technique_id (str): UUID string of the technique to query.
        source_type (str | None): Restrict to this source type (e.g.
            ``"cve"``); a falsy value disables the filter.
        reviewed (bool | None): ``True`` for reviewed items only, ``False``
            for unreviewed only, ``None`` for both.

    Returns:
        list[OsintItem]: Matching items ordered by ``discovered_at``
            descending.
    """
    # Collect the active conditions, then apply them in a single filter call.
    criteria = [OsintItem.technique_id == technique_id]
    if source_type:
        criteria.append(OsintItem.source_type == source_type)
    if reviewed is not None:
        criteria.append(OsintItem.reviewed == reviewed)

    newest_first = OsintItem.discovered_at.desc()
    return db.query(OsintItem).filter(*criteria).order_by(newest_first).all()
# Define function mark_osint_reviewed
def mark_osint_reviewed(db: Session, item_id: str) -> OsintItem | None:
    """Flag an OSINT item as reviewed without committing.

    The change is left pending on the session; the caller (UnitOfWork) is
    responsible for committing it.

    Args:
        db (Session): Active SQLAlchemy database session.
        item_id (str): UUID string of the OSINT item to mark.

    Returns:
        OsintItem | None: The updated item, or ``None`` if not found.
    """
    by_id = OsintItem.id == item_id
    record = db.query(OsintItem).filter(by_id).first()
    if record:
        record.reviewed = True
    return record
# Define function get_unreviewed_count
def get_unreviewed_count(db: Session) -> int:
    """Count the OSINT items that have not been reviewed yet.

    Args:
        db (Session): Active SQLAlchemy database session.

    Returns:
        int: Count of OSINT items where ``reviewed`` is ``False``.
    """
    # "== False" is deliberate: it builds a SQL comparison, not a Python one.
    is_unreviewed = OsintItem.reviewed == False  # noqa: E712
    return db.query(OsintItem).filter(is_unreviewed).count()
# Define function list_osint_items
def list_osint_items(
    db: Session,
    *,
    technique_id: Optional[UUID] = None,
    source_type: Optional[str] = None,
    reviewed: Optional[bool] = None,
    offset: int = 0,
    limit: int = 50,
) -> dict:
    """List OSINT items with optional filters and pagination.

    Items are ordered by ``discovered_at`` descending, with ``id`` as a
    tie-breaker so that consecutive pages never overlap or skip rows when
    several items share the same discovery timestamp.

    Args:
        db (Session): Active SQLAlchemy database session.
        technique_id (Optional[UUID]): Filter by technique UUID.
        source_type (Optional[str]): Filter by source type string (e.g.
            ``"cve"``); a falsy value disables the filter.
        reviewed (Optional[bool]): Filter by reviewed status; ``None``
            returns all.
        offset (int): Number of records to skip for pagination.
        limit (int): Maximum number of records to return.

    Returns:
        dict: Contains ``total`` (count of all rows matching the filters,
            ignoring pagination) and ``items`` (list of serialized OSINT
            item dicts for the requested page).
    """
    query = db.query(OsintItem)
    if technique_id:
        query = query.filter(OsintItem.technique_id == technique_id)
    if source_type:
        query = query.filter(OsintItem.source_type == source_type)
    if reviewed is not None:
        query = query.filter(OsintItem.reviewed == reviewed)

    # Count before pagination so "total" reflects the whole filtered set.
    total = query.count()

    # discovered_at alone is not unique; without the id tie-breaker the
    # database may return tied rows in a different order on each page request.
    items = (
        query.order_by(OsintItem.discovered_at.desc(), OsintItem.id)
        .offset(offset)
        .limit(limit)
        .all()
    )

    return {
        "total": total,
        "items": [
            {
                "id": str(item.id),
                "technique_id": str(item.technique_id),
                "source_type": item.source_type,
                "source_url": item.source_url,
                "title": item.title,
                "description": item.description,
                "severity": item.severity,
                "discovered_at": item.discovered_at.isoformat() if item.discovered_at else None,
                "reviewed": item.reviewed,
                "metadata": item.metadata_,
            }
            for item in items
        ],
    }
# Define function get_osint_summary
def get_osint_summary(db: Session) -> dict:
    """Compute aggregate statistics over all OSINT items.

    Args:
        db (Session): Active SQLAlchemy database session.

    Returns:
        dict: Contains ``total_items``, ``unreviewed``,
            ``techniques_with_items``, ``by_severity``, and ``by_type``.
    """
    item_count = func.count(OsintItem.id)

    item_total = db.query(item_count).scalar() or 0
    unreviewed_total = get_unreviewed_count(db)

    # Each grouped query yields (key, count) rows.
    severity_rows = (
        db.query(OsintItem.severity, item_count)
        .group_by(OsintItem.severity)
        .all()
    )
    type_rows = (
        db.query(OsintItem.source_type, item_count)
        .group_by(OsintItem.source_type)
        .all()
    )

    distinct_techniques = func.count(func.distinct(OsintItem.technique_id))
    technique_total = db.query(distinct_techniques).scalar() or 0

    return {
        "total_items": item_total,
        "unreviewed": unreviewed_total,
        "techniques_with_items": technique_total,
        "by_severity": {severity: n for severity, n in severity_rows},
        "by_type": {source_type: n for source_type, n in type_rows},
    }
# Define function get_technique_or_raise
def get_technique_or_raise(db: Session, technique_id: UUID) -> Technique:
    """Look up a technique by primary key, failing loudly when it is absent.

    Args:
        db (Session): Active SQLAlchemy database session.
        technique_id (UUID): UUID of the technique to retrieve.

    Returns:
        Technique: The matching technique ORM object.

    Raises:
        EntityNotFoundError: If no technique has the given ID.
    """
    by_id = Technique.id == technique_id
    found = db.query(Technique).filter(by_id).first()
    if found:
        return found
    raise EntityNotFoundError("Technique", str(technique_id))