feat(phase-38): automatic intelligence — OSINT enrichment + stale coverage detection

Tarea 4.1 — OSINT Enrichment:
- Add OsintItem model with source_type, severity, CVSS metadata, review flag
- Add Alembic migration b022 with osint_items table and optimized indexes
- Add osint_enrichment_service with NVD API integration, deduplication, rate limiting
- Add OSINT router: GET /osint/items, /osint/summary, /osint/technique/{id}
- Add POST /osint/items/{id}/review to mark items as reviewed
- Add POST /osint/enrich/{technique_id} for manual single-technique enrichment
- Techniques with new CVEs are automatically flagged review_required=True
- Register weekly enrichment job in APScheduler
- Add NVD_API_KEY config setting for optional increased rate limits

Tarea 4.2 — Stale Coverage Detection:
- Add stale_detection_service that flags techniques with no validated test
  in the last N days, or never-validated but with a coverage status
- Configurable threshold via STALE_THRESHOLD_DAYS setting (default 365)
- Register daily stale detection job in APScheduler
- Only flags techniques not already marked review_required
This commit is contained in:
2026-02-17 17:47:47 +01:00
parent 31e116b4ba
commit 222979574a
9 changed files with 607 additions and 2 deletions

View File

@@ -0,0 +1,191 @@
"""OSINT enrichment service — automatically discovers CVEs, advisories, and
related intelligence for MITRE ATT&CK techniques using the NVD API.
Designed to run as a weekly background job. Respects NVD rate limits
(5 requests per 30 seconds without an API key, 50/30s with a key).
"""
import logging
import time
import requests
from sqlalchemy.orm import Session
from app.config import settings
from app.models.osint_item import OsintItem
from app.models.technique import Technique
logger = logging.getLogger(__name__)
NVD_API_BASE = "https://services.nvd.nist.gov/rest/json/cves/2.0"
NVD_RATE_LIMIT_BATCH = 5
NVD_RATE_LIMIT_WAIT = 31 # seconds to wait after each batch
def enrich_technique_with_cves(db: Session, technique: Technique) -> int:
"""Search for CVEs related to a technique via the NVD API.
Uses the technique name as a keyword search. Deduplicates against
existing OsintItems so re-runs are safe.
Returns the number of new CVEs added.
"""
try:
headers = {}
if getattr(settings, "NVD_API_KEY", ""):
headers["apiKey"] = settings.NVD_API_KEY
params = {
"keywordSearch": technique.name,
"resultsPerPage": 10,
}
resp = requests.get(
NVD_API_BASE,
params=params,
headers=headers,
timeout=30,
)
if resp.status_code != 200:
logger.warning(
"NVD API error for %s: HTTP %d",
technique.mitre_id,
resp.status_code,
)
return 0
data = resp.json()
count = 0
for vuln in data.get("vulnerabilities", []):
cve = vuln.get("cve", {})
cve_id = cve.get("id")
if not cve_id:
continue
# Deduplicate
exists = (
db.query(OsintItem.id)
.filter(
OsintItem.technique_id == technique.id,
OsintItem.source_url.contains(cve_id),
)
.first()
)
if exists:
continue
descriptions = cve.get("descriptions", [])
desc = next(
(d["value"] for d in descriptions if d["lang"] == "en"), ""
)
# Extract CVSS severity
metrics = cve.get("metrics", {})
cvss_v31 = metrics.get("cvssMetricV31", [])
cvss_v30 = metrics.get("cvssMetricV30", [])
cvss_entry = (cvss_v31[0] if cvss_v31 else cvss_v30[0]) if (cvss_v31 or cvss_v30) else {}
cvss_data = cvss_entry.get("cvssData", {}) if cvss_entry else {}
severity = cvss_data.get("baseSeverity", "UNKNOWN")
score = cvss_data.get("baseScore")
item = OsintItem(
technique_id=technique.id,
source_type="cve",
source_url=f"https://nvd.nist.gov/vuln/detail/{cve_id}",
title=cve_id,
description=desc[:500] if desc else None,
severity=severity,
metadata_={"cvss_score": score, "cve_id": cve_id},
)
db.add(item)
count += 1
if count > 0:
technique.review_required = True
db.commit()
logger.info("Added %d CVEs for %s", count, technique.mitre_id)
return count
except requests.RequestException as e:
logger.error(
"HTTP error during OSINT enrichment for %s: %s",
technique.mitre_id,
e,
)
return 0
except Exception as e:
logger.error(
"OSINT enrichment failed for %s: %s",
technique.mitre_id,
e,
exc_info=True,
)
return 0
def enrich_all_techniques(db: Session) -> int:
"""Enrich all techniques with CVE data from NVD.
Rate-limited: processes *NVD_RATE_LIMIT_BATCH* techniques, then
sleeps for *NVD_RATE_LIMIT_WAIT* seconds to stay under NVD limits.
Returns total number of new OSINT items added.
"""
techniques = db.query(Technique).order_by(Technique.mitre_id).all()
total = 0
logger.info(
"Starting OSINT enrichment for %d techniques...",
len(techniques),
)
for i, tech in enumerate(techniques):
total += enrich_technique_with_cves(db, tech)
# Rate limiting: wait after every batch
if (i + 1) % NVD_RATE_LIMIT_BATCH == 0 and (i + 1) < len(techniques):
logger.debug(
"Rate limit pause after %d techniques (%ds)...",
i + 1,
NVD_RATE_LIMIT_WAIT,
)
time.sleep(NVD_RATE_LIMIT_WAIT)
logger.info(
"OSINT enrichment complete — %d new items across %d techniques",
total,
len(techniques),
)
return total
def get_osint_items_for_technique(
db: Session,
technique_id: str,
source_type: str | None = None,
reviewed: bool | None = None,
) -> list[OsintItem]:
"""Retrieve OSINT items for a technique with optional filters."""
query = db.query(OsintItem).filter(OsintItem.technique_id == technique_id)
if source_type:
query = query.filter(OsintItem.source_type == source_type)
if reviewed is not None:
query = query.filter(OsintItem.reviewed == reviewed)
return query.order_by(OsintItem.discovered_at.desc()).all()
def mark_osint_reviewed(db: Session, item_id: str) -> OsintItem | None:
"""Mark an OSINT item as reviewed."""
item = db.query(OsintItem).filter(OsintItem.id == item_id).first()
if item:
item.reviewed = True
db.commit()
db.refresh(item)
return item
def get_unreviewed_count(db: Session) -> int:
"""Return the total number of unreviewed OSINT items."""
return db.query(OsintItem).filter(OsintItem.reviewed == False).count() # noqa: E712

View File

@@ -0,0 +1,78 @@
"""Stale coverage detection — marks techniques whose last validated test
is older than a configurable threshold.
This is the simple version. The full Decay Engine (Fase 8) will replace
this with a multi-factor, configurable decay model with confidence scores.
"""
import logging
from datetime import datetime, timedelta
from sqlalchemy import func
from sqlalchemy.orm import Session
from app.config import settings
from app.models.technique import Technique
from app.models.test import Test
logger = logging.getLogger(__name__)
STALE_THRESHOLD_DAYS = getattr(settings, "STALE_THRESHOLD_DAYS", 365)
def detect_stale_coverage(db: Session) -> int:
"""Scan all techniques and flag those with stale coverage.
A technique is considered stale when:
- It has a status other than ``not_evaluated``, AND
- Its most recent *validated* test is older than *STALE_THRESHOLD_DAYS*, OR
- It has never had a validated test (but has been manually marked as
covered/partial).
Returns the number of newly-flagged techniques.
"""
cutoff = datetime.utcnow() - timedelta(days=STALE_THRESHOLD_DAYS)
# Subquery: latest validated test date per technique
latest_test = (
db.query(
Test.technique_id,
func.max(Test.created_at).label("last_tested"),
)
.filter(Test.state == "validated")
.group_by(Test.technique_id)
.subquery()
)
# Find techniques that are stale
stale_techniques = (
db.query(Technique)
.outerjoin(latest_test, Technique.id == latest_test.c.technique_id)
.filter(
# Either tested before cutoff, or never tested at all
(latest_test.c.last_tested < cutoff)
| (latest_test.c.last_tested.is_(None))
)
.filter(
# Only flag techniques that have a real status (not never-evaluated ones)
Technique.status_global != "not_evaluated"
)
.all()
)
count = 0
for tech in stale_techniques:
if not tech.review_required:
tech.review_required = True
count += 1
logger.info("Marked %s as stale coverage", tech.mitre_id)
if count > 0:
db.commit()
logger.info(
"Stale coverage detection complete — %d techniques flagged", count
)
else:
logger.info("Stale coverage detection complete — no new stale techniques")
return count