"""Automated threat-intelligence scan service. Searches public security RSS feeds for mentions of MITRE ATT&CK technique IDs and names. New findings are stored as :class:`IntelItem` records and the related technique is flagged for review. This is an **MVP** implementation — it queries a small set of well-known RSS feeds and parses them with the standard-library :mod:`xml.etree` parser. No LLMs or paid APIs are used. """ # Import logging import logging # Import re import re # Import datetime from datetime from datetime import datetime # Import defusedxml.ElementTree import defusedxml.ElementTree as ET # noqa: N817 — ET is the universal stdlib alias for ElementTree # Import requests import requests as _requests # Import Session from sqlalchemy.orm from sqlalchemy.orm import Session # Import IntelItem from app.models.intel from app.models.intel import IntelItem # Import Technique from app.models.technique from app.models.technique import Technique # Import log_action from app.services.audit_service from app.services.audit_service import log_action # Assign logger = logging.getLogger(__name__) logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Public security RSS feeds # --------------------------------------------------------------------------- RSS_FEEDS: list[dict[str, str]] = [ { # Literal argument value "name": "CISA Alerts", # Literal argument value "url": "https://www.cisa.gov/cybersecurity-advisories/all.xml", }, { "name": "SecurityWeek", "url": "https://feeds.feedburner.com/Securityweek", }, { # Literal argument value "name": "SANS ISC", # Literal argument value "url": "https://isc.sans.edu/rssfeed.xml", }, { # Literal argument value "name": "BleepingComputer", # Literal argument value "url": "https://www.bleepingcomputer.com/feed/", }, { # Literal argument value "name": "The Hacker News", # Literal argument value "url": "https://feeds.feedburner.com/TheHackersNews", }, { # Literal argument value "name": "Krebs on Security", # Literal argument value "url": "https://krebsonsecurity.com/feed/", }, ] # Timeout for each feed request (seconds) _FEED_TIMEOUT = 15 # Minimum technique name length for name-based matching # Short names ("Kill", "BITS") produce too many false positives _MIN_NAME_LEN = 8 # --------------------------------------------------------------------------- # Internal helpers # --------------------------------------------------------------------------- def _fetch_feed(url: str) -> list[dict[str, str]]: """Download and parse an RSS/Atom feed, returning a list of entries. Each entry is a dict with keys ``title``, ``link``, and ``description``. Returns an empty list on any error so the scan can continue. """ # Attempt the following; catch errors below try: # Assign resp = _requests.get(url, timeout=_FEED_TIMEOUT, headers={ resp = _requests.get(url, timeout=_FEED_TIMEOUT, headers={ # Literal argument value "User-Agent": "AegisPlatform/1.0 IntelScan", }) # Call resp.raise_for_status() resp.raise_for_status() # Handle Exception except Exception as exc: # Log warning: "Failed to fetch feed %s: %s", url, exc logger.warning("Failed to fetch feed %s: %s", url, exc) # Return [] return [] # Attempt the following; catch errors below try: # Assign root = ET.fromstring(resp.content) root = ET.fromstring(resp.content) # Handle ET.ParseError except ET.ParseError as exc: # Log warning: "Failed to parse feed %s: %s", url, exc logger.warning("Failed to parse feed %s: %s", url, exc) # Return [] return [] # Assign entries = [] entries: list[dict[str, str]] = [] # RSS 2.0 format: ... for item in root.iter("item"): # Assign title_el = item.find("title") title_el = item.find("title") # Assign link_el = item.find("link") link_el = item.find("link") # Assign desc_el = item.find("description") desc_el = item.find("description") # Call entries.append() entries.append({ # Literal argument value "title": title_el.text.strip() if title_el is not None and title_el.text else "", # Literal argument value "link": link_el.text.strip() if link_el is not None and link_el.text else "", # Literal argument value "description": desc_el.text.strip() if desc_el is not None and desc_el.text else "", }) # Atom format: ... ns = {"atom": "http://www.w3.org/2005/Atom"} # Iterate over root.iter("{http for entry in root.iter("{http://www.w3.org/2005/Atom}entry"): # Assign title_el = entry.find("atom:title", ns) title_el = entry.find("atom:title", ns) # Assign link_el = entry.find("atom:link", ns) link_el = entry.find("atom:link", ns) # Assign summary_el = entry.find("atom:summary", ns) summary_el = entry.find("atom:summary", ns) # Assign link_href = "" link_href = "" # Check: link_el is not None if link_el is not None: # Assign link_href = link_el.get("href", "") link_href = link_el.get("href", "") # Call entries.append() entries.append({ # Literal argument value "title": title_el.text.strip() if title_el is not None and title_el.text else "", # Literal argument value "link": link_href.strip(), # Literal argument value "description": summary_el.text.strip() if summary_el is not None and summary_el.text else "", }) # Return entries return entries def _build_patterns(technique: Technique) -> tuple[list[re.Pattern], list[re.Pattern]]: """Build regex patterns for a technique. Returns two lists: - ``id_patterns``: MITRE ID patterns (high confidence, word-boundary matched) - ``name_patterns``: technique name patterns (lower confidence, long names only) """ id_patterns: list[re.Pattern] = [] name_patterns: list[re.Pattern] = [] # MITRE ID with word boundaries so T1059 doesn't partially match T1059.001 mitre_id_escaped = re.escape(technique.mitre_id) id_patterns.append(re.compile(rf"\b{mitre_id_escaped}\b", re.IGNORECASE)) # Technique name — only for distinctly long names to reduce false positives if technique.name and len(technique.name) >= _MIN_NAME_LEN: name_escaped = re.escape(technique.name) name_patterns.append(re.compile(rf"\b{name_escaped}\b", re.IGNORECASE)) return id_patterns, name_patterns def _entry_matches( entry: dict[str, str], id_patterns: list[re.Pattern], name_patterns: list[re.Pattern], ) -> bool: """Return True if any pattern matches the entry's title or description.""" # Assign text = f"{entry.get('title', '')} {entry.get('description', '')}" text = f"{entry.get('title', '')} {entry.get('description', '')}" return any(p.search(text) for p in id_patterns + name_patterns) # --------------------------------------------------------------------------- # Public API # --------------------------------------------------------------------------- def scan_intel(db: Session) -> dict: """Run the intel scan across RSS feeds for known techniques. Parameters ---------- db : Session Active SQLAlchemy database session. Returns: ------- dict Summary with keys ``new_items``, ``duplicates_skipped``, ``techniques_flagged``, ``feeds_checked``. """ # Log info: "Intel scan starting..." logger.info("Intel scan starting...") # 1. Load all active techniques techniques = ( db.query(Technique) # Chain .order_by() call .order_by(Technique.mitre_id) .all() ) # Log info: "Scanning %d techniques against %d feeds", len(tec logger.info("Scanning %d techniques against %d feeds", len(techniques), len(RSS_FEEDS)) # 2. Pre-load all existing intel URLs for dedup existing_urls: set[str] = { row[0] for row in db.query(IntelItem.url).all() } # 3. Fetch all feeds once all_entries: list[tuple[str, dict[str, str]]] = [] # (feed_name, entry) # Assign feeds_ok = 0 feeds_ok = 0 # Iterate over RSS_FEEDS for feed in RSS_FEEDS: # Assign entries = _fetch_feed(feed["url"]) entries = _fetch_feed(feed["url"]) # Check: entries if entries: # Assign feeds_ok = 1 feeds_ok += 1 # Iterate over entries for entry in entries: # Call all_entries.append() all_entries.append((feed["name"], entry)) # Log info: "Fetched %d entries from %d/%d feeds", len(all_ent logger.info("Fetched %d entries from %d/%d feeds", len(all_entries), feeds_ok, len(RSS_FEEDS)) # 4. Match entries to techniques new_items = 0 # Assign duplicates_skipped = 0 duplicates_skipped = 0 # Assign techniques_flagged = set() techniques_flagged: set[str] = set() # Iterate over techniques for technique in techniques: id_patterns, name_patterns = _build_patterns(technique) # Iterate over all_entries for feed_name, entry in all_entries: if not _entry_matches(entry, id_patterns, name_patterns): continue # Skip entries with no title (low-quality) if not entry.get("title", "").strip(): continue # Assign url = entry.get("link", "").strip() url = entry.get("link", "").strip() # Check: not url if not url: # Skip to the next loop iteration continue # Dedup if url in existing_urls: # Assign duplicates_skipped = 1 duplicates_skipped += 1 # Skip to the next loop iteration continue # Create IntelItem intel_item = IntelItem( # Keyword argument: technique_id technique_id=technique.id, # Keyword argument: url url=url, # Keyword argument: title title=entry.get("title", "")[:500], # Keyword argument: source source=feed_name, # Keyword argument: detected_at detected_at=datetime.utcnow(), # Keyword argument: reviewed reviewed=False, ) # Stage new record(s) for database insertion db.add(intel_item) # Call existing_urls.add() existing_urls.add(url) # Assign new_items = 1 new_items += 1 # Flag technique for review if not technique.review_required: # Assign technique.review_required = True technique.review_required = True # Call techniques_flagged.add() techniques_flagged.add(technique.mitre_id) # 5. Single commit db.commit() # Assign summary = { summary = { # Literal argument value "new_items": new_items, # Literal argument value "duplicates_skipped": duplicates_skipped, # Literal argument value "techniques_flagged": len(techniques_flagged), # Literal argument value "feeds_checked": feeds_ok, } # Log info: logger.info( # Literal argument value "Intel scan complete — new=%d, duplicates_skipped=%d, " # Literal argument value "techniques_flagged=%d, feeds_checked=%d", new_items, duplicates_skipped, len(techniques_flagged), feeds_ok, ) # 6. Audit log log_action( db, # Keyword argument: user_id user_id=None, # Keyword argument: action action="intel_scan", # Keyword argument: entity_type entity_type="intel_item", # Keyword argument: entity_id entity_id=None, # Keyword argument: details details=summary, ) # Commit all pending changes to the database db.commit() # Return summary return summary