From 96bf6e50979e4cc152e9715b965f27eeb2decbc1 Mon Sep 17 00:00:00 2001 From: debian Date: Thu, 5 Mar 2026 03:08:48 -0500 Subject: [PATCH] fase(4): crawling infrastructure with migrated code --- .ralph/.loop_start_sha | 2 +- .ralph/progress.json | 2 +- .../crawling/domain/entities/CrawlSession.js | 4 + .../adapters/CrawlingStateGraph.js | 72 +++++ .../adapters/ExplorationOrchestrator.js | 180 ++++++++++++ .../adapters/PlaywrightCrawlerEngine.js | 13 + .../infrastructure/http/CrawlingController.js | 56 ++++ .../KyselyCrawlSessionRepository.js | 79 +++++ .../repositories/KyselyStateRepository.js | 72 +++++ .../crawling/domain/entities/CrawlSession.ts | 7 +- .../adapters/CrawlingStateGraph.ts | 88 ++++++ .../adapters/ExplorationOrchestrator.ts | 273 ++++++++++++++++++ .../adapters/PlaywrightCrawlerEngine.ts | 11 + .../infrastructure/http/CrawlingController.ts | 73 +++++ .../KyselyCrawlSessionRepository.ts | 90 ++++++ .../repositories/KyselyStateRepository.ts | 86 ++++++ 16 files changed, 1105 insertions(+), 3 deletions(-) create mode 100644 dist/modules/crawling/infrastructure/adapters/CrawlingStateGraph.js create mode 100644 dist/modules/crawling/infrastructure/adapters/ExplorationOrchestrator.js create mode 100644 dist/modules/crawling/infrastructure/adapters/PlaywrightCrawlerEngine.js create mode 100644 dist/modules/crawling/infrastructure/http/CrawlingController.js create mode 100644 dist/modules/crawling/infrastructure/repositories/KyselyCrawlSessionRepository.js create mode 100644 dist/modules/crawling/infrastructure/repositories/KyselyStateRepository.js create mode 100644 src/modules/crawling/infrastructure/adapters/CrawlingStateGraph.ts create mode 100644 src/modules/crawling/infrastructure/adapters/ExplorationOrchestrator.ts create mode 100644 src/modules/crawling/infrastructure/adapters/PlaywrightCrawlerEngine.ts create mode 100644 src/modules/crawling/infrastructure/http/CrawlingController.ts create mode 100644 src/modules/crawling/infrastructure/repositories/KyselyCrawlSessionRepository.ts create mode 100644 src/modules/crawling/infrastructure/repositories/KyselyStateRepository.ts diff --git a/.ralph/.loop_start_sha b/.ralph/.loop_start_sha index 56faa43..49ac47c 100644 --- a/.ralph/.loop_start_sha +++ b/.ralph/.loop_start_sha @@ -1 +1 @@ -f8191133c83b74200c5f851d012051527da899d6 +39c5313ba581cfeb5d793df7c3cef21923473127 diff --git a/.ralph/progress.json b/.ralph/progress.json index 8f361f8..e504749 100644 --- a/.ralph/progress.json +++ b/.ralph/progress.json @@ -1 +1 @@ -{"status": "completed", "timestamp": "2026-03-04 04:32:44"} +{"status": "failed", "timestamp": "2026-03-04 16:32:12"} diff --git a/dist/modules/crawling/domain/entities/CrawlSession.js b/dist/modules/crawling/domain/entities/CrawlSession.js index dcd03bc..34d33b4 100644 --- a/dist/modules/crawling/domain/entities/CrawlSession.js +++ b/dist/modules/crawling/domain/entities/CrawlSession.js @@ -11,6 +11,10 @@ class CrawlSession extends AggregateRoot_1.AggregateRoot { constructor(props, id) { super(props, id); } + /** Reconstruct from persistence without emitting domain events */ + static reconstitute(props, id) { + return new CrawlSession(props, id); + } static create(request) { const urlResult = Url_1.Url.create(request.url); if (!urlResult.ok) { diff --git a/dist/modules/crawling/infrastructure/adapters/CrawlingStateGraph.js b/dist/modules/crawling/infrastructure/adapters/CrawlingStateGraph.js new file mode 100644 index 0000000..0562654 --- /dev/null +++ b/dist/modules/crawling/infrastructure/adapters/CrawlingStateGraph.js @@ -0,0 +1,72 @@ +"use strict"; +Object.defineProperty(exports, "__esModule", { value: true }); +exports.CrawlingStateGraph = void 0; +class CrawlingStateGraph { + constructor() { + this.states = new Map(); + this.transitions = []; + /** Insertion order for BFS */ + this.insertionOrder = []; + } + addState(state) { + if (!this.states.has(state.id)) { + this.states.set(state.id, state); + this.insertionOrder.push(state.id); + } + else { + const existing = this.states.get(state.id); + this.states.set(state.id, { ...existing, visitCount: existing.visitCount + 1 }); + } + } + hasState(stateId) { + return this.states.has(stateId); + } + getState(stateId) { + return this.states.get(stateId); + } + incrementVisit(stateId) { + const state = this.states.get(stateId); + if (state) { + this.states.set(stateId, { ...state, visitCount: state.visitCount + 1 }); + } + } + recordTransition(fromId, action, toId) { + this.transitions.push({ fromId, action, toId, timestamp: Date.now() }); + } + getUnvisited() { + return this.insertionOrder + .map((id) => this.states.get(id)) + .filter((s) => s.visitCount === 0); + } + /** BFS heuristic: returns the oldest unvisited state, or null if none */ + getNextToExplore() { + const unvisited = this.getUnvisited(); + return unvisited.length > 0 ? unvisited[0] : null; + } + getAllStates() { + return this.insertionOrder.map((id) => this.states.get(id)); + } + getTransitions() { + return [...this.transitions]; + } + toJSON() { + return { + stateCount: this.states.size, + transitionCount: this.transitions.length, + states: this.getAllStates().map((s) => ({ + id: s.id, + url: s.url, + title: s.title, + visitCount: s.visitCount, + })), + transitions: this.transitions.map((t) => ({ + fromId: t.fromId, + toId: t.toId, + actionId: t.action.id, + actionType: t.action.type, + timestamp: t.timestamp, + })), + }; + } +} +exports.CrawlingStateGraph = CrawlingStateGraph; diff --git a/dist/modules/crawling/infrastructure/adapters/ExplorationOrchestrator.js b/dist/modules/crawling/infrastructure/adapters/ExplorationOrchestrator.js new file mode 100644 index 0000000..be754c1 --- /dev/null +++ b/dist/modules/crawling/infrastructure/adapters/ExplorationOrchestrator.js @@ -0,0 +1,180 @@ +"use strict"; +Object.defineProperty(exports, "__esModule", { value: true }); +exports.ExplorationOrchestrator = void 0; +const AnomalyDetector_1 = require("../../../../core/AnomalyDetector"); +const Logger_1 = require("../../../../core/Logger"); +class ExplorationOrchestrator { + constructor(config) { + this.actionTrace = []; + this.aborted = false; + this.graph = config.graph; + this.engine = config.engine; + this.detector = config.detector ?? new AnomalyDetector_1.AnomalyDetector(); + this.collectors = config.collectors ?? []; + this.exporters = config.exporters ?? []; + this.reproducer = config.reproducer; + this.logger = config.logger ?? new Logger_1.NullLogger(); + this.seed = config.seed; + this.url = config.url; + this.maxSteps = config.maxSteps ?? 100; + this.outputDir = config.outputDir ?? './reports'; + this.events = config.events ?? {}; + this.sessionId = config.sessionId ?? `${Date.now()}_${config.seed}`; + this.explorationConfig = config.explorationConfig ?? {}; + this.fuzzingPlugin = config.fuzzingPlugin; + this.stateHooks = config.stateHooks ?? []; + } + stop() { + this.aborted = true; + } + async run() { + const anomalies = []; + let stepsExecuted = 0; + let depth = 0; + const sessionTimeoutMs = this.explorationConfig.sessionTimeoutMs ?? 0; + const maxDepth = this.explorationConfig.maxDepth ?? Infinity; + const sessionStart = Date.now(); + this.logger.log({ + event: 'session_start', + timestamp: sessionStart, + seed: this.seed, + target: this.url, + }); + this.events.onSessionStarted?.(this.sessionId, this.url); + const isTimedOut = () => sessionTimeoutMs > 0 && Date.now() - sessionStart >= sessionTimeoutMs; + try { + await this.engine.launch(this.url); + const initialState = await this.engine.captureState(); + this.graph.addState(initialState); + this.logger.log({ + event: 'state_discovered', + timestamp: Date.now(), + stateId: initialState.id, + url: initialState.url, + title: initialState.title, + }); + this.events.onStateDiscovered?.(this.sessionId, initialState.id, initialState.url, initialState.title); + while (stepsExecuted < this.maxSteps && !this.aborted && !isTimedOut() && depth <= maxDepth) { + const currentState = this.graph.getNextToExplore(); + if (!currentState) + break; + this.graph.incrementVisit(currentState.id); + const actions = await this.engine.discoverActions(currentState); + if (actions.length === 0) + continue; + const actionIndex = (this.seed + stepsExecuted) % actions.length; + const action = actions[actionIndex]; + this.logger.log({ + event: 'action_executed', + timestamp: Date.now(), + actionId: action.id, + type: action.type, + selector: action.selector, + value: action.value, + url: action.url, + }); + this.events.onActionExecuted?.(this.sessionId, action.type, action.selector, Date.now()); + const observation = await this.engine.executeAction(action); + this.actionTrace.push(action); + if (!this.graph.hasState(observation.newStateId)) { + const newState = await this.engine.captureState(); + this.graph.addState(newState); + depth += 1; + this.logger.log({ + event: 'state_discovered', + timestamp: Date.now(), + stateId: newState.id, + url: newState.url, + title: newState.title, + }); + this.events.onStateDiscovered?.(this.sessionId, newState.id, newState.url, newState.title); + for (const hook of this.stateHooks) { + const hookAnomalies = await hook(newState, this.engine, this.sessionId, [...this.actionTrace]).catch(() => []); + for (const anomaly of hookAnomalies) { + anomalies.push(anomaly); + this.logger.log({ + event: 'anomaly_detected', + timestamp: Date.now(), + anomalyId: anomaly.id, + type: anomaly.type, + severity: anomaly.severity, + }); + this.events.onAnomalyDetected?.(this.sessionId, anomaly); + for (const exporter of this.exporters) { + await exporter.export(anomaly, `${this.outputDir}/${anomaly.id}`); + } + } + } + } + this.graph.recordTransition(currentState.id, action, observation.newStateId); + this.logger.log({ + event: 'exploration_step', + timestamp: Date.now(), + stateId: currentState.id, + actionId: action.id, + }); + const detected = this.detector.detect(observation, [...this.actionTrace]); + for (const anomaly of detected) { + for (const collector of this.collectors) { + const evidence = await collector.collect(anomaly, this.engine); + Object.assign(anomaly.evidence, evidence); + } + anomalies.push(anomaly); + this.logger.log({ + event: 'anomaly_detected', + timestamp: Date.now(), + anomalyId: anomaly.id, + type: anomaly.type, + severity: anomaly.severity, + }); + this.events.onAnomalyDetected?.(this.sessionId, anomaly); + for (const exporter of this.exporters) { + const reportDir = `${this.outputDir}/${anomaly.id}`; + await exporter.export(anomaly, reportDir); + } + } + stepsExecuted += 1; + if (this.fuzzingPlugin && + this.explorationConfig.fuzzingEnabled !== false && + currentState.domSnapshot) { + const fuzzActions = this.fuzzingPlugin.generateFuzzActions(currentState.domSnapshot, currentState); + for (const fuzzAction of fuzzActions) { + if (this.aborted || isTimedOut()) + break; + const fuzzObs = await this.engine.executeAction(fuzzAction); + this.actionTrace.push(fuzzAction); + const fuzzAnomalies = this.detector.detect(fuzzObs, [...this.actionTrace]); + for (const anomaly of fuzzAnomalies) { + for (const collector of this.collectors) { + const evidence = await collector.collect(anomaly, this.engine); + Object.assign(anomaly.evidence, evidence); + } + anomalies.push(anomaly); + this.events.onAnomalyDetected?.(this.sessionId, anomaly); + for (const exporter of this.exporters) { + await exporter.export(anomaly, `${this.outputDir}/${anomaly.id}`); + } + } + } + } + } + } + catch (err) { + const msg = err instanceof Error ? err.message : String(err); + this.events.onSessionError?.(this.sessionId, msg); + await this.engine.close().catch(() => undefined); + throw err; + } + await this.engine.close(); + const statesVisited = this.graph.getAllStates().filter((s) => s.visitCount > 0).length; + this.logger.log({ + event: 'session_end', + timestamp: Date.now(), + statesVisited, + anomaliesFound: anomalies.length, + }); + this.events.onSessionCompleted?.(this.sessionId, statesVisited, anomalies.length); + return { statesVisited, anomaliesFound: anomalies.length, anomalies }; + } +} +exports.ExplorationOrchestrator = ExplorationOrchestrator; diff --git a/dist/modules/crawling/infrastructure/adapters/PlaywrightCrawlerEngine.js b/dist/modules/crawling/infrastructure/adapters/PlaywrightCrawlerEngine.js new file mode 100644 index 0000000..5846ac6 --- /dev/null +++ b/dist/modules/crawling/infrastructure/adapters/PlaywrightCrawlerEngine.js @@ -0,0 +1,13 @@ +"use strict"; +Object.defineProperty(exports, "__esModule", { value: true }); +exports.PlaywrightCrawlerEngine = void 0; +/** + * PlaywrightCrawlerEngine — adapts PlaywrightAgent to implement the ICrawlerEngine port. + */ +const PlaywrightAgent_1 = require("../../../../plugins/agents/PlaywrightAgent"); +class PlaywrightCrawlerEngine extends PlaywrightAgent_1.PlaywrightAgent { + constructor(config = {}) { + super(config); + } +} +exports.PlaywrightCrawlerEngine = PlaywrightCrawlerEngine; diff --git a/dist/modules/crawling/infrastructure/http/CrawlingController.js b/dist/modules/crawling/infrastructure/http/CrawlingController.js new file mode 100644 index 0000000..73c4129 --- /dev/null +++ b/dist/modules/crawling/infrastructure/http/CrawlingController.js @@ -0,0 +1,56 @@ +"use strict"; +Object.defineProperty(exports, "__esModule", { value: true }); +exports.createCrawlingRouter = createCrawlingRouter; +/** + * CrawlingController — thin Express controller for crawling routes. + * Delegates to use cases; returns Result-based responses. + */ +const express_1 = require("express"); +function createCrawlingRouter(deps) { + const router = (0, express_1.Router)(); + // POST /api/sessions — start a new crawl session + router.post('/', async (req, res) => { + const body = req.body; + const { url, seed = 42, maxStates = 50, config } = body; + if (!url || typeof url !== 'string') { + res.status(400).json({ error: 'url is required' }); + return; + } + const result = await deps.startCrawl.execute({ url, seed, maxStates, config }); + if (!result.ok) { + res.status(422).json({ error: result.error }); + return; + } + res.status(201).json(result.value); + }); + // GET /api/sessions — list all sessions + router.get('/', async (_req, res) => { + const result = await deps.listSessions.execute({}); + if (!result.ok) { + res.status(500).json({ error: result.error }); + return; + } + res.json(result.value); + }); + // GET /api/sessions/:id — session detail + router.get('/:id', async (req, res) => { + const sessionId = req.params['id']; + const result = await deps.getSession.execute({ sessionId }); + if (!result.ok) { + res.status(404).json({ error: result.error }); + return; + } + res.json(result.value); + }); + // DELETE /api/sessions/:id — stop a session + router.delete('/:id', async (req, res) => { + const sessionId = req.params['id']; + const result = await deps.stopCrawl.execute({ sessionId }); + if (!result.ok) { + res.status(404).json({ error: result.error }); + return; + } + res.json({ stopped: true }); + }); + return router; +} diff --git a/dist/modules/crawling/infrastructure/repositories/KyselyCrawlSessionRepository.js b/dist/modules/crawling/infrastructure/repositories/KyselyCrawlSessionRepository.js new file mode 100644 index 0000000..861a658 --- /dev/null +++ b/dist/modules/crawling/infrastructure/repositories/KyselyCrawlSessionRepository.js @@ -0,0 +1,79 @@ +"use strict"; +Object.defineProperty(exports, "__esModule", { value: true }); +exports.KyselyCrawlSessionRepository = void 0; +const CrawlSession_1 = require("../../domain/entities/CrawlSession"); +const UniqueId_1 = require("../../../../shared/domain/UniqueId"); +class KyselyCrawlSessionRepository { + constructor(db) { + this.db = db; + } + async save(session) { + const row = { + id: session.id.toString(), + url: session.url, + status: session.status, + seed: session.seed, + max_states: session.maxStates, + states_visited: session.statesVisited, + anomalies_found: 0, + started_at: Date.now(), + finished_at: null, + config_json: JSON.stringify(session.config), + }; + await this.db + .insertInto('sessions') + .values(row) + .execute(); + } + async findById(id) { + const row = await this.db + .selectFrom('sessions') + .selectAll() + .where('id', '=', id.toString()) + .executeTakeFirst(); + if (!row) + return null; + return this.toDomain(row); + } + async findAll() { + const rows = await this.db + .selectFrom('sessions') + .selectAll() + .orderBy('started_at', 'desc') + .execute(); + return rows.map((row) => this.toDomain(row)); + } + async update(session) { + const isTerminal = session.status === 'completed' || session.status === 'failed' || session.status === 'stopped'; + await this.db + .updateTable('sessions') + .set({ + status: session.status, + states_visited: session.statesVisited, + finished_at: isTerminal ? Date.now() : null, + config_json: JSON.stringify(session.config), + }) + .where('id', '=', session.id.toString()) + .execute(); + } + toDomain(row) { + const props = { + url: row.url, + status: row.status, + seed: row.seed, + maxStates: row.max_states, + statesVisited: row.states_visited, + config: this.parseJson(row.config_json), + }; + return CrawlSession_1.CrawlSession.reconstitute(props, UniqueId_1.UniqueId.from(row.id)); + } + parseJson(json) { + try { + return JSON.parse(json); + } + catch { + return {}; + } + } +} +exports.KyselyCrawlSessionRepository = KyselyCrawlSessionRepository; diff --git a/dist/modules/crawling/infrastructure/repositories/KyselyStateRepository.js b/dist/modules/crawling/infrastructure/repositories/KyselyStateRepository.js new file mode 100644 index 0000000..b143c5b --- /dev/null +++ b/dist/modules/crawling/infrastructure/repositories/KyselyStateRepository.js @@ -0,0 +1,72 @@ +"use strict"; +Object.defineProperty(exports, "__esModule", { value: true }); +exports.KyselyStateRepository = void 0; +const CrawlState_1 = require("../../domain/entities/CrawlState"); +const UniqueId_1 = require("../../../../shared/domain/UniqueId"); +class KyselyStateRepository { + constructor(db) { + this.db = db; + } + async save(state) { + const row = { + id: state.id.toString(), + session_id: state.sessionId, + url: state.url, + title: state.title, + dom_snapshot_path: null, + visit_count: state.visitCount, + discovered_at: Date.now(), + }; + await this.db + .insertInto('states') + .values(row) + .execute(); + } + async findById(id) { + const row = await this.db + .selectFrom('states') + .selectAll() + .where('id', '=', id.toString()) + .executeTakeFirst(); + if (!row) + return null; + return this.toDomain(row); + } + async findAll() { + const rows = await this.db + .selectFrom('states') + .selectAll() + .execute(); + return rows.map((row) => this.toDomain(row)); + } + async findBySessionId(sessionId) { + const rows = await this.db + .selectFrom('states') + .selectAll() + .where('session_id', '=', sessionId) + .execute(); + return rows.map((row) => this.toDomain(row)); + } + async update(state) { + await this.db + .updateTable('states') + .set({ + visit_count: state.visitCount, + url: state.url, + title: state.title, + }) + .where('id', '=', state.id.toString()) + .execute(); + } + toDomain(row) { + return CrawlState_1.CrawlState.create({ + url: row.url, + title: row.title, + domSnapshot: '', + visitCount: row.visit_count, + stateId: row.id, + sessionId: row.session_id, + }, UniqueId_1.UniqueId.from(row.id)); + } +} +exports.KyselyStateRepository = KyselyStateRepository; diff --git a/src/modules/crawling/domain/entities/CrawlSession.ts b/src/modules/crawling/domain/entities/CrawlSession.ts index 296824a..91483ad 100644 --- a/src/modules/crawling/domain/entities/CrawlSession.ts +++ b/src/modules/crawling/domain/entities/CrawlSession.ts @@ -8,7 +8,7 @@ import { CrawlFailed } from '../events/CrawlFailed'; type SessionStatusValue = 'running' | 'completed' | 'failed' | 'stopped'; -interface CrawlSessionProps { +export interface CrawlSessionProps { url: string; status: SessionStatusValue; seed: number; @@ -29,6 +29,11 @@ export class CrawlSession extends AggregateRoot { super(props, id); } + /** Reconstruct from persistence without emitting domain events */ + static reconstitute(props: CrawlSessionProps, id: UniqueId): CrawlSession { + return new CrawlSession(props, id); + } + static create(request: CreateCrawlSessionRequest): Result { const urlResult = Url.create(request.url); if (!urlResult.ok) { diff --git a/src/modules/crawling/infrastructure/adapters/CrawlingStateGraph.ts b/src/modules/crawling/infrastructure/adapters/CrawlingStateGraph.ts new file mode 100644 index 0000000..bedf5da --- /dev/null +++ b/src/modules/crawling/infrastructure/adapters/CrawlingStateGraph.ts @@ -0,0 +1,88 @@ +/** + * CrawlingStateGraph — copy of StateGraph for the crawling module. + * Uses BFS ordering by default for exploration scheduling. + */ +import { IState, IAction } from '../../../../core/interfaces'; + +export interface ITransition { + fromId: string; + action: IAction; + toId: string; + timestamp: number; +} + +export class CrawlingStateGraph { + private states: Map = new Map(); + private transitions: ITransition[] = []; + /** Insertion order for BFS */ + private insertionOrder: string[] = []; + + addState(state: IState): void { + if (!this.states.has(state.id)) { + this.states.set(state.id, state); + this.insertionOrder.push(state.id); + } else { + const existing = this.states.get(state.id)!; + this.states.set(state.id, { ...existing, visitCount: existing.visitCount + 1 }); + } + } + + hasState(stateId: string): boolean { + return this.states.has(stateId); + } + + getState(stateId: string): IState | undefined { + return this.states.get(stateId); + } + + incrementVisit(stateId: string): void { + const state = this.states.get(stateId); + if (state) { + this.states.set(stateId, { ...state, visitCount: state.visitCount + 1 }); + } + } + + recordTransition(fromId: string, action: IAction, toId: string): void { + this.transitions.push({ fromId, action, toId, timestamp: Date.now() }); + } + + getUnvisited(): IState[] { + return this.insertionOrder + .map((id) => this.states.get(id)!) + .filter((s) => s.visitCount === 0); + } + + /** BFS heuristic: returns the oldest unvisited state, or null if none */ + getNextToExplore(): IState | null { + const unvisited = this.getUnvisited(); + return unvisited.length > 0 ? unvisited[0]! : null; + } + + getAllStates(): IState[] { + return this.insertionOrder.map((id) => this.states.get(id)!); + } + + getTransitions(): ITransition[] { + return [...this.transitions]; + } + + toJSON(): object { + return { + stateCount: this.states.size, + transitionCount: this.transitions.length, + states: this.getAllStates().map((s) => ({ + id: s.id, + url: s.url, + title: s.title, + visitCount: s.visitCount, + })), + transitions: this.transitions.map((t) => ({ + fromId: t.fromId, + toId: t.toId, + actionId: t.action.id, + actionType: t.action.type, + timestamp: t.timestamp, + })), + }; + } +} diff --git a/src/modules/crawling/infrastructure/adapters/ExplorationOrchestrator.ts b/src/modules/crawling/infrastructure/adapters/ExplorationOrchestrator.ts new file mode 100644 index 0000000..8758b02 --- /dev/null +++ b/src/modules/crawling/infrastructure/adapters/ExplorationOrchestrator.ts @@ -0,0 +1,273 @@ +/** + * ExplorationOrchestrator — adapts ExplorationEngine to use ICrawlerEngine port. + * All dependencies are injected via constructor; no direct plugin imports. + */ +import { IState, IAction, IAnomaly, ILogger, ICollector, IExporter, IReproducer, IFuzzingPlugin } from '../../../../core/interfaces'; +import { ICrawlerEngine } from '../../domain/ports/ICrawlerEngine'; +import { CrawlingStateGraph } from './CrawlingStateGraph'; +import { AnomalyDetector } from '../../../../core/AnomalyDetector'; +import { NullLogger } from '../../../../core/Logger'; +import { ExplorationConfig } from '../../../../core/ExplorationConfig'; + +export interface OrchestratorEventCallbacks { + onSessionStarted?: (sessionId: string, url: string) => void; + onStateDiscovered?: (sessionId: string, stateId: string, url: string, title: string) => void; + onActionExecuted?: (sessionId: string, actionType: string, selector: string | undefined, timestamp: number) => void; + onAnomalyDetected?: (sessionId: string, anomaly: IAnomaly) => void; + onSessionCompleted?: (sessionId: string, statesVisited: number, anomaliesFound: number) => void; + onSessionError?: (sessionId: string, error: string) => void; +} + +export type StateHook = ( + state: IState, + engine: ICrawlerEngine, + sessionId: string, + actionTrace: IAction[] +) => Promise; + +export interface OrchestratorConfig { + graph: CrawlingStateGraph; + engine: ICrawlerEngine; + detector?: AnomalyDetector; + collectors?: ICollector[]; + exporters?: IExporter[]; + reproducer?: IReproducer; + logger?: ILogger; + seed: number; + url: string; + maxSteps?: number; + outputDir?: string; + events?: OrchestratorEventCallbacks; + sessionId?: string; + explorationConfig?: Partial; + fuzzingPlugin?: IFuzzingPlugin; + stateHooks?: StateHook[]; +} + +export interface OrchestratorResult { + statesVisited: number; + anomaliesFound: number; + anomalies: IAnomaly[]; +} + +export class ExplorationOrchestrator { + private graph: CrawlingStateGraph; + private engine: ICrawlerEngine; + private detector: AnomalyDetector; + private collectors: ICollector[]; + private exporters: IExporter[]; + private reproducer?: IReproducer; + private logger: ILogger; + private seed: number; + private url: string; + private maxSteps: number; + private outputDir: string; + private events: OrchestratorEventCallbacks; + private sessionId: string; + private explorationConfig: Partial; + private fuzzingPlugin?: IFuzzingPlugin; + private stateHooks: StateHook[]; + private actionTrace: IAction[] = []; + private aborted = false; + + constructor(config: OrchestratorConfig) { + this.graph = config.graph; + this.engine = config.engine; + this.detector = config.detector ?? new AnomalyDetector(); + this.collectors = config.collectors ?? []; + this.exporters = config.exporters ?? []; + this.reproducer = config.reproducer; + this.logger = config.logger ?? new NullLogger(); + this.seed = config.seed; + this.url = config.url; + this.maxSteps = config.maxSteps ?? 100; + this.outputDir = config.outputDir ?? './reports'; + this.events = config.events ?? {}; + this.sessionId = config.sessionId ?? `${Date.now()}_${config.seed}`; + this.explorationConfig = config.explorationConfig ?? {}; + this.fuzzingPlugin = config.fuzzingPlugin; + this.stateHooks = config.stateHooks ?? []; + } + + stop(): void { + this.aborted = true; + } + + async run(): Promise { + const anomalies: IAnomaly[] = []; + let stepsExecuted = 0; + let depth = 0; + const sessionTimeoutMs = this.explorationConfig.sessionTimeoutMs ?? 0; + const maxDepth = this.explorationConfig.maxDepth ?? Infinity; + const sessionStart = Date.now(); + + this.logger.log({ + event: 'session_start', + timestamp: sessionStart, + seed: this.seed, + target: this.url, + }); + this.events.onSessionStarted?.(this.sessionId, this.url); + + const isTimedOut = (): boolean => + sessionTimeoutMs > 0 && Date.now() - sessionStart >= sessionTimeoutMs; + + try { + await this.engine.launch(this.url); + + const initialState = await this.engine.captureState(); + this.graph.addState(initialState); + + this.logger.log({ + event: 'state_discovered', + timestamp: Date.now(), + stateId: initialState.id, + url: initialState.url, + title: initialState.title, + }); + this.events.onStateDiscovered?.(this.sessionId, initialState.id, initialState.url, initialState.title); + + while (stepsExecuted < this.maxSteps && !this.aborted && !isTimedOut() && depth <= maxDepth) { + const currentState = this.graph.getNextToExplore(); + if (!currentState) break; + + this.graph.incrementVisit(currentState.id); + + const actions = await this.engine.discoverActions(currentState); + if (actions.length === 0) continue; + + const actionIndex = (this.seed + stepsExecuted) % actions.length; + const action = actions[actionIndex]!; + + this.logger.log({ + event: 'action_executed', + timestamp: Date.now(), + actionId: action.id, + type: action.type, + selector: action.selector, + value: action.value, + url: action.url, + }); + this.events.onActionExecuted?.(this.sessionId, action.type, action.selector, Date.now()); + + const observation = await this.engine.executeAction(action); + this.actionTrace.push(action); + + if (!this.graph.hasState(observation.newStateId)) { + const newState = await this.engine.captureState(); + this.graph.addState(newState); + depth += 1; + + this.logger.log({ + event: 'state_discovered', + timestamp: Date.now(), + stateId: newState.id, + url: newState.url, + title: newState.title, + }); + this.events.onStateDiscovered?.(this.sessionId, newState.id, newState.url, newState.title); + + for (const hook of this.stateHooks) { + const hookAnomalies = await hook(newState, this.engine, this.sessionId, [...this.actionTrace]).catch(() => []); + for (const anomaly of hookAnomalies) { + anomalies.push(anomaly); + this.logger.log({ + event: 'anomaly_detected', + timestamp: Date.now(), + anomalyId: anomaly.id, + type: anomaly.type, + severity: anomaly.severity, + }); + this.events.onAnomalyDetected?.(this.sessionId, anomaly); + for (const exporter of this.exporters) { + await exporter.export(anomaly, `${this.outputDir}/${anomaly.id}`); + } + } + } + } + + this.graph.recordTransition(currentState.id, action, observation.newStateId); + + this.logger.log({ + event: 'exploration_step', + timestamp: Date.now(), + stateId: currentState.id, + actionId: action.id, + }); + + const detected = this.detector.detect(observation, [...this.actionTrace]); + for (const anomaly of detected) { + for (const collector of this.collectors) { + const evidence = await collector.collect(anomaly, this.engine as unknown as import('../../../../core/interfaces').IInteractionAgent); + Object.assign(anomaly.evidence, evidence); + } + + anomalies.push(anomaly); + + this.logger.log({ + event: 'anomaly_detected', + timestamp: Date.now(), + anomalyId: anomaly.id, + type: anomaly.type, + severity: anomaly.severity, + }); + this.events.onAnomalyDetected?.(this.sessionId, anomaly); + + for (const exporter of this.exporters) { + const reportDir = `${this.outputDir}/${anomaly.id}`; + await exporter.export(anomaly, reportDir); + } + } + + stepsExecuted += 1; + + if ( + this.fuzzingPlugin && + this.explorationConfig.fuzzingEnabled !== false && + currentState.domSnapshot + ) { + const fuzzActions = this.fuzzingPlugin.generateFuzzActions( + currentState.domSnapshot, + currentState + ); + for (const fuzzAction of fuzzActions) { + if (this.aborted || isTimedOut()) break; + const fuzzObs = await this.engine.executeAction(fuzzAction); + this.actionTrace.push(fuzzAction); + const fuzzAnomalies = this.detector.detect(fuzzObs, [...this.actionTrace]); + for (const anomaly of fuzzAnomalies) { + for (const collector of this.collectors) { + const evidence = await collector.collect(anomaly, this.engine as unknown as import('../../../../core/interfaces').IInteractionAgent); + Object.assign(anomaly.evidence, evidence); + } + anomalies.push(anomaly); + this.events.onAnomalyDetected?.(this.sessionId, anomaly); + for (const exporter of this.exporters) { + await exporter.export(anomaly, `${this.outputDir}/${anomaly.id}`); + } + } + } + } + } + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + this.events.onSessionError?.(this.sessionId, msg); + await this.engine.close().catch(() => undefined); + throw err; + } + + await this.engine.close(); + + const statesVisited = this.graph.getAllStates().filter((s) => s.visitCount > 0).length; + + this.logger.log({ + event: 'session_end', + timestamp: Date.now(), + statesVisited, + anomaliesFound: anomalies.length, + }); + this.events.onSessionCompleted?.(this.sessionId, statesVisited, anomalies.length); + + return { statesVisited, anomaliesFound: anomalies.length, anomalies }; + } +} diff --git a/src/modules/crawling/infrastructure/adapters/PlaywrightCrawlerEngine.ts b/src/modules/crawling/infrastructure/adapters/PlaywrightCrawlerEngine.ts new file mode 100644 index 0000000..fb8e067 --- /dev/null +++ b/src/modules/crawling/infrastructure/adapters/PlaywrightCrawlerEngine.ts @@ -0,0 +1,11 @@ +/** + * PlaywrightCrawlerEngine — adapts PlaywrightAgent to implement the ICrawlerEngine port. + */ +import { PlaywrightAgent, PlaywrightAgentConfig } from '../../../../plugins/agents/PlaywrightAgent'; +import { ICrawlerEngine } from '../../domain/ports/ICrawlerEngine'; + +export class PlaywrightCrawlerEngine extends PlaywrightAgent implements ICrawlerEngine { + constructor(config: PlaywrightAgentConfig = {}) { + super(config); + } +} diff --git a/src/modules/crawling/infrastructure/http/CrawlingController.ts b/src/modules/crawling/infrastructure/http/CrawlingController.ts new file mode 100644 index 0000000..9fe9bd6 --- /dev/null +++ b/src/modules/crawling/infrastructure/http/CrawlingController.ts @@ -0,0 +1,73 @@ +/** + * CrawlingController — thin Express controller for crawling routes. + * Delegates to use cases; returns Result-based responses. + */ +import { Router, Request, Response } from 'express'; +import { StartCrawlCommand } from '../../application/commands/StartCrawlCommand'; +import { StopCrawlCommand } from '../../application/commands/StopCrawlCommand'; +import { GetSessionQuery } from '../../application/queries/GetSessionQuery'; +import { ListSessionsQuery } from '../../application/queries/ListSessionsQuery'; + +export interface CrawlingControllerDeps { + startCrawl: StartCrawlCommand; + stopCrawl: StopCrawlCommand; + getSession: GetSessionQuery; + listSessions: ListSessionsQuery; +} + +export function createCrawlingRouter(deps: CrawlingControllerDeps): Router { + const router = Router(); + + // POST /api/sessions — start a new crawl session + router.post('/', async (req: Request, res: Response) => { + const body = req.body as { url?: string; seed?: number; maxStates?: number; config?: Record }; + const { url, seed = 42, maxStates = 50, config } = body; + + if (!url || typeof url !== 'string') { + res.status(400).json({ error: 'url is required' }); + return; + } + + const result = await deps.startCrawl.execute({ url, seed, maxStates, config }); + if (!result.ok) { + res.status(422).json({ error: result.error }); + return; + } + + res.status(201).json(result.value); + }); + + // GET /api/sessions — list all sessions + router.get('/', async (_req: Request, res: Response) => { + const result = await deps.listSessions.execute({}); + if (!result.ok) { + res.status(500).json({ error: result.error }); + return; + } + res.json(result.value); + }); + + // GET /api/sessions/:id — session detail + router.get('/:id', async (req: Request, res: Response) => { + const sessionId = req.params['id'] as string; + const result = await deps.getSession.execute({ sessionId }); + if (!result.ok) { + res.status(404).json({ error: result.error }); + return; + } + res.json(result.value); + }); + + // DELETE /api/sessions/:id — stop a session + router.delete('/:id', async (req: Request, res: Response) => { + const sessionId = req.params['id'] as string; + const result = await deps.stopCrawl.execute({ sessionId }); + if (!result.ok) { + res.status(404).json({ error: result.error }); + return; + } + res.json({ stopped: true }); + }); + + return router; +} diff --git a/src/modules/crawling/infrastructure/repositories/KyselyCrawlSessionRepository.ts b/src/modules/crawling/infrastructure/repositories/KyselyCrawlSessionRepository.ts new file mode 100644 index 0000000..6599b3e --- /dev/null +++ b/src/modules/crawling/infrastructure/repositories/KyselyCrawlSessionRepository.ts @@ -0,0 +1,90 @@ +/** + * KyselyCrawlSessionRepository — implements ICrawlSessionRepository using Kysely. + */ +import { Kysely } from 'kysely'; +import { Database, SessionTable } from '../../../../shared/infrastructure/DatabaseConnection'; +import { ICrawlSessionRepository } from '../../domain/ports/ICrawlSessionRepository'; +import { CrawlSession, CrawlSessionProps } from '../../domain/entities/CrawlSession'; +import { UniqueId } from '../../../../shared/domain/UniqueId'; + +export class KyselyCrawlSessionRepository implements ICrawlSessionRepository { + constructor(private readonly db: Kysely) {} + + async save(session: CrawlSession): Promise { + const row: SessionTable = { + id: session.id.toString(), + url: session.url, + status: session.status, + seed: session.seed, + max_states: session.maxStates, + states_visited: session.statesVisited, + anomalies_found: 0, + started_at: Date.now(), + finished_at: null, + config_json: JSON.stringify(session.config), + }; + + await this.db + .insertInto('sessions') + .values(row) + .execute(); + } + + async findById(id: UniqueId): Promise { + const row = await this.db + .selectFrom('sessions') + .selectAll() + .where('id', '=', id.toString()) + .executeTakeFirst(); + + if (!row) return null; + + return this.toDomain(row); + } + + async findAll(): Promise { + const rows = await this.db + .selectFrom('sessions') + .selectAll() + .orderBy('started_at', 'desc') + .execute(); + + return rows.map((row) => this.toDomain(row)); + } + + async update(session: CrawlSession): Promise { + const isTerminal = session.status === 'completed' || session.status === 'failed' || session.status === 'stopped'; + + await this.db + .updateTable('sessions') + .set({ + status: session.status, + states_visited: session.statesVisited, + finished_at: isTerminal ? Date.now() : null, + config_json: JSON.stringify(session.config), + }) + .where('id', '=', session.id.toString()) + .execute(); + } + + private toDomain(row: SessionTable): CrawlSession { + const props: CrawlSessionProps = { + url: row.url, + status: row.status as CrawlSessionProps['status'], + seed: row.seed, + maxStates: row.max_states, + statesVisited: row.states_visited, + config: this.parseJson(row.config_json), + }; + + return CrawlSession.reconstitute(props, UniqueId.from(row.id)); + } + + private parseJson(json: string): Record { + try { + return JSON.parse(json) as Record; + } catch { + return {}; + } + } +} diff --git a/src/modules/crawling/infrastructure/repositories/KyselyStateRepository.ts b/src/modules/crawling/infrastructure/repositories/KyselyStateRepository.ts new file mode 100644 index 0000000..556a125 --- /dev/null +++ b/src/modules/crawling/infrastructure/repositories/KyselyStateRepository.ts @@ -0,0 +1,86 @@ +/** + * KyselyStateRepository — implements IStateRepository using Kysely. + */ +import { Kysely } from 'kysely'; +import { Database, StateTable } from '../../../../shared/infrastructure/DatabaseConnection'; +import { IStateRepository } from '../../domain/ports/IStateRepository'; +import { CrawlState } from '../../domain/entities/CrawlState'; +import { UniqueId } from '../../../../shared/domain/UniqueId'; + +export class KyselyStateRepository implements IStateRepository { + constructor(private readonly db: Kysely) {} + + async save(state: CrawlState): Promise { + const row: StateTable = { + id: state.id.toString(), + session_id: state.sessionId, + url: state.url, + title: state.title, + dom_snapshot_path: null, + visit_count: state.visitCount, + discovered_at: Date.now(), + }; + + await this.db + .insertInto('states') + .values(row) + .execute(); + } + + async findById(id: UniqueId): Promise { + const row = await this.db + .selectFrom('states') + .selectAll() + .where('id', '=', id.toString()) + .executeTakeFirst(); + + if (!row) return null; + + return this.toDomain(row); + } + + async findAll(): Promise { + const rows = await this.db + .selectFrom('states') + .selectAll() + .execute(); + + return rows.map((row) => this.toDomain(row)); + } + + async findBySessionId(sessionId: string): Promise { + const rows = await this.db + .selectFrom('states') + .selectAll() + .where('session_id', '=', sessionId) + .execute(); + + return rows.map((row) => this.toDomain(row)); + } + + async update(state: CrawlState): Promise { + await this.db + .updateTable('states') + .set({ + visit_count: state.visitCount, + url: state.url, + title: state.title, + }) + .where('id', '=', state.id.toString()) + .execute(); + } + + private toDomain(row: StateTable): CrawlState { + return CrawlState.create( + { + url: row.url, + title: row.title, + domSnapshot: '', + visitCount: row.visit_count, + stateId: row.id, + sessionId: row.session_id, + }, + UniqueId.from(row.id) + ); + } +}