fase(4): crawling infrastructure with migrated code

This commit is contained in:
debian
2026-03-05 03:08:48 -05:00
parent 39c5313ba5
commit 96bf6e5097
16 changed files with 1105 additions and 3 deletions

View File

@@ -11,6 +11,10 @@ class CrawlSession extends AggregateRoot_1.AggregateRoot {
/**
 * Builds a CrawlSession by handing `props` and `id` unchanged to the
 * AggregateRoot base constructor; no extra work is done here.
 *
 * @param {object} props - Session properties, passed straight to the base class.
 * @param {*} id - Identifier, passed straight to the base class.
 */
constructor(props, id) {
super(props, id);
}
/** Reconstruct from persistence without emitting domain events */
static reconstitute(props, id) {
return new CrawlSession(props, id);
}
static create(request) {
const urlResult = Url_1.Url.create(request.url);
if (!urlResult.ok) {

View File

@@ -0,0 +1,72 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.CrawlingStateGraph = void 0;
/**
 * In-memory graph of discovered crawl states and the transitions between them.
 *
 * States are keyed by `id` and remembered in first-insertion order so that
 * exploration can proceed breadth-first. Stored state objects are never
 * mutated in place: every counter change replaces the map entry with a copy.
 */
class CrawlingStateGraph {
    constructor() {
        this.states = new Map();
        this.transitions = [];
        /** State ids in first-insertion order; drives the BFS ordering */
        this.insertionOrder = [];
    }

    /**
     * Returns `state` itself when it carries a finite numeric `visitCount`,
     * otherwise a copy whose `visitCount` is 0. Without this a missing counter
     * would turn into NaN on the first increment and the state would never
     * match the `visitCount === 0` "unvisited" check.
     */
    static withVisitCount(state) {
        return Number.isFinite(state.visitCount) ? state : { ...state, visitCount: 0 };
    }

    /**
     * Registers a state. A new id is stored and appended to the insertion
     * order; an id that is already known has its visit counter bumped instead
     * (the incoming object is ignored in that case).
     *
     * @param {{id: string, visitCount?: number}} state
     */
    addState(state) {
        const existing = this.states.get(state.id);
        if (existing === undefined) {
            this.states.set(state.id, CrawlingStateGraph.withVisitCount(state));
            this.insertionOrder.push(state.id);
            return;
        }
        this.states.set(state.id, { ...existing, visitCount: existing.visitCount + 1 });
    }

    /** @returns {boolean} Whether a state with this id has been added. */
    hasState(stateId) {
        return this.states.has(stateId);
    }

    /** @returns {object|undefined} The stored state, or undefined when unknown. */
    getState(stateId) {
        return this.states.get(stateId);
    }

    /** Adds one to the visit counter of a known state; unknown ids are ignored. */
    incrementVisit(stateId) {
        const state = this.states.get(stateId);
        if (state) {
            this.states.set(stateId, { ...state, visitCount: state.visitCount + 1 });
        }
    }

    /** Appends a transition record stamped with the current time. */
    recordTransition(fromId, action, toId) {
        this.transitions.push({ fromId, action, toId, timestamp: Date.now() });
    }

    /** @returns {object[]} States whose visit counter is still 0, in insertion order. */
    getUnvisited() {
        return this.getAllStates().filter((s) => s.visitCount === 0);
    }

    /** BFS heuristic: returns the oldest unvisited state, or null if none */
    getNextToExplore() {
        // Stop at the first match instead of materialising every unvisited state.
        for (const id of this.insertionOrder) {
            const state = this.states.get(id);
            if (state.visitCount === 0) {
                return state;
            }
        }
        return null;
    }

    /** @returns {object[]} Every stored state, in insertion order. */
    getAllStates() {
        return this.insertionOrder.map((id) => this.states.get(id));
    }

    /** @returns {object[]} A shallow copy of the transition list. */
    getTransitions() {
        return [...this.transitions];
    }

    /**
     * Summarises the graph as plain data: counts, a trimmed view of each state
     * (id, url, title, visitCount) and each transition (endpoints, action id
     * and type, timestamp).
     */
    toJSON() {
        return {
            stateCount: this.states.size,
            transitionCount: this.transitions.length,
            states: this.getAllStates().map((s) => ({
                id: s.id,
                url: s.url,
                title: s.title,
                visitCount: s.visitCount,
            })),
            transitions: this.transitions.map((t) => ({
                fromId: t.fromId,
                toId: t.toId,
                actionId: t.action.id,
                actionType: t.action.type,
                timestamp: t.timestamp,
            })),
        };
    }
}
exports.CrawlingStateGraph = CrawlingStateGraph;

View File

@@ -0,0 +1,180 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.ExplorationOrchestrator = void 0;
const AnomalyDetector_1 = require("../../../../core/AnomalyDetector");
const Logger_1 = require("../../../../core/Logger");
/**
 * Runs one exploration session against a crawler engine: launches the target,
 * repeatedly picks the next unvisited state from the graph, executes one
 * seed-selected action on it, records the transition, and reports any
 * anomalies found by the detector, the state hooks, or the fuzzing plugin.
 */
class ExplorationOrchestrator {
    /**
     * @param {object} config
     * @param {object} config.graph - State graph (addState, hasState, incrementVisit,
     *   getNextToExplore, recordTransition, getAllStates).
     * @param {object} config.engine - Crawler engine (launch, captureState,
     *   discoverActions, executeAction, close).
     * @param {object} [config.detector] - Anomaly detector; defaults to a new AnomalyDetector.
     * @param {object[]} [config.collectors=[]] - Evidence collectors run on detected anomalies.
     * @param {object[]} [config.exporters=[]] - Exporters invoked once per anomaly.
     * @param {object} [config.reproducer] - Stored as-is; not used by `run()`.
     * @param {object} [config.logger] - Structured logger; defaults to a NullLogger.
     * @param {number} config.seed - Drives the deterministic action choice.
     * @param {string} config.url - Target URL passed to `engine.launch`.
     * @param {number} [config.maxSteps=100] - Upper bound on executed main-loop steps.
     * @param {string} [config.outputDir='./reports'] - Root directory for exported anomalies.
     * @param {object} [config.events={}] - Optional lifecycle callbacks.
     * @param {string} [config.sessionId] - Defaults to `<timestamp>_<seed>`.
     * @param {object} [config.explorationConfig={}] - sessionTimeoutMs, maxDepth, fuzzingEnabled.
     * @param {object} [config.fuzzingPlugin] - Optional source of fuzz actions.
     * @param {Function[]} [config.stateHooks=[]] - Async hooks run on each newly discovered state.
     */
    constructor(config) {
        this.actionTrace = [];
        this.aborted = false;
        this.graph = config.graph;
        this.engine = config.engine;
        this.detector = config.detector ?? new AnomalyDetector_1.AnomalyDetector();
        this.collectors = config.collectors ?? [];
        this.exporters = config.exporters ?? [];
        this.reproducer = config.reproducer;
        this.logger = config.logger ?? new Logger_1.NullLogger();
        this.seed = config.seed;
        this.url = config.url;
        this.maxSteps = config.maxSteps ?? 100;
        this.outputDir = config.outputDir ?? './reports';
        this.events = config.events ?? {};
        this.sessionId = config.sessionId ?? `${Date.now()}_${config.seed}`;
        this.explorationConfig = config.explorationConfig ?? {};
        this.fuzzingPlugin = config.fuzzingPlugin;
        this.stateHooks = config.stateHooks ?? [];
    }

    /** Requests a cooperative stop; the loop checks the flag before each step and fuzz action. */
    stop() {
        this.aborted = true;
    }

    /**
     * Executes the session.
     *
     * @returns {Promise<{statesVisited: number, anomaliesFound: number, anomalies: object[]}>}
     * @throws Rethrows any error raised during exploration, after notifying
     *   `events.onSessionError` and attempting to close the engine.
     */
    async run() {
        const anomalies = [];
        let stepsExecuted = 0;
        // Counts newly discovered states; compared against explorationConfig.maxDepth.
        let depth = 0;
        const sessionTimeoutMs = this.explorationConfig.sessionTimeoutMs ?? 0;
        const maxDepth = this.explorationConfig.maxDepth ?? Infinity;
        const sessionStart = Date.now();
        this.logger.log({
            event: 'session_start',
            timestamp: sessionStart,
            seed: this.seed,
            target: this.url,
        });
        this.events.onSessionStarted?.(this.sessionId, this.url);
        // A timeout of 0 (the default) disables the time limit.
        const isTimedOut = () => sessionTimeoutMs > 0 && Date.now() - sessionStart >= sessionTimeoutMs;
        try {
            await this.engine.launch(this.url);
            const initialState = await this.engine.captureState();
            this.graph.addState(initialState);
            this.announceState(initialState);
            while (stepsExecuted < this.maxSteps && !this.aborted && !isTimedOut() && depth <= maxDepth) {
                const currentState = this.graph.getNextToExplore();
                if (!currentState) {
                    break;
                }
                // Marking the state visited first guarantees progress even when it has no actions.
                this.graph.incrementVisit(currentState.id);
                const actions = await this.engine.discoverActions(currentState);
                if (actions.length === 0) {
                    continue;
                }
                const action = actions[this.selectActionIndex(stepsExecuted, actions.length)];
                this.logger.log({
                    event: 'action_executed',
                    timestamp: Date.now(),
                    actionId: action.id,
                    type: action.type,
                    selector: action.selector,
                    value: action.value,
                    url: action.url,
                });
                this.events.onActionExecuted?.(this.sessionId, action.type, action.selector, Date.now());
                const observation = await this.engine.executeAction(action);
                this.actionTrace.push(action);
                if (!this.graph.hasState(observation.newStateId)) {
                    const newState = await this.engine.captureState();
                    this.graph.addState(newState);
                    depth += 1;
                    this.announceState(newState);
                    for (const hook of this.stateHooks) {
                        // Hooks are best-effort: a rejected hook contributes no anomalies.
                        const hookAnomalies = await hook(newState, this.engine, this.sessionId, [...this.actionTrace]).catch(() => []);
                        for (const anomaly of hookAnomalies) {
                            // Hook anomalies are reported as delivered; collectors are not run on them.
                            await this.processAnomaly(anomaly, anomalies, false);
                        }
                    }
                }
                this.graph.recordTransition(currentState.id, action, observation.newStateId);
                this.logger.log({
                    event: 'exploration_step',
                    timestamp: Date.now(),
                    stateId: currentState.id,
                    actionId: action.id,
                });
                const detected = this.detector.detect(observation, [...this.actionTrace]);
                for (const anomaly of detected) {
                    await this.processAnomaly(anomaly, anomalies, true);
                }
                stepsExecuted += 1;
                if (this.fuzzingPlugin &&
                    this.explorationConfig.fuzzingEnabled !== false &&
                    currentState.domSnapshot) {
                    const fuzzActions = this.fuzzingPlugin.generateFuzzActions(currentState.domSnapshot, currentState);
                    for (const fuzzAction of fuzzActions) {
                        if (this.aborted || isTimedOut()) {
                            break;
                        }
                        const fuzzObs = await this.engine.executeAction(fuzzAction);
                        this.actionTrace.push(fuzzAction);
                        const fuzzAnomalies = this.detector.detect(fuzzObs, [...this.actionTrace]);
                        for (const anomaly of fuzzAnomalies) {
                            await this.processAnomaly(anomaly, anomalies, true);
                        }
                    }
                }
            }
        }
        catch (err) {
            const msg = err instanceof Error ? err.message : String(err);
            this.events.onSessionError?.(this.sessionId, msg);
            // Closing is best-effort here so the original error is the one that propagates.
            await this.engine.close().catch(() => undefined);
            throw err;
        }
        await this.engine.close();
        const statesVisited = this.graph.getAllStates().filter((s) => s.visitCount > 0).length;
        this.logger.log({
            event: 'session_end',
            timestamp: Date.now(),
            statesVisited,
            anomaliesFound: anomalies.length,
        });
        this.events.onSessionCompleted?.(this.sessionId, statesVisited, anomalies.length);
        return { statesVisited, anomaliesFound: anomalies.length, anomalies };
    }

    /**
     * Maps (seed + step) onto a valid index in `[0, actionCount)`.
     * JavaScript's `%` keeps the sign of the dividend, so a negative seed would
     * otherwise yield a negative index; non-integer or non-numeric seeds are
     * truncated / treated as 0 so the lookup never lands between elements.
     */
    selectActionIndex(step, actionCount) {
        const numericSeed = Number(this.seed);
        const seed = Number.isFinite(numericSeed) ? Math.trunc(numericSeed) : 0;
        const remainder = (seed + step) % actionCount;
        return remainder < 0 ? remainder + actionCount : remainder;
    }

    /** Logs a `state_discovered` entry and fires `events.onStateDiscovered` for `state`. */
    announceState(state) {
        this.logger.log({
            event: 'state_discovered',
            timestamp: Date.now(),
            stateId: state.id,
            url: state.url,
            title: state.title,
        });
        this.events.onStateDiscovered?.(this.sessionId, state.id, state.url, state.title);
    }

    /**
     * Single reporting path for every anomaly: optionally merges collector
     * evidence into `anomaly.evidence`, appends the anomaly to `anomalies`,
     * logs `anomaly_detected`, fires `events.onAnomalyDetected`, and runs each
     * exporter with `<outputDir>/<anomaly.id>` as the destination.
     *
     * @param {object} anomaly
     * @param {object[]} anomalies - Session-wide accumulator, mutated in place.
     * @param {boolean} collectEvidence - Whether to run the configured collectors first.
     */
    async processAnomaly(anomaly, anomalies, collectEvidence) {
        if (collectEvidence) {
            for (const collector of this.collectors) {
                const evidence = await collector.collect(anomaly, this.engine);
                Object.assign(anomaly.evidence, evidence);
            }
        }
        anomalies.push(anomaly);
        this.logger.log({
            event: 'anomaly_detected',
            timestamp: Date.now(),
            anomalyId: anomaly.id,
            type: anomaly.type,
            severity: anomaly.severity,
        });
        this.events.onAnomalyDetected?.(this.sessionId, anomaly);
        for (const exporter of this.exporters) {
            await exporter.export(anomaly, `${this.outputDir}/${anomaly.id}`);
        }
    }
}
exports.ExplorationOrchestrator = ExplorationOrchestrator;

View File

@@ -0,0 +1,13 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.PlaywrightCrawlerEngine = void 0;
/**
* PlaywrightCrawlerEngine — adapts PlaywrightAgent to implement the ICrawlerEngine port.
*/
const PlaywrightAgent_1 = require("../../../../plugins/agents/PlaywrightAgent");
/**
 * Crawler engine that subclasses PlaywrightAgent without adding or overriding
 * any members; all behavior is inherited from the base class.
 */
class PlaywrightCrawlerEngine extends PlaywrightAgent_1.PlaywrightAgent {
/**
 * @param {object} [config={}] - Options forwarded unchanged to the
 *   PlaywrightAgent constructor; an empty object is used when omitted.
 */
constructor(config = {}) {
super(config);
}
}
exports.PlaywrightCrawlerEngine = PlaywrightCrawlerEngine;

View File

@@ -0,0 +1,56 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.createCrawlingRouter = createCrawlingRouter;
/**
* CrawlingController — thin Express controller for crawling routes.
* Delegates to use cases; returns Result-based responses.
*/
const express_1 = require("express");
/**
 * Builds the Express router for crawl sessions. Each route delegates to one
 * use case from `deps` and translates its `{ ok, value | error }` result into
 * an HTTP response.
 *
 * @param {object} deps
 * @param {{execute: Function}} deps.startCrawl - Used by POST /.
 * @param {{execute: Function}} deps.listSessions - Used by GET /.
 * @param {{execute: Function}} deps.getSession - Used by GET /:id.
 * @param {{execute: Function}} deps.stopCrawl - Used by DELETE /:id.
 * @returns {object} The configured router.
 */
function createCrawlingRouter(deps) {
    const router = (0, express_1.Router)();
    // Express 4 does not observe promises returned by handlers, so a rejected
    // use case would leave the request hanging. Route rejections to `next` so
    // the application's error middleware can answer.
    const forwardErrors = (handler) => (req, res, next) => handler(req, res).catch(next);
    // POST /api/sessions — start a new crawl session
    router.post('/', forwardErrors(async (req, res) => {
        // req.body is undefined when no body parser ran or the request had no body.
        const { url, seed = 42, maxStates = 50, config } = req.body ?? {};
        if (!url || typeof url !== 'string') {
            res.status(400).json({ error: 'url is required' });
            return;
        }
        const result = await deps.startCrawl.execute({ url, seed, maxStates, config });
        if (!result.ok) {
            res.status(422).json({ error: result.error });
            return;
        }
        res.status(201).json(result.value);
    }));
    // GET /api/sessions — list all sessions
    router.get('/', forwardErrors(async (_req, res) => {
        const result = await deps.listSessions.execute({});
        if (!result.ok) {
            res.status(500).json({ error: result.error });
            return;
        }
        res.json(result.value);
    }));
    // GET /api/sessions/:id — session detail
    router.get('/:id', forwardErrors(async (req, res) => {
        const sessionId = req.params['id'];
        const result = await deps.getSession.execute({ sessionId });
        if (!result.ok) {
            res.status(404).json({ error: result.error });
            return;
        }
        res.json(result.value);
    }));
    // DELETE /api/sessions/:id — stop a session
    router.delete('/:id', forwardErrors(async (req, res) => {
        const sessionId = req.params['id'];
        const result = await deps.stopCrawl.execute({ sessionId });
        if (!result.ok) {
            res.status(404).json({ error: result.error });
            return;
        }
        res.json({ stopped: true });
    }));
    return router;
}

View File

@@ -0,0 +1,79 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.KyselyCrawlSessionRepository = void 0;
const CrawlSession_1 = require("../../domain/entities/CrawlSession");
const UniqueId_1 = require("../../../../shared/domain/UniqueId");
/**
 * Kysely-backed repository that persists CrawlSession aggregates in the
 * `sessions` table and rebuilds them via `CrawlSession.reconstitute`.
 */
class KyselyCrawlSessionRepository {
    /** @param {object} db - Kysely database handle used for all queries. */
    constructor(db) {
        this.db = db;
    }

    /**
     * Inserts a new row for `session`. `anomalies_found` starts at 0,
     * `started_at` is the current time and `finished_at` is null.
     */
    async save(session) {
        const row = {
            id: session.id.toString(),
            url: session.url,
            status: session.status,
            seed: session.seed,
            max_states: session.maxStates,
            states_visited: session.statesVisited,
            anomalies_found: 0,
            started_at: Date.now(),
            finished_at: null,
            // JSON.stringify(undefined) yields undefined, not a string; persist "{}" instead.
            config_json: JSON.stringify(session.config ?? {}),
        };
        await this.db
            .insertInto('sessions')
            .values(row)
            .execute();
    }

    /** @returns {Promise<object|null>} The session with this id, or null when no row matches. */
    async findById(id) {
        const row = await this.db
            .selectFrom('sessions')
            .selectAll()
            .where('id', '=', id.toString())
            .executeTakeFirst();
        if (!row) {
            return null;
        }
        return this.toDomain(row);
    }

    /** @returns {Promise<object[]>} All sessions, most recently started first. */
    async findAll() {
        const rows = await this.db
            .selectFrom('sessions')
            .selectAll()
            .orderBy('started_at', 'desc')
            .execute();
        return rows.map((row) => this.toDomain(row));
    }

    /**
     * Writes status, visited-state count and config back to the session's row.
     * `finished_at` is set to the current time when the status is completed,
     * failed or stopped, and to null otherwise.
     */
    async update(session) {
        const isTerminal = session.status === 'completed' || session.status === 'failed' || session.status === 'stopped';
        await this.db
            .updateTable('sessions')
            .set({
                status: session.status,
                states_visited: session.statesVisited,
                finished_at: isTerminal ? Date.now() : null,
                config_json: JSON.stringify(session.config ?? {}),
            })
            .where('id', '=', session.id.toString())
            .execute();
    }

    /** Maps a `sessions` row to a CrawlSession without going through `create`. */
    toDomain(row) {
        const props = {
            url: row.url,
            status: row.status,
            seed: row.seed,
            maxStates: row.max_states,
            statesVisited: row.states_visited,
            config: this.parseJson(row.config_json),
        };
        return CrawlSession_1.CrawlSession.reconstitute(props, UniqueId_1.UniqueId.from(row.id));
    }

    /**
     * Parses stored config JSON, falling back to `{}` for anything that is not
     * an object. `JSON.parse(null)` succeeds and returns null, so malformed
     * input is not the only case that needs the fallback.
     *
     * @param {string|null|undefined} json
     * @returns {object}
     */
    parseJson(json) {
        let parsed;
        try {
            parsed = JSON.parse(json);
        }
        catch {
            return {};
        }
        return parsed !== null && typeof parsed === 'object' ? parsed : {};
    }
}
exports.KyselyCrawlSessionRepository = KyselyCrawlSessionRepository;

View File

@@ -0,0 +1,72 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.KyselyStateRepository = void 0;
const CrawlState_1 = require("../../domain/entities/CrawlState");
const UniqueId_1 = require("../../../../shared/domain/UniqueId");
/**
 * Kysely-backed repository for crawl states stored in the `states` table.
 */
class KyselyStateRepository {
    /** @param {object} db - Kysely database handle used for all queries. */
    constructor(db) {
        this.db = db;
    }

    /**
     * Inserts a row for `state`. The snapshot path is stored as null and
     * `discovered_at` is the current time.
     */
    async save(state) {
        const record = {
            id: state.id.toString(),
            session_id: state.sessionId,
            url: state.url,
            title: state.title,
            dom_snapshot_path: null,
            visit_count: state.visitCount,
            discovered_at: Date.now(),
        };
        const insert = this.db.insertInto('states').values(record);
        await insert.execute();
    }

    /** @returns {Promise<object|null>} The state with this id, or null when no row matches. */
    async findById(id) {
        const lookup = this.db
            .selectFrom('states')
            .selectAll()
            .where('id', '=', id.toString());
        const match = await lookup.executeTakeFirst();
        return match ? this.toDomain(match) : null;
    }

    /** @returns {Promise<object[]>} Every stored state. */
    async findAll() {
        const records = await this.db.selectFrom('states').selectAll().execute();
        const result = [];
        for (const record of records) {
            result.push(this.toDomain(record));
        }
        return result;
    }

    /** @returns {Promise<object[]>} The states whose `session_id` equals `sessionId`. */
    async findBySessionId(sessionId) {
        const scoped = this.db
            .selectFrom('states')
            .selectAll()
            .where('session_id', '=', sessionId);
        const records = await scoped.execute();
        const result = [];
        for (const record of records) {
            result.push(this.toDomain(record));
        }
        return result;
    }

    /** Writes the visit count, url and title of `state` back to its row. */
    async update(state) {
        const pending = this.db.updateTable('states');
        const changes = {
            visit_count: state.visitCount,
            url: state.url,
            title: state.title,
        };
        await pending
            .set(changes)
            .where('id', '=', state.id.toString())
            .execute();
    }

    /**
     * Maps a `states` row to a CrawlState. The DOM snapshot is not loaded from
     * storage, so the entity is created with an empty snapshot string.
     */
    toDomain(row) {
        const props = {
            url: row.url,
            title: row.title,
            domSnapshot: '',
            visitCount: row.visit_count,
            stateId: row.id,
            sessionId: row.session_id,
        };
        const identity = UniqueId_1.UniqueId.from(row.id);
        return CrawlState_1.CrawlState.create(props, identity);
    }
}
exports.KyselyStateRepository = KyselyStateRepository;