fase(4): crawling infrastructure with migrated code

This commit is contained in:
debian
2026-03-05 03:08:48 -05:00
parent 39c5313ba5
commit 96bf6e5097
16 changed files with 1105 additions and 3 deletions

View File

@@ -0,0 +1,72 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.CrawlingStateGraph = void 0;
class CrawlingStateGraph {
constructor() {
this.states = new Map();
this.transitions = [];
/** Insertion order for BFS */
this.insertionOrder = [];
}
addState(state) {
if (!this.states.has(state.id)) {
this.states.set(state.id, state);
this.insertionOrder.push(state.id);
}
else {
const existing = this.states.get(state.id);
this.states.set(state.id, { ...existing, visitCount: existing.visitCount + 1 });
}
}
hasState(stateId) {
return this.states.has(stateId);
}
getState(stateId) {
return this.states.get(stateId);
}
incrementVisit(stateId) {
const state = this.states.get(stateId);
if (state) {
this.states.set(stateId, { ...state, visitCount: state.visitCount + 1 });
}
}
recordTransition(fromId, action, toId) {
this.transitions.push({ fromId, action, toId, timestamp: Date.now() });
}
getUnvisited() {
return this.insertionOrder
.map((id) => this.states.get(id))
.filter((s) => s.visitCount === 0);
}
/** BFS heuristic: returns the oldest unvisited state, or null if none */
getNextToExplore() {
const unvisited = this.getUnvisited();
return unvisited.length > 0 ? unvisited[0] : null;
}
getAllStates() {
return this.insertionOrder.map((id) => this.states.get(id));
}
getTransitions() {
return [...this.transitions];
}
toJSON() {
return {
stateCount: this.states.size,
transitionCount: this.transitions.length,
states: this.getAllStates().map((s) => ({
id: s.id,
url: s.url,
title: s.title,
visitCount: s.visitCount,
})),
transitions: this.transitions.map((t) => ({
fromId: t.fromId,
toId: t.toId,
actionId: t.action.id,
actionType: t.action.type,
timestamp: t.timestamp,
})),
};
}
}
exports.CrawlingStateGraph = CrawlingStateGraph;