feat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from interruptions instead of restarting from scratch.

- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in README
This commit is contained in:
@@ -8,9 +8,13 @@ import (
|
||||
type Config struct {
|
||||
BaseURL string
|
||||
AllowedDomain string
|
||||
CacheDir string
|
||||
CheckpointPath string
|
||||
MaxDepth int
|
||||
MaxRetries int
|
||||
MaxQueuedPages int
|
||||
AutosaveEvery time.Duration
|
||||
AutosavePages int
|
||||
RequestDelay time.Duration
|
||||
RequestJitter time.Duration
|
||||
RequestTimeout time.Duration
|
||||
@@ -35,9 +39,13 @@ func DefaultConfig() Config {
|
||||
return Config{
|
||||
BaseURL: baseURL,
|
||||
AllowedDomain: "outward.fandom.com",
|
||||
CacheDir: ".cache",
|
||||
CheckpointPath: ".cache/scrape-state.json",
|
||||
MaxDepth: 3,
|
||||
MaxRetries: 2,
|
||||
MaxQueuedPages: 1500,
|
||||
AutosaveEvery: 12 * time.Second,
|
||||
AutosavePages: 20,
|
||||
RequestDelay: 650 * time.Millisecond,
|
||||
RequestJitter: 350 * time.Millisecond,
|
||||
RequestTimeout: 8 * time.Second,
|
||||
|
||||
Reference in New Issue
Block a user