feat(scraper): add checkpointing and richer page extraction

Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.

- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in README

feat(scraper): add checkpointing and richer page extraction

Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.

- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in README
This commit is contained in:
2026-03-15 17:08:24 +02:00
parent 42e2083ece
commit 6bf221de3f
9 changed files with 607 additions and 29 deletions

View File

@@ -4,6 +4,8 @@ import (
"encoding/json"
"math/rand"
"os"
"os/signal"
"syscall"
"time"
"scrappr/internal/logx"
@@ -20,6 +22,7 @@ func Run() error {
cfg := scraper.DefaultConfig()
s := scraper.New(cfg)
installSignalCheckpoint(s)
dataset, err := s.Run()
if err != nil {
@@ -39,9 +42,27 @@ func Run() error {
outputPath,
)
if err := s.ClearCheckpoint(); err != nil {
logx.Eventf("warn", "failed to clear checkpoint: %v", err)
}
return nil
}
// installSignalCheckpoint starts a background goroutine that waits for the
// first SIGINT (os.Interrupt) or SIGTERM, asks s to save a checkpoint, and
// then terminates the process.
//
// A failed save is logged but does not prevent the exit. The exit status
// follows the shell convention of 128 + signal number (130 for SIGINT, 143
// for SIGTERM) so a supervisor can tell which signal ended the process.
func installSignalCheckpoint(s *scraper.Scraper) {
	// Buffer of one: signal.Notify never blocks on send, so an unbuffered
	// channel could drop a signal that arrives before the goroutine receives.
	signals := make(chan os.Signal, 1)
	signal.Notify(signals, os.Interrupt, syscall.SIGTERM)

	go func() {
		sig := <-signals
		logx.Eventf("warn", "received %s, saving checkpoint before exit", sig.String())

		if err := s.SaveCheckpoint("signal"); err != nil {
			logx.Eventf("error", "failed to save checkpoint on signal: %v", err)
		}

		// Fall back to 130 when the signal's number cannot be determined.
		code := 130
		if num, ok := sig.(syscall.Signal); ok {
			code = 128 + int(num)
		}
		os.Exit(code)
	}()
}
func writeDataset(outputPath string, dataset model.Dataset) error {
file, err := os.Create(outputPath)
if err != nil {