Files
outward-scrapper/internal/app/run.go
Daniel Legt 6bf221de3f feat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.

- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in README

feat(scraper): add checkpointing and richer page extraction

Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.

- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in README
2026-03-15 17:08:24 +02:00

78 lines
1.5 KiB
Go

package app
import (
"encoding/json"
"math/rand"
"os"
"os/signal"
"syscall"
"time"
"scrappr/internal/logx"
"scrappr/internal/model"
"scrappr/internal/scraper"
)
// outputPath is the file that Run writes the scraped dataset to.
const outputPath = "outward_data.json"
// Run executes one full scrape: it builds a scraper from the default
// configuration, installs the signal-triggered checkpoint handler, runs the
// scraper, and writes the resulting dataset to outputPath.
//
// It returns the scraper's error or the write error unchanged. A failure to
// clear the checkpoint after a successful write is only logged as a warning;
// Run still returns nil in that case.
func Run() error {
	rand.Seed(time.Now().UnixNano())
	logx.Eventf("start", "Outward scraper booting")

	sc := scraper.New(scraper.DefaultConfig())
	installSignalCheckpoint(sc)

	result, runErr := sc.Run()
	if runErr != nil {
		return runErr
	}

	logx.Eventf("write", "writing dataset to %s", outputPath)
	writeErr := writeDataset(outputPath, result)
	if writeErr != nil {
		return writeErr
	}

	itemCount, effectCount := len(result.Items), len(result.Effects)
	logx.Eventf("success", "wrote %d items and %d effects to %s", itemCount, effectCount, outputPath)

	// The dataset is already on disk at this point, so a checkpoint that
	// cannot be removed is reported but does not fail the run.
	clearErr := sc.ClearCheckpoint()
	if clearErr != nil {
		logx.Eventf("warn", "failed to clear checkpoint: %v", clearErr)
	}
	return nil
}
// installSignalCheckpoint starts a goroutine that waits for the first
// os.Interrupt (SIGINT) or SIGTERM, calls s.SaveCheckpoint("signal"), and then
// terminates the process.
//
// The exit status follows the shell convention of 128 plus the signal number:
// 130 for SIGINT and 143 for SIGTERM. A failed checkpoint save is logged but
// does not change the exit status.
func installSignalCheckpoint(s *scraper.Scraper) {
	// Buffered because package signal does not block when sending; an
	// unbuffered channel could miss a signal that arrives before the
	// goroutine below is receiving.
	signals := make(chan os.Signal, 1)
	signal.Notify(signals, os.Interrupt, syscall.SIGTERM)

	go func() {
		sig := <-signals
		logx.Eventf("warn", "received %s, saving checkpoint before exit", sig.String())
		if err := s.SaveCheckpoint("signal"); err != nil {
			logx.Eventf("error", "failed to save checkpoint on signal: %v", err)
		}

		exitCode := 130 // 128 + SIGINT (2)
		if sig == syscall.SIGTERM {
			exitCode = 143 // 128 + SIGTERM (15)
		}
		os.Exit(exitCode)
	}()
}
// writeDataset creates (or truncates) the file at outputPath and writes
// dataset to it as indented JSON.
//
// It returns the first error from creating the file, encoding the dataset, or
// closing the file. The close error is checked because a write that failed to
// reach disk may only be reported when the file is closed.
func writeDataset(outputPath string, dataset model.Dataset) error {
	file, err := os.Create(outputPath)
	if err != nil {
		return err
	}

	encoder := json.NewEncoder(file)
	encoder.SetIndent("", " ")
	if err := encoder.Encode(dataset); err != nil {
		// The encode error is the one worth reporting; the close result
		// is deliberately dropped on this failure path.
		_ = file.Close()
		return err
	}
	return file.Close()
}