feat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from interruptions instead of restarting from scratch.

- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in README
This commit is contained in:
@@ -4,6 +4,8 @@ import (
|
||||
"encoding/json"
|
||||
"math/rand"
|
||||
"os"
|
||||
"os/signal"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"scrappr/internal/logx"
|
||||
@@ -20,6 +22,7 @@ func Run() error {
|
||||
|
||||
cfg := scraper.DefaultConfig()
|
||||
s := scraper.New(cfg)
|
||||
installSignalCheckpoint(s)
|
||||
|
||||
dataset, err := s.Run()
|
||||
if err != nil {
|
||||
@@ -39,9 +42,27 @@ func Run() error {
|
||||
outputPath,
|
||||
)
|
||||
|
||||
if err := s.ClearCheckpoint(); err != nil {
|
||||
logx.Eventf("warn", "failed to clear checkpoint: %v", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func installSignalCheckpoint(s *scraper.Scraper) {
|
||||
signals := make(chan os.Signal, 1)
|
||||
signal.Notify(signals, os.Interrupt, syscall.SIGTERM)
|
||||
|
||||
go func() {
|
||||
sig := <-signals
|
||||
logx.Eventf("warn", "received %s, saving checkpoint before exit", sig.String())
|
||||
if err := s.SaveCheckpoint("signal"); err != nil {
|
||||
logx.Eventf("error", "failed to save checkpoint on signal: %v", err)
|
||||
}
|
||||
os.Exit(130)
|
||||
}()
|
||||
}
|
||||
|
||||
func writeDataset(outputPath string, dataset model.Dataset) error {
|
||||
file, err := os.Create(outputPath)
|
||||
if err != nil {
|
||||
|
||||
Reference in New Issue
Block a user