Add resumable checkpoint support so long scrapes can recover from interruptions instead of restarting from scratch.

- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in README

feat(scraper): add checkpointing and richer page extraction

Add resumable checkpoint support so long scrapes can recover from interruptions instead of restarting from scratch.

- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in README
78 lines
1.5 KiB
Go
78 lines
1.5 KiB
Go
package app
|
|
|
|
import (
|
|
"encoding/json"
|
|
"math/rand"
|
|
"os"
|
|
"os/signal"
|
|
"syscall"
|
|
"time"
|
|
|
|
"scrappr/internal/logx"
|
|
"scrappr/internal/model"
|
|
"scrappr/internal/scraper"
|
|
)
|
|
|
|
// outputPath is the file name Run passes to writeDataset for the final
// dataset. It is a relative path, so it resolves against the process's
// current working directory.
const outputPath = "outward_data.json"
|
|
|
|
func Run() error {
|
|
rand.Seed(time.Now().UnixNano())
|
|
|
|
logx.Eventf("start", "Outward scraper booting")
|
|
|
|
cfg := scraper.DefaultConfig()
|
|
s := scraper.New(cfg)
|
|
installSignalCheckpoint(s)
|
|
|
|
dataset, err := s.Run()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
logx.Eventf("write", "writing dataset to %s", outputPath)
|
|
if err := writeDataset(outputPath, dataset); err != nil {
|
|
return err
|
|
}
|
|
|
|
logx.Eventf(
|
|
"success",
|
|
"wrote %d items and %d effects to %s",
|
|
len(dataset.Items),
|
|
len(dataset.Effects),
|
|
outputPath,
|
|
)
|
|
|
|
if err := s.ClearCheckpoint(); err != nil {
|
|
logx.Eventf("warn", "failed to clear checkpoint: %v", err)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func installSignalCheckpoint(s *scraper.Scraper) {
|
|
signals := make(chan os.Signal, 1)
|
|
signal.Notify(signals, os.Interrupt, syscall.SIGTERM)
|
|
|
|
go func() {
|
|
sig := <-signals
|
|
logx.Eventf("warn", "received %s, saving checkpoint before exit", sig.String())
|
|
if err := s.SaveCheckpoint("signal"); err != nil {
|
|
logx.Eventf("error", "failed to save checkpoint on signal: %v", err)
|
|
}
|
|
os.Exit(130)
|
|
}()
|
|
}
|
|
|
|
func writeDataset(outputPath string, dataset model.Dataset) error {
|
|
file, err := os.Create(outputPath)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer file.Close()
|
|
|
|
encoder := json.NewEncoder(file)
|
|
encoder.SetIndent("", " ")
|
|
|
|
return encoder.Encode(dataset)
|
|
}
|