Files
Daniel Legt 6bf221de3f feat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.

- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in README

feat(scraper): add checkpointing and richer page extraction

Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.

- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in README
2026-03-15 17:08:24 +02:00

111 lines
2.9 KiB
Go

package scraper
import (
"regexp"
"time"
)
// Config groups the scraper's settings into a single value. The per-group
// notes below are inferred from field names and from the defaults assigned in
// DefaultConfig; confirm exact semantics against the code that consumes them.
type Config struct {
	// Target site and on-disk locations.
	BaseURL, AllowedDomain   string
	CacheDir, CheckpointPath string

	// Crawl bounds (presumably link depth, retry attempts and queue size).
	MaxDepth, MaxRetries, MaxQueuedPages int

	// Autosave triggers: one expressed as a duration, one as a page count.
	AutosaveEvery time.Duration
	AutosavePages int

	// Timing knobs for requests, retries and progress output.
	RequestDelay, RequestJitter, RequestTimeout time.Duration
	RetryBaseDelay, ProgressEvery               time.Duration

	// Values that look intended for outgoing request headers (TODO confirm).
	BrowserReferrer string
	BrowserAgents   []string

	// Absolute URLs used as starting points.
	ItemSeeds, CraftingSeeds []string

	// Path filters: a prefix list and an exact-match table.
	IgnoredPrefixes []string
	IgnoredExact    map[string]bool

	// Precompiled regular expressions.
	ItemListPathRe, CraftingPathRe *regexp.Regexp
	AmountPrefixRe, WhitespaceRe   *regexp.Regexp

	// Glyphs for an animated progress indicator.
	SpinnerFrames []string
}
// DefaultConfig builds and returns a Config filled with the stock settings
// for the wiki hosted at outward.fandom.com.
//
// Every call allocates fresh slices and a fresh map and compiles the four
// regular expressions again, so the returned value shares no mutable state
// with the result of any other call.
func DefaultConfig() Config {
	const site = "https://outward.fandom.com"

	var cfg Config

	// Target site and on-disk locations.
	cfg.BaseURL = site
	cfg.AllowedDomain = "outward.fandom.com"
	cfg.CacheDir = ".cache"
	cfg.CheckpointPath = ".cache/scrape-state.json"

	// Limits.
	cfg.MaxDepth = 3
	cfg.MaxRetries = 2
	cfg.MaxQueuedPages = 1500

	// Autosave triggers.
	cfg.AutosaveEvery = 12 * time.Second
	cfg.AutosavePages = 20

	// Timing.
	cfg.RequestDelay = 650 * time.Millisecond
	cfg.RequestJitter = 350 * time.Millisecond
	cfg.RequestTimeout = 8 * time.Second
	cfg.RetryBaseDelay = 1200 * time.Millisecond
	cfg.ProgressEvery = 3 * time.Second

	// Browser-like referrer and user-agent strings.
	cfg.BrowserReferrer = site + "/"
	cfg.BrowserAgents = []string{
		"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
		"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
	}

	// Seed pages, expressed as absolute URLs under the site root.
	cfg.ItemSeeds = []string{
		site + "/wiki/Items/Weapons",
		site + "/wiki/Items/Equipment",
		site + "/wiki/Items/Consumables",
		site + "/wiki/Items/Ingredients",
		site + "/wiki/Items/Deployables",
		site + "/wiki/Items/Other",
		site + "/wiki/Items/Item_Values",
	}
	cfg.CraftingSeeds = []string{
		site + "/wiki/Crafting",
		site + "/wiki/Crafting/Survival",
		site + "/wiki/Crafting/Cooking",
		site + "/wiki/Crafting/Alchemy",
	}

	// Wiki namespace prefixes listed for exclusion.
	cfg.IgnoredPrefixes = []string{
		"/wiki/File:",
		"/wiki/Category:",
		"/wiki/Special:",
		"/wiki/Help:",
		"/wiki/Template:",
		"/wiki/User:",
		"/wiki/User_blog:",
		"/wiki/Forum:",
		"/wiki/Message_Wall:",
		"/wiki/Thread:",
		"/wiki/Map:",
	}

	// The "/wiki/Crafting" key is stored with the value false. An indexed
	// read yields the same result as an absent key, but a comma-ok lookup
	// reports the key as present, so the entry is kept exactly as written.
	cfg.IgnoredExact = map[string]bool{
		"/wiki/Outward_Wiki": true,
		"/wiki/Items":        true,
		"/wiki/Crafting":     false,
	}

	// Patterns: the two path expressions match the section root itself or
	// anything beneath it; the amount prefix captures a leading "<digits>x".
	cfg.ItemListPathRe = regexp.MustCompile(`^/wiki/Items(?:/|$)`)
	cfg.CraftingPathRe = regexp.MustCompile(`^/wiki/Crafting(?:/|$)`)
	cfg.AmountPrefixRe = regexp.MustCompile(`^\s*(\d+x)\s+`)
	cfg.WhitespaceRe = regexp.MustCompile(`\s+`)

	// Braille glyphs that make up the spinner animation.
	cfg.SpinnerFrames = []string{"⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"}

	return cfg
}