feat(scraper): add checkpointing and richer page extraction

Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.

- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in README
This commit is contained in:
2026-03-15 17:08:24 +02:00
parent 42e2083ece
commit 6bf221de3f
9 changed files with 607 additions and 29 deletions

4
.gitignore vendored
View File

@@ -32,3 +32,7 @@ go.work
.nfs* .nfs*
# End of https://www.toptal.com/developers/gitignore/api/go,linux # End of https://www.toptal.com/developers/gitignore/api/go,linux
.cache/
outward_data.json

View File

@@ -29,6 +29,8 @@ go run ./cmd/scrappr
- Limits crawl depth and queue size to avoid drifting into junk pages - Limits crawl depth and queue size to avoid drifting into junk pages
- Retries temporary failures with short backoff - Retries temporary failures with short backoff
- Prints colored emoji logs for queueing, requests, responses, parsing, retries, and periodic status - Prints colored emoji logs for queueing, requests, responses, parsing, retries, and periodic status
- Stores legacy and portable infobox fields, primary item image URLs, recipes, effects, and raw content tables for later processing
- Saves resumable checkpoints into `.cache/scrape-state.json` on a timer, during progress milestones, and on `Ctrl+C`
- Writes a stable, sorted JSON dataset to `outward_data.json` - Writes a stable, sorted JSON dataset to `outward_data.json`
## Tuning ## Tuning
@@ -37,4 +39,4 @@ Scraper defaults live in `internal/scraper/config.go`.
- Lower or raise `RequestDelay` / `RequestJitter` - Lower or raise `RequestDelay` / `RequestJitter`
- Tighten or relax `MaxQueuedPages` - Tighten or relax `MaxQueuedPages`
- Adjust `RequestTimeout`, `MaxRetries`, and `ProgressEvery` - Adjust `RequestTimeout`, `MaxRetries`, `ProgressEvery`, `AutosaveEvery`, and `AutosavePages`

View File

@@ -4,6 +4,8 @@ import (
"encoding/json" "encoding/json"
"math/rand" "math/rand"
"os" "os"
"os/signal"
"syscall"
"time" "time"
"scrappr/internal/logx" "scrappr/internal/logx"
@@ -20,6 +22,7 @@ func Run() error {
cfg := scraper.DefaultConfig() cfg := scraper.DefaultConfig()
s := scraper.New(cfg) s := scraper.New(cfg)
installSignalCheckpoint(s)
dataset, err := s.Run() dataset, err := s.Run()
if err != nil { if err != nil {
@@ -39,9 +42,27 @@ func Run() error {
outputPath, outputPath,
) )
if err := s.ClearCheckpoint(); err != nil {
logx.Eventf("warn", "failed to clear checkpoint: %v", err)
}
return nil return nil
} }
// installSignalCheckpoint saves a checkpoint when the process receives
// SIGINT or SIGTERM, then exits with 130 (the conventional 128+SIGINT code).
func installSignalCheckpoint(s *scraper.Scraper) {
	ch := make(chan os.Signal, 1)
	signal.Notify(ch, os.Interrupt, syscall.SIGTERM)
	go func() {
		received := <-ch
		logx.Eventf("warn", "received %s, saving checkpoint before exit", received.String())
		if saveErr := s.SaveCheckpoint("signal"); saveErr != nil {
			logx.Eventf("error", "failed to save checkpoint on signal: %v", saveErr)
		}
		os.Exit(130)
	}()
}
func writeDataset(outputPath string, dataset model.Dataset) error { func writeDataset(outputPath string, dataset model.Dataset) error {
file, err := os.Create(outputPath) file, err := os.Create(outputPath)
if err != nil { if err != nil {

View File

@@ -36,6 +36,7 @@ var (
"status": {emoji: "🌀", label: "STATUS", color: colorYellow}, "status": {emoji: "🌀", label: "STATUS", color: colorYellow},
"done": {emoji: "✅", label: "DONE", color: colorGreen}, "done": {emoji: "✅", label: "DONE", color: colorGreen},
"write": {emoji: "💾", label: "WRITE", color: colorBlue}, "write": {emoji: "💾", label: "WRITE", color: colorBlue},
"cache": {emoji: "🗂️", label: "CACHE", color: colorCyan},
"skip": {emoji: "⏭️", label: "SKIP", color: colorGray}, "skip": {emoji: "⏭️", label: "SKIP", color: colorGray},
"warn": {emoji: "⚠️", label: "WARN", color: colorYellow}, "warn": {emoji: "⚠️", label: "WARN", color: colorYellow},
"error": {emoji: "💥", label: "ERROR", color: colorRed}, "error": {emoji: "💥", label: "ERROR", color: colorRed},

View File

@@ -8,14 +8,23 @@ type Recipe struct {
SourcePage string `json:"source_page,omitempty"` SourcePage string `json:"source_page,omitempty"`
} }
type Table struct {
Title string `json:"title,omitempty"`
Headers []string `json:"headers,omitempty"`
Rows []map[string]string `json:"rows,omitempty"`
RawRows [][]string `json:"raw_rows,omitempty"`
}
type Item struct { type Item struct {
Name string `json:"name"` Name string `json:"name"`
URL string `json:"url"` URL string `json:"url"`
Categories []string `json:"categories,omitempty"` Categories []string `json:"categories,omitempty"`
Infobox map[string]string `json:"infobox,omitempty"` Infobox map[string]string `json:"infobox,omitempty"`
ImageURL string `json:"image_url,omitempty"`
Effects []string `json:"effects,omitempty"` Effects []string `json:"effects,omitempty"`
EffectLinks []string `json:"effect_links,omitempty"` EffectLinks []string `json:"effect_links,omitempty"`
Recipes []Recipe `json:"recipes,omitempty"` Recipes []Recipe `json:"recipes,omitempty"`
Tables []Table `json:"tables,omitempty"`
Description string `json:"description,omitempty"` Description string `json:"description,omitempty"`
} }

View File

@@ -0,0 +1,200 @@
package scraper
import (
"encoding/json"
"errors"
"os"
"path/filepath"
"sort"
"strings"
"time"
"scrappr/internal/logx"
"scrappr/internal/model"
)
const checkpointVersion = 2
type checkpoint struct {
Version int `json:"version"`
SavedAt time.Time `json:"saved_at"`
Reason string `json:"reason"`
Dataset model.Dataset `json:"dataset"`
CompletedURLs []string `json:"completed_urls"`
Stats checkpointStats `json:"stats"`
}
type checkpointStats struct {
Completed int `json:"completed"`
Failed int `json:"failed"`
Retried int `json:"retried"`
}
// SaveCheckpoint atomically persists the current scrape state to
// cfg.CheckpointPath. The snapshot is encoded into a temp file next to the
// final path, flushed to disk, and renamed into place so an interrupted save
// can never leave a truncated checkpoint behind. reason ("timer", "progress",
// "signal", "final") is stored in the file and included in the log line.
func (s *Scraper) SaveCheckpoint(reason string) error {
	state := s.snapshotCheckpoint(reason)
	if err := os.MkdirAll(s.cfg.CacheDir, 0o755); err != nil {
		return err
	}
	tempPath := s.cfg.CheckpointPath + ".tmp"
	file, err := os.Create(tempPath)
	if err != nil {
		return err
	}
	encoder := json.NewEncoder(file)
	encoder.SetIndent("", " ")
	if err := encoder.Encode(state); err != nil {
		file.Close()
		_ = os.Remove(tempPath)
		return err
	}
	// Flush to stable storage before the rename; without this a crash or
	// power loss shortly after the rename can leave an empty file at the
	// final path, defeating the temp-file-plus-rename atomicity.
	if err := file.Sync(); err != nil {
		file.Close()
		_ = os.Remove(tempPath)
		return err
	}
	if err := file.Close(); err != nil {
		_ = os.Remove(tempPath)
		return err
	}
	if err := os.Rename(tempPath, s.cfg.CheckpointPath); err != nil {
		_ = os.Remove(tempPath)
		return err
	}
	logx.Eventf(
		"cache",
		"saved checkpoint (%s) items=%d effects=%d completed=%d -> %s",
		reason,
		len(state.Dataset.Items),
		len(state.Dataset.Effects),
		state.Stats.Completed,
		s.cfg.CheckpointPath,
	)
	return nil
}
// ClearCheckpoint deletes the checkpoint file after a successful run.
// A missing file is treated as success, and the "cleared" log line is only
// emitted when a checkpoint was actually removed (the original logged it
// unconditionally, which was misleading on fresh runs).
func (s *Scraper) ClearCheckpoint() error {
	err := os.Remove(s.cfg.CheckpointPath)
	if errors.Is(err, os.ErrNotExist) {
		// Nothing to clear; stay silent.
		return nil
	}
	if err != nil {
		return err
	}
	logx.Eventf("cache", "cleared checkpoint %s", s.cfg.CheckpointPath)
	return nil
}
// loadCheckpoint restores scraper state from cfg.CheckpointPath if present.
// A missing file or a schema version mismatch is not an error — the scrape
// simply starts fresh. On success the in-memory item/effect maps, the
// completed-URL set, and the counters are replaced wholesale with the
// checkpointed values.
func (s *Scraper) loadCheckpoint() error {
	file, err := os.Open(s.cfg.CheckpointPath)
	if err != nil {
		if errors.Is(err, os.ErrNotExist) {
			// No checkpoint yet; nothing to restore.
			return nil
		}
		return err
	}
	defer file.Close()
	var state checkpoint
	if err := json.NewDecoder(file).Decode(&state); err != nil {
		return err
	}
	if state.Version != checkpointVersion {
		// Incompatible schema: skip it (leave the file in place) rather
		// than risk a bad partial restore.
		logx.Eventf(
			"cache",
			"ignoring checkpoint %s with version=%d expected=%d",
			s.cfg.CheckpointPath,
			state.Version,
			checkpointVersion,
		)
		return nil
	}
	s.mu.Lock()
	defer s.mu.Unlock()
	s.items = make(map[string]*model.Item, len(state.Dataset.Items))
	for i := range state.Dataset.Items {
		// Copy by value so each map entry gets its own address rather
		// than all pointing at a shared loop variable.
		item := state.Dataset.Items[i]
		s.items[item.URL] = &item
	}
	s.effects = make(map[string]*model.Effect, len(state.Dataset.Effects))
	for i := range state.Dataset.Effects {
		effect := state.Dataset.Effects[i]
		s.effects[effect.URL] = &effect
	}
	s.completedURLs = make(map[string]bool, len(state.CompletedURLs))
	for _, rawURL := range state.CompletedURLs {
		if rawURL != "" {
			s.completedURLs[rawURL] = true
		}
	}
	s.completed = state.Stats.Completed
	s.failed = state.Stats.Failed
	s.retried = state.Stats.Retried
	// The in-flight queue is not persisted; resumeQueue re-seeds it.
	s.queued = map[string]bool{}
	s.lastEvent = time.Now()
	logx.Eventf(
		"cache",
		"loaded checkpoint from %s saved=%s items=%d effects=%d completed=%d",
		s.cfg.CheckpointPath,
		state.SavedAt.Format(time.RFC3339),
		len(state.Dataset.Items),
		len(state.Dataset.Effects),
		state.Stats.Completed,
	)
	return nil
}
// snapshotCheckpoint copies the scraper's mutable state, under the lock, into
// a checkpoint value. Items, effects, and completed URLs are sorted so that
// repeated saves of the same state serialize to identical files.
func (s *Scraper) snapshotCheckpoint(reason string) checkpoint {
	s.mu.Lock()
	defer s.mu.Unlock()

	itemList := make([]model.Item, 0, len(s.items))
	for _, it := range s.items {
		itemList = append(itemList, *it)
	}
	sort.Slice(itemList, func(a, b int) bool {
		return strings.ToLower(itemList[a].Name) < strings.ToLower(itemList[b].Name)
	})

	effectList := make([]model.Effect, 0, len(s.effects))
	for _, ef := range s.effects {
		effectList = append(effectList, *ef)
	}
	sort.Slice(effectList, func(a, b int) bool {
		return strings.ToLower(effectList[a].Name) < strings.ToLower(effectList[b].Name)
	})

	doneURLs := make([]string, 0, len(s.completedURLs))
	for u := range s.completedURLs {
		doneURLs = append(doneURLs, u)
	}
	sort.Strings(doneURLs)

	return checkpoint{
		Version: checkpointVersion,
		SavedAt: time.Now(),
		Reason:  reason,
		Dataset: model.Dataset{
			Items:   itemList,
			Effects: effectList,
		},
		CompletedURLs: doneURLs,
		Stats: checkpointStats{
			Completed: s.completed,
			Failed:    s.failed,
			Retried:   s.retried,
		},
	}
}
// CheckpointPath reports the cleaned filesystem path of the checkpoint file.
func (s *Scraper) CheckpointPath() string {
	cleaned := filepath.Clean(s.cfg.CheckpointPath)
	return cleaned
}

View File

@@ -8,9 +8,13 @@ import (
type Config struct { type Config struct {
BaseURL string BaseURL string
AllowedDomain string AllowedDomain string
CacheDir string
CheckpointPath string
MaxDepth int MaxDepth int
MaxRetries int MaxRetries int
MaxQueuedPages int MaxQueuedPages int
AutosaveEvery time.Duration
AutosavePages int
RequestDelay time.Duration RequestDelay time.Duration
RequestJitter time.Duration RequestJitter time.Duration
RequestTimeout time.Duration RequestTimeout time.Duration
@@ -35,9 +39,13 @@ func DefaultConfig() Config {
return Config{ return Config{
BaseURL: baseURL, BaseURL: baseURL,
AllowedDomain: "outward.fandom.com", AllowedDomain: "outward.fandom.com",
CacheDir: ".cache",
CheckpointPath: ".cache/scrape-state.json",
MaxDepth: 3, MaxDepth: 3,
MaxRetries: 2, MaxRetries: 2,
MaxQueuedPages: 1500, MaxQueuedPages: 1500,
AutosaveEvery: 12 * time.Second,
AutosavePages: 20,
RequestDelay: 650 * time.Millisecond, RequestDelay: 650 * time.Millisecond,
RequestJitter: 350 * time.Millisecond, RequestJitter: 350 * time.Millisecond,
RequestTimeout: 8 * time.Second, RequestTimeout: 8 * time.Second,

View File

@@ -1,10 +1,12 @@
package scraper package scraper
import ( import (
"encoding/json"
"math/rand" "math/rand"
"net/url" "net/url"
"path" "path"
"sort" "sort"
"strconv"
"strings" "strings"
"github.com/PuerkitoBio/goquery" "github.com/PuerkitoBio/goquery"
@@ -127,9 +129,6 @@ func (s *Scraper) isLikelyItemPage(pagePath, title string, lcCats []string, doc
if strings.Contains(strings.ToLower(title), "/") { if strings.Contains(strings.ToLower(title), "/") {
return false return false
} }
if doc.Find("aside.portable-infobox").Length() == 0 {
return false
}
for _, category := range lcCats { for _, category := range lcCats {
if strings.Contains(category, "items") || strings.Contains(category, "food") || if strings.Contains(category, "items") || strings.Contains(category, "food") ||
@@ -140,11 +139,21 @@ func (s *Scraper) isLikelyItemPage(pagePath, title string, lcCats []string, doc
} }
} }
infoboxText := strings.ToLower(s.clean(doc.Find("aside.portable-infobox").Text())) if !s.hasItemInfobox(doc) {
return false
}
infoboxText := strings.ToLower(s.clean(s.infoboxText(doc)))
descriptionText := strings.ToLower(s.parseDescription(doc))
return strings.Contains(infoboxText, "item details") || return strings.Contains(infoboxText, "item details") ||
strings.Contains(infoboxText, "consumable details") || strings.Contains(infoboxText, "consumable details") ||
strings.Contains(infoboxText, "equipment details") || strings.Contains(infoboxText, "equipment details") ||
strings.Contains(infoboxText, "weapon") strings.Contains(infoboxText, "weapon") ||
strings.Contains(descriptionText, " is an item in outward") ||
strings.Contains(descriptionText, " is a weapon in outward") ||
strings.Contains(descriptionText, " is a consumable in outward") ||
strings.Contains(descriptionText, " is an ingredient in outward") ||
strings.Contains(descriptionText, " is a shield in outward")
} }
func (s *Scraper) isEffectPage(pagePath, title string, lcCats []string) bool { func (s *Scraper) isEffectPage(pagePath, title string, lcCats []string) bool {
@@ -166,12 +175,14 @@ func (s *Scraper) parseItemPage(doc *goquery.Document, title, pageURL string, ca
Name: title, Name: title,
URL: pageURL, URL: pageURL,
Categories: categories, Categories: categories,
Infobox: s.parsePortableInfobox(doc), Infobox: s.parseInfobox(doc),
ImageURL: s.parseImageURL(doc),
} }
item.Description = s.parseDescription(doc) item.Description = s.parseDescription(doc)
item.Effects, item.EffectLinks = s.parseEffectsSection(doc) item.Effects, item.EffectLinks = s.parseEffectsSection(doc)
item.Recipes = s.parseRecipesFromPage(doc, title) item.Recipes = s.parseRecipesFromPage(doc, title)
item.Tables = s.parseContentTables(doc)
return item return item
} }
@@ -181,12 +192,20 @@ func (s *Scraper) parseEffectPage(doc *goquery.Document, title, pageURL string,
Name: title, Name: title,
URL: pageURL, URL: pageURL,
Categories: categories, Categories: categories,
Infobox: s.parsePortableInfobox(doc), Infobox: s.parseInfobox(doc),
Description: s.parseDescription(doc), Description: s.parseDescription(doc),
} }
} }
func (s *Scraper) parsePortableInfobox(doc *goquery.Document) map[string]string { func (s *Scraper) hasItemInfobox(doc *goquery.Document) bool {
return doc.Find("aside.portable-infobox").Length() > 0 || doc.Find("table.infoboxtable").Length() > 0
}
func (s *Scraper) infoboxText(doc *goquery.Document) string {
return s.clean(doc.Find("aside.portable-infobox, table.infoboxtable").First().Text())
}
func (s *Scraper) parseInfobox(doc *goquery.Document) map[string]string {
out := map[string]string{} out := map[string]string{}
doc.Find("aside.portable-infobox .pi-item").Each(func(_ int, section *goquery.Selection) { doc.Find("aside.portable-infobox .pi-item").Each(func(_ int, section *goquery.Selection) {
@@ -205,6 +224,32 @@ func (s *Scraper) parsePortableInfobox(doc *goquery.Document) map[string]string
} }
}) })
doc.Find("table.infoboxtable tr").Each(func(_ int, row *goquery.Selection) {
cells := row.Find("th, td")
if cells.Length() < 2 {
return
}
label := s.clean(cells.First().Text())
if label == "" {
return
}
var values []string
cells.Slice(1, cells.Length()).Each(func(_ int, cell *goquery.Selection) {
text := s.clean(cell.Text())
if text != "" {
values = append(values, text)
}
})
if len(values) == 0 {
return
}
out[label] = strings.Join(values, " | ")
})
return out return out
} }
@@ -220,6 +265,38 @@ func (s *Scraper) parseCategories(doc *goquery.Document) []string {
} }
}) })
if len(categories) == 0 {
doc.Find("script").EachWithBreak(func(_ int, node *goquery.Selection) bool {
text := node.Text()
index := strings.Index(text, "wgCategories")
if index < 0 {
return true
}
arrayStart := strings.Index(text[index:], "[")
arrayEnd := strings.Index(text[index:], "]")
if arrayStart < 0 || arrayEnd < 0 || arrayEnd <= arrayStart {
return true
}
raw := text[index+arrayStart : index+arrayEnd+1]
var parsed []string
if err := json.Unmarshal([]byte(raw), &parsed); err != nil {
return true
}
for _, category := range parsed {
category = s.clean(category)
if category != "" && !seen[category] {
seen[category] = true
categories = append(categories, category)
}
}
return len(categories) == 0
})
}
if len(categories) == 0 { if len(categories) == 0 {
headerText := s.clean(doc.Find("body").Text()) headerText := s.clean(doc.Find("body").Text())
if idx := strings.Index(headerText, "in:"); idx >= 0 { if idx := strings.Index(headerText, "in:"); idx >= 0 {
@@ -295,6 +372,10 @@ func (s *Scraper) parseRecipesFromPage(doc *goquery.Document, pageTitle string)
var recipes []model.Recipe var recipes []model.Recipe
doc.Find("table").Each(func(_ int, table *goquery.Selection) { doc.Find("table").Each(func(_ int, table *goquery.Selection) {
if table.HasClass("infoboxtable") {
return
}
headerText := strings.ToLower(s.clean(table.Find("tr").First().Text())) headerText := strings.ToLower(s.clean(table.Find("tr").First().Text()))
if !(strings.Contains(headerText, "result") && strings.Contains(headerText, "ingredient")) { if !(strings.Contains(headerText, "result") && strings.Contains(headerText, "ingredient")) {
return return
@@ -354,6 +435,169 @@ func (s *Scraper) parseRecipesFromPage(doc *goquery.Document, pageTitle string)
return recipes return recipes
} }
// parseImageURL extracts the primary item image URL from either infobox
// flavor (portable or legacy table). Fandom lazy-loads images, so the real
// URL typically lives in data-src while src holds a tiny base64 placeholder;
// data-src is therefore checked first, and data: URIs and empty attribute
// values fall through to the next candidate instead of being returned.
func (s *Scraper) parseImageURL(doc *goquery.Document) string {
	image := doc.Find("aside.portable-infobox img, table.infoboxtable img").First()
	if image.Length() == 0 {
		return ""
	}
	for _, attr := range []string{"data-src", "src"} {
		raw, ok := image.Attr(attr)
		if !ok {
			continue
		}
		raw = strings.TrimSpace(raw)
		if raw == "" || strings.HasPrefix(raw, "data:") {
			// Placeholder or empty attribute; try the next candidate.
			continue
		}
		return s.normalizeImageURL(raw)
	}
	return ""
}
// normalizeImageURL trims surrounding whitespace and upgrades
// protocol-relative URLs ("//host/...") to https; everything else is
// returned unchanged.
func (s *Scraper) normalizeImageURL(raw string) string {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return ""
	}
	if strings.HasPrefix(trimmed, "//") {
		return "https:" + trimmed
	}
	return trimmed
}
// parseContentTables captures every data table in the article body as a raw
// grid plus a header-keyed row map, skipping infobox tables and any table
// that nests another table. Tables with fewer than two rows (no data beyond
// the header) or with no non-empty data rows are dropped.
func (s *Scraper) parseContentTables(doc *goquery.Document) []model.Table {
	var tables []model.Table
	doc.Find(".mw-parser-output table").Each(func(_ int, table *goquery.Selection) {
		// Infobox tables are handled by parseInfobox; nested tables would
		// double-count cells, so both are excluded here.
		if table.HasClass("infoboxtable") || table.Find("table").Length() > 0 {
			return
		}
		rows := table.Find("tr")
		if rows.Length() < 2 {
			return
		}
		headerCells := rows.First().Find("th, td")
		if headerCells.Length() == 0 {
			return
		}
		headers := make([]string, 0, headerCells.Length())
		headerKeys := make([]string, 0, headerCells.Length())
		headerCells.Each(func(i int, cell *goquery.Selection) {
			header := s.clean(cell.Text())
			if header == "" {
				// Keep positional labels so headers and cells stay aligned.
				header = "Column " + strconv.Itoa(i+1)
			}
			headers = append(headers, header)
			headerKeys = append(headerKeys, s.tableHeaderKey(header, i))
		})
		parsedTable := model.Table{
			Title:   s.tableTitle(table),
			Headers: headers,
		}
		// Remaining rows (after the header) become data rows.
		rows.Slice(1, rows.Length()).Each(func(_ int, row *goquery.Selection) {
			cells := row.Find("th, td")
			if cells.Length() == 0 {
				return
			}
			rawRow := make([]string, 0, cells.Length())
			rowData := map[string]string{}
			cells.Each(func(i int, cell *goquery.Selection) {
				text := s.clean(cell.Text())
				// RawRows keeps empty cells for positional fidelity;
				// Rows only keeps non-empty values.
				rawRow = append(rawRow, text)
				key := s.tableColumnKey(headerKeys, headers, i)
				if text != "" {
					rowData[key] = text
				}
			})
			if s.rowIsEmpty(rawRow) {
				return
			}
			parsedTable.RawRows = append(parsedTable.RawRows, rawRow)
			if len(rowData) > 0 {
				parsedTable.Rows = append(parsedTable.Rows, rowData)
			}
		})
		if len(parsedTable.RawRows) == 0 {
			return
		}
		tables = append(tables, parsedTable)
	})
	return tables
}
// tableTitle derives a section path like "Crafting / Recipes" for a table by
// walking its preceding siblings: h3/h4 headings accumulate (outermost
// first), and the first h2 encountered both joins the path and ends the walk.
func (s *Scraper) tableTitle(table *goquery.Selection) string {
	var parts []string
	for prev := table.Prev(); prev.Length() > 0; prev = prev.Prev() {
		tag := goquery.NodeName(prev)
		if tag != "h2" && tag != "h3" && tag != "h4" {
			continue
		}
		// Strip the trailing "[]" left behind by removed edit-section links.
		text := strings.TrimSpace(strings.TrimSuffix(s.clean(prev.Text()), "[]"))
		if text != "" {
			parts = append([]string{text}, parts...)
		}
		if tag == "h2" {
			break
		}
	}
	return strings.Join(parts, " / ")
}
// tableHeaderKey converts a header caption into a snake_case map key,
// falling back to a positional key when nothing printable remains.
func (s *Scraper) tableHeaderKey(header string, index int) string {
	trimmed := strings.TrimSpace(header)
	if trimmed == "" {
		return s.fallbackColumnKey(index)
	}
	key := strings.ToLower(trimmed)
	for _, sep := range []string{" ", "/", "-"} {
		key = strings.ReplaceAll(key, sep, "_")
	}
	key = s.cfg.WhitespaceRe.ReplaceAllString(key, "_")
	key = strings.Trim(key, "_")
	if key == "" {
		return s.fallbackColumnKey(index)
	}
	return key
}
// tableColumnKey resolves the map key for the cell at index: the normalized
// header key if present, else the raw header, else a positional fallback.
func (s *Scraper) tableColumnKey(headerKeys, headers []string, index int) string {
	switch {
	case index < len(headerKeys) && headerKeys[index] != "":
		return headerKeys[index]
	case index < len(headers) && headers[index] != "":
		return headers[index]
	default:
		return s.fallbackColumnKey(index)
	}
}
// fallbackColumnKey names an unlabeled column by its 1-based position.
func (s *Scraper) fallbackColumnKey(index int) string {
	position := index + 1
	return "column_" + strconv.Itoa(position)
}
// rowIsEmpty reports whether every cell in the row is the empty string.
func (s *Scraper) rowIsEmpty(row []string) bool {
	for i := range row {
		if row[i] != "" {
			return false
		}
	}
	return true
}
func (s *Scraper) findSection(doc *goquery.Document, title string) *goquery.Selection { func (s *Scraper) findSection(doc *goquery.Document, title string) *goquery.Selection {
var found *goquery.Selection var found *goquery.Selection

View File

@@ -18,31 +18,37 @@ type Scraper struct {
cfg Config cfg Config
collector *colly.Collector collector *colly.Collector
mu sync.Mutex mu sync.Mutex
items map[string]*model.Item items map[string]*model.Item
effects map[string]*model.Effect effects map[string]*model.Effect
queued map[string]bool queued map[string]bool
completed int completedURLs map[string]bool
failed int completed int
retried int failed int
requestSeq int retried int
spinnerIndex int requestSeq int
activeURL string spinnerIndex int
activeSince time.Time activeURL string
lastEvent time.Time activeSince time.Time
lastEvent time.Time
} }
func New(cfg Config) *Scraper { func New(cfg Config) *Scraper {
return &Scraper{ return &Scraper{
cfg: cfg, cfg: cfg,
items: map[string]*model.Item{}, items: map[string]*model.Item{},
effects: map[string]*model.Effect{}, effects: map[string]*model.Effect{},
queued: map[string]bool{}, queued: map[string]bool{},
lastEvent: time.Now(), completedURLs: map[string]bool{},
lastEvent: time.Now(),
} }
} }
func (s *Scraper) Run() (model.Dataset, error) { func (s *Scraper) Run() (model.Dataset, error) {
if err := s.loadCheckpoint(); err != nil {
return model.Dataset{}, err
}
s.collector = colly.NewCollector( s.collector = colly.NewCollector(
colly.AllowedDomains(s.cfg.AllowedDomain), colly.AllowedDomains(s.cfg.AllowedDomain),
colly.MaxDepth(s.cfg.MaxDepth), colly.MaxDepth(s.cfg.MaxDepth),
@@ -66,13 +72,16 @@ func (s *Scraper) Run() (model.Dataset, error) {
done := make(chan struct{}) done := make(chan struct{})
defer close(done) defer close(done)
s.startStatusLoop(done) s.startStatusLoop(done)
s.startAutosaveLoop(done)
for _, seed := range append(append([]string{}, s.cfg.ItemSeeds...), s.cfg.CraftingSeeds...) { s.resumeQueue()
s.queueVisit("seed", seed)
}
s.collector.Wait() s.collector.Wait()
if err := s.SaveCheckpoint("final"); err != nil {
return model.Dataset{}, err
}
return model.Dataset{ return model.Dataset{
Items: s.flattenItems(), Items: s.flattenItems(),
Effects: s.flattenEffects(), Effects: s.flattenEffects(),
@@ -225,6 +234,9 @@ func (s *Scraper) registerHandlers() {
s.collector.OnScraped(func(r *colly.Response) { s.collector.OnScraped(func(r *colly.Response) {
s.mu.Lock() s.mu.Lock()
s.completed++ s.completed++
if r != nil && r.Request != nil && r.StatusCode < 400 {
s.completedURLs[r.Request.URL.String()] = true
}
s.activeURL = "" s.activeURL = ""
s.activeSince = time.Time{} s.activeSince = time.Time{}
s.lastEvent = time.Now() s.lastEvent = time.Now()
@@ -233,6 +245,12 @@ func (s *Scraper) registerHandlers() {
s.mu.Unlock() s.mu.Unlock()
logx.Eventf("done", "#%s total=%d queued=%d url=%s", r.Request.Ctx.Get("request_id"), doneCount, queueLen, r.Request.URL.String()) logx.Eventf("done", "#%s total=%d queued=%d url=%s", r.Request.Ctx.Get("request_id"), doneCount, queueLen, r.Request.URL.String())
if s.cfg.AutosavePages > 0 && doneCount%s.cfg.AutosavePages == 0 {
if err := s.SaveCheckpoint("progress"); err != nil {
logx.Eventf("warn", "autosave failed after %d pages: %v", doneCount, err)
}
}
}) })
s.collector.OnHTML(".mw-parser-output table a[href]", func(e *colly.HTMLElement) { s.collector.OnHTML(".mw-parser-output table a[href]", func(e *colly.HTMLElement) {
@@ -301,6 +319,28 @@ func (s *Scraper) startStatusLoop(done <-chan struct{}) {
}() }()
} }
// startAutosaveLoop saves a "timer" checkpoint every cfg.AutosaveEvery until
// done is closed. A zero or negative interval disables the loop entirely.
func (s *Scraper) startAutosaveLoop(done <-chan struct{}) {
	interval := s.cfg.AutosaveEvery
	if interval <= 0 {
		return
	}
	go func() {
		ticker := time.NewTicker(interval)
		defer ticker.Stop()
		for {
			select {
			case <-ticker.C:
				if err := s.SaveCheckpoint("timer"); err != nil {
					logx.Eventf("warn", "autosave failed: %v", err)
				}
			case <-done:
				return
			}
		}
	}()
}
func (s *Scraper) queueVisit(fromURL, toURL string) { func (s *Scraper) queueVisit(fromURL, toURL string) {
if toURL == "" { if toURL == "" {
return return
@@ -311,6 +351,9 @@ func (s *Scraper) queueVisit(fromURL, toURL string) {
case s.queued[toURL]: case s.queued[toURL]:
s.mu.Unlock() s.mu.Unlock()
return return
case s.completedURLs[toURL]:
s.mu.Unlock()
return
case len(s.queued) >= s.cfg.MaxQueuedPages: case len(s.queued) >= s.cfg.MaxQueuedPages:
s.mu.Unlock() s.mu.Unlock()
logx.Eventf("skip", "queue budget reached from=%s to=%s", s.debugURLName(fromURL), toURL) logx.Eventf("skip", "queue budget reached from=%s to=%s", s.debugURLName(fromURL), toURL)
@@ -340,6 +383,52 @@ func (s *Scraper) spinnerFrame() string {
return frame return frame
} }
// resumeQueue re-primes the crawl after a checkpoint load: seeds are always
// re-queued (via queueSeed, which bypasses the completed-URL filter so
// category pages get re-scanned for new links), and effect links from
// already-scraped items are re-queued so effect pages that were pending when
// the run was interrupted still get visited.
func (s *Scraper) resumeQueue() {
	seeds := append(append([]string{}, s.cfg.ItemSeeds...), s.cfg.CraftingSeeds...)
	for _, seed := range seeds {
		s.queueSeed(seed)
	}

	// Snapshot items under the lock; queueVisit takes the lock itself, so
	// it must be called outside the critical section.
	s.mu.Lock()
	snapshot := make([]model.Item, 0, len(s.items))
	for _, item := range s.items {
		snapshot = append(snapshot, *item)
	}
	s.mu.Unlock()

	for _, item := range snapshot {
		for _, rawLink := range item.EffectLinks {
			if absolute := s.absoluteWikiURL(rawLink); absolute != "" {
				s.queueVisit(item.URL, absolute)
			}
		}
	}
}
// queueSeed enqueues a seed URL directly with the collector. Unlike
// queueVisit it does not consult completedURLs, so seeds are revisited on
// every (re)start to rediscover outgoing links.
func (s *Scraper) queueSeed(toURL string) {
	if toURL == "" {
		return
	}
	s.mu.Lock()
	alreadyQueued := s.queued[toURL]
	if !alreadyQueued {
		s.queued[toURL] = true
	}
	queueLen := len(s.queued)
	s.mu.Unlock()
	if alreadyQueued {
		return
	}
	ctx := colly.NewContext()
	ctx.Put("from_url", "seed")
	logx.Eventf("queue", "%d from=%s to=%s", queueLen, "seed", toURL)
	if err := s.collector.Request("GET", toURL, nil, ctx, nil); err != nil {
		logx.Eventf("warn", "queue failed from=%s to=%s: %v", "seed", toURL, err)
	}
}
func (s *Scraper) shouldRetry(statusCode int) bool { func (s *Scraper) shouldRetry(statusCode int) bool {
return statusCode == 0 || statusCode == 408 || statusCode == 425 || statusCode == 429 || statusCode >= 500 return statusCode == 0 || statusCode == 408 || statusCode == 425 || statusCode == 429 || statusCode >= 500
} }