feat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from interruptions instead of restarting from scratch.
- Introduce an autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- Expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- Skip infobox tables during recipe parsing to avoid false recipe matches
- Add a cache log event type, ignore cache/output artifacts, and document the new autosave tuning options in the README
This commit is contained in:
4
.gitignore
vendored
4
.gitignore
vendored
@@ -32,3 +32,7 @@ go.work
|
|||||||
.nfs*
|
.nfs*
|
||||||
|
|
||||||
# End of https://www.toptal.com/developers/gitignore/api/go,linux
|
# End of https://www.toptal.com/developers/gitignore/api/go,linux
|
||||||
|
|
||||||
|
.cache/
|
||||||
|
|
||||||
|
outward_data.json
|
||||||
@@ -29,6 +29,8 @@ go run ./cmd/scrappr
|
|||||||
- Limits crawl depth and queue size to avoid drifting into junk pages
|
- Limits crawl depth and queue size to avoid drifting into junk pages
|
||||||
- Retries temporary failures with short backoff
|
- Retries temporary failures with short backoff
|
||||||
- Prints colored emoji logs for queueing, requests, responses, parsing, retries, and periodic status
|
- Prints colored emoji logs for queueing, requests, responses, parsing, retries, and periodic status
|
||||||
|
- Stores legacy and portable infobox fields, primary item image URLs, recipes, effects, and raw content tables for later processing
|
||||||
|
- Saves resumable checkpoints into `.cache/scrape-state.json` on a timer, during progress milestones, and on `Ctrl+C`
|
||||||
- Writes a stable, sorted JSON dataset to `outward_data.json`
|
- Writes a stable, sorted JSON dataset to `outward_data.json`
|
||||||
|
|
||||||
## Tuning
|
## Tuning
|
||||||
@@ -37,4 +39,4 @@ Scraper defaults live in `internal/scraper/config.go`.
|
|||||||
|
|
||||||
- Lower or raise `RequestDelay` / `RequestJitter`
|
- Lower or raise `RequestDelay` / `RequestJitter`
|
||||||
- Tighten or relax `MaxQueuedPages`
|
- Tighten or relax `MaxQueuedPages`
|
||||||
- Adjust `RequestTimeout`, `MaxRetries`, and `ProgressEvery`
|
- Adjust `RequestTimeout`, `MaxRetries`, `ProgressEvery`, `AutosaveEvery`, and `AutosavePages`
|
||||||
|
|||||||
@@ -4,6 +4,8 @@ import (
|
|||||||
"encoding/json"
|
"encoding/json"
|
||||||
"math/rand"
|
"math/rand"
|
||||||
"os"
|
"os"
|
||||||
|
"os/signal"
|
||||||
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"scrappr/internal/logx"
|
"scrappr/internal/logx"
|
||||||
@@ -20,6 +22,7 @@ func Run() error {
|
|||||||
|
|
||||||
cfg := scraper.DefaultConfig()
|
cfg := scraper.DefaultConfig()
|
||||||
s := scraper.New(cfg)
|
s := scraper.New(cfg)
|
||||||
|
installSignalCheckpoint(s)
|
||||||
|
|
||||||
dataset, err := s.Run()
|
dataset, err := s.Run()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -39,9 +42,27 @@ func Run() error {
|
|||||||
outputPath,
|
outputPath,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if err := s.ClearCheckpoint(); err != nil {
|
||||||
|
logx.Eventf("warn", "failed to clear checkpoint: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func installSignalCheckpoint(s *scraper.Scraper) {
|
||||||
|
signals := make(chan os.Signal, 1)
|
||||||
|
signal.Notify(signals, os.Interrupt, syscall.SIGTERM)
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
sig := <-signals
|
||||||
|
logx.Eventf("warn", "received %s, saving checkpoint before exit", sig.String())
|
||||||
|
if err := s.SaveCheckpoint("signal"); err != nil {
|
||||||
|
logx.Eventf("error", "failed to save checkpoint on signal: %v", err)
|
||||||
|
}
|
||||||
|
os.Exit(130)
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
func writeDataset(outputPath string, dataset model.Dataset) error {
|
func writeDataset(outputPath string, dataset model.Dataset) error {
|
||||||
file, err := os.Create(outputPath)
|
file, err := os.Create(outputPath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@@ -36,6 +36,7 @@ var (
|
|||||||
"status": {emoji: "🌀", label: "STATUS", color: colorYellow},
|
"status": {emoji: "🌀", label: "STATUS", color: colorYellow},
|
||||||
"done": {emoji: "✅", label: "DONE", color: colorGreen},
|
"done": {emoji: "✅", label: "DONE", color: colorGreen},
|
||||||
"write": {emoji: "💾", label: "WRITE", color: colorBlue},
|
"write": {emoji: "💾", label: "WRITE", color: colorBlue},
|
||||||
|
"cache": {emoji: "🗂️", label: "CACHE", color: colorCyan},
|
||||||
"skip": {emoji: "⏭️", label: "SKIP", color: colorGray},
|
"skip": {emoji: "⏭️", label: "SKIP", color: colorGray},
|
||||||
"warn": {emoji: "⚠️", label: "WARN", color: colorYellow},
|
"warn": {emoji: "⚠️", label: "WARN", color: colorYellow},
|
||||||
"error": {emoji: "💥", label: "ERROR", color: colorRed},
|
"error": {emoji: "💥", label: "ERROR", color: colorRed},
|
||||||
|
|||||||
@@ -8,14 +8,23 @@ type Recipe struct {
|
|||||||
SourcePage string `json:"source_page,omitempty"`
|
SourcePage string `json:"source_page,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Table is a content table captured from a page, stored both as keyed rows
// and as raw cell text so later processing can choose either form.
type Table struct {
	// Title is a heading-derived label for the table; omitted when empty.
	Title string `json:"title,omitempty"`
	// Headers holds the header cell texts in column order.
	Headers []string `json:"headers,omitempty"`
	// Rows holds one map per data row, keyed by column key.
	Rows []map[string]string `json:"rows,omitempty"`
	// RawRows holds the cell texts of each data row in column order.
	RawRows [][]string `json:"raw_rows,omitempty"`
}
|
||||||
|
|
||||||
type Item struct {
|
type Item struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
URL string `json:"url"`
|
URL string `json:"url"`
|
||||||
Categories []string `json:"categories,omitempty"`
|
Categories []string `json:"categories,omitempty"`
|
||||||
Infobox map[string]string `json:"infobox,omitempty"`
|
Infobox map[string]string `json:"infobox,omitempty"`
|
||||||
|
ImageURL string `json:"image_url,omitempty"`
|
||||||
Effects []string `json:"effects,omitempty"`
|
Effects []string `json:"effects,omitempty"`
|
||||||
EffectLinks []string `json:"effect_links,omitempty"`
|
EffectLinks []string `json:"effect_links,omitempty"`
|
||||||
Recipes []Recipe `json:"recipes,omitempty"`
|
Recipes []Recipe `json:"recipes,omitempty"`
|
||||||
|
Tables []Table `json:"tables,omitempty"`
|
||||||
Description string `json:"description,omitempty"`
|
Description string `json:"description,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
200
internal/scraper/checkpoint.go
Normal file
200
internal/scraper/checkpoint.go
Normal file
@@ -0,0 +1,200 @@
|
|||||||
|
package scraper
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"scrappr/internal/logx"
|
||||||
|
"scrappr/internal/model"
|
||||||
|
)
|
||||||
|
|
||||||
|
const checkpointVersion = 2
|
||||||
|
|
||||||
|
// checkpoint is the JSON document written to the checkpoint file so an
// interrupted scrape can be resumed.
type checkpoint struct {
	// Version is compared with checkpointVersion on load; a mismatching file is ignored.
	Version int `json:"version"`
	// SavedAt is the time at which the snapshot was taken.
	SavedAt time.Time `json:"saved_at"`
	// Reason is the caller-supplied label for why the save happened.
	Reason string `json:"reason"`
	// Dataset holds the items and effects collected so far.
	Dataset model.Dataset `json:"dataset"`
	// CompletedURLs lists the URLs recorded as completed.
	CompletedURLs []string `json:"completed_urls"`
	// Stats carries the progress counters at save time.
	Stats checkpointStats `json:"stats"`
}
|
||||||
|
|
||||||
|
// checkpointStats is the persisted copy of the scraper's progress counters.
type checkpointStats struct {
	// Completed mirrors the scraper's completed counter.
	Completed int `json:"completed"`
	// Failed mirrors the scraper's failed counter.
	Failed int `json:"failed"`
	// Retried mirrors the scraper's retried counter.
	Retried int `json:"retried"`
}
|
||||||
|
|
||||||
|
func (s *Scraper) SaveCheckpoint(reason string) error {
|
||||||
|
state := s.snapshotCheckpoint(reason)
|
||||||
|
|
||||||
|
if err := os.MkdirAll(s.cfg.CacheDir, 0o755); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
tempPath := s.cfg.CheckpointPath + ".tmp"
|
||||||
|
file, err := os.Create(tempPath)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
encoder := json.NewEncoder(file)
|
||||||
|
encoder.SetIndent("", " ")
|
||||||
|
if err := encoder.Encode(state); err != nil {
|
||||||
|
file.Close()
|
||||||
|
_ = os.Remove(tempPath)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := file.Close(); err != nil {
|
||||||
|
_ = os.Remove(tempPath)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := os.Rename(tempPath, s.cfg.CheckpointPath); err != nil {
|
||||||
|
_ = os.Remove(tempPath)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
logx.Eventf(
|
||||||
|
"cache",
|
||||||
|
"saved checkpoint (%s) items=%d effects=%d completed=%d -> %s",
|
||||||
|
reason,
|
||||||
|
len(state.Dataset.Items),
|
||||||
|
len(state.Dataset.Effects),
|
||||||
|
state.Stats.Completed,
|
||||||
|
s.cfg.CheckpointPath,
|
||||||
|
)
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Scraper) ClearCheckpoint() error {
|
||||||
|
err := os.Remove(s.cfg.CheckpointPath)
|
||||||
|
if err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
logx.Eventf("cache", "cleared checkpoint %s", s.cfg.CheckpointPath)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Scraper) loadCheckpoint() error {
|
||||||
|
file, err := os.Open(s.cfg.CheckpointPath)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, os.ErrNotExist) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer file.Close()
|
||||||
|
|
||||||
|
var state checkpoint
|
||||||
|
if err := json.NewDecoder(file).Decode(&state); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if state.Version != checkpointVersion {
|
||||||
|
logx.Eventf(
|
||||||
|
"cache",
|
||||||
|
"ignoring checkpoint %s with version=%d expected=%d",
|
||||||
|
s.cfg.CheckpointPath,
|
||||||
|
state.Version,
|
||||||
|
checkpointVersion,
|
||||||
|
)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
s.mu.Lock()
|
||||||
|
defer s.mu.Unlock()
|
||||||
|
|
||||||
|
s.items = make(map[string]*model.Item, len(state.Dataset.Items))
|
||||||
|
for i := range state.Dataset.Items {
|
||||||
|
item := state.Dataset.Items[i]
|
||||||
|
s.items[item.URL] = &item
|
||||||
|
}
|
||||||
|
|
||||||
|
s.effects = make(map[string]*model.Effect, len(state.Dataset.Effects))
|
||||||
|
for i := range state.Dataset.Effects {
|
||||||
|
effect := state.Dataset.Effects[i]
|
||||||
|
s.effects[effect.URL] = &effect
|
||||||
|
}
|
||||||
|
|
||||||
|
s.completedURLs = make(map[string]bool, len(state.CompletedURLs))
|
||||||
|
for _, rawURL := range state.CompletedURLs {
|
||||||
|
if rawURL != "" {
|
||||||
|
s.completedURLs[rawURL] = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
s.completed = state.Stats.Completed
|
||||||
|
s.failed = state.Stats.Failed
|
||||||
|
s.retried = state.Stats.Retried
|
||||||
|
s.queued = map[string]bool{}
|
||||||
|
s.lastEvent = time.Now()
|
||||||
|
|
||||||
|
logx.Eventf(
|
||||||
|
"cache",
|
||||||
|
"loaded checkpoint from %s saved=%s items=%d effects=%d completed=%d",
|
||||||
|
s.cfg.CheckpointPath,
|
||||||
|
state.SavedAt.Format(time.RFC3339),
|
||||||
|
len(state.Dataset.Items),
|
||||||
|
len(state.Dataset.Effects),
|
||||||
|
state.Stats.Completed,
|
||||||
|
)
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Scraper) snapshotCheckpoint(reason string) checkpoint {
|
||||||
|
s.mu.Lock()
|
||||||
|
defer s.mu.Unlock()
|
||||||
|
|
||||||
|
items := make([]model.Item, 0, len(s.items))
|
||||||
|
for _, item := range s.items {
|
||||||
|
items = append(items, *item)
|
||||||
|
}
|
||||||
|
|
||||||
|
effects := make([]model.Effect, 0, len(s.effects))
|
||||||
|
for _, effect := range s.effects {
|
||||||
|
effects = append(effects, *effect)
|
||||||
|
}
|
||||||
|
|
||||||
|
completedURLs := make([]string, 0, len(s.completedURLs))
|
||||||
|
for rawURL := range s.completedURLs {
|
||||||
|
completedURLs = append(completedURLs, rawURL)
|
||||||
|
}
|
||||||
|
|
||||||
|
sort.Slice(items, func(i, j int) bool {
|
||||||
|
return strings.ToLower(items[i].Name) < strings.ToLower(items[j].Name)
|
||||||
|
})
|
||||||
|
sort.Slice(effects, func(i, j int) bool {
|
||||||
|
return strings.ToLower(effects[i].Name) < strings.ToLower(effects[j].Name)
|
||||||
|
})
|
||||||
|
sort.Strings(completedURLs)
|
||||||
|
|
||||||
|
return checkpoint{
|
||||||
|
Version: checkpointVersion,
|
||||||
|
SavedAt: time.Now(),
|
||||||
|
Reason: reason,
|
||||||
|
Dataset: model.Dataset{
|
||||||
|
Items: items,
|
||||||
|
Effects: effects,
|
||||||
|
},
|
||||||
|
CompletedURLs: completedURLs,
|
||||||
|
Stats: checkpointStats{
|
||||||
|
Completed: s.completed,
|
||||||
|
Failed: s.failed,
|
||||||
|
Retried: s.retried,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Scraper) CheckpointPath() string {
|
||||||
|
return filepath.Clean(s.cfg.CheckpointPath)
|
||||||
|
}
|
||||||
@@ -8,9 +8,13 @@ import (
|
|||||||
type Config struct {
|
type Config struct {
|
||||||
BaseURL string
|
BaseURL string
|
||||||
AllowedDomain string
|
AllowedDomain string
|
||||||
|
CacheDir string
|
||||||
|
CheckpointPath string
|
||||||
MaxDepth int
|
MaxDepth int
|
||||||
MaxRetries int
|
MaxRetries int
|
||||||
MaxQueuedPages int
|
MaxQueuedPages int
|
||||||
|
AutosaveEvery time.Duration
|
||||||
|
AutosavePages int
|
||||||
RequestDelay time.Duration
|
RequestDelay time.Duration
|
||||||
RequestJitter time.Duration
|
RequestJitter time.Duration
|
||||||
RequestTimeout time.Duration
|
RequestTimeout time.Duration
|
||||||
@@ -35,9 +39,13 @@ func DefaultConfig() Config {
|
|||||||
return Config{
|
return Config{
|
||||||
BaseURL: baseURL,
|
BaseURL: baseURL,
|
||||||
AllowedDomain: "outward.fandom.com",
|
AllowedDomain: "outward.fandom.com",
|
||||||
|
CacheDir: ".cache",
|
||||||
|
CheckpointPath: ".cache/scrape-state.json",
|
||||||
MaxDepth: 3,
|
MaxDepth: 3,
|
||||||
MaxRetries: 2,
|
MaxRetries: 2,
|
||||||
MaxQueuedPages: 1500,
|
MaxQueuedPages: 1500,
|
||||||
|
AutosaveEvery: 12 * time.Second,
|
||||||
|
AutosavePages: 20,
|
||||||
RequestDelay: 650 * time.Millisecond,
|
RequestDelay: 650 * time.Millisecond,
|
||||||
RequestJitter: 350 * time.Millisecond,
|
RequestJitter: 350 * time.Millisecond,
|
||||||
RequestTimeout: 8 * time.Second,
|
RequestTimeout: 8 * time.Second,
|
||||||
|
|||||||
@@ -1,10 +1,12 @@
|
|||||||
package scraper
|
package scraper
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
"math/rand"
|
"math/rand"
|
||||||
"net/url"
|
"net/url"
|
||||||
"path"
|
"path"
|
||||||
"sort"
|
"sort"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
@@ -127,9 +129,6 @@ func (s *Scraper) isLikelyItemPage(pagePath, title string, lcCats []string, doc
|
|||||||
if strings.Contains(strings.ToLower(title), "/") {
|
if strings.Contains(strings.ToLower(title), "/") {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
if doc.Find("aside.portable-infobox").Length() == 0 {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, category := range lcCats {
|
for _, category := range lcCats {
|
||||||
if strings.Contains(category, "items") || strings.Contains(category, "food") ||
|
if strings.Contains(category, "items") || strings.Contains(category, "food") ||
|
||||||
@@ -140,11 +139,21 @@ func (s *Scraper) isLikelyItemPage(pagePath, title string, lcCats []string, doc
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
infoboxText := strings.ToLower(s.clean(doc.Find("aside.portable-infobox").Text()))
|
if !s.hasItemInfobox(doc) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
infoboxText := strings.ToLower(s.clean(s.infoboxText(doc)))
|
||||||
|
descriptionText := strings.ToLower(s.parseDescription(doc))
|
||||||
return strings.Contains(infoboxText, "item details") ||
|
return strings.Contains(infoboxText, "item details") ||
|
||||||
strings.Contains(infoboxText, "consumable details") ||
|
strings.Contains(infoboxText, "consumable details") ||
|
||||||
strings.Contains(infoboxText, "equipment details") ||
|
strings.Contains(infoboxText, "equipment details") ||
|
||||||
strings.Contains(infoboxText, "weapon")
|
strings.Contains(infoboxText, "weapon") ||
|
||||||
|
strings.Contains(descriptionText, " is an item in outward") ||
|
||||||
|
strings.Contains(descriptionText, " is a weapon in outward") ||
|
||||||
|
strings.Contains(descriptionText, " is a consumable in outward") ||
|
||||||
|
strings.Contains(descriptionText, " is an ingredient in outward") ||
|
||||||
|
strings.Contains(descriptionText, " is a shield in outward")
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Scraper) isEffectPage(pagePath, title string, lcCats []string) bool {
|
func (s *Scraper) isEffectPage(pagePath, title string, lcCats []string) bool {
|
||||||
@@ -166,12 +175,14 @@ func (s *Scraper) parseItemPage(doc *goquery.Document, title, pageURL string, ca
|
|||||||
Name: title,
|
Name: title,
|
||||||
URL: pageURL,
|
URL: pageURL,
|
||||||
Categories: categories,
|
Categories: categories,
|
||||||
Infobox: s.parsePortableInfobox(doc),
|
Infobox: s.parseInfobox(doc),
|
||||||
|
ImageURL: s.parseImageURL(doc),
|
||||||
}
|
}
|
||||||
|
|
||||||
item.Description = s.parseDescription(doc)
|
item.Description = s.parseDescription(doc)
|
||||||
item.Effects, item.EffectLinks = s.parseEffectsSection(doc)
|
item.Effects, item.EffectLinks = s.parseEffectsSection(doc)
|
||||||
item.Recipes = s.parseRecipesFromPage(doc, title)
|
item.Recipes = s.parseRecipesFromPage(doc, title)
|
||||||
|
item.Tables = s.parseContentTables(doc)
|
||||||
|
|
||||||
return item
|
return item
|
||||||
}
|
}
|
||||||
@@ -181,12 +192,20 @@ func (s *Scraper) parseEffectPage(doc *goquery.Document, title, pageURL string,
|
|||||||
Name: title,
|
Name: title,
|
||||||
URL: pageURL,
|
URL: pageURL,
|
||||||
Categories: categories,
|
Categories: categories,
|
||||||
Infobox: s.parsePortableInfobox(doc),
|
Infobox: s.parseInfobox(doc),
|
||||||
Description: s.parseDescription(doc),
|
Description: s.parseDescription(doc),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Scraper) parsePortableInfobox(doc *goquery.Document) map[string]string {
|
func (s *Scraper) hasItemInfobox(doc *goquery.Document) bool {
|
||||||
|
return doc.Find("aside.portable-infobox").Length() > 0 || doc.Find("table.infoboxtable").Length() > 0
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Scraper) infoboxText(doc *goquery.Document) string {
|
||||||
|
return s.clean(doc.Find("aside.portable-infobox, table.infoboxtable").First().Text())
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Scraper) parseInfobox(doc *goquery.Document) map[string]string {
|
||||||
out := map[string]string{}
|
out := map[string]string{}
|
||||||
|
|
||||||
doc.Find("aside.portable-infobox .pi-item").Each(func(_ int, section *goquery.Selection) {
|
doc.Find("aside.portable-infobox .pi-item").Each(func(_ int, section *goquery.Selection) {
|
||||||
@@ -205,6 +224,32 @@ func (s *Scraper) parsePortableInfobox(doc *goquery.Document) map[string]string
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
|
doc.Find("table.infoboxtable tr").Each(func(_ int, row *goquery.Selection) {
|
||||||
|
cells := row.Find("th, td")
|
||||||
|
if cells.Length() < 2 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
label := s.clean(cells.First().Text())
|
||||||
|
if label == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var values []string
|
||||||
|
cells.Slice(1, cells.Length()).Each(func(_ int, cell *goquery.Selection) {
|
||||||
|
text := s.clean(cell.Text())
|
||||||
|
if text != "" {
|
||||||
|
values = append(values, text)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
if len(values) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
out[label] = strings.Join(values, " | ")
|
||||||
|
})
|
||||||
|
|
||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -220,6 +265,38 @@ func (s *Scraper) parseCategories(doc *goquery.Document) []string {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
|
if len(categories) == 0 {
|
||||||
|
doc.Find("script").EachWithBreak(func(_ int, node *goquery.Selection) bool {
|
||||||
|
text := node.Text()
|
||||||
|
index := strings.Index(text, "wgCategories")
|
||||||
|
if index < 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
arrayStart := strings.Index(text[index:], "[")
|
||||||
|
arrayEnd := strings.Index(text[index:], "]")
|
||||||
|
if arrayStart < 0 || arrayEnd < 0 || arrayEnd <= arrayStart {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
raw := text[index+arrayStart : index+arrayEnd+1]
|
||||||
|
var parsed []string
|
||||||
|
if err := json.Unmarshal([]byte(raw), &parsed); err != nil {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, category := range parsed {
|
||||||
|
category = s.clean(category)
|
||||||
|
if category != "" && !seen[category] {
|
||||||
|
seen[category] = true
|
||||||
|
categories = append(categories, category)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return len(categories) == 0
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
if len(categories) == 0 {
|
if len(categories) == 0 {
|
||||||
headerText := s.clean(doc.Find("body").Text())
|
headerText := s.clean(doc.Find("body").Text())
|
||||||
if idx := strings.Index(headerText, "in:"); idx >= 0 {
|
if idx := strings.Index(headerText, "in:"); idx >= 0 {
|
||||||
@@ -295,6 +372,10 @@ func (s *Scraper) parseRecipesFromPage(doc *goquery.Document, pageTitle string)
|
|||||||
var recipes []model.Recipe
|
var recipes []model.Recipe
|
||||||
|
|
||||||
doc.Find("table").Each(func(_ int, table *goquery.Selection) {
|
doc.Find("table").Each(func(_ int, table *goquery.Selection) {
|
||||||
|
if table.HasClass("infoboxtable") {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
headerText := strings.ToLower(s.clean(table.Find("tr").First().Text()))
|
headerText := strings.ToLower(s.clean(table.Find("tr").First().Text()))
|
||||||
if !(strings.Contains(headerText, "result") && strings.Contains(headerText, "ingredient")) {
|
if !(strings.Contains(headerText, "result") && strings.Contains(headerText, "ingredient")) {
|
||||||
return
|
return
|
||||||
@@ -354,6 +435,169 @@ func (s *Scraper) parseRecipesFromPage(doc *goquery.Document, pageTitle string)
|
|||||||
return recipes
|
return recipes
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *Scraper) parseImageURL(doc *goquery.Document) string {
|
||||||
|
image := doc.Find("aside.portable-infobox img, table.infoboxtable img").First()
|
||||||
|
if image.Length() == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, attr := range []string{"src", "data-src"} {
|
||||||
|
if raw, ok := image.Attr(attr); ok {
|
||||||
|
return s.normalizeImageURL(raw)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Scraper) normalizeImageURL(raw string) string {
|
||||||
|
raw = strings.TrimSpace(raw)
|
||||||
|
switch {
|
||||||
|
case raw == "":
|
||||||
|
return ""
|
||||||
|
case strings.HasPrefix(raw, "//"):
|
||||||
|
return "https:" + raw
|
||||||
|
default:
|
||||||
|
return raw
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Scraper) parseContentTables(doc *goquery.Document) []model.Table {
|
||||||
|
var tables []model.Table
|
||||||
|
|
||||||
|
doc.Find(".mw-parser-output table").Each(func(_ int, table *goquery.Selection) {
|
||||||
|
if table.HasClass("infoboxtable") || table.Find("table").Length() > 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
rows := table.Find("tr")
|
||||||
|
if rows.Length() < 2 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
headerCells := rows.First().Find("th, td")
|
||||||
|
if headerCells.Length() == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
headers := make([]string, 0, headerCells.Length())
|
||||||
|
headerKeys := make([]string, 0, headerCells.Length())
|
||||||
|
headerCells.Each(func(i int, cell *goquery.Selection) {
|
||||||
|
header := s.clean(cell.Text())
|
||||||
|
if header == "" {
|
||||||
|
header = "Column " + strconv.Itoa(i+1)
|
||||||
|
}
|
||||||
|
headers = append(headers, header)
|
||||||
|
headerKeys = append(headerKeys, s.tableHeaderKey(header, i))
|
||||||
|
})
|
||||||
|
|
||||||
|
parsedTable := model.Table{
|
||||||
|
Title: s.tableTitle(table),
|
||||||
|
Headers: headers,
|
||||||
|
}
|
||||||
|
|
||||||
|
rows.Slice(1, rows.Length()).Each(func(_ int, row *goquery.Selection) {
|
||||||
|
cells := row.Find("th, td")
|
||||||
|
if cells.Length() == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
rawRow := make([]string, 0, cells.Length())
|
||||||
|
rowData := map[string]string{}
|
||||||
|
|
||||||
|
cells.Each(func(i int, cell *goquery.Selection) {
|
||||||
|
text := s.clean(cell.Text())
|
||||||
|
rawRow = append(rawRow, text)
|
||||||
|
key := s.tableColumnKey(headerKeys, headers, i)
|
||||||
|
if text != "" {
|
||||||
|
rowData[key] = text
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
if s.rowIsEmpty(rawRow) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
parsedTable.RawRows = append(parsedTable.RawRows, rawRow)
|
||||||
|
if len(rowData) > 0 {
|
||||||
|
parsedTable.Rows = append(parsedTable.Rows, rowData)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
if len(parsedTable.RawRows) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
tables = append(tables, parsedTable)
|
||||||
|
})
|
||||||
|
|
||||||
|
return tables
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Scraper) tableTitle(table *goquery.Selection) string {
|
||||||
|
var parts []string
|
||||||
|
|
||||||
|
for prev := table.Prev(); prev.Length() > 0; prev = prev.Prev() {
|
||||||
|
switch goquery.NodeName(prev) {
|
||||||
|
case "h4", "h3":
|
||||||
|
text := strings.TrimSpace(strings.TrimSuffix(s.clean(prev.Text()), "[]"))
|
||||||
|
if text != "" {
|
||||||
|
parts = append([]string{text}, parts...)
|
||||||
|
}
|
||||||
|
case "h2":
|
||||||
|
text := strings.TrimSpace(strings.TrimSuffix(s.clean(prev.Text()), "[]"))
|
||||||
|
if text != "" {
|
||||||
|
parts = append([]string{text}, parts...)
|
||||||
|
}
|
||||||
|
return strings.Join(parts, " / ")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return strings.Join(parts, " / ")
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Scraper) tableHeaderKey(header string, index int) string {
|
||||||
|
header = strings.TrimSpace(header)
|
||||||
|
if header == "" {
|
||||||
|
return s.fallbackColumnKey(index)
|
||||||
|
}
|
||||||
|
|
||||||
|
header = strings.ToLower(header)
|
||||||
|
header = strings.ReplaceAll(header, " ", "_")
|
||||||
|
header = strings.ReplaceAll(header, "/", "_")
|
||||||
|
header = strings.ReplaceAll(header, "-", "_")
|
||||||
|
header = s.cfg.WhitespaceRe.ReplaceAllString(header, "_")
|
||||||
|
header = strings.Trim(header, "_")
|
||||||
|
if header == "" {
|
||||||
|
return s.fallbackColumnKey(index)
|
||||||
|
}
|
||||||
|
|
||||||
|
return header
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Scraper) tableColumnKey(headerKeys, headers []string, index int) string {
|
||||||
|
if index < len(headerKeys) && headerKeys[index] != "" {
|
||||||
|
return headerKeys[index]
|
||||||
|
}
|
||||||
|
if index < len(headers) && headers[index] != "" {
|
||||||
|
return headers[index]
|
||||||
|
}
|
||||||
|
return s.fallbackColumnKey(index)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Scraper) fallbackColumnKey(index int) string {
|
||||||
|
return "column_" + strconv.Itoa(index+1)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Scraper) rowIsEmpty(row []string) bool {
|
||||||
|
for _, value := range row {
|
||||||
|
if value != "" {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
func (s *Scraper) findSection(doc *goquery.Document, title string) *goquery.Selection {
|
func (s *Scraper) findSection(doc *goquery.Document, title string) *goquery.Selection {
|
||||||
var found *goquery.Selection
|
var found *goquery.Selection
|
||||||
|
|
||||||
|
|||||||
@@ -18,31 +18,37 @@ type Scraper struct {
|
|||||||
cfg Config
|
cfg Config
|
||||||
collector *colly.Collector
|
collector *colly.Collector
|
||||||
|
|
||||||
mu sync.Mutex
|
mu sync.Mutex
|
||||||
items map[string]*model.Item
|
items map[string]*model.Item
|
||||||
effects map[string]*model.Effect
|
effects map[string]*model.Effect
|
||||||
queued map[string]bool
|
queued map[string]bool
|
||||||
completed int
|
completedURLs map[string]bool
|
||||||
failed int
|
completed int
|
||||||
retried int
|
failed int
|
||||||
requestSeq int
|
retried int
|
||||||
spinnerIndex int
|
requestSeq int
|
||||||
activeURL string
|
spinnerIndex int
|
||||||
activeSince time.Time
|
activeURL string
|
||||||
lastEvent time.Time
|
activeSince time.Time
|
||||||
|
lastEvent time.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
func New(cfg Config) *Scraper {
|
func New(cfg Config) *Scraper {
|
||||||
return &Scraper{
|
return &Scraper{
|
||||||
cfg: cfg,
|
cfg: cfg,
|
||||||
items: map[string]*model.Item{},
|
items: map[string]*model.Item{},
|
||||||
effects: map[string]*model.Effect{},
|
effects: map[string]*model.Effect{},
|
||||||
queued: map[string]bool{},
|
queued: map[string]bool{},
|
||||||
lastEvent: time.Now(),
|
completedURLs: map[string]bool{},
|
||||||
|
lastEvent: time.Now(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Scraper) Run() (model.Dataset, error) {
|
func (s *Scraper) Run() (model.Dataset, error) {
|
||||||
|
if err := s.loadCheckpoint(); err != nil {
|
||||||
|
return model.Dataset{}, err
|
||||||
|
}
|
||||||
|
|
||||||
s.collector = colly.NewCollector(
|
s.collector = colly.NewCollector(
|
||||||
colly.AllowedDomains(s.cfg.AllowedDomain),
|
colly.AllowedDomains(s.cfg.AllowedDomain),
|
||||||
colly.MaxDepth(s.cfg.MaxDepth),
|
colly.MaxDepth(s.cfg.MaxDepth),
|
||||||
@@ -66,13 +72,16 @@ func (s *Scraper) Run() (model.Dataset, error) {
|
|||||||
done := make(chan struct{})
|
done := make(chan struct{})
|
||||||
defer close(done)
|
defer close(done)
|
||||||
s.startStatusLoop(done)
|
s.startStatusLoop(done)
|
||||||
|
s.startAutosaveLoop(done)
|
||||||
|
|
||||||
for _, seed := range append(append([]string{}, s.cfg.ItemSeeds...), s.cfg.CraftingSeeds...) {
|
s.resumeQueue()
|
||||||
s.queueVisit("seed", seed)
|
|
||||||
}
|
|
||||||
|
|
||||||
s.collector.Wait()
|
s.collector.Wait()
|
||||||
|
|
||||||
|
if err := s.SaveCheckpoint("final"); err != nil {
|
||||||
|
return model.Dataset{}, err
|
||||||
|
}
|
||||||
|
|
||||||
return model.Dataset{
|
return model.Dataset{
|
||||||
Items: s.flattenItems(),
|
Items: s.flattenItems(),
|
||||||
Effects: s.flattenEffects(),
|
Effects: s.flattenEffects(),
|
||||||
@@ -225,6 +234,9 @@ func (s *Scraper) registerHandlers() {
|
|||||||
s.collector.OnScraped(func(r *colly.Response) {
|
s.collector.OnScraped(func(r *colly.Response) {
|
||||||
s.mu.Lock()
|
s.mu.Lock()
|
||||||
s.completed++
|
s.completed++
|
||||||
|
if r != nil && r.Request != nil && r.StatusCode < 400 {
|
||||||
|
s.completedURLs[r.Request.URL.String()] = true
|
||||||
|
}
|
||||||
s.activeURL = ""
|
s.activeURL = ""
|
||||||
s.activeSince = time.Time{}
|
s.activeSince = time.Time{}
|
||||||
s.lastEvent = time.Now()
|
s.lastEvent = time.Now()
|
||||||
@@ -233,6 +245,12 @@ func (s *Scraper) registerHandlers() {
|
|||||||
s.mu.Unlock()
|
s.mu.Unlock()
|
||||||
|
|
||||||
logx.Eventf("done", "#%s total=%d queued=%d url=%s", r.Request.Ctx.Get("request_id"), doneCount, queueLen, r.Request.URL.String())
|
logx.Eventf("done", "#%s total=%d queued=%d url=%s", r.Request.Ctx.Get("request_id"), doneCount, queueLen, r.Request.URL.String())
|
||||||
|
|
||||||
|
if s.cfg.AutosavePages > 0 && doneCount%s.cfg.AutosavePages == 0 {
|
||||||
|
if err := s.SaveCheckpoint("progress"); err != nil {
|
||||||
|
logx.Eventf("warn", "autosave failed after %d pages: %v", doneCount, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
s.collector.OnHTML(".mw-parser-output table a[href]", func(e *colly.HTMLElement) {
|
s.collector.OnHTML(".mw-parser-output table a[href]", func(e *colly.HTMLElement) {
|
||||||
@@ -301,6 +319,28 @@ func (s *Scraper) startStatusLoop(done <-chan struct{}) {
|
|||||||
}()
|
}()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *Scraper) startAutosaveLoop(done <-chan struct{}) {
|
||||||
|
if s.cfg.AutosaveEvery <= 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
ticker := time.NewTicker(s.cfg.AutosaveEvery)
|
||||||
|
defer ticker.Stop()
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-done:
|
||||||
|
return
|
||||||
|
case <-ticker.C:
|
||||||
|
if err := s.SaveCheckpoint("timer"); err != nil {
|
||||||
|
logx.Eventf("warn", "autosave failed: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
func (s *Scraper) queueVisit(fromURL, toURL string) {
|
func (s *Scraper) queueVisit(fromURL, toURL string) {
|
||||||
if toURL == "" {
|
if toURL == "" {
|
||||||
return
|
return
|
||||||
@@ -311,6 +351,9 @@ func (s *Scraper) queueVisit(fromURL, toURL string) {
|
|||||||
case s.queued[toURL]:
|
case s.queued[toURL]:
|
||||||
s.mu.Unlock()
|
s.mu.Unlock()
|
||||||
return
|
return
|
||||||
|
case s.completedURLs[toURL]:
|
||||||
|
s.mu.Unlock()
|
||||||
|
return
|
||||||
case len(s.queued) >= s.cfg.MaxQueuedPages:
|
case len(s.queued) >= s.cfg.MaxQueuedPages:
|
||||||
s.mu.Unlock()
|
s.mu.Unlock()
|
||||||
logx.Eventf("skip", "queue budget reached from=%s to=%s", s.debugURLName(fromURL), toURL)
|
logx.Eventf("skip", "queue budget reached from=%s to=%s", s.debugURLName(fromURL), toURL)
|
||||||
@@ -340,6 +383,52 @@ func (s *Scraper) spinnerFrame() string {
|
|||||||
return frame
|
return frame
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *Scraper) resumeQueue() {
|
||||||
|
for _, seed := range append(append([]string{}, s.cfg.ItemSeeds...), s.cfg.CraftingSeeds...) {
|
||||||
|
s.queueSeed(seed)
|
||||||
|
}
|
||||||
|
|
||||||
|
s.mu.Lock()
|
||||||
|
items := make([]model.Item, 0, len(s.items))
|
||||||
|
for _, item := range s.items {
|
||||||
|
items = append(items, *item)
|
||||||
|
}
|
||||||
|
s.mu.Unlock()
|
||||||
|
|
||||||
|
for _, item := range items {
|
||||||
|
for _, effectLink := range item.EffectLinks {
|
||||||
|
link := s.absoluteWikiURL(effectLink)
|
||||||
|
if link == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
s.queueVisit(item.URL, link)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Scraper) queueSeed(toURL string) {
|
||||||
|
if toURL == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
s.mu.Lock()
|
||||||
|
if s.queued[toURL] {
|
||||||
|
s.mu.Unlock()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.queued[toURL] = true
|
||||||
|
queueLen := len(s.queued)
|
||||||
|
s.mu.Unlock()
|
||||||
|
|
||||||
|
ctx := colly.NewContext()
|
||||||
|
ctx.Put("from_url", "seed")
|
||||||
|
|
||||||
|
logx.Eventf("queue", "%d from=%s to=%s", queueLen, "seed", toURL)
|
||||||
|
if err := s.collector.Request("GET", toURL, nil, ctx, nil); err != nil {
|
||||||
|
logx.Eventf("warn", "queue failed from=%s to=%s: %v", "seed", toURL, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func (s *Scraper) shouldRetry(statusCode int) bool {
|
func (s *Scraper) shouldRetry(statusCode int) bool {
|
||||||
return statusCode == 0 || statusCode == 408 || statusCode == 425 || statusCode == 429 || statusCode >= 500
|
return statusCode == 0 || statusCode == 408 || statusCode == 425 || statusCode == 429 || statusCode >= 500
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user