diff --git a/.gitignore b/.gitignore
index 212e961..805cfcf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,3 +32,7 @@ go.work
 .nfs*
 
 # End of https://www.toptal.com/developers/gitignore/api/go,linux
+
+.cache/
+
+outward_data.json
\ No newline at end of file
diff --git a/README.md b/README.md
index ef988d0..821cf2b 100644
--- a/README.md
+++ b/README.md
@@ -29,6 +29,8 @@ go run ./cmd/scrappr
 - Limits crawl depth and queue size to avoid drifting into junk pages
 - Retries temporary failures with short backoff
 - Prints colored emoji logs for queueing, requests, responses, parsing, retries, and periodic status
+- Stores legacy and portable infobox fields, primary item image URLs, recipes, effects, and raw content tables for later processing
+- Saves resumable checkpoints to `.cache/scrape-state.json` on a timer, at progress milestones, and on `Ctrl+C`
 - Writes a stable, sorted JSON dataset to `outward_data.json`
 
 ## Tuning
@@ -37,4 +39,4 @@ Scraper defaults live in `internal/scraper/config.go`.
 
 - Lower or raise `RequestDelay` / `RequestJitter`
 - Tighten or relax `MaxQueuedPages`
-- Adjust `RequestTimeout`, `MaxRetries`, and `ProgressEvery`
+- Adjust `RequestTimeout`, `MaxRetries`, `ProgressEvery`, `AutosaveEvery`, and `AutosavePages`
diff --git a/internal/app/run.go b/internal/app/run.go
index 20b413a..7b9b981 100644
--- a/internal/app/run.go
+++ b/internal/app/run.go
@@ -4,6 +4,8 @@ import (
     "encoding/json"
     "math/rand"
     "os"
+    "os/signal"
+    "syscall"
     "time"
 
     "scrappr/internal/logx"
@@ -20,6 +22,7 @@
 func Run() error {
     cfg := scraper.DefaultConfig()
     s := scraper.New(cfg)
+    installSignalCheckpoint(s)
 
     dataset, err := s.Run()
     if err != nil {
@@ -39,9 +42,27 @@
         outputPath,
     )
 
+    if err := s.ClearCheckpoint(); err != nil {
+        logx.Eventf("warn", "failed to clear checkpoint: %v", err)
+    }
+
     return nil
 }
 
+func installSignalCheckpoint(s *scraper.Scraper) {
+    signals := make(chan os.Signal, 1)
+    signal.Notify(signals, os.Interrupt, syscall.SIGTERM)
+
+    go func() {
+        sig := <-signals
+        logx.Eventf("warn", "received %s, saving checkpoint before exit", sig.String())
+        if err := s.SaveCheckpoint("signal"); err != nil {
+            logx.Eventf("error", "failed to save checkpoint on signal: %v", err)
+        }
+        os.Exit(130)
+    }()
+}
+
 func writeDataset(outputPath string, dataset model.Dataset) error {
     file, err := os.Create(outputPath)
     if err != nil {
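A note on `installSignalCheckpoint` above: the handler always exits with status 130, the shell convention for SIGINT (128 + 2), even though it also listens for SIGTERM, which conventionally maps to 143. A hedged variant that derives the status from whichever signal actually fired (a sketch only, not part of this patch):

```go
// Sketch (not in this patch): swap the goroutine body in
// installSignalCheckpoint for one that follows the 128+signum convention.
go func() {
    sig := <-signals
    logx.Eventf("warn", "received %s, saving checkpoint before exit", sig.String())
    if err := s.SaveCheckpoint("signal"); err != nil {
        logx.Eventf("error", "failed to save checkpoint on signal: %v", err)
    }
    code := 130 // default for os.Interrupt
    if num, ok := sig.(syscall.Signal); ok {
        code = 128 + int(num) // 130 for SIGINT, 143 for SIGTERM
    }
    os.Exit(code)
}()
```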
diff --git a/internal/logx/logx.go b/internal/logx/logx.go
index 8f459b7..714a7c8 100644
--- a/internal/logx/logx.go
+++ b/internal/logx/logx.go
@@ -36,6 +36,7 @@ var (
     "status": {emoji: "🌀", label: "STATUS", color: colorYellow},
     "done":   {emoji: "✅", label: "DONE", color: colorGreen},
     "write":  {emoji: "💾", label: "WRITE", color: colorBlue},
+    "cache":  {emoji: "🗂️", label: "CACHE", color: colorCyan},
     "skip":   {emoji: "⏭️", label: "SKIP", color: colorGray},
     "warn":   {emoji: "⚠️", label: "WARN", color: colorYellow},
     "error":  {emoji: "💥", label: "ERROR", color: colorRed},
diff --git a/internal/model/types.go b/internal/model/types.go
index f21e61c..d12443e 100644
--- a/internal/model/types.go
+++ b/internal/model/types.go
@@ -8,14 +8,23 @@ type Recipe struct {
     SourcePage string `json:"source_page,omitempty"`
 }
 
+type Table struct {
+    Title   string              `json:"title,omitempty"`
+    Headers []string            `json:"headers,omitempty"`
+    Rows    []map[string]string `json:"rows,omitempty"`
+    RawRows [][]string          `json:"raw_rows,omitempty"`
+}
+
 type Item struct {
     Name        string            `json:"name"`
     URL         string            `json:"url"`
     Categories  []string          `json:"categories,omitempty"`
     Infobox     map[string]string `json:"infobox,omitempty"`
+    ImageURL    string            `json:"image_url,omitempty"`
     Effects     []string          `json:"effects,omitempty"`
     EffectLinks []string          `json:"effect_links,omitempty"`
     Recipes     []Recipe          `json:"recipes,omitempty"`
+    Tables      []Table           `json:"tables,omitempty"`
     Description string            `json:"description,omitempty"`
 }
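For orientation, this is how the new `Table` type serializes once a page's content tables are captured; the values below are invented for illustration:

```go
package main

import (
    "encoding/json"
    "fmt"

    "scrappr/internal/model"
)

func main() {
    // Invented sample data: one crafting table with a single row.
    t := model.Table{
        Title:   "Crafting",
        Headers: []string{"Result", "Ingredients"},
        Rows: []map[string]string{
            {"result": "Bandages", "ingredients": "Linen Cloth x2"},
        },
        RawRows: [][]string{{"Bandages", "Linen Cloth x2"}},
    }

    out, _ := json.Marshal(t)
    fmt.Println(string(out))
    // Output (wrapped here for readability):
    // {"title":"Crafting","headers":["Result","Ingredients"],
    //  "rows":[{"ingredients":"Linen Cloth x2","result":"Bandages"}],
    //  "raw_rows":[["Bandages","Linen Cloth x2"]]}
}
```

`Rows` keys come from the header normalization in `parse.go` (see the sketch after that file's diff), while `RawRows` preserves cell order verbatim for tables whose headers do not normalize cleanly.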
diff --git a/internal/scraper/checkpoint.go b/internal/scraper/checkpoint.go
new file mode 100644
index 0000000..39e06b8
--- /dev/null
+++ b/internal/scraper/checkpoint.go
@@ -0,0 +1,200 @@
+package scraper
+
+import (
+    "encoding/json"
+    "errors"
+    "os"
+    "path/filepath"
+    "sort"
+    "strings"
+    "time"
+
+    "scrappr/internal/logx"
+    "scrappr/internal/model"
+)
+
+const checkpointVersion = 2
+
+type checkpoint struct {
+    Version       int             `json:"version"`
+    SavedAt       time.Time       `json:"saved_at"`
+    Reason        string          `json:"reason"`
+    Dataset       model.Dataset   `json:"dataset"`
+    CompletedURLs []string        `json:"completed_urls"`
+    Stats         checkpointStats `json:"stats"`
+}
+
+type checkpointStats struct {
+    Completed int `json:"completed"`
+    Failed    int `json:"failed"`
+    Retried   int `json:"retried"`
+}
+
+func (s *Scraper) SaveCheckpoint(reason string) error {
+    state := s.snapshotCheckpoint(reason)
+
+    if err := os.MkdirAll(s.cfg.CacheDir, 0o755); err != nil {
+        return err
+    }
+
+    tempPath := s.cfg.CheckpointPath + ".tmp"
+    file, err := os.Create(tempPath)
+    if err != nil {
+        return err
+    }
+
+    encoder := json.NewEncoder(file)
+    encoder.SetIndent("", "  ")
+    if err := encoder.Encode(state); err != nil {
+        file.Close()
+        _ = os.Remove(tempPath)
+        return err
+    }
+
+    if err := file.Close(); err != nil {
+        _ = os.Remove(tempPath)
+        return err
+    }
+
+    if err := os.Rename(tempPath, s.cfg.CheckpointPath); err != nil {
+        _ = os.Remove(tempPath)
+        return err
+    }
+
+    logx.Eventf(
+        "cache",
+        "saved checkpoint (%s) items=%d effects=%d completed=%d -> %s",
+        reason,
+        len(state.Dataset.Items),
+        len(state.Dataset.Effects),
+        state.Stats.Completed,
+        s.cfg.CheckpointPath,
+    )
+
+    return nil
+}
+
+func (s *Scraper) ClearCheckpoint() error {
+    err := os.Remove(s.cfg.CheckpointPath)
+    if err != nil && !errors.Is(err, os.ErrNotExist) {
+        return err
+    }
+
+    logx.Eventf("cache", "cleared checkpoint %s", s.cfg.CheckpointPath)
+    return nil
+}
+
+func (s *Scraper) loadCheckpoint() error {
+    file, err := os.Open(s.cfg.CheckpointPath)
+    if err != nil {
+        if errors.Is(err, os.ErrNotExist) {
+            return nil
+        }
+        return err
+    }
+    defer file.Close()
+
+    var state checkpoint
+    if err := json.NewDecoder(file).Decode(&state); err != nil {
+        return err
+    }
+
+    if state.Version != checkpointVersion {
+        logx.Eventf(
+            "cache",
+            "ignoring checkpoint %s with version=%d expected=%d",
+            s.cfg.CheckpointPath,
+            state.Version,
+            checkpointVersion,
+        )
+        return nil
+    }
+
+    s.mu.Lock()
+    defer s.mu.Unlock()
+
+    s.items = make(map[string]*model.Item, len(state.Dataset.Items))
+    for i := range state.Dataset.Items {
+        item := state.Dataset.Items[i]
+        s.items[item.URL] = &item
+    }
+
+    s.effects = make(map[string]*model.Effect, len(state.Dataset.Effects))
+    for i := range state.Dataset.Effects {
+        effect := state.Dataset.Effects[i]
+        s.effects[effect.URL] = &effect
+    }
+
+    s.completedURLs = make(map[string]bool, len(state.CompletedURLs))
+    for _, rawURL := range state.CompletedURLs {
+        if rawURL != "" {
+            s.completedURLs[rawURL] = true
+        }
+    }
+
+    s.completed = state.Stats.Completed
+    s.failed = state.Stats.Failed
+    s.retried = state.Stats.Retried
+    s.queued = map[string]bool{}
+    s.lastEvent = time.Now()
+
+    logx.Eventf(
+        "cache",
+        "loaded checkpoint from %s saved=%s items=%d effects=%d completed=%d",
+        s.cfg.CheckpointPath,
+        state.SavedAt.Format(time.RFC3339),
+        len(state.Dataset.Items),
+        len(state.Dataset.Effects),
+        state.Stats.Completed,
+    )
+
+    return nil
+}
+
+func (s *Scraper) snapshotCheckpoint(reason string) checkpoint {
+    s.mu.Lock()
+    defer s.mu.Unlock()
+
+    items := make([]model.Item, 0, len(s.items))
+    for _, item := range s.items {
+        items = append(items, *item)
+    }
+
+    effects := make([]model.Effect, 0, len(s.effects))
+    for _, effect := range s.effects {
+        effects = append(effects, *effect)
+    }
+
+    completedURLs := make([]string, 0, len(s.completedURLs))
+    for rawURL := range s.completedURLs {
+        completedURLs = append(completedURLs, rawURL)
+    }
+
+    sort.Slice(items, func(i, j int) bool {
+        return strings.ToLower(items[i].Name) < strings.ToLower(items[j].Name)
+    })
+    sort.Slice(effects, func(i, j int) bool {
+        return strings.ToLower(effects[i].Name) < strings.ToLower(effects[j].Name)
+    })
+    sort.Strings(completedURLs)
+
+    return checkpoint{
+        Version: checkpointVersion,
+        SavedAt: time.Now(),
+        Reason:  reason,
+        Dataset: model.Dataset{
+            Items:   items,
+            Effects: effects,
+        },
+        CompletedURLs: completedURLs,
+        Stats: checkpointStats{
+            Completed: s.completed,
+            Failed:    s.failed,
+            Retried:   s.retried,
+        },
+    }
+}
+
+func (s *Scraper) CheckpointPath() string {
+    return filepath.Clean(s.cfg.CheckpointPath)
+}
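Two details of `checkpoint.go` worth calling out: `SaveCheckpoint` writes to `<CheckpointPath>.tmp` and renames it into place, so a crash mid-write never leaves a torn `scrape-state.json`, and `loadCheckpoint` silently ignores files whose `version` differs from `checkpointVersion`. A minimal test sketch against the exported surface this diff adds (the test file and assertions are assumptions, not part of the patch):

```go
package scraper_test

import (
    "encoding/json"
    "os"
    "path/filepath"
    "testing"

    "scrappr/internal/scraper"
)

// Round-trips an (empty) checkpoint through SaveCheckpoint and ClearCheckpoint.
func TestCheckpointRoundTrip(t *testing.T) {
    dir := t.TempDir()

    cfg := scraper.DefaultConfig()
    cfg.CacheDir = dir
    cfg.CheckpointPath = filepath.Join(dir, "scrape-state.json")

    s := scraper.New(cfg)
    if err := s.SaveCheckpoint("test"); err != nil {
        t.Fatalf("save: %v", err)
    }

    // The file on disk is valid JSON carrying the current version stamp.
    raw, err := os.ReadFile(cfg.CheckpointPath)
    if err != nil {
        t.Fatalf("read: %v", err)
    }
    var head struct {
        Version int `json:"version"`
    }
    if err := json.Unmarshal(raw, &head); err != nil || head.Version != 2 {
        t.Fatalf("unexpected checkpoint header: version=%d err=%v", head.Version, err)
    }

    // ClearCheckpoint tolerates a missing file, so calling it twice succeeds.
    if err := s.ClearCheckpoint(); err != nil {
        t.Fatalf("clear: %v", err)
    }
    if err := s.ClearCheckpoint(); err != nil {
        t.Fatalf("second clear: %v", err)
    }
}
```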
diff --git a/internal/scraper/config.go b/internal/scraper/config.go
index 7f8afc5..8a7c457 100644
--- a/internal/scraper/config.go
+++ b/internal/scraper/config.go
@@ -8,9 +8,13 @@ import (
 type Config struct {
     BaseURL        string
     AllowedDomain  string
+    CacheDir       string
+    CheckpointPath string
     MaxDepth       int
     MaxRetries     int
     MaxQueuedPages int
+    AutosaveEvery  time.Duration
+    AutosavePages  int
     RequestDelay   time.Duration
     RequestJitter  time.Duration
     RequestTimeout time.Duration
@@ -35,9 +39,13 @@ func DefaultConfig() Config {
     return Config{
         BaseURL:        baseURL,
         AllowedDomain:  "outward.fandom.com",
+        CacheDir:       ".cache",
+        CheckpointPath: ".cache/scrape-state.json",
         MaxDepth:       3,
         MaxRetries:     2,
         MaxQueuedPages: 1500,
+        AutosaveEvery:  12 * time.Second,
+        AutosavePages:  20,
         RequestDelay:   650 * time.Millisecond,
         RequestJitter:  350 * time.Millisecond,
         RequestTimeout: 8 * time.Second,
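Both new knobs degrade gracefully: `startAutosaveLoop` (in `scraper.go` below) returns immediately when `AutosaveEvery <= 0`, and the milestone save in `OnScraped` is guarded by `AutosavePages > 0`. A sketch of per-run overrides, assuming only the fields this diff adds:

```go
// Sketch: tune checkpointing without editing DefaultConfig itself.
cfg := scraper.DefaultConfig()
cfg.AutosaveEvery = 0                        // disables the timer-based autosave loop
cfg.AutosavePages = 50                       // checkpoint every 50 completed pages instead
cfg.CheckpointPath = ".cache/alt-state.json" // keep differently-tuned runs separate
s := scraper.New(cfg)
```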
diff --git a/internal/scraper/parse.go b/internal/scraper/parse.go
index b877afe..de7bde4 100644
--- a/internal/scraper/parse.go
+++ b/internal/scraper/parse.go
@@ -1,10 +1,12 @@
 package scraper
 
 import (
+    "encoding/json"
     "math/rand"
     "net/url"
     "path"
     "sort"
+    "strconv"
     "strings"
 
     "github.com/PuerkitoBio/goquery"
@@ -127,9 +129,6 @@ func (s *Scraper) isLikelyItemPage(pagePath, title string, lcCats []string, doc
     if strings.Contains(strings.ToLower(title), "/") {
         return false
     }
-    if doc.Find("aside.portable-infobox").Length() == 0 {
-        return false
-    }
     for _, category := range lcCats {
         if strings.Contains(category, "items") ||
             strings.Contains(category, "food") ||
@@ -140,11 +139,21 @@ func (s *Scraper) isLikelyItemPage(pagePath, title string, lcCats []string, doc
         }
     }
 
-    infoboxText := strings.ToLower(s.clean(doc.Find("aside.portable-infobox").Text()))
+    if !s.hasItemInfobox(doc) {
+        return false
+    }
+
+    infoboxText := strings.ToLower(s.clean(s.infoboxText(doc)))
+    descriptionText := strings.ToLower(s.parseDescription(doc))
     return strings.Contains(infoboxText, "item details") ||
         strings.Contains(infoboxText, "consumable details") ||
         strings.Contains(infoboxText, "equipment details") ||
-        strings.Contains(infoboxText, "weapon")
+        strings.Contains(infoboxText, "weapon") ||
+        strings.Contains(descriptionText, " is an item in outward") ||
+        strings.Contains(descriptionText, " is a weapon in outward") ||
+        strings.Contains(descriptionText, " is a consumable in outward") ||
+        strings.Contains(descriptionText, " is an ingredient in outward") ||
+        strings.Contains(descriptionText, " is a shield in outward")
 }
 
 func (s *Scraper) isEffectPage(pagePath, title string, lcCats []string) bool {
@@ -166,12 +175,14 @@ func (s *Scraper) parseItemPage(doc *goquery.Document, title, pageURL string, ca
         Name:       title,
         URL:        pageURL,
         Categories: categories,
-        Infobox:    s.parsePortableInfobox(doc),
+        Infobox:    s.parseInfobox(doc),
+        ImageURL:   s.parseImageURL(doc),
     }
 
     item.Description = s.parseDescription(doc)
     item.Effects, item.EffectLinks = s.parseEffectsSection(doc)
     item.Recipes = s.parseRecipesFromPage(doc, title)
+    item.Tables = s.parseContentTables(doc)
 
     return item
 }
@@ -181,12 +192,20 @@
         Name:        title,
         URL:         pageURL,
         Categories:  categories,
-        Infobox:     s.parsePortableInfobox(doc),
+        Infobox:     s.parseInfobox(doc),
         Description: s.parseDescription(doc),
     }
 }
 
-func (s *Scraper) parsePortableInfobox(doc *goquery.Document) map[string]string {
+func (s *Scraper) hasItemInfobox(doc *goquery.Document) bool {
+    return doc.Find("aside.portable-infobox").Length() > 0 || doc.Find("table.infoboxtable").Length() > 0
+}
+
+func (s *Scraper) infoboxText(doc *goquery.Document) string {
+    return s.clean(doc.Find("aside.portable-infobox, table.infoboxtable").First().Text())
+}
+
+func (s *Scraper) parseInfobox(doc *goquery.Document) map[string]string {
     out := map[string]string{}
 
     doc.Find("aside.portable-infobox .pi-item").Each(func(_ int, section *goquery.Selection) {
@@ -205,6 +224,32 @@
         }
     })
 
+    doc.Find("table.infoboxtable tr").Each(func(_ int, row *goquery.Selection) {
+        cells := row.Find("th, td")
+        if cells.Length() < 2 {
+            return
+        }
+
+        label := s.clean(cells.First().Text())
+        if label == "" {
+            return
+        }
+
+        var values []string
+        cells.Slice(1, cells.Length()).Each(func(_ int, cell *goquery.Selection) {
+            text := s.clean(cell.Text())
+            if text != "" {
+                values = append(values, text)
+            }
+        })
+
+        if len(values) == 0 {
+            return
+        }
+
+        out[label] = strings.Join(values, " | ")
+    })
+
     return out
 }
@@ -220,6 +265,38 @@
         }
     })
 
+    if len(categories) == 0 {
+        doc.Find("script").EachWithBreak(func(_ int, node *goquery.Selection) bool {
+            text := node.Text()
+            index := strings.Index(text, "wgCategories")
+            if index < 0 {
+                return true
+            }
+
+            arrayStart := strings.Index(text[index:], "[")
+            arrayEnd := strings.Index(text[index:], "]")
+            if arrayStart < 0 || arrayEnd < 0 || arrayEnd <= arrayStart {
+                return true
+            }
+
+            raw := text[index+arrayStart : index+arrayEnd+1]
+            var parsed []string
+            if err := json.Unmarshal([]byte(raw), &parsed); err != nil {
+                return true
+            }
+
+            for _, category := range parsed {
+                category = s.clean(category)
+                if category != "" && !seen[category] {
+                    seen[category] = true
+                    categories = append(categories, category)
+                }
+            }
+
+            return len(categories) == 0
+        })
+    }
+
     if len(categories) == 0 {
         headerText := s.clean(doc.Find("body").Text())
         if idx := strings.Index(headerText, "in:"); idx >= 0 {
@@ -295,6 +372,10 @@
     var recipes []model.Recipe
 
     doc.Find("table").Each(func(_ int, table *goquery.Selection) {
+        if table.HasClass("infoboxtable") {
+            return
+        }
+
         headerText := strings.ToLower(s.clean(table.Find("tr").First().Text()))
         if !(strings.Contains(headerText, "result") && strings.Contains(headerText, "ingredient")) {
             return
@@ -354,6 +435,169 @@ func (s *Scraper) parseRecipesFromPage(doc *goquery.Document, pageTitle string)
     return recipes
 }
 
+func (s *Scraper) parseImageURL(doc *goquery.Document) string {
+    image := doc.Find("aside.portable-infobox img, table.infoboxtable img").First()
+    if image.Length() == 0 {
+        return ""
+    }
+
+    for _, attr := range []string{"src", "data-src"} {
+        if raw, ok := image.Attr(attr); ok {
+            return s.normalizeImageURL(raw)
+        }
+    }
+
+    return ""
+}
+
+func (s *Scraper) normalizeImageURL(raw string) string {
+    raw = strings.TrimSpace(raw)
+    switch {
+    case raw == "":
+        return ""
+    case strings.HasPrefix(raw, "//"):
+        return "https:" + raw
+    default:
+        return raw
+    }
+}
+
+func (s *Scraper) parseContentTables(doc *goquery.Document) []model.Table {
+    var tables []model.Table
+
+    doc.Find(".mw-parser-output table").Each(func(_ int, table *goquery.Selection) {
+        if table.HasClass("infoboxtable") || table.Find("table").Length() > 0 {
+            return
+        }
+
+        rows := table.Find("tr")
+        if rows.Length() < 2 {
+            return
+        }
+
+        headerCells := rows.First().Find("th, td")
+        if headerCells.Length() == 0 {
+            return
+        }
+
+        headers := make([]string, 0, headerCells.Length())
+        headerKeys := make([]string, 0, headerCells.Length())
+        headerCells.Each(func(i int, cell *goquery.Selection) {
+            header := s.clean(cell.Text())
+            if header == "" {
+                header = "Column " + strconv.Itoa(i+1)
+            }
+            headers = append(headers, header)
+            headerKeys = append(headerKeys, s.tableHeaderKey(header, i))
+        })
+
+        parsedTable := model.Table{
+            Title:   s.tableTitle(table),
+            Headers: headers,
+        }
+
+        rows.Slice(1, rows.Length()).Each(func(_ int, row *goquery.Selection) {
+            cells := row.Find("th, td")
+            if cells.Length() == 0 {
+                return
+            }
+
+            rawRow := make([]string, 0, cells.Length())
+            rowData := map[string]string{}
+
+            cells.Each(func(i int, cell *goquery.Selection) {
+                text := s.clean(cell.Text())
+                rawRow = append(rawRow, text)
+                key := s.tableColumnKey(headerKeys, headers, i)
+                if text != "" {
+                    rowData[key] = text
+                }
+            })
+
+            if s.rowIsEmpty(rawRow) {
+                return
+            }
+
+            parsedTable.RawRows = append(parsedTable.RawRows, rawRow)
+            if len(rowData) > 0 {
+                parsedTable.Rows = append(parsedTable.Rows, rowData)
+            }
+        })
+
+        if len(parsedTable.RawRows) == 0 {
+            return
+        }
+
+        tables = append(tables, parsedTable)
+    })
+
+    return tables
+}
+
+func (s *Scraper) tableTitle(table *goquery.Selection) string {
+    var parts []string
+
+    for prev := table.Prev(); prev.Length() > 0; prev = prev.Prev() {
+        switch goquery.NodeName(prev) {
+        case "h4", "h3":
+            text := strings.TrimSpace(strings.TrimSuffix(s.clean(prev.Text()), "[]"))
+            if text != "" {
+                parts = append([]string{text}, parts...)
+            }
+        case "h2":
+            text := strings.TrimSpace(strings.TrimSuffix(s.clean(prev.Text()), "[]"))
+            if text != "" {
+                parts = append([]string{text}, parts...)
+            }
+            return strings.Join(parts, " / ")
+        }
+    }
+
+    return strings.Join(parts, " / ")
+}
+
+func (s *Scraper) tableHeaderKey(header string, index int) string {
+    header = strings.TrimSpace(header)
+    if header == "" {
+        return s.fallbackColumnKey(index)
+    }
+
+    header = strings.ToLower(header)
+    header = strings.ReplaceAll(header, " ", "_")
+    header = strings.ReplaceAll(header, "/", "_")
+    header = strings.ReplaceAll(header, "-", "_")
+    header = s.cfg.WhitespaceRe.ReplaceAllString(header, "_")
+    header = strings.Trim(header, "_")
+    if header == "" {
+        return s.fallbackColumnKey(index)
+    }
+
+    return header
+}
+
+func (s *Scraper) tableColumnKey(headerKeys, headers []string, index int) string {
+    if index < len(headerKeys) && headerKeys[index] != "" {
+        return headerKeys[index]
+    }
+    if index < len(headers) && headers[index] != "" {
+        return headers[index]
+    }
+    return s.fallbackColumnKey(index)
+}
+
+func (s *Scraper) fallbackColumnKey(index int) string {
+    return "column_" + strconv.Itoa(index+1)
+}
+
+func (s *Scraper) rowIsEmpty(row []string) bool {
+    for _, value := range row {
+        if value != "" {
+            return false
+        }
+    }
+    return true
+}
+
 func (s *Scraper) findSection(doc *goquery.Document, title string) *goquery.Selection {
     var found *goquery.Selection
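The keys in `Table.Rows` come from `tableHeaderKey` above: it lowercases each header and folds spaces, slashes, and hyphens into underscores, falling back to positional `column_N` names for blank headers. A standalone mirror of that transform for illustration (the real method additionally collapses leftover whitespace with `cfg.WhitespaceRe`):

```go
package main

import (
    "fmt"
    "strconv"
    "strings"
)

// headerKey mirrors tableHeaderKey from parse.go; illustration only.
func headerKey(header string, index int) string {
    header = strings.ToLower(strings.TrimSpace(header))
    for _, sep := range []string{" ", "/", "-"} {
        header = strings.ReplaceAll(header, sep, "_")
    }
    header = strings.Trim(header, "_")
    if header == "" {
        return "column_" + strconv.Itoa(index+1)
    }
    return header
}

func main() {
    fmt.Println(headerKey("Result", 0))       // result
    fmt.Println(headerKey("Hex Build-Up", 1)) // hex_build_up
    fmt.Println(headerKey("", 2))             // column_3
}
```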
diff --git a/internal/scraper/scraper.go b/internal/scraper/scraper.go
index 5c6cd44..15a74dc 100644
--- a/internal/scraper/scraper.go
+++ b/internal/scraper/scraper.go
@@ -18,31 +18,37 @@ type Scraper struct {
     cfg       Config
     collector *colly.Collector
 
-    mu           sync.Mutex
-    items        map[string]*model.Item
-    effects      map[string]*model.Effect
-    queued       map[string]bool
-    completed    int
-    failed       int
-    retried      int
-    requestSeq   int
-    spinnerIndex int
-    activeURL    string
-    activeSince  time.Time
-    lastEvent    time.Time
+    mu            sync.Mutex
+    items         map[string]*model.Item
+    effects       map[string]*model.Effect
+    queued        map[string]bool
+    completedURLs map[string]bool
+    completed     int
+    failed        int
+    retried       int
+    requestSeq    int
+    spinnerIndex  int
+    activeURL     string
+    activeSince   time.Time
+    lastEvent     time.Time
 }
 
 func New(cfg Config) *Scraper {
     return &Scraper{
-        cfg:       cfg,
-        items:     map[string]*model.Item{},
-        effects:   map[string]*model.Effect{},
-        queued:    map[string]bool{},
-        lastEvent: time.Now(),
+        cfg:           cfg,
+        items:         map[string]*model.Item{},
+        effects:       map[string]*model.Effect{},
+        queued:        map[string]bool{},
+        completedURLs: map[string]bool{},
+        lastEvent:     time.Now(),
     }
 }
 
 func (s *Scraper) Run() (model.Dataset, error) {
+    if err := s.loadCheckpoint(); err != nil {
+        return model.Dataset{}, err
+    }
+
     s.collector = colly.NewCollector(
         colly.AllowedDomains(s.cfg.AllowedDomain),
         colly.MaxDepth(s.cfg.MaxDepth),
@@ -66,13 +72,16 @@ func (s *Scraper) Run() (model.Dataset, error) {
     done := make(chan struct{})
     defer close(done)
     s.startStatusLoop(done)
+    s.startAutosaveLoop(done)
 
-    for _, seed := range append(append([]string{}, s.cfg.ItemSeeds...), s.cfg.CraftingSeeds...) {
-        s.queueVisit("seed", seed)
-    }
+    s.resumeQueue()
 
     s.collector.Wait()
 
+    if err := s.SaveCheckpoint("final"); err != nil {
+        return model.Dataset{}, err
+    }
+
     return model.Dataset{
         Items:   s.flattenItems(),
         Effects: s.flattenEffects(),
@@ -225,6 +234,9 @@
     s.collector.OnScraped(func(r *colly.Response) {
         s.mu.Lock()
         s.completed++
+        if r != nil && r.Request != nil && r.StatusCode < 400 {
+            s.completedURLs[r.Request.URL.String()] = true
+        }
         s.activeURL = ""
         s.activeSince = time.Time{}
         s.lastEvent = time.Now()
@@ -233,6 +245,12 @@
         s.mu.Unlock()
 
         logx.Eventf("done", "#%s total=%d queued=%d url=%s", r.Request.Ctx.Get("request_id"), doneCount, queueLen, r.Request.URL.String())
+
+        if s.cfg.AutosavePages > 0 && doneCount%s.cfg.AutosavePages == 0 {
+            if err := s.SaveCheckpoint("progress"); err != nil {
+                logx.Eventf("warn", "autosave failed after %d pages: %v", doneCount, err)
+            }
+        }
     })
 
     s.collector.OnHTML(".mw-parser-output table a[href]", func(e *colly.HTMLElement) {
@@ -301,6 +319,28 @@
     }()
 }
 
+func (s *Scraper) startAutosaveLoop(done <-chan struct{}) {
+    if s.cfg.AutosaveEvery <= 0 {
+        return
+    }
+
+    go func() {
+        ticker := time.NewTicker(s.cfg.AutosaveEvery)
+        defer ticker.Stop()
+
+        for {
+            select {
+            case <-done:
+                return
+            case <-ticker.C:
+                if err := s.SaveCheckpoint("timer"); err != nil {
+                    logx.Eventf("warn", "autosave failed: %v", err)
+                }
+            }
+        }
+    }()
+}
+
 func (s *Scraper) queueVisit(fromURL, toURL string) {
     if toURL == "" {
         return
     }
@@ -311,6 +351,9 @@
     case s.queued[toURL]:
         s.mu.Unlock()
        return
+    case s.completedURLs[toURL]:
+        s.mu.Unlock()
+        return
     case len(s.queued) >= s.cfg.MaxQueuedPages:
         s.mu.Unlock()
         logx.Eventf("skip", "queue budget reached from=%s to=%s", s.debugURLName(fromURL), toURL)
@@ -340,6 +383,52 @@
     return frame
 }
 
+func (s *Scraper) resumeQueue() {
+    for _, seed := range append(append([]string{}, s.cfg.ItemSeeds...), s.cfg.CraftingSeeds...) {
+        s.queueSeed(seed)
+    }
+
+    s.mu.Lock()
+    items := make([]model.Item, 0, len(s.items))
+    for _, item := range s.items {
+        items = append(items, *item)
+    }
+    s.mu.Unlock()
+
+    for _, item := range items {
+        for _, effectLink := range item.EffectLinks {
+            link := s.absoluteWikiURL(effectLink)
+            if link == "" {
+                continue
+            }
+            s.queueVisit(item.URL, link)
+        }
+    }
+}
+
+func (s *Scraper) queueSeed(toURL string) {
+    if toURL == "" {
+        return
+    }
+
+    s.mu.Lock()
+    if s.queued[toURL] {
+        s.mu.Unlock()
+        return
+    }
+    s.queued[toURL] = true
+    queueLen := len(s.queued)
+    s.mu.Unlock()
+
+    ctx := colly.NewContext()
+    ctx.Put("from_url", "seed")
+
+    logx.Eventf("queue", "%d from=%s to=%s", queueLen, "seed", toURL)
+    if err := s.collector.Request("GET", toURL, nil, ctx, nil); err != nil {
+        logx.Eventf("warn", "queue failed from=%s to=%s: %v", "seed", toURL, err)
+    }
+}
+
 func (s *Scraper) shouldRetry(statusCode int) bool {
     return statusCode == 0 || statusCode == 408 || statusCode == 425 || statusCode == 429 || statusCode >= 500
 }
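Taken together, the resume path is: `Run` loads any checkpoint before the collector starts, `resumeQueue` re-queues every seed through `queueSeed` (which deliberately skips the `completedURLs` check, so listing pages are always re-fetched and can surface new links) plus the effect links of already-scraped items, and `queueVisit` drops anything recorded in `completed_urls`. A sketch of the full lifecycle; `crawlWithResume` is a hypothetical wrapper over what `internal/app.Run` already wires up:

```go
package app

import (
    "scrappr/internal/model"
    "scrappr/internal/scraper"
)

// Sketch: interrupt a run with Ctrl+C and call this again; the second run
// resumes from .cache/scrape-state.json instead of starting over.
func crawlWithResume() (model.Dataset, error) {
    s := scraper.New(scraper.DefaultConfig())

    // Run: loadCheckpoint -> resumeQueue -> crawl -> SaveCheckpoint("final").
    dataset, err := s.Run()
    if err != nil {
        // Any previously saved checkpoint stays on disk for the next attempt.
        return model.Dataset{}, err
    }

    // Persist the dataset first; only then discard the checkpoint.
    return dataset, s.ClearCheckpoint()
}
```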