2026-03-15 16:42:43 +02:00
|
|
|
package scraper
|
|
|
|
|
|
|
|
|
|
import (
|
feat(scraper): add checkpointing and richer page extraction

Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.

- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in README
2026-03-15 17:08:24 +02:00
|
|
|
"encoding/json"
|
2026-03-15 16:42:43 +02:00
|
|
|
"math/rand"
|
|
|
|
|
"net/url"
|
|
|
|
|
"path"
|
|
|
|
|
"sort"
|
feat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.
- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in READMEfeat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.
- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in README
2026-03-15 17:08:24 +02:00
|
|
|
"strconv"
|
2026-03-15 16:42:43 +02:00
|
|
|
"strings"
|
|
|
|
|
|
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
|
|
|
"github.com/gocolly/colly/v2"
|
|
|
|
|
|
|
|
|
|
"scrappr/internal/model"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) shouldVisit(raw string) bool {
|
|
|
|
|
if raw == "" {
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
parsed, err := url.Parse(raw)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if parsed.Host != s.cfg.AllowedDomain {
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if !strings.HasPrefix(parsed.Path, "/wiki/") {
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if s.cfg.IgnoredExact[parsed.Path] {
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for _, prefix := range s.cfg.IgnoredPrefixes {
|
|
|
|
|
if strings.HasPrefix(parsed.Path, prefix) {
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) applyBrowserHeaders(r *colly.Request) {
|
|
|
|
|
r.Headers.Set("User-Agent", s.randomUserAgent())
|
|
|
|
|
r.Headers.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8")
|
|
|
|
|
r.Headers.Set("Accept-Language", "en-US,en;q=0.9")
|
|
|
|
|
r.Headers.Set("Cache-Control", "max-age=0")
|
|
|
|
|
r.Headers.Set("DNT", "1")
|
|
|
|
|
r.Headers.Set("Sec-Fetch-Dest", "document")
|
|
|
|
|
r.Headers.Set("Sec-Fetch-Mode", "navigate")
|
|
|
|
|
r.Headers.Set("Sec-Fetch-Site", "same-origin")
|
|
|
|
|
r.Headers.Set("Sec-Fetch-User", "?1")
|
|
|
|
|
r.Headers.Set("Upgrade-Insecure-Requests", "1")
|
|
|
|
|
|
|
|
|
|
if r.Headers.Get("Referer") == "" {
|
|
|
|
|
r.Headers.Set("Referer", s.cfg.BrowserReferrer)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) randomUserAgent() string {
|
|
|
|
|
return s.cfg.BrowserAgents[rand.Intn(len(s.cfg.BrowserAgents))]
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) shouldQueueFromPage(fromPath, toURL string) bool {
|
|
|
|
|
parsed, err := url.Parse(toURL)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
toPath := parsed.Path
|
|
|
|
|
|
|
|
|
|
if s.cfg.ItemListPathRe.MatchString(fromPath) {
|
|
|
|
|
return strings.HasPrefix(toPath, "/wiki/") &&
|
|
|
|
|
!s.cfg.ItemListPathRe.MatchString(toPath) &&
|
|
|
|
|
!s.cfg.CraftingPathRe.MatchString(toPath)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if s.cfg.CraftingPathRe.MatchString(fromPath) {
|
|
|
|
|
return strings.HasPrefix(toPath, "/wiki/") &&
|
|
|
|
|
!s.cfg.CraftingPathRe.MatchString(toPath)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) shouldQueueTableLink(e *colly.HTMLElement) bool {
|
|
|
|
|
label := s.clean(e.Text)
|
|
|
|
|
if label == "" {
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
cell := e.DOM.Closest("td, th")
|
|
|
|
|
if cell.Length() == 0 {
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
columnIndex := 0
|
|
|
|
|
for prev := cell.Prev(); prev.Length() > 0; prev = prev.Prev() {
|
|
|
|
|
switch goquery.NodeName(prev) {
|
|
|
|
|
case "td", "th":
|
|
|
|
|
columnIndex++
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return columnIndex <= 1
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) pageKindForPath(pagePath string) string {
|
|
|
|
|
switch {
|
|
|
|
|
case s.cfg.ItemListPathRe.MatchString(pagePath):
|
|
|
|
|
return "item-list"
|
|
|
|
|
case s.cfg.CraftingPathRe.MatchString(pagePath):
|
|
|
|
|
return "crafting"
|
|
|
|
|
default:
|
|
|
|
|
return "article"
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) isLikelyItemPage(pagePath, title string, lcCats []string, doc *goquery.Document) bool {
|
|
|
|
|
if s.cfg.ItemListPathRe.MatchString(pagePath) || s.cfg.CraftingPathRe.MatchString(pagePath) {
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
if strings.Contains(strings.ToLower(title), "/") {
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for _, category := range lcCats {
|
|
|
|
|
if strings.Contains(category, "items") || strings.Contains(category, "food") ||
|
|
|
|
|
strings.Contains(category, "consumables") || strings.Contains(category, "ingredients") ||
|
|
|
|
|
strings.Contains(category, "equipment") || strings.Contains(category, "weapons") ||
|
|
|
|
|
strings.Contains(category, "deployables") {
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
feat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.
- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in READMEfeat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.
- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in README
2026-03-15 17:08:24 +02:00
|
|
|
if !s.hasItemInfobox(doc) {
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
infoboxText := strings.ToLower(s.clean(s.infoboxText(doc)))
|
|
|
|
|
descriptionText := strings.ToLower(s.parseDescription(doc))
|
2026-03-15 16:42:43 +02:00
|
|
|
return strings.Contains(infoboxText, "item details") ||
|
|
|
|
|
strings.Contains(infoboxText, "consumable details") ||
|
|
|
|
|
strings.Contains(infoboxText, "equipment details") ||
|
feat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.
- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in READMEfeat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.
- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in README
2026-03-15 17:08:24 +02:00
|
|
|
strings.Contains(infoboxText, "weapon") ||
|
|
|
|
|
strings.Contains(descriptionText, " is an item in outward") ||
|
|
|
|
|
strings.Contains(descriptionText, " is a weapon in outward") ||
|
|
|
|
|
strings.Contains(descriptionText, " is a consumable in outward") ||
|
|
|
|
|
strings.Contains(descriptionText, " is an ingredient in outward") ||
|
|
|
|
|
strings.Contains(descriptionText, " is a shield in outward")
|
2026-03-15 16:42:43 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) isEffectPage(pagePath, title string, lcCats []string) bool {
|
|
|
|
|
if strings.Contains(strings.ToLower(title), "effect") {
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for _, category := range lcCats {
|
|
|
|
|
if strings.Contains(category, "effects") {
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return strings.Contains(strings.ToLower(pagePath), "/effect")
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) parseItemPage(doc *goquery.Document, title, pageURL string, categories []string) model.Item {
|
|
|
|
|
item := model.Item{
|
|
|
|
|
Name: title,
|
|
|
|
|
URL: pageURL,
|
|
|
|
|
Categories: categories,
|
feat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.
- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in READMEfeat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.
- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in README
2026-03-15 17:08:24 +02:00
|
|
|
Infobox: s.parseInfobox(doc),
|
|
|
|
|
ImageURL: s.parseImageURL(doc),
|
2026-03-15 16:42:43 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
item.Description = s.parseDescription(doc)
|
|
|
|
|
item.Effects, item.EffectLinks = s.parseEffectsSection(doc)
|
|
|
|
|
item.Recipes = s.parseRecipesFromPage(doc, title)
|
feat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.
- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in READMEfeat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.
- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in README
2026-03-15 17:08:24 +02:00
|
|
|
item.Tables = s.parseContentTables(doc)
|
2026-03-15 16:42:43 +02:00
|
|
|
|
|
|
|
|
return item
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) parseEffectPage(doc *goquery.Document, title, pageURL string, categories []string) model.Effect {
|
|
|
|
|
return model.Effect{
|
|
|
|
|
Name: title,
|
|
|
|
|
URL: pageURL,
|
|
|
|
|
Categories: categories,
|
feat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.
- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in READMEfeat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.
- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in README
2026-03-15 17:08:24 +02:00
|
|
|
Infobox: s.parseInfobox(doc),
|
2026-03-15 16:42:43 +02:00
|
|
|
Description: s.parseDescription(doc),
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
feat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.
- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in READMEfeat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.
- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in README
2026-03-15 17:08:24 +02:00
|
|
|
func (s *Scraper) hasItemInfobox(doc *goquery.Document) bool {
|
|
|
|
|
return doc.Find("aside.portable-infobox").Length() > 0 || doc.Find("table.infoboxtable").Length() > 0
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) infoboxText(doc *goquery.Document) string {
|
|
|
|
|
return s.clean(doc.Find("aside.portable-infobox, table.infoboxtable").First().Text())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) parseInfobox(doc *goquery.Document) map[string]string {
|
2026-03-15 16:42:43 +02:00
|
|
|
out := map[string]string{}
|
|
|
|
|
|
|
|
|
|
doc.Find("aside.portable-infobox .pi-item").Each(func(_ int, section *goquery.Selection) {
|
|
|
|
|
label := s.clean(section.Find(".pi-data-label").First().Text())
|
|
|
|
|
value := s.clean(section.Find(".pi-data-value").First().Text())
|
|
|
|
|
|
|
|
|
|
if label == "" {
|
|
|
|
|
label = s.clean(section.Find("h3").First().Text())
|
|
|
|
|
}
|
|
|
|
|
if value == "" {
|
|
|
|
|
value = s.clean(section.Find("div").Last().Text())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if label != "" && value != "" {
|
|
|
|
|
out[label] = value
|
|
|
|
|
}
|
|
|
|
|
})
|
|
|
|
|
|
feat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.
- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in READMEfeat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.
- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in README
2026-03-15 17:08:24 +02:00
|
|
|
doc.Find("table.infoboxtable tr").Each(func(_ int, row *goquery.Selection) {
|
|
|
|
|
cells := row.Find("th, td")
|
|
|
|
|
if cells.Length() < 2 {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
label := s.clean(cells.First().Text())
|
|
|
|
|
if label == "" {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
var values []string
|
|
|
|
|
cells.Slice(1, cells.Length()).Each(func(_ int, cell *goquery.Selection) {
|
|
|
|
|
text := s.clean(cell.Text())
|
|
|
|
|
if text != "" {
|
|
|
|
|
values = append(values, text)
|
|
|
|
|
}
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
if len(values) == 0 {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
out[label] = strings.Join(values, " | ")
|
|
|
|
|
})
|
|
|
|
|
|
2026-03-15 16:42:43 +02:00
|
|
|
return out
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// parseCategories collects the page's category names, deduplicated in
// discovery order. It tries three sources in turn, falling through only
// when the previous source produced nothing:
//  1. visible category links in the page header / category listing / tabs,
//  2. the wgCategories JSON array embedded in an inline <script>,
//  3. the "in:" breadcrumb text scraped from the whole body.
func (s *Scraper) parseCategories(doc *goquery.Document) []string {
	var categories []string
	seen := map[string]bool{}

	// Source 1: rendered category links.
	doc.Find(".page-header__categories a, .category-page__member-link, .wds-tabs__tab-label a").Each(func(_ int, node *goquery.Selection) {
		text := s.clean(node.Text())
		if text != "" && !seen[text] {
			seen[text] = true
			categories = append(categories, text)
		}
	})

	// Source 2: the MediaWiki config script exposes wgCategories = [...].
	if len(categories) == 0 {
		doc.Find("script").EachWithBreak(func(_ int, node *goquery.Selection) bool {
			text := node.Text()
			index := strings.Index(text, "wgCategories")
			if index < 0 {
				return true
			}

			// Grab the first bracketed span after the wgCategories token.
			arrayStart := strings.Index(text[index:], "[")
			arrayEnd := strings.Index(text[index:], "]")
			if arrayStart < 0 || arrayEnd < 0 || arrayEnd <= arrayStart {
				return true
			}

			raw := text[index+arrayStart : index+arrayEnd+1]
			var parsed []string
			// Best effort: a non-JSON-compatible array is silently skipped.
			if err := json.Unmarshal([]byte(raw), &parsed); err != nil {
				return true
			}

			for _, category := range parsed {
				category = s.clean(category)
				if category != "" && !seen[category] {
					seen[category] = true
					categories = append(categories, category)
				}
			}

			// Keep scanning further scripts only while nothing was found.
			return len(categories) == 0
		})
	}

	// Source 3: the "in: A, B, C" breadcrumb rendered near the page title.
	// NOTE(review): this splits everything after "in:" to the end of the
	// cleaned body text on commas, so unrelated trailing body text can leak
	// in as extra "categories" — confirm whether a tighter bound is needed.
	if len(categories) == 0 {
		headerText := s.clean(doc.Find("body").Text())
		if idx := strings.Index(headerText, "in:"); idx >= 0 {
			line := strings.TrimPrefix(headerText[idx:], "in:")
			for _, part := range strings.Split(line, ",") {
				part = s.clean(part)
				if part != "" && !seen[part] {
					seen[part] = true
					categories = append(categories, part)
				}
			}
		}
	}

	return categories
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) parseDescription(doc *goquery.Document) string {
|
|
|
|
|
var description string
|
|
|
|
|
|
|
|
|
|
doc.Find(".mw-parser-output > p").EachWithBreak(func(_ int, paragraph *goquery.Selection) bool {
|
|
|
|
|
text := s.clean(paragraph.Text())
|
|
|
|
|
if text == "" {
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
lower := strings.ToLower(text)
|
|
|
|
|
if strings.Contains(lower, "is a") || strings.Contains(lower, "is an") || len(text) > 30 {
|
|
|
|
|
description = text
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
return description
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) parseEffectsSection(doc *goquery.Document) ([]string, []string) {
|
|
|
|
|
var effects []string
|
|
|
|
|
var effectLinks []string
|
|
|
|
|
|
|
|
|
|
seenText := map[string]bool{}
|
|
|
|
|
seenLink := map[string]bool{}
|
|
|
|
|
section := s.findSection(doc, "Effects")
|
|
|
|
|
if section.Length() == 0 {
|
|
|
|
|
return effects, effectLinks
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
s.walkSectionUntilNextHeading(section, func(node *goquery.Selection) {
|
|
|
|
|
node.Find("li").Each(func(_ int, item *goquery.Selection) {
|
|
|
|
|
text := s.clean(item.Text())
|
|
|
|
|
if text != "" && !seenText[text] {
|
|
|
|
|
seenText[text] = true
|
|
|
|
|
effects = append(effects, text)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
item.Find("a[href^='/wiki/']").Each(func(_ int, anchor *goquery.Selection) {
|
|
|
|
|
href, _ := anchor.Attr("href")
|
|
|
|
|
label := s.clean(anchor.Text())
|
|
|
|
|
if href != "" && label != "" && s.looksLikeEffectLink(href, label) && !seenLink[href] {
|
|
|
|
|
seenLink[href] = true
|
|
|
|
|
effectLinks = append(effectLinks, href)
|
|
|
|
|
}
|
|
|
|
|
})
|
|
|
|
|
})
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
return effects, effectLinks
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// parseRecipesFromPage scans every table on the page for crafting recipes.
// A table qualifies when its first (header) row mentions both "result" and
// "ingredient"; each later row becomes one model.Recipe tagged with
// pageTitle as its source. Infobox tables are skipped so stat blocks are
// never misread as recipe tables.
func (s *Scraper) parseRecipesFromPage(doc *goquery.Document, pageTitle string) []model.Recipe {
	var recipes []model.Recipe

	doc.Find("table").Each(func(_ int, table *goquery.Selection) {
		// Legacy infobox tables may contain the words "result"/"ingredient"
		// without being recipe tables; exclude them outright.
		if table.HasClass("infoboxtable") {
			return
		}

		// Only tables whose header row names both a result and an
		// ingredient column are treated as recipe tables.
		headerText := strings.ToLower(s.clean(table.Find("tr").First().Text()))
		if !(strings.Contains(headerText, "result") && strings.Contains(headerText, "ingredient")) {
			return
		}

		table.Find("tr").Each(func(i int, row *goquery.Selection) {
			// Row 0 is the header row.
			if i == 0 {
				return
			}

			cells := row.Find("th, td")
			if cells.Length() < 2 {
				return
			}

			// Column layout: result, ingredients, optional station.
			// Eq(2) yields an empty selection when the station column is
			// missing, and Text() on it is simply "".
			resultCell := cells.Eq(0)
			ingredientCell := cells.Eq(1)
			stationCell := cells.Eq(2)

			resultText := s.clean(resultCell.Text())
			if resultText == "" {
				return
			}

			// Peel a leading amount off the result (e.g. "3x Bread" ->
			// count "3", name "Bread") using the configured prefix regex.
			resultCount := ""
			if matches := s.cfg.AmountPrefixRe.FindStringSubmatch(resultText); len(matches) > 1 {
				resultCount = matches[1]
				resultText = s.clean(strings.TrimPrefix(resultText, matches[0]))
			}

			// Prefer explicit <li> entries for ingredients...
			var ingredients []string
			ingredientCell.Find("li").Each(func(_ int, ingredient *goquery.Selection) {
				text := s.clean(ingredient.Text())
				if text != "" {
					ingredients = append(ingredients, text)
				}
			})

			// ...falling back to raw cell lines when there is no list markup.
			if len(ingredients) == 0 {
				for _, line := range s.splitLines(ingredientCell.Text()) {
					if line != "" {
						ingredients = append(ingredients, line)
					}
				}
			}

			recipes = append(recipes, model.Recipe{
				Result:      resultText,
				ResultCount: resultCount,
				Ingredients: ingredients,
				Station:     s.clean(stationCell.Text()),
				SourcePage:  pageTitle,
			})
		})
	})

	return recipes
}
|
|
|
|
|
|
feat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.
- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in READMEfeat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.
- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in README
2026-03-15 17:08:24 +02:00
|
|
|
func (s *Scraper) parseImageURL(doc *goquery.Document) string {
|
|
|
|
|
image := doc.Find("aside.portable-infobox img, table.infoboxtable img").First()
|
|
|
|
|
if image.Length() == 0 {
|
|
|
|
|
return ""
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for _, attr := range []string{"src", "data-src"} {
|
|
|
|
|
if raw, ok := image.Attr(attr); ok {
|
|
|
|
|
return s.normalizeImageURL(raw)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return ""
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) normalizeImageURL(raw string) string {
|
|
|
|
|
raw = strings.TrimSpace(raw)
|
2026-03-15 18:23:58 +02:00
|
|
|
if raw == "" {
|
feat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.
- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in READMEfeat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.
- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in README
2026-03-15 17:08:24 +02:00
|
|
|
return ""
|
|
|
|
|
}
|
2026-03-15 18:23:58 +02:00
|
|
|
|
|
|
|
|
if strings.HasPrefix(raw, "//") {
|
|
|
|
|
raw = "https:" + raw
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
query := ""
|
|
|
|
|
if idx := strings.Index(raw, "?"); idx >= 0 {
|
|
|
|
|
query = raw[idx:]
|
|
|
|
|
raw = raw[:idx]
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const scaledMarker = "/revision/latest/scale-to-width-down/"
|
|
|
|
|
if idx := strings.Index(raw, scaledMarker); idx >= 0 {
|
|
|
|
|
raw = raw[:idx] + "/revision/latest"
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return raw + query
|
feat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.
- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in READMEfeat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from
interruptions instead of restarting from scratch.
- introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling
- expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction
- skip infobox tables during recipe parsing to avoid false recipe matches
- add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in README
2026-03-15 17:08:24 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// parseContentTables extracts every simple data table from the article body
// (".mw-parser-output") into model.Table values. Infobox tables and tables
// that contain nested tables are skipped, as are tables without at least a
// header row plus one data row, or without any non-empty data rows.
func (s *Scraper) parseContentTables(doc *goquery.Document) []model.Table {
	var tables []model.Table

	doc.Find(".mw-parser-output table").Each(func(_ int, table *goquery.Selection) {
		// Skip infoboxes and layout tables that wrap other tables.
		if table.HasClass("infoboxtable") || table.Find("table").Length() > 0 {
			return
		}

		// Require a header row plus at least one data row.
		rows := table.Find("tr")
		if rows.Length() < 2 {
			return
		}

		headerCells := rows.First().Find("th, td")
		if headerCells.Length() == 0 {
			return
		}

		// Collect display headers and their normalized map keys in parallel;
		// blank header cells get a stable positional display name.
		headers := make([]string, 0, headerCells.Length())
		headerKeys := make([]string, 0, headerCells.Length())
		headerCells.Each(func(i int, cell *goquery.Selection) {
			header := s.clean(cell.Text())
			if header == "" {
				header = "Column " + strconv.Itoa(i+1)
			}
			headers = append(headers, header)
			headerKeys = append(headerKeys, s.tableHeaderKey(header, i))
		})

		parsedTable := model.Table{
			Title:   s.tableTitle(table),
			Headers: headers,
		}

		// Walk the data rows (everything after the header row).
		rows.Slice(1, rows.Length()).Each(func(_ int, row *goquery.Selection) {
			cells := row.Find("th, td")
			if cells.Length() == 0 {
				return
			}

			// rawRow keeps every cell positionally; rowData maps
			// header keys to non-empty cell values only.
			rawRow := make([]string, 0, cells.Length())
			rowData := map[string]string{}

			cells.Each(func(i int, cell *goquery.Selection) {
				text := s.clean(cell.Text())
				rawRow = append(rawRow, text)
				key := s.tableColumnKey(headerKeys, headers, i)
				if text != "" {
					rowData[key] = text
				}
			})

			// Drop rows whose cells are all empty.
			if s.rowIsEmpty(rawRow) {
				return
			}

			parsedTable.RawRows = append(parsedTable.RawRows, rawRow)
			if len(rowData) > 0 {
				parsedTable.Rows = append(parsedTable.Rows, rowData)
			}
		})

		// A table that produced no usable rows is noise; skip it entirely.
		if len(parsedTable.RawRows) == 0 {
			return
		}

		tables = append(tables, parsedTable)
	})

	return tables
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) tableTitle(table *goquery.Selection) string {
|
|
|
|
|
var parts []string
|
|
|
|
|
|
|
|
|
|
for prev := table.Prev(); prev.Length() > 0; prev = prev.Prev() {
|
|
|
|
|
switch goquery.NodeName(prev) {
|
|
|
|
|
case "h4", "h3":
|
|
|
|
|
text := strings.TrimSpace(strings.TrimSuffix(s.clean(prev.Text()), "[]"))
|
|
|
|
|
if text != "" {
|
|
|
|
|
parts = append([]string{text}, parts...)
|
|
|
|
|
}
|
|
|
|
|
case "h2":
|
|
|
|
|
text := strings.TrimSpace(strings.TrimSuffix(s.clean(prev.Text()), "[]"))
|
|
|
|
|
if text != "" {
|
|
|
|
|
parts = append([]string{text}, parts...)
|
|
|
|
|
}
|
|
|
|
|
return strings.Join(parts, " / ")
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return strings.Join(parts, " / ")
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) tableHeaderKey(header string, index int) string {
|
|
|
|
|
header = strings.TrimSpace(header)
|
|
|
|
|
if header == "" {
|
|
|
|
|
return s.fallbackColumnKey(index)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
header = strings.ToLower(header)
|
|
|
|
|
header = strings.ReplaceAll(header, " ", "_")
|
|
|
|
|
header = strings.ReplaceAll(header, "/", "_")
|
|
|
|
|
header = strings.ReplaceAll(header, "-", "_")
|
|
|
|
|
header = s.cfg.WhitespaceRe.ReplaceAllString(header, "_")
|
|
|
|
|
header = strings.Trim(header, "_")
|
|
|
|
|
if header == "" {
|
|
|
|
|
return s.fallbackColumnKey(index)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return header
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) tableColumnKey(headerKeys, headers []string, index int) string {
|
|
|
|
|
if index < len(headerKeys) && headerKeys[index] != "" {
|
|
|
|
|
return headerKeys[index]
|
|
|
|
|
}
|
|
|
|
|
if index < len(headers) && headers[index] != "" {
|
|
|
|
|
return headers[index]
|
|
|
|
|
}
|
|
|
|
|
return s.fallbackColumnKey(index)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) fallbackColumnKey(index int) string {
|
|
|
|
|
return "column_" + strconv.Itoa(index+1)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) rowIsEmpty(row []string) bool {
|
|
|
|
|
for _, value := range row {
|
|
|
|
|
if value != "" {
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
|
// 2026-03-15 16:42:43 +02:00
|
|
|
func (s *Scraper) findSection(doc *goquery.Document, title string) *goquery.Selection {
|
|
|
|
|
var found *goquery.Selection
|
|
|
|
|
|
|
|
|
|
doc.Find(".mw-parser-output h2, .mw-parser-output h3, .mw-parser-output h4").EachWithBreak(func(_ int, section *goquery.Selection) bool {
|
|
|
|
|
text := strings.TrimSpace(strings.TrimSuffix(s.clean(section.Text()), "[]"))
|
|
|
|
|
if strings.EqualFold(text, title) {
|
|
|
|
|
found = section
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
return true
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
if found == nil {
|
|
|
|
|
return &goquery.Selection{}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return found
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) walkSectionUntilNextHeading(start *goquery.Selection, fn func(*goquery.Selection)) {
|
|
|
|
|
for section := start.Next(); section.Length() > 0; section = section.Next() {
|
|
|
|
|
if goquery.NodeName(section) == "h2" || goquery.NodeName(section) == "h3" || goquery.NodeName(section) == "h4" {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
fn(section)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) looksLikeEffectLink(href, label string) bool {
|
|
|
|
|
lowerHref := strings.ToLower(href)
|
|
|
|
|
lowerLabel := strings.ToLower(label)
|
|
|
|
|
|
|
|
|
|
if strings.Contains(lowerHref, "/wiki/") && (strings.Contains(lowerHref, "effect") || strings.Contains(lowerLabel, "effect")) {
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
knownHints := []string{
|
|
|
|
|
"bleeding", "burning", "poisoned", "possessed", "discipline", "rage",
|
|
|
|
|
"sapped", "scorched", "curse", "doomed", "chill", "warm", "cool",
|
|
|
|
|
"barrier", "protection", "imbue", "energized", "shimmer",
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for _, hint := range knownHints {
|
|
|
|
|
if strings.Contains(lowerLabel, hint) {
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) absoluteWikiURL(href string) string {
|
|
|
|
|
if href == "" {
|
|
|
|
|
return ""
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
|
|
|
|
|
return href
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if strings.HasPrefix(href, "/wiki/") {
|
|
|
|
|
return s.cfg.BaseURL + href
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return ""
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) flattenItems() []model.Item {
|
|
|
|
|
out := make([]model.Item, 0, len(s.items))
|
|
|
|
|
for _, item := range s.items {
|
|
|
|
|
out = append(out, *item)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
sort.Slice(out, func(i, j int) bool {
|
|
|
|
|
return strings.ToLower(out[i].Name) < strings.ToLower(out[j].Name)
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
return out
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) flattenEffects() []model.Effect {
|
|
|
|
|
out := make([]model.Effect, 0, len(s.effects))
|
|
|
|
|
for _, effect := range s.effects {
|
|
|
|
|
out = append(out, *effect)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
sort.Slice(out, func(i, j int) bool {
|
|
|
|
|
return strings.ToLower(out[i].Name) < strings.ToLower(out[j].Name)
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
return out
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) clean(value string) string {
|
|
|
|
|
value = strings.ReplaceAll(value, "\u00a0", " ")
|
|
|
|
|
value = strings.ReplaceAll(value, "\n", " ")
|
|
|
|
|
value = strings.ReplaceAll(value, "\t", " ")
|
|
|
|
|
value = strings.TrimSpace(value)
|
|
|
|
|
return s.cfg.WhitespaceRe.ReplaceAllString(value, " ")
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) splitLines(value string) []string {
|
|
|
|
|
value = strings.ReplaceAll(value, "\u00a0", " ")
|
|
|
|
|
raw := strings.Split(value, "\n")
|
|
|
|
|
out := make([]string, 0, len(raw))
|
|
|
|
|
|
|
|
|
|
for _, line := range raw {
|
|
|
|
|
line = s.clean(line)
|
|
|
|
|
if line != "" {
|
|
|
|
|
out = append(out, line)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return out
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) lowerSlice(in []string) []string {
|
|
|
|
|
out := make([]string, 0, len(in))
|
|
|
|
|
for _, value := range in {
|
|
|
|
|
out = append(out, strings.ToLower(value))
|
|
|
|
|
}
|
|
|
|
|
return out
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Scraper) debugURLName(raw string) string {
|
|
|
|
|
parsed, err := url.Parse(raw)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return raw
|
|
|
|
|
}
|
|
|
|
|
return path.Base(parsed.Path)
|
|
|
|
|
}
|