Files
outward-scrapper/internal/scraper/parse.go

488 lines
12 KiB
Go
Raw Normal View History

2026-03-15 16:42:43 +02:00
package scraper
import (
"math/rand"
"net/url"
"path"
"sort"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/gocolly/colly/v2"
"scrappr/internal/model"
)
// shouldVisit reports whether raw is a crawlable wiki URL: it must parse,
// live on the allowed domain, sit under /wiki/, and not be excluded by
// either the exact-path or prefix ignore lists in the config.
func (s *Scraper) shouldVisit(raw string) bool {
	if raw == "" {
		return false
	}
	u, err := url.Parse(raw)
	if err != nil {
		return false
	}
	switch {
	case u.Host != s.cfg.AllowedDomain:
		return false
	case !strings.HasPrefix(u.Path, "/wiki/"):
		return false
	case s.cfg.IgnoredExact[u.Path]:
		return false
	}
	for _, prefix := range s.cfg.IgnoredPrefixes {
		if strings.HasPrefix(u.Path, prefix) {
			return false
		}
	}
	return true
}
// applyBrowserHeaders decorates the outgoing request so it resembles a
// normal browser navigation: a randomized User-Agent, the usual Accept /
// Sec-Fetch headers, and a default Referer when none was set upstream.
func (s *Scraper) applyBrowserHeaders(r *colly.Request) {
	pairs := [][2]string{
		{"User-Agent", s.randomUserAgent()},
		{"Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"},
		{"Accept-Language", "en-US,en;q=0.9"},
		{"Cache-Control", "max-age=0"},
		{"DNT", "1"},
		{"Sec-Fetch-Dest", "document"},
		{"Sec-Fetch-Mode", "navigate"},
		{"Sec-Fetch-Site", "same-origin"},
		{"Sec-Fetch-User", "?1"},
		{"Upgrade-Insecure-Requests", "1"},
	}
	for _, p := range pairs {
		r.Headers.Set(p[0], p[1])
	}
	// Fill in a Referer only when the caller has not already provided one.
	if r.Headers.Get("Referer") == "" {
		r.Headers.Set("Referer", s.cfg.BrowserReferrer)
	}
}
// randomUserAgent returns a User-Agent string chosen uniformly at random
// from the configured pool.
//
// Guard: rand.Intn panics when its argument is <= 0, so an empty or nil
// BrowserAgents list previously crashed the scraper. Return "" instead
// and let applyBrowserHeaders set an empty header in that misconfigured
// case.
func (s *Scraper) randomUserAgent() string {
	if len(s.cfg.BrowserAgents) == 0 {
		return ""
	}
	return s.cfg.BrowserAgents[rand.Intn(len(s.cfg.BrowserAgents))]
}
// shouldQueueFromPage decides whether a link found on fromPath should be
// enqueued. Item-list pages may fan out to regular /wiki/ articles (but
// not to other list or crafting pages); crafting pages may fan out to any
// /wiki/ page except other crafting pages. Links discovered anywhere else
// are never queued.
func (s *Scraper) shouldQueueFromPage(fromPath, toURL string) bool {
	target, err := url.Parse(toURL)
	if err != nil {
		return false
	}
	dest := target.Path
	// Every queueable destination must live under /wiki/.
	if !strings.HasPrefix(dest, "/wiki/") {
		return false
	}
	switch {
	case s.cfg.ItemListPathRe.MatchString(fromPath):
		return !s.cfg.ItemListPathRe.MatchString(dest) &&
			!s.cfg.CraftingPathRe.MatchString(dest)
	case s.cfg.CraftingPathRe.MatchString(fromPath):
		return !s.cfg.CraftingPathRe.MatchString(dest)
	default:
		return false
	}
}
// shouldQueueTableLink reports whether a link element carries non-empty
// text and sits in one of the first two columns of a table row. Wiki data
// tables put the linked entity in the leading columns, so links in later
// columns are skipped.
func (s *Scraper) shouldQueueTableLink(e *colly.HTMLElement) bool {
	if s.clean(e.Text) == "" {
		return false
	}
	cell := e.DOM.Closest("td, th")
	if cell.Length() == 0 {
		return false
	}
	// Derive the zero-based column index by counting preceding cells.
	col := 0
	for sib := cell.Prev(); sib.Length() > 0; sib = sib.Prev() {
		if name := goquery.NodeName(sib); name == "td" || name == "th" {
			col++
		}
	}
	return col <= 1
}
// pageKindForPath classifies a wiki path as "item-list", "crafting", or
// the default "article", using the configured path patterns.
func (s *Scraper) pageKindForPath(pagePath string) string {
	if s.cfg.ItemListPathRe.MatchString(pagePath) {
		return "item-list"
	}
	if s.cfg.CraftingPathRe.MatchString(pagePath) {
		return "crafting"
	}
	return "article"
}
// isLikelyItemPage applies heuristics to decide whether the page describes
// a single game item: it must not be a list/crafting page or a subpage
// (slash in the title), must carry a portable infobox, and must either be
// filed under an item-ish category or use item-ish wording inside the
// infobox itself.
func (s *Scraper) isLikelyItemPage(pagePath, title string, lcCats []string, doc *goquery.Document) bool {
	if s.cfg.ItemListPathRe.MatchString(pagePath) || s.cfg.CraftingPathRe.MatchString(pagePath) {
		return false
	}
	if strings.Contains(strings.ToLower(title), "/") {
		return false
	}
	infobox := doc.Find("aside.portable-infobox")
	if infobox.Length() == 0 {
		return false
	}
	// Category-based signal: any item-ish keyword in any category.
	keywords := []string{"items", "food", "consumables", "ingredients", "equipment", "weapons", "deployables"}
	for _, cat := range lcCats {
		for _, kw := range keywords {
			if strings.Contains(cat, kw) {
				return true
			}
		}
	}
	// Fallback signal: item-ish markers inside the infobox text itself.
	text := strings.ToLower(s.clean(infobox.Text()))
	for _, marker := range []string{"item details", "consumable details", "equipment details", "weapon"} {
		if strings.Contains(text, marker) {
			return true
		}
	}
	return false
}
// isEffectPage reports whether the page describes a status effect, judged
// by "effect" in the title, an "effects" category, or "/effect" in the
// path. All three checks are pure substring tests, so their order does
// not matter.
func (s *Scraper) isEffectPage(pagePath, title string, lcCats []string) bool {
	switch {
	case strings.Contains(strings.ToLower(title), "effect"):
		return true
	case strings.Contains(strings.ToLower(pagePath), "/effect"):
		return true
	}
	for _, cat := range lcCats {
		if strings.Contains(cat, "effects") {
			return true
		}
	}
	return false
}
// parseItemPage assembles a model.Item from an article page: infobox
// key/values, lead description, effects section, and any recipe tables.
func (s *Scraper) parseItemPage(doc *goquery.Document, title, pageURL string, categories []string) model.Item {
	effects, effectLinks := s.parseEffectsSection(doc)
	return model.Item{
		Name:        title,
		URL:         pageURL,
		Categories:  categories,
		Infobox:     s.parsePortableInfobox(doc),
		Description: s.parseDescription(doc),
		Effects:     effects,
		EffectLinks: effectLinks,
		Recipes:     s.parseRecipesFromPage(doc, title),
	}
}
// parseEffectPage assembles a model.Effect from an effect article page.
func (s *Scraper) parseEffectPage(doc *goquery.Document, title, pageURL string, categories []string) model.Effect {
	effect := model.Effect{
		Name:       title,
		URL:        pageURL,
		Categories: categories,
	}
	effect.Infobox = s.parsePortableInfobox(doc)
	effect.Description = s.parseDescription(doc)
	return effect
}
// parsePortableInfobox flattens the page's portable infobox into a
// label -> value map. Each .pi-item row is read via its data-label /
// data-value nodes, with h3 / last-div fallbacks for rows using a
// different markup shape; rows missing either half are dropped.
func (s *Scraper) parsePortableInfobox(doc *goquery.Document) map[string]string {
	fields := map[string]string{}
	doc.Find("aside.portable-infobox .pi-item").Each(func(_ int, row *goquery.Selection) {
		key := s.clean(row.Find(".pi-data-label").First().Text())
		if key == "" {
			key = s.clean(row.Find("h3").First().Text())
		}
		val := s.clean(row.Find(".pi-data-value").First().Text())
		if val == "" {
			val = s.clean(row.Find("div").Last().Text())
		}
		if key != "" && val != "" {
			fields[key] = val
		}
	})
	return fields
}
// parseCategories collects the page's category names, de-duplicated in
// document order, from the usual header/member/tab link selectors. When
// no links are found it falls back to scraping the "in:" category line
// out of the raw body text.
func (s *Scraper) parseCategories(doc *goquery.Document) []string {
	seen := map[string]bool{}
	var out []string
	add := func(name string) {
		if name != "" && !seen[name] {
			seen[name] = true
			out = append(out, name)
		}
	}
	doc.Find(".page-header__categories a, .category-page__member-link, .wds-tabs__tab-label a").Each(func(_ int, link *goquery.Selection) {
		add(s.clean(link.Text()))
	})
	if len(out) > 0 {
		return out
	}
	// Fallback: some skins render categories as a comma-separated "in:"
	// line in the body text rather than as links.
	body := s.clean(doc.Find("body").Text())
	if idx := strings.Index(body, "in:"); idx >= 0 {
		for _, part := range strings.Split(strings.TrimPrefix(body[idx:], "in:"), ",") {
			add(s.clean(part))
		}
	}
	return out
}
// parseDescription returns the first substantive lead paragraph: the
// first non-empty <p> under .mw-parser-output that either reads like a
// definition ("is a" / "is an") or is longer than 30 characters.
func (s *Scraper) parseDescription(doc *goquery.Document) string {
	description := ""
	doc.Find(".mw-parser-output > p").EachWithBreak(func(_ int, p *goquery.Selection) bool {
		text := s.clean(p.Text())
		if text == "" {
			return true // skip empty paragraphs, keep scanning
		}
		lower := strings.ToLower(text)
		if len(text) > 30 || strings.Contains(lower, "is a") || strings.Contains(lower, "is an") {
			description = text
			return false // found it; stop iterating
		}
		return true
	})
	return description
}
// parseEffectsSection walks the content between the "Effects" heading and
// the next heading, returning the de-duplicated list-item texts and the
// /wiki/ links inside them that look like effect references. Both slices
// are nil when the page has no Effects section.
func (s *Scraper) parseEffectsSection(doc *goquery.Document) ([]string, []string) {
	var texts, links []string
	heading := s.findSection(doc, "Effects")
	if heading.Length() == 0 {
		return texts, links
	}
	haveText := map[string]bool{}
	haveLink := map[string]bool{}
	s.walkSectionUntilNextHeading(heading, func(block *goquery.Selection) {
		block.Find("li").Each(func(_ int, li *goquery.Selection) {
			if t := s.clean(li.Text()); t != "" && !haveText[t] {
				haveText[t] = true
				texts = append(texts, t)
			}
			li.Find("a[href^='/wiki/']").Each(func(_ int, a *goquery.Selection) {
				href, _ := a.Attr("href")
				label := s.clean(a.Text())
				if href == "" || label == "" || haveLink[href] {
					return
				}
				if s.looksLikeEffectLink(href, label) {
					haveLink[href] = true
					links = append(links, href)
				}
			})
		})
	})
	return texts, links
}
// parseRecipesFromPage extracts crafting recipes from any table on the
// page whose header row mentions both "result" and "ingredient". For each
// data row, cell 0 is the result (with an optional leading amount matched
// by AmountPrefixRe), cell 1 holds the ingredients (as <li> items, or raw
// text lines as a fallback), and cell 2, when present, names the station.
func (s *Scraper) parseRecipesFromPage(doc *goquery.Document, pageTitle string) []model.Recipe {
	var recipes []model.Recipe
	doc.Find("table").Each(func(_ int, table *goquery.Selection) {
		header := strings.ToLower(s.clean(table.Find("tr").First().Text()))
		if !strings.Contains(header, "result") || !strings.Contains(header, "ingredient") {
			return // not a recipe table
		}
		table.Find("tr").Each(func(rowIdx int, row *goquery.Selection) {
			if rowIdx == 0 {
				return // header row
			}
			cells := row.Find("th, td")
			if cells.Length() < 2 {
				return
			}
			result := s.clean(cells.Eq(0).Text())
			if result == "" {
				return
			}
			// Split an optional "<amount> <name>" prefix off the result.
			count := ""
			if m := s.cfg.AmountPrefixRe.FindStringSubmatch(result); len(m) > 1 {
				count = m[1]
				result = s.clean(strings.TrimPrefix(result, m[0]))
			}
			ingredientCell := cells.Eq(1)
			var ingredients []string
			ingredientCell.Find("li").Each(func(_ int, li *goquery.Selection) {
				if t := s.clean(li.Text()); t != "" {
					ingredients = append(ingredients, t)
				}
			})
			if len(ingredients) == 0 {
				// No list markup; fall back to one ingredient per line.
				for _, line := range s.splitLines(ingredientCell.Text()) {
					if line != "" {
						ingredients = append(ingredients, line)
					}
				}
			}
			recipes = append(recipes, model.Recipe{
				Result:      result,
				ResultCount: count,
				Ingredients: ingredients,
				// Eq(2) yields an empty selection on two-column tables,
				// so Station cleanly ends up "".
				Station:    s.clean(cells.Eq(2).Text()),
				SourcePage: pageTitle,
			})
		})
	})
	return recipes
}
// findSection returns the first h2/h3/h4 inside .mw-parser-output whose
// cleaned text (minus a trailing "[]" left over from stripped edit links)
// equals title case-insensitively. When no heading matches, an empty
// selection is returned so callers can simply test .Length().
func (s *Scraper) findSection(doc *goquery.Document, title string) *goquery.Selection {
	var match *goquery.Selection
	doc.Find(".mw-parser-output h2, .mw-parser-output h3, .mw-parser-output h4").EachWithBreak(func(_ int, heading *goquery.Selection) bool {
		label := strings.TrimSpace(strings.TrimSuffix(s.clean(heading.Text()), "[]"))
		if !strings.EqualFold(label, title) {
			return true // keep looking
		}
		match = heading
		return false // stop at the first match
	})
	if match != nil {
		return match
	}
	return &goquery.Selection{}
}
// walkSectionUntilNextHeading invokes fn on each sibling element after
// start, stopping at the first h2/h3/h4 (the next section heading) or at
// the end of the parent.
func (s *Scraper) walkSectionUntilNextHeading(start *goquery.Selection, fn func(*goquery.Selection)) {
	for node := start.Next(); node.Length() > 0; node = node.Next() {
		switch goquery.NodeName(node) {
		case "h2", "h3", "h4":
			return
		}
		fn(node)
	}
}
// looksLikeEffectLink reports whether an anchor plausibly points at a
// status-effect page: either "effect" appears in the /wiki/ href or in
// the label, or the label contains one of the known effect names.
func (s *Scraper) looksLikeEffectLink(href, label string) bool {
	h := strings.ToLower(href)
	l := strings.ToLower(label)
	if strings.Contains(h, "/wiki/") && (strings.Contains(h, "effect") || strings.Contains(l, "effect")) {
		return true
	}
	for _, hint := range []string{
		"bleeding", "burning", "poisoned", "possessed", "discipline", "rage",
		"sapped", "scorched", "curse", "doomed", "chill", "warm", "cool",
		"barrier", "protection", "imbue", "energized", "shimmer",
	} {
		if strings.Contains(l, hint) {
			return true
		}
	}
	return false
}
// absoluteWikiURL resolves href against the configured wiki base URL.
// Absolute http(s) URLs pass through untouched, /wiki/ paths get the
// base prepended, and anything else (empty, anchors, other relative
// links) maps to "".
func (s *Scraper) absoluteWikiURL(href string) string {
	switch {
	case href == "":
		return ""
	case strings.HasPrefix(href, "http://"), strings.HasPrefix(href, "https://"):
		return href
	case strings.HasPrefix(href, "/wiki/"):
		return s.cfg.BaseURL + href
	default:
		return ""
	}
}
// flattenItems copies the collected item pointers into a value slice
// sorted case-insensitively by item name.
func (s *Scraper) flattenItems() []model.Item {
	items := make([]model.Item, 0, len(s.items))
	for _, it := range s.items {
		items = append(items, *it)
	}
	sort.Slice(items, func(a, b int) bool {
		return strings.ToLower(items[a].Name) < strings.ToLower(items[b].Name)
	})
	return items
}
// flattenEffects copies the collected effect pointers into a value slice
// sorted case-insensitively by effect name.
func (s *Scraper) flattenEffects() []model.Effect {
	effects := make([]model.Effect, 0, len(s.effects))
	for _, e := range s.effects {
		effects = append(effects, *e)
	}
	sort.Slice(effects, func(a, b int) bool {
		return strings.ToLower(effects[a].Name) < strings.ToLower(effects[b].Name)
	})
	return effects
}
// clean normalizes scraped text: NBSPs, newlines, and tabs become plain
// spaces, outer whitespace is trimmed, and interior whitespace runs are
// collapsed to a single space via the configured regexp.
func (s *Scraper) clean(value string) string {
	// NewReplacer does all three single-char substitutions in one pass.
	normalized := strings.NewReplacer("\u00a0", " ", "\n", " ", "\t", " ").Replace(value)
	return s.cfg.WhitespaceRe.ReplaceAllString(strings.TrimSpace(normalized), " ")
}
// splitLines breaks value on newlines, cleans each line, and returns the
// non-empty results. NBSPs are normalized to spaces before splitting so
// they do not survive inside a line.
func (s *Scraper) splitLines(value string) []string {
	lines := strings.Split(strings.ReplaceAll(value, "\u00a0", " "), "\n")
	out := make([]string, 0, len(lines))
	for _, l := range lines {
		if cleaned := s.clean(l); cleaned != "" {
			out = append(out, cleaned)
		}
	}
	return out
}
// lowerSlice returns a new slice with every element lowercased; the input
// slice is left untouched.
func (s *Scraper) lowerSlice(in []string) []string {
	lowered := make([]string, len(in))
	for i, v := range in {
		lowered[i] = strings.ToLower(v)
	}
	return lowered
}
// debugURLName returns the last path segment of raw for compact log
// output, falling back to the raw string when it does not parse as a URL.
func (s *Scraper) debugURLName(raw string) string {
	u, err := url.Parse(raw)
	if err != nil {
		return raw
	}
	return path.Base(u.Path)
}