package scraper

import (
	"encoding/json"
	"math/rand"
	"net/url"
	"path"
	"sort"
	"strconv"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"github.com/gocolly/colly/v2"

	"scrappr/internal/model"
)

// shouldVisit reports whether raw is a URL the crawler may fetch. The URL must
// parse, its host must equal cfg.AllowedDomain, its path must start with
// "/wiki/", and the path must neither be listed in cfg.IgnoredExact nor start
// with one of cfg.IgnoredPrefixes.
func (s *Scraper) shouldVisit(raw string) bool {
	if raw == "" {
		return false
	}

	parsed, err := url.Parse(raw)
	if err != nil {
		return false
	}

	if parsed.Host != s.cfg.AllowedDomain {
		return false
	}
	if !strings.HasPrefix(parsed.Path, "/wiki/") {
		return false
	}
	if s.cfg.IgnoredExact[parsed.Path] {
		return false
	}
	for _, prefix := range s.cfg.IgnoredPrefixes {
		if strings.HasPrefix(parsed.Path, prefix) {
			return false
		}
	}
	return true
}

// applyBrowserHeaders sets a fixed collection of browser-style request headers
// on r. The User-Agent is picked by randomUserAgent; when no agents are
// configured the header is left untouched. A Referer is only added when the
// request does not already carry one.
func (s *Scraper) applyBrowserHeaders(r *colly.Request) {
	if agent := s.randomUserAgent(); agent != "" {
		r.Headers.Set("User-Agent", agent)
	}
	r.Headers.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8")
	r.Headers.Set("Accept-Language", "en-US,en;q=0.9")
	r.Headers.Set("Cache-Control", "max-age=0")
	r.Headers.Set("DNT", "1")
	r.Headers.Set("Sec-Fetch-Dest", "document")
	r.Headers.Set("Sec-Fetch-Mode", "navigate")
	r.Headers.Set("Sec-Fetch-Site", "same-origin")
	r.Headers.Set("Sec-Fetch-User", "?1")
	r.Headers.Set("Upgrade-Insecure-Requests", "1")

	if r.Headers.Get("Referer") == "" {
		r.Headers.Set("Referer", s.cfg.BrowserReferrer)
	}
}

// randomUserAgent returns a randomly chosen entry of cfg.BrowserAgents, or ""
// when the list is empty (rand.Intn panics for a non-positive argument, so the
// empty case has to be handled before calling it).
func (s *Scraper) randomUserAgent() string {
	agents := s.cfg.BrowserAgents
	if len(agents) == 0 {
		return ""
	}
	return agents[rand.Intn(len(agents))]
}

// shouldQueueFromPage decides whether the link toURL found on the page at
// fromPath should be queued. Only "/wiki/" targets are ever queued:
//   - from a path matching cfg.ItemListPathRe, the target must match neither
//     cfg.ItemListPathRe nor cfg.CraftingPathRe;
//   - from a path matching cfg.CraftingPathRe, the target must not match
//     cfg.CraftingPathRe;
//   - from any other page nothing is queued.
func (s *Scraper) shouldQueueFromPage(fromPath, toURL string) bool {
	parsed, err := url.Parse(toURL)
	if err != nil {
		return false
	}

	toPath := parsed.Path
	if !strings.HasPrefix(toPath, "/wiki/") {
		return false
	}

	switch {
	case s.cfg.ItemListPathRe.MatchString(fromPath):
		return !s.cfg.ItemListPathRe.MatchString(toPath) && !s.cfg.CraftingPathRe.MatchString(toPath)
	case s.cfg.CraftingPathRe.MatchString(fromPath):
		return !s.cfg.CraftingPathRe.MatchString(toPath)
	default:
		return false
	}
}

// shouldQueueTableLink reports whether the link element e has non-empty text
// and sits inside a table cell that is the first or second cell of its row
// (at most one preceding td/th sibling).
func (s *Scraper) shouldQueueTableLink(e *colly.HTMLElement) bool {
	if s.clean(e.Text) == "" {
		return false
	}

	cell := e.DOM.Closest("td, th")
	if cell.Length() == 0 {
		return false
	}

	// Count the td/th siblings that precede the cell to get its column index.
	columnIndex := 0
	for prev := cell.Prev(); prev.Length() > 0; prev = prev.Prev() {
		switch goquery.NodeName(prev) {
		case "td", "th":
			columnIndex++
		}
	}
	return columnIndex <= 1
}

// pageKindForPath classifies pagePath as "item-list", "crafting" or "article"
// depending on which of cfg.ItemListPathRe / cfg.CraftingPathRe matches first.
func (s *Scraper) pageKindForPath(pagePath string) string {
	switch {
	case s.cfg.ItemListPathRe.MatchString(pagePath):
		return "item-list"
	case s.cfg.CraftingPathRe.MatchString(pagePath):
		return "crafting"
	default:
		return "article"
	}
}

// isLikelyItemPage is a heuristic that reports whether a page describes an
// item. Item-list and crafting pages, and pages whose title contains "/", are
// rejected. A page is accepted when one of lcCats contains an item-related
// keyword, or when it has an infobox and either the infobox text or the page
// description contains one of the known marker phrases.
//
// lcCats is compared against lower-case keywords, so callers are expected to
// pass lower-cased category names.
func (s *Scraper) isLikelyItemPage(pagePath, title string, lcCats []string, doc *goquery.Document) bool {
	if s.cfg.ItemListPathRe.MatchString(pagePath) || s.cfg.CraftingPathRe.MatchString(pagePath) {
		return false
	}
	if strings.Contains(title, "/") {
		return false
	}

	categoryKeywords := []string{
		"items", "food", "consumables", "ingredients", "equipment", "weapons", "deployables",
	}
	for _, category := range lcCats {
		for _, keyword := range categoryKeywords {
			if strings.Contains(category, keyword) {
				return true
			}
		}
	}

	if !s.hasItemInfobox(doc) {
		return false
	}

	infoboxText := strings.ToLower(s.clean(s.infoboxText(doc)))
	for _, marker := range []string{"item details", "consumable details", "equipment details", "weapon"} {
		if strings.Contains(infoboxText, marker) {
			return true
		}
	}

	descriptionText := strings.ToLower(s.parseDescription(doc))
	descriptionMarkers := []string{
		" is an item in outward",
		" is a weapon in outward",
		" is a consumable in outward",
		" is an ingredient in outward",
		" is a shield in outward",
	}
	for _, marker := range descriptionMarkers {
		if strings.Contains(descriptionText, marker) {
			return true
		}
	}
	return false
}

// isEffectPage reports whether a page looks like an effect page: its title
// contains "effect" (case-insensitively), one of lcCats contains "effects",
// or its path contains "/effect" (case-insensitively).
func (s *Scraper) isEffectPage(pagePath, title string, lcCats []string) bool {
	if strings.Contains(strings.ToLower(title), "effect") {
		return true
	}
	for _, category := range lcCats {
		if strings.Contains(category, "effects") {
			return true
		}
	}
	return strings.Contains(strings.ToLower(pagePath), "/effect")
}

// parseItemPage builds a model.Item from doc: infobox, image URL, description,
// effects (with their links), recipes and content tables. title, pageURL and
// categories are stored as given.
func (s *Scraper) parseItemPage(doc *goquery.Document, title, pageURL string, categories []string) model.Item {
	item := model.Item{
		Name:       title,
		URL:        pageURL,
		Categories: categories,
		Infobox:    s.parseInfobox(doc),
		ImageURL:   s.parseImageURL(doc),
	}

	item.Description = s.parseDescription(doc)
	item.Effects, item.EffectLinks = s.parseEffectsSection(doc)
	item.Recipes = s.parseRecipesFromPage(doc, title)
	item.Tables = s.parseContentTables(doc)

	return item
}

// parseEffectPage builds a model.Effect from doc using its infobox and
// description; title, pageURL and categories are stored as given.
func (s *Scraper) parseEffectPage(doc *goquery.Document, title, pageURL string, categories []string) model.Effect {
	return model.Effect{
		Name:        title,
		URL:         pageURL,
		Categories:  categories,
		Infobox:     s.parseInfobox(doc),
		Description: s.parseDescription(doc),
	}
}

// hasItemInfobox reports whether doc contains an "aside.portable-infobox" or a
// "table.infoboxtable" element.
func (s *Scraper) hasItemInfobox(doc *goquery.Document) bool {
	return doc.Find("aside.portable-infobox").Length() > 0 ||
		doc.Find("table.infoboxtable").Length() > 0
}

// infoboxText returns the cleaned text of the first infobox element in doc, or
// "" when there is none.
func (s *Scraper) infoboxText(doc *goquery.Document) string {
	return s.clean(doc.Find("aside.portable-infobox, table.infoboxtable").First().Text())
}

// parseInfobox extracts label/value pairs from both supported infobox layouts
// and returns them in one map (never nil). When the same label occurs more
// than once, the later occurrence wins.
func (s *Scraper) parseInfobox(doc *goquery.Document) map[string]string {
	out := map[string]string{}

	// Portable infobox: one ".pi-item" per entry.
	doc.Find("aside.portable-infobox .pi-item").Each(func(_ int, section *goquery.Selection) {
		label := s.clean(section.Find(".pi-data-label").First().Text())
		value := s.clean(section.Find(".pi-data-value").First().Text())

		// Fall back to generic markup when the dedicated classes are absent.
		if label == "" {
			label = s.clean(section.Find("h3").First().Text())
		}
		if value == "" {
			value = s.clean(section.Find("div").Last().Text())
		}

		if label != "" && value != "" {
			out[label] = value
		}
	})

	// Table infobox: first cell is the label, remaining non-empty cells are
	// joined with " | " to form the value.
	doc.Find("table.infoboxtable tr").Each(func(_ int, row *goquery.Selection) {
		cells := row.Find("th, td")
		if cells.Length() < 2 {
			return
		}

		label := s.clean(cells.First().Text())
		if label == "" {
			return
		}

		var values []string
		cells.Slice(1, cells.Length()).Each(func(_ int, cell *goquery.Selection) {
			if text := s.clean(cell.Text()); text != "" {
				values = append(values, text)
			}
		})
		if len(values) == 0 {
			return
		}

		out[label] = strings.Join(values, " | ")
	})

	return out
}

// parseCategories returns the de-duplicated category names of a page, in
// discovery order. Three sources are tried, each only when the previous ones
// produced nothing:
//  1. category links in the page header / tabs;
//  2. the JSON string array that follows "wgCategories" in a <script> element;
//  3. the comma-separated text that follows "in:" in the page body.
func (s *Scraper) parseCategories(doc *goquery.Document) []string {
	var categories []string
	seen := map[string]bool{}

	// add records a cleaned, non-empty, not-yet-seen category name.
	add := func(raw string) {
		name := s.clean(raw)
		if name == "" || seen[name] {
			return
		}
		seen[name] = true
		categories = append(categories, name)
	}

	doc.Find(".page-header__categories a, .category-page__member-link, .wds-tabs__tab-label a").Each(func(_ int, node *goquery.Selection) {
		add(node.Text())
	})

	if len(categories) == 0 {
		doc.Find("script").EachWithBreak(func(_ int, node *goquery.Selection) bool {
			text := node.Text()
			index := strings.Index(text, "wgCategories")
			if index < 0 {
				return true
			}

			rest := text[index:]
			arrayStart := strings.Index(rest, "[")
			if arrayStart < 0 {
				return true
			}
			// Look for the closing bracket after the opening one, so a stray
			// "]" between the key and the array does not hide the array.
			arrayLen := strings.Index(rest[arrayStart:], "]")
			if arrayLen < 0 {
				return true
			}

			var parsed []string
			raw := rest[arrayStart : arrayStart+arrayLen+1]
			if err := json.Unmarshal([]byte(raw), &parsed); err != nil {
				// Not a plain JSON string array; try the next script.
				return true
			}
			for _, category := range parsed {
				add(category)
			}

			// Keep scanning scripts only while nothing has been found.
			return len(categories) == 0
		})
	}

	if len(categories) == 0 {
		bodyText := s.clean(doc.Find("body").Text())
		if idx := strings.Index(bodyText, "in:"); idx >= 0 {
			// NOTE(review): everything after the first "in:" is split on
			// commas, so this last-resort path can yield long, noisy entries.
			line := strings.TrimPrefix(bodyText[idx:], "in:")
			for _, part := range strings.Split(line, ",") {
				add(part)
			}
		}
	}

	return categories
}

// parseDescription returns the cleaned text of the first top-level paragraph
// of the article body that either contains "is a" (case-insensitively; this
// also covers "is an") or is longer than 30 bytes. It returns "" when no
// paragraph qualifies.
func (s *Scraper) parseDescription(doc *goquery.Document) string {
	var description string

	doc.Find(".mw-parser-output > p").EachWithBreak(func(_ int, paragraph *goquery.Selection) bool {
		text := s.clean(paragraph.Text())
		if text == "" {
			return true
		}

		if strings.Contains(strings.ToLower(text), "is a") || len(text) > 30 {
			description = text
			return false
		}
		return true
	})

	return description
}

// parseEffectsSection collects the list items found under the "Effects"
// heading. It returns the de-duplicated item texts and the de-duplicated
// "/wiki/" hrefs inside those items that looksLikeEffectLink accepts. Both
// results are nil when the section is missing or empty.
func (s *Scraper) parseEffectsSection(doc *goquery.Document) ([]string, []string) {
	var effects []string
	var effectLinks []string

	section := s.findSection(doc, "Effects")
	if section.Length() == 0 {
		return effects, effectLinks
	}

	seenText := map[string]bool{}
	seenLink := map[string]bool{}

	s.walkSectionUntilNextHeading(section, func(node *goquery.Selection) {
		node.Find("li").Each(func(_ int, item *goquery.Selection) {
			if text := s.clean(item.Text()); text != "" && !seenText[text] {
				seenText[text] = true
				effects = append(effects, text)
			}

			item.Find("a[href^='/wiki/']").Each(func(_ int, anchor *goquery.Selection) {
				// The selector guarantees the attribute exists; an empty value
				// is rejected by the check below.
				href, _ := anchor.Attr("href")
				label := s.clean(anchor.Text())
				if href == "" || label == "" || seenLink[href] {
					return
				}
				if !s.looksLikeEffectLink(href, label) {
					return
				}
				seenLink[href] = true
				effectLinks = append(effectLinks, href)
			})
		})
	})

	return effects, effectLinks
}

// parseRecipesFromPage extracts recipes from every non-infobox table whose
// first row mentions both "result" and "ingredient". For each following row
// the first cell is the result (an amount prefix matched by cfg.AmountPrefixRe
// is split off into ResultCount), the second cell lists the ingredients (as
// <li> items, or else one per text line) and the third cell, if present, is
// the station. pageTitle is recorded as the recipe's SourcePage.
func (s *Scraper) parseRecipesFromPage(doc *goquery.Document, pageTitle string) []model.Recipe {
	var recipes []model.Recipe

	doc.Find("table").Each(func(_ int, table *goquery.Selection) {
		if table.HasClass("infoboxtable") {
			return
		}

		headerText := strings.ToLower(s.clean(table.Find("tr").First().Text()))
		if !strings.Contains(headerText, "result") || !strings.Contains(headerText, "ingredient") {
			return
		}

		table.Find("tr").Each(func(i int, row *goquery.Selection) {
			if i == 0 {
				// Header row.
				return
			}

			cells := row.Find("th, td")
			if cells.Length() < 2 {
				return
			}

			resultCell := cells.Eq(0)
			ingredientCell := cells.Eq(1)
			// Eq yields an empty selection when the row has no third cell, so
			// the station simply ends up as "".
			stationCell := cells.Eq(2)

			resultText := s.clean(resultCell.Text())
			if resultText == "" {
				return
			}

			resultCount := ""
			if matches := s.cfg.AmountPrefixRe.FindStringSubmatch(resultText); len(matches) > 1 {
				resultCount = matches[1]
				resultText = s.clean(strings.TrimPrefix(resultText, matches[0]))
			}

			var ingredients []string
			ingredientCell.Find("li").Each(func(_ int, ingredient *goquery.Selection) {
				if text := s.clean(ingredient.Text()); text != "" {
					ingredients = append(ingredients, text)
				}
			})
			if len(ingredients) == 0 {
				// No list markup: treat each non-empty text line as an ingredient.
				for _, line := range s.splitLines(ingredientCell.Text()) {
					if line != "" {
						ingredients = append(ingredients, line)
					}
				}
			}

			recipes = append(recipes, model.Recipe{
				Result:      resultText,
				ResultCount: resultCount,
				Ingredients: ingredients,
				Station:     s.clean(stationCell.Text()),
				SourcePage:  pageTitle,
			})
		})
	})

	return recipes
}

// parseImageURL returns the normalized URL of the first image inside an
// infobox, or "" when there is none.
//
// The "src" attribute is preferred, then "data-src". Empty values and "data:"
// URIs are skipped so that a placeholder in "src" does not hide the real URL
// in "data-src" (NOTE(review): this is the usual lazy-loading markup — confirm
// against the target site). If only a "data:" value exists it is still
// returned, as before.
func (s *Scraper) parseImageURL(doc *goquery.Document) string {
	image := doc.Find("aside.portable-infobox img, table.infoboxtable img").First()
	if image.Length() == 0 {
		return ""
	}

	placeholder := ""
	for _, attr := range []string{"src", "data-src"} {
		raw, ok := image.Attr(attr)
		if !ok {
			continue
		}
		raw = strings.TrimSpace(raw)
		if raw == "" {
			continue
		}
		if len(raw) >= 5 && strings.EqualFold(raw[:5], "data:") {
			if placeholder == "" {
				placeholder = raw
			}
			continue
		}
		return s.normalizeImageURL(raw)
	}

	return s.normalizeImageURL(placeholder)
}

// normalizeImageURL trims raw, turns a protocol-relative "//" prefix into
// "https://", and cuts a "/revision/latest/scale-to-width-down/..." path
// suffix back to "/revision/latest". A query string, if any, is preserved
// unchanged. An empty input yields "".
func (s *Scraper) normalizeImageURL(raw string) string {
	raw = strings.TrimSpace(raw)
	if raw == "" {
		return ""
	}

	if strings.HasPrefix(raw, "//") {
		raw = "https:" + raw
	}

	// Detach the query so the marker is only looked up in the path part.
	query := ""
	if idx := strings.Index(raw, "?"); idx >= 0 {
		query = raw[idx:]
		raw = raw[:idx]
	}

	const scaledMarker = "/revision/latest/scale-to-width-down/"
	if idx := strings.Index(raw, scaledMarker); idx >= 0 {
		raw = raw[:idx] + "/revision/latest"
	}

	return raw + query
}

// parseContentTables converts the article's tables into model.Table values.
// Infobox tables, tables that contain nested tables, tables with fewer than
// two rows and tables without header cells are skipped. The first row supplies
// the headers; every later row is stored both as a raw cell slice (RawRows)
// and as a header-key -> text map of its non-empty cells (Rows). Rows whose
// cells are all empty, and tables left without any row, are dropped.
func (s *Scraper) parseContentTables(doc *goquery.Document) []model.Table {
	var tables []model.Table

	doc.Find(".mw-parser-output table").Each(func(_ int, table *goquery.Selection) {
		if table.HasClass("infoboxtable") || table.Find("table").Length() > 0 {
			return
		}

		rows := table.Find("tr")
		if rows.Length() < 2 {
			return
		}

		headerCells := rows.First().Find("th, td")
		if headerCells.Length() == 0 {
			return
		}

		headers := make([]string, 0, headerCells.Length())
		headerKeys := make([]string, 0, headerCells.Length())
		headerCells.Each(func(i int, cell *goquery.Selection) {
			header := s.clean(cell.Text())
			if header == "" {
				header = "Column " + strconv.Itoa(i+1)
			}
			headers = append(headers, header)
			headerKeys = append(headerKeys, s.tableHeaderKey(header, i))
		})

		parsedTable := model.Table{
			Title:   s.tableTitle(table),
			Headers: headers,
		}

		rows.Slice(1, rows.Length()).Each(func(_ int, row *goquery.Selection) {
			cells := row.Find("th, td")
			if cells.Length() == 0 {
				return
			}

			rawRow := make([]string, 0, cells.Length())
			rowData := map[string]string{}
			cells.Each(func(i int, cell *goquery.Selection) {
				text := s.clean(cell.Text())
				rawRow = append(rawRow, text)
				if text != "" {
					rowData[s.tableColumnKey(headerKeys, headers, i)] = text
				}
			})

			if s.rowIsEmpty(rawRow) {
				return
			}

			parsedTable.RawRows = append(parsedTable.RawRows, rawRow)
			if len(rowData) > 0 {
				parsedTable.Rows = append(parsedTable.Rows, rowData)
			}
		})

		if len(parsedTable.RawRows) == 0 {
			return
		}
		tables = append(tables, parsedTable)
	})

	return tables
}

// tableTitle builds a title for table from the headings that precede it among
// its siblings. Walking backwards, every non-empty h3/h4 text is prepended and
// the walk stops after the first h2 (whose text is prepended too). The parts
// are joined with " / "; a trailing "[]" is stripped from each heading.
func (s *Scraper) tableTitle(table *goquery.Selection) string {
	var parts []string

	for prev := table.Prev(); prev.Length() > 0; prev = prev.Prev() {
		name := goquery.NodeName(prev)
		if name != "h2" && name != "h3" && name != "h4" {
			continue
		}

		text := strings.TrimSpace(strings.TrimSuffix(s.clean(prev.Text()), "[]"))
		if text != "" {
			// Prepend so the outermost heading ends up first.
			parts = append([]string{text}, parts...)
		}

		if name == "h2" {
			break
		}
	}

	return strings.Join(parts, " / ")
}

// tableHeaderKey derives a map key from a header label: lower-cased, with
// spaces, "/", "-" and anything matched by cfg.WhitespaceRe replaced by "_",
// and leading/trailing "_" removed. When nothing is left, the positional
// fallback key for index is returned.
func (s *Scraper) tableHeaderKey(header string, index int) string {
	key := strings.TrimSpace(header)
	if key == "" {
		return s.fallbackColumnKey(index)
	}

	key = strings.ToLower(key)
	for _, sep := range []string{" ", "/", "-"} {
		key = strings.ReplaceAll(key, sep, "_")
	}
	key = s.cfg.WhitespaceRe.ReplaceAllString(key, "_")
	key = strings.Trim(key, "_")

	if key == "" {
		return s.fallbackColumnKey(index)
	}
	return key
}

// tableColumnKey returns the key for the column at index: the header key when
// available, else the raw header, else the positional fallback key. This
// covers rows that have more cells than the header row.
func (s *Scraper) tableColumnKey(headerKeys, headers []string, index int) string {
	if index < len(headerKeys) && headerKeys[index] != "" {
		return headerKeys[index]
	}
	if index < len(headers) && headers[index] != "" {
		return headers[index]
	}
	return s.fallbackColumnKey(index)
}

// fallbackColumnKey returns "column_N" where N is the 1-based column number
// for the 0-based index.
func (s *Scraper) fallbackColumnKey(index int) string {
	return "column_" + strconv.Itoa(index+1)
}

// rowIsEmpty reports whether every value in row is the empty string (which is
// also true for an empty row).
func (s *Scraper) rowIsEmpty(row []string) bool {
	for _, value := range row {
		if value != "" {
			return false
		}
	}
	return true
}

// findSection returns the first h2/h3/h4 heading in the article body whose
// cleaned text (minus a trailing "[]") equals title case-insensitively. When
// no heading matches, an empty, non-nil selection is returned, so callers can
// test the result with Length().
func (s *Scraper) findSection(doc *goquery.Document, title string) *goquery.Selection {
	var found *goquery.Selection

	doc.Find(".mw-parser-output h2, .mw-parser-output h3, .mw-parser-output h4").EachWithBreak(func(_ int, section *goquery.Selection) bool {
		text := strings.TrimSpace(strings.TrimSuffix(s.clean(section.Text()), "[]"))
		if strings.EqualFold(text, title) {
			found = section
			return false
		}
		return true
	})

	if found == nil {
		return &goquery.Selection{}
	}
	return found
}

// walkSectionUntilNextHeading calls fn for each sibling that follows start,
// stopping at the first h2, h3 or h4 element (which is not passed to fn).
func (s *Scraper) walkSectionUntilNextHeading(start *goquery.Selection, fn func(*goquery.Selection)) {
	for node := start.Next(); node.Length() > 0; node = node.Next() {
		switch goquery.NodeName(node) {
		case "h2", "h3", "h4":
			return
		}
		fn(node)
	}
}

// looksLikeEffectLink is a heuristic that reports whether a link points to an
// effect page: either the href contains "/wiki/" and the href or label
// mentions "effect", or the label contains one of a fixed list of effect
// names. All comparisons are case-insensitive.
func (s *Scraper) looksLikeEffectLink(href, label string) bool {
	lowerHref := strings.ToLower(href)
	lowerLabel := strings.ToLower(label)

	if strings.Contains(lowerHref, "/wiki/") &&
		(strings.Contains(lowerHref, "effect") || strings.Contains(lowerLabel, "effect")) {
		return true
	}

	knownHints := []string{
		"bleeding",
		"burning",
		"poisoned",
		"possessed",
		"discipline",
		"rage",
		"sapped",
		"scorched",
		"curse",
		"doomed",
		"chill",
		"warm",
		"cool",
		"barrier",
		"protection",
		"imbue",
		"energized",
		"shimmer",
	}
	for _, hint := range knownHints {
		if strings.Contains(lowerLabel, hint) {
			return true
		}
	}
	return false
}

// absoluteWikiURL turns href into an absolute URL. http(s) URLs are returned
// unchanged, "/wiki/..." paths are prefixed with cfg.BaseURL, and anything
// else (including "") yields "".
func (s *Scraper) absoluteWikiURL(href string) string {
	switch {
	case href == "":
		return ""
	case strings.HasPrefix(href, "http://"), strings.HasPrefix(href, "https://"):
		return href
	case strings.HasPrefix(href, "/wiki/"):
		return s.cfg.BaseURL + href
	default:
		return ""
	}
}

// flattenItems returns copies of all collected items sorted by lower-cased
// name. Ties are broken by exact name and then URL so the order does not
// depend on the iteration order of s.items or on the unstable sort.
func (s *Scraper) flattenItems() []model.Item {
	out := make([]model.Item, 0, len(s.items))
	for _, item := range s.items {
		out = append(out, *item)
	}

	sort.Slice(out, func(i, j int) bool {
		left, right := strings.ToLower(out[i].Name), strings.ToLower(out[j].Name)
		if left != right {
			return left < right
		}
		if out[i].Name != out[j].Name {
			return out[i].Name < out[j].Name
		}
		return out[i].URL < out[j].URL
	})

	return out
}

// flattenEffects returns copies of all collected effects sorted by lower-cased
// name. Ties are broken by exact name and then URL so the order does not
// depend on the iteration order of s.effects or on the unstable sort.
func (s *Scraper) flattenEffects() []model.Effect {
	out := make([]model.Effect, 0, len(s.effects))
	for _, effect := range s.effects {
		out = append(out, *effect)
	}

	sort.Slice(out, func(i, j int) bool {
		left, right := strings.ToLower(out[i].Name), strings.ToLower(out[j].Name)
		if left != right {
			return left < right
		}
		if out[i].Name != out[j].Name {
			return out[i].Name < out[j].Name
		}
		return out[i].URL < out[j].URL
	})

	return out
}

// clean normalizes text scraped from HTML: non-breaking spaces, newlines and
// tabs become plain spaces, surrounding whitespace is trimmed, and every match
// of cfg.WhitespaceRe is replaced by a single space.
func (s *Scraper) clean(value string) string {
	for _, ws := range []string{"\u00a0", "\n", "\t"} {
		value = strings.ReplaceAll(value, ws, " ")
	}
	value = strings.TrimSpace(value)
	return s.cfg.WhitespaceRe.ReplaceAllString(value, " ")
}

// splitLines splits value on newlines, cleans every line and returns the
// non-empty ones in order. The result is never nil.
func (s *Scraper) splitLines(value string) []string {
	value = strings.ReplaceAll(value, "\u00a0", " ")
	raw := strings.Split(value, "\n")

	out := make([]string, 0, len(raw))
	for _, line := range raw {
		if line = s.clean(line); line != "" {
			out = append(out, line)
		}
	}
	return out
}

// lowerSlice returns a new slice holding the lower-cased elements of in. The
// result is never nil.
func (s *Scraper) lowerSlice(in []string) []string {
	out := make([]string, 0, len(in))
	for _, value := range in {
		out = append(out, strings.ToLower(value))
	}
	return out
}

// debugURLName returns the last path element of raw for use in debug output.
// When raw cannot be parsed as a URL it is returned unchanged.
func (s *Scraper) debugURLName(raw string) string {
	parsed, err := url.Parse(raw)
	if err != nil {
		return raw
	}
	return path.Base(parsed.Path)
}