feat(scraper): add checkpointing and richer page extraction
Add resumable checkpoint support so long scrapes can recover from interruptions instead of restarting from scratch. - introduce autosave/load/clear checkpoint flow in `.cache/scrape-state.json`, including SIGINT/SIGTERM save-on-exit handling - expand parsing/model output to capture legacy and portable infobox fields, primary image URLs, effects, recipes, raw tables, and improved category extraction - skip infobox tables during recipe parsing to avoid false recipe matches - add cache log event type, ignore cache/output artifacts, and document new autosave tuning options in README
This commit is contained in:
@@ -1,10 +1,12 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"math/rand"
|
||||
"net/url"
|
||||
"path"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
@@ -127,9 +129,6 @@ func (s *Scraper) isLikelyItemPage(pagePath, title string, lcCats []string, doc
|
||||
if strings.Contains(strings.ToLower(title), "/") {
|
||||
return false
|
||||
}
|
||||
if doc.Find("aside.portable-infobox").Length() == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
for _, category := range lcCats {
|
||||
if strings.Contains(category, "items") || strings.Contains(category, "food") ||
|
||||
@@ -140,11 +139,21 @@ func (s *Scraper) isLikelyItemPage(pagePath, title string, lcCats []string, doc
|
||||
}
|
||||
}
|
||||
|
||||
infoboxText := strings.ToLower(s.clean(doc.Find("aside.portable-infobox").Text()))
|
||||
if !s.hasItemInfobox(doc) {
|
||||
return false
|
||||
}
|
||||
|
||||
infoboxText := strings.ToLower(s.clean(s.infoboxText(doc)))
|
||||
descriptionText := strings.ToLower(s.parseDescription(doc))
|
||||
return strings.Contains(infoboxText, "item details") ||
|
||||
strings.Contains(infoboxText, "consumable details") ||
|
||||
strings.Contains(infoboxText, "equipment details") ||
|
||||
strings.Contains(infoboxText, "weapon")
|
||||
strings.Contains(infoboxText, "weapon") ||
|
||||
strings.Contains(descriptionText, " is an item in outward") ||
|
||||
strings.Contains(descriptionText, " is a weapon in outward") ||
|
||||
strings.Contains(descriptionText, " is a consumable in outward") ||
|
||||
strings.Contains(descriptionText, " is an ingredient in outward") ||
|
||||
strings.Contains(descriptionText, " is a shield in outward")
|
||||
}
|
||||
|
||||
func (s *Scraper) isEffectPage(pagePath, title string, lcCats []string) bool {
|
||||
@@ -166,12 +175,14 @@ func (s *Scraper) parseItemPage(doc *goquery.Document, title, pageURL string, ca
|
||||
Name: title,
|
||||
URL: pageURL,
|
||||
Categories: categories,
|
||||
Infobox: s.parsePortableInfobox(doc),
|
||||
Infobox: s.parseInfobox(doc),
|
||||
ImageURL: s.parseImageURL(doc),
|
||||
}
|
||||
|
||||
item.Description = s.parseDescription(doc)
|
||||
item.Effects, item.EffectLinks = s.parseEffectsSection(doc)
|
||||
item.Recipes = s.parseRecipesFromPage(doc, title)
|
||||
item.Tables = s.parseContentTables(doc)
|
||||
|
||||
return item
|
||||
}
|
||||
@@ -181,12 +192,20 @@ func (s *Scraper) parseEffectPage(doc *goquery.Document, title, pageURL string,
|
||||
Name: title,
|
||||
URL: pageURL,
|
||||
Categories: categories,
|
||||
Infobox: s.parsePortableInfobox(doc),
|
||||
Infobox: s.parseInfobox(doc),
|
||||
Description: s.parseDescription(doc),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Scraper) parsePortableInfobox(doc *goquery.Document) map[string]string {
|
||||
func (s *Scraper) hasItemInfobox(doc *goquery.Document) bool {
|
||||
return doc.Find("aside.portable-infobox").Length() > 0 || doc.Find("table.infoboxtable").Length() > 0
|
||||
}
|
||||
|
||||
func (s *Scraper) infoboxText(doc *goquery.Document) string {
|
||||
return s.clean(doc.Find("aside.portable-infobox, table.infoboxtable").First().Text())
|
||||
}
|
||||
|
||||
func (s *Scraper) parseInfobox(doc *goquery.Document) map[string]string {
|
||||
out := map[string]string{}
|
||||
|
||||
doc.Find("aside.portable-infobox .pi-item").Each(func(_ int, section *goquery.Selection) {
|
||||
@@ -205,6 +224,32 @@ func (s *Scraper) parsePortableInfobox(doc *goquery.Document) map[string]string
|
||||
}
|
||||
})
|
||||
|
||||
doc.Find("table.infoboxtable tr").Each(func(_ int, row *goquery.Selection) {
|
||||
cells := row.Find("th, td")
|
||||
if cells.Length() < 2 {
|
||||
return
|
||||
}
|
||||
|
||||
label := s.clean(cells.First().Text())
|
||||
if label == "" {
|
||||
return
|
||||
}
|
||||
|
||||
var values []string
|
||||
cells.Slice(1, cells.Length()).Each(func(_ int, cell *goquery.Selection) {
|
||||
text := s.clean(cell.Text())
|
||||
if text != "" {
|
||||
values = append(values, text)
|
||||
}
|
||||
})
|
||||
|
||||
if len(values) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
out[label] = strings.Join(values, " | ")
|
||||
})
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
@@ -220,6 +265,38 @@ func (s *Scraper) parseCategories(doc *goquery.Document) []string {
|
||||
}
|
||||
})
|
||||
|
||||
if len(categories) == 0 {
|
||||
doc.Find("script").EachWithBreak(func(_ int, node *goquery.Selection) bool {
|
||||
text := node.Text()
|
||||
index := strings.Index(text, "wgCategories")
|
||||
if index < 0 {
|
||||
return true
|
||||
}
|
||||
|
||||
arrayStart := strings.Index(text[index:], "[")
|
||||
arrayEnd := strings.Index(text[index:], "]")
|
||||
if arrayStart < 0 || arrayEnd < 0 || arrayEnd <= arrayStart {
|
||||
return true
|
||||
}
|
||||
|
||||
raw := text[index+arrayStart : index+arrayEnd+1]
|
||||
var parsed []string
|
||||
if err := json.Unmarshal([]byte(raw), &parsed); err != nil {
|
||||
return true
|
||||
}
|
||||
|
||||
for _, category := range parsed {
|
||||
category = s.clean(category)
|
||||
if category != "" && !seen[category] {
|
||||
seen[category] = true
|
||||
categories = append(categories, category)
|
||||
}
|
||||
}
|
||||
|
||||
return len(categories) == 0
|
||||
})
|
||||
}
|
||||
|
||||
if len(categories) == 0 {
|
||||
headerText := s.clean(doc.Find("body").Text())
|
||||
if idx := strings.Index(headerText, "in:"); idx >= 0 {
|
||||
@@ -295,6 +372,10 @@ func (s *Scraper) parseRecipesFromPage(doc *goquery.Document, pageTitle string)
|
||||
var recipes []model.Recipe
|
||||
|
||||
doc.Find("table").Each(func(_ int, table *goquery.Selection) {
|
||||
if table.HasClass("infoboxtable") {
|
||||
return
|
||||
}
|
||||
|
||||
headerText := strings.ToLower(s.clean(table.Find("tr").First().Text()))
|
||||
if !(strings.Contains(headerText, "result") && strings.Contains(headerText, "ingredient")) {
|
||||
return
|
||||
@@ -354,6 +435,169 @@ func (s *Scraper) parseRecipesFromPage(doc *goquery.Document, pageTitle string)
|
||||
return recipes
|
||||
}
|
||||
|
||||
func (s *Scraper) parseImageURL(doc *goquery.Document) string {
|
||||
image := doc.Find("aside.portable-infobox img, table.infoboxtable img").First()
|
||||
if image.Length() == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
for _, attr := range []string{"src", "data-src"} {
|
||||
if raw, ok := image.Attr(attr); ok {
|
||||
return s.normalizeImageURL(raw)
|
||||
}
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
func (s *Scraper) normalizeImageURL(raw string) string {
|
||||
raw = strings.TrimSpace(raw)
|
||||
switch {
|
||||
case raw == "":
|
||||
return ""
|
||||
case strings.HasPrefix(raw, "//"):
|
||||
return "https:" + raw
|
||||
default:
|
||||
return raw
|
||||
}
|
||||
}
|
||||
|
||||
// parseContentTables extracts every data table in the article body
// (.mw-parser-output) into structured model.Table values. Legacy
// infobox tables and tables containing nested tables (layout
// containers) are skipped, as are tables without at least a header row
// plus one data row. The first row supplies column headers; each data
// row is captured twice: verbatim in RawRows, and as a header-keyed map
// in Rows with empty cells omitted.
func (s *Scraper) parseContentTables(doc *goquery.Document) []model.Table {
	var tables []model.Table

	doc.Find(".mw-parser-output table").Each(func(_ int, table *goquery.Selection) {
		// Skip infoboxes and any table wrapping other tables — those
		// are page chrome, not content data.
		if table.HasClass("infoboxtable") || table.Find("table").Length() > 0 {
			return
		}

		rows := table.Find("tr")
		// Need a header row plus at least one data row.
		if rows.Length() < 2 {
			return
		}

		headerCells := rows.First().Find("th, td")
		if headerCells.Length() == 0 {
			return
		}

		// Build display headers plus normalized map keys in parallel;
		// blank header cells fall back to positional names
		// ("Column N" for display, "column_n" for keys).
		headers := make([]string, 0, headerCells.Length())
		headerKeys := make([]string, 0, headerCells.Length())
		headerCells.Each(func(i int, cell *goquery.Selection) {
			header := s.clean(cell.Text())
			if header == "" {
				header = "Column " + strconv.Itoa(i+1)
			}
			headers = append(headers, header)
			headerKeys = append(headerKeys, s.tableHeaderKey(header, i))
		})

		parsedTable := model.Table{
			Title:   s.tableTitle(table),
			Headers: headers,
		}

		// Data rows: everything after the header row.
		rows.Slice(1, rows.Length()).Each(func(_ int, row *goquery.Selection) {
			cells := row.Find("th, td")
			if cells.Length() == 0 {
				return
			}

			rawRow := make([]string, 0, cells.Length())
			rowData := map[string]string{}

			cells.Each(func(i int, cell *goquery.Selection) {
				text := s.clean(cell.Text())
				rawRow = append(rawRow, text)
				// Rows may be wider than the header row; tableColumnKey
				// falls back to positional keys for extra cells.
				key := s.tableColumnKey(headerKeys, headers, i)
				if text != "" {
					rowData[key] = text
				}
			})

			// Drop spacer/formatting rows with no text in any cell.
			if s.rowIsEmpty(rawRow) {
				return
			}

			parsedTable.RawRows = append(parsedTable.RawRows, rawRow)
			if len(rowData) > 0 {
				parsedTable.Rows = append(parsedTable.Rows, rowData)
			}
		})

		// A table with no surviving data rows carries no information.
		if len(parsedTable.RawRows) == 0 {
			return
		}

		tables = append(tables, parsedTable)
	})

	return tables
}
|
||||
|
||||
func (s *Scraper) tableTitle(table *goquery.Selection) string {
|
||||
var parts []string
|
||||
|
||||
for prev := table.Prev(); prev.Length() > 0; prev = prev.Prev() {
|
||||
switch goquery.NodeName(prev) {
|
||||
case "h4", "h3":
|
||||
text := strings.TrimSpace(strings.TrimSuffix(s.clean(prev.Text()), "[]"))
|
||||
if text != "" {
|
||||
parts = append([]string{text}, parts...)
|
||||
}
|
||||
case "h2":
|
||||
text := strings.TrimSpace(strings.TrimSuffix(s.clean(prev.Text()), "[]"))
|
||||
if text != "" {
|
||||
parts = append([]string{text}, parts...)
|
||||
}
|
||||
return strings.Join(parts, " / ")
|
||||
}
|
||||
}
|
||||
|
||||
return strings.Join(parts, " / ")
|
||||
}
|
||||
|
||||
func (s *Scraper) tableHeaderKey(header string, index int) string {
|
||||
header = strings.TrimSpace(header)
|
||||
if header == "" {
|
||||
return s.fallbackColumnKey(index)
|
||||
}
|
||||
|
||||
header = strings.ToLower(header)
|
||||
header = strings.ReplaceAll(header, " ", "_")
|
||||
header = strings.ReplaceAll(header, "/", "_")
|
||||
header = strings.ReplaceAll(header, "-", "_")
|
||||
header = s.cfg.WhitespaceRe.ReplaceAllString(header, "_")
|
||||
header = strings.Trim(header, "_")
|
||||
if header == "" {
|
||||
return s.fallbackColumnKey(index)
|
||||
}
|
||||
|
||||
return header
|
||||
}
|
||||
|
||||
func (s *Scraper) tableColumnKey(headerKeys, headers []string, index int) string {
|
||||
if index < len(headerKeys) && headerKeys[index] != "" {
|
||||
return headerKeys[index]
|
||||
}
|
||||
if index < len(headers) && headers[index] != "" {
|
||||
return headers[index]
|
||||
}
|
||||
return s.fallbackColumnKey(index)
|
||||
}
|
||||
|
||||
func (s *Scraper) fallbackColumnKey(index int) string {
|
||||
return "column_" + strconv.Itoa(index+1)
|
||||
}
|
||||
|
||||
func (s *Scraper) rowIsEmpty(row []string) bool {
|
||||
for _, value := range row {
|
||||
if value != "" {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (s *Scraper) findSection(doc *goquery.Document, title string) *goquery.Selection {
|
||||
var found *goquery.Selection
|
||||
|
||||
|
||||
Reference in New Issue
Block a user