Initial commit

This commit is contained in:
2026-03-15 16:42:43 +02:00
commit 42e2083ece
12 changed files with 1409 additions and 0 deletions

34
.gitignore vendored Normal file
View File

@@ -0,0 +1,34 @@
*.exe
*.exe~
*.dll
*.so
*.dylib
# Test binary, built with `go test -c`
*.test
# Output of the go coverage tool, specifically when used with LiteIDE
*.out
# Dependency directories (remove the comment below to include it)
# vendor/
# Go workspace file
go.work
### Linux ###
*~
# temporary files which can be created if a process still has a handle open of a deleted file
.fuse_hidden*
# KDE directory preferences
.directory
# Linux trash folder which might appear on any partition or disk
.Trash-*
# .nfs files are created when an open file is removed but is still being accessed
.nfs*
# End of https://www.toptal.com/developers/gitignore/api/go,linux

40
README.md Normal file
View File

@@ -0,0 +1,40 @@
# Scrappr
Small Go scraper for the Outward Fandom wiki.
## Layout
```text
.
├── cmd/scrappr/main.go # binary entrypoint
├── internal/app # bootstrapping and output writing
├── internal/logx # colored emoji logger
├── internal/model # dataset models
├── internal/scraper # crawl flow, parsing, queueing, retries
├── go.mod
├── go.sum
└── outward_data.json # generated output
```
## Run
```bash
go run ./cmd/scrappr
```
## What It Does
- Crawls item and crafting pages from `outward.fandom.com`
- Uses browser-like headers and rotating user agents
- Limits crawl depth and queue size to avoid drifting into junk pages
- Retries temporary failures with short backoff
- Prints colored emoji logs for queueing, requests, responses, parsing, retries, and periodic status
- Writes a stable, sorted JSON dataset to `outward_data.json`
## Tuning
Scraper defaults live in `internal/scraper/config.go`.
- Lower or raise `RequestDelay` / `RequestJitter`
- Tighten or relax `MaxQueuedPages`
- Adjust `RequestTimeout`, `MaxRetries`, and `ProgressEvery`

15
cmd/scrappr/main.go Normal file
View File

@@ -0,0 +1,15 @@
package main
import (
"os"
"scrappr/internal/app"
"scrappr/internal/logx"
)
// main boots the scraper application and exits with status 1 when it
// reports a fatal error.
func main() {
	err := app.Run()
	if err == nil {
		return
	}
	logx.Eventf("error", "fatal: %v", err)
	os.Exit(1)
}

27
go.mod Normal file
View File

@@ -0,0 +1,27 @@
module scrappr
go 1.25.5
require (
github.com/PuerkitoBio/goquery v1.11.0
github.com/gocolly/colly/v2 v2.3.0
)
require (
github.com/andybalholm/cascadia v1.3.3 // indirect
github.com/antchfx/htmlquery v1.3.5 // indirect
github.com/antchfx/xmlquery v1.5.0 // indirect
github.com/antchfx/xpath v1.3.5 // indirect
github.com/bits-and-blooms/bitset v1.24.4 // indirect
github.com/gobwas/glob v0.2.3 // indirect
github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect
github.com/golang/protobuf v1.5.4 // indirect
github.com/kennygrant/sanitize v1.2.4 // indirect
github.com/nlnwa/whatwg-url v0.6.2 // indirect
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect
github.com/temoto/robotstxt v1.1.2 // indirect
golang.org/x/net v0.47.0 // indirect
golang.org/x/text v0.31.0 // indirect
google.golang.org/appengine v1.6.8 // indirect
google.golang.org/protobuf v1.36.10 // indirect
)

123
go.sum Normal file
View File

@@ -0,0 +1,123 @@
github.com/PuerkitoBio/goquery v1.11.0 h1:jZ7pwMQXIITcUXNH83LLk+txlaEy6NVOfTuP43xxfqw=
github.com/PuerkitoBio/goquery v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHfqyHre6VMY4DQ=
github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
github.com/antchfx/htmlquery v1.3.5 h1:aYthDDClnG2a2xePf6tys/UyyM/kRcsFRm+ifhFKoU0=
github.com/antchfx/htmlquery v1.3.5/go.mod h1:5oyIPIa3ovYGtLqMPNjBF2Uf25NPCKsMjCnQ8lvjaoA=
github.com/antchfx/xmlquery v1.5.0 h1:uAi+mO40ZWfyU6mlUBxRVvL6uBNZ6LMU4M3+mQIBV4c=
github.com/antchfx/xmlquery v1.5.0/go.mod h1:lJfWRXzYMK1ss32zm1GQV3gMIW/HFey3xDZmkP1SuNc=
github.com/antchfx/xpath v1.3.5 h1:PqbXLC3TkfeZyakF5eeh3NTWEbYl4VHNVeufANzDbKQ=
github.com/antchfx/xpath v1.3.5/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
github.com/bits-and-blooms/bitset v1.20.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/bits-and-blooms/bitset v1.24.4 h1:95H15Og1clikBrKr/DuzMXkQzECs1M6hhoGXLwLQOZE=
github.com/bits-and-blooms/bitset v1.24.4/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
github.com/gocolly/colly/v2 v2.3.0 h1:HSFh0ckbgVd2CSGRE+Y/iA4goUhGROJwyQDCMXGFBWM=
github.com/gocolly/colly/v2 v2.3.0/go.mod h1:Qp54s/kQbwCQvFVx8KzKCSTXVJ1wWT4QeAKEu33x1q8=
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 h1:f+oWsMOmNPc8JmEHVZIycC7hBoQxHH9pNKQORJNozsQ=
github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8/go.mod h1:wcDNUvekVysuuOpQKo3191zZyTpiI6se1N1ULghS0sw=
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
github.com/nlnwa/whatwg-url v0.6.2 h1:jU61lU2ig4LANydbEJmA2nPrtCGiKdtgT0rmMd2VZ/Q=
github.com/nlnwa/whatwg-url v0.6.2/go.mod h1:x0FPXJzzOEieQtsBT/AKvbiBbQ46YlL6Xa7m02M1ECk=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA=
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg=
github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc=
golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
golang.org/x/crypto v0.32.0/go.mod h1:ZnnJkOaASj8g0AjIduWNlq2NRxL0PlBrbKVyZ6V/Ugc=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k=
golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY=
golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.29.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU=
golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
golang.org/x/term v0.28.0/go.mod h1:Sw/lC2IAUZ92udQNf3WodGtn4k/XoLyZoh8v/8uiwek=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM=
golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58=
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/appengine v1.6.8 h1:IhEN5q69dyKagZPYMSdIjS2HqprW324FRQZJcGqPAsM=
google.golang.org/appengine v1.6.8/go.mod h1:1jJ3jBArFh5pcgW8gCtRJnepW8FzD1V44FJffLiz/Ds=
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE=
google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=

56
internal/app/run.go Normal file
View File

@@ -0,0 +1,56 @@
package app
import (
"encoding/json"
"math/rand"
"os"
"time"
"scrappr/internal/logx"
"scrappr/internal/model"
"scrappr/internal/scraper"
)
const outputPath = "outward_data.json"
// Run wires together configuration, the scraper, and output writing.
// It returns the first error encountered; on success the crawled
// dataset has been written to outputPath.
func Run() error {
	// NOTE(review): rand.Seed is deprecated since Go 1.20 (the global
	// source is auto-seeded); kept here to preserve existing behavior,
	// since removing it would also orphan the math/rand import.
	rand.Seed(time.Now().UnixNano())
	logx.Eventf("start", "Outward scraper booting")
	scr := scraper.New(scraper.DefaultConfig())
	dataset, err := scr.Run()
	if err != nil {
		return err
	}
	logx.Eventf("write", "writing dataset to %s", outputPath)
	if err = writeDataset(outputPath, dataset); err != nil {
		return err
	}
	logx.Eventf(
		"success",
		"wrote %d items and %d effects to %s",
		len(dataset.Items),
		len(dataset.Effects),
		outputPath,
	)
	return nil
}
func writeDataset(outputPath string, dataset model.Dataset) error {
file, err := os.Create(outputPath)
if err != nil {
return err
}
defer file.Close()
encoder := json.NewEncoder(file)
encoder.SetIndent("", " ")
return encoder.Encode(dataset)
}

77
internal/logx/logx.go Normal file
View File

@@ -0,0 +1,77 @@
package logx
import (
"fmt"
"strings"
"sync"
"time"
)
// ANSI 256-color escape sequences used to colorize whole log lines.
const (
	colorReset  = "\033[0m"
	colorBlue   = "\033[38;5;39m"
	colorCyan   = "\033[38;5;45m"
	colorGreen  = "\033[38;5;42m"
	colorYellow = "\033[38;5;220m"
	colorRed    = "\033[38;5;196m"
	colorGray   = "\033[38;5;244m"
	colorPink   = "\033[38;5;213m"
)

// style bundles the emoji, label, and color used to render one event kind.
type style struct {
	emoji string
	label string
	color string
}

var (
	// mu serializes output so concurrent callers do not interleave
	// partial log lines on stdout.
	mu = sync.Mutex{}
	// styles maps an event kind (as passed to Eventf) to its rendering.
	styles = map[string]style{
		"start":   {emoji: "🚀", label: "START", color: colorBlue},
		"queue":   {emoji: "📥", label: "QUEUE", color: colorCyan},
		"visit":   {emoji: "🌐", label: "VISIT", color: colorBlue},
		"recv":    {emoji: "📦", label: "RECV", color: colorCyan},
		"parsed":  {emoji: "🧠", label: "PARSED", color: colorPink},
		"status":  {emoji: "🌀", label: "STATUS", color: colorYellow},
		"done":    {emoji: "✅", label: "DONE", color: colorGreen},
		"write":   {emoji: "💾", label: "WRITE", color: colorBlue},
		"skip":    {emoji: "⏭️", label: "SKIP", color: colorGray},
		"warn":    {emoji: "⚠️", label: "WARN", color: colorYellow},
		"error":   {emoji: "💥", label: "ERROR", color: colorRed},
		"retry":   {emoji: "🔁", label: "RETRY", color: colorYellow},
		"giveup":  {emoji: "🛑", label: "GIVEUP", color: colorRed},
		"success": {emoji: "🎉", label: "SUCCESS", color: colorGreen},
	}
)
// Eventf logs a formatted message using the style registered for kind.
// An unknown kind falls back to a neutral gray bullet whose label is
// the upper-cased kind.
func Eventf(kind, format string, args ...any) {
	chosen, known := styles[kind]
	if !known {
		chosen = style{emoji: "•", label: strings.ToUpper(kind), color: colorGray}
	}
	write(chosen, fmt.Sprintf(format, args...))
}
// Statusf logs a status line, substituting frame (a spinner glyph) for
// the usual status emoji. The shared styles map entry is not mutated —
// the map lookup yields a copy.
func Statusf(frame, format string, args ...any) {
	stamped := styles["status"]
	stamped.emoji = frame
	write(stamped, fmt.Sprintf(format, args...))
}
// write prints one colored, timestamped log line to stdout, holding mu
// so concurrent callers never interleave output.
func write(st style, message string) {
	mu.Lock()
	defer mu.Unlock()
	line := fmt.Sprintf(
		"%s[%s] %s %-7s %s%s\n",
		st.color,
		time.Now().Format("15:04:05"),
		st.emoji,
		st.label,
		message,
		colorReset,
	)
	fmt.Print(line)
}

33
internal/model/types.go Normal file
View File

@@ -0,0 +1,33 @@
package model
// Recipe is a single crafting recipe row scraped from a wiki table.
type Recipe struct {
	Result      string   `json:"result"`                  // crafted item name
	ResultCount string   `json:"result_count,omitempty"`  // leading "Nx" amount, when present
	Ingredients []string `json:"ingredients,omitempty"`   // one entry per ingredient line
	Station     string   `json:"station,omitempty"`       // crafting station column, when present
	SourcePage  string   `json:"source_page,omitempty"`   // title of the page the recipe came from
}

// Item is one scraped item article.
type Item struct {
	Name        string            `json:"name"`
	URL         string            `json:"url"`
	Categories  []string          `json:"categories,omitempty"`
	Infobox     map[string]string `json:"infobox,omitempty"`      // flattened portable-infobox label -> value
	Effects     []string          `json:"effects,omitempty"`      // bullet text from the Effects section
	EffectLinks []string          `json:"effect_links,omitempty"` // /wiki/ hrefs found in that section
	Recipes     []Recipe          `json:"recipes,omitempty"`
	Description string            `json:"description,omitempty"`  // first substantial lead paragraph
}

// Effect is one scraped status-effect article.
type Effect struct {
	Name        string            `json:"name"`
	URL         string            `json:"url"`
	Categories  []string          `json:"categories,omitempty"`
	Infobox     map[string]string `json:"infobox,omitempty"`
	Description string            `json:"description,omitempty"`
}

// Dataset is the full crawl output serialized to outward_data.json.
type Dataset struct {
	Items   []Item   `json:"items"`
	Effects []Effect `json:"effects"`
}

102
internal/scraper/config.go Normal file
View File

@@ -0,0 +1,102 @@
package scraper
import (
"regexp"
"time"
)
// Config carries every tunable knob for a crawl: target site, rate
// limiting, retry policy, seed URLs, URL filters, and cosmetic bits
// like spinner frames. Defaults live in DefaultConfig.
type Config struct {
	BaseURL         string          // scheme+host prefix used to absolutize /wiki/ links
	AllowedDomain   string          // only this host is crawled
	MaxDepth        int             // colly crawl-depth limit
	MaxRetries      int             // retry attempts for failed requests
	MaxQueuedPages  int             // cap on how many pages may ever be queued
	RequestDelay    time.Duration   // fixed delay between requests
	RequestJitter   time.Duration   // extra random delay per request
	RequestTimeout  time.Duration   // per-request HTTP timeout
	RetryBaseDelay  time.Duration   // base backoff before a retry
	ProgressEvery   time.Duration   // period of the status log line
	BrowserReferrer string          // default Referer header value
	BrowserAgents   []string        // rotated User-Agent strings
	ItemSeeds       []string        // item list pages the crawl starts from
	CraftingSeeds   []string        // crafting pages the crawl starts from
	IgnoredPrefixes []string        // /wiki/ path prefixes never visited
	IgnoredExact    map[string]bool // exact /wiki/ paths never visited (true = ignore)
	ItemListPathRe  *regexp.Regexp  // matches item-list paths (/wiki/Items...)
	CraftingPathRe  *regexp.Regexp  // matches crafting paths (/wiki/Crafting...)
	AmountPrefixRe  *regexp.Regexp  // matches a leading "Nx " amount on a result cell
	WhitespaceRe    *regexp.Regexp  // collapses whitespace runs during cleaning
	SpinnerFrames   []string        // glyph cycle for the status spinner
}
// DefaultConfig returns the tuned defaults for crawling the Outward
// Fandom wiki: polite single-threaded pacing, a depth/queue cap to keep
// the crawl from drifting into junk pages, and browser-like headers.
func DefaultConfig() Config {
	baseURL := "https://outward.fandom.com"
	return Config{
		BaseURL:         baseURL,
		AllowedDomain:   "outward.fandom.com",
		MaxDepth:        3,
		MaxRetries:      2,
		MaxQueuedPages:  1500,
		RequestDelay:    650 * time.Millisecond,
		RequestJitter:   350 * time.Millisecond,
		RequestTimeout:  8 * time.Second,
		RetryBaseDelay:  1200 * time.Millisecond,
		ProgressEvery:   3 * time.Second,
		BrowserReferrer: baseURL + "/",
		BrowserAgents: []string{
			"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
			"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
			"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
		},
		ItemSeeds: []string{
			baseURL + "/wiki/Items/Weapons",
			baseURL + "/wiki/Items/Equipment",
			baseURL + "/wiki/Items/Consumables",
			baseURL + "/wiki/Items/Ingredients",
			baseURL + "/wiki/Items/Deployables",
			baseURL + "/wiki/Items/Other",
			baseURL + "/wiki/Items/Item_Values",
		},
		CraftingSeeds: []string{
			baseURL + "/wiki/Crafting",
			baseURL + "/wiki/Crafting/Survival",
			baseURL + "/wiki/Crafting/Cooking",
			baseURL + "/wiki/Crafting/Alchemy",
		},
		IgnoredPrefixes: []string{
			"/wiki/File:",
			"/wiki/Category:",
			"/wiki/Special:",
			"/wiki/Help:",
			"/wiki/Template:",
			"/wiki/User:",
			"/wiki/User_blog:",
			"/wiki/Forum:",
			"/wiki/Message_Wall:",
			"/wiki/Thread:",
			"/wiki/Map:",
		},
		// Only true entries block a visit; a missing key already reads
		// as false, so the previous explicit `"/wiki/Crafting": false`
		// entry was a no-op and has been dropped to avoid suggesting
		// that crafting pages are excluded (they are seeds).
		IgnoredExact: map[string]bool{
			"/wiki/Outward_Wiki": true,
			"/wiki/Items":        true,
		},
		ItemListPathRe: regexp.MustCompile(`^/wiki/Items(?:/|$)`),
		CraftingPathRe: regexp.MustCompile(`^/wiki/Crafting(?:/|$)`),
		AmountPrefixRe: regexp.MustCompile(`^\s*(\d+x)\s+`),
		WhitespaceRe:   regexp.MustCompile(`\s+`),
		SpinnerFrames: []string{
			"⠋",
			"⠙",
			"⠹",
			"⠸",
			"⠼",
			"⠴",
			"⠦",
			"⠧",
			"⠇",
			"⠏",
		},
	}
}

487
internal/scraper/parse.go Normal file
View File

@@ -0,0 +1,487 @@
package scraper
import (
"math/rand"
"net/url"
"path"
"sort"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/gocolly/colly/v2"
"scrappr/internal/model"
)
// shouldVisit reports whether raw is an in-scope wiki article URL:
// parsable, on the allowed domain, under /wiki/, and not on the exact
// or prefix ignore lists.
func (s *Scraper) shouldVisit(raw string) bool {
	if raw == "" {
		return false
	}
	parsed, err := url.Parse(raw)
	switch {
	case err != nil:
		return false
	case parsed.Host != s.cfg.AllowedDomain:
		return false
	case !strings.HasPrefix(parsed.Path, "/wiki/"):
		return false
	case s.cfg.IgnoredExact[parsed.Path]:
		return false
	}
	for _, banned := range s.cfg.IgnoredPrefixes {
		if strings.HasPrefix(parsed.Path, banned) {
			return false
		}
	}
	return true
}
// applyBrowserHeaders decorates r with headers mimicking a desktop
// browser navigation, plus a default Referer when none is already set.
func (s *Scraper) applyBrowserHeaders(r *colly.Request) {
	pairs := [][2]string{
		{"User-Agent", s.randomUserAgent()},
		{"Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"},
		{"Accept-Language", "en-US,en;q=0.9"},
		{"Cache-Control", "max-age=0"},
		{"DNT", "1"},
		{"Sec-Fetch-Dest", "document"},
		{"Sec-Fetch-Mode", "navigate"},
		{"Sec-Fetch-Site", "same-origin"},
		{"Sec-Fetch-User", "?1"},
		{"Upgrade-Insecure-Requests", "1"},
	}
	for _, pair := range pairs {
		r.Headers.Set(pair[0], pair[1])
	}
	if r.Headers.Get("Referer") == "" {
		r.Headers.Set("Referer", s.cfg.BrowserReferrer)
	}
}
// randomUserAgent picks one configured browser agent uniformly at random.
func (s *Scraper) randomUserAgent() string {
	agents := s.cfg.BrowserAgents
	return agents[rand.Intn(len(agents))]
}
// shouldQueueFromPage decides whether a link found on fromPath is worth
// queueing. From an item list we follow only plain article links (not
// other lists or crafting pages); from a crafting page we follow any
// wiki link that is not itself a crafting page. Links from any other
// page kind are never queued.
func (s *Scraper) shouldQueueFromPage(fromPath, toURL string) bool {
	parsed, err := url.Parse(toURL)
	if err != nil {
		return false
	}
	target := parsed.Path
	isWiki := strings.HasPrefix(target, "/wiki/")
	switch {
	case s.cfg.ItemListPathRe.MatchString(fromPath):
		return isWiki &&
			!s.cfg.ItemListPathRe.MatchString(target) &&
			!s.cfg.CraftingPathRe.MatchString(target)
	case s.cfg.CraftingPathRe.MatchString(fromPath):
		return isWiki && !s.cfg.CraftingPathRe.MatchString(target)
	default:
		return false
	}
}
// shouldQueueTableLink reports whether e is a non-empty link sitting in
// the first or second column of a table row — the columns that carry
// names on the wiki's list tables.
func (s *Scraper) shouldQueueTableLink(e *colly.HTMLElement) bool {
	if s.clean(e.Text) == "" {
		return false
	}
	cell := e.DOM.Closest("td, th")
	if cell.Length() == 0 {
		return false
	}
	// Count preceding cell siblings to find this cell's column index.
	column := 0
	for sibling := cell.Prev(); sibling.Length() > 0; sibling = sibling.Prev() {
		if name := goquery.NodeName(sibling); name == "td" || name == "th" {
			column++
		}
	}
	return column <= 1
}
// pageKindForPath classifies a wiki path as an item list, a crafting
// page, or a plain article.
func (s *Scraper) pageKindForPath(pagePath string) string {
	if s.cfg.ItemListPathRe.MatchString(pagePath) {
		return "item-list"
	}
	if s.cfg.CraftingPathRe.MatchString(pagePath) {
		return "crafting"
	}
	return "article"
}
// isLikelyItemPage heuristically decides whether a page is an item
// article: not a list/crafting hub, not a subpage-style title, carries
// a portable infobox, and either a category or the infobox text hints
// at item-hood.
func (s *Scraper) isLikelyItemPage(pagePath, title string, lcCats []string, doc *goquery.Document) bool {
	// Hub pages are never items themselves.
	if s.cfg.ItemListPathRe.MatchString(pagePath) || s.cfg.CraftingPathRe.MatchString(pagePath) {
		return false
	}
	// Titles containing "/" are navigation subpages, not items.
	if strings.Contains(strings.ToLower(title), "/") {
		return false
	}
	// Item articles always carry a portable infobox.
	if doc.Find("aside.portable-infobox").Length() == 0 {
		return false
	}
	categoryHints := []string{"items", "food", "consumables", "ingredients", "equipment", "weapons", "deployables"}
	for _, category := range lcCats {
		for _, hint := range categoryHints {
			if strings.Contains(category, hint) {
				return true
			}
		}
	}
	// Last resort: sniff the infobox text itself.
	infoboxText := strings.ToLower(s.clean(doc.Find("aside.portable-infobox").Text()))
	for _, marker := range []string{"item details", "consumable details", "equipment details", "weapon"} {
		if strings.Contains(infoboxText, marker) {
			return true
		}
	}
	return false
}
// isEffectPage reports whether a page describes a status effect, judged
// by its title, its URL path, or any category containing "effects".
// All checks are pure, so evaluation order does not affect the result.
func (s *Scraper) isEffectPage(pagePath, title string, lcCats []string) bool {
	switch {
	case strings.Contains(strings.ToLower(title), "effect"):
		return true
	case strings.Contains(strings.ToLower(pagePath), "/effect"):
		return true
	}
	for _, cat := range lcCats {
		if strings.Contains(cat, "effects") {
			return true
		}
	}
	return false
}
// parseItemPage assembles a model.Item from an already-fetched article
// document. All sub-parsers are read-only on doc.
func (s *Scraper) parseItemPage(doc *goquery.Document, title, pageURL string, categories []string) model.Item {
	effects, effectLinks := s.parseEffectsSection(doc)
	return model.Item{
		Name:        title,
		URL:         pageURL,
		Categories:  categories,
		Infobox:     s.parsePortableInfobox(doc),
		Description: s.parseDescription(doc),
		Effects:     effects,
		EffectLinks: effectLinks,
		Recipes:     s.parseRecipesFromPage(doc, title),
	}
}
// parseEffectPage assembles a model.Effect from an already-fetched
// article document.
func (s *Scraper) parseEffectPage(doc *goquery.Document, title, pageURL string, categories []string) model.Effect {
	effect := model.Effect{Name: title, URL: pageURL, Categories: categories}
	effect.Infobox = s.parsePortableInfobox(doc)
	effect.Description = s.parseDescription(doc)
	return effect
}
// parsePortableInfobox flattens the page's portable infobox into a
// label -> value map. When the standard label/value nodes are empty it
// falls back to the section's first h3 (label) and last div (value).
// Rows missing either half are skipped.
func (s *Scraper) parsePortableInfobox(doc *goquery.Document) map[string]string {
	fields := map[string]string{}
	doc.Find("aside.portable-infobox .pi-item").Each(func(_ int, row *goquery.Selection) {
		key := s.clean(row.Find(".pi-data-label").First().Text())
		if key == "" {
			key = s.clean(row.Find("h3").First().Text())
		}
		val := s.clean(row.Find(".pi-data-value").First().Text())
		if val == "" {
			val = s.clean(row.Find("div").Last().Text())
		}
		if key != "" && val != "" {
			fields[key] = val
		}
	})
	return fields
}
// parseCategories collects the page's category labels, de-duplicated in
// first-seen order. It first tries the standard header/category-page
// link nodes; if none are found it falls back to scanning the body text
// for an "in:" marker and splitting the remainder on commas.
func (s *Scraper) parseCategories(doc *goquery.Document) []string {
	var categories []string
	seen := map[string]bool{}
	doc.Find(".page-header__categories a, .category-page__member-link, .wds-tabs__tab-label a").Each(func(_ int, node *goquery.Selection) {
		text := s.clean(node.Text())
		if text != "" && !seen[text] {
			seen[text] = true
			categories = append(categories, text)
		}
	})
	if len(categories) == 0 {
		// Fallback: everything after "in:" in the raw body text.
		// NOTE(review): this splits the *rest of the body* on commas, so
		// it can pick up trailing non-category text — verify against
		// real pages.
		headerText := s.clean(doc.Find("body").Text())
		if idx := strings.Index(headerText, "in:"); idx >= 0 {
			line := strings.TrimPrefix(headerText[idx:], "in:")
			for _, part := range strings.Split(line, ",") {
				part = s.clean(part)
				if part != "" && !seen[part] {
					seen[part] = true
					categories = append(categories, part)
				}
			}
		}
	}
	return categories
}
// parseDescription returns the first substantial lead paragraph: one
// that reads like a definition ("X is a ...") or is longer than 30
// characters. Empty paragraphs are skipped; "" is returned when no
// paragraph qualifies.
func (s *Scraper) parseDescription(doc *goquery.Document) string {
	var found string
	doc.Find(".mw-parser-output > p").EachWithBreak(func(_ int, paragraph *goquery.Selection) bool {
		text := s.clean(paragraph.Text())
		if text == "" {
			return true // keep scanning
		}
		lower := strings.ToLower(text)
		looksLikeLead := strings.Contains(lower, "is a") ||
			strings.Contains(lower, "is an") ||
			len(text) > 30
		if !looksLikeLead {
			return true
		}
		found = text
		return false // stop at the first qualifying paragraph
	})
	return found
}
// parseEffectsSection extracts the bullet entries under the "Effects"
// heading: the cleaned text of every <li>, plus any /wiki/ links inside
// them that look like effect pages. Both lists are de-duplicated in
// document order; a missing section yields two nil slices.
func (s *Scraper) parseEffectsSection(doc *goquery.Document) ([]string, []string) {
	var effects []string
	var effectLinks []string
	seenText := map[string]bool{}
	seenLink := map[string]bool{}
	section := s.findSection(doc, "Effects")
	if section.Length() == 0 {
		return effects, effectLinks
	}
	// Walk the siblings following the heading until the next heading.
	s.walkSectionUntilNextHeading(section, func(node *goquery.Selection) {
		node.Find("li").Each(func(_ int, item *goquery.Selection) {
			text := s.clean(item.Text())
			if text != "" && !seenText[text] {
				seenText[text] = true
				effects = append(effects, text)
			}
			// Collect candidate effect links inside this bullet.
			item.Find("a[href^='/wiki/']").Each(func(_ int, anchor *goquery.Selection) {
				href, _ := anchor.Attr("href")
				label := s.clean(anchor.Text())
				if href != "" && label != "" && s.looksLikeEffectLink(href, label) && !seenLink[href] {
					seenLink[href] = true
					effectLinks = append(effectLinks, href)
				}
			})
		})
	})
	return effects, effectLinks
}
// parseRecipesFromPage scans every table whose header row mentions both
// "result" and "ingredient" and converts each data row into a
// model.Recipe. A leading "Nx " amount on the result cell is split off
// into ResultCount; ingredients come from <li> entries or, failing
// that, from the cell's text split on newlines.
func (s *Scraper) parseRecipesFromPage(doc *goquery.Document, pageTitle string) []model.Recipe {
	var recipes []model.Recipe
	doc.Find("table").Each(func(_ int, table *goquery.Selection) {
		// Only recipe-shaped tables: the header must name both columns.
		headerText := strings.ToLower(s.clean(table.Find("tr").First().Text()))
		if !(strings.Contains(headerText, "result") && strings.Contains(headerText, "ingredient")) {
			return
		}
		table.Find("tr").Each(func(i int, row *goquery.Selection) {
			if i == 0 {
				return // skip the header row
			}
			cells := row.Find("th, td")
			if cells.Length() < 2 {
				return
			}
			resultCell := cells.Eq(0)
			ingredientCell := cells.Eq(1)
			// Eq(2) yields an empty selection when the station column is
			// absent, so stationCell.Text() is then "".
			stationCell := cells.Eq(2)
			resultText := s.clean(resultCell.Text())
			if resultText == "" {
				return
			}
			// Split a leading "Nx " amount into its own field.
			resultCount := ""
			if matches := s.cfg.AmountPrefixRe.FindStringSubmatch(resultText); len(matches) > 1 {
				resultCount = matches[1]
				resultText = s.clean(strings.TrimPrefix(resultText, matches[0]))
			}
			var ingredients []string
			ingredientCell.Find("li").Each(func(_ int, ingredient *goquery.Selection) {
				text := s.clean(ingredient.Text())
				if text != "" {
					ingredients = append(ingredients, text)
				}
			})
			if len(ingredients) == 0 {
				// No list markup; fall back to newline-separated text.
				for _, line := range s.splitLines(ingredientCell.Text()) {
					if line != "" {
						ingredients = append(ingredients, line)
					}
				}
			}
			recipes = append(recipes, model.Recipe{
				Result:      resultText,
				ResultCount: resultCount,
				Ingredients: ingredients,
				Station:     s.clean(stationCell.Text()),
				SourcePage:  pageTitle,
			})
		})
	})
	return recipes
}
// findSection locates the h2/h3/h4 heading whose cleaned text equals
// title (case-insensitively, ignoring a trailing "[]" edit-link
// artifact). When no heading matches it returns an empty selection so
// callers can simply test Length() == 0.
func (s *Scraper) findSection(doc *goquery.Document, title string) *goquery.Selection {
	var match *goquery.Selection
	headings := doc.Find(".mw-parser-output h2, .mw-parser-output h3, .mw-parser-output h4")
	headings.EachWithBreak(func(_ int, heading *goquery.Selection) bool {
		label := strings.TrimSpace(strings.TrimSuffix(s.clean(heading.Text()), "[]"))
		if !strings.EqualFold(label, title) {
			return true
		}
		match = heading
		return false
	})
	if match != nil {
		return match
	}
	return &goquery.Selection{}
}
// walkSectionUntilNextHeading applies fn to every sibling following
// start until the next h2/h3/h4 heading or the end of the parent.
func (s *Scraper) walkSectionUntilNextHeading(start *goquery.Selection, fn func(*goquery.Selection)) {
	for node := start.Next(); node.Length() > 0; node = node.Next() {
		switch goquery.NodeName(node) {
		case "h2", "h3", "h4":
			return
		}
		fn(node)
	}
}
// looksLikeEffectLink guesses whether an anchor points at a status
// effect page: either "effect" appears in the wiki href/label, or the
// label contains one of a list of known effect-name fragments.
func (s *Scraper) looksLikeEffectLink(href, label string) bool {
	loweredHref := strings.ToLower(href)
	loweredLabel := strings.ToLower(label)
	if strings.Contains(loweredHref, "/wiki/") &&
		(strings.Contains(loweredHref, "effect") || strings.Contains(loweredLabel, "effect")) {
		return true
	}
	for _, hint := range []string{
		"bleeding", "burning", "poisoned", "possessed", "discipline", "rage",
		"sapped", "scorched", "curse", "doomed", "chill", "warm", "cool",
		"barrier", "protection", "imbue", "energized", "shimmer",
	} {
		if strings.Contains(loweredLabel, hint) {
			return true
		}
	}
	return false
}
// absoluteWikiURL turns a wiki-relative href into an absolute URL.
// Already-absolute http(s) URLs pass through unchanged; anything else
// (including "") yields "".
func (s *Scraper) absoluteWikiURL(href string) string {
	switch {
	case href == "":
		return ""
	case strings.HasPrefix(href, "http://"), strings.HasPrefix(href, "https://"):
		return href
	case strings.HasPrefix(href, "/wiki/"):
		return s.cfg.BaseURL + href
	default:
		return ""
	}
}
// flattenItems copies the item map into a slice sorted
// case-insensitively by name, so the JSON output is stable run-to-run.
func (s *Scraper) flattenItems() []model.Item {
	flat := make([]model.Item, 0, len(s.items))
	for _, entry := range s.items {
		flat = append(flat, *entry)
	}
	sort.Slice(flat, func(a, b int) bool {
		return strings.ToLower(flat[a].Name) < strings.ToLower(flat[b].Name)
	})
	return flat
}
// flattenEffects copies the effect map into a slice sorted
// case-insensitively by name, mirroring flattenItems.
func (s *Scraper) flattenEffects() []model.Effect {
	flat := make([]model.Effect, 0, len(s.effects))
	for _, entry := range s.effects {
		flat = append(flat, *entry)
	}
	sort.Slice(flat, func(a, b int) bool {
		return strings.ToLower(flat[a].Name) < strings.ToLower(flat[b].Name)
	})
	return flat
}
// clean turns NBSPs, newlines, and tabs into spaces, trims the result,
// and collapses runs of whitespace into single spaces.
func (s *Scraper) clean(value string) string {
	flattened := strings.NewReplacer("\u00a0", " ", "\n", " ", "\t", " ").Replace(value)
	return s.cfg.WhitespaceRe.ReplaceAllString(strings.TrimSpace(flattened), " ")
}
// splitLines splits value on newlines (after NBSP normalization),
// cleans each line, and drops the empty ones.
func (s *Scraper) splitLines(value string) []string {
	lines := strings.Split(strings.ReplaceAll(value, "\u00a0", " "), "\n")
	cleaned := make([]string, 0, len(lines))
	for _, candidate := range lines {
		if trimmed := s.clean(candidate); trimmed != "" {
			cleaned = append(cleaned, trimmed)
		}
	}
	return cleaned
}
// lowerSlice returns a lower-cased copy of in; the input is untouched.
func (s *Scraper) lowerSlice(in []string) []string {
	lowered := make([]string, len(in))
	for i, value := range in {
		lowered[i] = strings.ToLower(value)
	}
	return lowered
}
// debugURLName shortens a URL to its final path segment for compact log
// output; unparseable input is returned verbatim.
func (s *Scraper) debugURLName(raw string) string {
	if parsed, err := url.Parse(raw); err == nil {
		return path.Base(parsed.Path)
	}
	return raw
}

411
internal/scraper/scraper.go Normal file
View File

@@ -0,0 +1,411 @@
package scraper
import (
"bytes"
"math/rand"
"strconv"
"sync"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/gocolly/colly/v2"
"scrappr/internal/logx"
"scrappr/internal/model"
)
// Scraper crawls the configured wiki, parses item and effect pages, and
// accumulates results until the colly collector's queue drains.
type Scraper struct {
	cfg       Config           // crawl configuration (seeds, limits, delays)
	collector *colly.Collector // created in Run; nil before that
	mu        sync.Mutex       // guards all mutable fields below
	items     map[string]*model.Item   // parsed items keyed by page URL
	effects   map[string]*model.Effect // parsed effects keyed by page URL
	queued    map[string]bool          // URLs already enqueued (dedupe + budget check)
	completed int // pages fully scraped (OnScraped fired)
	failed    int // requests that hit OnError
	retried   int // retries scheduled so far
	requestSeq   int       // monotonically increasing request id, assigned in OnRequest
	spinnerIndex int       // next spinner frame; advanced while holding mu
	activeURL    string    // URL currently in flight ("" when idle)
	activeSince  time.Time // when activeURL started (zero when idle)
	lastEvent    time.Time // last state change, used for idle reporting
}
// New returns a Scraper ready to Run with the given configuration.
func New(cfg Config) *Scraper {
	s := &Scraper{cfg: cfg}
	s.items = make(map[string]*model.Item)
	s.effects = make(map[string]*model.Effect)
	s.queued = make(map[string]bool)
	s.lastEvent = time.Now()
	return s
}
// Run builds the colly collector, wires its handlers, seeds the crawl,
// and blocks until the request queue drains. It returns the accumulated
// dataset with items and effects sorted by name.
func (s *Scraper) Run() (model.Dataset, error) {
	s.collector = colly.NewCollector(
		colly.AllowedDomains(s.cfg.AllowedDomain),
		colly.MaxDepth(s.cfg.MaxDepth),
		colly.Async(true),
	)
	s.collector.SetRequestTimeout(s.cfg.RequestTimeout)
	// Deliver 4xx/5xx bodies to the handlers so status codes can be logged.
	s.collector.ParseHTTPErrorResponse = true
	// Single-worker crawl with a fixed delay plus jitter between requests.
	if err := s.collector.Limit(&colly.LimitRule{
		DomainGlob:  "*" + s.cfg.AllowedDomain + "*",
		Parallelism: 1,
		Delay:       s.cfg.RequestDelay,
		RandomDelay: s.cfg.RequestJitter,
	}); err != nil {
		return model.Dataset{}, err
	}
	s.registerHandlers()
	done := make(chan struct{})
	defer close(done) // stops the status loop goroutine when Run returns
	s.startStatusLoop(done)
	// Seed with the item pages first, then the crafting pages.
	for _, seed := range append(append([]string{}, s.cfg.ItemSeeds...), s.cfg.CraftingSeeds...) {
		s.queueVisit("seed", seed)
	}
	s.collector.Wait()
	return model.Dataset{
		Items:   s.flattenItems(),
		Effects: s.flattenEffects(),
	}, nil
}
// registerHandlers wires all colly callbacks: request decoration and
// logging, error handling with retries, page-type detection and parsing,
// completion accounting, and link discovery inside article tables.
func (s *Scraper) registerHandlers() {
	// OnRequest: tag each outgoing request with a sequence id and start
	// timestamp (stored in the request context) so later callbacks can
	// correlate log lines and measure duration.
	s.collector.OnRequest(func(r *colly.Request) {
		s.applyBrowserHeaders(r)
		startedAt := time.Now()
		fromURL := r.Ctx.Get("from_url")
		if fromURL == "" {
			fromURL = "seed"
		}
		s.mu.Lock()
		s.requestSeq++
		reqID := s.requestSeq
		s.activeURL = r.URL.String()
		s.activeSince = startedAt
		s.lastEvent = startedAt
		s.mu.Unlock()
		r.Ctx.Put("request_id", strconv.Itoa(reqID))
		r.Ctx.Put("started_at_unix_nano", strconv.FormatInt(startedAt.UnixNano(), 10))
		logx.Eventf(
			"visit",
			"#%d depth=%d attempt=%d from=%s to=%s",
			reqID,
			r.Depth,
			s.retryAttempt(r.Ctx)+1,
			s.debugURLName(fromURL),
			r.URL.String(),
		)
	})
	// OnError: count the failure, clear the active-request state, and
	// schedule a retry when the status code looks transient.
	s.collector.OnError(func(r *colly.Response, err error) {
		if r == nil || r.Request == nil {
			logx.Eventf("error", "request failed before response: %v", err)
			return
		}
		duration := s.requestDuration(r.Request)
		s.mu.Lock()
		s.failed++
		s.activeURL = ""
		s.activeSince = time.Time{}
		s.lastEvent = time.Now()
		s.mu.Unlock()
		logx.Eventf(
			"error",
			"#%s status=%d after=%s url=%s: %v",
			r.Request.Ctx.Get("request_id"),
			r.StatusCode,
			s.durationString(duration),
			r.Request.URL.String(),
			err,
		)
		if s.shouldRetry(r.StatusCode) {
			s.mu.Lock()
			s.retried++
			s.mu.Unlock()
			s.retryRequest(r, err)
		}
	})
	// OnResponse: parse the page, classify it as effect/item/other, store
	// the parsed model keyed by URL, and queue any effect links found on
	// item pages.
	s.collector.OnResponse(func(r *colly.Response) {
		if r.StatusCode >= 400 {
			logx.Eventf("skip", "#%s status=%d url=%s", r.Request.Ctx.Get("request_id"), r.StatusCode, r.Request.URL.String())
			return
		}
		doc, err := goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
		if err != nil {
			logx.Eventf("warn", "parse error for %s: %v", r.Request.URL.String(), err)
			return
		}
		// Fandom wikis title pages with h1.page-header__title; fall back
		// to the first h1 when that class is absent.
		title := s.clean(doc.Find("h1.page-header__title").First().Text())
		if title == "" {
			title = s.clean(doc.Find("h1").First().Text())
		}
		if title == "" {
			logx.Eventf("warn", "empty title for %s", r.Request.URL.String())
			return
		}
		pageURL := r.Request.URL.String()
		pagePath := r.Request.URL.Path
		categories := s.parseCategories(doc)
		lcCats := s.lowerSlice(categories)
		pageKind := s.pageKindForPath(pagePath)
		switch {
		case s.isEffectPage(pagePath, title, lcCats):
			effect := s.parseEffectPage(doc, title, pageURL, categories)
			if effect.Name != "" {
				s.mu.Lock()
				s.effects[effect.URL] = &effect
				s.mu.Unlock()
				pageKind = "effect"
				logx.Eventf("parsed", "#%s effect=%q", r.Request.Ctx.Get("request_id"), effect.Name)
			}
		case s.isLikelyItemPage(pagePath, title, lcCats, doc):
			item := s.parseItemPage(doc, title, pageURL, categories)
			s.mu.Lock()
			s.items[item.URL] = &item
			s.mu.Unlock()
			pageKind = "item"
			logx.Eventf(
				"parsed",
				"#%s item=%q effects=%d recipes=%d",
				r.Request.Ctx.Get("request_id"),
				item.Name,
				len(item.Effects),
				len(item.Recipes),
			)
			// Follow the item's effect links so referenced effect pages
			// also get scraped.
			for _, effectLink := range item.EffectLinks {
				link := s.absoluteWikiURL(effectLink)
				if link == "" {
					continue
				}
				s.queueVisit(pageURL, link)
			}
		default:
			logx.Eventf("skip", "#%s page=%q kind=%s", r.Request.Ctx.Get("request_id"), title, pageKind)
		}
		logx.Eventf(
			"recv",
			"#%s status=%d bytes=%d kind=%s title=%q after=%s",
			r.Request.Ctx.Get("request_id"),
			r.StatusCode,
			len(r.Body),
			pageKind,
			title,
			s.durationString(s.requestDuration(r.Request)),
		)
	})
	// OnScraped: final accounting once all callbacks for a page finished.
	s.collector.OnScraped(func(r *colly.Response) {
		s.mu.Lock()
		s.completed++
		s.activeURL = ""
		s.activeSince = time.Time{}
		s.lastEvent = time.Now()
		doneCount := s.completed
		queueLen := len(s.queued)
		s.mu.Unlock()
		logx.Eventf("done", "#%s total=%d queued=%d url=%s", r.Request.Ctx.Get("request_id"), doneCount, queueLen, r.Request.URL.String())
	})
	// OnHTML: discover candidate links inside article tables; queue them
	// when the source page allows it, or when the link looks like an
	// effect page.
	s.collector.OnHTML(".mw-parser-output table a[href]", func(e *colly.HTMLElement) {
		href := e.Attr("href")
		link := e.Request.AbsoluteURL(href)
		if !s.shouldVisit(link) {
			return
		}
		if s.shouldQueueFromPage(e.Request.URL.Path, link) && s.shouldQueueTableLink(e) {
			s.queueVisit(e.Request.URL.String(), link)
			return
		}
		if s.looksLikeEffectLink(href, e.Text) {
			s.queueVisit(e.Request.URL.String(), link)
		}
	})
}
// startStatusLoop runs a background goroutine that logs a progress line
// every ProgressEvery until done is closed.
func (s *Scraper) startStatusLoop(done <-chan struct{}) {
	go func() {
		ticker := time.NewTicker(s.cfg.ProgressEvery)
		defer ticker.Stop()
		for {
			select {
			case <-done:
				return
			case <-ticker.C:
				// Snapshot all shared state (and advance the spinner,
				// which mutates spinnerIndex) under one lock acquisition.
				s.mu.Lock()
				queueLen := len(s.queued)
				itemLen := len(s.items)
				effectLen := len(s.effects)
				currentURL := s.activeURL
				currentName := "-"
				// NOTE(review): when idle, activeSince is the zero time, so
				// this Since() is huge — it is discarded below; confirm.
				currentFor := s.durationString(time.Since(s.activeSince))
				idleFor := s.durationString(time.Since(s.lastEvent))
				completedCount := s.completed
				failedCount := s.failed
				retriedCount := s.retried
				frame := s.spinnerFrame()
				s.mu.Unlock()
				if currentURL == "" {
					currentFor = "-"
				} else {
					currentName = s.debugURLName(currentURL)
				}
				logx.Statusf(
					frame,
					"queued=%d completed=%d failed=%d retries=%d items=%d effects=%d active=%s active_for=%s idle=%s",
					queueLen,
					completedCount,
					failedCount,
					retriedCount,
					itemLen,
					effectLen,
					currentName,
					currentFor,
					idleFor,
				)
			}
		}
	}()
}
// queueVisit enqueues toURL for crawling unless it is empty, already
// queued, or the queue budget is exhausted. fromURL is carried in the
// request context purely for log attribution.
func (s *Scraper) queueVisit(fromURL, toURL string) {
	if toURL == "" {
		return
	}
	s.mu.Lock()
	if s.queued[toURL] {
		s.mu.Unlock()
		return
	}
	if len(s.queued) >= s.cfg.MaxQueuedPages {
		s.mu.Unlock()
		logx.Eventf("skip", "queue budget reached from=%s to=%s", s.debugURLName(fromURL), toURL)
		return
	}
	s.queued[toURL] = true
	pending := len(s.queued)
	s.mu.Unlock()
	ctx := colly.NewContext()
	ctx.Put("from_url", fromURL)
	logx.Eventf("queue", "%d from=%s to=%s", pending, s.debugURLName(fromURL), toURL)
	if err := s.collector.Request("GET", toURL, nil, ctx, nil); err != nil {
		logx.Eventf("warn", "queue failed from=%s to=%s: %v", s.debugURLName(fromURL), toURL, err)
	}
}
// spinnerFrame returns the next configured spinner glyph, falling back
// to a default when no frames are configured. It mutates spinnerIndex,
// so callers are expected to hold s.mu.
func (s *Scraper) spinnerFrame() string {
	frames := s.cfg.SpinnerFrames
	if len(frames) == 0 {
		return "🌀"
	}
	next := frames[s.spinnerIndex%len(frames)]
	s.spinnerIndex++
	return next
}
// shouldRetry reports whether a status code represents a transient
// failure worth another attempt: no response at all (0), timeout (408),
// too early (425), throttling (429), or any server-side 5xx.
func (s *Scraper) shouldRetry(statusCode int) bool {
	switch statusCode {
	case 0, 408, 425, 429:
		return true
	}
	return statusCode >= 500
}
// retryRequest re-issues a failed request with exponential backoff until
// MaxRetries attempts have been made. The attempt counter travels in the
// request context so it survives across colly's Retry.
func (s *Scraper) retryRequest(r *colly.Response, err error) {
	attempt := s.retryAttempt(r.Request.Ctx)
	if attempt >= s.cfg.MaxRetries {
		logx.Eventf("giveup", "url=%s attempts=%d: %v", r.Request.URL.String(), attempt, err)
		return
	}
	attempt++
	r.Request.Ctx.Put("retry_count", strconv.Itoa(attempt))
	wait := s.retryDelay(attempt)
	logx.Eventf("retry", "%d/%d %s after %s", attempt, s.cfg.MaxRetries, r.Request.URL.String(), wait)
	// NOTE(review): this sleep runs inside the OnError callback; with
	// Parallelism=1 it stalls the crawl for the whole backoff — presumably
	// intended for a polite scraper, but confirm.
	time.Sleep(wait)
	if retryErr := r.Request.Retry(); retryErr != nil {
		logx.Eventf("error", "retry failed for %s: %v (original error: %v)", r.Request.URL.String(), retryErr, err)
	}
}
// retryAttempt reads the retry counter stored in a request context,
// defaulting to zero when the key is absent or the value is malformed.
func (s *Scraper) retryAttempt(ctx *colly.Context) int {
	raw := ctx.Get("retry_count")
	if raw == "" {
		return 0
	}
	parsed, err := strconv.Atoi(raw)
	if err != nil {
		return 0
	}
	return parsed
}
// retryDelay computes the backoff before retry number attempt: the base
// delay doubled per attempt (attempt 1 waits RetryBaseDelay, attempt 2
// twice that, ...) plus up to 500ms of random jitter.
func (s *Scraper) retryDelay(attempt int) time.Duration {
	// Guard against attempt < 1: 1<<(attempt-1) with a negative shift
	// amount panics at runtime. Current callers always pass >= 1, but
	// clamping keeps this safe for any caller.
	if attempt < 1 {
		attempt = 1
	}
	backoff := s.cfg.RetryBaseDelay * time.Duration(1<<(attempt-1))
	return backoff + s.jitter(500*time.Millisecond)
}
// jitter returns a uniformly random duration in [0, bound); a
// non-positive bound yields zero.
func (s *Scraper) jitter(bound time.Duration) time.Duration {
	if bound <= 0 {
		return 0
	}
	return time.Duration(rand.Int63n(int64(bound)))
}
// requestDuration recovers the elapsed time for a request from the start
// timestamp stashed in its context by OnRequest; it returns zero when the
// timestamp is missing or unparseable.
func (s *Scraper) requestDuration(r *colly.Request) time.Duration {
	if raw := r.Ctx.Get("started_at_unix_nano"); raw != "" {
		if startNanos, parseErr := strconv.ParseInt(raw, 10, 64); parseErr == nil {
			return time.Since(time.Unix(0, startNanos))
		}
	}
	return 0
}
// durationString renders a duration rounded to the nearest 100ms, using
// "0s" for zero or negative values.
func (s *Scraper) durationString(d time.Duration) string {
	if d > 0 {
		return d.Round(100 * time.Millisecond).String()
	}
	return "0s"
}

4
outward_data.json Normal file
View File

@@ -0,0 +1,4 @@
{
"items": [],
"effects": []
}