commit 42e2083ecefd9d6735c5fa54d9d8dd3fa3b068ae Author: Daniel Legt Date: Sun Mar 15 16:42:43 2026 +0200 Initial COmmit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..212e961 --- /dev/null +++ b/.gitignore @@ -0,0 +1,34 @@ +*.exe +*.exe~ +*.dll +*.so +*.dylib + +# Test binary, built with `go test -c` +*.test + +# Output of the go coverage tool, specifically when used with LiteIDE +*.out + +# Dependency directories (remove the comment below to include it) +# vendor/ + +# Go workspace file +go.work + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +# End of https://www.toptal.com/developers/gitignore/api/go,linux diff --git a/README.md b/README.md new file mode 100644 index 0000000..ef988d0 --- /dev/null +++ b/README.md @@ -0,0 +1,40 @@ +# Scrappr + +Small Go scraper for the Outward Fandom wiki. + +## Layout + +```text +. 
+├── cmd/scrappr/main.go # binary entrypoint +├── internal/app # bootstrapping and output writing +├── internal/logx # colored emoji logger +├── internal/model # dataset models +├── internal/scraper # crawl flow, parsing, queueing, retries +├── go.mod +├── go.sum +└── outward_data.json # generated output +``` + +## Run + +```bash +go run ./cmd/scrappr +``` + +## What It Does + +- Crawls item and crafting pages from `outward.fandom.com` +- Uses browser-like headers and rotating user agents +- Limits crawl depth and queue size to avoid drifting into junk pages +- Retries temporary failures with short backoff +- Prints colored emoji logs for queueing, requests, responses, parsing, retries, and periodic status +- Writes a stable, sorted JSON dataset to `outward_data.json` + +## Tuning + +Scraper defaults live in `internal/scraper/config.go`. + +- Lower or raise `RequestDelay` / `RequestJitter` +- Tighten or relax `MaxQueuedPages` +- Adjust `RequestTimeout`, `MaxRetries`, and `ProgressEvery` diff --git a/cmd/scrappr/main.go b/cmd/scrappr/main.go new file mode 100644 index 0000000..5b7c136 --- /dev/null +++ b/cmd/scrappr/main.go @@ -0,0 +1,15 @@ +package main + +import ( + "os" + + "scrappr/internal/app" + "scrappr/internal/logx" +) + +func main() { + if err := app.Run(); err != nil { + logx.Eventf("error", "fatal: %v", err) + os.Exit(1) + } +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..c0dab22 --- /dev/null +++ b/go.mod @@ -0,0 +1,27 @@ +module scrappr + +go 1.25.5 + +require ( + github.com/PuerkitoBio/goquery v1.11.0 + github.com/gocolly/colly/v2 v2.3.0 +) + +require ( + github.com/andybalholm/cascadia v1.3.3 // indirect + github.com/antchfx/htmlquery v1.3.5 // indirect + github.com/antchfx/xmlquery v1.5.0 // indirect + github.com/antchfx/xpath v1.3.5 // indirect + github.com/bits-and-blooms/bitset v1.24.4 // indirect + github.com/gobwas/glob v0.2.3 // indirect + github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect + 
github.com/golang/protobuf v1.5.4 // indirect + github.com/kennygrant/sanitize v1.2.4 // indirect + github.com/nlnwa/whatwg-url v0.6.2 // indirect + github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect + github.com/temoto/robotstxt v1.1.2 // indirect + golang.org/x/net v0.47.0 // indirect + golang.org/x/text v0.31.0 // indirect + google.golang.org/appengine v1.6.8 // indirect + google.golang.org/protobuf v1.36.10 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..a0a600e --- /dev/null +++ b/go.sum @@ -0,0 +1,123 @@ +github.com/PuerkitoBio/goquery v1.11.0 h1:jZ7pwMQXIITcUXNH83LLk+txlaEy6NVOfTuP43xxfqw= +github.com/PuerkitoBio/goquery v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHfqyHre6VMY4DQ= +github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= +github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= +github.com/antchfx/htmlquery v1.3.5 h1:aYthDDClnG2a2xePf6tys/UyyM/kRcsFRm+ifhFKoU0= +github.com/antchfx/htmlquery v1.3.5/go.mod h1:5oyIPIa3ovYGtLqMPNjBF2Uf25NPCKsMjCnQ8lvjaoA= +github.com/antchfx/xmlquery v1.5.0 h1:uAi+mO40ZWfyU6mlUBxRVvL6uBNZ6LMU4M3+mQIBV4c= +github.com/antchfx/xmlquery v1.5.0/go.mod h1:lJfWRXzYMK1ss32zm1GQV3gMIW/HFey3xDZmkP1SuNc= +github.com/antchfx/xpath v1.3.5 h1:PqbXLC3TkfeZyakF5eeh3NTWEbYl4VHNVeufANzDbKQ= +github.com/antchfx/xpath v1.3.5/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= +github.com/bits-and-blooms/bitset v1.20.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= +github.com/bits-and-blooms/bitset v1.24.4 h1:95H15Og1clikBrKr/DuzMXkQzECs1M6hhoGXLwLQOZE= +github.com/bits-and-blooms/bitset v1.24.4/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= +github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= 
+github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= +github.com/gocolly/colly/v2 v2.3.0 h1:HSFh0ckbgVd2CSGRE+Y/iA4goUhGROJwyQDCMXGFBWM= +github.com/gocolly/colly/v2 v2.3.0/go.mod h1:Qp54s/kQbwCQvFVx8KzKCSTXVJ1wWT4QeAKEu33x1q8= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 h1:f+oWsMOmNPc8JmEHVZIycC7hBoQxHH9pNKQORJNozsQ= +github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8/go.mod h1:wcDNUvekVysuuOpQKo3191zZyTpiI6se1N1ULghS0sw= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o= +github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak= +github.com/nlnwa/whatwg-url v0.6.2 h1:jU61lU2ig4LANydbEJmA2nPrtCGiKdtgT0rmMd2VZ/Q= +github.com/nlnwa/whatwg-url v0.6.2/go.mod h1:x0FPXJzzOEieQtsBT/AKvbiBbQ46YlL6Xa7m02M1ECk= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA= +github.com/saintfish/chardet 
v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= +github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= +golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= +golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= +golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= +golang.org/x/crypto v0.32.0/go.mod h1:ZnnJkOaASj8g0AjIduWNlq2NRxL0PlBrbKVyZ6V/Ugc= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod 
h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= +golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= +golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= +golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k= +golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= +golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= +golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.29.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= +golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= +golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= +golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= +golang.org/x/term v0.28.0/go.mod h1:Sw/lC2IAUZ92udQNf3WodGtn4k/XoLyZoh8v/8uiwek= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.13.0/go.mod 
h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= +golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= +golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/appengine v1.6.8 h1:IhEN5q69dyKagZPYMSdIjS2HqprW324FRQZJcGqPAsM= +google.golang.org/appengine v1.6.8/go.mod h1:1jJ3jBArFh5pcgW8gCtRJnepW8FzD1V44FJffLiz/Ds= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE= +google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= diff --git a/internal/app/run.go b/internal/app/run.go new file mode 100644 index 0000000..20b413a --- /dev/null +++ b/internal/app/run.go @@ -0,0 +1,56 @@ +package app + +import ( + "encoding/json" + 
"math/rand" + "os" + "time" + + "scrappr/internal/logx" + "scrappr/internal/model" + "scrappr/internal/scraper" +) + +const outputPath = "outward_data.json" + +func Run() error { + rand.Seed(time.Now().UnixNano()) + + logx.Eventf("start", "Outward scraper booting") + + cfg := scraper.DefaultConfig() + s := scraper.New(cfg) + + dataset, err := s.Run() + if err != nil { + return err + } + + logx.Eventf("write", "writing dataset to %s", outputPath) + if err := writeDataset(outputPath, dataset); err != nil { + return err + } + + logx.Eventf( + "success", + "wrote %d items and %d effects to %s", + len(dataset.Items), + len(dataset.Effects), + outputPath, + ) + + return nil +} + +func writeDataset(outputPath string, dataset model.Dataset) error { + file, err := os.Create(outputPath) + if err != nil { + return err + } + defer file.Close() + + encoder := json.NewEncoder(file) + encoder.SetIndent("", " ") + + return encoder.Encode(dataset) +} diff --git a/internal/logx/logx.go b/internal/logx/logx.go new file mode 100644 index 0000000..8f459b7 --- /dev/null +++ b/internal/logx/logx.go @@ -0,0 +1,77 @@ +package logx + +import ( + "fmt" + "strings" + "sync" + "time" +) + +const ( + colorReset = "\033[0m" + colorBlue = "\033[38;5;39m" + colorCyan = "\033[38;5;45m" + colorGreen = "\033[38;5;42m" + colorYellow = "\033[38;5;220m" + colorRed = "\033[38;5;196m" + colorGray = "\033[38;5;244m" + colorPink = "\033[38;5;213m" +) + +type style struct { + emoji string + label string + color string +} + +var ( + mu = sync.Mutex{} + + styles = map[string]style{ + "start": {emoji: "🚀", label: "START", color: colorBlue}, + "queue": {emoji: "📥", label: "QUEUE", color: colorCyan}, + "visit": {emoji: "🌐", label: "VISIT", color: colorBlue}, + "recv": {emoji: "📦", label: "RECV", color: colorCyan}, + "parsed": {emoji: "🧠", label: "PARSED", color: colorPink}, + "status": {emoji: "🌀", label: "STATUS", color: colorYellow}, + "done": {emoji: "✅", label: "DONE", color: colorGreen}, + "write": {emoji: 
"💾", label: "WRITE", color: colorBlue}, + "skip": {emoji: "⏭️", label: "SKIP", color: colorGray}, + "warn": {emoji: "⚠️", label: "WARN", color: colorYellow}, + "error": {emoji: "💥", label: "ERROR", color: colorRed}, + "retry": {emoji: "🔁", label: "RETRY", color: colorYellow}, + "giveup": {emoji: "🛑", label: "GIVEUP", color: colorRed}, + "success": {emoji: "🎉", label: "SUCCESS", color: colorGreen}, + } +) + +func Eventf(kind, format string, args ...any) { + st, ok := styles[kind] + if !ok { + st = style{emoji: "•", label: strings.ToUpper(kind), color: colorGray} + } + + write(st, fmt.Sprintf(format, args...)) +} + +func Statusf(frame, format string, args ...any) { + st := styles["status"] + st.emoji = frame + write(st, fmt.Sprintf(format, args...)) +} + +func write(st style, message string) { + mu.Lock() + defer mu.Unlock() + + timestamp := time.Now().Format("15:04:05") + fmt.Printf( + "%s[%s] %s %-7s %s%s\n", + st.color, + timestamp, + st.emoji, + st.label, + message, + colorReset, + ) +} diff --git a/internal/model/types.go b/internal/model/types.go new file mode 100644 index 0000000..f21e61c --- /dev/null +++ b/internal/model/types.go @@ -0,0 +1,33 @@ +package model + +type Recipe struct { + Result string `json:"result"` + ResultCount string `json:"result_count,omitempty"` + Ingredients []string `json:"ingredients,omitempty"` + Station string `json:"station,omitempty"` + SourcePage string `json:"source_page,omitempty"` +} + +type Item struct { + Name string `json:"name"` + URL string `json:"url"` + Categories []string `json:"categories,omitempty"` + Infobox map[string]string `json:"infobox,omitempty"` + Effects []string `json:"effects,omitempty"` + EffectLinks []string `json:"effect_links,omitempty"` + Recipes []Recipe `json:"recipes,omitempty"` + Description string `json:"description,omitempty"` +} + +type Effect struct { + Name string `json:"name"` + URL string `json:"url"` + Categories []string `json:"categories,omitempty"` + Infobox map[string]string 
`json:"infobox,omitempty"` + Description string `json:"description,omitempty"` +} + +type Dataset struct { + Items []Item `json:"items"` + Effects []Effect `json:"effects"` +} diff --git a/internal/scraper/config.go b/internal/scraper/config.go new file mode 100644 index 0000000..7f8afc5 --- /dev/null +++ b/internal/scraper/config.go @@ -0,0 +1,102 @@ +package scraper + +import ( + "regexp" + "time" +) + +type Config struct { + BaseURL string + AllowedDomain string + MaxDepth int + MaxRetries int + MaxQueuedPages int + RequestDelay time.Duration + RequestJitter time.Duration + RequestTimeout time.Duration + RetryBaseDelay time.Duration + ProgressEvery time.Duration + BrowserReferrer string + BrowserAgents []string + ItemSeeds []string + CraftingSeeds []string + IgnoredPrefixes []string + IgnoredExact map[string]bool + ItemListPathRe *regexp.Regexp + CraftingPathRe *regexp.Regexp + AmountPrefixRe *regexp.Regexp + WhitespaceRe *regexp.Regexp + SpinnerFrames []string +} + +func DefaultConfig() Config { + baseURL := "https://outward.fandom.com" + + return Config{ + BaseURL: baseURL, + AllowedDomain: "outward.fandom.com", + MaxDepth: 3, + MaxRetries: 2, + MaxQueuedPages: 1500, + RequestDelay: 650 * time.Millisecond, + RequestJitter: 350 * time.Millisecond, + RequestTimeout: 8 * time.Second, + RetryBaseDelay: 1200 * time.Millisecond, + ProgressEvery: 3 * time.Second, + BrowserReferrer: baseURL + "/", + BrowserAgents: []string{ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36", + }, + ItemSeeds: []string{ + baseURL + "/wiki/Items/Weapons", + baseURL + "/wiki/Items/Equipment", + baseURL + "/wiki/Items/Consumables", + baseURL + "/wiki/Items/Ingredients", + baseURL + 
"/wiki/Items/Deployables", + baseURL + "/wiki/Items/Other", + baseURL + "/wiki/Items/Item_Values", + }, + CraftingSeeds: []string{ + baseURL + "/wiki/Crafting", + baseURL + "/wiki/Crafting/Survival", + baseURL + "/wiki/Crafting/Cooking", + baseURL + "/wiki/Crafting/Alchemy", + }, + IgnoredPrefixes: []string{ + "/wiki/File:", + "/wiki/Category:", + "/wiki/Special:", + "/wiki/Help:", + "/wiki/Template:", + "/wiki/User:", + "/wiki/User_blog:", + "/wiki/Forum:", + "/wiki/Message_Wall:", + "/wiki/Thread:", + "/wiki/Map:", + }, + IgnoredExact: map[string]bool{ + "/wiki/Outward_Wiki": true, + "/wiki/Items": true, + "/wiki/Crafting": false, + }, + ItemListPathRe: regexp.MustCompile(`^/wiki/Items(?:/|$)`), + CraftingPathRe: regexp.MustCompile(`^/wiki/Crafting(?:/|$)`), + AmountPrefixRe: regexp.MustCompile(`^\s*(\d+x)\s+`), + WhitespaceRe: regexp.MustCompile(`\s+`), + SpinnerFrames: []string{ + "⠋", + "⠙", + "⠹", + "⠸", + "⠼", + "⠴", + "⠦", + "⠧", + "⠇", + "⠏", + }, + } +} diff --git a/internal/scraper/parse.go b/internal/scraper/parse.go new file mode 100644 index 0000000..b877afe --- /dev/null +++ b/internal/scraper/parse.go @@ -0,0 +1,487 @@ +package scraper + +import ( + "math/rand" + "net/url" + "path" + "sort" + "strings" + + "github.com/PuerkitoBio/goquery" + "github.com/gocolly/colly/v2" + + "scrappr/internal/model" +) + +func (s *Scraper) shouldVisit(raw string) bool { + if raw == "" { + return false + } + + parsed, err := url.Parse(raw) + if err != nil { + return false + } + + if parsed.Host != s.cfg.AllowedDomain { + return false + } + + if !strings.HasPrefix(parsed.Path, "/wiki/") { + return false + } + + if s.cfg.IgnoredExact[parsed.Path] { + return false + } + + for _, prefix := range s.cfg.IgnoredPrefixes { + if strings.HasPrefix(parsed.Path, prefix) { + return false + } + } + + return true +} + +func (s *Scraper) applyBrowserHeaders(r *colly.Request) { + r.Headers.Set("User-Agent", s.randomUserAgent()) + r.Headers.Set("Accept", 
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8") + r.Headers.Set("Accept-Language", "en-US,en;q=0.9") + r.Headers.Set("Cache-Control", "max-age=0") + r.Headers.Set("DNT", "1") + r.Headers.Set("Sec-Fetch-Dest", "document") + r.Headers.Set("Sec-Fetch-Mode", "navigate") + r.Headers.Set("Sec-Fetch-Site", "same-origin") + r.Headers.Set("Sec-Fetch-User", "?1") + r.Headers.Set("Upgrade-Insecure-Requests", "1") + + if r.Headers.Get("Referer") == "" { + r.Headers.Set("Referer", s.cfg.BrowserReferrer) + } +} + +func (s *Scraper) randomUserAgent() string { + return s.cfg.BrowserAgents[rand.Intn(len(s.cfg.BrowserAgents))] +} + +func (s *Scraper) shouldQueueFromPage(fromPath, toURL string) bool { + parsed, err := url.Parse(toURL) + if err != nil { + return false + } + + toPath := parsed.Path + + if s.cfg.ItemListPathRe.MatchString(fromPath) { + return strings.HasPrefix(toPath, "/wiki/") && + !s.cfg.ItemListPathRe.MatchString(toPath) && + !s.cfg.CraftingPathRe.MatchString(toPath) + } + + if s.cfg.CraftingPathRe.MatchString(fromPath) { + return strings.HasPrefix(toPath, "/wiki/") && + !s.cfg.CraftingPathRe.MatchString(toPath) + } + + return false +} + +func (s *Scraper) shouldQueueTableLink(e *colly.HTMLElement) bool { + label := s.clean(e.Text) + if label == "" { + return false + } + + cell := e.DOM.Closest("td, th") + if cell.Length() == 0 { + return false + } + + columnIndex := 0 + for prev := cell.Prev(); prev.Length() > 0; prev = prev.Prev() { + switch goquery.NodeName(prev) { + case "td", "th": + columnIndex++ + } + } + + return columnIndex <= 1 +} + +func (s *Scraper) pageKindForPath(pagePath string) string { + switch { + case s.cfg.ItemListPathRe.MatchString(pagePath): + return "item-list" + case s.cfg.CraftingPathRe.MatchString(pagePath): + return "crafting" + default: + return "article" + } +} + +func (s *Scraper) isLikelyItemPage(pagePath, title string, lcCats []string, doc *goquery.Document) bool { + if 
s.cfg.ItemListPathRe.MatchString(pagePath) || s.cfg.CraftingPathRe.MatchString(pagePath) { + return false + } + if strings.Contains(strings.ToLower(title), "/") { + return false + } + if doc.Find("aside.portable-infobox").Length() == 0 { + return false + } + + for _, category := range lcCats { + if strings.Contains(category, "items") || strings.Contains(category, "food") || + strings.Contains(category, "consumables") || strings.Contains(category, "ingredients") || + strings.Contains(category, "equipment") || strings.Contains(category, "weapons") || + strings.Contains(category, "deployables") { + return true + } + } + + infoboxText := strings.ToLower(s.clean(doc.Find("aside.portable-infobox").Text())) + return strings.Contains(infoboxText, "item details") || + strings.Contains(infoboxText, "consumable details") || + strings.Contains(infoboxText, "equipment details") || + strings.Contains(infoboxText, "weapon") +} + +func (s *Scraper) isEffectPage(pagePath, title string, lcCats []string) bool { + if strings.Contains(strings.ToLower(title), "effect") { + return true + } + + for _, category := range lcCats { + if strings.Contains(category, "effects") { + return true + } + } + + return strings.Contains(strings.ToLower(pagePath), "/effect") +} + +func (s *Scraper) parseItemPage(doc *goquery.Document, title, pageURL string, categories []string) model.Item { + item := model.Item{ + Name: title, + URL: pageURL, + Categories: categories, + Infobox: s.parsePortableInfobox(doc), + } + + item.Description = s.parseDescription(doc) + item.Effects, item.EffectLinks = s.parseEffectsSection(doc) + item.Recipes = s.parseRecipesFromPage(doc, title) + + return item +} + +func (s *Scraper) parseEffectPage(doc *goquery.Document, title, pageURL string, categories []string) model.Effect { + return model.Effect{ + Name: title, + URL: pageURL, + Categories: categories, + Infobox: s.parsePortableInfobox(doc), + Description: s.parseDescription(doc), + } +} + +func (s *Scraper) 
parsePortableInfobox(doc *goquery.Document) map[string]string { + out := map[string]string{} + + doc.Find("aside.portable-infobox .pi-item").Each(func(_ int, section *goquery.Selection) { + label := s.clean(section.Find(".pi-data-label").First().Text()) + value := s.clean(section.Find(".pi-data-value").First().Text()) + + if label == "" { + label = s.clean(section.Find("h3").First().Text()) + } + if value == "" { + value = s.clean(section.Find("div").Last().Text()) + } + + if label != "" && value != "" { + out[label] = value + } + }) + + return out +} + +func (s *Scraper) parseCategories(doc *goquery.Document) []string { + var categories []string + seen := map[string]bool{} + + doc.Find(".page-header__categories a, .category-page__member-link, .wds-tabs__tab-label a").Each(func(_ int, node *goquery.Selection) { + text := s.clean(node.Text()) + if text != "" && !seen[text] { + seen[text] = true + categories = append(categories, text) + } + }) + + if len(categories) == 0 { + headerText := s.clean(doc.Find("body").Text()) + if idx := strings.Index(headerText, "in:"); idx >= 0 { + line := strings.TrimPrefix(headerText[idx:], "in:") + for _, part := range strings.Split(line, ",") { + part = s.clean(part) + if part != "" && !seen[part] { + seen[part] = true + categories = append(categories, part) + } + } + } + } + + return categories +} + +func (s *Scraper) parseDescription(doc *goquery.Document) string { + var description string + + doc.Find(".mw-parser-output > p").EachWithBreak(func(_ int, paragraph *goquery.Selection) bool { + text := s.clean(paragraph.Text()) + if text == "" { + return true + } + + lower := strings.ToLower(text) + if strings.Contains(lower, "is a") || strings.Contains(lower, "is an") || len(text) > 30 { + description = text + return false + } + + return true + }) + + return description +} + +func (s *Scraper) parseEffectsSection(doc *goquery.Document) ([]string, []string) { + var effects []string + var effectLinks []string + + seenText := 
// NOTE(review): this chunk is a collapsed git-diff dump. Residual diff "+"
// prefixes have been dropped and the Go code reformatted; diff header lines
// between files are preserved verbatim. The first statements below are the
// tail of an effects-parsing method whose signature lies before this chunk.

	// seenText / seenLink deduplicate effect strings and wiki hrefs while
	// walking the "Effects" section of the current page.
	map[string]bool{}
	seenLink := map[string]bool{}
	section := s.findSection(doc, "Effects")
	if section.Length() == 0 {
		// No "Effects" heading on this page: nothing to collect.
		return effects, effectLinks
	}

	// Scan every element between the "Effects" heading and the next heading.
	s.walkSectionUntilNextHeading(section, func(node *goquery.Selection) {
		node.Find("li").Each(func(_ int, item *goquery.Selection) {
			text := s.clean(item.Text())
			if text != "" && !seenText[text] {
				seenText[text] = true
				effects = append(effects, text)
			}

			// Collect internal wiki links inside the list item that look like
			// links to effect pages (heuristic in looksLikeEffectLink).
			item.Find("a[href^='/wiki/']").Each(func(_ int, anchor *goquery.Selection) {
				href, _ := anchor.Attr("href")
				label := s.clean(anchor.Text())
				if href != "" && label != "" && s.looksLikeEffectLink(href, label) && !seenLink[href] {
					seenLink[href] = true
					effectLinks = append(effectLinks, href)
				}
			})
		})
	})

	return effects, effectLinks
}

// parseRecipesFromPage extracts crafting recipes from any table on the page
// whose header row mentions both "result" and "ingredient". pageTitle is
// recorded on each recipe as its source page.
func (s *Scraper) parseRecipesFromPage(doc *goquery.Document, pageTitle string) []model.Recipe {
	var recipes []model.Recipe

	doc.Find("table").Each(func(_ int, table *goquery.Selection) {
		// Identify recipe tables by keywords in the first row's text.
		headerText := strings.ToLower(s.clean(table.Find("tr").First().Text()))
		if !(strings.Contains(headerText, "result") && strings.Contains(headerText, "ingredient")) {
			return
		}

		table.Find("tr").Each(func(i int, row *goquery.Selection) {
			if i == 0 {
				// Skip the header row.
				return
			}

			cells := row.Find("th, td")
			if cells.Length() < 2 {
				return
			}

			resultCell := cells.Eq(0)
			ingredientCell := cells.Eq(1)
			// Eq(2) on a 2-column row yields an empty selection, so Station
			// simply ends up "" when the table has no station column.
			stationCell := cells.Eq(2)

			resultText := s.clean(resultCell.Text())
			if resultText == "" {
				return
			}

			// Split a leading quantity (e.g. "3 x Bandage") off the result
			// name using the configured AmountPrefixRe pattern.
			resultCount := ""
			if matches := s.cfg.AmountPrefixRe.FindStringSubmatch(resultText); len(matches) > 1 {
				resultCount = matches[1]
				resultText = s.clean(strings.TrimPrefix(resultText, matches[0]))
			}

			// Preferred path: one ingredient per <li> element.
			var ingredients []string
			ingredientCell.Find("li").Each(func(_ int, ingredient *goquery.Selection) {
				text := s.clean(ingredient.Text())
				if text != "" {
					ingredients = append(ingredients, text)
				}
			})

			// Fallback: no list markup, so split the raw cell text by lines.
			if len(ingredients) == 0 {
				for _, line := range s.splitLines(ingredientCell.Text()) {
					if line != "" {
						ingredients = append(ingredients, line)
					}
				}
			}

			recipes = append(recipes, model.Recipe{
				Result:      resultText,
				ResultCount: resultCount,
				Ingredients: ingredients,
				Station:     s.clean(stationCell.Text()),
				SourcePage:  pageTitle,
			})
		})
	})

	return recipes
}

// findSection returns the h2/h3/h4 heading inside the article body whose text
// equals title (case-insensitive). A trailing "[]" is trimmed first —
// presumably the residue of MediaWiki's "[edit]" link after clean(); TODO
// confirm against live page markup. Returns an empty selection (Length()==0)
// when the heading is not found; callers only check Length().
func (s *Scraper) findSection(doc *goquery.Document, title string) *goquery.Selection {
	var found *goquery.Selection

	doc.Find(".mw-parser-output h2, .mw-parser-output h3, .mw-parser-output h4").EachWithBreak(func(_ int, section *goquery.Selection) bool {
		text := strings.TrimSpace(strings.TrimSuffix(s.clean(section.Text()), "[]"))
		if strings.EqualFold(text, title) {
			found = section
			return false // stop iterating on first match
		}
		return true
	})

	if found == nil {
		return &goquery.Selection{}
	}

	return found
}

// walkSectionUntilNextHeading invokes fn on each sibling element after start,
// stopping at the next h2/h3/h4 heading (i.e. the next section).
func (s *Scraper) walkSectionUntilNextHeading(start *goquery.Selection, fn func(*goquery.Selection)) {
	for section := start.Next(); section.Length() > 0; section = section.Next() {
		if goquery.NodeName(section) == "h2" || goquery.NodeName(section) == "h3" || goquery.NodeName(section) == "h4" {
			break
		}
		fn(section)
	}
}

// looksLikeEffectLink reports whether an anchor (href + label) appears to
// point at a status-effect page: either the URL/label mentions "effect", or
// the label contains one of the hard-coded known effect-name hints.
func (s *Scraper) looksLikeEffectLink(href, label string) bool {
	lowerHref := strings.ToLower(href)
	lowerLabel := strings.ToLower(label)

	if strings.Contains(lowerHref, "/wiki/") && (strings.Contains(lowerHref, "effect") || strings.Contains(lowerLabel, "effect")) {
		return true
	}

	// Known Outward status-effect keywords used as a fallback heuristic.
	knownHints := []string{
		"bleeding", "burning", "poisoned", "possessed", "discipline", "rage",
		"sapped", "scorched", "curse", "doomed", "chill", "warm", "cool",
		"barrier", "protection", "imbue", "energized", "shimmer",
	}

	for _, hint := range knownHints {
		if strings.Contains(lowerLabel, hint) {
			return true
		}
	}

	return false
}

// absoluteWikiURL converts a wiki href to an absolute URL: already-absolute
// http(s) links pass through, "/wiki/..." paths are prefixed with BaseURL,
// and anything else (external/relative junk) maps to "".
func (s *Scraper) absoluteWikiURL(href string) string {
	if href == "" {
		return ""
	}

	if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
		return href
	}

	if strings.HasPrefix(href, "/wiki/") {
		return s.cfg.BaseURL + href
	}

	return ""
}

// flattenItems snapshots the items map into a slice sorted case-insensitively
// by name for stable JSON output.
// NOTE(review): reads s.items without holding s.mu — presumably only called
// after the crawl finishes (collector.Wait), so no concurrent writers; confirm.
func (s *Scraper) flattenItems() []model.Item {
	out := make([]model.Item, 0, len(s.items))
	for _, item := range s.items {
		out = append(out, *item)
	}

	sort.Slice(out, func(i, j int) bool {
		return strings.ToLower(out[i].Name) < strings.ToLower(out[j].Name)
	})

	return out
}

// flattenEffects mirrors flattenItems for the effects map.
func (s *Scraper) flattenEffects() []model.Effect {
	out := make([]model.Effect, 0, len(s.effects))
	for _, effect := range s.effects {
		out = append(out, *effect)
	}

	sort.Slice(out, func(i, j int) bool {
		return strings.ToLower(out[i].Name) < strings.ToLower(out[j].Name)
	})

	return out
}

// clean normalizes scraped text: NBSP/newline/tab become spaces, the result
// is trimmed, and runs of whitespace are collapsed via the configured regexp.
func (s *Scraper) clean(value string) string {
	value = strings.ReplaceAll(value, "\u00a0", " ")
	value = strings.ReplaceAll(value, "\n", " ")
	value = strings.ReplaceAll(value, "\t", " ")
	value = strings.TrimSpace(value)
	return s.cfg.WhitespaceRe.ReplaceAllString(value, " ")
}

// splitLines splits raw text on newlines (after NBSP normalization), cleans
// each line, and drops empties.
func (s *Scraper) splitLines(value string) []string {
	value = strings.ReplaceAll(value, "\u00a0", " ")
	raw := strings.Split(value, "\n")
	out := make([]string, 0, len(raw))

	for _, line := range raw {
		line = s.clean(line)
		if line != "" {
			out = append(out, line)
		}
	}

	return out
}

// lowerSlice returns a new slice with every element lower-cased.
func (s *Scraper) lowerSlice(in []string) []string {
	out := make([]string, 0, len(in))
	for _, value := range in {
		out = append(out, strings.ToLower(value))
	}
	return out
}

// debugURLName returns the last path segment of raw for compact log output,
// falling back to the raw string when it does not parse as a URL.
func (s *Scraper) debugURLName(raw string) string {
	parsed, err := url.Parse(raw)
	if err != nil {
		return raw
	}
	return path.Base(parsed.Path)
}
diff --git a/internal/scraper/scraper.go b/internal/scraper/scraper.go
new file mode 100644
index 0000000..5c6cd44
--- /dev/null
+++ b/internal/scraper/scraper.go
@@ -0,0 +1,411 @@
// Package scraper implements the crawl flow: queueing, rate-limited requests,
// retries with backoff, page parsing, and periodic status logging.
package scraper

import (
	"bytes"
	"math/rand"
	"strconv"
	"sync"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/gocolly/colly/v2"
	"scrappr/internal/logx"
	"scrappr/internal/model"
)

// Scraper owns the colly collector plus all crawl state. Every mutable field
// below mu is guarded by mu, since colly runs callbacks asynchronously.
type Scraper struct {
	cfg       Config
	collector *colly.Collector

	mu           sync.Mutex
	items        map[string]*model.Item   // parsed items keyed by page URL
	effects      map[string]*model.Effect // parsed effects keyed by page URL
	queued       map[string]bool          // every URL ever queued (never shrinks)
	completed    int
	failed       int
	retried      int
	requestSeq   int    // monotonically increasing request id for log lines
	spinnerIndex int    // next spinner frame; mutated by spinnerFrame
	activeURL    string // URL currently in flight, "" when idle
	activeSince  time.Time
	lastEvent    time.Time
}

// New builds a Scraper with empty state; the collector is created in Run.
func New(cfg Config) *Scraper {
	return &Scraper{
		cfg:       cfg,
		items:     map[string]*model.Item{},
		effects:   map[string]*model.Effect{},
		queued:    map[string]bool{},
		lastEvent: time.Now(),
	}
}

// Run configures the collector (async, depth-limited, single-connection rate
// limit with jitter), registers callbacks, starts the status ticker, queues
// the item and crafting seeds, and blocks until the crawl drains. It returns
// the flattened, sorted dataset.
func (s *Scraper) Run() (model.Dataset, error) {
	s.collector = colly.NewCollector(
		colly.AllowedDomains(s.cfg.AllowedDomain),
		colly.MaxDepth(s.cfg.MaxDepth),
		colly.Async(true),
	)

	s.collector.SetRequestTimeout(s.cfg.RequestTimeout)
	// Deliver 4xx/5xx bodies to OnResponse instead of only OnError.
	s.collector.ParseHTTPErrorResponse = true

	if err := s.collector.Limit(&colly.LimitRule{
		DomainGlob:  "*" + s.cfg.AllowedDomain + "*",
		Parallelism: 1,
		Delay:       s.cfg.RequestDelay,
		RandomDelay: s.cfg.RequestJitter,
	}); err != nil {
		return model.Dataset{}, err
	}

	s.registerHandlers()

	// done is closed when Run returns (after Wait), stopping the status loop.
	done := make(chan struct{})
	defer close(done)
	s.startStatusLoop(done)

	// Concatenate both seed lists into a fresh slice so cfg is not mutated.
	for _, seed := range append(append([]string{}, s.cfg.ItemSeeds...), s.cfg.CraftingSeeds...) {
		s.queueVisit("seed", seed)
	}

	s.collector.Wait()

	return model.Dataset{
		Items:   s.flattenItems(),
		Effects: s.flattenEffects(),
	}, nil
}

// registerHandlers wires all colly callbacks: request instrumentation,
// error/retry handling, response parsing, completion bookkeeping, and
// in-table link discovery.
func (s *Scraper) registerHandlers() {
	s.collector.OnRequest(func(r *colly.Request) {
		s.applyBrowserHeaders(r)

		startedAt := time.Now()
		fromURL := r.Ctx.Get("from_url")
		if fromURL == "" {
			fromURL = "seed"
		}

		// Assign a request id and mark this URL as the active one.
		s.mu.Lock()
		s.requestSeq++
		reqID := s.requestSeq
		s.activeURL = r.URL.String()
		s.activeSince = startedAt
		s.lastEvent = startedAt
		s.mu.Unlock()

		// Stash id and start time in the request context so later callbacks
		// (OnError/OnResponse) can correlate and compute durations.
		r.Ctx.Put("request_id", strconv.Itoa(reqID))
		r.Ctx.Put("started_at_unix_nano", strconv.FormatInt(startedAt.UnixNano(), 10))

		logx.Eventf(
			"visit",
			"#%d depth=%d attempt=%d from=%s to=%s",
			reqID,
			r.Depth,
			s.retryAttempt(r.Ctx)+1,
			s.debugURLName(fromURL),
			r.URL.String(),
		)
	})

	s.collector.OnError(func(r *colly.Response, err error) {
		if r == nil || r.Request == nil {
			logx.Eventf("error", "request failed before response: %v", err)
			return
		}

		duration := s.requestDuration(r.Request)

		s.mu.Lock()
		s.failed++
		s.activeURL = ""
		s.activeSince = time.Time{}
		s.lastEvent = time.Now()
		s.mu.Unlock()

		logx.Eventf(
			"error",
			"#%s status=%d after=%s url=%s: %v",
			r.Request.Ctx.Get("request_id"),
			r.StatusCode,
			s.durationString(duration),
			r.Request.URL.String(),
			err,
		)

		if s.shouldRetry(r.StatusCode) {
			// NOTE(review): retried is incremented even when retryRequest
			// immediately gives up at MaxRetries, so the counter tracks retry
			// decisions, not successful re-queues.
			s.mu.Lock()
			s.retried++
			s.mu.Unlock()
			s.retryRequest(r, err)
		}
	})

	s.collector.OnResponse(func(r *colly.Response) {
		// HTTP error bodies arrive here because ParseHTTPErrorResponse=true.
		if r.StatusCode >= 400 {
			logx.Eventf("skip", "#%s status=%d url=%s", r.Request.Ctx.Get("request_id"), r.StatusCode, r.Request.URL.String())
			return
		}

		doc, err := goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
		if err != nil {
			logx.Eventf("warn", "parse error for %s: %v", r.Request.URL.String(), err)
			return
		}

		// Fandom-specific title first, generic <h1> as fallback.
		title := s.clean(doc.Find("h1.page-header__title").First().Text())
		if title == "" {
			title = s.clean(doc.Find("h1").First().Text())
		}
		if title == "" {
			logx.Eventf("warn", "empty title for %s", r.Request.URL.String())
			return
		}

		pageURL := r.Request.URL.String()
		pagePath := r.Request.URL.Path
		categories := s.parseCategories(doc)
		lcCats := s.lowerSlice(categories)
		pageKind := s.pageKindForPath(pagePath)

		// Classify the page and store the parsed result under its URL.
		switch {
		case s.isEffectPage(pagePath, title, lcCats):
			effect := s.parseEffectPage(doc, title, pageURL, categories)
			if effect.Name != "" {
				s.mu.Lock()
				s.effects[effect.URL] = &effect
				s.mu.Unlock()

				pageKind = "effect"
				logx.Eventf("parsed", "#%s effect=%q", r.Request.Ctx.Get("request_id"), effect.Name)
			}
		case s.isLikelyItemPage(pagePath, title, lcCats, doc):
			item := s.parseItemPage(doc, title, pageURL, categories)

			s.mu.Lock()
			s.items[item.URL] = &item
			s.mu.Unlock()

			pageKind = "item"
			logx.Eventf(
				"parsed",
				"#%s item=%q effects=%d recipes=%d",
				r.Request.Ctx.Get("request_id"),
				item.Name,
				len(item.Effects),
				len(item.Recipes),
			)

			// Follow the item's effect links so effect pages get crawled too.
			for _, effectLink := range item.EffectLinks {
				link := s.absoluteWikiURL(effectLink)
				if link == "" {
					continue
				}
				s.queueVisit(pageURL, link)
			}
		default:
			logx.Eventf("skip", "#%s page=%q kind=%s", r.Request.Ctx.Get("request_id"), title, pageKind)
		}

		logx.Eventf(
			"recv",
			"#%s status=%d bytes=%d kind=%s title=%q after=%s",
			r.Request.Ctx.Get("request_id"),
			r.StatusCode,
			len(r.Body),
			pageKind,
			title,
			s.durationString(s.requestDuration(r.Request)),
		)
	})

	s.collector.OnScraped(func(r *colly.Response) {
		// Per-request teardown: bump counters and clear the active marker.
		s.mu.Lock()
		s.completed++
		s.activeURL = ""
		s.activeSince = time.Time{}
		s.lastEvent = time.Now()
		doneCount := s.completed
		queueLen := len(s.queued)
		s.mu.Unlock()

		logx.Eventf("done", "#%s total=%d queued=%d url=%s", r.Request.Ctx.Get("request_id"), doneCount, queueLen, r.Request.URL.String())
	})

	// Discover new pages from links inside article-body tables.
	s.collector.OnHTML(".mw-parser-output table a[href]", func(e *colly.HTMLElement) {
		href := e.Attr("href")
		link := e.Request.AbsoluteURL(href)
		if !s.shouldVisit(link) {
			return
		}

		if s.shouldQueueFromPage(e.Request.URL.Path, link) && s.shouldQueueTableLink(e) {
			s.queueVisit(e.Request.URL.String(), link)
			return
		}

		// Even outside queue-eligible tables, follow links that look like
		// effect pages.
		if s.looksLikeEffectLink(href, e.Text) {
			s.queueVisit(e.Request.URL.String(), link)
		}
	})
}

// startStatusLoop spawns a goroutine that logs a one-line progress snapshot
// every ProgressEvery until done is closed.
func (s *Scraper) startStatusLoop(done <-chan struct{}) {
	go func() {
		ticker := time.NewTicker(s.cfg.ProgressEvery)
		defer ticker.Stop()

		for {
			select {
			case <-done:
				return
			case <-ticker.C:
				// Snapshot all counters under the lock, format outside it.
				// spinnerFrame mutates spinnerIndex and therefore must be
				// called while mu is held, as it is here.
				s.mu.Lock()
				queueLen := len(s.queued)
				itemLen := len(s.items)
				effectLen := len(s.effects)
				currentURL := s.activeURL
				currentName := "-"
				currentFor := s.durationString(time.Since(s.activeSince))
				idleFor := s.durationString(time.Since(s.lastEvent))
				completedCount := s.completed
				failedCount := s.failed
				retriedCount := s.retried
				frame := s.spinnerFrame()
				s.mu.Unlock()

				if currentURL == "" {
					currentFor = "-"
				} else {
					currentName = s.debugURLName(currentURL)
				}

				logx.Statusf(
					frame,
					"queued=%d completed=%d failed=%d retries=%d items=%d effects=%d active=%s active_for=%s idle=%s",
					queueLen,
					completedCount,
					failedCount,
					retriedCount,
					itemLen,
					effectLen,
					currentName,
					currentFor,
					idleFor,
				)
			}
		}
	}()
}

// queueVisit deduplicates and budget-checks toURL, then submits it to the
// collector with the referring URL stored in the request context.
// NOTE(review): entries are never removed from s.queued, so MaxQueuedPages is
// a lifetime crawl budget (total pages ever queued), not a live queue length.
func (s *Scraper) queueVisit(fromURL, toURL string) {
	if toURL == "" {
		return
	}

	s.mu.Lock()
	switch {
	case s.queued[toURL]:
		// Already queued at some point: skip silently.
		s.mu.Unlock()
		return
	case len(s.queued) >= s.cfg.MaxQueuedPages:
		s.mu.Unlock()
		logx.Eventf("skip", "queue budget reached from=%s to=%s", s.debugURLName(fromURL), toURL)
		return
	default:
		s.queued[toURL] = true
		queueLen := len(s.queued)
		s.mu.Unlock()

		ctx := colly.NewContext()
		ctx.Put("from_url", fromURL)

		logx.Eventf("queue", "%d from=%s to=%s", queueLen, s.debugURLName(fromURL), toURL)
		if err := s.collector.Request("GET", toURL, nil, ctx, nil); err != nil {
			logx.Eventf("warn", "queue failed from=%s to=%s: %v", s.debugURLName(fromURL), toURL, err)
		}
	}
}

// spinnerFrame returns the next configured spinner frame, cycling through
// SpinnerFrames. It mutates spinnerIndex without locking, so the caller must
// hold s.mu (startStatusLoop does).
func (s *Scraper) spinnerFrame() string {
	if len(s.cfg.SpinnerFrames) == 0 {
		return "🌀"
	}

	frame := s.cfg.SpinnerFrames[s.spinnerIndex%len(s.cfg.SpinnerFrames)]
	s.spinnerIndex++
	return frame
}

// shouldRetry reports whether a status code is worth retrying: transport
// failures (0), timeout/too-early/rate-limit codes, and all 5xx.
func (s *Scraper) shouldRetry(statusCode int) bool {
	return statusCode == 0 || statusCode == 408 || statusCode == 425 || statusCode == 429 || statusCode >= 500
}

// retryRequest re-submits a failed request with exponential backoff, tracking
// the attempt count in the request context, and gives up after MaxRetries.
// NOTE(review): time.Sleep here blocks the colly callback goroutine; with
// Parallelism=1 this stalls the whole crawl for the backoff duration.
func (s *Scraper) retryRequest(r *colly.Response, err error) {
	attempt := s.retryAttempt(r.Request.Ctx)

	if attempt >= s.cfg.MaxRetries {
		logx.Eventf("giveup", "url=%s attempts=%d: %v", r.Request.URL.String(), attempt, err)
		return
	}

	attempt++
	r.Request.Ctx.Put("retry_count", strconv.Itoa(attempt))

	wait := s.retryDelay(attempt)
	logx.Eventf("retry", "%d/%d %s after %s", attempt, s.cfg.MaxRetries, r.Request.URL.String(), wait)
	time.Sleep(wait)

	if retryErr := r.Request.Retry(); retryErr != nil {
		logx.Eventf("error", "retry failed for %s: %v (original error: %v)", r.Request.URL.String(), retryErr, err)
	}
}

// retryAttempt reads the retry counter from the request context, defaulting
// to 0 when absent or unparsable.
func (s *Scraper) retryAttempt(ctx *colly.Context) int {
	attempt := 0
	if raw := ctx.Get("retry_count"); raw != "" {
		parsed, err := strconv.Atoi(raw)
		if err == nil {
			attempt = parsed
		}
	}
	return attempt
}

// retryDelay computes exponential backoff (base * 2^(attempt-1)) plus up to
// 500ms of random jitter. Callers pass attempt >= 1.
func (s *Scraper) retryDelay(attempt int) time.Duration {
	backoff := s.cfg.RetryBaseDelay * time.Duration(1<<(attempt-1))
	return backoff + s.jitter(500*time.Millisecond)
}

// jitter returns a uniform random duration in [0, max); the guard avoids the
// panic rand.Int63n raises on non-positive arguments.
func (s *Scraper) jitter(max time.Duration) time.Duration {
	if max <= 0 {
		return 0
	}

	return time.Duration(rand.Int63n(int64(max)))
}

// requestDuration derives elapsed time from the start timestamp stored in the
// request context by OnRequest; returns 0 when the value is missing or bad.
func (s *Scraper) requestDuration(r *colly.Request) time.Duration {
	raw := r.Ctx.Get("started_at_unix_nano")
	if raw == "" {
		return 0
	}

	startedAtUnixNano, err := strconv.ParseInt(raw, 10, 64)
	if err != nil {
		return 0
	}

	return time.Since(time.Unix(0, startedAtUnixNano))
}

// durationString renders a duration rounded to 100ms for log output, with
// non-positive values shown as "0s".
func (s *Scraper) durationString(d time.Duration) string {
	if d <= 0 {
		return "0s"
	}

	return d.Round(100 * time.Millisecond).String()
}
diff --git a/outward_data.json b/outward_data.json
new file mode 100644
index 0000000..c636429
--- /dev/null
+++ b/outward_data.json
@@ -0,0 +1,4 @@
{
  "items": [],
  "effects": []
}