Initial Commit
This commit is contained in:
34
.gitignore
vendored
Normal file
34
.gitignore
vendored
Normal file
@@ -0,0 +1,34 @@
|
||||
*.exe
|
||||
*.exe~
|
||||
*.dll
|
||||
*.so
|
||||
*.dylib
|
||||
|
||||
# Test binary, built with `go test -c`
|
||||
*.test
|
||||
|
||||
# Output of the go coverage tool, specifically when used with LiteIDE
|
||||
*.out
|
||||
|
||||
# Dependency directories (remove the comment below to include it)
|
||||
# vendor/
|
||||
|
||||
# Go workspace file
|
||||
go.work
|
||||
|
||||
### Linux ###
|
||||
*~
|
||||
|
||||
# temporary files which can be created if a process still has a handle open of a deleted file
|
||||
.fuse_hidden*
|
||||
|
||||
# KDE directory preferences
|
||||
.directory
|
||||
|
||||
# Linux trash folder which might appear on any partition or disk
|
||||
.Trash-*
|
||||
|
||||
# .nfs files are created when an open file is removed but is still being accessed
|
||||
.nfs*
|
||||
|
||||
# End of https://www.toptal.com/developers/gitignore/api/go,linux
|
||||
40
README.md
Normal file
40
README.md
Normal file
@@ -0,0 +1,40 @@
|
||||
# Scrappr
|
||||
|
||||
Small Go scraper for the Outward Fandom wiki.
|
||||
|
||||
## Layout
|
||||
|
||||
```text
|
||||
.
|
||||
├── cmd/scrappr/main.go # binary entrypoint
|
||||
├── internal/app # bootstrapping and output writing
|
||||
├── internal/logx # colored emoji logger
|
||||
├── internal/model # dataset models
|
||||
├── internal/scraper # crawl flow, parsing, queueing, retries
|
||||
├── go.mod
|
||||
├── go.sum
|
||||
└── outward_data.json # generated output
|
||||
```
|
||||
|
||||
## Run
|
||||
|
||||
```bash
|
||||
go run ./cmd/scrappr
|
||||
```
|
||||
|
||||
## What It Does
|
||||
|
||||
- Crawls item and crafting pages from `outward.fandom.com`
|
||||
- Uses browser-like headers and rotating user agents
|
||||
- Limits crawl depth and queue size to avoid drifting into junk pages
|
||||
- Retries temporary failures with short backoff
|
||||
- Prints colored emoji logs for queueing, requests, responses, parsing, retries, and periodic status
|
||||
- Writes a stable, sorted JSON dataset to `outward_data.json`
|
||||
|
||||
## Tuning
|
||||
|
||||
Scraper defaults live in `internal/scraper/config.go`.
|
||||
|
||||
- Lower or raise `RequestDelay` / `RequestJitter`
|
||||
- Tighten or relax `MaxQueuedPages`
|
||||
- Adjust `RequestTimeout`, `MaxRetries`, and `ProgressEvery`
|
||||
15
cmd/scrappr/main.go
Normal file
15
cmd/scrappr/main.go
Normal file
@@ -0,0 +1,15 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
|
||||
"scrappr/internal/app"
|
||||
"scrappr/internal/logx"
|
||||
)
|
||||
|
||||
func main() {
|
||||
if err := app.Run(); err != nil {
|
||||
logx.Eventf("error", "fatal: %v", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
27
go.mod
Normal file
27
go.mod
Normal file
@@ -0,0 +1,27 @@
|
||||
module scrappr
|
||||
|
||||
go 1.25.5
|
||||
|
||||
require (
|
||||
github.com/PuerkitoBio/goquery v1.11.0
|
||||
github.com/gocolly/colly/v2 v2.3.0
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/andybalholm/cascadia v1.3.3 // indirect
|
||||
github.com/antchfx/htmlquery v1.3.5 // indirect
|
||||
github.com/antchfx/xmlquery v1.5.0 // indirect
|
||||
github.com/antchfx/xpath v1.3.5 // indirect
|
||||
github.com/bits-and-blooms/bitset v1.24.4 // indirect
|
||||
github.com/gobwas/glob v0.2.3 // indirect
|
||||
github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect
|
||||
github.com/golang/protobuf v1.5.4 // indirect
|
||||
github.com/kennygrant/sanitize v1.2.4 // indirect
|
||||
github.com/nlnwa/whatwg-url v0.6.2 // indirect
|
||||
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect
|
||||
github.com/temoto/robotstxt v1.1.2 // indirect
|
||||
golang.org/x/net v0.47.0 // indirect
|
||||
golang.org/x/text v0.31.0 // indirect
|
||||
google.golang.org/appengine v1.6.8 // indirect
|
||||
google.golang.org/protobuf v1.36.10 // indirect
|
||||
)
|
||||
123
go.sum
Normal file
123
go.sum
Normal file
@@ -0,0 +1,123 @@
|
||||
github.com/PuerkitoBio/goquery v1.11.0 h1:jZ7pwMQXIITcUXNH83LLk+txlaEy6NVOfTuP43xxfqw=
|
||||
github.com/PuerkitoBio/goquery v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHfqyHre6VMY4DQ=
|
||||
github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
|
||||
github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
|
||||
github.com/antchfx/htmlquery v1.3.5 h1:aYthDDClnG2a2xePf6tys/UyyM/kRcsFRm+ifhFKoU0=
|
||||
github.com/antchfx/htmlquery v1.3.5/go.mod h1:5oyIPIa3ovYGtLqMPNjBF2Uf25NPCKsMjCnQ8lvjaoA=
|
||||
github.com/antchfx/xmlquery v1.5.0 h1:uAi+mO40ZWfyU6mlUBxRVvL6uBNZ6LMU4M3+mQIBV4c=
|
||||
github.com/antchfx/xmlquery v1.5.0/go.mod h1:lJfWRXzYMK1ss32zm1GQV3gMIW/HFey3xDZmkP1SuNc=
|
||||
github.com/antchfx/xpath v1.3.5 h1:PqbXLC3TkfeZyakF5eeh3NTWEbYl4VHNVeufANzDbKQ=
|
||||
github.com/antchfx/xpath v1.3.5/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
|
||||
github.com/bits-and-blooms/bitset v1.20.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
|
||||
github.com/bits-and-blooms/bitset v1.24.4 h1:95H15Og1clikBrKr/DuzMXkQzECs1M6hhoGXLwLQOZE=
|
||||
github.com/bits-and-blooms/bitset v1.24.4/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
|
||||
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
|
||||
github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
|
||||
github.com/gocolly/colly/v2 v2.3.0 h1:HSFh0ckbgVd2CSGRE+Y/iA4goUhGROJwyQDCMXGFBWM=
|
||||
github.com/gocolly/colly/v2 v2.3.0/go.mod h1:Qp54s/kQbwCQvFVx8KzKCSTXVJ1wWT4QeAKEu33x1q8=
|
||||
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
|
||||
github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 h1:f+oWsMOmNPc8JmEHVZIycC7hBoQxHH9pNKQORJNozsQ=
|
||||
github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8/go.mod h1:wcDNUvekVysuuOpQKo3191zZyTpiI6se1N1ULghS0sw=
|
||||
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
|
||||
github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
|
||||
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
|
||||
github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
|
||||
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
|
||||
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
|
||||
github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
|
||||
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
|
||||
github.com/nlnwa/whatwg-url v0.6.2 h1:jU61lU2ig4LANydbEJmA2nPrtCGiKdtgT0rmMd2VZ/Q=
|
||||
github.com/nlnwa/whatwg-url v0.6.2/go.mod h1:x0FPXJzzOEieQtsBT/AKvbiBbQ46YlL6Xa7m02M1ECk=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA=
|
||||
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
|
||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||
github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg=
|
||||
github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
|
||||
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
|
||||
golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc=
|
||||
golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
|
||||
golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
|
||||
golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
|
||||
golang.org/x/crypto v0.32.0/go.mod h1:ZnnJkOaASj8g0AjIduWNlq2NRxL0PlBrbKVyZ6V/Ugc=
|
||||
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
|
||||
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
|
||||
golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
|
||||
golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
|
||||
golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
|
||||
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
||||
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
|
||||
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
|
||||
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
|
||||
golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
|
||||
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
|
||||
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
|
||||
golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
|
||||
golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k=
|
||||
golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY=
|
||||
golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU=
|
||||
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
|
||||
golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/sys v0.29.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE=
|
||||
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
|
||||
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
|
||||
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
|
||||
golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
|
||||
golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU=
|
||||
golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
|
||||
golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
|
||||
golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
|
||||
golang.org/x/term v0.28.0/go.mod h1:Sw/lC2IAUZ92udQNf3WodGtn4k/XoLyZoh8v/8uiwek=
|
||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
|
||||
golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ=
|
||||
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
|
||||
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
|
||||
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
|
||||
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
|
||||
golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
|
||||
golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
|
||||
golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM=
|
||||
golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM=
|
||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
|
||||
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
|
||||
golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58=
|
||||
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
|
||||
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
google.golang.org/appengine v1.6.8 h1:IhEN5q69dyKagZPYMSdIjS2HqprW324FRQZJcGqPAsM=
|
||||
google.golang.org/appengine v1.6.8/go.mod h1:1jJ3jBArFh5pcgW8gCtRJnepW8FzD1V44FJffLiz/Ds=
|
||||
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
|
||||
google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
|
||||
google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE=
|
||||
google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
|
||||
56
internal/app/run.go
Normal file
56
internal/app/run.go
Normal file
@@ -0,0 +1,56 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"math/rand"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"scrappr/internal/logx"
|
||||
"scrappr/internal/model"
|
||||
"scrappr/internal/scraper"
|
||||
)
|
||||
|
||||
const outputPath = "outward_data.json"
|
||||
|
||||
func Run() error {
|
||||
rand.Seed(time.Now().UnixNano())
|
||||
|
||||
logx.Eventf("start", "Outward scraper booting")
|
||||
|
||||
cfg := scraper.DefaultConfig()
|
||||
s := scraper.New(cfg)
|
||||
|
||||
dataset, err := s.Run()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
logx.Eventf("write", "writing dataset to %s", outputPath)
|
||||
if err := writeDataset(outputPath, dataset); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
logx.Eventf(
|
||||
"success",
|
||||
"wrote %d items and %d effects to %s",
|
||||
len(dataset.Items),
|
||||
len(dataset.Effects),
|
||||
outputPath,
|
||||
)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func writeDataset(outputPath string, dataset model.Dataset) error {
|
||||
file, err := os.Create(outputPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
encoder := json.NewEncoder(file)
|
||||
encoder.SetIndent("", " ")
|
||||
|
||||
return encoder.Encode(dataset)
|
||||
}
|
||||
77
internal/logx/logx.go
Normal file
77
internal/logx/logx.go
Normal file
@@ -0,0 +1,77 @@
|
||||
package logx
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ANSI escape sequences (256-color palette) used to colorize log lines.
const (
	colorReset  = "\033[0m"
	colorBlue   = "\033[38;5;39m"
	colorCyan   = "\033[38;5;45m"
	colorGreen  = "\033[38;5;42m"
	colorYellow = "\033[38;5;220m"
	colorRed    = "\033[38;5;196m"
	colorGray   = "\033[38;5;244m"
	colorPink   = "\033[38;5;213m"
)
|
||||
|
||||
// style describes how one event kind is rendered: an emoji badge, a
// fixed-width uppercase label, and an ANSI color for the whole line.
type style struct {
	emoji string
	label string
	color string
}

var (
	// mu serializes output so concurrent log lines never interleave.
	mu = sync.Mutex{}

	// styles maps an event kind (as passed to Eventf) to its rendering.
	// Kinds missing from this table fall back to a gray bullet.
	styles = map[string]style{
		"start":   {emoji: "🚀", label: "START", color: colorBlue},
		"queue":   {emoji: "📥", label: "QUEUE", color: colorCyan},
		"visit":   {emoji: "🌐", label: "VISIT", color: colorBlue},
		"recv":    {emoji: "📦", label: "RECV", color: colorCyan},
		"parsed":  {emoji: "🧠", label: "PARSED", color: colorPink},
		"status":  {emoji: "🌀", label: "STATUS", color: colorYellow},
		"done":    {emoji: "✅", label: "DONE", color: colorGreen},
		"write":   {emoji: "💾", label: "WRITE", color: colorBlue},
		"skip":    {emoji: "⏭️", label: "SKIP", color: colorGray},
		"warn":    {emoji: "⚠️", label: "WARN", color: colorYellow},
		"error":   {emoji: "💥", label: "ERROR", color: colorRed},
		"retry":   {emoji: "🔁", label: "RETRY", color: colorYellow},
		"giveup":  {emoji: "🛑", label: "GIVEUP", color: colorRed},
		"success": {emoji: "🎉", label: "SUCCESS", color: colorGreen},
	}
)
|
||||
|
||||
func Eventf(kind, format string, args ...any) {
|
||||
st, ok := styles[kind]
|
||||
if !ok {
|
||||
st = style{emoji: "•", label: strings.ToUpper(kind), color: colorGray}
|
||||
}
|
||||
|
||||
write(st, fmt.Sprintf(format, args...))
|
||||
}
|
||||
|
||||
func Statusf(frame, format string, args ...any) {
|
||||
st := styles["status"]
|
||||
st.emoji = frame
|
||||
write(st, fmt.Sprintf(format, args...))
|
||||
}
|
||||
|
||||
func write(st style, message string) {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
|
||||
timestamp := time.Now().Format("15:04:05")
|
||||
fmt.Printf(
|
||||
"%s[%s] %s %-7s %s%s\n",
|
||||
st.color,
|
||||
timestamp,
|
||||
st.emoji,
|
||||
st.label,
|
||||
message,
|
||||
colorReset,
|
||||
)
|
||||
}
|
||||
33
internal/model/types.go
Normal file
33
internal/model/types.go
Normal file
@@ -0,0 +1,33 @@
|
||||
package model
|
||||
|
||||
// Recipe is one crafting recipe row as parsed from a wiki recipe table.
type Recipe struct {
	Result      string   `json:"result"`                 // name of the crafted item
	ResultCount string   `json:"result_count,omitempty"` // quantity produced, kept as the raw string
	Ingredients []string `json:"ingredients,omitempty"`  // ingredient names, in table order
	Station     string   `json:"station,omitempty"`      // crafting station, when the table lists one
	SourcePage  string   `json:"source_page,omitempty"`  // wiki page the recipe was parsed from
}

// Item is one scraped item article: identity, categorization, infobox
// fields, effect references, and any recipes that produce it.
type Item struct {
	Name        string            `json:"name"`
	URL         string            `json:"url"`
	Categories  []string          `json:"categories,omitempty"`
	Infobox     map[string]string `json:"infobox,omitempty"`      // label -> value pairs from the portable infobox
	Effects     []string          `json:"effects,omitempty"`      // effect bullet texts from the Effects section
	EffectLinks []string          `json:"effect_links,omitempty"` // /wiki/ hrefs of linked effect pages
	Recipes     []Recipe          `json:"recipes,omitempty"`
	Description string            `json:"description,omitempty"` // lead paragraph of the article
}

// Effect is one scraped status-effect article.
type Effect struct {
	Name        string            `json:"name"`
	URL         string            `json:"url"`
	Categories  []string          `json:"categories,omitempty"`
	Infobox     map[string]string `json:"infobox,omitempty"`
	Description string            `json:"description,omitempty"`
}

// Dataset is the top-level JSON document written to disk.
type Dataset struct {
	Items   []Item   `json:"items"`
	Effects []Effect `json:"effects"`
}
|
||||
102
internal/scraper/config.go
Normal file
102
internal/scraper/config.go
Normal file
@@ -0,0 +1,102 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Config holds every tunable knob of the scraper: crawl limits, request
// pacing, browser-emulation settings, seed URLs, and the path filters
// and regexes that decide which pages get visited and parsed.
type Config struct {
	BaseURL         string          // scheme+host of the wiki, no trailing slash
	AllowedDomain   string          // only links on this host are followed
	MaxDepth        int             // maximum crawl depth from the seeds
	MaxRetries      int             // retry attempts per failing request
	MaxQueuedPages  int             // hard cap on queued pages to avoid drifting into junk
	RequestDelay    time.Duration   // base delay between requests
	RequestJitter   time.Duration   // random extra delay added on top of RequestDelay
	RequestTimeout  time.Duration   // per-request timeout
	RetryBaseDelay  time.Duration   // base backoff before retrying a failed request
	ProgressEvery   time.Duration   // interval between periodic status log lines
	BrowserReferrer string          // Referer header applied when a request has none
	BrowserAgents   []string        // rotating pool of User-Agent strings
	ItemSeeds       []string        // item listing pages the crawl starts from
	CraftingSeeds   []string        // crafting pages the crawl starts from
	IgnoredPrefixes []string        // /wiki/ path prefixes that are never visited
	IgnoredExact    map[string]bool // exact /wiki/ paths skipped when mapped to true
	ItemListPathRe  *regexp.Regexp  // matches item-list page paths
	CraftingPathRe  *regexp.Regexp  // matches crafting page paths
	AmountPrefixRe  *regexp.Regexp  // matches a leading "<n>x " quantity prefix
	WhitespaceRe    *regexp.Regexp  // matches whitespace runs for text cleanup
	SpinnerFrames   []string        // frames for the animated status spinner
}
|
||||
|
||||
// DefaultConfig returns the scraper's stock settings: polite request
// pacing, browser-like identification, the wiki's item and crafting
// seed pages, and the filters that keep the crawl on article pages.
func DefaultConfig() Config {
	baseURL := "https://outward.fandom.com"

	return Config{
		BaseURL:       baseURL,
		AllowedDomain: "outward.fandom.com",
		MaxDepth:      3,
		MaxRetries:    2,
		MaxQueuedPages: 1500,
		// Effective pacing: 650ms base plus up to 350ms of jitter.
		RequestDelay:    650 * time.Millisecond,
		RequestJitter:   350 * time.Millisecond,
		RequestTimeout:  8 * time.Second,
		RetryBaseDelay:  1200 * time.Millisecond,
		ProgressEvery:   3 * time.Second,
		BrowserReferrer: baseURL + "/",
		// Realistic desktop user agents, rotated per request.
		BrowserAgents: []string{
			"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
			"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
			"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
		},
		// Crawl entry points: the item category listings...
		ItemSeeds: []string{
			baseURL + "/wiki/Items/Weapons",
			baseURL + "/wiki/Items/Equipment",
			baseURL + "/wiki/Items/Consumables",
			baseURL + "/wiki/Items/Ingredients",
			baseURL + "/wiki/Items/Deployables",
			baseURL + "/wiki/Items/Other",
			baseURL + "/wiki/Items/Item_Values",
		},
		// ...and the crafting recipe pages.
		CraftingSeeds: []string{
			baseURL + "/wiki/Crafting",
			baseURL + "/wiki/Crafting/Survival",
			baseURL + "/wiki/Crafting/Cooking",
			baseURL + "/wiki/Crafting/Alchemy",
		},
		// Wiki namespaces and meta pages that never hold item data.
		IgnoredPrefixes: []string{
			"/wiki/File:",
			"/wiki/Category:",
			"/wiki/Special:",
			"/wiki/Help:",
			"/wiki/Template:",
			"/wiki/User:",
			"/wiki/User_blog:",
			"/wiki/Forum:",
			"/wiki/Message_Wall:",
			"/wiki/Thread:",
			"/wiki/Map:",
		},
		IgnoredExact: map[string]bool{
			"/wiki/Outward_Wiki": true,
			"/wiki/Items":        true,
			// NOTE(review): a false entry behaves identically to no entry
			// at all in shouldVisit; presumably kept to document that the
			// /wiki/Crafting page itself IS visited — confirm intent.
			"/wiki/Crafting": false,
		},
		ItemListPathRe: regexp.MustCompile(`^/wiki/Items(?:/|$)`),
		CraftingPathRe: regexp.MustCompile(`^/wiki/Crafting(?:/|$)`),
		// Leading "3x " style quantity prefix on ingredient names.
		AmountPrefixRe: regexp.MustCompile(`^\s*(\d+x)\s+`),
		WhitespaceRe:   regexp.MustCompile(`\s+`),
		// Braille spinner frames for the periodic status line.
		SpinnerFrames: []string{
			"⠋",
			"⠙",
			"⠹",
			"⠸",
			"⠼",
			"⠴",
			"⠦",
			"⠧",
			"⠇",
			"⠏",
		},
	}
}
|
||||
487
internal/scraper/parse.go
Normal file
487
internal/scraper/parse.go
Normal file
@@ -0,0 +1,487 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"math/rand"
|
||||
"net/url"
|
||||
"path"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/gocolly/colly/v2"
|
||||
|
||||
"scrappr/internal/model"
|
||||
)
|
||||
|
||||
func (s *Scraper) shouldVisit(raw string) bool {
|
||||
if raw == "" {
|
||||
return false
|
||||
}
|
||||
|
||||
parsed, err := url.Parse(raw)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
if parsed.Host != s.cfg.AllowedDomain {
|
||||
return false
|
||||
}
|
||||
|
||||
if !strings.HasPrefix(parsed.Path, "/wiki/") {
|
||||
return false
|
||||
}
|
||||
|
||||
if s.cfg.IgnoredExact[parsed.Path] {
|
||||
return false
|
||||
}
|
||||
|
||||
for _, prefix := range s.cfg.IgnoredPrefixes {
|
||||
if strings.HasPrefix(parsed.Path, prefix) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
// applyBrowserHeaders decorates an outgoing request with browser-like
// headers (rotating User-Agent, Accept*, Sec-Fetch-*) so the request
// resembles a normal browser navigation. A Referer is only set when the
// request does not already carry one.
func (s *Scraper) applyBrowserHeaders(r *colly.Request) {
	r.Headers.Set("User-Agent", s.randomUserAgent())
	r.Headers.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8")
	r.Headers.Set("Accept-Language", "en-US,en;q=0.9")
	r.Headers.Set("Cache-Control", "max-age=0")
	r.Headers.Set("DNT", "1")
	r.Headers.Set("Sec-Fetch-Dest", "document")
	r.Headers.Set("Sec-Fetch-Mode", "navigate")
	r.Headers.Set("Sec-Fetch-Site", "same-origin")
	r.Headers.Set("Sec-Fetch-User", "?1")
	r.Headers.Set("Upgrade-Insecure-Requests", "1")

	// Preserve a Referer set by a prior hop; default to the wiki root.
	if r.Headers.Get("Referer") == "" {
		r.Headers.Set("Referer", s.cfg.BrowserReferrer)
	}
}
|
||||
|
||||
func (s *Scraper) randomUserAgent() string {
|
||||
return s.cfg.BrowserAgents[rand.Intn(len(s.cfg.BrowserAgents))]
|
||||
}
|
||||
|
||||
func (s *Scraper) shouldQueueFromPage(fromPath, toURL string) bool {
|
||||
parsed, err := url.Parse(toURL)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
toPath := parsed.Path
|
||||
|
||||
if s.cfg.ItemListPathRe.MatchString(fromPath) {
|
||||
return strings.HasPrefix(toPath, "/wiki/") &&
|
||||
!s.cfg.ItemListPathRe.MatchString(toPath) &&
|
||||
!s.cfg.CraftingPathRe.MatchString(toPath)
|
||||
}
|
||||
|
||||
if s.cfg.CraftingPathRe.MatchString(fromPath) {
|
||||
return strings.HasPrefix(toPath, "/wiki/") &&
|
||||
!s.cfg.CraftingPathRe.MatchString(toPath)
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func (s *Scraper) shouldQueueTableLink(e *colly.HTMLElement) bool {
|
||||
label := s.clean(e.Text)
|
||||
if label == "" {
|
||||
return false
|
||||
}
|
||||
|
||||
cell := e.DOM.Closest("td, th")
|
||||
if cell.Length() == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
columnIndex := 0
|
||||
for prev := cell.Prev(); prev.Length() > 0; prev = prev.Prev() {
|
||||
switch goquery.NodeName(prev) {
|
||||
case "td", "th":
|
||||
columnIndex++
|
||||
}
|
||||
}
|
||||
|
||||
return columnIndex <= 1
|
||||
}
|
||||
|
||||
func (s *Scraper) pageKindForPath(pagePath string) string {
|
||||
switch {
|
||||
case s.cfg.ItemListPathRe.MatchString(pagePath):
|
||||
return "item-list"
|
||||
case s.cfg.CraftingPathRe.MatchString(pagePath):
|
||||
return "crafting"
|
||||
default:
|
||||
return "article"
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Scraper) isLikelyItemPage(pagePath, title string, lcCats []string, doc *goquery.Document) bool {
|
||||
if s.cfg.ItemListPathRe.MatchString(pagePath) || s.cfg.CraftingPathRe.MatchString(pagePath) {
|
||||
return false
|
||||
}
|
||||
if strings.Contains(strings.ToLower(title), "/") {
|
||||
return false
|
||||
}
|
||||
if doc.Find("aside.portable-infobox").Length() == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
for _, category := range lcCats {
|
||||
if strings.Contains(category, "items") || strings.Contains(category, "food") ||
|
||||
strings.Contains(category, "consumables") || strings.Contains(category, "ingredients") ||
|
||||
strings.Contains(category, "equipment") || strings.Contains(category, "weapons") ||
|
||||
strings.Contains(category, "deployables") {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
infoboxText := strings.ToLower(s.clean(doc.Find("aside.portable-infobox").Text()))
|
||||
return strings.Contains(infoboxText, "item details") ||
|
||||
strings.Contains(infoboxText, "consumable details") ||
|
||||
strings.Contains(infoboxText, "equipment details") ||
|
||||
strings.Contains(infoboxText, "weapon")
|
||||
}
|
||||
|
||||
func (s *Scraper) isEffectPage(pagePath, title string, lcCats []string) bool {
|
||||
if strings.Contains(strings.ToLower(title), "effect") {
|
||||
return true
|
||||
}
|
||||
|
||||
for _, category := range lcCats {
|
||||
if strings.Contains(category, "effects") {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return strings.Contains(strings.ToLower(pagePath), "/effect")
|
||||
}
|
||||
|
||||
func (s *Scraper) parseItemPage(doc *goquery.Document, title, pageURL string, categories []string) model.Item {
|
||||
item := model.Item{
|
||||
Name: title,
|
||||
URL: pageURL,
|
||||
Categories: categories,
|
||||
Infobox: s.parsePortableInfobox(doc),
|
||||
}
|
||||
|
||||
item.Description = s.parseDescription(doc)
|
||||
item.Effects, item.EffectLinks = s.parseEffectsSection(doc)
|
||||
item.Recipes = s.parseRecipesFromPage(doc, title)
|
||||
|
||||
return item
|
||||
}
|
||||
|
||||
func (s *Scraper) parseEffectPage(doc *goquery.Document, title, pageURL string, categories []string) model.Effect {
|
||||
return model.Effect{
|
||||
Name: title,
|
||||
URL: pageURL,
|
||||
Categories: categories,
|
||||
Infobox: s.parsePortableInfobox(doc),
|
||||
Description: s.parseDescription(doc),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Scraper) parsePortableInfobox(doc *goquery.Document) map[string]string {
|
||||
out := map[string]string{}
|
||||
|
||||
doc.Find("aside.portable-infobox .pi-item").Each(func(_ int, section *goquery.Selection) {
|
||||
label := s.clean(section.Find(".pi-data-label").First().Text())
|
||||
value := s.clean(section.Find(".pi-data-value").First().Text())
|
||||
|
||||
if label == "" {
|
||||
label = s.clean(section.Find("h3").First().Text())
|
||||
}
|
||||
if value == "" {
|
||||
value = s.clean(section.Find("div").Last().Text())
|
||||
}
|
||||
|
||||
if label != "" && value != "" {
|
||||
out[label] = value
|
||||
}
|
||||
})
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
func (s *Scraper) parseCategories(doc *goquery.Document) []string {
|
||||
var categories []string
|
||||
seen := map[string]bool{}
|
||||
|
||||
doc.Find(".page-header__categories a, .category-page__member-link, .wds-tabs__tab-label a").Each(func(_ int, node *goquery.Selection) {
|
||||
text := s.clean(node.Text())
|
||||
if text != "" && !seen[text] {
|
||||
seen[text] = true
|
||||
categories = append(categories, text)
|
||||
}
|
||||
})
|
||||
|
||||
if len(categories) == 0 {
|
||||
headerText := s.clean(doc.Find("body").Text())
|
||||
if idx := strings.Index(headerText, "in:"); idx >= 0 {
|
||||
line := strings.TrimPrefix(headerText[idx:], "in:")
|
||||
for _, part := range strings.Split(line, ",") {
|
||||
part = s.clean(part)
|
||||
if part != "" && !seen[part] {
|
||||
seen[part] = true
|
||||
categories = append(categories, part)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return categories
|
||||
}
|
||||
|
||||
func (s *Scraper) parseDescription(doc *goquery.Document) string {
|
||||
var description string
|
||||
|
||||
doc.Find(".mw-parser-output > p").EachWithBreak(func(_ int, paragraph *goquery.Selection) bool {
|
||||
text := s.clean(paragraph.Text())
|
||||
if text == "" {
|
||||
return true
|
||||
}
|
||||
|
||||
lower := strings.ToLower(text)
|
||||
if strings.Contains(lower, "is a") || strings.Contains(lower, "is an") || len(text) > 30 {
|
||||
description = text
|
||||
return false
|
||||
}
|
||||
|
||||
return true
|
||||
})
|
||||
|
||||
return description
|
||||
}
|
||||
|
||||
// parseEffectsSection extracts the "Effects" section of an item page,
// returning the de-duplicated bullet texts and the de-duplicated /wiki/
// hrefs that look like effect links. Both slices are nil when the
// section is absent or empty.
func (s *Scraper) parseEffectsSection(doc *goquery.Document) ([]string, []string) {
	var effects []string
	var effectLinks []string

	seenText := map[string]bool{}
	seenLink := map[string]bool{}
	section := s.findSection(doc, "Effects")
	if section.Length() == 0 {
		// No "Effects" heading on this page.
		return effects, effectLinks
	}

	// Visit every sibling node between the heading and the next heading.
	s.walkSectionUntilNextHeading(section, func(node *goquery.Selection) {
		node.Find("li").Each(func(_ int, item *goquery.Selection) {
			text := s.clean(item.Text())
			if text != "" && !seenText[text] {
				seenText[text] = true
				effects = append(effects, text)
			}

			// Collect internal links inside the bullet that plausibly
			// point at effect pages (see looksLikeEffectLink).
			item.Find("a[href^='/wiki/']").Each(func(_ int, anchor *goquery.Selection) {
				href, _ := anchor.Attr("href")
				label := s.clean(anchor.Text())
				if href != "" && label != "" && s.looksLikeEffectLink(href, label) && !seenLink[href] {
					seenLink[href] = true
					effectLinks = append(effectLinks, href)
				}
			})
		})
	})

	return effects, effectLinks
}
|
||||
|
||||
// parseRecipesFromPage scans every table whose header row mentions both
// "result" and "ingredient" and converts its data rows into recipes,
// tagging each with the page title it came from.
func (s *Scraper) parseRecipesFromPage(doc *goquery.Document, pageTitle string) []model.Recipe {
	var recipes []model.Recipe

	doc.Find("table").Each(func(_ int, table *goquery.Selection) {
		// Only recipe tables: the first row must name both columns.
		headerText := strings.ToLower(s.clean(table.Find("tr").First().Text()))
		if !(strings.Contains(headerText, "result") && strings.Contains(headerText, "ingredient")) {
			return
		}

		table.Find("tr").Each(func(i int, row *goquery.Selection) {
			if i == 0 {
				return // skip the header row
			}

			cells := row.Find("th, td")
			if cells.Length() < 2 {
				return // malformed row: need at least result + ingredients
			}

			resultCell := cells.Eq(0)
			ingredientCell := cells.Eq(1)
			// Eq(2) yields an empty selection when the station column is
			// missing, so Station simply ends up "".
			stationCell := cells.Eq(2)

			resultText := s.clean(resultCell.Text())
			if resultText == "" {
				return
			}

			// Peel a leading amount prefix off the result text into its
			// own field (pattern supplied by cfg.AmountPrefixRe).
			resultCount := ""
			if matches := s.cfg.AmountPrefixRe.FindStringSubmatch(resultText); len(matches) > 1 {
				resultCount = matches[1]
				resultText = s.clean(strings.TrimPrefix(resultText, matches[0]))
			}

			// Prefer structured <li> ingredients; fall back to splitting
			// the cell text on newlines.
			var ingredients []string
			ingredientCell.Find("li").Each(func(_ int, ingredient *goquery.Selection) {
				text := s.clean(ingredient.Text())
				if text != "" {
					ingredients = append(ingredients, text)
				}
			})

			if len(ingredients) == 0 {
				for _, line := range s.splitLines(ingredientCell.Text()) {
					if line != "" {
						ingredients = append(ingredients, line)
					}
				}
			}

			recipes = append(recipes, model.Recipe{
				Result:      resultText,
				ResultCount: resultCount,
				Ingredients: ingredients,
				Station:     s.clean(stationCell.Text()),
				SourcePage:  pageTitle,
			})
		})
	})

	return recipes
}
|
||||
|
||||
func (s *Scraper) findSection(doc *goquery.Document, title string) *goquery.Selection {
|
||||
var found *goquery.Selection
|
||||
|
||||
doc.Find(".mw-parser-output h2, .mw-parser-output h3, .mw-parser-output h4").EachWithBreak(func(_ int, section *goquery.Selection) bool {
|
||||
text := strings.TrimSpace(strings.TrimSuffix(s.clean(section.Text()), "[]"))
|
||||
if strings.EqualFold(text, title) {
|
||||
found = section
|
||||
return false
|
||||
}
|
||||
return true
|
||||
})
|
||||
|
||||
if found == nil {
|
||||
return &goquery.Selection{}
|
||||
}
|
||||
|
||||
return found
|
||||
}
|
||||
|
||||
func (s *Scraper) walkSectionUntilNextHeading(start *goquery.Selection, fn func(*goquery.Selection)) {
|
||||
for section := start.Next(); section.Length() > 0; section = section.Next() {
|
||||
if goquery.NodeName(section) == "h2" || goquery.NodeName(section) == "h3" || goquery.NodeName(section) == "h4" {
|
||||
break
|
||||
}
|
||||
fn(section)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Scraper) looksLikeEffectLink(href, label string) bool {
|
||||
lowerHref := strings.ToLower(href)
|
||||
lowerLabel := strings.ToLower(label)
|
||||
|
||||
if strings.Contains(lowerHref, "/wiki/") && (strings.Contains(lowerHref, "effect") || strings.Contains(lowerLabel, "effect")) {
|
||||
return true
|
||||
}
|
||||
|
||||
knownHints := []string{
|
||||
"bleeding", "burning", "poisoned", "possessed", "discipline", "rage",
|
||||
"sapped", "scorched", "curse", "doomed", "chill", "warm", "cool",
|
||||
"barrier", "protection", "imbue", "energized", "shimmer",
|
||||
}
|
||||
|
||||
for _, hint := range knownHints {
|
||||
if strings.Contains(lowerLabel, hint) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func (s *Scraper) absoluteWikiURL(href string) string {
|
||||
if href == "" {
|
||||
return ""
|
||||
}
|
||||
|
||||
if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
|
||||
return href
|
||||
}
|
||||
|
||||
if strings.HasPrefix(href, "/wiki/") {
|
||||
return s.cfg.BaseURL + href
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
func (s *Scraper) flattenItems() []model.Item {
|
||||
out := make([]model.Item, 0, len(s.items))
|
||||
for _, item := range s.items {
|
||||
out = append(out, *item)
|
||||
}
|
||||
|
||||
sort.Slice(out, func(i, j int) bool {
|
||||
return strings.ToLower(out[i].Name) < strings.ToLower(out[j].Name)
|
||||
})
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
func (s *Scraper) flattenEffects() []model.Effect {
|
||||
out := make([]model.Effect, 0, len(s.effects))
|
||||
for _, effect := range s.effects {
|
||||
out = append(out, *effect)
|
||||
}
|
||||
|
||||
sort.Slice(out, func(i, j int) bool {
|
||||
return strings.ToLower(out[i].Name) < strings.ToLower(out[j].Name)
|
||||
})
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
func (s *Scraper) clean(value string) string {
|
||||
value = strings.ReplaceAll(value, "\u00a0", " ")
|
||||
value = strings.ReplaceAll(value, "\n", " ")
|
||||
value = strings.ReplaceAll(value, "\t", " ")
|
||||
value = strings.TrimSpace(value)
|
||||
return s.cfg.WhitespaceRe.ReplaceAllString(value, " ")
|
||||
}
|
||||
|
||||
func (s *Scraper) splitLines(value string) []string {
|
||||
value = strings.ReplaceAll(value, "\u00a0", " ")
|
||||
raw := strings.Split(value, "\n")
|
||||
out := make([]string, 0, len(raw))
|
||||
|
||||
for _, line := range raw {
|
||||
line = s.clean(line)
|
||||
if line != "" {
|
||||
out = append(out, line)
|
||||
}
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
func (s *Scraper) lowerSlice(in []string) []string {
|
||||
out := make([]string, 0, len(in))
|
||||
for _, value := range in {
|
||||
out = append(out, strings.ToLower(value))
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func (s *Scraper) debugURLName(raw string) string {
|
||||
parsed, err := url.Parse(raw)
|
||||
if err != nil {
|
||||
return raw
|
||||
}
|
||||
return path.Base(parsed.Path)
|
||||
}
|
||||
411
internal/scraper/scraper.go
Normal file
411
internal/scraper/scraper.go
Normal file
@@ -0,0 +1,411 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"math/rand"
|
||||
"strconv"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/gocolly/colly/v2"
|
||||
|
||||
"scrappr/internal/logx"
|
||||
"scrappr/internal/model"
|
||||
)
|
||||
|
||||
// Scraper crawls the wiki, accumulating parsed items and effects along
// with progress counters for the status display.
type Scraper struct {
	cfg       Config           // crawl configuration (seeds, limits, regexps)
	collector *colly.Collector // created lazily in Run

	mu           sync.Mutex               // guards every field below
	items        map[string]*model.Item   // parsed items keyed by page URL
	effects      map[string]*model.Effect // parsed effects keyed by page URL
	queued       map[string]bool          // URLs already enqueued (dedupe + budget)
	completed    int                      // pages fully scraped
	failed       int                      // requests that ended in error
	retried      int                      // retry attempts issued
	requestSeq   int                      // monotonically increasing request id
	spinnerIndex int                      // next spinner frame to display
	activeURL    string                   // URL currently in flight ("" when idle)
	activeSince  time.Time                // when activeURL started
	lastEvent    time.Time                // time of the last request/response/error
}
|
||||
|
||||
func New(cfg Config) *Scraper {
|
||||
return &Scraper{
|
||||
cfg: cfg,
|
||||
items: map[string]*model.Item{},
|
||||
effects: map[string]*model.Effect{},
|
||||
queued: map[string]bool{},
|
||||
lastEvent: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
// Run configures the colly collector, seeds the crawl, blocks until the
// request queue drains, and returns the accumulated dataset.
func (s *Scraper) Run() (model.Dataset, error) {
	s.collector = colly.NewCollector(
		colly.AllowedDomains(s.cfg.AllowedDomain),
		colly.MaxDepth(s.cfg.MaxDepth),
		colly.Async(true),
	)

	s.collector.SetRequestTimeout(s.cfg.RequestTimeout)
	// Deliver error responses to OnResponse too, so status codes can be logged.
	s.collector.ParseHTTPErrorResponse = true

	// Single-flight politeness: one request at a time with a fixed delay
	// plus random jitter between requests.
	if err := s.collector.Limit(&colly.LimitRule{
		DomainGlob:  "*" + s.cfg.AllowedDomain + "*",
		Parallelism: 1,
		Delay:       s.cfg.RequestDelay,
		RandomDelay: s.cfg.RequestJitter,
	}); err != nil {
		return model.Dataset{}, err
	}

	s.registerHandlers()

	// The status ticker runs until Run returns (done is closed by the defer).
	done := make(chan struct{})
	defer close(done)
	s.startStatusLoop(done)

	// Enqueue item seeds followed by crafting seeds.
	for _, seed := range append(append([]string{}, s.cfg.ItemSeeds...), s.cfg.CraftingSeeds...) {
		s.queueVisit("seed", seed)
	}

	s.collector.Wait()

	return model.Dataset{
		Items:   s.flattenItems(),
		Effects: s.flattenEffects(),
	}, nil
}
|
||||
|
||||
// registerHandlers wires the colly lifecycle callbacks: header spoofing
// and bookkeeping on request, retry-on-error, page classification and
// parsing on response, counters on completion, and link discovery inside
// wiki content tables.
func (s *Scraper) registerHandlers() {
	s.collector.OnRequest(func(r *colly.Request) {
		s.applyBrowserHeaders(r)

		startedAt := time.Now()
		fromURL := r.Ctx.Get("from_url")
		if fromURL == "" {
			fromURL = "seed" // direct seeds carry no referrer context
		}

		// Assign a sequential request id and mark this URL active for the
		// status loop.
		s.mu.Lock()
		s.requestSeq++
		reqID := s.requestSeq
		s.activeURL = r.URL.String()
		s.activeSince = startedAt
		s.lastEvent = startedAt
		s.mu.Unlock()

		// Stash the id and start time on the request context so later
		// callbacks can correlate log lines and compute durations.
		r.Ctx.Put("request_id", strconv.Itoa(reqID))
		r.Ctx.Put("started_at_unix_nano", strconv.FormatInt(startedAt.UnixNano(), 10))

		logx.Eventf(
			"visit",
			"#%d depth=%d attempt=%d from=%s to=%s",
			reqID,
			r.Depth,
			s.retryAttempt(r.Ctx)+1,
			s.debugURLName(fromURL),
			r.URL.String(),
		)
	})

	s.collector.OnError(func(r *colly.Response, err error) {
		if r == nil || r.Request == nil {
			// Failure before a request object existed; nothing to retry.
			logx.Eventf("error", "request failed before response: %v", err)
			return
		}

		duration := s.requestDuration(r.Request)

		s.mu.Lock()
		s.failed++
		s.activeURL = ""
		s.activeSince = time.Time{}
		s.lastEvent = time.Now()
		s.mu.Unlock()

		logx.Eventf(
			"error",
			"#%s status=%d after=%s url=%s: %v",
			r.Request.Ctx.Get("request_id"),
			r.StatusCode,
			s.durationString(duration),
			r.Request.URL.String(),
			err,
		)

		// Temporary failures (transport errors, 408/425/429, 5xx) get a
		// bounded retry with backoff.
		if s.shouldRetry(r.StatusCode) {
			s.mu.Lock()
			s.retried++
			s.mu.Unlock()
			s.retryRequest(r, err)
		}
	})

	s.collector.OnResponse(func(r *colly.Response) {
		// Error pages reach here because of ParseHTTPErrorResponse; they
		// are only logged, never parsed.
		if r.StatusCode >= 400 {
			logx.Eventf("skip", "#%s status=%d url=%s", r.Request.Ctx.Get("request_id"), r.StatusCode, r.Request.URL.String())
			return
		}

		doc, err := goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
		if err != nil {
			logx.Eventf("warn", "parse error for %s: %v", r.Request.URL.String(), err)
			return
		}

		// Page title: prefer the Fandom header h1, fall back to any h1.
		title := s.clean(doc.Find("h1.page-header__title").First().Text())
		if title == "" {
			title = s.clean(doc.Find("h1").First().Text())
		}
		if title == "" {
			logx.Eventf("warn", "empty title for %s", r.Request.URL.String())
			return
		}

		pageURL := r.Request.URL.String()
		pagePath := r.Request.URL.Path
		categories := s.parseCategories(doc)
		lcCats := s.lowerSlice(categories)
		pageKind := s.pageKindForPath(pagePath)

		// Classify the page and store the parsed result keyed by URL.
		switch {
		case s.isEffectPage(pagePath, title, lcCats):
			effect := s.parseEffectPage(doc, title, pageURL, categories)
			if effect.Name != "" {
				s.mu.Lock()
				s.effects[effect.URL] = &effect
				s.mu.Unlock()

				pageKind = "effect"
				logx.Eventf("parsed", "#%s effect=%q", r.Request.Ctx.Get("request_id"), effect.Name)
			}
		case s.isLikelyItemPage(pagePath, title, lcCats, doc):
			item := s.parseItemPage(doc, title, pageURL, categories)

			s.mu.Lock()
			s.items[item.URL] = &item
			s.mu.Unlock()

			pageKind = "item"
			logx.Eventf(
				"parsed",
				"#%s item=%q effects=%d recipes=%d",
				r.Request.Ctx.Get("request_id"),
				item.Name,
				len(item.Effects),
				len(item.Recipes),
			)

			// Follow effect links referenced by this item.
			for _, effectLink := range item.EffectLinks {
				link := s.absoluteWikiURL(effectLink)
				if link == "" {
					continue
				}
				s.queueVisit(pageURL, link)
			}
		default:
			logx.Eventf("skip", "#%s page=%q kind=%s", r.Request.Ctx.Get("request_id"), title, pageKind)
		}

		logx.Eventf(
			"recv",
			"#%s status=%d bytes=%d kind=%s title=%q after=%s",
			r.Request.Ctx.Get("request_id"),
			r.StatusCode,
			len(r.Body),
			pageKind,
			title,
			s.durationString(s.requestDuration(r.Request)),
		)
	})

	s.collector.OnScraped(func(r *colly.Response) {
		// Page fully processed: bump counters and clear the active slot.
		s.mu.Lock()
		s.completed++
		s.activeURL = ""
		s.activeSince = time.Time{}
		s.lastEvent = time.Now()
		doneCount := s.completed
		queueLen := len(s.queued)
		s.mu.Unlock()

		logx.Eventf("done", "#%s total=%d queued=%d url=%s", r.Request.Ctx.Get("request_id"), doneCount, queueLen, r.Request.URL.String())
	})

	// Link discovery: only anchors inside content tables are considered.
	s.collector.OnHTML(".mw-parser-output table a[href]", func(e *colly.HTMLElement) {
		href := e.Attr("href")
		link := e.Request.AbsoluteURL(href)
		if !s.shouldVisit(link) {
			return
		}

		if s.shouldQueueFromPage(e.Request.URL.Path, link) && s.shouldQueueTableLink(e) {
			s.queueVisit(e.Request.URL.String(), link)
			return
		}

		// Even on pages we would not normally expand, still follow links
		// that look like effect pages.
		if s.looksLikeEffectLink(href, e.Text) {
			s.queueVisit(e.Request.URL.String(), link)
		}
	})
}
|
||||
|
||||
// startStatusLoop spawns a goroutine that prints a spinner status line
// every cfg.ProgressEvery tick until done is closed (Run closes it via
// defer, which also ends this goroutine).
func (s *Scraper) startStatusLoop(done <-chan struct{}) {
	go func() {
		ticker := time.NewTicker(s.cfg.ProgressEvery)
		defer ticker.Stop()

		for {
			select {
			case <-done:
				return
			case <-ticker.C:
				// Snapshot all shared state under the lock; log outside it.
				s.mu.Lock()
				queueLen := len(s.queued)
				itemLen := len(s.items)
				effectLen := len(s.effects)
				currentURL := s.activeURL
				currentName := "-"
				currentFor := s.durationString(time.Since(s.activeSince))
				idleFor := s.durationString(time.Since(s.lastEvent))
				completedCount := s.completed
				failedCount := s.failed
				retriedCount := s.retried
				frame := s.spinnerFrame()
				s.mu.Unlock()

				if currentURL == "" {
					currentFor = "-" // idle: no active request to time
				} else {
					currentName = s.debugURLName(currentURL)
				}

				logx.Statusf(
					frame,
					"queued=%d completed=%d failed=%d retries=%d items=%d effects=%d active=%s active_for=%s idle=%s",
					queueLen,
					completedCount,
					failedCount,
					retriedCount,
					itemLen,
					effectLen,
					currentName,
					currentFor,
					idleFor,
				)
			}
		}
	}()
}
|
||||
|
||||
// queueVisit enqueues toURL for crawling unless it is empty, already
// queued, or the queue budget is exhausted. The originating URL travels
// on the request context so logs can show where a page was found.
func (s *Scraper) queueVisit(fromURL, toURL string) {
	if toURL == "" {
		return
	}

	// Each switch arm unlocks s.mu itself: the first two return while
	// still holding no lock, the default arm unlocks before the slow
	// network call.
	s.mu.Lock()
	switch {
	case s.queued[toURL]:
		// Duplicate: enqueued earlier.
		s.mu.Unlock()
		return
	case len(s.queued) >= s.cfg.MaxQueuedPages:
		// Hard cap on total pages keeps the crawl bounded.
		s.mu.Unlock()
		logx.Eventf("skip", "queue budget reached from=%s to=%s", s.debugURLName(fromURL), toURL)
		return
	default:
		s.queued[toURL] = true
		queueLen := len(s.queued)
		s.mu.Unlock()

		ctx := colly.NewContext()
		ctx.Put("from_url", fromURL)

		logx.Eventf("queue", "%d from=%s to=%s", queueLen, s.debugURLName(fromURL), toURL)
		// NOTE(review): toURL stays marked as queued even when Request
		// fails here, so it will never be attempted again — confirm this
		// is the intended behavior.
		if err := s.collector.Request("GET", toURL, nil, ctx, nil); err != nil {
			logx.Eventf("warn", "queue failed from=%s to=%s: %v", s.debugURLName(fromURL), toURL, err)
		}
	}
}
|
||||
|
||||
func (s *Scraper) spinnerFrame() string {
|
||||
if len(s.cfg.SpinnerFrames) == 0 {
|
||||
return "🌀"
|
||||
}
|
||||
|
||||
frame := s.cfg.SpinnerFrames[s.spinnerIndex%len(s.cfg.SpinnerFrames)]
|
||||
s.spinnerIndex++
|
||||
return frame
|
||||
}
|
||||
|
||||
func (s *Scraper) shouldRetry(statusCode int) bool {
|
||||
return statusCode == 0 || statusCode == 408 || statusCode == 425 || statusCode == 429 || statusCode >= 500
|
||||
}
|
||||
|
||||
// retryRequest re-issues a failed request with exponential backoff,
// giving up after cfg.MaxRetries attempts. The retry counter travels on
// the request context so it survives across attempts.
func (s *Scraper) retryRequest(r *colly.Response, err error) {
	attempt := s.retryAttempt(r.Request.Ctx)

	if attempt >= s.cfg.MaxRetries {
		logx.Eventf("giveup", "url=%s attempts=%d: %v", r.Request.URL.String(), attempt, err)
		return
	}

	attempt++
	r.Request.Ctx.Put("retry_count", strconv.Itoa(attempt))

	wait := s.retryDelay(attempt)
	logx.Eventf("retry", "%d/%d %s after %s", attempt, s.cfg.MaxRetries, r.Request.URL.String(), wait)
	// NOTE(review): this sleeps inside the colly error callback; with
	// Parallelism=1 that stalls the whole crawl for the backoff period —
	// confirm this is intended.
	time.Sleep(wait)

	if retryErr := r.Request.Retry(); retryErr != nil {
		logx.Eventf("error", "retry failed for %s: %v (original error: %v)", r.Request.URL.String(), retryErr, err)
	}
}
|
||||
|
||||
func (s *Scraper) retryAttempt(ctx *colly.Context) int {
|
||||
attempt := 0
|
||||
if raw := ctx.Get("retry_count"); raw != "" {
|
||||
parsed, err := strconv.Atoi(raw)
|
||||
if err == nil {
|
||||
attempt = parsed
|
||||
}
|
||||
}
|
||||
return attempt
|
||||
}
|
||||
|
||||
func (s *Scraper) retryDelay(attempt int) time.Duration {
|
||||
backoff := s.cfg.RetryBaseDelay * time.Duration(1<<(attempt-1))
|
||||
return backoff + s.jitter(500*time.Millisecond)
|
||||
}
|
||||
|
||||
func (s *Scraper) jitter(max time.Duration) time.Duration {
|
||||
if max <= 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
return time.Duration(rand.Int63n(int64(max)))
|
||||
}
|
||||
|
||||
func (s *Scraper) requestDuration(r *colly.Request) time.Duration {
|
||||
raw := r.Ctx.Get("started_at_unix_nano")
|
||||
if raw == "" {
|
||||
return 0
|
||||
}
|
||||
|
||||
startedAtUnixNano, err := strconv.ParseInt(raw, 10, 64)
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
|
||||
return time.Since(time.Unix(0, startedAtUnixNano))
|
||||
}
|
||||
|
||||
func (s *Scraper) durationString(d time.Duration) string {
|
||||
if d <= 0 {
|
||||
return "0s"
|
||||
}
|
||||
|
||||
return d.Round(100 * time.Millisecond).String()
|
||||
}
|
||||
4
outward_data.json
Normal file
4
outward_data.json
Normal file
@@ -0,0 +1,4 @@
|
||||
{
|
||||
"items": [],
|
||||
"effects": []
|
||||
}
|
||||
Reference in New Issue
Block a user