Golang web scraping using Scrapemate
Introduction
In this blog post, we are going to use scrapemate to extract hockey teams from the website https://www.scrapethissite.com/pages/forms
This website contains sandboxes for testing your scrapers, so no real data.
You can find the full code on GitHub.
The previous post used the low level API of scrapemate; this one uses the high level API.
Code Skeleton
Create a folder named scrapemate-highlevel-api-example
mkdir scrapemate-highlevel-api-example
cd scrapemate-highlevel-api-example
then initialize a Go module:
go mod init github.com/gosom/scrapemate-highlevel-api-example
Create two folders, hockey and testdata:
mkdir hockey testdata
Parser
Now we need to figure out how we are going to parse the data from the website.
The scrapemate high level API uses goquery and CSS selectors. You can use another HTML parsing library by utilizing the low level API if you prefer.
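If goquery is new to you, here is a tiny, self-contained sketch of the selection pattern we will rely on below; the HTML fragment is made up purely for illustration:
package main

import (
    "fmt"
    "strings"

    "github.com/PuerkitoBio/goquery"
)

func main() {
    // A made-up HTML fragment, just to show the selection pattern.
    html := `<table class="table">
    <tr class="team"><td class="name">Boston Bruins</td><td class="year">1990</td></tr>
    </table>`

    doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
    if err != nil {
        panic(err)
    }

    // Find every row with class "team" inside a table with class "table".
    doc.Find("table.table tr.team").Each(func(i int, s *goquery.Selection) {
        fmt.Println(s.Find("td.name").Text(), s.Find("td.year").Text())
    })
}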
Create a file team.go in the hockey directory:
touch hockey/team.go
and copy the following:
package hockey

import (
    "strconv"
    "strings"

    "github.com/PuerkitoBio/goquery"
)

// Team holds the statistics of a single hockey team for a given year.
type Team struct {
    Name         string
    Year         int
    Wins         int
    Losses       int
    OTLosses     int
    WinPct       float64
    GoalsFor     int
    GoalsAgainst int
    GoalDiff     int
}

// CsvHeaders returns the CSV header row for a Team.
func (t Team) CsvHeaders() []string {
    return []string{
        "Name",
        "Year",
        "Wins",
        "Losses",
        "OTLosses",
        "WinPct",
        "GoalsFor",
        "GoalsAgainst",
        "GoalDiff",
    }
}

// CsvRow returns the Team as a CSV row, in the same order as CsvHeaders.
func (t Team) CsvRow() []string {
    return []string{
        t.Name,
        strconv.Itoa(t.Year),
        strconv.Itoa(t.Wins),
        strconv.Itoa(t.Losses),
        strconv.Itoa(t.OTLosses),
        strconv.FormatFloat(t.WinPct, 'f', 2, 64),
        strconv.Itoa(t.GoalsFor),
        strconv.Itoa(t.GoalsAgainst),
        strconv.Itoa(t.GoalDiff),
    }
}

// parseTeams extracts every team row from the page.
func parseTeams(doc *goquery.Document) ([]Team, error) {
    sel := "table.table tr.team"
    var teams []Team
    doc.Find(sel).Each(func(i int, s *goquery.Selection) {
        teams = append(teams, parseTeam(s))
    })
    return teams, nil
}

// parseTeam extracts a single team from a table row.
func parseTeam(s *goquery.Selection) Team {
    var team Team
    team.Name = cleanText(s.Find("td.name").Text())
    team.Year = parseInt(s.Find("td.year").Text())
    team.Wins = parseInt(s.Find("td.wins").Text())
    team.Losses = parseInt(s.Find("td.losses").Text())
    team.OTLosses = parseInt(s.Find("td.ot-losses").Text())
    team.WinPct = parseFloat(s.Find("td.pct").Text())
    team.GoalsFor = parseInt(s.Find("td.gf").Text())
    team.GoalsAgainst = parseInt(s.Find("td.ga").Text())
    team.GoalDiff = parseInt(s.Find("td.diff").Text())
    return team
}

// parseNextLink returns the URL of the next page and its query parameters,
// or an empty string when there is no next page.
func parseNextLink(doc *goquery.Document) (string, map[string]string) {
    sel := "ul.pagination>li:last-child>a[aria-label=Next]"
    s := doc.Find(sel).AttrOr("href", "")
    if s == "" {
        return "", nil
    }
    s = "https://www.scrapethissite.com" + s
    parts := strings.Split(s, "?")
    nextLink := parts[0]
    params := make(map[string]string)
    if len(parts) > 1 { // guard against a next link without query parameters
        for _, p := range strings.Split(parts[1], "&") {
            kv := strings.Split(p, "=")
            params[kv[0]] = kv[1]
        }
    }
    return nextLink, params
}

// cleanText strips surrounding newlines and whitespace.
func cleanText(s string) string {
    s = strings.TrimFunc(s, func(r rune) bool {
        return r == '\n'
    })
    return strings.TrimSpace(s)
}

// parseInt converts a cell's text to an int, defaulting to 0.
func parseInt(s string) int {
    s = cleanText(s)
    if s == "" {
        return 0
    }
    ans, _ := strconv.Atoi(s)
    return ans
}

// parseFloat converts a cell's text to a float64, defaulting to 0.
func parseFloat(s string) float64 {
    s = cleanText(s)
    if s == "" {
        return 0
    }
    ans, _ := strconv.ParseFloat(s, 64)
    return ans
}
The code above is straightforward.
The most important functions are:
parseTeams: returns a list of Team structs with the attributes populated
parseNextLink: returns the next link in two parts, the URL and the URL parameters
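To make the flow concrete, here is a rough usage sketch showing how the two functions fit together once you have a goquery document. The debugParse helper is hypothetical, not part of the example code, and assumes the extra imports "fmt" and "os":
// Hypothetical helper: read a saved page, parse it, and print what we found.
func debugParse(path string) error {
    fd, err := os.Open(path)
    if err != nil {
        return err
    }
    defer fd.Close()

    doc, err := goquery.NewDocumentFromReader(fd)
    if err != nil {
        return err
    }

    teams, err := parseTeams(doc) // every team row on this page
    if err != nil {
        return err
    }

    nextURL, params := parseNextLink(doc) // "" when there is no next page
    fmt.Println(len(teams), nextURL, params)
    return nil
}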
We need to verify that we parse the page correctly, so let's create some unit tests.
But first, download the page into testdata:
curl -o testdata/teams.html 'https://www.scrapethissite.com/pages/forms/?page_num=1&per_page=100'
Create a file hockey/team_test.go and paste the following:
package hockey

import (
    "os"
    "testing"

    "github.com/PuerkitoBio/goquery"
    "github.com/stretchr/testify/require"
)

func Test_parseTeams(t *testing.T) {
    fd, err := os.Open("../testdata/teams.html")
    require.NoError(t, err)
    defer fd.Close()

    doc, err := goquery.NewDocumentFromReader(fd)
    require.NoError(t, err)

    teams, err := parseTeams(doc)
    require.NoError(t, err)
    require.Equal(t, 100, len(teams))

    team := teams[0]
    require.Equal(t, "Boston Bruins", team.Name)
    require.Equal(t, 1990, team.Year)
    require.Equal(t, 44, team.Wins)
    require.Equal(t, 24, team.Losses)
    require.Equal(t, 0, team.OTLosses)
    require.Equal(t, 0.55, team.WinPct)
    require.Equal(t, 299, team.GoalsFor)
    require.Equal(t, 264, team.GoalsAgainst)
    require.Equal(t, 35, team.GoalDiff)
}

func Test_parseNextLink(t *testing.T) {
    fd, err := os.Open("../testdata/teams.html")
    require.NoError(t, err)
    defer fd.Close()

    doc, err := goquery.NewDocumentFromReader(fd)
    require.NoError(t, err)

    nextLink, params := parseNextLink(doc)
    require.Equal(t, "https://www.scrapethissite.com/pages/forms/", nextLink)
    require.Equal(t, "2", params["page_num"])
    require.Equal(t, "100", params["per_page"])
}
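The helpers are trivial, but if you want a bit more coverage you could also add something along these lines (illustrative, not part of the original example):
// Optional: quick checks for the text-cleaning helpers.
func Test_helpers(t *testing.T) {
    require.Equal(t, "Boston Bruins", cleanText("\n  Boston Bruins \n"))
    require.Equal(t, 44, parseInt("\n 44 \n"))
    require.Equal(t, 0, parseInt(""))
    require.InDelta(t, 0.55, parseFloat("0.55"), 0.0001)
}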
Run
go mod tidy
and then run the unit tests
go test -v ./...
The tests should pass.
Scraping Job definition
Create a file hockey/collect.go
touch hockey/collect.go
and paste the following:
package hockey

import (
    "context"
    "fmt"
    "net/http"
    "time"

    "github.com/PuerkitoBio/goquery"
    "github.com/google/uuid"
    "github.com/gosom/scrapemate"
)

// TeamCollectJob fetches a page of hockey teams and schedules the next page.
type TeamCollectJob struct {
    scrapemate.Job
}

func NewTeamCollectJob(u string, params map[string]string) *TeamCollectJob {
    return &TeamCollectJob{
        Job: scrapemate.Job{
            // just give it a random id
            ID:        uuid.New().String(),
            Method:    http.MethodGet,
            URL:       u,
            UrlParams: params,
            Headers: map[string]string{
                "User-Agent": scrapemate.DefaultUserAgent,
            },
            Timeout:    10 * time.Second,
            MaxRetries: 3,
        },
    }
}

// Process parses the fetched page and returns the teams plus the job for the
// next page, if there is one.
func (o *TeamCollectJob) Process(ctx context.Context, resp *scrapemate.Response) (any, []scrapemate.IJob, error) {
    doc, ok := resp.Document.(*goquery.Document)
    if !ok {
        return nil, nil, fmt.Errorf("invalid document type %T expected *goquery.Document", resp.Document)
    }

    teams, err := parseTeams(doc)
    if err != nil {
        return nil, nil, err
    }

    var nextJobs []scrapemate.IJob

    nextLink, params := parseNextLink(doc)
    if nextLink != "" {
        nextJobs = append(nextJobs, NewTeamCollectJob(nextLink, params))
    }

    return teams, nextJobs, nil
}
Here we define the scraping job (TeamCollectJob) and the Process method.
The Process method returns three things:
the result (here, a slice of Team structs)
the next jobs (here, the job defined by the next link in the pagination)
an error, if one occurs
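One thing worth noting: the slice of Team returned here is what eventually reaches the result writers configured in main.go. The csv writer can only serialize values that know how to render themselves as CSV, which is exactly what the CsvHeaders and CsvRow methods on Team provide. The snippet below is purely illustrative (the interface name is made up; check the csvwriter adapter in scrapemate for the real definition), it just shows the shape of the contract:
// Illustrative only: a made-up interface showing the contract the csv
// writer relies on. The real definition lives in the scrapemate packages.
type csvCapable interface {
    CsvHeaders() []string
    CsvRow() []string
}

// Team already satisfies it through the methods defined in team.go.
var _ csvCapable = Team{}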
Run
go mod tidy
The main function
Create a file main.go
touch main.go
and paste the following:
package main

import (
    "context"
    "encoding/csv"
    "os"

    "github.com/gosom/scrapemate"
    "github.com/gosom/scrapemate-highlevel-api-example/hockey"
    "github.com/gosom/scrapemate/adapters/writers/csvwriter"
    "github.com/gosom/scrapemate/scrapemateapp"
)

func main() {
    if err := run(); err != nil {
        os.Stderr.WriteString(err.Error() + "\n")
        os.Exit(1)
        return
    }
    os.Exit(0)
}

func run() error {
    csvWriter := csvwriter.NewCsvWriter(csv.NewWriter(os.Stdout))
    writers := []scrapemate.ResultWriter{
        csvWriter,
    }

    cfg, err := scrapemateapp.NewConfig(writers)
    if err != nil {
        return err
    }

    app, err := scrapemateapp.NewScrapeMateApp(cfg)
    if err != nil {
        return err
    }

    params := map[string]string{
        "page_num": "1",
        "per_page": "100",
    }

    seedJobs := []scrapemate.IJob{
        hockey.NewTeamCollectJob("https://www.scrapethissite.com/pages/forms/", params),
    }

    return app.Start(context.Background(), seedJobs...)
}
Here we define a csv writer that writes to stdout:
csvWriter := csvwriter.NewCsvWriter(csv.NewWriter(os.Stdout))
writers := []scrapemate.ResultWriter{
    csvWriter,
}
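If you would rather have the program write hockey.csv itself instead of redirecting stdout, a minimal variation, using the same adapter with a different io.Writer, could look like this:
// Sketch: write the CSV to a file instead of stdout.
f, err := os.Create("hockey.csv")
if err != nil {
    return err
}
defer f.Close()

csvWriter := csvwriter.NewCsvWriter(csv.NewWriter(f))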
Then we initialize our scraper:
cfg, err := scrapemateapp.NewConfig(writers)
if err != nil {
    return err
}
app, err := scrapemateapp.NewScrapeMateApp(cfg)
if err != nil {
    return err
}
Finally, we create a seed job (the one our scraper will start from) and start the scraper:
params := map[string]string{
    "page_num": "1",
    "per_page": "100",
}
seedJobs := []scrapemate.IJob{
    hockey.NewTeamCollectJob("https://www.scrapethissite.com/pages/forms/", params),
}
return app.Start(context.Background(), seedJobs...)
Run
go mod tidy
Run the scraper
To run the scraper, just do:
go run main.go 1>hockey.csv
After all 6 pages are crawled, you can stop the scraper with CTRL-C.
The results will be in hockey.csv.
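The example passes context.Background() to app.Start, so CTRL-C simply kills the process. If you prefer a cleaner shutdown, one option, assuming app.Start returns when its context is cancelled, is to tie the context to OS signals inside run():
// Sketch: cancel the scraper's context on CTRL-C (SIGINT) or SIGTERM.
// Requires the extra imports "os/signal" and "syscall".
ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
defer stop()

return app.Start(ctx, seedJobs...)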
Summary
Scrapemate is a web scraping framework written in Golang. In this post, we demonstrated how easy it is to create a scraper and save the results in a CSV file.
Read the blog post for the low level API here and see how much easier it is to scrape using the high level API.
See the full example here, and find another example in the GitHub examples.