Golang web scraping using Scrapemate

Introduction

In this blog post, we are going to use scrapemate to extract hockey team data from the website https://www.scrapethissite.com/pages/forms.

This website contains sandboxes for testing your scrapers, so no real data is involved.

You can find the full code on GitHub.

The previous post uses the low level API of scrapemate; this one uses the high level API.

Code Skeleton

Create a folder named scrapemate-highlevel-api-example

mkdir scrapemate-highlevel-api-example
cd scrapemate-highlevel-api-example

then initialize a Go module:

go mod init github.com/gosom/scrapemate-highlevel-api-example

Create 2 folders:

  • hockey

  • testdata

mkdir hockey testdata

Parser

Now we need to figure out how we are going to parse the data from the website.

The scrapemate high level API uses goquery and CSS selectors. If you prefer another HTML parsing library, you can use the low level API instead.
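
If you have not used goquery before, here is a minimal, standalone sketch (not part of our project, the HTML snippet is made up) showing how a CSS selector is applied to a document. The parser we write below follows the same pattern:

package main

import (
    "fmt"
    "strings"

    "github.com/PuerkitoBio/goquery"
)

func main() {
    html := `<table class="table"><tr class="team"><td class="name"> Boston Bruins </td></tr></table>`
    // build a goquery document from any io.Reader
    doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
    if err != nil {
        panic(err)
    }
    // select every row with class "team" inside the table and read one of its cells
    doc.Find("table.table tr.team").Each(func(i int, s *goquery.Selection) {
        fmt.Println(strings.TrimSpace(s.Find("td.name").Text())) // prints: Boston Bruins
    })
}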

Create a file team.go in the hockey directory

touch hockey/team.go

and copy the following:

package hockey

import (
    "strconv"
    "strings"

    "github.com/PuerkitoBio/goquery"
)

type Team struct {
    Name         string
    Year         int
    Wins         int
    Losses       int
    OTLosses     int
    WinPct       float64
    GoalsFor     int
    GoalsAgainst int
    GoalDiff     int
}

func (t Team) CsvHeaders() []string {
    return []string{
        "Name",
        "Year",
        "Wins",
        "Losses",
        "OTLosses",
        "WinPct",
        "GoalsFor",
        "GoalsAgainst",
        "GoalDiff",
    }
}

func (t Team) CsvRow() []string {
    return []string{
        t.Name,
        strconv.Itoa(t.Year),
        strconv.Itoa(t.Wins),
        strconv.Itoa(t.Losses),
        strconv.Itoa(t.OTLosses),
        strconv.FormatFloat(t.WinPct, 'f', 2, 64),
        strconv.Itoa(t.GoalsFor),
        strconv.Itoa(t.GoalsAgainst),
        strconv.Itoa(t.GoalDiff),
    }
}

func parseTeams(doc *goquery.Document) ([]Team, error) {
    sel := "table.table tr.team"
    var teams []Team
    doc.Find(sel).Each(func(i int, s *goquery.Selection) {
        teams = append(teams, parseTeam(s))
    })
    return teams, nil
}

func parseTeam(s *goquery.Selection) Team {
    var team Team
    team.Name = cleanText(s.Find("td.name").Text())
    team.Year = parseInt(s.Find("td.year").Text())
    team.Wins = parseInt(s.Find("td.wins").Text())
    team.Losses = parseInt(s.Find("td.losses").Text())
    team.OTLosses = parseInt(s.Find("td.ot-losses").Text())
    team.WinPct = parseFloat(s.Find("td.pct").Text())
    team.GoalsFor = parseInt(s.Find("td.gf").Text())
    team.GoalsAgainst = parseInt(s.Find("td.ga").Text())
    team.GoalDiff = parseInt(s.Find("td.diff").Text())
    return team
}

func parseNextLink(doc *goquery.Document) (string, map[string]string) {
    sel := "ul.pagination>li:last-child>a[aria-label=Next]"
    s := doc.Find(sel).AttrOr("href", "")
    if s == "" {
        return "", nil
    }
    s = "https://www.scrapethissite.com" + s
    parts := strings.Split(s, "?")
    nextLink := parts[0]
    params := make(map[string]string)
    if len(parts) < 2 { // next link without a query string
        return nextLink, params
    }
    for _, p := range strings.Split(parts[1], "&") {
        if kv := strings.SplitN(p, "=", 2); len(kv) == 2 {
            params[kv[0]] = kv[1]
        }
    }
    return nextLink, params
}

func cleanText(s string) string {
    s = strings.TrimFunc(s, func(r rune) bool {
        return r == '\n'
    })
    return strings.TrimSpace(s)
}

func parseInt(s string) int {
    s = cleanText(s)
    if s == "" {
        return 0
    }
    ans, _ := strconv.Atoi(s)
    return ans
}

func parseFloat(s string) float64 {
    s = cleanText(s)
    if s == "" {
        return 0
    }
    ans, _ := strconv.ParseFloat(s, 64)
    return ans
}

The code above is straightforward.

The most important functions are:

  • parseTeams: returns a list of Team structs with the attributes populated

  • parseNextLink: returns the next link split in two parts, the URL and the URL parameters (see the example below)
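
For example, with the first page of the table loaded, the Next link points to page 2 with the same per_page value, so parseNextLink returns the two pieces that the unit test below asserts:

nextLink, params := parseNextLink(doc)
// nextLink: "https://www.scrapethissite.com/pages/forms/"
// params:   map[string]string{"page_num": "2", "per_page": "100"}

Note also the CsvHeaders and CsvRow methods on Team: they define the CSV representation of a team, and they are what allows us to dump the results into a CSV file later.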

We need to test that we parse properly. Let's create some unit tests.

But first, download the page into the testdata folder:

 curl -o testdata/teams.html 'https://www.scrapethissite.com/pages/forms/?page_num=1&per_page=100'

Create a file hockey/team_test.go and paste the following:

package hockey

import (
    "os"
    "testing"

    "github.com/PuerkitoBio/goquery"
    "github.com/stretchr/testify/require"
)

func Test_parseTeams(t *testing.T) {
    fd, err := os.Open("../testdata/teams.html")
    require.NoError(t, err)
    defer fd.Close()
    doc, err := goquery.NewDocumentFromReader(fd)
    require.NoError(t, err)

    teams, err := parseTeams(doc)
    require.NoError(t, err)
    require.Equal(t, 100, len(teams))

    team := teams[0]
    require.Equal(t, "Boston Bruins", team.Name)
    require.Equal(t, 1990, team.Year)
    require.Equal(t, 44, team.Wins)
    require.Equal(t, 24, team.Losses)
    require.Equal(t, 0, team.OTLosses)
    require.Equal(t, 0.55, team.WinPct)
    require.Equal(t, 299, team.GoalsFor)
    require.Equal(t, 264, team.GoalsAgainst)
    require.Equal(t, 35, team.GoalDiff)
}

func Test_parseNextLink(t *testing.T) {
    fd, err := os.Open("../testdata/teams.html")
    require.NoError(t, err)
    defer fd.Close()
    doc, err := goquery.NewDocumentFromReader(fd)
    require.NoError(t, err)

    nextLink, params := parseNextLink(doc)
    require.Equal(t, "https://www.scrapethissite.com/pages/forms/", nextLink)
    require.Equal(t, "2", params["page_num"])
    require.Equal(t, "100", params["per_page"])
}

Run

go mod tidy

and then run the unit tests

go test -v ./...

Tests must pass.
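
The output should look roughly like this (timings will differ):

=== RUN   Test_parseTeams
--- PASS: Test_parseTeams (0.01s)
=== RUN   Test_parseNextLink
--- PASS: Test_parseNextLink (0.00s)
PASS
ok      github.com/gosom/scrapemate-highlevel-api-example/hockey       0.015s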

Scraping Job definition

Create a file hockey/collect.go

touch hockey/collect.go

and paste the following:

package hockey

import (
    "context"
    "fmt"
    "net/http"
    "time"

    "github.com/PuerkitoBio/goquery"
    "github.com/google/uuid"
    "github.com/gosom/scrapemate"
)

type TeamCollectJob struct {
    scrapemate.Job
}

func NewTeamCollectJob(u string, params map[string]string) *TeamCollectJob {
    return &TeamCollectJob{
        Job: scrapemate.Job{
            // just give it a random id
            ID:        uuid.New().String(),
            Method:    http.MethodGet,
            URL:       u,
            UrlParams: params,
            Headers: map[string]string{
                "User-Agent": scrapemate.DefaultUserAgent,
            },
            Timeout:    10 * time.Second,
            MaxRetries: 3,
        },
    }
}

func (o *TeamCollectJob) Process(ctx context.Context, resp *scrapemate.Response) (any, []scrapemate.IJob, error) {
    doc, ok := resp.Document.(*goquery.Document)
    if !ok {
        return nil, nil, fmt.Errorf("invalid document type %T expected *goquery.Document", resp.Document)
    }
    teams, err := parseTeams(doc)
    if err != nil {
        return nil, nil, err
    }

    var nextJobs []scrapemate.IJob

    nextLink, params := parseNextLink(doc)
    if nextLink != "" {
        nextJobs = append(nextJobs, NewTeamCollectJob(nextLink, params))
    }

    return teams, nextJobs, nil
}

Here we define the scraping job (TeamCollectJob) and the Process method.

The Process method returns three things:

  • the result (here a slice of Team structs)

  • the next jobs to schedule (here the job defined by the next link of the pagination, if any)

  • an error, if one occurred

The result is passed to the result writers we configure later in main.go, and the next jobs are fed back to the scraper, so the pagination is followed automatically.

Run

go mod tidy

The main function

Create a file main.go

touch main.go

and paste the following:

package main

import (
    "context"
    "encoding/csv"
    "os"

    "github.com/gosom/scrapemate"
    "github.com/gosom/scrapemate-highlevel-api-example/hockey"
    "github.com/gosom/scrapemate/adapters/writers/csvwriter"
    "github.com/gosom/scrapemate/scrapemateapp"
)

func main() {
    if err := run(); err != nil {
        os.Stderr.WriteString(err.Error() + "\n")
        os.Exit(1)
        return
    }
    os.Exit(0)
}

func run() error {
    csvWriter := csvwriter.NewCsvWriter(csv.NewWriter(os.Stdout))

    writers := []scrapemate.ResultWriter{
        csvWriter,
    }

    cfg, err := scrapemateapp.NewConfig(writers)
    if err != nil {
        return err
    }
    app, err := scrapemateapp.NewScrapeMateApp(cfg)
    if err != nil {
        return err
    }
    params := map[string]string{
        "page_num": "1",
        "per_page": "100",
    }
    seedJobs := []scrapemate.IJob{
        hockey.NewTeamCollectJob("https://www.scrapethissite.com/pages/forms/", params),
    }
    return app.Start(context.Background(), seedJobs...)
}

Here we define a csvwriter that writes to stdout:

csvWriter := csvwriter.NewCsvWriter(csv.NewWriter(os.Stdout))

writers := []scrapemate.ResultWriter{
    csvWriter,
}

Then we initialize our scraper:

cfg, err := scrapemateapp.NewConfig(writers)
if err != nil {
    return err
}
app, err := scrapemateapp.NewScrapeMateApp(cfg)
if err != nil {
    return err
}

Finally, we create a seed job (the one that our scraper will use to start) and start the scraper:

params := map[string]string{
    "page_num": "1",
    "per_page": "100",
}
seedJobs := []scrapemate.IJob{
    hockey.NewTeamCollectJob("https://www.scrapethissite.com/pages/forms/", params),
}
return app.Start(context.Background(), seedJobs...)

Run

go mod tidy

Run the scraper

In order to run the scraper just do:

 go run main.go 1>hockey.csv

After all 6 pages are crawled, you can stop the scraper using CTRL-C.

The results will be in hockey.csv.
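
Based on the CsvHeaders and CsvRow methods of Team and the values asserted in the unit tests, the beginning of hockey.csv should look roughly like this:

Name,Year,Wins,Losses,OTLosses,WinPct,GoalsFor,GoalsAgainst,GoalDiff
Boston Bruins,1990,44,24,0,0.55,299,264,35
...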

Summary

Scrapemate is a web scraping framework written in Golang. In this post, we demonstrated how easy it is to create a scraper and save the results in a CSV file.

Read the blog post for the low level API here and see how much easier it is to scrape using the high level API.

See the full example here, and find another example in the github examples.