Store refs and sizes in their own tables, no longer store full User-Agent string

hits.ref and hits.ref_scheme are now in their own refs table; there was
a lot of duplication here, and this saves quite a bit of space. For
example, on goatcounter.com one site has 79 million pageviews with the
same 10-byte string set as the ref. Add in the ref_scheme and that's over
800M of the same string repeated over and over again. Never mind all the
really common strings like "Google", "www.facebook.com" and whatnot
shared between all the different sites.

This also makes it easier to update the refs grouping; for example I
have "yandex.ru/clck/jsredir", "yandex.ru", and a few variants, but
these should all just be "Yandex". Updating that right now is hard
because it has to rewrite so much in the (large) hits table.

The same applies to sizes; especially with the scaling factor included
these can be quite long strings.
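
As a rough sketch of the idea (table and column names here are illustrative
assumptions, not copied from the actual 2022-11-06-1-hits migration): the
repeated strings move into a small lookup table and pageviews only reference
them by ID:

    -- Purely illustrative; the real schema differs in the details.
    create table refs (
        ref_id     integer primary key autoincrement,
        ref        varchar not null,
        ref_scheme varchar null
    );

    -- A pageview then stores the small integer ref_id instead of repeating
    -- "www.reddit.com" (and its scheme) millions of times. Regrouping
    -- referrers only needs to touch the small refs table, instead of
    -- rewriting the (large) hits table:
    update refs set ref = 'Yandex' where ref like 'yandex.ru%';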

All of this saves quite a bit of space; for example, for arp242.net:

                   before    after
    total db size    224M     112M
    hits              90M       9M
    ref_counts        38M      29M
    user_agents       12M        -
    refs                -     0.5M
    sizes               -     0.3M

---

Store browser_id and system_id on the hits table, and no longer store
the User-Agent header. The header was never really needed, and was only
kept so I could correct information later if it turned out there was a
bug in the parser I wrote, but that never proved necessary because the
parser was pretty much right from the get-go. Directly storing these IDs
on hits also simplifies a few queries.
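
As an illustration of that simplification (hypothetical queries, assuming a
browsers lookup table with a name column; not copied from the codebase),
browser statistics no longer need to go through the user_agents table:

    -- Before: hits -> user_agents -> browsers.
    select b.name, count(*) as pageviews
    from hits h
    join user_agents ua using (user_agent_id)
    join browsers    b  using (browser_id)
    group by b.name;

    -- After: browser_id is stored on hits directly.
    select b.name, count(*) as pageviews
    from hits h
    join browsers b using (browser_id)
    group by b.name;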
arp242 committed Jul 31, 2023
1 parent 93e457f commit 2a9aef4
Showing 37 changed files with 783 additions and 598 deletions.
21 changes: 21 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,27 @@ This list is not comprehensive, and only lists new features and major changes,
but not every minor bugfix. The goatcounter.com service generally runs the
latest master.

Unreleased
----------
This release rewrites quite a few tables to a more efficient format. For
small-to-medium instances this will take a few minutes at the most, but if you
have very large instances this may take a few hours. It also requires enough
free disk space to rewrite the `hits` table.

If you want, you can show the SQL this migration will run with:

% goatcounter db migrate -show 2022-11-06-1-hits

Or if you use PostgreSQL:

% goatcounter db migrate -show -db postgresql+dbname=goatcounter 2022-11-06-1-hits

- UA Client hints are now used to get the browser and system name (if present).

- The `User-Agent` header is no longer stored; only the browser and system
  parsed out of it. The parsing is pretty reliable, and especially mobile
  browser User-Agents are ridiculously unique.

2022-11-15 v2.4.1
-----------------
- Fix regression that caused the charts for SQLite to be off.
2 changes: 2 additions & 0 deletions cmd/goatcounter/db.go
@@ -166,6 +166,8 @@ migrate command:
committing it. Useful to test if migrations will run
correctly without actually altering the database.
-show Only show the SQL it would execute, but don't run anything.
Positional arguments are names of the migration, either as just the name
("2020-01-05-2-x") or as the file path ("./db/migrate/2020-01-05-2-x.sql").
80 changes: 40 additions & 40 deletions cmd/goatcounter/db_migrate.go
@@ -21,8 +21,9 @@ import (

func cmdDBMigrate(f zli.Flags, dbConnect, debug *string, createdb *bool) error {
var (
dev = f.Bool(false, "dev").Pointer()
test = f.Bool(false, "test").Pointer()
dev = f.Bool(false, "dev")
test = f.Bool(false, "test")
show = f.Bool(false, "show")
)
err := f.Parse()
if err != nil {
@@ -33,54 +34,53 @@ func cmdDBMigrate(f zli.Flags, dbConnect, debug *string, createdb *bool) error {
return errors.New("need a migration or command")
}

return func(dbConnect, debug string, createdb, dev, test bool) error {
zlog.Config.SetDebug(debug)
zlog.Config.SetDebug(*debug)

db, _, err := connectDB(dbConnect, "", nil, createdb, false)
if err != nil {
return err
}
defer db.Close()
db, _, err := connectDB(*dbConnect, "", nil, *createdb, false)
if err != nil {
return err
}
defer db.Close()

fsys, err := zfs.EmbedOrDir(goatcounter.DB, "", dev.Bool())
if err != nil {
return err
}
m, err := zdb.NewMigrate(db, fsys, gomig.Migrations)
if err != nil {
return err
}

fsys, err := zfs.EmbedOrDir(goatcounter.DB, "", dev)
m.Test(test.Bool())
m.Show(show.Set())

if zslice.ContainsAny(f.Args, "pending", "list") {
have, ran, err := m.List()
if err != nil {
return err
}
m, err := zdb.NewMigrate(db, fsys, gomig.Migrations)
if err != nil {
return err
diff := zslice.Difference(have, ran)
pending := "no pending migrations"
if len(diff) > 0 {
pending = fmt.Sprintf("pending migrations:\n\t%s", strings.Join(diff, "\n\t"))
}

m.Test(test)

if zslice.ContainsAny(f.Args, "pending", "list") {
have, ran, err := m.List()
if err != nil {
return err
}
diff := zslice.Difference(have, ran)
pending := "no pending migrations"
if len(diff) > 0 {
pending = fmt.Sprintf("pending migrations:\n\t%s", strings.Join(diff, "\n\t"))
}

if slices.Contains(f.Args, "list") {
for i := range have {
if slices.Contains(diff, have[i]) {
have[i] = "pending: " + have[i]
}
if slices.Contains(f.Args, "list") {
for i := range have {
if slices.Contains(diff, have[i]) {
have[i] = "pending: " + have[i]
}
fmt.Fprintln(zli.Stdout, strings.Join(have, "\n"))
return nil
}

if len(diff) > 0 {
return errors.New(pending)
}
fmt.Fprintln(zli.Stdout, pending)
fmt.Fprintln(zli.Stdout, strings.Join(have, "\n"))
return nil
}

return m.Run(f.Args...)
}(*dbConnect, *debug, *createdb, *dev, *test)
if len(diff) > 0 {
return errors.New(pending)
}
fmt.Fprintln(zli.Stdout, pending)
return nil
}

return m.Run(f.Args...)
}
15 changes: 0 additions & 15 deletions cmd/goatcounter/db_test.go
@@ -69,21 +69,6 @@ func TestDBQuery(t *testing.T) {
FirstVisit: true,
UserAgentHeader: "Mozilla/5.0 (X11; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0",
})

runCmd(t, exit, "db", "query", "-db="+dbc, "ua")
wantExit(t, exit, out, 0)

want = `
count first_seen browser_name browser_version system_name system_version ua
1 2020-06-18 Firefox 79 Linux ~Z (X11; ~L x86_64; rv:79.0) ~g20100101 ~f79.0`
if pgSQL { // TODO: don't know why the date is printed different.
want = `
count first_seen browser_name browser_version system_name system_version ua
1 2020-06-18 00:00:00 Firefox 79 Linux ~Z (X11; ~L x86_64; rv:79.0) ~g20100101 ~f79.0`
}
if d := zdb.Diff(out.String(), want); d != "" {
t.Error(d)
}
}

func TestDBNewDB(t *testing.T) {
27 changes: 13 additions & 14 deletions cmd/goatcounter/import_test.go
@@ -104,13 +104,12 @@ func runImportClean(ctx context.Context, t *testing.T) func() {
}

if zdb.SQLDialect(ctx) == zdb.DialectSQLite {
err = zdb.Exec(ctx, `update sqlite_sequence set seq = 0 where name in ('hits', 'paths', 'user_agents')`)
err = zdb.Exec(ctx, `update sqlite_sequence set seq = 0 where name in ('hits', 'paths')`)
if err != nil {
t.Fatal(err)
}
err = zdb.Exec(ctx, `delete from user_agents`)
} else {
err = zdb.Exec(ctx, `truncate hits, paths, user_agents`)
err = zdb.Exec(ctx, `truncate hits, paths`)
}
if err != nil {
t.Fatal(err)
@@ -156,10 +155,10 @@ func TestImport(t *testing.T) {

got := zdb.DumpString(ctx, `select * from hits`)
want := `
hit_id site_id path_id user_agent_id session bot ref ref_scheme size location first_visit created_at
1 1 1 1 00112233445566778899aabbccddef03 0 NULL 1280,768,1 AR 1 2020-12-01 00:07:10
2 1 2 1 00112233445566778899aabbccddef03 0 NULL 1280,768,1 AR 1 2020-12-01 00:07:44
3 1 3 2 00112233445566778899aabbccddef04 0 www.reddit.com o 1680,1050,2 RO 1 2020-12-27 00:37:37`
hit_id site_id path_id session bot ref ref_scheme size location first_visit created_at
1 1 1 00112233445566778899aabbccddef03 0 NULL 1280,768,1 AR 1 2020-12-01 00:07:10
2 1 2 00112233445566778899aabbccddef03 0 NULL 1280,768,1 AR 1 2020-12-01 00:07:44
3 1 3 00112233445566778899aabbccddef04 0 www.reddit.com o 1680,1050,2 RO 1 2020-12-27 00:37:37`
if d := ztest.Diff(got, want, ztest.DiffNormalizeWhitespace); d != "" {
t.Error(d)
}
@@ -170,9 +169,9 @@

got := zdb.DumpString(ctx, `select * from hits`)
want := `
hit_id site_id path_id user_agent_id session bot ref ref_scheme size location first_visit created_at
1 1 1 1 00112233445566778899aabbccddef01 0 www.example.com/start.html h 1 2000-10-10 20:55:36
2 1 1 1 00112233445566778899aabbccddef01 0 NULL 0 2000-10-10 20:55:36`
hit_id site_id path_id session bot ref ref_scheme size location first_visit created_at
1 1 1 00112233445566778899aabbccddef01 0 www.example.com/start.html h 1 2000-10-10 20:55:36
2 1 1 00112233445566778899aabbccddef01 0 NULL 0 2000-10-10 20:55:36`
if d := ztest.Diff(got, want, ztest.DiffNormalizeWhitespace); d != "" {
t.Error(d)
}
@@ -195,10 +194,10 @@ func TestImport(t *testing.T) {

got := zdb.DumpString(ctx, `select * from hits`)

want := "hit_id site_id path_id user_agent_id session bot ref ref_scheme size location first_visit created_at\n"
want := "hit_id site_id path_id session bot ref ref_scheme size location first_visit created_at\n"
for i := 1; i < 5; i++ {
want += fmt.Sprintf(
"%-3d 1 1 1 00112233445566778899aabbccddef01 0 www.example.com/start.html h 0 2000-10-10 20:55:36\n",
"%-3d 1 1 00112233445566778899aabbccddef01 0 www.example.com/start.html h 0 2000-10-10 20:55:36\n",
i)

if i == 1 { // first_visit
@@ -226,10 +225,10 @@ func TestImport(t *testing.T) {
}

got := zdb.DumpString(ctx, `select * from hits`)
want := "hit_id site_id path_id user_agent_id session bot ref ref_scheme size location first_visit created_at\n"
want := "hit_id site_id path_id session bot ref ref_scheme size location first_visit created_at\n"
for i := 1; i < 101; i++ {
want += fmt.Sprintf(
"%-3d 1 1 1 00112233445566778899aabbccddef01 0 www.example.com/start.html h 0 2000-10-10 20:55:36\n",
"%-3d 1 1 00112233445566778899aabbccddef01 0 www.example.com/start.html h 0 2000-10-10 20:55:36\n",
i)

if i == 1 { // first_visit
22 changes: 22 additions & 0 deletions context.go
@@ -27,6 +27,8 @@ var (
keyCacheBrowsers = &struct{ n string }{""}
keyCacheSystems = &struct{ n string }{""}
keyCachePaths = &struct{ n string }{""}
keyCacheRefs = &struct{ n string }{""}
keyCacheSizes = &struct{ n string }{""}
keyCacheLoc = &struct{ n string }{""}
keyCacheCampaigns = &struct{ n string }{""}
keyChangedTitles = &struct{ n string }{""}
@@ -141,6 +143,12 @@ func CopyContextValues(ctx context.Context) context.Context {
if c := ctx.Value(keyCachePaths); c != nil {
n = context.WithValue(n, keyCachePaths, c.(*zcache.Cache))
}
if c := ctx.Value(keyCacheRefs); c != nil {
n = context.WithValue(n, keyCacheRefs, c.(*zcache.Cache))
}
if c := ctx.Value(keyCacheSizes); c != nil {
n = context.WithValue(n, keyCacheSizes, c.(*zcache.Cache))
}
if c := ctx.Value(keyCacheLoc); c != nil {
n = context.WithValue(n, keyCacheLoc, c.(*zcache.Cache))
}
@@ -188,6 +196,8 @@ func NewCache(ctx context.Context) context.Context {
ctx = context.WithValue(ctx, keyCacheBrowsers, zcache.New(1*time.Hour, 5*time.Minute))
ctx = context.WithValue(ctx, keyCacheSystems, zcache.New(1*time.Hour, 5*time.Minute))
ctx = context.WithValue(ctx, keyCachePaths, zcache.New(1*time.Hour, 5*time.Minute))
ctx = context.WithValue(ctx, keyCacheRefs, zcache.New(1*time.Hour, 5*time.Minute))
ctx = context.WithValue(ctx, keyCacheSizes, zcache.New(1*time.Hour, 5*time.Minute))
ctx = context.WithValue(ctx, keyCacheLoc, zcache.New(zcache.NoExpiration, zcache.NoExpiration))
ctx = context.WithValue(ctx, keyCacheCampaigns, zcache.New(24*time.Hour, 15*time.Minute))
ctx = context.WithValue(ctx, keyCacheI18n, zcache.New(zcache.NoExpiration, zcache.NoExpiration))
@@ -236,6 +246,18 @@ func cachePaths(ctx context.Context) *zcache.Cache {
}
return zcache.New(0, 0)
}
func cacheRefs(ctx context.Context) *zcache.Cache {
if c := ctx.Value(keyCacheRefs); c != nil {
return c.(*zcache.Cache)
}
return zcache.New(0, 0)
}
func cacheSizes(ctx context.Context) *zcache.Cache {
if c := ctx.Value(keyCacheSizes); c != nil {
return c.(*zcache.Cache)
}
return zcache.New(0, 0)
}
func cacheLoc(ctx context.Context) *zcache.Cache {
if c := ctx.Value(keyCacheLoc); c != nil {
return c.(*zcache.Cache)
49 changes: 1 addition & 48 deletions cron/browser_stat.go
@@ -7,12 +7,10 @@ package cron
import (
"context"
"strconv"
"sync"

"zgo.at/errors"
"zgo.at/goatcounter/v2"
"zgo.at/zdb"
"zgo.at/zlog"
)

func updateBrowserStats(ctx context.Context, hits []goatcounter.Hit) error {
@@ -28,12 +26,8 @@ func updateBrowserStats(ctx context.Context, hits []goatcounter.Hit) error {
if h.Bot > 0 {
continue
}
if h.UserAgentID == nil {
continue
}

if h.BrowserID == 0 {
h.BrowserID, _ = getUA(ctx, *h.UserAgentID)
continue
}

day := h.CreatedAt.Format("2006-01-02")
@@ -70,44 +64,3 @@ func updateBrowserStats(ctx context.Context, hits []goatcounter.Hit) error {
return ins.Finish()
}), "cron.updateBrowserStats")
}

var (
userAgentMap map[int64][2]int64
getUAOnce sync.Once
)

func getUA(ctx context.Context, uaID int64) (browser, system int64) {
// Load all the user_agents in memory; this speeds up things quite a bit,
// and the IDs never change. This is about 4M for 500k rows.
getUAOnce.Do(func() {
var ua []struct {
UserAgentID int64 `db:"user_agent_id"`
BrowserID int64 `db:"browser_id"`
SystemID int64 `db:"system_id"`
}
err := zdb.Select(ctx, &ua,
`select user_agent_id, browser_id, system_id from user_agents`)
if err != nil {
panic(err)
}

userAgentMap = make(map[int64][2]int64, len(ua))
for _, u := range ua {
userAgentMap[u.UserAgentID] = [2]int64{u.BrowserID, u.SystemID}
}
})

ua, ok := userAgentMap[uaID]
if !ok {
var u goatcounter.UserAgent
err := u.ByID(ctx, uaID)
if err != nil {
zlog.Field("uaID", uaID).Error(err)
return 0, 0
}
ua = [2]int64{u.BrowserID, u.SystemID}
userAgentMap[uaID] = ua
}

return ua[0], ua[1]
}