repos / pico

pico services - prose.sh, pastes.sh, imgs.sh, feeds.sh, pgs.sh
git clone https://github.com/picosh/pico.git

commit
d96be3e
parent
6954c8a
author
Eric Bower
date
2024-11-29 14:12:51 +0000 UTC
chore(analytics): more cleaning of data
5 files changed,  +187, -7
A cmd/scripts/clean-analytics/clean.go
+99, -0
  1@@ -0,0 +1,99 @@
  2+package main
  3+
  4+import (
  5+	"fmt"
  6+	"log/slog"
  7+	"os"
  8+
  9+	"github.com/picosh/pico/db/postgres"
 10+	"github.com/picosh/pico/shared"
 11+)
 12+
 13+func main() {
 14+	logger := slog.Default()
 15+	DbURL := os.Getenv("DATABASE_URL")
 16+	dbpool := postgres.NewDB(DbURL, logger)
 17+	batchSize := 100_000
 18+	offset := 0
 19+
 20+	var totalRows int
 21+	err := dbpool.Db.QueryRow("SELECT count(id) FROM analytics_visits").Scan(&totalRows)
 22+	if err != nil {
 23+		panic(err)
 24+	}
 25+
 26+	fmt.Println("TOTAL ROWS", totalRows)
 27+
 28+	for {
 29+		fmt.Println("===")
 30+		fmt.Println("offset", offset)
 31+		fmt.Println("===")
 32+		rows, err := dbpool.Db.Query("SELECT id, host, referer FROM analytics_visits ORDER BY created_at DESC LIMIT $1 OFFSET $2", batchSize, offset)
 33+		if err != nil {
 34+			panic(err)
 35+		}
 36+
 37+		// Process the rows
 38+		for rows.Next() {
 39+			var id, origHost, origRef string
 40+			err := rows.Scan(
 41+				&id,
 42+				&origHost,
 43+				&origRef,
 44+			)
 45+			if err != nil {
 46+				panic(err)
 47+			}
 48+
 49+			update := false
 50+
 51+			host, err := shared.CleanHost(origHost)
 52+			if err != nil {
 53+				fmt.Println(err)
 54+			}
 55+
 56+			if origHost != host {
 57+				update = true
 58+				fmt.Printf(
 59+					"HOST %s->%s\n",
 60+					origHost, host,
 61+				)
 62+			}
 63+
 64+			ref, err := shared.CleanReferer(origRef)
 65+			if err != nil {
 66+				fmt.Println(err)
 67+			}
 68+
 69+			if origRef != ref {
 70+				update = true
 71+				fmt.Printf(
 72+					"REF %s->%s\n",
 73+					origRef, ref,
 74+				)
 75+			}
 76+
 77+			if update {
 78+				fmt.Printf("Updating visit ID:%s\n", id)
 79+				_, err := dbpool.Db.Exec(
 80+					"UPDATE analytics_visits SET host=$1, referer=$2 WHERE id=$3",
 81+					host,
 82+					ref,
 83+					id,
 84+				)
 85+				if err != nil {
 86+					panic(err)
 87+				}
 88+			}
 89+		}
 90+
 91+		if rows.Err() != nil {
 92+			panic(rows.Err())
 93+		}
 94+
 95+		offset += batchSize
 96+		if offset >= totalRows {
 97+			break
 98+		}
 99+	}
100+}
M db/db.go
+1, -0
1@@ -382,6 +382,7 @@ type DB interface {
2 
3 	InsertVisit(view *AnalyticsVisits) error
4 	VisitSummary(opts *SummaryOpts) (*SummaryVisits, error)
5+	FindVisitSiteList(userID string) ([]string, error)
6 
7 	AddPicoPlusUser(username string, paymentType, txId string) error
8 	FindFeatureForUser(userID string, feature string) (*FeatureFlag, error)
M db/postgres/storage.go
+25, -0
 1@@ -1180,6 +1180,31 @@ func (me *PsqlDB) VisitSummary(opts *db.SummaryOpts) (*db.SummaryVisits, error)
 2 	}, nil
 3 }
 4 
 5+func (me *PsqlDB) FindVisitSiteList(userID string) ([]string, error) {
 6+	siteList := []string{}
 7+
 8+	rs, err := me.Db.Query("SELECT DISTINCT(host) FROM analytics_visits WHERE user_id=$1", userID)
 9+	if err != nil {
10+		return nil, err
11+	}
12+
13+	for rs.Next() {
14+		var host string
15+		err := rs.Scan(
16+			&host,
17+		)
18+		if err != nil {
19+			return nil, err
20+		}
21+		siteList = append(siteList, host)
22+	}
23+	if rs.Err() != nil {
24+		return nil, rs.Err()
25+	}
26+
27+	return siteList, nil
28+}
29+
30 func (me *PsqlDB) FindUsers() ([]*db.User, error) {
31 	var users []*db.User
32 	rs, err := me.Db.Query(sqlSelectUsers)
M db/stub/stub.go
+4, -0
 1@@ -157,6 +157,10 @@ func (me *StubDB) VisitSummary(opts *db.SummaryOpts) (*db.SummaryVisits, error)
 2 	return &db.SummaryVisits{}, notImpl
 3 }
 4 
 5+func (me *StubDB) FindVisitSiteList(userID string) ([]string, error) {
 6+	return []string{}, notImpl
 7+}
 8+
 9 func (me *StubDB) FindUsers() ([]*db.User, error) {
10 	return []*db.User{}, notImpl
11 }
M shared/analytics.go
+58, -7
  1@@ -12,6 +12,7 @@ import (
  2 	"net"
  3 	"net/http"
  4 	"net/url"
  5+	"strings"
  6 	"time"
  7 
  8 	"github.com/picosh/pico/db"
  9@@ -32,7 +33,7 @@ func trackableUserAgent(agent string) error {
 10 	if crawlerdetect.IsCrawler(agent) {
 11 		return fmt.Errorf(
 12 			"request is likely from a bot (User-Agent: %s)",
 13-			cleanUserAgent(agent),
 14+			CleanUserAgent(agent),
 15 		)
 16 	}
 17 	return nil
 18@@ -78,22 +79,66 @@ func cleanUrlFromRequest(r *http.Request) (string, string) {
 19 	return host, r.URL.Path
 20 }
 21 
 22-func cleanUserAgent(ua string) string {
 23+func CleanUserAgent(ua string) string {
 24 	// truncate user-agent because http headers have no text limit
 25 	if len(ua) > 1000 {
 26 		return ua[:1000]
 27 	}
 28-	return ua
 29+	return strings.TrimSpace(ua)
 30 }
 31 
 32-func cleanReferer(ref string) (string, error) {
 33+func filterIp(host string) (string, error) {
 34+	if host == "" {
 35+		return "", nil
 36+	}
 37+	addr := net.ParseIP(host)
 38+	if addr != nil {
 39+		return "", fmt.Errorf("host is an ip")
 40+	}
 41+	return host, nil
 42+}
 43+
 44+func CleanReferer(raw string) (string, error) {
 45+	ref := raw
 46+	if ref == "" {
 47+		return "", nil
 48+	}
 49+	// referer sometimes dont include scheme but we need it
 50+	if !strings.HasPrefix(ref, "http") {
 51+		ref = "https://" + ref
 52+	}
 53 	// we only want to store host for security reasons
 54 	// https://developer.mozilla.org/en-US/docs/Web/Security/Referer_header:_privacy_and_security_concerns
 55 	u, err := url.Parse(ref)
 56 	if err != nil {
 57 		return "", err
 58 	}
 59-	return u.Host, nil
 60+	hostname := u.Hostname()
 61+	hostname, _ = filterIp(hostname)
 62+	hostname = strings.TrimSpace(strings.ToLower(hostname))
 63+	return hostname, err
 64+}
 65+
 66+func CleanHost(raw string) (string, error) {
 67+	prep := strings.TrimSpace(strings.ToLower(raw))
 68+	if prep == "" {
 69+		return "", fmt.Errorf("host is blank")
 70+	}
 71+	// hosts dont usually include scheme but we need it
 72+	if !strings.HasPrefix(prep, "http") {
 73+		prep = "https://" + prep
 74+	}
 75+	// no clue why but our prod data contains periods
 76+	prep = strings.Trim(prep, ".")
 77+	// we only want to store host for security reasons
 78+	// https://developer.mozilla.org/en-US/docs/Web/Security/Referer_header:_privacy_and_security_concerns
 79+	u, err := url.Parse(prep)
 80+	if err != nil {
 81+		return raw, err
 82+	}
 83+	host := u.Hostname()
 84+	host, err = filterIp(host)
 85+	return host, err
 86 }
 87 
 88 var ErrAnalyticsDisabled = errors.New("owner does not have site analytics enabled")
 89@@ -116,12 +161,18 @@ func AnalyticsVisitFromVisit(visit *db.AnalyticsVisits, dbpool db.DB, secret str
 90 	_, path := cleanUrl(visit.Path)
 91 	visit.Path = path
 92 
 93-	referer, err := cleanReferer(visit.Referer)
 94+	referer, err := CleanReferer(visit.Referer)
 95 	if err != nil {
 96 		return err
 97 	}
 98 	visit.Referer = referer
 99-	visit.UserAgent = cleanUserAgent(visit.UserAgent)
100+
101+	hostname, err := CleanHost(visit.Host)
102+	if err != nil {
103+		return err
104+	}
105+	visit.Host = hostname
106+	visit.UserAgent = CleanUserAgent(visit.UserAgent)
107 
108 	return nil
109 }