- commit
- d96be3e
- parent
- 6954c8a
- author
- Eric Bower
- date
- 2024-11-29 14:12:51 +0000 UTC
chore(analytics): more cleaning of data
5 files changed,
+187,
-7
+99,
-0
1@@ -0,0 +1,99 @@
2+package main
3+
4+import (
5+ "fmt"
6+ "log/slog"
7+ "os"
8+
9+ "github.com/picosh/pico/db/postgres"
10+ "github.com/picosh/pico/shared"
11+)
12+
13+func main() {
14+ logger := slog.Default()
15+ DbURL := os.Getenv("DATABASE_URL")
16+ dbpool := postgres.NewDB(DbURL, logger)
17+ batchSize := 100_000
18+ offset := 0
19+
20+ var totalRows int
21+ err := dbpool.Db.QueryRow("SELECT count(id) FROM analytics_visits").Scan(&totalRows)
22+ if err != nil {
23+ panic(err)
24+ }
25+
26+ fmt.Println("TOTAL ROWS", totalRows)
27+
28+ for {
29+ fmt.Println("===")
30+ fmt.Println("offset", offset)
31+ fmt.Println("===")
32+ rows, err := dbpool.Db.Query("SELECT id, host, referer FROM analytics_visits ORDER BY created_at DESC LIMIT $1 OFFSET $2", batchSize, offset)
33+ if err != nil {
34+ panic(err)
35+ }
36+
37+ // Process the rows
38+ for rows.Next() {
39+ var id, origHost, origRef string
40+ err := rows.Scan(
41+ &id,
42+ &origHost,
43+ &origRef,
44+ )
45+ if err != nil {
46+ panic(err)
47+ }
48+
49+ update := false
50+
51+ host, err := shared.CleanHost(origHost)
52+ if err != nil {
53+ fmt.Println(err)
54+ }
55+
56+ if origHost != host {
57+ update = true
58+ fmt.Printf(
59+ "HOST %s->%s\n",
60+ origHost, host,
61+ )
62+ }
63+
64+ ref, err := shared.CleanReferer(origRef)
65+ if err != nil {
66+ fmt.Println(err)
67+ }
68+
69+ if origRef != ref {
70+ update = true
71+ fmt.Printf(
72+ "REF %s->%s\n",
73+ origRef, ref,
74+ )
75+ }
76+
77+ if update {
78+ fmt.Printf("Updating visit ID:%s\n", id)
79+ _, err := dbpool.Db.Exec(
80+ "UPDATE analytics_visits SET host=$1, referer=$2 WHERE id=$3",
81+ host,
82+ ref,
83+ id,
84+ )
85+ if err != nil {
86+ panic(err)
87+ }
88+ }
89+ }
90+
91+ if rows.Err() != nil {
92+ panic(rows.Err())
93+ }
94+
95+ offset += batchSize
96+ if offset >= totalRows {
97+ break
98+ }
99+ }
100+}
M
db/db.go
+1,
-0
1@@ -382,6 +382,7 @@ type DB interface {
2
3 InsertVisit(view *AnalyticsVisits) error
4 VisitSummary(opts *SummaryOpts) (*SummaryVisits, error)
5+ FindVisitSiteList(userID string) ([]string, error)
6
7 AddPicoPlusUser(username string, paymentType, txId string) error
8 FindFeatureForUser(userID string, feature string) (*FeatureFlag, error)
+25,
-0
1@@ -1180,6 +1180,31 @@ func (me *PsqlDB) VisitSummary(opts *db.SummaryOpts) (*db.SummaryVisits, error)
2 }, nil
3 }
4
5+func (me *PsqlDB) FindVisitSiteList(userID string) ([]string, error) {
6+ siteList := []string{}
7+
8+ rs, err := me.Db.Query("SELECT DISTINCT(host) FROM analytics_visits WHERE user_id=$1", userID)
9+ if err != nil {
10+ return nil, err
11+ }
12+
13+ for rs.Next() {
14+ var host string
15+ err := rs.Scan(
16+ &host,
17+ )
18+ if err != nil {
19+ return nil, err
20+ }
21+ siteList = append(siteList, host)
22+ }
23+ if rs.Err() != nil {
24+ return nil, rs.Err()
25+ }
26+
27+ return siteList, nil
28+}
29+
30 func (me *PsqlDB) FindUsers() ([]*db.User, error) {
31 var users []*db.User
32 rs, err := me.Db.Query(sqlSelectUsers)
+4,
-0
1@@ -157,6 +157,10 @@ func (me *StubDB) VisitSummary(opts *db.SummaryOpts) (*db.SummaryVisits, error)
2 return &db.SummaryVisits{}, notImpl
3 }
4
5+func (me *StubDB) FindVisitSiteList(userID string) ([]string, error) {
6+ return []string{}, notImpl
7+}
8+
9 func (me *StubDB) FindUsers() ([]*db.User, error) {
10 return []*db.User{}, notImpl
11 }
1@@ -12,6 +12,7 @@ import (
2 "net"
3 "net/http"
4 "net/url"
5+ "strings"
6 "time"
7
8 "github.com/picosh/pico/db"
9@@ -32,7 +33,7 @@ func trackableUserAgent(agent string) error {
10 if crawlerdetect.IsCrawler(agent) {
11 return fmt.Errorf(
12 "request is likely from a bot (User-Agent: %s)",
13- cleanUserAgent(agent),
14+ CleanUserAgent(agent),
15 )
16 }
17 return nil
18@@ -78,22 +79,66 @@ func cleanUrlFromRequest(r *http.Request) (string, string) {
19 return host, r.URL.Path
20 }
21
22-func cleanUserAgent(ua string) string {
23+func CleanUserAgent(ua string) string {
24 // truncate user-agent because http headers have no text limit
25 if len(ua) > 1000 {
26 return ua[:1000]
27 }
28- return ua
29+ return strings.TrimSpace(ua)
30 }
31
32-func cleanReferer(ref string) (string, error) {
33+func filterIp(host string) (string, error) {
34+ if host == "" {
35+ return "", nil
36+ }
37+ addr := net.ParseIP(host)
38+ if addr != nil {
39+ return "", fmt.Errorf("host is an ip")
40+ }
41+ return host, nil
42+}
43+
44+func CleanReferer(raw string) (string, error) {
45+ ref := raw
46+ if ref == "" {
47+ return "", nil
48+ }
49+ // referer sometimes dont include scheme but we need it
50+ if !strings.HasPrefix(ref, "http") {
51+ ref = "https://" + ref
52+ }
53 // we only want to store host for security reasons
54 // https://developer.mozilla.org/en-US/docs/Web/Security/Referer_header:_privacy_and_security_concerns
55 u, err := url.Parse(ref)
56 if err != nil {
57 return "", err
58 }
59- return u.Host, nil
60+ hostname := u.Hostname()
61+ hostname, _ = filterIp(hostname)
62+ hostname = strings.TrimSpace(strings.ToLower(hostname))
63+ return hostname, err
64+}
65+
66+func CleanHost(raw string) (string, error) {
67+ prep := strings.TrimSpace(strings.ToLower(raw))
68+ if prep == "" {
69+ return "", fmt.Errorf("host is blank")
70+ }
71+ // hosts dont usually include scheme but we need it
72+ if !strings.HasPrefix(prep, "http") {
73+ prep = "https://" + prep
74+ }
75+ // no clue why but our prod data contains periods
76+ prep = strings.Trim(prep, ".")
77+ // we only want to store host for security reasons
78+ // https://developer.mozilla.org/en-US/docs/Web/Security/Referer_header:_privacy_and_security_concerns
79+ u, err := url.Parse(prep)
80+ if err != nil {
81+ return raw, err
82+ }
83+ host := u.Hostname()
84+ host, err = filterIp(host)
85+ return host, err
86 }
87
88 var ErrAnalyticsDisabled = errors.New("owner does not have site analytics enabled")
89@@ -116,12 +161,18 @@ func AnalyticsVisitFromVisit(visit *db.AnalyticsVisits, dbpool db.DB, secret str
90 _, path := cleanUrl(visit.Path)
91 visit.Path = path
92
93- referer, err := cleanReferer(visit.Referer)
94+ referer, err := CleanReferer(visit.Referer)
95 if err != nil {
96 return err
97 }
98 visit.Referer = referer
99- visit.UserAgent = cleanUserAgent(visit.UserAgent)
100+
101+ hostname, err := CleanHost(visit.Host)
102+ if err != nil {
103+ return err
104+ }
105+ visit.Host = hostname
106+ visit.UserAgent = CleanUserAgent(visit.UserAgent)
107
108 return nil
109 }