Eric Bower
·
29 Nov 24
analytics.go
1package shared
2
3import (
4 "context"
5 "crypto/hmac"
6 "crypto/sha256"
7 "encoding/hex"
8 "encoding/json"
9 "errors"
10 "fmt"
11 "log/slog"
12 "net"
13 "net/http"
14 "net/url"
15 "strings"
16 "time"
17
18 "github.com/picosh/pico/db"
19 "github.com/picosh/utils/pipe/metrics"
20 "github.com/simplesurance/go-ip-anonymizer/ipanonymizer"
21 "github.com/x-way/crawlerdetect"
22)
23
// HmacString returns the lowercase hex encoding of the HMAC-SHA256 digest of
// data, keyed with secret.
func HmacString(secret, data string) string {
	mac := hmac.New(sha256.New, []byte(secret))
	// hash.Hash documents that Write never returns an error.
	mac.Write([]byte(data))
	return hex.EncodeToString(mac.Sum(nil))
}
30
31func trackableUserAgent(agent string) error {
32 // dont store requests from bots
33 if crawlerdetect.IsCrawler(agent) {
34 return fmt.Errorf(
35 "request is likely from a bot (User-Agent: %s)",
36 CleanUserAgent(agent),
37 )
38 }
39 return nil
40}
41
42func trackableRequest(r *http.Request) error {
43 agent := r.UserAgent()
44 return trackableUserAgent(agent)
45}
46
47func cleanIpAddress(ip string) (string, error) {
48 host, _, err := net.SplitHostPort(ip)
49 if err != nil {
50 host = ip
51 }
52 // /24 IPv4 subnet mask
53 // /64 IPv6 subnet mask
54 anonymizer := ipanonymizer.NewWithMask(
55 net.CIDRMask(24, 32),
56 net.CIDRMask(64, 128),
57 )
58 anonIp, err := anonymizer.IPString(host)
59 return anonIp, err
60}
61
// cleanUrl parses orig and returns its host and path components. The query
// string and fragment are not part of the result. Both return values are empty
// when orig cannot be parsed.
func cleanUrl(orig string) (string, string) {
	if parsed, err := url.Parse(orig); err == nil {
		return parsed.Host, parsed.Path
	}
	return "", ""
}
69
// cleanUrlFromRequest returns the host and path a request was made for.
//
// The host is the first non-empty value among the x-forwarded-host header,
// r.URL.Host and r.Host, in that order. Only r.URL.Path is returned for the
// path: query parameters are deliberately left out for security reasons.
func cleanUrlFromRequest(r *http.Request) (string, string) {
	candidates := [...]string{
		r.Header.Get("x-forwarded-host"),
		r.URL.Host,
		r.Host,
	}

	host := ""
	for _, candidate := range candidates {
		if candidate != "" {
			host = candidate
			break
		}
	}
	return host, r.URL.Path
}
81
// CleanUserAgent normalizes a User-Agent header value for storage.
//
// Surrounding whitespace is removed and the result is capped at 1000 bytes,
// because HTTP itself places no limit on header length. When the cap falls in
// the middle of a multi-byte UTF-8 sequence, the cut is moved back to the
// start of that sequence so a valid input never becomes invalid UTF-8.
func CleanUserAgent(ua string) string {
	const maxLen = 1000

	ua = strings.TrimSpace(ua)
	if len(ua) <= maxLen {
		return ua
	}

	// A UTF-8 continuation byte has the bit pattern 10xxxxxx. If the first
	// dropped byte is one, the kept prefix ends with a partial rune; step
	// back past it. A rune has at most three continuation bytes, so the
	// back-off is bounded even for malformed input.
	cut := maxLen
	for i := 0; i < 3 && ua[cut]&0xC0 == 0x80; i++ {
		cut--
	}

	// Truncation may expose trailing whitespace that was interior before.
	return strings.TrimSpace(ua[:cut])
}
89
// filterIp rejects hosts that are literal IP addresses.
//
// An empty host is passed through as ("", nil). A host that net.ParseIP
// accepts yields ("", error). Any other host is returned unchanged.
func filterIp(host string) (string, error) {
	switch {
	case host == "":
		return "", nil
	case net.ParseIP(host) != nil:
		return "", fmt.Errorf("host is an ip")
	default:
		return host, nil
	}
}
100
// CleanReferer reduces a Referer header value to a lowercase hostname.
//
// Only the hostname is kept for security reasons:
// https://developer.mozilla.org/en-US/docs/Web/Security/Referer_header:_privacy_and_security_concerns
//
// A blank referer, or one whose host is a literal IP address, yields ("", nil).
// A value that cannot be parsed as a URL yields ("", err) with the error from
// url.Parse.
func CleanReferer(raw string) (string, error) {
	// Normalize before inspecting the scheme: schemes and hostnames are
	// case-insensitive, and only the hostname survives, so lowercasing the
	// whole value is safe.
	ref := strings.ToLower(strings.TrimSpace(raw))
	if ref == "" {
		return "", nil
	}

	// Referers sometimes omit the scheme, but url.Parse needs one to find the
	// host. Match the full scheme prefix: a bare "http" check would mistake
	// hosts such as "httpbin.org" for URLs that already carry a scheme.
	if !strings.HasPrefix(ref, "http://") && !strings.HasPrefix(ref, "https://") {
		ref = "https://" + ref
	}

	u, err := url.Parse(ref)
	if err != nil {
		return "", err
	}

	hostname := u.Hostname()
	// Literal IP referers are dropped silently rather than reported.
	if net.ParseIP(hostname) != nil {
		return "", nil
	}
	return hostname, nil
}
121
122func CleanHost(raw string) (string, error) {
123 prep := strings.TrimSpace(strings.ToLower(raw))
124 if prep == "" {
125 return "", fmt.Errorf("host is blank")
126 }
127 // hosts dont usually include scheme but we need it
128 if !strings.HasPrefix(prep, "http") {
129 prep = "https://" + prep
130 }
131 // no clue why but our prod data contains periods
132 prep = strings.Trim(prep, ".")
133 // we only want to store host for security reasons
134 // https://developer.mozilla.org/en-US/docs/Web/Security/Referer_header:_privacy_and_security_concerns
135 u, err := url.Parse(prep)
136 if err != nil {
137 return raw, err
138 }
139 host := u.Hostname()
140 host, err = filterIp(host)
141 return host, err
142}
143
// ErrAnalyticsDisabled is returned by AnalyticsVisitFromVisit and
// AnalyticsVisitFromRequest when the "analytics" feature check fails for the
// user.
var ErrAnalyticsDisabled = errors.New("owner does not have site analytics enabled")
145
146func AnalyticsVisitFromVisit(visit *db.AnalyticsVisits, dbpool db.DB, secret string) error {
147 if !dbpool.HasFeatureForUser(visit.UserID, "analytics") {
148 return ErrAnalyticsDisabled
149 }
150
151 err := trackableUserAgent(visit.UserAgent)
152 if err != nil {
153 return err
154 }
155
156 ipAddress, err := cleanIpAddress(visit.IpAddress)
157 if err != nil {
158 return err
159 }
160 visit.IpAddress = HmacString(secret, ipAddress)
161 _, path := cleanUrl(visit.Path)
162 visit.Path = path
163
164 referer, err := CleanReferer(visit.Referer)
165 if err != nil {
166 return err
167 }
168 visit.Referer = referer
169
170 hostname, err := CleanHost(visit.Host)
171 if err != nil {
172 return err
173 }
174 visit.Host = hostname
175 visit.UserAgent = CleanUserAgent(visit.UserAgent)
176
177 return nil
178}
179
180func ipFromRequest(r *http.Request) string {
181 // https://caddyserver.com/docs/caddyfile/directives/reverse_proxy#defaults
182 ipOrig := r.Header.Get("x-forwarded-for")
183 if ipOrig == "" {
184 ipOrig = r.RemoteAddr
185 }
186 // probably means this is a web tunnel
187 if ipOrig == "" || ipOrig == "@" {
188 sshCtx, err := GetSshCtx(r)
189 if err == nil {
190 ipOrig = sshCtx.RemoteAddr().String()
191 }
192 }
193
194 return ipOrig
195}
196
197func AnalyticsVisitFromRequest(r *http.Request, dbpool db.DB, userID string) (*db.AnalyticsVisits, error) {
198 if !dbpool.HasFeatureForUser(userID, "analytics") {
199 return nil, ErrAnalyticsDisabled
200 }
201
202 err := trackableRequest(r)
203 if err != nil {
204 return nil, err
205 }
206
207 ipAddress := ipFromRequest(r)
208 host, path := cleanUrlFromRequest(r)
209
210 return &db.AnalyticsVisits{
211 UserID: userID,
212 Host: host,
213 Path: path,
214 IpAddress: ipAddress,
215 UserAgent: r.UserAgent(),
216 Referer: r.Referer(),
217 Status: http.StatusOK,
218 }, nil
219}
220
221func AnalyticsCollect(ch chan *db.AnalyticsVisits, dbpool db.DB, logger *slog.Logger) {
222 drain := metrics.RegisterReconnectMetricRecorder(
223 context.Background(),
224 logger,
225 NewPicoPipeClient(),
226 100,
227 10*time.Millisecond,
228 )
229
230 for visit := range ch {
231 data, err := json.Marshal(visit)
232 if err != nil {
233 logger.Error("could not json marshall visit record", "err", err)
234 continue
235 }
236
237 data = append(data, '\n')
238
239 _, err = drain.Write(data)
240 if err != nil {
241 logger.Error("could not write to metric-drain", "err", err)
242 }
243 }
244}