diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json index beee7e1..744673c 100644 --- a/.claude-plugin/plugin.json +++ b/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "posthog", "description": "Access PostHog analytics, feature flags, experiments, error tracking, and insights directly from Claude Code. Optionally capture Claude Code sessions to PostHog LLM Analytics.", - "version": "1.1.21", + "version": "1.1.22", "author": { "name": "PostHog", "email": "hey@posthog.com", diff --git a/.codex-plugin/plugin.json b/.codex-plugin/plugin.json index 653d836..a748b41 100644 --- a/.codex-plugin/plugin.json +++ b/.codex-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "posthog", - "version": "1.0.20", + "version": "1.0.21", "description": "Access PostHog analytics, feature flags, experiments, error tracking, and insights directly from Codex", "author": { "name": "PostHog", diff --git a/.cursor-plugin/plugin.json b/.cursor-plugin/plugin.json index 82f6460..cc92e8c 100644 --- a/.cursor-plugin/plugin.json +++ b/.cursor-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "posthog", "displayName": "PostHog", - "version": "1.1.17", + "version": "1.1.18", "description": "Access PostHog analytics, feature flags, experiments, error tracking, and insights directly from Cursor", "author": { "name": "PostHog", diff --git a/gemini-extension.json b/gemini-extension.json index 11f168d..a3bccd0 100644 --- a/gemini-extension.json +++ b/gemini-extension.json @@ -1,6 +1,6 @@ { "name": "posthog", - "version": "1.0.19", + "version": "1.0.20", "description": "Access PostHog analytics, feature flags, experiments, error tracking, and insights directly from Gemini CLI", "mcpServers": { "posthog": { diff --git a/skills/.sync-manifest b/skills/.sync-manifest index f377bd6..1115e84 100644 --- a/skills/.sync-manifest +++ b/skills/.sync-manifest @@ -31,6 +31,7 @@ instrument-product-analytics investigate-metric investigating-replay managing-experiment-lifecycle +managing-path-cleaning-rules 
managing-subscriptions querying-posthog-data setting-up-a-data-warehouse-source diff --git a/skills/managing-path-cleaning-rules/SKILL.md b/skills/managing-path-cleaning-rules/SKILL.md new file mode 100644 index 0000000..33fa475 --- /dev/null +++ b/skills/managing-path-cleaning-rules/SKILL.md @@ -0,0 +1,180 @@ +--- +name: managing-path-cleaning-rules +description: 'Inspects URL paths and proposes, tests, orders, and applies project-level path cleaning rules so dynamic segments (numeric IDs, UUIDs, slugs, dates) collapse into readable aliases. Use when the user says "clean the paths", "normalize URLs", "group similar pages", "too many distinct paths", "/users/123 and /users/456 are the same page", "set up path cleaning", or asks why a Web analytics or Paths breakdown is fragmented across thousands of nearly-identical URLs. Covers regex syntax (re2), alias placeholder convention, rule ordering, the test workflow, and applying rules via the project-settings-update MCP tool.' +--- + +# Managing path cleaning rules + +Path cleaning rules normalize `$pathname` and `$entry_pathname` so that pages +sharing the same template (`/users/123/profile`, `/users/456/profile`, …) collapse +into one row (`/users/<id>/profile`) in Web analytics tiles, Paths insights, and +any HogQL query that calls `apply_path_cleaning`. They are the right answer when +a breakdown is fragmented across thousands of near-identical URLs. 
+ +This skill teaches you how to: + +- recognize when path cleaning is the right tool +- inspect real paths to find what needs cleaning +- write `regex` + `alias` rules in re2 syntax with the project's placeholder + convention +- test rules before saving them +- order rules so specific patterns aren't swallowed by generic ones +- apply the rules via MCP + +## Data model + +`Team.path_cleaning_filters` is a JSON list of `PathCleaningFilter` objects: + +```json +{ + "regex": "/users/\\d+/profile", + "alias": "/users/<id>/profile", + "order": 0 +} +``` + +- **`regex`** — a [re2](https://github.com/google/re2/wiki/Syntax) pattern. No + need to escape `/`. Anchor with `^` / `$` when you mean it. +- **`alias`** — the replacement string. Use angle-bracket placeholders + (`<id>`, `<uuid>`, `<slug>`, `<date>`) by convention so the cleaned path stays + human-readable. Prefer a plain literal; `\0`–`\9` in the alias are treated as + backreferences by `replaceRegexpAll` (see Common pitfalls). +- **`order`** — integer. Rules apply **sequentially** in `order` ascending, + each rule's output feeds the next. + +Application is `replaceRegexpAll(pathname, regex, alias)` per rule, chained. +Source: `posthog/hogql/property.py:613`. + +## Workflow + +### 1. Confirm path cleaning is the right move + +Ask yourself: is the user complaining about cardinality (too many distinct paths +in a chart), or do they want a per-URL drill-down? Path cleaning is for the +former. If they want per-URL data, suggest a property filter on `$pathname` +instead. + +### 2. Inspect the real paths + +Don't guess at patterns — query them. 
With the `execute-sql` MCP tool: + +```sql +SELECT properties.$pathname AS path, count() AS views +FROM events +WHERE event = '$pageview' + AND timestamp > now() - INTERVAL 7 DAY +GROUP BY path +ORDER BY views DESC +LIMIT 200 +``` + +Scan the result for: + +- numeric IDs: `/users/123`, `/orders/4242` +- UUIDs: `/sessions/8f3c1a3b-…` +- slugs: `/posts/why-i-love-posthog` +- dates: `/archive/2024-09-12` +- locales: `/en-US/`, `/fr-FR/` +- pagination: `?page=3`, `/page/3/` + +### 3. Draft regex + alias + +| Pattern | Example match | `regex` | `alias` | +| ------------------- | ---------------------- | ---------------------------- | ---------------------- | +| Numeric segment | `/users/123/profile` | `/users/\d+/profile` | `/users/<id>/profile` | +| UUID v4 | `/sessions/8f3c1a3b-…` | `/sessions/[0-9a-f-]{36}` | `/sessions/<uuid>` | +| Slug | `/posts/why-posthog` | `/posts/[a-z0-9-]+$` | `/posts/<slug>` | +| ISO date | `/archive/2024-09-12` | `/archive/\d{4}-\d{2}-\d{2}` | `/archive/<date>` | +| Locale prefix | `/en-US/about` | `^/[a-z]{2}-[A-Z]{2}/` | `/<locale>/` | +| Trailing query/page | `/blog?page=3` | `\?page=\d+$` | (empty alias drops it) | + +Anchoring rules of thumb: + +- start the regex with `^` only when the segment must be at the beginning of + the path +- end with `$` to keep a generic rule (e.g. `\d+$`) from matching mid-path + segments + +### 4. Test before saving + +Three options, pick one: + +- **Settings page tester**: `/settings/project#path_cleaning` has a built-in + "test path" input that replays the full ordered chain. +- **Project HogQL** (via `execute-sql`): + + ```sql + SELECT replaceRegexpAll('/users/42/profile', '/users/\d+/profile', '/users/<id>/profile') + ``` + + Chain `replaceRegexpAll` calls in the same order the rules will run if you + want to verify multi-rule interaction. + +- **Built-in AI helper**: there is already an `AiRegexHelper` modal accessible + from the rule editor (`Help me with Regex` button) that turns natural + language into a regex. 
Suggest it to the user when they say "I don't know + regex" — but always validate the output against real paths via the tester. + +### 5. Order rules from most-specific to most-general + +Sequential application means a generic rule placed first will swallow +everything that should have hit a specific rule. + +```text +order=0 /users/me/profile → /users/me/profile (specific, runs first) +order=1 /users/\d+/profile → /users/<id>/profile +order=2 /users/[a-z0-9-]+ → /users/<slug> (catch-all, runs last) +``` + +If `/users/[a-z0-9-]+` ran first it would rewrite `/users/123/profile` to `/users/<slug>/profile` and +make the more specific `\d+` rule unreachable. Because rules chain, a no-op rule does not shield a path from later rules — add `$` to the catch-all if `/users/me/profile` must stay as-is. + +### 6. Apply via MCP + +Use the `project-settings-update` tool with the full list (the field is +replaced, not merged): + +```json +{ + "path_cleaning_filters": [ + { "regex": "/users/me/profile", "alias": "/users/me/profile", "order": 0 }, + { "regex": "/users/\\d+/profile", "alias": "/users/<id>/profile", "order": 1 }, + { "regex": "/users/[a-z0-9-]+", "alias": "/users/<slug>", "order": 2 } + ] +} +``` + +Always **read the existing rules first** (project settings include +`path_cleaning_filters`) and merge — overwriting silently destroys whatever the +team has already configured. + +## Where the rules apply + +When the user (or a HogQL query) opts in: + +- Web analytics: the **Path cleaning** toggle in the page header + (`PathCleaningToggle.tsx`) +- Paths insights: the path cleaning toggle in the insight filters +- HogQL: any query that calls `apply_path_cleaning(path_expr, team)` + +The rules are stored once per project — they are not insight-scoped. + +## Common pitfalls + +- **Backreferences in `alias` need double-escaping** — ClickHouse's + `replaceRegexpAll` supports `\0` (whole match) and `\1`–`\9` (capture + groups). In a JSON field or SQL string literal the backslash must be + doubled, so use `\\1` in `path_cleaning_filters` / HogQL to get the `\1` + backreference at the ClickHouse layer. 
+- **Forgetting `$`** — `\d+` without an end anchor matches every numeric run + in any path, so `/blog/2024-09-12/post` becomes + `/blog/<id>-<id>-<id>/post` when you only meant to match the year + segment. Use `\d+$` or `\d+(/|$)` depending on intent. +- **Escaping `/`** — re2 does not require it. `\/` works but adds noise. +- **Case sensitivity** — re2 is case-sensitive by default. Use `(?i)` at the + start of the pattern for case-insensitive matching, e.g. `(?i)/users/\d+`. +- **Replacing the whole list** — `path_cleaning_filters` is overwrite, not + append. Always start from the current list. +- **Rules apply globally** — adding a rule can change historical numbers in + every Web analytics / Paths chart that has cleaning enabled. Warn the user + before applying anything destructive. diff --git a/skills/querying-posthog-data/references/example-error-tracking.md b/skills/querying-posthog-data/references/example-error-tracking.md index 7af4759..53adf8b 100644 --- a/skills/querying-posthog-data/references/example-error-tracking.md +++ b/skills/querying-posthog-data/references/example-error-tracking.md @@ -10,13 +10,13 @@ SELECT count(DISTINCT uuid) AS occurrences, count(DISTINCT nullIf($session_id, '')) AS sessions, count(DISTINCT coalesce(nullIf(toString(person_id), '00000000-0000-0000-0000-000000000000'), distinct_id)) AS users, - sumForEach(arrayMap(bin -> if(and(greater(timestamp, bin), lessOrEquals(dateDiff('seconds', bin, timestamp), divide(dateDiff('seconds', toDateTime(toDateTime('2026-05-08 01:25:35.268897')), toDateTime(toDateTime('2026-05-09 01:25:35.269594'))), 20))), 1, 0), arrayMap(i -> dateAdd(toDateTime(toDateTime('2026-05-08 01:25:35.268897')), toIntervalSecond(multiply(i, divide(dateDiff('seconds', toDateTime(toDateTime('2026-05-08 01:25:35.268897')), toDateTime(toDateTime('2026-05-09 01:25:35.269594'))), 20)))), range(0, 20)))) AS volumeRange, + sumForEach(arrayMap(bin -> if(and(greater(timestamp, bin), lessOrEquals(dateDiff('seconds', bin, timestamp), 
divide(dateDiff('seconds', toDateTime(toDateTime('2026-05-11 22:29:45.852087')), toDateTime(toDateTime('2026-05-12 22:29:45.852553'))), 20))), 1, 0), arrayMap(i -> dateAdd(toDateTime(toDateTime('2026-05-11 22:29:45.852087')), toIntervalSecond(multiply(i, divide(dateDiff('seconds', toDateTime(toDateTime('2026-05-11 22:29:45.852087')), toDateTime(toDateTime('2026-05-12 22:29:45.852553'))), 20)))), range(0, 20)))) AS volumeRange, argMin(tuple(uuid, distinct_id, timestamp, properties), timestamp) AS first_event, argMax(properties.$lib, timestamp) AS library FROM events AS e WHERE - and(equals(event, '$exception'), isNotNull(e.issue_id), equals(properties.tag, 'max_ai'), greaterOrEquals(timestamp, toDateTime(toDateTime('2026-05-08 01:25:35.268897'))), lessOrEquals(timestamp, toDateTime(toDateTime('2026-05-09 01:25:35.269594'))), or(greater(position(lower(properties.$exception_types), lower('constant')), 0), greater(position(lower(properties.$exception_values), lower('constant')), 0), greater(position(lower(properties.$exception_sources), lower('constant')), 0), greater(position(lower(properties.$exception_functions), lower('constant')), 0), greater(position(lower(properties.email), lower('constant')), 0), greater(position(lower(person.properties.email), lower('constant')), 0))) + and(equals(event, '$exception'), isNotNull(e.issue_id), equals(properties.tag, 'max_ai'), greaterOrEquals(timestamp, toDateTime(toDateTime('2026-05-11 22:29:45.852087'))), lessOrEquals(timestamp, toDateTime(toDateTime('2026-05-12 22:29:45.852553'))), or(greater(position(lower(properties.$exception_types), lower('constant')), 0), greater(position(lower(properties.$exception_values), lower('constant')), 0), greater(position(lower(properties.$exception_sources), lower('constant')), 0), greater(position(lower(properties.$exception_functions), lower('constant')), 0), greater(position(lower(properties.email), lower('constant')), 0), greater(position(lower(person.properties.email), lower('constant')), 
0))) GROUP BY id ORDER BY diff --git a/skills/querying-posthog-data/references/example-logs.md b/skills/querying-posthog-data/references/example-logs.md index 382e8a2..9cef296 100644 --- a/skills/querying-posthog-data/references/example-logs.md +++ b/skills/querying-posthog-data/references/example-logs.md @@ -23,7 +23,7 @@ SELECT FROM logs WHERE - and(and(greaterOrEquals(toStartOfDay(time_bucket), toStartOfDay(assumeNotNull(toDateTime('2025-12-09 00:00:00')))), lessOrEquals(toStartOfDay(time_bucket), toStartOfDay(assumeNotNull(toDateTime('2025-12-10 00:00:00'))))), 1, greaterOrEquals(timestamp, toDateTime('2026-05-08 01:25:38.402844')), indexHint(like(lower(body), '%timeout%')), ilike(toString(body), '%timeout%'), in(severity_text, tuple('warn', 'error', 'fatal'))) + and(and(greaterOrEquals(toStartOfDay(time_bucket), toStartOfDay(assumeNotNull(toDateTime('2025-12-09 00:00:00')))), lessOrEquals(toStartOfDay(time_bucket), toStartOfDay(assumeNotNull(toDateTime('2025-12-10 00:00:00'))))), 1, greaterOrEquals(timestamp, toDateTime('2026-05-11 22:29:47.786146')), indexHint(like(lower(body), '%timeout%')), ilike(toString(body), '%timeout%'), in(severity_text, tuple('warn', 'error', 'fatal'))) ORDER BY timestamp DESC, uuid DESC diff --git a/skills/querying-posthog-data/references/example-session-replay.md b/skills/querying-posthog-data/references/example-session-replay.md index 86001ce..1ec8fbf 100644 --- a/skills/querying-posthog-data/references/example-session-replay.md +++ b/skills/querying-posthog-data/references/example-session-replay.md @@ -19,17 +19,17 @@ SELECT sum(s.console_error_count) AS console_error_count, max(s.retention_period_days) AS retention_period_days, plus(dateTrunc('DAY', start_time), toIntervalDay(coalesce(retention_period_days, 30))) AS expiry_time, - date_diff('DAY', toDateTime('2026-05-09 01:25:39.591112'), expiry_time) AS recording_ttl, - greaterOrEquals(max(s._timestamp), toDateTime('2026-05-09 01:20:39.590287')) AS ongoing, + date_diff('DAY', 
toDateTime('2026-05-12 22:29:48.544635'), expiry_time) AS recording_ttl, + greaterOrEquals(max(s._timestamp), toDateTime('2026-05-12 22:24:48.543846')) AS ongoing, round(multiply(divide(plus(plus(plus(divide(sum(s.active_milliseconds), 1000), sum(s.click_count)), sum(s.keypress_count)), sum(s.console_error_count)), plus(plus(plus(plus(sum(s.mouse_activity_count), dateDiff('SECOND', start_time, end_time)), sum(s.console_error_count)), sum(s.console_log_count)), sum(s.console_warn_count))), 100), 2) AS activity_score FROM raw_session_replay_events AS s WHERE - and(greaterOrEquals(s.min_first_timestamp, toDateTime('2026-05-06 00:00:00.000000')), lessOrEquals(s.min_first_timestamp, toDateTime('2026-05-09 01:25:39.590452'))) + and(greaterOrEquals(s.min_first_timestamp, toDateTime('2026-05-09 00:00:00.000000')), lessOrEquals(s.min_first_timestamp, toDateTime('2026-05-12 22:29:48.544031'))) GROUP BY session_id HAVING - and(greaterOrEquals(expiry_time, toDateTime('2026-05-09 01:25:39.591005')), equals(max(s.is_deleted), 0), greater(active_seconds, 5.0)) + and(greaterOrEquals(expiry_time, toDateTime('2026-05-12 22:29:48.544530')), equals(max(s.is_deleted), 0), greater(active_seconds, 5.0)) ORDER BY start_time DESC, session_id DESC diff --git a/skills/querying-posthog-data/references/example-sessions.md b/skills/querying-posthog-data/references/example-sessions.md index c27d03d..e5bec1e 100644 --- a/skills/querying-posthog-data/references/example-sessions.md +++ b/skills/querying-posthog-data/references/example-sessions.md @@ -13,7 +13,7 @@ SELECT FROM sessions WHERE - and(less($start_timestamp, toDateTime('2026-05-09 01:25:44.964556')), greater($start_timestamp, toDateTime('2026-05-08 01:25:39.965353'))) + and(less($start_timestamp, toDateTime('2026-05-12 22:29:53.829974')), greater($start_timestamp, toDateTime('2026-05-11 22:29:48.830623'))) ORDER BY $start_timestamp DESC LIMIT 50000