Skip to content

Commit 68bbdf9

Browse files
authored
Add streaming GraphQL pagination (#3)
Add streaming GraphQL pagination Add config validation for numeric comparisons and regex patterns Add log level aliases Change default log level from debug to info
1 parent 879b4e4 commit 68bbdf9

11 files changed

Lines changed: 1510 additions & 108 deletions

File tree

README.md

Lines changed: 25 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,9 @@ uv add git+https://github.com/sourcegraph/src-py-lib.git
3434
- `src_py_lib.clients.graphql` — shared GraphQL execution with automatic cursor
3535
pagination, batched alias lookups, and schema introspection export.
3636
- `src_py_lib.clients.sourcegraph` — Sourcegraph GraphQL client with token
37-
validation and shared config fields for `SRC_ENDPOINT` (default:
38-
`https://sourcegraph.com`) and `SRC_ACCESS_TOKEN`.
37+
validation, endpoint normalization, connection streaming, and shared config
38+
fields for `SRC_ENDPOINT` (default: `https://sourcegraph.com`) and
39+
`SRC_ACCESS_TOKEN`.
3940
- `src_py_lib.clients.linear` — Linear GraphQL client with automatic cursor
4041
handling, token validation, shared config fields, and injectable HTTP policy.
4142
- `src_py_lib.clients.slack` — Slack Web API client with token validation,
@@ -70,7 +71,7 @@ import src_py_lib as src
7071

7172
class LinearExportConfig(src.LinearClientConfig):
7273
output_dir: Path = src.config_field(
73-
Path("."),
74+
default=Path("."),
7475
env_var="LINEAR_EXPORT_OUTPUT_DIR",
7576
cli_flag="--output-dir",
7677
metavar="PATH",
@@ -85,32 +86,45 @@ print(f"Writing files under {config.output_dir}")
8586
Config precedence is: code defaults, `.env`, shell environment, then CLI
8687
overrides. API client modules can provide shared Config base classes such as
8788
`LinearClientConfig`, and `parse_args` resolves `op://...` references by
88-
default. Pass a custom `argparse.ArgumentParser` to `parse_args` when a
89-
CLI also has non-Config flags. Mark sensitive fields with `secret=True` so
90-
snapshots do not expose resolved values.
89+
default. `config_field(default=...)` supports aliases, store-true /
90+
store-false command flags, optional values, numeric bounds, and string patterns
91+
for simple CLIs. Pass a custom `argparse.ArgumentParser` to `parse_args` only when you
92+
need parsing beyond Config fields. Help text preserves description and
93+
argument-help newlines, and reserves enough option-column width for long config
94+
flags. Mark sensitive fields with `secret=True` so snapshots do not expose
95+
resolved values.
9196

9297
## Logging example
9398

9499
Configure logging once at process startup. Prefer configuring the root logger
95100
(`logger_name=""`, the default) so project modules and shared `src_py_lib` modules
96101
such as `src_py_lib.utils.http` are captured by the same terminal and JSONL handlers.
97102
Use `logging()` in CLIs to configure logging, add the command field to all
98-
structured events, and emit standard startup metadata.
103+
structured events, and emit standard run/startup/run-end metadata.
99104
Use `debug()`, `info()`, `warning()`, `error()`, and `critical()` for one-off
100105
structured events. Use `event()` blocks around timed work; they emit `trace`,
101-
`span`, and nested `parent_span` fields.
106+
`span`, and nested `parent_span` fields. Use `start_level="debug"` to hide
107+
noisy start events while keeping end timing visible, and
108+
`omit_success_status=True` for very high-volume success events. Use `stage()`
109+
for workflow context such as `stage="apply"`.
102110
When the root logger is configured, noisy `httpx`/`httpcore` records are suppressed;
103111
`HTTPClient` emits structured `http_request` events instead.
104-
Set `SRC_LOG_LEVEL=INFO` for a run to omit DEBUG events from the log file.
112+
Run-end events include HTTP attempt/byte/status/retry counters. Set
113+
`LoggingSettings.resource_sample_interval_seconds` to emit DEBUG
114+
`resource_sample` events and include process resource totals on run end. Set
115+
`SRC_LOG_LEVEL=INFO` for a run to omit DEBUG events from the log file.
116+
`LoggingConfig` includes `--verbose/-v`, `--quiet/-q`, and `--silent/-s`
117+
shortcuts (also available as `SRC_LOG_VERBOSE`, `SRC_LOG_QUIET`, and
118+
`SRC_LOG_SILENT`). Use `logging_settings_from_config()` to build
119+
`LoggingSettings` from those conventions.
105120

106121
```python
107122
import src_py_lib as src
108-
from src_py_lib.clients.sourcegraph import SourcegraphClient
109123

110124
with src.logging({"src_token": "provided"}):
111125
src.info("sync_started", repository_count=3)
112126

113-
client = SourcegraphClient("https://sourcegraph.example.com", "token")
127+
client = src.SourcegraphClient("https://sourcegraph.example.com", "token")
114128
data = client.graphql("query Viewer { currentUser { username } }")
115129
```
116130

src/src_py_lib/__init__.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@
33
from __future__ import annotations
44

55
import sys
6+
from collections.abc import Callable, Mapping
67
from contextlib import AbstractContextManager
78
from pathlib import Path
9+
from typing import Any
810

911
from src_py_lib.clients.github import GitHubClient, PullRequest, gh_cli_token, pr_ref_from_url
1012
from src_py_lib.clients.google_sheets import (
@@ -18,6 +20,7 @@
1820
GraphQLError,
1921
aliased_batched_query,
2022
introspect_schema,
23+
stream_connection_nodes,
2124
)
2225
from src_py_lib.clients.linear import (
2326
LinearClient,
@@ -31,6 +34,12 @@
3134
SlackPacer,
3235
slack_client_from_config,
3336
)
37+
from src_py_lib.clients.sourcegraph import (
38+
SourcegraphClient,
39+
SourcegraphClientConfig,
40+
normalize_sourcegraph_endpoint,
41+
sourcegraph_client_from_config,
42+
)
3443
from src_py_lib.utils.config import (
3544
Config,
3645
ConfigError,
@@ -63,7 +72,11 @@
6372
log,
6473
log_context,
6574
logging_context,
75+
logging_settings_from_config,
76+
resolve_log_level_name,
77+
stage,
6678
startup_event,
79+
submit_with_log_context,
6780
warning,
6881
)
6982
from src_py_lib.utils.tsv import write_tsv
@@ -75,13 +88,17 @@ def logging(
7588
command: str | None = None,
7689
git_cwd: Path | str | None = None,
7790
logging_config: LoggingSettings | None = None,
91+
run_fields: Mapping[str, Any] | None = None,
92+
run_summary: Callable[[], Mapping[str, Any]] | None = None,
7893
) -> AbstractContextManager[Path | None]:
7994
"""Configure standard CLI logging and emit startup metadata."""
8095
return logging_context(
8196
command or _script_name(),
8297
config,
8398
git_cwd=git_cwd,
8499
logging_config=logging_config,
100+
run_fields=run_fields,
101+
run_summary=run_summary,
85102
)
86103

87104

@@ -109,6 +126,8 @@ def _script_name() -> str:
109126
"SlackClientConfig",
110127
"SlackError",
111128
"SlackPacer",
129+
"SourcegraphClient",
130+
"SourcegraphClientConfig",
112131
"aliased_batched_query",
113132
"config_field",
114133
"config_snapshot",
@@ -131,14 +150,21 @@ def _script_name() -> str:
131150
"load_json_cache",
132151
"load_json_subset",
133152
"logging",
153+
"logging_settings_from_config",
134154
"log",
135155
"log_context",
156+
"normalize_sourcegraph_endpoint",
136157
"parse_args",
137158
"pr_ref_from_url",
138159
"quota_project_from_adc",
160+
"resolve_log_level_name",
139161
"save_json_cache",
140162
"slack_client_from_config",
163+
"sourcegraph_client_from_config",
164+
"stage",
141165
"startup_event",
166+
"stream_connection_nodes",
167+
"submit_with_log_context",
142168
"warning",
143169
"write_tsv",
144170
]

src/src_py_lib/clients/graphql.py

Lines changed: 125 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
import json
66
import re
7-
from collections.abc import Callable, Mapping
7+
from collections.abc import Callable, Iterator, Mapping, Sequence
88
from dataclasses import dataclass, field
99
from pathlib import Path
1010
from typing import cast
@@ -113,6 +113,17 @@
113113
class GraphQLError(RuntimeError):
114114
"""Raised for GraphQL transport or application errors."""
115115

116+
def __init__(
117+
self,
118+
message: str,
119+
*,
120+
status_code: int | None = None,
121+
is_application_error: bool = False,
122+
) -> None:
123+
super().__init__(message)
124+
self.status_code = status_code
125+
self.is_application_error = is_application_error
126+
116127

117128
@dataclass
118129
class GraphQLClient:
@@ -174,6 +185,49 @@ def execute_next_page(next_variables: JSONDict) -> JSONDict:
174185
)
175186
return data
176187

188+
def stream_connection_nodes(
189+
self,
190+
query: str,
191+
variables: Mapping[str, JSONValue] | None = None,
192+
*,
193+
connection_path: Sequence[str],
194+
page_size: int | None = None,
195+
first_variable: str = "first",
196+
after_variable: str = "after",
197+
) -> Iterator[JSONDict]:
198+
"""Stream one GraphQL connection's nodes page by page.
199+
200+
`connection_path` is the response path to the connection object that
201+
contains `nodes` and `pageInfo`, for example `("viewer", "items")`.
202+
Unlike `execute(..., follow_pages=True)`, this does not accumulate all
203+
nodes in memory before returning.
204+
"""
205+
page_number = 1
206+
207+
def execute_page(
208+
operation: str, page_variables: Mapping[str, JSONValue] | None
209+
) -> JSONDict:
210+
nonlocal page_number
211+
data = self._execute_once(
212+
operation,
213+
dict(page_variables or {}),
214+
page_number=page_number,
215+
first_variable=first_variable,
216+
after_variable=after_variable,
217+
)
218+
page_number += 1
219+
return data
220+
221+
yield from stream_connection_nodes(
222+
execute_page,
223+
query,
224+
variables,
225+
connection_path=connection_path,
226+
page_size=page_size,
227+
first_variable=first_variable,
228+
after_variable=after_variable,
229+
)
230+
177231
def _execute_once(
178232
self,
179233
query: str,
@@ -200,15 +254,19 @@ def _execute_once(
200254
payload = self.http.json("POST", self.url, headers=self.headers, json_body=body)
201255
except HTTPClientError as exception:
202256
raise GraphQLError(
203-
f"{self.label} GraphQL request failed: {exception}"
257+
f"{self.label} GraphQL request failed: {exception}",
258+
status_code=exception.status_code,
204259
) from exception
205260
errors = payload.get("errors")
206261
data = json_dict(payload.get("data"))
207262
fields["response_fields"] = sorted(data)
208263
if errors:
209264
fields["graphql_errors"] = len(errors) if isinstance(errors, list) else 1
210265
if errors and not (self.tolerate_partial_errors and data):
211-
raise GraphQLError(f"{self.label} GraphQL errors: {errors}")
266+
raise GraphQLError(
267+
f"{self.label} GraphQL errors: {errors}",
268+
is_application_error=True,
269+
)
212270
return data
213271

214272

@@ -218,6 +276,49 @@ def operation_name(query: str) -> str:
218276
return match.group(1) if match else "anonymous"
219277

220278

279+
def stream_connection_nodes(
280+
execute: Callable[[str, Mapping[str, JSONValue] | None], JSONDict],
281+
query: str,
282+
variables: Mapping[str, JSONValue] | None = None,
283+
*,
284+
connection_path: Sequence[str],
285+
page_size: int | None = None,
286+
first_variable: str = "first",
287+
after_variable: str = "after",
288+
) -> Iterator[JSONDict]:
289+
"""Stream one GraphQL connection's nodes through any execute callable."""
290+
page_variables: JSONDict = dict(variables) if variables is not None else {}
291+
if page_size is not None:
292+
page_variables[first_variable] = page_size
293+
query_uses_after_variable = _query_uses_variable(query, after_variable)
294+
if query_uses_after_variable and after_variable not in page_variables:
295+
page_variables[after_variable] = None
296+
297+
path = tuple(connection_path)
298+
current_cursor = page_variables.get(after_variable)
299+
while True:
300+
data = execute(query, dict(page_variables))
301+
page = _node_page_at_path(data, path)
302+
for node in json_list(page.get("nodes")):
303+
yield json_dict(node)
304+
305+
page_info = json_dict(page.get("pageInfo"))
306+
has_next_page = page_info.get("hasNextPage")
307+
if not isinstance(has_next_page, bool):
308+
raise GraphQLError(
309+
f"GraphQL pagination path {_path_label(path)} missing pageInfo.hasNextPage"
310+
)
311+
if not has_next_page:
312+
return
313+
if not query_uses_after_variable:
314+
raise GraphQLError(
315+
f"GraphQL query returned more pages but does not use ${after_variable}"
316+
)
317+
next_cursor = _next_page_cursor(page_info, path, current_cursor)
318+
page_variables[after_variable] = next_cursor
319+
current_cursor = next_cursor
320+
321+
221322
def _int_variable(variables: JSONDict, name: str) -> int | None:
222323
value = variables.get(name)
223324
return value if isinstance(value, int) else None
@@ -301,9 +402,7 @@ def _fetch_remaining_pages(
301402
target_page = _node_page_at_path(data, path)
302403
target_nodes = json_list(target_page.get("nodes"))
303404
page_info = json_dict(target_page.get("pageInfo"))
304-
after = json_str(page_info, "endCursor")
305-
if not after:
306-
raise GraphQLError(f"GraphQL pagination path {'.'.join(path)} missing pageInfo.endCursor")
405+
after = _next_page_cursor(page_info, path, variables.get(after_variable))
307406

308407
while after:
309408
page_variables = dict(variables)
@@ -322,11 +421,7 @@ def _fetch_remaining_pages(
322421
)
323422
if not has_next_page:
324423
return
325-
after = json_str(next_page_info, "endCursor")
326-
if not after:
327-
raise GraphQLError(
328-
f"GraphQL pagination path {'.'.join(path)} missing pageInfo.endCursor"
329-
)
424+
after = _next_page_cursor(next_page_info, path, after)
330425

331426

332427
def _next_page_paths(data: JSONDict) -> list[tuple[str, ...]]:
@@ -355,10 +450,27 @@ def _node_page_at_path(data: JSONDict, path: tuple[str, ...]) -> JSONDict:
355450
current = json_dict(current).get(key)
356451
page = json_dict(current)
357452
if not page:
358-
label = ".".join(path) or "<root>"
359-
raise GraphQLError(f"GraphQL response did not include pagination path {label}")
453+
raise GraphQLError(f"GraphQL response did not include pagination path {_path_label(path)}")
360454
return page
361455

362456

457+
def _next_page_cursor(page_info: JSONDict, path: tuple[str, ...], current_cursor: object) -> str:
458+
next_cursor = json_str(page_info, "endCursor")
459+
if not next_cursor:
460+
raise GraphQLError(
461+
f"GraphQL pagination path {_path_label(path)} missing pageInfo.endCursor"
462+
)
463+
if isinstance(current_cursor, str) and next_cursor == current_cursor:
464+
raise GraphQLError(
465+
f"GraphQL pagination path {_path_label(path)} stalled: "
466+
f"pageInfo.endCursor did not advance from {current_cursor!r}"
467+
)
468+
return next_cursor
469+
470+
471+
def _path_label(path: tuple[str, ...]) -> str:
472+
return ".".join(path) or "<root>"
473+
474+
363475
def _query_uses_variable(query: str, variable: str) -> bool:
364476
return re.search(rf"\${re.escape(variable)}\b", query) is not None

src/src_py_lib/clients/linear.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,11 +49,11 @@ class LinearClientConfig(Config):
4949
"""Config fields needed to build a Linear API client."""
5050

5151
linear_api_token: str = config_field(
52-
"",
52+
default="",
5353
env_var="LINEAR_API_TOKEN",
5454
cli_flag="--linear-api-token",
5555
metavar="TOKEN",
56-
help="Linear API token or op:// secret reference.",
56+
help="Linear API token or op:// secret reference",
5757
secret=True,
5858
required=True,
5959
)

src/src_py_lib/clients/slack.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,11 @@ class SlackClientConfig(Config):
2727
"""Config fields needed to build a Slack API client."""
2828

2929
slack_bot_token: str = config_field(
30-
"",
30+
default="",
3131
env_var="SLACK_BOT_TOKEN",
3232
cli_flag="--slack-bot-token",
3333
metavar="TOKEN",
34-
help="Slack bot token or op:// secret reference.",
34+
help="Slack bot token or op:// secret reference",
3535
secret=True,
3636
required=True,
3737
)

0 commit comments

Comments
 (0)