From 73458bcb047e315a4a3501341bb47cf1c23ed028 Mon Sep 17 00:00:00 2001 From: gnathoi Date: Fri, 4 Jul 2025 10:36:46 +0100 Subject: [PATCH 01/19] add code owners file to the public repo --- .github/CODEOWNERS | 6 +++ examples/query_assist.py | 107 ++++++++++++++++++++++++++++----------- 2 files changed, 84 insertions(+), 29 deletions(-) create mode 100644 .github/CODEOWNERS diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..c903a16 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,6 @@ +# CODEOWNERS +# These owners will be the default owners for everything in +# the repo. Unless a later match takes precedence, +# @tomtitherington, @garyedwards and @gnathoi will be requested for +# review when someone opens a pull request. +* @tomtitherington @garyedwards @gnathoi diff --git a/examples/query_assist.py b/examples/query_assist.py index 099bbef..33946e5 100644 --- a/examples/query_assist.py +++ b/examples/query_assist.py @@ -4,6 +4,7 @@ import logging import os import re +from datetime import datetime import httpx from rdflib.plugins.stores.sparqlstore import SPARQLStore @@ -15,6 +16,7 @@ def parse_args(): + """Parses command-line arguments.""" parser = argparse.ArgumentParser( description="Download assets, list UPRNs by output area, or map ODS→UPRN from a DID triplestore."
) @@ -58,6 +60,7 @@ def parse_args(): def load_column_from_csv(path, column): + """Loads a single column from a CSV file.""" values = [] with open(path, newline="") as cf: reader = csv.DictReader(cf) @@ -82,17 +85,20 @@ def asset_subdir(enum_iri: str) -> str: def build_asset_query(uprn_list, args): + """Builds the SPARQL query to fetch asset data including result times.""" prefixes = """ PREFIX did: PREFIX dob: PREFIX so: PREFIX sosa: PREFIX prov: + PREFIX xsd: """ - select = "SELECT DISTINCT ?uprnValue ?contentUrl ?enum\n" + select = "SELECT DISTINCT ?uprnValue ?contentUrl ?enum ?resultTime\n" where = [ " ?res so:contentUrl ?contentUrl .", - " ?res dob:typeQualifier ?enum .", # ① now unconditional + " ?res dob:typeQualifier ?enum .", + " ?res sosa:resultTime ?resultTime .", " ?res ( ^sosa:hasResult | ^prov:generated / prov:used )* ?obs .", " ?obs a sosa:Observation ;", " sosa:hasFeatureOfInterest ?foi .", @@ -102,15 +108,18 @@ def build_asset_query(uprn_list, args): if args.sensor: where.append(f" ?obs sosa:madeBySensor {args.sensor} .") - quoted = ", ".join(f'"{u}"' for u in uprn_list) - where.append(f" FILTER(str(?uprnValue) IN ({quoted}))") + quoted_uprns = ", ".join(f'"{u}"' for u in uprn_list) + where.append(f" FILTER(str(?uprnValue) IN ({quoted_uprns}))") if args.types: - where.append(f" FILTER(?enum IN ({args.types}))") + # Assuming types are full IRIs, wrap them in <> + quoted_types = ", ".join(f"<{t.strip()}>" for t in args.types.split(",")) + where.append(f" FILTER(?enum IN ({quoted_types}))") return prefixes + select + "WHERE {\n" + "\n".join(where) + "\n}" def build_output_area_query(area_list): + """Builds the SPARQL query to fetch UPRNs within given output areas.""" prefixes = """ PREFIX spr: PREFIX so: @@ -129,6 +138,7 @@ def build_output_area_query(area_list): def build_ods_to_uprn_query(ods_list): + """Builds the SPARQL query to map ODS codes to UPRNs.""" prefixes = """ PREFIX dob: PREFIX so: @@ -148,22 +158,35 @@ def 
build_ods_to_uprn_query(ods_list): def download_asset(url: str, save_dir: str, api_key: str): + """Downloads a single asset from a URL to a specified directory.""" try: - resp = httpx.get(url, headers={"x-api-key": api_key}, timeout=120) - resp.raise_for_status() - cd = resp.headers.get("Content-Disposition", "") - m = re.search(r'filename="([^"]+)"', cd) - fn = m.group(1) if m else os.path.basename(url) - os.makedirs(save_dir, exist_ok=True) - path = os.path.join(save_dir, fn) - with open(path, "wb") as f: - f.write(resp.content) - logging.info(f"✔ Saved {url} → {path}") + # Use a client with a longer timeout for potentially large files + with httpx.Client(timeout=120.0) as client: + resp = client.get(url, headers={"x-api-key": api_key}) + resp.raise_for_status() + + # Determine filename from Content-Disposition or URL + cd = resp.headers.get("Content-Disposition", "") + m = re.search(r'filename="([^"]+)"', cd) + fn = m.group(1) if m else os.path.basename(url) + + os.makedirs(save_dir, exist_ok=True) + path = os.path.join(save_dir, fn) + + with open(path, "wb") as f: + f.write(resp.content) + logging.info(f"✔ Saved {url} → {path}") + + except httpx.HTTPStatusError as e: + logging.error( + f"✖ HTTP error downloading {url}: {e.response.status_code} - {e.response.text}" + ) except Exception as e: logging.error(f"✖ Failed to download {url}: {e}") def main(): + """Main execution function.""" args = parse_args() download_base = args.download_dir or os.path.join(os.getcwd(), "downloads") os.makedirs(download_base, exist_ok=True) @@ -176,11 +199,11 @@ def main(): ods_list.extend(load_column_from_csv(entry, "ods")) else: ods_list.extend(o.strip() for o in entry.split(",") if o.strip()) - ods_list = list(dict.fromkeys(ods_list)) + ods_list = sorted(list(dict.fromkeys(ods_list))) # Sort for consistent query store = SPARQLStore(query_endpoint=args.db_url, returnFormat="json") q = build_ods_to_uprn_query(ods_list) - logging.info("SPARQL query for ODS→UPRN mapping with 
recCodeAddress:\n%s", q) + logging.info("SPARQL query for ODS→UPRN mapping:\n%s", q) res = store.query(q) out_csv = os.path.join(download_base, "ods_to_uprn.csv") @@ -224,7 +247,9 @@ def main(): grouping = {} for row in res: - grouping.setdefault(row["outputArea"], []).append(row["uprnValue"]) + grouping.setdefault(str(row["outputArea"]), []).append( + str(row["uprnValue"]) + ) for oa, uprns in grouping.items(): name = oa.split("/")[-1] @@ -232,7 +257,7 @@ def main(): with open(out_csv, "w", newline="") as cf: writer = csv.writer(cf) writer.writerow(["uprn"]) - for u in uprns: + for u in sorted(uprns): writer.writerow([u]) logging.info(f"✔ Saved CSV for {oa} → {out_csv}") @@ -244,13 +269,15 @@ def main(): uprn_list.extend(load_column_from_csv(entry, "uprn")) else: uprn_list.extend(u.strip() for u in entry.split(",") if u.strip()) - uprn_list = list(dict.fromkeys(uprn_list)) + uprn_list = sorted(list(dict.fromkeys(uprn_list))) # Sort for consistent query if uprn_list: api_key = os.getenv(args.api_key_env) if not api_key: - logging.error(f"Env var {args.api_key_env!r} is not set") - raise RuntimeError(f"Env var {args.api_key_env!r} is not set") + logging.error( + f"API key environment variable {args.api_key_env!r} is not set." 
+ ) + return # Exit gracefully store = SPARQLStore(query_endpoint=args.db_url, returnFormat="json") q = build_asset_query(uprn_list, args) @@ -258,15 +285,37 @@ def main(): res = store.query(q) for row in res: - uprn_val = str(row["uprnValue"]) - url = str(row["contentUrl"]) - enum_iri = str(row["enum"]) - subdir = asset_subdir(enum_iri) + try: + uprn_val = str(row["uprnValue"]) + url = str(row["contentUrl"]) + enum_iri = str(row["enum"]) + + # rdflib automatically parses xsd:dateTime into a Python datetime object + result_time_obj = row["resultTime"].value + if isinstance(result_time_obj, datetime): + date_str = result_time_obj.strftime("%Y-%m-%d") + else: + # Fallback for unexpected date formats + date_str = str(result_time_obj).split("T")[0] + + asset_type_subdir = asset_subdir(enum_iri) + + # Construct the new directory structure: //// + tgt_dir = os.path.join( + download_base, uprn_val, date_str, asset_type_subdir + ) - tgt_dir = os.path.join(download_base, uprn_val, subdir) - logging.info(f"⤷ Downloading {url} into {tgt_dir}/") + logging.info(f"⤷ Queuing download for {url} into {tgt_dir}/") + download_asset(url, tgt_dir, api_key) - download_asset(url, tgt_dir, api_key) + except KeyError as e: + logging.error( + f"✖ Query result row was missing expected key: {e}. Row: {row}" + ) + except Exception as e: + logging.error( + f"✖ An unexpected error occurred while processing row {row}: {e}" + ) if __name__ == "__main__": From ebb79c26c9a65d5250b6cd147269ab2000e92c8e Mon Sep 17 00:00:00 2001 From: gnathoi Date: Fri, 4 Jul 2025 10:45:03 +0100 Subject: [PATCH 02/19] add changelog --- CHANGELOG.md | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..ea84bbe --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,43 @@ +# Changelog + +All notable changes to this project will be documented in this file. 
+ +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### Added +- Add a `CODEOWNERS` file for ownership management. +- Add a `CHANGELOG.md` file for clear documentation of the development of the asset-api. + +### Changed +- Refactor the query_assist.py output folder structure for dates of obersvations. + +--- + +## [0.0.2] - 2025-06-25 + +### Added +- Implement testing infrastructure. + +### Changed +- Refactor the query_assist.py output folder structure for types. + +--- + +## [0.0.1] - 2025-05-02 + +### Added +- Add initial Python examples. +- Create a `LICENSE` file. +- Add `.gitignore` to the project setup. +- Add the initial OpenAPI specification. +- Add the project `README.md` file. + +### Changed +- Update and enhance Python examples. +- Update the `LICENSE` file. + +### Fixed +- Correct formatting to be comma-separated. From 97eef2d35d272056a9f9ac1cba0c3374acef1ba4 Mon Sep 17 00:00:00 2001 From: gnathoi Date: Fri, 4 Jul 2025 10:50:10 +0100 Subject: [PATCH 03/19] sosa:phenomenonTime added to query --- examples/query_assist.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/examples/query_assist.py b/examples/query_assist.py index 33946e5..6cb54fe 100644 --- a/examples/query_assist.py +++ b/examples/query_assist.py @@ -85,7 +85,7 @@ def asset_subdir(enum_iri: str) -> str: def build_asset_query(uprn_list, args): - """Builds the SPARQL query to fetch asset data including result times.""" + """Builds the SPARQL query to fetch asset data including phenomenon times.""" prefixes = """ PREFIX did: PREFIX dob: @@ -94,13 +94,13 @@ def build_asset_query(uprn_list, args): PREFIX prov: PREFIX xsd: """ - select = "SELECT DISTINCT ?uprnValue ?contentUrl ?enum ?resultTime\n" + select = "SELECT DISTINCT ?uprnValue ?contentUrl ?enum ?phenomenonTime\n" where = [ " ?res so:contentUrl ?contentUrl .", "
?res dob:typeQualifier ?enum .", - " ?res sosa:resultTime ?resultTime .", " ?res ( ^sosa:hasResult | ^prov:generated / prov:used )* ?obs .", " ?obs a sosa:Observation ;", + " sosa:phenomenonTime ?phenomenonTime ;", " sosa:hasFeatureOfInterest ?foi .", " ?foi so:identifier ?uprnRes .", " ?uprnRes a dob:UPRNValue ; so:value ?uprnValue .", @@ -290,17 +290,15 @@ def main(): url = str(row["contentUrl"]) enum_iri = str(row["enum"]) - # rdflib automatically parses xsd:dateTime into a Python datetime object - result_time_obj = row["resultTime"].value - if isinstance(result_time_obj, datetime): - date_str = result_time_obj.strftime("%Y-%m-%d") + phenomenon_time_obj = row["phenomenonTime"].value + if isinstance(phenomenon_time_obj, datetime): + date_str = phenomenon_time_obj.strftime("%Y-%m-%d") else: # Fallback for unexpected date formats - date_str = str(result_time_obj).split("T")[0] + date_str = str(phenomenon_time_obj).split("T")[0] asset_type_subdir = asset_subdir(enum_iri) - # Construct the new directory structure: //// tgt_dir = os.path.join( download_base, uprn_val, date_str, asset_type_subdir ) From cb539ff84511707456073b0d1a28fff337e8d785 Mon Sep 17 00:00:00 2001 From: gnathoi Date: Fri, 4 Jul 2025 10:53:07 +0100 Subject: [PATCH 04/19] fix: correct spelling in changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ea84bbe..d56b042 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add a `CHANGELOG.md` file for clear documentation of the development of the asset-api. ### Changed -- Refactor the query_assist.py output folder structure for dates of obersvations. +- Refactor the query_assist.py output folder structure for dates of observations.
--- From 32626b7b2a195149cd902e0cb027958a4420092c Mon Sep 17 00:00:00 2001 From: gnathoi Date: Tue, 26 Aug 2025 14:11:44 +0100 Subject: [PATCH 05/19] add: nl querying --- .gitignore | 2 +- examples/nl_query_cli.py | 603 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 604 insertions(+), 1 deletion(-) create mode 100644 examples/nl_query_cli.py diff --git a/.gitignore b/.gitignore index 856c297..6685d36 100644 --- a/.gitignore +++ b/.gitignore @@ -30,7 +30,7 @@ MANIFEST # Virtual environments venv/ ENV/ -env/ +env*/ .venv/ .ENV/ .env/ diff --git a/examples/nl_query_cli.py b/examples/nl_query_cli.py new file mode 100644 index 0000000..ddb72e7 --- /dev/null +++ b/examples/nl_query_cli.py @@ -0,0 +1,603 @@ +#!/usr/bin/env python3 +""" +Natural-language CLI router for query_assist.py using an Ollama-served model. + +- Sends a compact routing prompt (system + few shots + user) to Ollama's /api/chat. +- Expects the model to emit ONLY a JSON object describing the desired command. +- Builds an argv list and shells out to query_assist.py accordingly. + +Dependencies: + pip install -U requests + +Assumptions: + - An Ollama server is reachable (e.g., in a container) at: + base URL from $OLLAMA_HOST or http://localhost:11434 + - The desired model (e.g., 'llama3.1:8b-instruct') is pulled and available in Ollama. + +Notes: + - We set `format: "json"` in the chat request to bias the model toward a clean JSON response. + - We still robustly slice the first JSON object from the returned text to tolerate minor deviations. + +Environment: + - OLLAMA_HOST (optional): e.g., http://localhost:11434 or http://ollama:11434 +""" + +import argparse +import json +import os +import re +import shlex +import subprocess +import sys +from typing import Any, Dict, List, Tuple, Union + +import requests + +SYSTEM_ROUTER_PROMPT = """You are a rigorous function-call router for a Python CLI named query_assist.py. 
+ +Supported commands and how to populate them: + +1) download_assets + Required: uprn (string CSV path OR array of strings like ["5045394","200003455212"]) + Optional: sensor (string, e.g., "bess:OusterLidarSensor") + types (array of strings; each a type IRI, e.g., ["did:rgb-image","did:lidar-pointcloud-merged"]) + download_dir (string path) + api_key_env (string, name of env var with API key) + db_url (string URL to SPARQL endpoint) + +2) ods_to_uprn + Required: ods (string CSV path OR array of strings like ["G85013","Q12345"]) + +3) uprns_by_output_area + Required: output_area (string CSV path OR array of strings, e.g., ["E00004550","E00032882"]) + +Schema (MUST output exactly one JSON object with these keys as needed): +{ + "command": "download_assets" | "ods_to_uprn" | "uprns_by_output_area", + "uprn": string | string[] | null, + "ods": string | string[] | null, + "output_area": string | string[] | null, + "sensor": string | null, + "types": string[] | null, + "download_dir": string | null, + "api_key_env": string | null, + "db_url": string | null +} + +Constraints: +- Return ONLY the JSON object. No prose, no markdown. +- If the user request implies asset types, map them to the supported IRIs if possible: + - RGB image -> "did:rgb-image" + - merged lidar point cloud -> "did:lidar-pointcloud-merged" + - lidar range panorama -> "did:lidar-range-pano" + - lidar reflectance panorama -> "did:lidar-reflectance-pano" + - lidar signal panorama -> "did:lidar-signal-pano" + - lidar near-infrared panorama -> "did:lidar-nearir-pano" + - IR false color -> "did:ir-false-color-image" + - IR temperature array -> "did:ir-temperature-array" + - IR counts -> "did:ir-count-image" + - temperature (no contentUrl) -> "did:celsius-temperature" + - relative humidity (no contentUrl) -> "did:relative-humidity" +- Prefer being decisive. When in doubt, infer sensible defaults. 
+""" + +FEW_SHOTS: List[Tuple[str, str]] = [ + ( + "Download the merged lidar point cloud for UPRN 5045394 into /data/assets. " + "Use MY_KEY as the env var for the API key.", + '{"command":"download_assets","uprn":["5045394"],"sensor":null,' + '"types":["did:lidar-pointcloud-merged"],"download_dir":"/data/assets",' + '"api_key_env":"MY_KEY","db_url":null,"ods":null,"output_area":null}', + ), + ( + "Map ODS G85013 to UPRNs.", + '{"command":"ods_to_uprn","ods":["G85013"],"uprn":null,"output_area":null,' + '"sensor":null,"types":null,"download_dir":null,"api_key_env":null,"db_url":null}', + ), + ( + "List all UPRNs in output areas E00004550 and E00032882.", + '{"command":"uprns_by_output_area","output_area":["E00004550","E00032882"],' + '"uprn":null,"ods":null,"sensor":null,"types":null,"download_dir":null,' + '"api_key_env":null,"db_url":null}', + ), +] + + +def extract_assistant_text_from_ollama(resp: Dict[str, Any]) -> str: + """ + Extract the assistant's text from Ollama /api/chat or /api/generate response. + """ + # Preferred: /api/chat schema + msg = resp.get("message") + if isinstance(msg, dict): + content = msg.get("content") + if isinstance(content, str) and content.strip(): + return content + + # Fallback: /api/generate schema + if isinstance(resp.get("response"), str) and resp["response"].strip(): + return resp["response"] + + # Last resort: stringify + return json.dumps(resp, ensure_ascii=False) + + +def slice_first_json_object(text: str) -> str: + """ + Robustly extract the first top-level JSON object {...} from text. + Raises ValueError if none is found. 
+ """ + start = text.find("{") + if start == -1: + raise ValueError("No JSON object start found.") + depth = 0 + for i, c in enumerate(text[start:], start=start): + if c == "{": + depth += 1 + elif c == "}": + depth -= 1 + if depth == 0: + return text[start : i + 1] + raise ValueError("Unbalanced braces; JSON object not closed.") + + +def ensure_list_or_path(v: Union[None, str, List[str]]) -> List[str]: + """ + Convert JSON field (None | string | list) into a list of CLI tokens. + """ + if v is None: + return [] + if isinstance(v, list): + return [str(x) for x in v if str(x).strip()] + s = str(v).strip() + if not s: + return [] + return [s] + + +def build_argv(spec: Dict[str, Any], py: str, qa_path: str) -> List[str]: + """ + Map the JSON spec to query_assist.py argv. + """ + cmd = [py, qa_path] + command = spec.get("command") + + db_url = spec.get("db_url") + download_dir = spec.get("download_dir") + api_key_env = spec.get("api_key_env") + sensor = spec.get("sensor") + types = spec.get("types") + + if command == "download_assets": + uprn = ensure_list_or_path(spec.get("uprn")) + if not uprn: + raise ValueError("download_assets requires 'uprn'.") + cmd += ["--uprn"] + uprn + if sensor: + cmd += ["--sensor", str(sensor)] + if types: + cmd += ["--types", ",".join([str(t) for t in types])] + elif command == "ods_to_uprn": + ods = ensure_list_or_path(spec.get("ods")) + if not ods: + raise ValueError("ods_to_uprn requires 'ods'.") + cmd += ["--ods"] + ods + elif command == "uprns_by_output_area": + oa = ensure_list_or_path(spec.get("output_area")) + if not oa: + raise ValueError("uprns_by_output_area requires 'output_area'.") + cmd += ["--output-area"] + oa + else: + raise ValueError(f"Unsupported command: {command!r}") + + if db_url: + cmd += ["--db-url", str(db_url)] + if download_dir: + cmd += ["--download-dir", str(download_dir)] + if api_key_env: + cmd += ["--api-key-env", str(api_key_env)] + + return cmd + + +def print_router_help(): + """Print brief guidance for 
the NL router when no command was inferred.""" + print( + "[router] Expected one of the commands: download_assets | ods_to_uprn | uprns_by_output_area" + ) + print("[router] Examples:") + print( + " 'Download the merged lidar point cloud for UPRN 5045394' → download_assets" + ) + print(" 'Map ODS G85013 to UPRNs' → ods_to_uprn") + print( + " 'List all UPRNs in output areas E00004550 and E00032882' → uprns_by_output_area" + ) + print( + "[router] You can also be concise, e.g.: '5045394 merged lidar', 'ODS G85013', 'output areas E00004550 E00032882'." + ) + + +# --- Heuristic parsing --- +def heuristic_parse(nl: str) -> Union[Dict[str, Any], None]: + """Attempt to derive a spec dict directly from natural language without model. + + Patterns handled: + - Output areas: presence of 'output area' or codes like E00012345 + - ODS codes: tokens like a letter followed by 5 digits (e.g., G85013) with 'ODS' keyword + - UPRN asset downloads: numeric tokens length >=6 plus words like 'download', 'get', 'asset', 'lidar', 'image' + """ + text = nl.strip() + if not text: + return None + lowered = text.lower() + + output_area_codes = re.findall(r"\bE\d{8}\b", text) + ods_codes = re.findall(r"\b[A-Z]\d{5}\b", text) + uprn_candidates = [t for t in re.findall(r"\b\d{6,}\b", text)] + + # Output area heuristic + if output_area_codes and ( + "output area" in lowered + or "output areas" in lowered + or len(output_area_codes) > 1 + ): + return { + "command": "uprns_by_output_area", + "output_area": output_area_codes, + "uprn": None, + "ods": None, + "sensor": None, + "types": None, + "download_dir": None, + "api_key_env": None, + "db_url": None, + } + + # ODS heuristic + if ods_codes and ("ods" in lowered or not uprn_candidates): + return { + "command": "ods_to_uprn", + "ods": ods_codes, + "uprn": None, + "output_area": None, + "sensor": None, + "types": None, + "download_dir": None, + "api_key_env": None, + "db_url": None, + } + + # Asset download heuristic + asset_keywords = { + 
"download", + "get", + "asset", + "assets", + "lidar", + "image", + "images", + "point", + "pointcloud", + } + if uprn_candidates and any(k in lowered for k in asset_keywords): + return { + "command": "download_assets", + "uprn": uprn_candidates, + "sensor": None, + "types": None, + "download_dir": None, + "api_key_env": None, + "db_url": None, + "ods": None, + "output_area": None, + } + + return None + + +def ollama_chat( + base_url: str, + model: str, + messages: List[Dict[str, str]], + temperature: float = 0.0, + top_p: float = 0.95, + num_predict: int = 256, + num_ctx: Union[int, None] = None, + keep_alive: Union[str, None] = None, + request_timeout_s: float = 120.0, + force_json: bool = True, +) -> Dict[str, Any]: + """ + Call Ollama's /api/chat with given messages and decoding options. + Returns the parsed JSON response. + """ + url = base_url.rstrip("/") + "/api/chat" + payload: Dict[str, Any] = { + "model": model, + "messages": messages, + "stream": False, + "options": { + "temperature": float(temperature), + "top_p": float(top_p), + "num_predict": int(num_predict), + }, + } + if num_ctx is not None: + payload["options"]["num_ctx"] = int(num_ctx) + if keep_alive: + payload["keep_alive"] = str(keep_alive) + if force_json: + # Instruct Ollama to format the assistant message as strict JSON + payload["format"] = "json" + + resp = requests.post(url, json=payload, timeout=(5.0, request_timeout_s)) + resp.raise_for_status() + return resp.json() + + +def run_once( + base_url: str, + model_id: str, + nl: str, + qa_path: str, + py_exe: str, + dry_run: bool, + temperature: float, + top_p: float, + num_predict: int, + num_ctx: Union[int, None], + keep_alive: Union[str, None], + force_json: bool, + debug_model: bool, +) -> int: + """ + One-shot turn: model → JSON → construct argv → (dry) run query_assist.py. + """ + # Heuristic fast-path: try to parse without model when patterns are obvious. 
+ heuristic_spec = heuristic_parse(nl) + if heuristic_spec: + print("[router] Heuristic matched; bypassing model.") + if debug_model: + print( + f"[debug] Heuristic spec derived from input: {json.dumps(heuristic_spec)}" + ) + spec = heuristic_spec + argv = build_argv(spec, py_exe, qa_path) + print("\n[router] JSON spec (heuristic):") + print(json.dumps(spec, indent=2)) + print("\n[router] Command:") + print(" ".join([shlex.quote(x) for x in argv])) + if dry_run: + return 0 + proc = subprocess.run(argv) + return proc.returncode + + messages = [{"role": "system", "content": SYSTEM_ROUTER_PROMPT}] + for u, a in FEW_SHOTS: + messages.append({"role": "user", "content": u}) + messages.append({"role": "assistant", "content": a}) + messages.append({"role": "user", "content": nl}) + + resp = ollama_chat( + base_url=base_url, + model=model_id, + messages=messages, + temperature=temperature, + top_p=top_p, + num_predict=num_predict, + num_ctx=num_ctx, + keep_alive=keep_alive, + force_json=force_json, + ) + + if debug_model: + print("[debug] Primary model raw JSON response:") + try: + print(json.dumps(resp, indent=2, ensure_ascii=False)) + except Exception: + print(str(resp)) + + content = extract_assistant_text_from_ollama(resp) + if debug_model: + print("[debug] Extracted content (primary):") + print(repr(content)) + + # Try strict JSON first; if that fails, slice the first JSON object. + spec = None + if content.strip(): + try: + spec = json.loads(content) + except Exception: + try: + blob = slice_first_json_object(content) + spec = json.loads(blob) + except Exception: + spec = None + if spec is None: + # Retry once without forcing JSON if we had forced it and got nothing + if force_json: + print( + "[router] Empty/invalid JSON content; retrying without format=json..." 
+ ) + resp2 = ollama_chat( + base_url=base_url, + model=model_id, + messages=messages, + temperature=temperature, + top_p=top_p, + num_predict=num_predict, + num_ctx=num_ctx, + keep_alive=keep_alive, + force_json=False, + ) + content2 = extract_assistant_text_from_ollama(resp2) + if debug_model: + print("[debug] Secondary model raw JSON response (no format=json):") + try: + print(json.dumps(resp2, indent=2, ensure_ascii=False)) + except Exception: + print(str(resp2)) + print("[debug] Extracted content (secondary):") + print(repr(content2)) + try: + spec = json.loads(content2) + except Exception: + try: + blob = slice_first_json_object(content2) + spec = json.loads(blob) + except Exception: + spec = None + # Last resort: apply heuristic after model failure + if spec is None: + heuristic_spec = heuristic_parse(nl) + if heuristic_spec: + print("[router] Falling back to heuristic after model failure.") + spec = heuristic_spec + + # Fallback: if no command or unsupported command, emit help and return gracefully. + if not spec or spec.get("command") not in { + "download_assets", + "ods_to_uprn", + "uprns_by_output_area", + }: + print("[router] No actionable command inferred from model output.") + print("[router] Raw model content:") + print(content.strip()) + print_router_help() + return 0 + + argv = build_argv(spec, py_exe, qa_path) + + print("\n[router] JSON spec:") + print(json.dumps(spec, indent=2)) + print("\n[router] Command:") + print(" ".join([shlex.quote(x) for x in argv])) + if dry_run: + return 0 + + # Inherit env (so API_KEY etc. 
is available) + proc = subprocess.run(argv) + return proc.returncode + + +def main(): + ap = argparse.ArgumentParser( + description="NL interface for query_assist.py using Ollama" + ) + ap.add_argument("--model-id", default="gpt-oss:20b", help="Ollama model name/tag") + ap.add_argument( + "--query-assist-path", + default=os.path.join(os.path.dirname(__file__), "query_assist.py"), + help="Path to query_assist.py", + ) + ap.add_argument( + "--base-url", + default=os.environ.get("OLLAMA_HOST", "http://localhost:11434"), + help="Base URL of the Ollama server (or set OLLAMA_HOST)", + ) + ap.add_argument( + "--dry-run", + action="store_true", + help="Only print the derived command; do not execute", + ) + ap.add_argument( + "--once", "-q", help="Run a single NL query and exit (non-interactive)" + ) + + # Decoding / runtime knobs (kept simple and model-agnostic) + ap.add_argument( + "--temperature", type=float, default=0.0, help="Sampling temperature" + ) + ap.add_argument( + "--top-p", type=float, default=0.95, help="Nucleus sampling probability" + ) + ap.add_argument( + "--num-predict", type=int, default=256, help="Max new tokens to generate" + ) + ap.add_argument( + "--num-ctx", + type=int, + default=None, + help="Context window size hint (model-dependent)", + ) + ap.add_argument( + "--keep-alive", default=None, help="Ollama keep-alive (e.g., '5m', '30m', '0')" + ) + ap.add_argument( + "--no-force-json", + action="store_true", + help="Do not set format='json' in the chat call (not recommended)", + ) + ap.add_argument( + "--debug-model", + action="store_true", + help="Print raw model JSON responses and extracted text", + ) + + args = ap.parse_args() + + print(f"[init] Ollama base URL: {args.base_url}") + print( + f"[init] model={args.model_id} temperature={args.temperature} top_p={args.top_p} " + f"num_predict={args.num_predict} num_ctx={args.num_ctx} keep_alive={args.keep_alive} " + f"force_json={not args.no_force_json}" + ) + + py_exe = sys.executable + qa_path = 
args.query_assist_path + + if args.once: + try: + rc = run_once( + base_url=args.base_url, + model_id=args.model_id, + nl=args.once, + qa_path=qa_path, + py_exe=py_exe, + dry_run=args.dry_run, + temperature=args.temperature, + top_p=args.top_p, + num_predict=args.num_predict, + num_ctx=args.num_ctx, + keep_alive=args.keep_alive, + force_json=(not args.no_force_json), + debug_model=args.debug_model, + ) + sys.exit(rc) + except Exception as e: + print(f"[router] Error: {e}", file=sys.stderr) + sys.exit(1) + + print("NL router for query_assist.py (Ollama). Type 'exit' or Ctrl-D to quit.") + while True: + try: + nl = input("\n> ").strip() + except EOFError: + break + if not nl: + continue + if nl.lower() in {"exit", "quit"}: + break + try: + run_once( + base_url=args.base_url, + model_id=args.model_id, + nl=nl, + qa_path=qa_path, + py_exe=py_exe, + dry_run=args.dry_run, + temperature=args.temperature, + top_p=args.top_p, + num_predict=args.num_predict, + num_ctx=args.num_ctx, + keep_alive=args.keep_alive, + force_json=(not args.no_force_json), + debug_model=args.debug_model, + ) + except Exception as e: + print(f"[router] Error: {e}", file=sys.stderr) + + +if __name__ == "__main__": + main() From b186e9db268ded3b9984fa1270e3c6cce78425d5 Mon Sep 17 00:00:00 2001 From: gnathoi Date: Tue, 26 Aug 2025 14:13:49 +0100 Subject: [PATCH 06/19] change: expand few shot --- examples/nl_query_cli.py | 88 ++++++++++++++++++++++++++++++---------- 1 file changed, 66 insertions(+), 22 deletions(-) diff --git a/examples/nl_query_cli.py b/examples/nl_query_cli.py index ddb72e7..14369f7 100644 --- a/examples/nl_query_cli.py +++ b/examples/nl_query_cli.py @@ -1,26 +1,4 @@ #!/usr/bin/env python3 -""" -Natural-language CLI router for query_assist.py using an Ollama-served model. - -- Sends a compact routing prompt (system + few shots + user) to Ollama's /api/chat. -- Expects the model to emit ONLY a JSON object describing the desired command. 
-- Builds an argv list and shells out to query_assist.py accordingly. - -Dependencies: - pip install -U requests - -Assumptions: - - An Ollama server is reachable (e.g., in a container) at: - base URL from $OLLAMA_HOST or http://localhost:11434 - - The desired model (e.g., 'llama3.1:8b-instruct') is pulled and available in Ollama. - -Notes: - - We set `format: "json"` in the chat request to bias the model toward a clean JSON response. - - We still robustly slice the first JSON object from the returned text to tolerate minor deviations. - -Environment: - - OLLAMA_HOST (optional): e.g., http://localhost:11434 or http://ollama:11434 -""" import argparse import json @@ -100,6 +78,72 @@ '"uprn":null,"ods":null,"sensor":null,"types":null,"download_dir":null,' '"api_key_env":null,"db_url":null}', ), + # --- Additional diverse shots --- + ( + "Get RGB images and merged lidar for UPRNs 5045394 and 200003455212 to /mnt/dl (API key var KEY2).", + '{"command":"download_assets","uprn":["5045394","200003455212"],"sensor":null,' + '"types":["did:rgb-image","did:lidar-pointcloud-merged"],"download_dir":"/mnt/dl",' + '"api_key_env":"KEY2","db_url":null,"ods":null,"output_area":null}', + ), + ( + "output areas E00004550 E00032882 E00063193 list uprns", + '{"command":"uprns_by_output_area","output_area":["E00004550","E00032882","E00063193"],' + '"uprn":null,"ods":null,"sensor":null,"types":null,"download_dir":null,' + '"api_key_env":null,"db_url":null}', + ), + ( + "ODS codes G85013 Q12345 map to uprns", + '{"command":"ods_to_uprn","ods":["G85013","Q12345"],"uprn":null,"output_area":null,' + '"sensor":null,"types":null,"download_dir":null,"api_key_env":null,"db_url":null}', + ), + ( + "5045394 merged lidar pointcloud now", + '{"command":"download_assets","uprn":["5045394"],"sensor":null,' + '"types":["did:lidar-pointcloud-merged"],"download_dir":null,' + '"api_key_env":null,"db_url":null,"ods":null,"output_area":null}', + ), + ( + "Download lidar range and reflectance panoramas for 
UPRN 5045394 sensor bess:OusterLidarSensor", + '{"command":"download_assets","uprn":["5045394"],"sensor":"bess:OusterLidarSensor",' + '"types":["did:lidar-range-pano","did:lidar-reflectance-pano"],"download_dir":null,' + '"api_key_env":null,"db_url":null,"ods":null,"output_area":null}', + ), + ( + "Give me temperature and humidity for UPRN 200003455212", + '{"command":"download_assets","uprn":["200003455212"],"sensor":null,' + '"types":["did:celsius-temperature","did:relative-humidity"],"download_dir":null,' + '"api_key_env":null,"db_url":null,"ods":null,"output_area":null}', + ), + ( + "Fetch IR false color and temperature array for 5045394", + '{"command":"download_assets","uprn":["5045394"],"sensor":null,' + '"types":["did:ir-false-color-image","did:ir-temperature-array"],"download_dir":null,' + '"api_key_env":null,"db_url":null,"ods":null,"output_area":null}', + ), + ( + "List UPRNs in output area E00004550 (single)", + '{"command":"uprns_by_output_area","output_area":["E00004550"],"uprn":null,' + '"ods":null,"sensor":null,"types":null,"download_dir":null,' + '"api_key_env":null,"db_url":null}', + ), + ( + "Map ODS G85013 and G85014 with endpoint override http://myhost:3030/ds/query", + '{"command":"ods_to_uprn","ods":["G85013","G85014"],"uprn":null,"output_area":null,' + '"sensor":null,"types":null,"download_dir":null,"api_key_env":null,' + '"db_url":"http://myhost:3030/ds/query"}', + ), + ( + "Download rgb image for 5045394 to /tmp/dl using key var APIKEY", + '{"command":"download_assets","uprn":["5045394"],"sensor":null,' + '"types":["did:rgb-image"],"download_dir":"/tmp/dl","api_key_env":"APIKEY",' + '"db_url":null,"ods":null,"output_area":null}', + ), + ( + "Get point cloud frame for UPRN 5045394", + '{"command":"download_assets","uprn":["5045394"],"sensor":null,' + '"types":["did:lidar-pointcloud-frame"],"download_dir":null,"api_key_env":null,' + '"db_url":null,"ods":null,"output_area":null}', + ), ] From 63b2d2f1f876e3e2d52452827f2928dbf00cef75 Mon Sep 
17 00:00:00 2001 From: gnathoi Date: Tue, 26 Aug 2025 14:36:43 +0100 Subject: [PATCH 07/19] change: tidier --- examples/nl_query_cli.py | 492 ++++++++++++++++++++++++++------------- 1 file changed, 327 insertions(+), 165 deletions(-) diff --git a/examples/nl_query_cli.py b/examples/nl_query_cli.py index 14369f7..1f4b956 100644 --- a/examples/nl_query_cli.py +++ b/examples/nl_query_cli.py @@ -1,16 +1,23 @@ #!/usr/bin/env python3 +from __future__ import annotations + import argparse import json +import logging import os import re import shlex import subprocess import sys -from typing import Any, Dict, List, Tuple, Union +from typing import Any import requests +# ------------------------------- +# System & Summary Prompts (unchanged semantics; formatting tuned) +# ------------------------------- + SYSTEM_ROUTER_PROMPT = """You are a rigorous function-call router for a Python CLI named query_assist.py. Supported commands and how to populate them: @@ -59,7 +66,33 @@ - Prefer being decisive. When in doubt, infer sensible defaults. """ -FEW_SHOTS: List[Tuple[str, str]] = [ +SUMMARY_SYSTEM_PROMPT = """You transform a raw natural-language user request about assets / UPRNs / ODS / output areas into a concise structured summary. + +Return ONLY a single JSON object with this exact schema (no prose): +{ + "bullets": string[] // 2-8 short bullet points capturing intent & extracted fields + ,"router_text": string // ONE concise imperative sentence for a routing model to decide the command + ,"extracted": { // best-effort extraction; omit keys you cannot infer + "uprn": string[] | null, + "ods": string[] | null, + "output_area": string[] | null, + "sensor": string | null, + "types": string[] | null, + "download_dir": string | null, + "api_key_env": string | null, + "db_url": string | null + } +} + +Guidance: +- Normalize UPRNs to digit strings. +- Keep ordering as given when sensible. +- For types, map descriptive phrases to IRIs per provided mapping when obvious. 
+- router_text should be minimal but sufficient (e.g., "Download merged lidar point cloud for UPRN 5045394"). +- If purely informational greeting with no actionable command, set bullets to ["No actionable request"], router_text="no-op" and extracted={}. +""" + +FEW_SHOTS: list[tuple[str, str]] = [ ( "Download the merged lidar point cloud for UPRN 5045394 into /data/assets. " "Use MY_KEY as the env var for the API key.", @@ -78,7 +111,6 @@ '"uprn":null,"ods":null,"sensor":null,"types":null,"download_dir":null,' '"api_key_env":null,"db_url":null}', ), - # --- Additional diverse shots --- ( "Get RGB images and merged lidar for UPRNs 5045394 and 200003455212 to /mnt/dl (API key var KEY2).", '{"command":"download_assets","uprn":["5045394","200003455212"],"sensor":null,' @@ -146,31 +178,27 @@ ), ] +# ------------------------------- +# Helpers +# ------------------------------- -def extract_assistant_text_from_ollama(resp: Dict[str, Any]) -> str: - """ - Extract the assistant's text from Ollama /api/chat or /api/generate response. - """ - # Preferred: /api/chat schema +log = logging.getLogger("nl_query_cli") + + +def extract_assistant_text_from_ollama(resp: dict[str, Any]) -> str: + """Extract assistant text from Ollama /api/chat or /api/generate.""" msg = resp.get("message") if isinstance(msg, dict): content = msg.get("content") if isinstance(content, str) and content.strip(): return content - - # Fallback: /api/generate schema if isinstance(resp.get("response"), str) and resp["response"].strip(): return resp["response"] - - # Last resort: stringify return json.dumps(resp, ensure_ascii=False) def slice_first_json_object(text: str) -> str: - """ - Robustly extract the first top-level JSON object {...} from text. - Raises ValueError if none is found. 
- """ + """Extract the first top-level JSON object {...} from text.""" start = text.find("{") if start == -1: raise ValueError("No JSON object start found.") @@ -185,10 +213,8 @@ def slice_first_json_object(text: str) -> str: raise ValueError("Unbalanced braces; JSON object not closed.") -def ensure_list_or_path(v: Union[None, str, List[str]]) -> List[str]: - """ - Convert JSON field (None | string | list) into a list of CLI tokens. - """ +def ensure_list_or_path(v: None | str | list[str]) -> list[str]: + """Convert (None | str | list) into a list of CLI tokens.""" if v is None: return [] if isinstance(v, list): @@ -199,10 +225,8 @@ def ensure_list_or_path(v: Union[None, str, List[str]]) -> List[str]: return [s] -def build_argv(spec: Dict[str, Any], py: str, qa_path: str) -> List[str]: - """ - Map the JSON spec to query_assist.py argv. - """ +def build_argv(spec: dict[str, Any], py: str, qa_path: str) -> list[str]: + """Map the JSON spec to query_assist.py argv.""" cmd = [py, qa_path] command = spec.get("command") @@ -244,32 +268,13 @@ def build_argv(spec: Dict[str, Any], py: str, qa_path: str) -> List[str]: return cmd -def print_router_help(): - """Print brief guidance for the NL router when no command was inferred.""" - print( - "[router] Expected one of the commands: download_assets | ods_to_uprn | uprns_by_output_area" - ) - print("[router] Examples:") - print( - " 'Download the merged lidar point cloud for UPRN 5045394' → download_assets" - ) - print(" 'Map ODS G85013 to UPRNs' → ods_to_uprn") - print( - " 'List all UPRNs in output areas E00004550 and E00032882' → uprns_by_output_area" - ) - print( - "[router] You can also be concise, e.g.: '5045394 merged lidar', 'ODS G85013', 'output areas E00004550 E00032882'." - ) - - -# --- Heuristic parsing --- -def heuristic_parse(nl: str) -> Union[Dict[str, Any], None]: - """Attempt to derive a spec dict directly from natural language without model. 
+def heuristic_parse(nl: str) -> dict[str, Any] | None: + """ + Heuristic parse for common patterns: - Patterns handled: - - Output areas: presence of 'output area' or codes like E00012345 - - ODS codes: tokens like a letter followed by 5 digits (e.g., G85013) with 'ODS' keyword - - UPRN asset downloads: numeric tokens length >=6 plus words like 'download', 'get', 'asset', 'lidar', 'image' + - Output areas: 'output area(s)' or codes E######## + - ODS codes: tokens like A12345 (with/without "ODS") + - UPRN asset downloads: ≥6-digit tokens + keywords """ text = nl.strip() if not text: @@ -280,7 +285,13 @@ def heuristic_parse(nl: str) -> Union[Dict[str, Any], None]: ods_codes = re.findall(r"\b[A-Z]\d{5}\b", text) uprn_candidates = [t for t in re.findall(r"\b\d{6,}\b", text)] - # Output area heuristic + merged_pc = bool( + re.search(r"merged\s+(lidar\s+)?point\s*cloud", lowered) + or "merged lidar" in lowered + ) + endpoint_match = re.search(r"(https?://[\w\.-:%/]+)", text) + endpoint_url = endpoint_match.group(1) if endpoint_match else None + if output_area_codes and ( "output area" in lowered or "output areas" in lowered @@ -298,7 +309,6 @@ def heuristic_parse(nl: str) -> Union[Dict[str, Any], None]: "db_url": None, } - # ODS heuristic if ods_codes and ("ods" in lowered or not uprn_candidates): return { "command": "ods_to_uprn", @@ -312,7 +322,6 @@ def heuristic_parse(nl: str) -> Union[Dict[str, Any], None]: "db_url": None, } - # Asset download heuristic asset_keywords = { "download", "get", @@ -325,14 +334,15 @@ def heuristic_parse(nl: str) -> Union[Dict[str, Any], None]: "pointcloud", } if uprn_candidates and any(k in lowered for k in asset_keywords): + types_list = ["did:lidar-pointcloud-merged"] if merged_pc else None return { "command": "download_assets", "uprn": uprn_candidates, "sensor": None, - "types": None, + "types": types_list, "download_dir": None, "api_key_env": None, - "db_url": None, + "db_url": endpoint_url, "ods": None, "output_area": None, } @@ 
-343,21 +353,18 @@ def heuristic_parse(nl: str) -> Union[Dict[str, Any], None]: def ollama_chat( base_url: str, model: str, - messages: List[Dict[str, str]], + messages: list[dict[str, str]], temperature: float = 0.0, top_p: float = 0.95, num_predict: int = 256, - num_ctx: Union[int, None] = None, - keep_alive: Union[str, None] = None, + num_ctx: int | None = None, + keep_alive: str | None = None, request_timeout_s: float = 120.0, force_json: bool = True, -) -> Dict[str, Any]: - """ - Call Ollama's /api/chat with given messages and decoding options. - Returns the parsed JSON response. - """ +) -> dict[str, Any]: + """Call Ollama's /api/chat and return parsed JSON.""" url = base_url.rstrip("/") + "/api/chat" - payload: Dict[str, Any] = { + payload: dict[str, Any] = { "model": model, "messages": messages, "stream": False, @@ -372,7 +379,6 @@ def ollama_chat( if keep_alive: payload["keep_alive"] = str(keep_alive) if force_json: - # Instruct Ollama to format the assistant message as strict JSON payload["format"] = "json" resp = requests.post(url, json=payload, timeout=(5.0, request_timeout_s)) @@ -390,38 +396,148 @@ def run_once( temperature: float, top_p: float, num_predict: int, - num_ctx: Union[int, None], - keep_alive: Union[str, None], + num_ctx: int | None, + keep_alive: str | None, force_json: bool, debug_model: bool, + summarize: bool, + summary_model: str, + summary_temperature: float, + show_summary: bool, ) -> int: """ - One-shot turn: model → JSON → construct argv → (dry) run query_assist.py. + One-shot turn: summarize (optional) → route spec → build argv → (dry) run query_assist.py. + Only DEBUG shows JSON specs and raw model content. """ - # Heuristic fast-path: try to parse without model when patterns are obvious. 
- heuristic_spec = heuristic_parse(nl) - if heuristic_spec: - print("[router] Heuristic matched; bypassing model.") - if debug_model: - print( - f"[debug] Heuristic spec derived from input: {json.dumps(heuristic_spec)}" + original_nl = nl + log.info("Request: %s", original_nl) + + # --- Summarization (independent of routing) --- + summary_router_text = None + summary_obj: dict[str, Any] | None = None + if summarize: + try: + sum_resp = ollama_chat( + base_url=base_url, + model=summary_model, + messages=[ + {"role": "system", "content": SUMMARY_SYSTEM_PROMPT}, + {"role": "user", "content": nl}, + ], + temperature=summary_temperature, + top_p=top_p, + num_predict=256, + num_ctx=num_ctx, + keep_alive=keep_alive, + force_json=True, ) + if debug_model: + log.debug( + "Summarizer raw JSON response:\n%s", + json.dumps(sum_resp, indent=2, ensure_ascii=False), + ) + sum_content = extract_assistant_text_from_ollama(sum_resp) + log.debug("Summarizer extracted content: %r", sum_content) + + if sum_content.strip(): + try: + summary_obj = json.loads(sum_content) + except Exception: + try: + blob = slice_first_json_object(sum_content) + summary_obj = json.loads(blob) + except Exception: + summary_obj = None + + if not summary_obj: + # Heuristic fallback summary + fallback_bullets = [] + uprns_fb = re.findall(r"\b\d{6,}\b", nl) + if uprns_fb: + fallback_bullets.append(f"UPRNs: {', '.join(uprns_fb)}") + if "merged" in nl.lower(): + fallback_bullets.append("Type: merged lidar pointcloud") + url_fb = re.search(r"(https?://[\w\.-:%/]+)", nl) + if url_fb: + fallback_bullets.append(f"Endpoint: {url_fb.group(1)}") + if not fallback_bullets: + fallback_bullets.append("No actionable request") + summary_obj = { + "bullets": fallback_bullets, + "router_text": nl, + "extracted": {}, + } + + bullets = summary_obj.get("bullets") or [] + summary_router_text = summary_obj.get("router_text") or None + if show_summary and bullets: + log.info("Summary: %s", " | ".join(bullets)) + + except Exception 
as e: + if show_summary: + log.info("Summary step failed: %s (continuing with original input)", e) + + candidate_text = summary_router_text or nl + + # --- Try summary-extracted direct spec first --- + if summary_obj and isinstance(summary_obj.get("extracted"), dict): + ex = summary_obj["extracted"] + inferred_command = None + if ex.get("ods"): + inferred_command = "ods_to_uprn" + if ex.get("output_area"): + inferred_command = "uprns_by_output_area" + if ex.get("uprn"): + inferred_command = "download_assets" + + if inferred_command: + spec_direct = { + "command": inferred_command, + "uprn": ex.get("uprn"), + "ods": ex.get("ods"), + "output_area": ex.get("output_area"), + "sensor": ex.get("sensor"), + "types": ex.get("types"), + "download_dir": ex.get("download_dir"), + "api_key_env": ex.get("api_key_env"), + "db_url": ex.get("db_url"), + } + if spec_direct.get("types") is None and "merged" in candidate_text.lower(): + spec_direct["types"] = ["did:lidar-pointcloud-merged"] + + try: + argv = build_argv(spec_direct, py_exe, qa_path) + log.debug( + "Router JSON (summary extracted):\n%s", + json.dumps(spec_direct, indent=2), + ) + log.info("Command: %s", " ".join([shlex.quote(x) for x in argv])) + if dry_run: + return 0 + proc = subprocess.run(argv) + return proc.returncode + except Exception: + # Fall through to heuristic + routing if build failed + pass + + # --- Heuristic fast path --- + heuristic_spec = heuristic_parse(candidate_text) + if heuristic_spec: + log.debug("Heuristic spec: %s", json.dumps(heuristic_spec)) spec = heuristic_spec argv = build_argv(spec, py_exe, qa_path) - print("\n[router] JSON spec (heuristic):") - print(json.dumps(spec, indent=2)) - print("\n[router] Command:") - print(" ".join([shlex.quote(x) for x in argv])) + log.info("Command: %s", " ".join([shlex.quote(x) for x in argv])) if dry_run: return 0 proc = subprocess.run(argv) return proc.returncode + # --- Model routing --- messages = [{"role": "system", "content": SYSTEM_ROUTER_PROMPT}] 
for u, a in FEW_SHOTS: messages.append({"role": "user", "content": u}) messages.append({"role": "assistant", "content": a}) - messages.append({"role": "user", "content": nl}) + messages.append({"role": "user", "content": candidate_text}) resp = ollama_chat( base_url=base_url, @@ -434,20 +550,15 @@ def run_once( keep_alive=keep_alive, force_json=force_json, ) - if debug_model: - print("[debug] Primary model raw JSON response:") - try: - print(json.dumps(resp, indent=2, ensure_ascii=False)) - except Exception: - print(str(resp)) + log.debug( + "Primary model raw JSON:\n%s", + json.dumps(resp, indent=2, ensure_ascii=False), + ) content = extract_assistant_text_from_ollama(resp) - if debug_model: - print("[debug] Extracted content (primary):") - print(repr(content)) + log.debug("Primary model extracted content: %r", content) - # Try strict JSON first; if that fails, slice the first JSON object. spec = None if content.strip(): try: @@ -458,76 +569,74 @@ def run_once( spec = json.loads(blob) except Exception: spec = None - if spec is None: - # Retry once without forcing JSON if we had forced it and got nothing - if force_json: - print( - "[router] Empty/invalid JSON content; retrying without format=json..." 
- ) - resp2 = ollama_chat( - base_url=base_url, - model=model_id, - messages=messages, - temperature=temperature, - top_p=top_p, - num_predict=num_predict, - num_ctx=num_ctx, - keep_alive=keep_alive, - force_json=False, + + if spec is None and force_json: + log.debug("Empty/invalid JSON content; retrying without format=json...") + resp2 = ollama_chat( + base_url=base_url, + model=model_id, + messages=messages, + temperature=temperature, + top_p=top_p, + num_predict=num_predict, + num_ctx=num_ctx, + keep_alive=keep_alive, + force_json=False, + ) + if debug_model: + log.debug( + "Secondary model raw JSON:\n%s", + json.dumps(resp2, indent=2, ensure_ascii=False), ) - content2 = extract_assistant_text_from_ollama(resp2) - if debug_model: - print("[debug] Secondary model raw JSON response (no format=json):") - try: - print(json.dumps(resp2, indent=2, ensure_ascii=False)) - except Exception: - print(str(resp2)) - print("[debug] Extracted content (secondary):") - print(repr(content2)) + content2 = extract_assistant_text_from_ollama(resp2) + log.debug("Secondary model extracted content: %r", content2) + try: + spec = json.loads(content2) + except Exception: try: - spec = json.loads(content2) + blob = slice_first_json_object(content2) + spec = json.loads(blob) except Exception: - try: - blob = slice_first_json_object(content2) - spec = json.loads(blob) - except Exception: - spec = None - # Last resort: apply heuristic after model failure - if spec is None: - heuristic_spec = heuristic_parse(nl) - if heuristic_spec: - print("[router] Falling back to heuristic after model failure.") - spec = heuristic_spec - - # Fallback: if no command or unsupported command, emit help and return gracefully. 
+ spec = None + + if spec is None: + heuristic_spec = heuristic_parse(nl) + if heuristic_spec: + log.debug( + "Fallback to heuristic after model failure: %s", + json.dumps(heuristic_spec), + ) + spec = heuristic_spec + if not spec or spec.get("command") not in { "download_assets", "ods_to_uprn", "uprns_by_output_area", }: - print("[router] No actionable command inferred from model output.") - print("[router] Raw model content:") - print(content.strip()) - print_router_help() + log.warning("No actionable command inferred.") + log.info( + "Try examples like:\n" + " • 'Download the merged lidar point cloud for UPRN 5045394'\n" + " • 'Map ODS G85013 to UPRNs'\n" + " • 'List all UPRNs in output areas E00004550 and E00032882'" + ) + log.debug("Raw model content:\n%s", content.strip()) return 0 argv = build_argv(spec, py_exe, qa_path) + log.debug("Router JSON:\n%s", json.dumps(spec, indent=2)) + log.info("Command: %s", " ".join([shlex.quote(x) for x in argv])) - print("\n[router] JSON spec:") - print(json.dumps(spec, indent=2)) - print("\n[router] Command:") - print(" ".join([shlex.quote(x) for x in argv])) if dry_run: return 0 - # Inherit env (so API_KEY etc. 
is available) proc = subprocess.run(argv) return proc.returncode def main(): ap = argparse.ArgumentParser( - description="NL interface for query_assist.py using Ollama" + description="Natural-language interface for query_assist.py using Ollama" ) ap.add_argument("--model-id", default="gpt-oss:20b", help="Ollama model name/tag") ap.add_argument( @@ -543,7 +652,7 @@ def main(): ap.add_argument( "--dry-run", action="store_true", - help="Only print the derived command; do not execute", + help="Only log the derived command; do not execute", ) ap.add_argument( "--once", "-q", help="Run a single NL query and exit (non-interactive)" @@ -571,28 +680,73 @@ def main(): ap.add_argument( "--no-force-json", action="store_true", - help="Do not set format='json' in the chat call (not recommended)", + help="Do not set format='json' in the chat call", + ) + + # Logging controls + ap.add_argument( + "-v", + "--verbose", + action="count", + default=0, + help="Increase verbosity ( -v = info, -vv = debug )", ) ap.add_argument( "--debug-model", action="store_true", - help="Print raw model JSON responses and extracted text", + help="Force DEBUG and include raw model responses", + ) + + # Summarization controls + ap.add_argument( + "--no-summarize", + action="store_true", + help="Disable preliminary summarization step", + ) + ap.add_argument( + "--summary-model", + default=None, + help="Model ID for summarization (defaults to routing model)", + ) + ap.add_argument( + "--summary-temperature", + type=float, + default=0.0, + help="Temperature for summarization model", + ) + ap.add_argument( + "--hide-summary", action="store_true", help="Do not log summarization bullets" ) args = ap.parse_args() - print(f"[init] Ollama base URL: {args.base_url}") - print( - f"[init] model={args.model_id} temperature={args.temperature} top_p={args.top_p} " - f"num_predict={args.num_predict} num_ctx={args.num_ctx} keep_alive={args.keep_alive} " - f"force_json={not args.no_force_json}" + # Configure logging 
(quiet by default: WARNING). -v sets INFO, -vv or --debug-model sets DEBUG. + if args.debug_model or args.verbose >= 2: + level = logging.DEBUG + elif args.verbose == 1: + level = logging.INFO + else: + level = logging.WARNING + + logging.basicConfig(level=level, format="%(levelname)s: %(message)s") + + # Initial context logs (concise) + log.info("Model: %s (%s)", args.model_id, args.base_url) + log.debug( + "Decoding temp=%.2f top_p=%.2f predict=%d ctx=%s keep_alive=%s force_json=%s", + args.temperature, + args.top_p, + args.num_predict, + str(args.num_ctx), + str(args.keep_alive), + str(not args.no_force_json), ) py_exe = sys.executable qa_path = args.query_assist_path - if args.once: - try: + try: + if args.once: rc = run_once( base_url=args.base_url, model_id=args.model_id, @@ -607,24 +761,25 @@ def main(): keep_alive=args.keep_alive, force_json=(not args.no_force_json), debug_model=args.debug_model, + summarize=(not args.no_summarize), + summary_model=(args.summary_model or args.model_id), + summary_temperature=args.summary_temperature, + show_summary=not args.hide_summary, ) sys.exit(rc) - except Exception as e: - print(f"[router] Error: {e}", file=sys.stderr) - sys.exit(1) - print("NL router for query_assist.py (Ollama). Type 'exit' or Ctrl-D to quit.") - while True: - try: - nl = input("\n> ").strip() - except EOFError: - break - if not nl: - continue - if nl.lower() in {"exit", "quit"}: - break - try: - run_once( + if level <= logging.INFO: + print("NL router for query_assist.py. 
Type 'exit' or Ctrl-D to quit.") + while True: + try: + nl = input("> ").strip() + except EOFError: + break + if not nl: + continue + if nl.lower() in {"exit", "quit"}: + break + rc = run_once( base_url=args.base_url, model_id=args.model_id, nl=nl, @@ -638,9 +793,16 @@ def main(): keep_alive=args.keep_alive, force_json=(not args.no_force_json), debug_model=args.debug_model, + summarize=(not args.no_summarize), + summary_model=(args.summary_model or args.model_id), + summary_temperature=args.summary_temperature, + show_summary=not args.hide_summary, ) - except Exception as e: - print(f"[router] Error: {e}", file=sys.stderr) + if rc != 0: + log.warning("Subprocess exited with code %d", rc) + except Exception as e: + log.error("Fatal error: %s", e) + sys.exit(1) if __name__ == "__main__": From 64fe7ad0d3dff2c686b27e56924b8a84090ffe9e Mon Sep 17 00:00:00 2001 From: gnathoi Date: Tue, 26 Aug 2025 15:14:27 +0100 Subject: [PATCH 08/19] basic workign --- examples/nl_query_cli.py | 307 ++++++++++++++++++++++++++++++++++----- 1 file changed, 269 insertions(+), 38 deletions(-) diff --git a/examples/nl_query_cli.py b/examples/nl_query_cli.py index 1f4b956..8471753 100644 --- a/examples/nl_query_cli.py +++ b/examples/nl_query_cli.py @@ -8,14 +8,16 @@ import os import re import shlex +import shutil import subprocess import sys +import textwrap from typing import Any import requests # ------------------------------- -# System & Summary Prompts (unchanged semantics; formatting tuned) +# System & Summary Prompts # ------------------------------- SYSTEM_ROUTER_PROMPT = """You are a rigorous function-call router for a Python CLI named query_assist.py. 
@@ -225,6 +227,12 @@ def ensure_list_or_path(v: None | str | list[str]) -> list[str]: return [s] +def _find_csv_paths(text: str) -> list[str]: + """Return a list of CSV-like path tokens found in text.""" + # Accept absolute, relative, and bare filenames ending with .csv + return re.findall(r'(?:(?:[A-Za-z]:)?[^\s"\'<>|]+\.csv)\b', text) + + def build_argv(spec: dict[str, Any], py: str, qa_path: str) -> list[str]: """Map the JSON spec to query_assist.py argv.""" cmd = [py, qa_path] @@ -270,10 +278,11 @@ def build_argv(spec: dict[str, Any], py: str, qa_path: str) -> list[str]: def heuristic_parse(nl: str) -> dict[str, Any] | None: """ - Heuristic parse for common patterns: - - - Output areas: 'output area(s)' or codes E######## - - ODS codes: tokens like A12345 (with/without "ODS") + Heuristic parse for common patterns with CSV precedence: + - CSV + 'uprn' → download_assets (uprn=) + - CSV + 'output area' → uprns_by_output_area (output_area=) + - Output areas: codes E######## + - ODS: tokens like A12345 - UPRN asset downloads: ≥6-digit tokens + keywords """ text = nl.strip() @@ -281,17 +290,77 @@ def heuristic_parse(nl: str) -> dict[str, Any] | None: return None lowered = text.lower() + csv_paths = _find_csv_paths(text) output_area_codes = re.findall(r"\bE\d{8}\b", text) ods_codes = re.findall(r"\b[A-Z]\d{5}\b", text) uprn_candidates = [t for t in re.findall(r"\b\d{6,}\b", text)] - merged_pc = bool( - re.search(r"merged\s+(lidar\s+)?point\s*cloud", lowered) + # Asset type hints + wants_merged = bool( + re.search(r"merged\s+(lidar\s+)?point\s*clouds?", lowered) or "merged lidar" in lowered ) + wants_rgb = ("rgb" in lowered) and ("image" in lowered or "images" in lowered) + + # Endpoint override endpoint_match = re.search(r"(https?://[\w\.-:%/]+)", text) endpoint_url = endpoint_match.group(1) if endpoint_match else None + # --- CSV precedence --- + if csv_paths: + # If the user mentions UPRN(s), prefer treating the CSV as a UPRN list + if "uprn" in lowered: + 
types_list = [] + if wants_merged: + types_list.append("did:lidar-pointcloud-merged") + if wants_rgb: + types_list.append("did:rgb-image") + return { + "command": "download_assets", + "uprn": csv_paths, + "sensor": None, + "types": types_list or None, + "download_dir": None, + "api_key_env": None, + "db_url": endpoint_url, + "ods": None, + "output_area": None, + } + # Else, if they mention output areas explicitly, treat CSV as an OA list + if ( + "output area" in lowered + or "output areas" in lowered + or "oa" in lowered.split() + ): + return { + "command": "uprns_by_output_area", + "output_area": csv_paths, + "uprn": None, + "ods": None, + "sensor": None, + "types": None, + "download_dir": None, + "api_key_env": None, + "db_url": None, + } + # If ambiguous: assume UPRN list (safer/more common in this CLI) + return { + "command": "download_assets", + "uprn": csv_paths, + "sensor": None, + "types": ( + ["did:lidar-pointcloud-merged"] + if wants_merged + else (["did:rgb-image"] if wants_rgb else None) + ), + "download_dir": None, + "api_key_env": None, + "db_url": endpoint_url, + "ods": None, + "output_area": None, + } + + # --- Pure OA codes --- if output_area_codes and ( "output area" in lowered or "output areas" in lowered @@ -309,6 +378,7 @@ def heuristic_parse(nl: str) -> dict[str, Any] | None: "db_url": None, } + # --- ODS codes (only if no explicit UPRN numbers present) --- if ods_codes and ("ods" in lowered or not uprn_candidates): return { "command": "ods_to_uprn", @@ -322,6 +392,7 @@ def heuristic_parse(nl: str) -> dict[str, Any] | None: "db_url": None, } + # --- UPRN download with optional types --- asset_keywords = { "download", "get", @@ -332,14 +403,19 @@ def heuristic_parse(nl: str) -> dict[str, Any] | None: "images", "point", "pointcloud", + "point cloud", } if uprn_candidates and any(k in lowered for k in asset_keywords): - types_list = ["did:lidar-pointcloud-merged"] if merged_pc else None + types_list = [] + if wants_merged: + 
types_list.append("did:lidar-pointcloud-merged") + if wants_rgb: + types_list.append("did:rgb-image") return { "command": "download_assets", "uprn": uprn_candidates, "sensor": None, - "types": types_list, + "types": types_list or None, "download_dir": None, "api_key_env": None, "db_url": endpoint_url, @@ -386,6 +462,97 @@ def ollama_chat( return resp.json() +# ------------------------------- +# Verbose-mode intro helpers +# ------------------------------- + + +def _fetch_model_intro(base_url: str, model_id: str) -> str: + """ + Ask the routing model (plain text) to describe its function briefly. + Safe: if the request fails, returns a static fallback. + """ + try: + resp = ollama_chat( + base_url=base_url, + model=model_id, + messages=[ + {"role": "system", "content": SYSTEM_ROUTER_PROMPT}, + { + "role": "user", + "content": ( + "Briefly describe your function in 4–7 short bullet points without JSON. " + "Focus on how natural-language input is turned into a structured command " + "for query_assist.py and what arguments you can infer." + ), + }, + ], + force_json=False, + temperature=0.0, + top_p=0.95, + num_predict=200, + ) + text = extract_assistant_text_from_ollama(resp).strip() + if text.startswith("{") and text.endswith("}"): + try: + obj = json.loads(text) + text = obj.get("description") or obj.get("text") or text + except Exception: + pass + return text + except Exception as e: + logging.getLogger("nl_query_cli").debug("Intro fetch failed: %s", e) + return ( + "- Routes natural-language queries to one of three commands: " + "download_assets, ods_to_uprn, uprns_by_output_area.\n" + "- Extracts UPRNs/ODS/Output Areas plus optional sensor, types, " + "download_dir, api_key_env, db_url.\n" + "- Maps asset phrases to canonical IRIs (e.g., merged lidar → did:lidar-pointcloud-merged).\n" + "- Builds argv for query_assist.py and executes it (unless --dry-run).\n" + "- Uses heuristics, few-shots, and optional summarization for robustness." 
+ ) + + +def _render_box(title: str, body: str) -> str: + """Render a Unicode box with a title and wrapped body.""" + term_width = shutil.get_terminal_size(fallback=(100, 24)).columns + max_width = max(60, min(term_width - 2, 100)) + wrap_width = max_width - 4 + + body_lines = [] + for para in body.splitlines(): + if not para.strip(): + body_lines.append("") + else: + body_lines.extend(textwrap.wrap(para, width=wrap_width)) + + title = title.strip() + title_line = f" {title} " + top = "┌" + "─" * (max_width - 2) + "┐" + sep = "├" + "─" * (max_width - 2) + "┤" + bot = "└" + "─" * (max_width - 2) + "┘" + + if len(title_line) <= (max_width - 2): + left = (max_width - 2 - len(title_line)) // 2 + right = max_width - 2 - len(title_line) - left + top = "┌" + "─" * left + title_line + "─" * right + "┐" + + content = "\n".join("│ " + line.ljust(max_width - 4) + " │" for line in body_lines) + return "\n".join([top, sep, content, bot]) + + +def _print_intro_banner(base_url: str, model_id: str) -> None: + intro = _fetch_model_intro(base_url, model_id) + banner = _render_box(f"query_assist.py Router — {model_id}", intro) + print(banner) + print() + + +# ------------------------------- +# Core turn runner +# ------------------------------- + + def run_once( base_url: str, model_id: str, @@ -410,7 +577,7 @@ def run_once( Only DEBUG shows JSON specs and raw model content. 
""" original_nl = nl - log.info("Request: %s", original_nl) + # log.info("Request: %s", original_nl) # --- Summarization (independent of routing) --- summary_router_text = None @@ -450,27 +617,95 @@ def run_once( summary_obj = None if not summary_obj: - # Heuristic fallback summary - fallback_bullets = [] - uprns_fb = re.findall(r"\b\d{6,}\b", nl) - if uprns_fb: - fallback_bullets.append(f"UPRNs: {', '.join(uprns_fb)}") - if "merged" in nl.lower(): + # Heuristic fallback summary aligned with heuristic_parse + lowered = nl.lower() + fallback_bullets: list[str] = [] + extracted: dict[str, Any] = {} + + csv_paths = _find_csv_paths(nl) + oa_fb = re.findall(r"\bE\d{8}\b", nl) + ods_fb = re.findall(r"\b[A-Z]\d{5}\b", nl) + uprn_fb = re.findall(r"\b\d{6,}\b", nl) + + wants_merged = bool( + re.search(r"merged\s+(lidar\s+)?point\s*clouds?", lowered) + or "merged lidar" in lowered + ) + wants_rgb = ("rgb" in lowered) and ( + "image" in lowered or "images" in lowered + ) + + if csv_paths: + if "uprn" in lowered: + extracted["uprn"] = csv_paths + fallback_bullets.append(f"UPRNs CSV: {', '.join(csv_paths)}") + elif ( + ("output area" in lowered) + or ("output areas" in lowered) + or ("oa" in lowered.split()) + ): + extracted["output_area"] = csv_paths + fallback_bullets.append( + f"Output-area CSV: {', '.join(csv_paths)}" + ) + else: + extracted["uprn"] = csv_paths + fallback_bullets.append( + f"UPRNs CSV (assumed): {', '.join(csv_paths)}" + ) + + if not csv_paths: + if oa_fb: + extracted["output_area"] = oa_fb + fallback_bullets.append(f"Output areas: {', '.join(oa_fb)}") + if ods_fb and not uprn_fb: + extracted["ods"] = ods_fb + fallback_bullets.append(f"ODS: {', '.join(ods_fb)}") + if uprn_fb: + extracted["uprn"] = uprn_fb + fallback_bullets.append(f"UPRNs: {', '.join(uprn_fb)}") + + if wants_merged: + extracted["types"] = list( + set( + (extracted.get("types") or []) + + ["did:lidar-pointcloud-merged"] + ) + ) fallback_bullets.append("Type: merged lidar pointcloud") + if 
wants_rgb: + extracted["types"] = list( + set((extracted.get("types") or []) + ["did:rgb-image"]) + ) + fallback_bullets.append("Type: RGB image") + url_fb = re.search(r"(https?://[\w\.-:%/]+)", nl) if url_fb: + extracted["db_url"] = url_fb.group(1) fallback_bullets.append(f"Endpoint: {url_fb.group(1)}") - if not fallback_bullets: - fallback_bullets.append("No actionable request") + + if "uprn" in extracted: + router_text = ( + f"Download assets for UPRN(s) {', '.join(extracted['uprn'])}" + ) + elif "output_area" in extracted: + router_text = f"List UPRNs in output areas {', '.join(extracted['output_area'])}" + elif "ods" in extracted: + router_text = f"Map ODS {', '.join(extracted['ods'])} to UPRNs" + else: + router_text = "no-op" + if not fallback_bullets: + fallback_bullets.append("No actionable request") + summary_obj = { "bullets": fallback_bullets, - "router_text": nl, - "extracted": {}, + "router_text": router_text, + "extracted": extracted, } bullets = summary_obj.get("bullets") or [] summary_router_text = summary_obj.get("router_text") or None - if show_summary and bullets: + if show_summary and bullets and bullets != ["No actionable request"]: log.info("Summary: %s", " | ".join(bullets)) except Exception as e: @@ -517,8 +752,7 @@ def run_once( proc = subprocess.run(argv) return proc.returncode except Exception: - # Fall through to heuristic + routing if build failed - pass + pass # fall through # --- Heuristic fast path --- heuristic_spec = heuristic_parse(candidate_text) @@ -613,8 +847,8 @@ def run_once( "ods_to_uprn", "uprns_by_output_area", }: - log.warning("No actionable command inferred.") - log.info( + log.warning( + "No actionable command inferred.\n" "Try examples like:\n" " • 'Download the merged lidar point cloud for UPRN 5045394'\n" " • 'Map ODS G85013 to UPRNs'\n" @@ -658,7 +892,7 @@ def main(): "--once", "-q", help="Run a single NL query and exit (non-interactive)" ) - # Decoding / runtime knobs (kept simple and model-agnostic) + # Decoding / 
runtime knobs ap.add_argument( "--temperature", type=float, default=0.0, help="Sampling temperature" ) @@ -720,7 +954,7 @@ def main(): args = ap.parse_args() - # Configure logging (quiet by default: WARNING). -v sets INFO, -vv or --debug-model sets DEBUG. + # Configure logging if args.debug_model or args.verbose >= 2: level = logging.DEBUG elif args.verbose == 1: @@ -730,17 +964,9 @@ def main(): logging.basicConfig(level=level, format="%(levelname)s: %(message)s") - # Initial context logs (concise) - log.info("Model: %s (%s)", args.model_id, args.base_url) - log.debug( - "Decoding temp=%.2f top_p=%.2f predict=%d ctx=%s keep_alive=%s force_json=%s", - args.temperature, - args.top_p, - args.num_predict, - str(args.num_ctx), - str(args.keep_alive), - str(not args.no_force_json), - ) + # Verbose-mode introductory banner + if level <= logging.INFO: + _print_intro_banner(args.base_url, args.model_id) py_exe = sys.executable qa_path = args.query_assist_path @@ -800,6 +1026,11 @@ def main(): ) if rc != 0: log.warning("Subprocess exited with code %d", rc) + except KeyboardInterrupt: + # Cleanly handle Ctrl-C in REPL + print() + log.info("Interrupted.") + sys.exit(130) except Exception as e: log.error("Fatal error: %s", e) sys.exit(1) From 7d372f27f03b25cb06e76c56c19c542e0eb20d8e Mon Sep 17 00:00:00 2001 From: gnathoi Date: Tue, 26 Aug 2025 15:18:45 +0100 Subject: [PATCH 09/19] change: more model independent --- examples/nl_query_cli.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/examples/nl_query_cli.py b/examples/nl_query_cli.py index 8471753..7510af6 100644 --- a/examples/nl_query_cli.py +++ b/examples/nl_query_cli.py @@ -542,8 +542,15 @@ def _render_box(title: str, body: str) -> str: def _print_intro_banner(base_url: str, model_id: str) -> None: - intro = _fetch_model_intro(base_url, model_id) - banner = _render_box(f"query_assist.py Router — {model_id}", intro) + # Static standard description (avoid per-run model call for speed & 
determinism) + static_intro = ( + "- Parses user queries to detect intent (download assets, convert ODS to UPRN, or fetch UPRNs by output area).\n" + "- Extracts key identifiers (UPRN, ODS code, output area code) from text or file paths.\n" + '- Maps asset type phrases (e.g., "RGB image", "merged lidar point cloud") to predefined IRIs.\n' + "- Fills optional parameters (sensor, download directory, API key env, SPARQL endpoint) from context or heuristics.\n" + "- Uses heuristics + few-shot LLM + optional summarization to build argv for query_assist.py." + ) + banner = _render_box(f"query_assist.py Router — {model_id}", static_intro) print(banner) print() From 73389b41de43be7b9ecc72d9ca4995d2a579f303 Mon Sep 17 00:00:00 2001 From: gnathoi Date: Tue, 26 Aug 2025 16:15:55 +0100 Subject: [PATCH 10/19] add: langgraph --- examples/nl_query_graph.py | 1074 ++++++++++++++++++++++++++++++++++++ 1 file changed, 1074 insertions(+) create mode 100644 examples/nl_query_graph.py diff --git a/examples/nl_query_graph.py b/examples/nl_query_graph.py new file mode 100644 index 0000000..1b84355 --- /dev/null +++ b/examples/nl_query_graph.py @@ -0,0 +1,1074 @@ +#!/usr/bin/env python3 +""" +nl_query_workflow_cli.py + +A LangGraph-powered superset of nl_query_cli.py that can execute multi-stage +workflows against query_assist.py. 
Examples it can handle in one NL turn: + +- "Get all point clouds in output area E00004550" --> + Step 1: uprns_by_output_area -> CSV(s) + Step 2: download_assets --uprn --types did:lidar-pointcloud-merged,did:lidar-pointcloud-frame + +- "For ODS G85013, download RGB and merged lidar to /data" --> + Step 1: ods_to_uprn -> /.../downloads/ods_to_uprn.csv + Step 2: download_assets --uprn ods_to_uprn.csv --types did:rgb-image,did:lidar-pointcloud-merged --download-dir /data + +It preserves the CLI surface and routing behavior of nl_query_cli.py, but adds: +- Planner (heuristics + optional LLM) that compiles an ordered step list +- LangGraph execution loop with checkpoint-able state, retries, and artifact passing +- Robust parsing of query_assist.py logs to discover produced CSV artifacts +- Deterministic "dry-run" plan prints for auditability +""" +from __future__ import annotations + +import argparse +import dataclasses +import json +import logging +import os +import re +import shlex +import shutil +import subprocess +import sys +import textwrap +import time +from typing import Any, Literal, TypedDict + +# --- Third-party --- +# pip install langgraph[all] requests +import requests +from langgraph.checkpoint.memory import MemorySaver +from langgraph.graph import END, START, StateGraph + +# ====================================================================================== +# Carry forward the routing prompts / type mappings / helpers from nl_query_cli.py +# (kept in-sync conceptually; this file does not import the other to stay standalone). +# ====================================================================================== + +SYSTEM_ROUTER_PROMPT = """You are a rigorous function-call router for a Python CLI named query_assist.py. 
+ +Supported commands and how to populate them: + +1) download_assets + Required: uprn (string CSV path OR array of strings like ["5045394","200003455212"]) + Optional: sensor (string, e.g., "bess:OusterLidarSensor") + types (array of strings; each a type IRI, e.g., ["did:rgb-image","did:lidar-pointcloud-merged"]) + download_dir (string path) + api_key_env (string, name of env var with API key) + db_url (string URL to SPARQL endpoint) + +2) ods_to_uprn + Required: ods (string CSV path OR array of strings like ["G85013","Q12345"]) + +3) uprns_by_output_area + Required: output_area (string CSV path OR array of strings, e.g., ["E00004550","E00032882"]) + +Schema (MUST output exactly one JSON object with these keys as needed): +{ + "command": "download_assets" | "ods_to_uprn" | "uprns_by_output_area", + "uprn": string | string[] | null, + "ods": string | string[] | null, + "output_area": string | string[] | null, + "sensor": string | null, + "types": string[] | null, + "download_dir": string | null, + "api_key_env": string | null, + "db_url": string | null +} + +Constraints: +- Return ONLY the JSON object. No prose, no markdown. +- If the user request implies asset types, map them to the supported IRIs if possible: + - RGB image -> "did:rgb-image" + - merged lidar point cloud -> "did:lidar-pointcloud-merged" + - lidar range panorama -> "did:lidar-range-pano" + - lidar reflectance panorama -> "did:lidar-reflectance-pano" + - lidar signal panorama -> "did:lidar-signal-pano" + - lidar near-infrared panorama -> "did:lidar-nearir-pano" + - IR false color -> "did:ir-false-color-image" + - IR temperature array -> "did:ir-temperature-array" + - IR counts -> "did:ir-count-image" + - temperature (no contentUrl) -> "did:celsius-temperature" + - relative humidity (no contentUrl) -> "did:relative-humidity" +- Prefer being decisive. When in doubt, infer sensible defaults. 
+""" + +TYPE_ALIASES = { + # canonical mappings used throughout your stack + "rgb": "did:rgb-image", + "rgb image": "did:rgb-image", + "merged lidar": "did:lidar-pointcloud-merged", + "merged lidar point cloud": "did:lidar-pointcloud-merged", + "lidar point cloud": "did:lidar-pointcloud-frame", + "point cloud": "did:lidar-pointcloud-frame", + "point clouds": None, # expands to both merged + frame + "lidar range panorama": "did:lidar-range-pano", + "lidar reflectance panorama": "did:lidar-reflectance-pano", + "lidar signal panorama": "did:lidar-signal-pano", + "lidar nearir panorama": "did:lidar-nearir-pano", + "ir false color": "did:ir-false-color-image", + "ir temperature array": "did:ir-temperature-array", + "ir counts": "did:ir-count-image", + "temperature": "did:celsius-temperature", + "relative humidity": "did:relative-humidity", +} + +POINTCLOUD_BOTH = ["did:lidar-pointcloud-merged", "did:lidar-pointcloud-frame"] + +# ---------------------------------------------------------------------------------- +# High-level planner system prompt (multi-step). Explains assets & synonyms so the +# LLM can infer intent even with loose language ("building" -> UPRN, etc.). +# ---------------------------------------------------------------------------------- +PLAN_SYSTEM_PROMPT = """ +You are a planning assistant that converts a natural language request about +retrieving built environment asset data into an ordered execution plan for +query_assist.py. + +Available low-level commands (same as router): + 1. uprns_by_output_area – given output area code(s) yield UPRN CSV(s) + 2. ods_to_uprn – given ODS clinical practice code(s) yield UPRN CSV + 3. 
download_assets – given UPRN(s) (list or CSV path) download assets + +Important asset type IRIs (use when mentioned or implied): + did:rgb-image, did:lidar-pointcloud-merged, did:lidar-pointcloud-frame, + did:lidar-range-pano, did:lidar-reflectance-pano, did:lidar-signal-pano, + did:lidar-nearir-pano, did:ir-false-color-image, did:ir-temperature-array, + did:ir-count-image, did:celsius-temperature, did:relative-humidity + +Synonyms / interpretation guidance: + "building", "buildings", "property", "properties" -> treat as UPRN(s) + "practice", "gp practice" -> ODS code + "thermal array", "temperature array", "thermal sensor" -> did:ir-temperature-array + "thermal image" -> did:ir-false-color-image (unless array explicitly stated) + "point clouds" (plural, no qualifier) -> both merged + frame + "merged point cloud" -> did:lidar-pointcloud-merged + "point cloud frame" / "single frame" -> did:lidar-pointcloud-frame + If user asks for "temperature and humidity" -> did:celsius-temperature + did:relative-humidity + +Planning rules: + - If an intermediate mapping (ODS or output area) is needed to reach UPRNs, plan that first, then a download_assets step referencing previous CSV output (use uprn_from_previous_csvs=true instead of explicit uprn list). + - If the user directly supplies UPRNs (numbers) OR a CSV path that obviously contains UPRNs, a single download_assets step may suffice. + - Always be decisive; include only the steps required. + +Return JSON ONLY, schema: +{ + "steps": [ + { + "command": "uprns_by_output_area"|"ods_to_uprn"|"download_assets", + "output_area": string|[string]|null, + "ods": string|[string]|null, + "uprn": string|[string]|null, + "types": [string]|null, + "sensor": string|null, + "download_dir": string|null, + "api_key_env": string|null, + "db_url": string|null, + "uprn_from_previous_csvs": true|false|null + }, ... + ] +} + +Notes: + - Omit keys or set null when not applicable. 
+ - Use uprn_from_previous_csvs=true only on a download_assets step that should read the CSV(s) produced by previous mapping step(s). + - Do NOT include explanatory prose. +""" + + +def _extract_first_json(text: str) -> dict | None: + start = text.find("{") + if start == -1: + return None + depth = 0 + for i, c in enumerate(text[start:], start=start): + if c == "{": + depth += 1 + elif c == "}": + depth -= 1 + if depth == 0: + try: + return json.loads(text[start : i + 1]) + except Exception: + return None + return None + + +def _find_csvs_emitted(stream_text: str) -> list[str]: + """ + Parse query_assist.py logs to discover created CSVs. + It prints e.g.: + "✔ Saved CSV for {oa} → {path}" + "✔ Saved ODS→UPRN CSV → {path}" + Accept both unicode arrows and ASCII '->'. + """ + csvs = [] + # Unicode arrow / ASCII arrow variants, greedy path match until whitespace end + patterns = [ + r"Saved CSV for .*? → ([^\s]+\.csv)", + r"Saved CSV for .*? -> ([^\s]+\.csv)", + r"Saved ODS.?UPRN CSV .*? → ([^\s]+\.csv)", + r"Saved ODS.?UPRN CSV .*? 
-> ([^\s]+\.csv)", + ] + for pat in patterns: + for m in re.finditer(pat, stream_text): + csvs.append(m.group(1)) + # Deduplicate preserving order + seen = set() + out = [] + for p in csvs: + if p not in seen: + out.append(p) + seen.add(p) + return out + + +def _ensure_list_or_path(v: None | str | list[str]) -> list[str]: + if v is None: + return [] + if isinstance(v, list): + return [str(x) for x in v if str(x).strip()] + # Post-augmentation: if user asked for assets but only got a mapping step, add download step + lowered = state.nl.lower() + if len(plan) == 1 and ("asset" in lowered or "all the" in lowered): + first_cmd = plan[0].get("command") + if first_cmd in {"ods_to_uprn", "uprns_by_output_area"}: + # Only add if no existing download step follows + plan.append( + { + "command": "download_assets", + "uprn_from_previous_csvs": True, # consume prior CSV + "types": None, # all asset types + "download_dir": plan[0].get("download_dir") + or defaults.get("download_dir"), + "api_key_env": defaults.get("api_key_env"), + "db_url": plan[0].get("db_url") or defaults.get("db_url"), + } + ) + state.plan = plan + return [s] if s else [] + + +def _build_argv(spec: dict[str, Any], py: str, qa_path: str) -> list[str]: + cmd = [py, qa_path] + command = spec.get("command") + if command == "download_assets": + uprn = _ensure_list_or_path(spec.get("uprn")) + if not uprn: + raise ValueError("download_assets requires 'uprn'.") + cmd += ["--uprn"] + uprn + if spec.get("sensor"): + cmd += ["--sensor", str(spec["sensor"])] + if spec.get("types"): + cmd += ["--types", ",".join(spec["types"])] + elif command == "ods_to_uprn": + ods = _ensure_list_or_path(spec.get("ods")) + if not ods: + raise ValueError("ods_to_uprn requires 'ods'.") + cmd += ["--ods"] + ods + elif command == "uprns_by_output_area": + oa = _ensure_list_or_path(spec.get("output_area")) + if not oa: + raise ValueError("uprns_by_output_area requires 'output_area'.") + cmd += ["--output-area"] + oa + else: + raise 
ValueError(f"Unsupported command: {command!r}") + + if spec.get("db_url"): + cmd += ["--db-url", str(spec["db_url"])] + if spec.get("download_dir"): + cmd += ["--download-dir", str(spec["download_dir"])] + if spec.get("api_key_env"): + cmd += ["--api-key-env", str(spec["api_key_env"])] + return cmd + + +def _map_types_from_text(lowered: str) -> list[str] | None: + # Broad heuristics for types based on NL + wants_pointclouds = re.search(r"\bpoint\s*clouds?\b", lowered) is not None + wants_merged = "merged lidar" in lowered or re.search( + r"merged\s+lidar\s+point\s*cloud", lowered + ) + wants_frame = "pointcloud frame" in lowered or "single frame" in lowered + + types = [] + if wants_pointclouds: + types.extend(POINTCLOUD_BOTH) + if wants_merged: + types.append("did:lidar-pointcloud-merged") + if wants_frame: + types.append("did:lidar-pointcloud-frame") + + if "rgb" in lowered and "image" in lowered: + types.append("did:rgb-image") + if "range panorama" in lowered: + types.append("did:lidar-range-pano") + if "reflectance panorama" in lowered: + types.append("did:lidar-reflectance-pano") + if "signal panorama" in lowered: + types.append("did:lidar-signal-pano") + if "nearir" in lowered or "near-infrared" in lowered: + types.append("did:lidar-nearir-pano") + if "ir false" in lowered: + types.append("did:ir-false-color-image") + if "ir temperature array" in lowered: + types.append("did:ir-temperature-array") + # Additional thermal/temperature array synonyms + if ( + re.search(r"thermal\s+arrays?", lowered) + or re.search(r"temperature\s+arrays?", lowered) + or "thermal array" in lowered + ): + types.append("did:ir-temperature-array") + if re.search(r"thermal\s+images?", lowered): + # Map generic thermal image request to false-color IR image if not already specified + types.append("did:ir-false-color-image") + + if not types: + return None + # dedupe preserve order + out, seen = [], set() + for t in types: + if t not in seen: + out.append(t) + seen.add(t) + return out + 
+ +# ====================================================================================== +# Planning + LangGraph state +# ====================================================================================== + + +class StepSpec(TypedDict, total=False): + command: Literal["download_assets", "ods_to_uprn", "uprns_by_output_area"] + uprn: list[str] | str | None + ods: list[str] | str | None + output_area: list[str] | str | None + sensor: str | None + types: list[str] | None + download_dir: str | None + api_key_env: str | None + db_url: str | None + # internal: indicates this step will take CSVs from previous steps + uprn_from_previous_csvs: bool + + +@dataclasses.dataclass +class WFState: + nl: str + plan: list[StepSpec] + current: int = 0 + artifacts: dict[str, Any] = dataclasses.field( + default_factory=dict + ) # e.g., {"csvs": [...]} + log: list[str] = dataclasses.field(default_factory=list) + dry_run: bool = False + py_exe: str = sys.executable + qa_path: str = os.path.join(os.path.dirname(__file__), "query_assist.py") + base_url: str = os.environ.get("OLLAMA_HOST", "http://localhost:11434") + model_id: str = "gpt-oss:20b" + temperature: float = 0.0 + top_p: float = 0.95 + num_predict: int = 256 + num_ctx: int | None = None + keep_alive: str | None = None + force_json: bool = True + verbose_level: int = logging.INFO + max_steps: int = 8 + + +def _coerce_wfstate(obj: Any) -> WFState: + """Ensure we have a WFState instance (LangGraph may return a plain dict).""" + if isinstance(obj, WFState): + return obj + if isinstance(obj, dict): + # Build kwargs respecting dataclass fields + kwargs = {} + for f in dataclasses.fields(WFState): + if f.name in obj: + kwargs[f.name] = obj[f.name] + else: + if f.default is not dataclasses.MISSING: # type: ignore[attr-defined] + kwargs[f.name] = f.default # type: ignore[assignment] + elif getattr(f, "default_factory", dataclasses.MISSING) is not dataclasses.MISSING: # type: ignore[attr-defined] + kwargs[f.name] = 
f.default_factory() # type: ignore[call-arg] + else: + kwargs[f.name] = None + return WFState(**kwargs) # type: ignore[arg-type] + raise TypeError(f"Cannot coerce state of type {type(obj)} to WFState") + + +# ====================================================================================== +# Heuristic Planner (covers the common multi-step cases deterministically) +# ====================================================================================== + + +def heuristic_plan(nl: str, defaults: dict[str, Any]) -> list[StepSpec] | None: + """ + Returns a list of StepSpec if it can deduce a plan without the LLM. + Covers: + - output area + asset types -> [uprns_by_output_area, download_assets] + - ods + asset types -> [ods_to_uprn, download_assets] + - simple one-shot commands (download_assets / ods_to_uprn / uprns_by_output_area) + """ + text = nl.strip() + if not text: + return None + lowered = text.lower() + + csv_paths = re.findall(r'(?:(?:[A-Za-z]:)?[^\s"\'<>|]+\.csv)\b', text) + oa_codes = re.findall(r"\bE\d{8}\b", text) + ods_codes = re.findall(r"\b[A-Z]\d{5}\b", text) + uprns = re.findall(r"\b\d{6,}\b", text) + endpoint_match = re.search(r"(https?://[\w\.-:%/]+)", text) + endpoint_url = endpoint_match.group(1) if endpoint_match else None + + # Extract common options if present + download_dir = None + m = re.search(r"(?: to | into )\s+(/[^ ]+)", lowered) + if m: + download_dir = m.group(1) + + # Types + types = _map_types_from_text(lowered) + + # If user provided a CSV and mentioned UPRNs -> treat as download step + if csv_paths and ("uprn" in lowered or "uprns" in lowered): + return [ + { + "command": "download_assets", + "uprn": csv_paths if len(csv_paths) > 1 else csv_paths[0], + "types": types, + "download_dir": download_dir or defaults.get("download_dir"), + "api_key_env": defaults.get("api_key_env"), + "db_url": endpoint_url or defaults.get("db_url"), + } + ] + + # output area + types (or "point clouds") + if (oa_codes or "output area" in lowered or 
"output areas" in lowered) and ( + types or "point cloud" in lowered + ): + oa_list = oa_codes if oa_codes else [] + # allow CSV of output areas too + if csv_paths and not ("uprn" in lowered): + oa_list = csv_paths if oa_list == [] else oa_list + csv_paths + + return [ + { + "command": "uprns_by_output_area", + "output_area": oa_list + if oa_list + else (csv_paths[0] if csv_paths else None), + "download_dir": defaults.get("download_dir"), + "db_url": defaults.get("db_url"), + }, + { + "command": "download_assets", + "uprn_from_previous_csvs": True, # take outputs of step 1 + "types": types + or POINTCLOUD_BOTH, # default to both if only said "point clouds" + "download_dir": download_dir or defaults.get("download_dir"), + "api_key_env": defaults.get("api_key_env"), + "db_url": defaults.get("db_url"), + }, + ] + + # ods + types + if (ods_codes or "ods" in lowered) and (types is not None): + ods_list = ods_codes if ods_codes else (csv_paths if csv_paths else None) + return [ + { + "command": "ods_to_uprn", + "ods": ods_list, + "download_dir": defaults.get("download_dir"), + "db_url": defaults.get("db_url"), + }, + { + "command": "download_assets", + "uprn_from_previous_csvs": True, + "types": types, + "download_dir": download_dir or defaults.get("download_dir"), + "api_key_env": defaults.get("api_key_env"), + "db_url": defaults.get("db_url"), + }, + ] + + # plain output area listing (no types mentioned) + if oa_codes or ("output area" in lowered or "output areas" in lowered): + return [ + { + "command": "uprns_by_output_area", + "output_area": oa_codes + if oa_codes + else (csv_paths[0] if csv_paths else None), + "download_dir": defaults.get("download_dir"), + "db_url": defaults.get("db_url"), + } + ] + + # plain ods mapping (no types) + if ods_codes or "ods" in lowered: + return [ + { + "command": "ods_to_uprn", + "ods": ods_codes + if ods_codes + else (csv_paths[0] if csv_paths else None), + "download_dir": defaults.get("download_dir"), + "db_url": 
defaults.get("db_url"), + } + ] + + # direct UPRN download + if uprns and ( + "download" in lowered + or "assets" in lowered + or "point" in lowered + or "rgb" in lowered + or "image" in lowered + or "lidar" in lowered + ): + return [ + { + "command": "download_assets", + "uprn": uprns, + "types": types, + "download_dir": download_dir or defaults.get("download_dir"), + "api_key_env": defaults.get("api_key_env"), + "db_url": endpoint_url or defaults.get("db_url"), + } + ] + + return None + + +# ====================================================================================== +# LLM multi-step planner (primary) – falls back to heuristics if invalid/empty. +# ====================================================================================== + + +def llm_plan( + nl: str, + defaults: dict[str, Any], + base_url: str, + model_id: str, + temperature: float, + top_p: float, + num_predict: int, + num_ctx: int | None, + keep_alive: str | None, + force_json: bool, +) -> list[StepSpec] | None: + messages = [ + {"role": "system", "content": PLAN_SYSTEM_PROMPT}, + {"role": "user", "content": nl}, + ] + try: + resp = ollama_chat( + base_url=base_url, + model=model_id, + messages=messages, + temperature=temperature, + top_p=top_p, + num_predict=num_predict, + num_ctx=num_ctx, + keep_alive=keep_alive, + force_json=force_json, + ) + content = None + if isinstance(resp.get("message"), dict): + content = resp["message"].get("content") + if not content and isinstance(resp.get("response"), str): + content = resp.get("response") + if not content: + return None + plan_obj = None + try: + plan_obj = json.loads(content) + except Exception: + plan_obj = _extract_first_json(content) + if not plan_obj or not isinstance(plan_obj, dict): + return None + steps_raw = plan_obj.get("steps") + if not isinstance(steps_raw, list) or not steps_raw: + return None + steps: list[StepSpec] = [] + for s in steps_raw: + if not isinstance(s, dict): + continue + cmd = s.get("command") + if cmd not in 
{"download_assets", "ods_to_uprn", "uprns_by_output_area"}: + continue + step: StepSpec = { + "command": cmd, # type: ignore[assignment] + } + for key in [ + "uprn", + "ods", + "output_area", + "types", + "sensor", + "download_dir", + "api_key_env", + "db_url", + ]: + if key in s: + step[key] = s[key] # type: ignore[index, assignment] + if s.get("uprn_from_previous_csvs"): + step["uprn_from_previous_csvs"] = True # type: ignore[index] + # Inject defaults where appropriate + if "api_key_env" not in step and defaults.get("api_key_env"): + step["api_key_env"] = defaults["api_key_env"] # type: ignore[index] + if "download_dir" not in step and defaults.get("download_dir"): + step["download_dir"] = defaults["download_dir"] # type: ignore[index] + steps.append(step) + return steps or None + except Exception: + return None + + +# ====================================================================================== +# Optional: LLM Planner fallback (uses your Ollama server same as nl_query_cli.py) +# Produces a single-step spec; we then upgrade it into a plan if the NL implies multi. 
+# ====================================================================================== + + +def ollama_chat( + base_url: str, + model: str, + messages: list[dict[str, str]], + temperature: float = 0.0, + top_p: float = 0.95, + num_predict: int = 256, + num_ctx: int | None = None, + keep_alive: str | None = None, + force_json: bool = True, + timeout_s: float = 120.0, +) -> dict[str, Any]: + url = base_url.rstrip("/") + "/api/chat" + payload: dict[str, Any] = { + "model": model, + "messages": messages, + "stream": False, + "options": { + "temperature": float(temperature), + "top_p": float(top_p), + "num_predict": int(num_predict), + }, + } + if num_ctx is not None: + payload["options"]["num_ctx"] = int(num_ctx) + if keep_alive: + payload["keep_alive"] = str(keep_alive) + if force_json: + payload["format"] = "json" + r = requests.post(url, json=payload, timeout=(5.0, timeout_s)) + r.raise_for_status() + return r.json() + + +def llm_route_to_spec(nl: str, base_url: str, model_id: str, **opts) -> StepSpec | None: + messages = [ + {"role": "system", "content": SYSTEM_ROUTER_PROMPT}, + {"role": "user", "content": nl}, + ] + resp = ollama_chat(base_url, model_id, messages, **opts) + content = None + if isinstance(resp.get("message"), dict): + content = resp["message"].get("content") + if not content and isinstance(resp.get("response"), str): + content = resp["response"] + if not content: + return None + obj = _extract_first_json(content) or None + if not obj or "command" not in obj: + return None + # Normalize to StepSpec + step: StepSpec = { + "command": obj["command"], + "uprn": obj.get("uprn"), + "ods": obj.get("ods"), + "output_area": obj.get("output_area"), + "sensor": obj.get("sensor"), + "types": obj.get("types"), + "download_dir": obj.get("download_dir"), + "api_key_env": obj.get("api_key_env"), + "db_url": obj.get("db_url"), + } + return step + + +def upgrade_single_spec_to_plan( + nl: str, spec: StepSpec, defaults: dict[str, Any] +) -> list[StepSpec]: + """ + 
If NL implies multi-stage (e.g., output area + assets, or ODS + assets), + expand the single routed spec into a 2-step plan. Otherwise return [spec]. + """ + lowered = nl.lower() + types = spec.get("types") or _map_types_from_text(lowered) + # output area -> assets + if spec.get("command") == "uprns_by_output_area" and ( + types or "point cloud" in lowered or "download" in lowered + ): + return [ + spec, + { + "command": "download_assets", + "uprn_from_previous_csvs": True, + "types": types or POINTCLOUD_BOTH, + "download_dir": spec.get("download_dir") + or defaults.get("download_dir"), + "api_key_env": spec.get("api_key_env") or defaults.get("api_key_env"), + "db_url": spec.get("db_url") or defaults.get("db_url"), + }, + ] + # ODS -> assets + if spec.get("command") == "ods_to_uprn" and (types is not None): + return [ + spec, + { + "command": "download_assets", + "uprn_from_previous_csvs": True, + "types": types, + "download_dir": spec.get("download_dir") + or defaults.get("download_dir"), + "api_key_env": spec.get("api_key_env") or defaults.get("api_key_env"), + "db_url": spec.get("db_url") or defaults.get("db_url"), + }, + ] + return [spec] + + +# ====================================================================================== +# Execution helpers +# ====================================================================================== + + +def run_query_assist_step( + step: StepSpec, + py_exe: str, + qa_path: str, + dry_run: bool, + env: dict[str, str] | None = None, +) -> tuple[int, str]: + """Execute a single step via subprocess, stream logs, and return (rc, captured_text).""" + argv = _build_argv(step, py_exe, qa_path) + printable = " ".join(shlex.quote(x) for x in argv) + logging.info("Command: %s", printable) + if dry_run: + return 0, f"[dry-run] {printable}\n" + + p = subprocess.Popen( + argv, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, env=env + ) + captured_lines = [] + try: + assert p.stdout is not None + for line in p.stdout: + 
sys.stdout.write(line) + captured_lines.append(line) + finally: + rc = p.wait() + return rc, "".join(captured_lines) + + +def materialize_previous_uprn_csvs(state: WFState, step_idx: int) -> list[str]: + """ + Return list of CSV paths produced by earlier steps. + Uses parsed logs; if nothing parsed but we are in ODS->UPRN flow, default + to /downloads/ods_to_uprn.csv (query_assist behavior). + """ + from_logs = state.artifacts.get("csvs", []) + if from_logs: + return from_logs + + # fallback heuristic for ODS mapping + dl_base = state.plan[0].get("download_dir") or os.path.join( + os.getcwd(), "downloads" + ) + candidate = os.path.join(dl_base, "ods_to_uprn.csv") + if os.path.isfile(candidate): + return [candidate] + return [] + + +# ====================================================================================== +# LangGraph nodes +# ====================================================================================== + + +def node_plan(state: WFState) -> WFState: + defaults = { + "download_dir": None, # leave None -> query_assist default ./downloads + "api_key_env": "API_KEY", # aligns with query_assist default + "db_url": None, # use query_assist default unless user overrides + } + # Primary: LLM multi-step planner + plan = llm_plan( + state.nl, + defaults, + base_url=state.base_url, + model_id=state.model_id, + temperature=state.temperature, + top_p=state.top_p, + num_predict=state.num_predict, + num_ctx=state.num_ctx, + keep_alive=state.keep_alive, + force_json=state.force_json, + ) + # Fallback: legacy heuristics + if not plan: + plan = heuristic_plan(state.nl, defaults) + if not plan: + state.plan = [] + state.log.append("No actionable plan could be inferred.") + return state + state.plan = plan + return state + + +def node_execute(state: WFState) -> WFState: + if state.current >= len(state.plan): + return state # nothing to do + step = state.plan[state.current] + + # If this step takes UPRNs from previous CSVs, resolve them now + if 
step.get("uprn_from_previous_csvs"): + csvs = materialize_previous_uprn_csvs(state, state.current) + if not csvs: + state.log.append("No CSVs found from previous step(s).") + # Fail this step + state.current = len(state.plan) + return state + step = {**step} + step.pop("uprn_from_previous_csvs", None) + step["uprn"] = csvs + + # Execute + rc, captured = run_query_assist_step( + step, state.py_exe, state.qa_path, state.dry_run + ) + state.log.append(captured) + + # Parse any emitted CSV artifacts for downstream use + newly_found = _find_csvs_emitted(captured) + if newly_found: + extant = state.artifacts.get("csvs", []) + state.artifacts["csvs"] = list(dict.fromkeys(extant + newly_found)) + + # Advance or stop on error + if rc != 0: + state.log.append(f"Step {state.current} returned non-zero exit {rc}.") + state.current = len(state.plan) # abort + else: + state.current += 1 + return state + + +def node_check_done(state: WFState) -> str: + if state.current >= len(state.plan): + return END + if state.current >= state.max_steps: + state.log.append(f"Aborting: exceeded max_steps={state.max_steps}") + return END + return "execute" + + +# ====================================================================================== +# CLI / main +# ====================================================================================== + + +def _render_box(title: str, body: str) -> str: + term_width = shutil.get_terminal_size(fallback=(100, 24)).columns + max_width = max(60, min(term_width - 2, 100)) + wrap_width = max_width - 4 + body_lines = [] + for para in body.splitlines(): + if not para.strip(): + body_lines.append("") + else: + body_lines.extend(textwrap.wrap(para, width=wrap_width)) + title = title.strip() + title_line = f" {title} " + top = "┌" + "─" * (max_width - 2) + "┐" + sep = "├" + "─" * (max_width - 2) + "┤" + bot = "└" + "─" * (max_width - 2) + "┘" + if len(title_line) <= (max_width - 2): + left = (max_width - 2 - len(title_line)) // 2 + right = max_width - 2 - 
len(title_line) - left + top = "┌" + "─" * left + title_line + "─" * right + "┐" + content = "\n".join("│ " + line.ljust(max_width - 4) + " │" for line in body_lines) + return "\n".join([top, sep, content, bot]) + + +def parse_args() -> argparse.Namespace: + ap = argparse.ArgumentParser( + description="LangGraph NL workflow CLI for query_assist.py (multi-stage capable)" + ) + ap.add_argument("--model-id", default="gpt-oss:20b", help="Ollama model name/tag") + ap.add_argument( + "--query-assist-path", + default=os.path.join(os.path.dirname(__file__), "query_assist.py"), + help="Path to query_assist.py", + ) + ap.add_argument( + "--base-url", + default=os.environ.get("OLLAMA_HOST", "http://localhost:11434"), + help="Base URL of the Ollama server (or set OLLAMA_HOST)", + ) + ap.add_argument( + "--dry-run", action="store_true", help="Plan/print but do not execute" + ) + ap.add_argument("--once", "-q", help="Run a single NL query and exit") + ap.add_argument("--temperature", type=float, default=0.0) + ap.add_argument("--top-p", type=float, default=0.95) + ap.add_argument("--num-predict", type=int, default=256) + ap.add_argument("--num-ctx", type=int, default=None) + ap.add_argument("--keep-alive", default=None) + ap.add_argument("--no-force-json", action="store_true") + ap.add_argument("--max-steps", type=int, default=8) + ap.add_argument( + "-v", "--verbose", action="count", default=0, help="-v=info, -vv=debug" + ) + ap.add_argument( + "--plan-only", action="store_true", help="Only show the compiled plan and exit" + ) + return ap.parse_args() + + +def main(): + args = parse_args() + + # Logging level + if args.verbose >= 2: + level = logging.DEBUG + elif args.verbose == 1: + level = logging.INFO + else: + level = logging.WARNING + logging.basicConfig(level=level, format="%(levelname)s: %(message)s") + + # Intro banner + if level <= logging.INFO: + body = ( + "- Parses NL to a multi-step plan (heuristics + optional LLM).\n" + "- Executes steps via LangGraph with 
artifact passing.\n" + "- Detects CSVs emitted by query_assist.py and feeds them forward.\n" + "- Supports dry-run and plan-only modes for auditability." + ) + print(_render_box(f"LangGraph NL Workflow — {args.model_id}", body)) + + # Build the LangGraph + builder = StateGraph(WFState) + builder.add_node("plan", node_plan) + builder.add_node("execute", node_execute) + builder.add_edge(START, "plan") + builder.add_conditional_edges( + "plan", lambda s: "execute" if s.plan else END, {"execute": "execute", END: END} + ) + builder.add_conditional_edges( + "execute", node_check_done, {"execute": "execute", END: END} + ) + memory = MemorySaver() + graph = builder.compile(checkpointer=memory) + + # One-shot or REPL + def run_once(nl: str) -> int: + st = WFState( + nl=nl, + plan=[], + dry_run=bool(args.dry_run), + qa_path=args.query_assist_path, + base_url=args.base_url, + model_id=args.model_id, + temperature=args.temperature, + top_p=args.top_p, + num_predict=args.num_predict, + num_ctx=args.num_ctx, + keep_alive=args.keep_alive, + force_json=(not args.no_force_json), + verbose_level=level, + max_steps=args.max_steps, + ) + # PLAN + st = _coerce_wfstate( + graph.invoke( + st, config={"configurable": {"thread_id": f"tid-{time.time_ns()}"}} + ) + ) + st = node_plan(st) + if not st.plan: + logging.warning("No actionable plan produced for: %r", nl) + if st.log: + print("\n".join(st.log)) + return 0 + + # Print plan only at INFO or DEBUG verbosity + if level <= logging.INFO: + print("Plan:") + for i, step in enumerate(st.plan): + step_disp = { + k: v for k, v in step.items() if k != "uprn_from_previous_csvs" + } + print(f" {i+1}. 
{json.dumps(step_disp, ensure_ascii=False)}") + print() + if args.plan_only: + return 0 + + # EXECUTE + while st.current < len(st.plan) and st.current < st.max_steps: + status = node_check_done(st) + if status == END: + break + st = node_execute(st) + if st.current >= len(st.plan): + # Re-check termination + if node_check_done(st) == END: + break + # Final logs (already streamed, but attach any notes) + if st.log: + trailing = [ + line + for line in st.log + if "[dry-run]" in line + or "No CSVs found" in line + or "non-zero exit" in line + ] + if trailing: + print("\n".join(trailing)) + return 0 + + try: + if args.once: + sys.exit(run_once(args.once)) + if level <= logging.INFO: + print( + "LangGraph NL workflow for query_assist.py. Type 'exit' or Ctrl-D to quit." + ) + while True: + try: + nl = input("> ").strip() + except EOFError: + break + if not nl: + continue + if nl.lower() in {"exit", "quit"}: + break + rc = run_once(nl) + if rc != 0: + logging.warning("Workflow exited with code %d", rc) + except KeyboardInterrupt: + print() + logging.info("Interrupted.") + sys.exit(130) + except Exception as e: + logging.error("Fatal error: %s", e) + sys.exit(1) + + +if __name__ == "__main__": + main() From cb18dbd1e26fc4f9119fa013638afbdbe2956d47 Mon Sep 17 00:00:00 2001 From: gnathoi Date: Tue, 26 Aug 2025 16:47:12 +0100 Subject: [PATCH 11/19] getting close --- examples/nl_query_graph.py | 1186 +++++++++++++----------------------- 1 file changed, 415 insertions(+), 771 deletions(-) diff --git a/examples/nl_query_graph.py b/examples/nl_query_graph.py index 1b84355..38e1ccb 100644 --- a/examples/nl_query_graph.py +++ b/examples/nl_query_graph.py @@ -1,28 +1,28 @@ #!/usr/bin/env python3 """ -nl_query_workflow_cli.py +nl_query_graph.py — LLM-only planner, plan-first, two-stage ODS -A LangGraph-powered superset of nl_query_cli.py that can execute multi-stage -workflows against query_assist.py. 
Examples it can handle in one NL turn: +A lean rewrite of the previous LangGraph CLI that: +- Prints the plan FIRST (and ONLY when running at INFO, i.e., with -v) +- Uses an LLM-only planner (no heuristics) with a stronger prompt +- Guarantees a two-stage plan for ODS/Output Area when the request implies assets +- Keeps dry-run and plan-only modes +- Preserves artifact detection (CSV) and feeds them into download steps +- Adds robust retries and graceful fallback when Ollama returns empty/invalid content +- Understands both Ollama-native and OpenAI-style response shapes (choices[0].message.content) -- "Get all point clouds in output area E00004550" --> - Step 1: uprns_by_output_area -> CSV(s) - Step 2: download_assets --uprn --types did:lidar-pointcloud-merged,did:lidar-pointcloud-frame +Requirements: + pip install langgraph requests -- "For ODS G85013, download RGB and merged lidar to /data" --> - Step 1: ods_to_uprn -> /.../downloads/ods_to_uprn.csv - Step 2: download_assets --uprn ods_to_uprn.csv --types did:rgb-image,did:lidar-pointcloud-merged --download-dir /data - -It preserves the CLI surface and routing behavior of nl_query_cli.py, but adds: -- Planner (heuristics + optional LLM) that compiles an ordered step list -- LangGraph execution loop with checkpoint-able state, retries, and artifact passing -- Robust parsing of query_assist.py logs to discover produced CSV artifacts -- Deterministic "dry-run" plan prints for auditability +Notes: +- We keep LangGraph for structure/checkpointing, but we do not auto-run it before + printing the plan. We explicitly call plan -> print -> execute. +- The LLM prompt is opinionated to produce two steps when the user asks for + assets, even if types are not specified (types omitted === all types). 
""" from __future__ import annotations import argparse -import dataclasses import json import logging import os @@ -33,313 +33,73 @@ import sys import textwrap import time +from dataclasses import dataclass, field from typing import Any, Literal, TypedDict -# --- Third-party --- -# pip install langgraph[all] requests +# Third-party import requests from langgraph.checkpoint.memory import MemorySaver from langgraph.graph import END, START, StateGraph -# ====================================================================================== -# Carry forward the routing prompts / type mappings / helpers from nl_query_cli.py -# (kept in-sync conceptually; this file does not import the other to stay standalone). -# ====================================================================================== - -SYSTEM_ROUTER_PROMPT = """You are a rigorous function-call router for a Python CLI named query_assist.py. - -Supported commands and how to populate them: - -1) download_assets - Required: uprn (string CSV path OR array of strings like ["5045394","200003455212"]) - Optional: sensor (string, e.g., "bess:OusterLidarSensor") - types (array of strings; each a type IRI, e.g., ["did:rgb-image","did:lidar-pointcloud-merged"]) - download_dir (string path) - api_key_env (string, name of env var with API key) - db_url (string URL to SPARQL endpoint) - -2) ods_to_uprn - Required: ods (string CSV path OR array of strings like ["G85013","Q12345"]) - -3) uprns_by_output_area - Required: output_area (string CSV path OR array of strings, e.g., ["E00004550","E00032882"]) - -Schema (MUST output exactly one JSON object with these keys as needed): -{ - "command": "download_assets" | "ods_to_uprn" | "uprns_by_output_area", - "uprn": string | string[] | null, - "ods": string | string[] | null, - "output_area": string | string[] | null, - "sensor": string | null, - "types": string[] | null, - "download_dir": string | null, - "api_key_env": string | null, - "db_url": string | null -} - 
-Constraints: -- Return ONLY the JSON object. No prose, no markdown. -- If the user request implies asset types, map them to the supported IRIs if possible: - - RGB image -> "did:rgb-image" - - merged lidar point cloud -> "did:lidar-pointcloud-merged" - - lidar range panorama -> "did:lidar-range-pano" - - lidar reflectance panorama -> "did:lidar-reflectance-pano" - - lidar signal panorama -> "did:lidar-signal-pano" - - lidar near-infrared panorama -> "did:lidar-nearir-pano" - - IR false color -> "did:ir-false-color-image" - - IR temperature array -> "did:ir-temperature-array" - - IR counts -> "did:ir-count-image" - - temperature (no contentUrl) -> "did:celsius-temperature" - - relative humidity (no contentUrl) -> "did:relative-humidity" -- Prefer being decisive. When in doubt, infer sensible defaults. -""" - -TYPE_ALIASES = { - # canonical mappings used throughout your stack - "rgb": "did:rgb-image", - "rgb image": "did:rgb-image", - "merged lidar": "did:lidar-pointcloud-merged", - "merged lidar point cloud": "did:lidar-pointcloud-merged", - "lidar point cloud": "did:lidar-pointcloud-frame", - "point cloud": "did:lidar-pointcloud-frame", - "point clouds": None, # expands to both merged + frame - "lidar range panorama": "did:lidar-range-pano", - "lidar reflectance panorama": "did:lidar-reflectance-pano", - "lidar signal panorama": "did:lidar-signal-pano", - "lidar nearir panorama": "did:lidar-nearir-pano", - "ir false color": "did:ir-false-color-image", - "ir temperature array": "did:ir-temperature-array", - "ir counts": "did:ir-count-image", - "temperature": "did:celsius-temperature", - "relative humidity": "did:relative-humidity", -} - -POINTCLOUD_BOTH = ["did:lidar-pointcloud-merged", "did:lidar-pointcloud-frame"] - -# ---------------------------------------------------------------------------------- -# High-level planner system prompt (multi-step). Explains assets & synonyms so the -# LLM can infer intent even with loose language ("building" -> UPRN, etc.). 
-# ---------------------------------------------------------------------------------- -PLAN_SYSTEM_PROMPT = """ +# ============================================================================ +# Strong LLM planning prompt (LLM-only) +# ============================================================================ +PLAN_SYSTEM_PROMPT = r""" You are a planning assistant that converts a natural language request about retrieving built environment asset data into an ordered execution plan for query_assist.py. -Available low-level commands (same as router): - 1. uprns_by_output_area – given output area code(s) yield UPRN CSV(s) - 2. ods_to_uprn – given ODS clinical practice code(s) yield UPRN CSV - 3. download_assets – given UPRN(s) (list or CSV path) download assets - -Important asset type IRIs (use when mentioned or implied): - did:rgb-image, did:lidar-pointcloud-merged, did:lidar-pointcloud-frame, - did:lidar-range-pano, did:lidar-reflectance-pano, did:lidar-signal-pano, - did:lidar-nearir-pano, did:ir-false-color-image, did:ir-temperature-array, - did:ir-count-image, did:celsius-temperature, did:relative-humidity - -Synonyms / interpretation guidance: - "building", "buildings", "property", "properties" -> treat as UPRN(s) - "practice", "gp practice" -> ODS code - "thermal array", "temperature array", "thermal sensor" -> did:ir-temperature-array - "thermal image" -> did:ir-false-color-image (unless array explicitly stated) - "point clouds" (plural, no qualifier) -> both merged + frame - "merged point cloud" -> did:lidar-pointcloud-merged - "point cloud frame" / "single frame" -> did:lidar-pointcloud-frame - If user asks for "temperature and humidity" -> did:celsius-temperature + did:relative-humidity - -Planning rules: - - If an intermediate mapping (ODS or output area) is needed to reach UPRNs, plan that first, then a download_assets step referencing previous CSV output (use uprn_from_previous_csvs=true instead of explicit uprn list). 
- - If the user directly supplies UPRNs (numbers) OR a CSV path that obviously contains UPRNs, a single download_assets step may suffice. - - Always be decisive; include only the steps required. - -Return JSON ONLY, schema: +Available CLI commands: + 1. uprns_by_output_area – given output area code(s) yield UPRN CSV(s) + 2. ods_to_uprn – given ODS clinical practice code(s) yield UPRN CSV + 3. download_assets – given UPRN(s) (list or CSV path) download assets + +Asset type IRIs (use when mentioned or implied; omit "types" to mean ALL types): + did:rgb-image, did:lidar-pointcloud-merged, did:lidar-pointcloud-frame, + did:lidar-range-pano, did:lidar-reflectance-pano, did:lidar-signal-pano, + did:lidar-nearir-pano, did:ir-false-color-image, did:ir-temperature-array, + did:ir-count-image, did:celsius-temperature, did:relative-humidity + +Synonyms / interpretation guidance (LLM, be decisive): + "building"/"property" -> UPRN(s) + "practice"/"gp practice" -> ODS code + "point clouds" (plural) -> include merged and frame unless otherwise specified + "thermal image" -> did:ir-false-color-image unless arrays are explicitly requested + +CRITICAL RULES: + • If the user asks to get/download assets and provides ODS or Output Area, + ALWAYS output a TWO-STEP plan: the mapping step first, then a download_assets + step that consumes the CSVs from the previous step. Use + {"uprn_from_previous_csvs": true} on that download step. + • If the user says "all assets" or does not specify types, omit the "types" + field entirely on download_assets to indicate ALL types. + • Prefer being decisive and avoid asking questions. 
+ +JSON output schema (emit ONLY one JSON object): { - "steps": [ - { - "command": "uprns_by_output_area"|"ods_to_uprn"|"download_assets", - "output_area": string|[string]|null, - "ods": string|[string]|null, - "uprn": string|[string]|null, - "types": [string]|null, - "sensor": string|null, - "download_dir": string|null, - "api_key_env": string|null, - "db_url": string|null, - "uprn_from_previous_csvs": true|false|null - }, ... - ] + "steps": [ + { + "command": "uprns_by_output_area"|"ods_to_uprn"|"download_assets", + "output_area": string|[string]|null, + "ods": string|[string]|null, + "uprn": string|[string]|null, + "types": [string]|null, + "sensor": string|null, + "download_dir": string|null, + "api_key_env": string|null, + "db_url": string|null, + "uprn_from_previous_csvs": true|false|null + }, ... + ] } -Notes: - - Omit keys or set null when not applicable. - - Use uprn_from_previous_csvs=true only on a download_assets step that should read the CSV(s) produced by previous mapping step(s). - - Do NOT include explanatory prose. +Do not include any prose. Output the JSON object only. """ -def _extract_first_json(text: str) -> dict | None: - start = text.find("{") - if start == -1: - return None - depth = 0 - for i, c in enumerate(text[start:], start=start): - if c == "{": - depth += 1 - elif c == "}": - depth -= 1 - if depth == 0: - try: - return json.loads(text[start : i + 1]) - except Exception: - return None - return None - - -def _find_csvs_emitted(stream_text: str) -> list[str]: - """ - Parse query_assist.py logs to discover created CSVs. - It prints e.g.: - "✔ Saved CSV for {oa} → {path}" - "✔ Saved ODS→UPRN CSV → {path}" - Accept both unicode arrows and ASCII '->'. - """ - csvs = [] - # Unicode arrow / ASCII arrow variants, greedy path match until whitespace end - patterns = [ - r"Saved CSV for .*? → ([^\s]+\.csv)", - r"Saved CSV for .*? -> ([^\s]+\.csv)", - r"Saved ODS.?UPRN CSV .*? → ([^\s]+\.csv)", - r"Saved ODS.?UPRN CSV .*? 
-> ([^\s]+\.csv)", - ] - for pat in patterns: - for m in re.finditer(pat, stream_text): - csvs.append(m.group(1)) - # Deduplicate preserving order - seen = set() - out = [] - for p in csvs: - if p not in seen: - out.append(p) - seen.add(p) - return out - - -def _ensure_list_or_path(v: None | str | list[str]) -> list[str]: - if v is None: - return [] - if isinstance(v, list): - return [str(x) for x in v if str(x).strip()] - # Post-augmentation: if user asked for assets but only got a mapping step, add download step - lowered = state.nl.lower() - if len(plan) == 1 and ("asset" in lowered or "all the" in lowered): - first_cmd = plan[0].get("command") - if first_cmd in {"ods_to_uprn", "uprns_by_output_area"}: - # Only add if no existing download step follows - plan.append( - { - "command": "download_assets", - "uprn_from_previous_csvs": True, # consume prior CSV - "types": None, # all asset types - "download_dir": plan[0].get("download_dir") - or defaults.get("download_dir"), - "api_key_env": defaults.get("api_key_env"), - "db_url": plan[0].get("db_url") or defaults.get("db_url"), - } - ) - state.plan = plan - return [s] if s else [] - - -def _build_argv(spec: dict[str, Any], py: str, qa_path: str) -> list[str]: - cmd = [py, qa_path] - command = spec.get("command") - if command == "download_assets": - uprn = _ensure_list_or_path(spec.get("uprn")) - if not uprn: - raise ValueError("download_assets requires 'uprn'.") - cmd += ["--uprn"] + uprn - if spec.get("sensor"): - cmd += ["--sensor", str(spec["sensor"])] - if spec.get("types"): - cmd += ["--types", ",".join(spec["types"])] - elif command == "ods_to_uprn": - ods = _ensure_list_or_path(spec.get("ods")) - if not ods: - raise ValueError("ods_to_uprn requires 'ods'.") - cmd += ["--ods"] + ods - elif command == "uprns_by_output_area": - oa = _ensure_list_or_path(spec.get("output_area")) - if not oa: - raise ValueError("uprns_by_output_area requires 'output_area'.") - cmd += ["--output-area"] + oa - else: - raise 
ValueError(f"Unsupported command: {command!r}") - - if spec.get("db_url"): - cmd += ["--db-url", str(spec["db_url"])] - if spec.get("download_dir"): - cmd += ["--download-dir", str(spec["download_dir"])] - if spec.get("api_key_env"): - cmd += ["--api-key-env", str(spec["api_key_env"])] - return cmd - - -def _map_types_from_text(lowered: str) -> list[str] | None: - # Broad heuristics for types based on NL - wants_pointclouds = re.search(r"\bpoint\s*clouds?\b", lowered) is not None - wants_merged = "merged lidar" in lowered or re.search( - r"merged\s+lidar\s+point\s*cloud", lowered - ) - wants_frame = "pointcloud frame" in lowered or "single frame" in lowered - - types = [] - if wants_pointclouds: - types.extend(POINTCLOUD_BOTH) - if wants_merged: - types.append("did:lidar-pointcloud-merged") - if wants_frame: - types.append("did:lidar-pointcloud-frame") - - if "rgb" in lowered and "image" in lowered: - types.append("did:rgb-image") - if "range panorama" in lowered: - types.append("did:lidar-range-pano") - if "reflectance panorama" in lowered: - types.append("did:lidar-reflectance-pano") - if "signal panorama" in lowered: - types.append("did:lidar-signal-pano") - if "nearir" in lowered or "near-infrared" in lowered: - types.append("did:lidar-nearir-pano") - if "ir false" in lowered: - types.append("did:ir-false-color-image") - if "ir temperature array" in lowered: - types.append("did:ir-temperature-array") - # Additional thermal/temperature array synonyms - if ( - re.search(r"thermal\s+arrays?", lowered) - or re.search(r"temperature\s+arrays?", lowered) - or "thermal array" in lowered - ): - types.append("did:ir-temperature-array") - if re.search(r"thermal\s+images?", lowered): - # Map generic thermal image request to false-color IR image if not already specified - types.append("did:ir-false-color-image") - - if not types: - return None - # dedupe preserve order - out, seen = [], set() - for t in types: - if t not in seen: - out.append(t) - seen.add(t) - return out - 
- -# ====================================================================================== -# Planning + LangGraph state -# ====================================================================================== - - +# ============================================================================ +# Types & State +# ============================================================================ class StepSpec(TypedDict, total=False): command: Literal["download_assets", "ods_to_uprn", "uprns_by_output_area"] uprn: list[str] | str | None @@ -350,19 +110,16 @@ class StepSpec(TypedDict, total=False): download_dir: str | None api_key_env: str | None db_url: str | None - # internal: indicates this step will take CSVs from previous steps uprn_from_previous_csvs: bool -@dataclasses.dataclass +@dataclass class WFState: nl: str - plan: list[StepSpec] + plan: list[StepSpec] = field(default_factory=list) current: int = 0 - artifacts: dict[str, Any] = dataclasses.field( - default_factory=dict - ) # e.g., {"csvs": [...]} - log: list[str] = dataclasses.field(default_factory=list) + artifacts: dict[str, Any] = field(default_factory=dict) # e.g., {"csvs": [...]} + log: list[str] = field(default_factory=list) dry_run: bool = False py_exe: str = sys.executable qa_path: str = os.path.join(os.path.dirname(__file__), "query_assist.py") @@ -378,267 +135,74 @@ class WFState: max_steps: int = 8 -def _coerce_wfstate(obj: Any) -> WFState: - """Ensure we have a WFState instance (LangGraph may return a plain dict).""" - if isinstance(obj, WFState): - return obj - if isinstance(obj, dict): - # Build kwargs respecting dataclass fields - kwargs = {} - for f in dataclasses.fields(WFState): - if f.name in obj: - kwargs[f.name] = obj[f.name] - else: - if f.default is not dataclasses.MISSING: # type: ignore[attr-defined] - kwargs[f.name] = f.default # type: ignore[assignment] - elif getattr(f, "default_factory", dataclasses.MISSING) is not dataclasses.MISSING: # type: ignore[attr-defined] - kwargs[f.name] 
= f.default_factory() # type: ignore[call-arg] - else: - kwargs[f.name] = None - return WFState(**kwargs) # type: ignore[arg-type] - raise TypeError(f"Cannot coerce state of type {type(obj)} to WFState") - - -# ====================================================================================== -# Heuristic Planner (covers the common multi-step cases deterministically) -# ====================================================================================== - - -def heuristic_plan(nl: str, defaults: dict[str, Any]) -> list[StepSpec] | None: - """ - Returns a list of StepSpec if it can deduce a plan without the LLM. - Covers: - - output area + asset types -> [uprns_by_output_area, download_assets] - - ods + asset types -> [ods_to_uprn, download_assets] - - simple one-shot commands (download_assets / ods_to_uprn / uprns_by_output_area) - """ - text = nl.strip() - if not text: - return None - lowered = text.lower() - - csv_paths = re.findall(r'(?:(?:[A-Za-z]:)?[^\s"\'<>|]+\.csv)\b', text) - oa_codes = re.findall(r"\bE\d{8}\b", text) - ods_codes = re.findall(r"\b[A-Z]\d{5}\b", text) - uprns = re.findall(r"\b\d{6,}\b", text) - endpoint_match = re.search(r"(https?://[\w\.-:%/]+)", text) - endpoint_url = endpoint_match.group(1) if endpoint_match else None - - # Extract common options if present - download_dir = None - m = re.search(r"(?: to | into )\s+(/[^ ]+)", lowered) - if m: - download_dir = m.group(1) - - # Types - types = _map_types_from_text(lowered) - - # If user provided a CSV and mentioned UPRNs -> treat as download step - if csv_paths and ("uprn" in lowered or "uprns" in lowered): - return [ - { - "command": "download_assets", - "uprn": csv_paths if len(csv_paths) > 1 else csv_paths[0], - "types": types, - "download_dir": download_dir or defaults.get("download_dir"), - "api_key_env": defaults.get("api_key_env"), - "db_url": endpoint_url or defaults.get("db_url"), - } - ] - - # output area + types (or "point clouds") - if (oa_codes or "output area" in lowered 
or "output areas" in lowered) and ( - types or "point cloud" in lowered - ): - oa_list = oa_codes if oa_codes else [] - # allow CSV of output areas too - if csv_paths and not ("uprn" in lowered): - oa_list = csv_paths if oa_list == [] else oa_list + csv_paths +# ============================================================================ +# LLM plumbing (robust to sparse/odd Ollama responses) +# ============================================================================ - return [ - { - "command": "uprns_by_output_area", - "output_area": oa_list - if oa_list - else (csv_paths[0] if csv_paths else None), - "download_dir": defaults.get("download_dir"), - "db_url": defaults.get("db_url"), - }, - { - "command": "download_assets", - "uprn_from_previous_csvs": True, # take outputs of step 1 - "types": types - or POINTCLOUD_BOTH, # default to both if only said "point clouds" - "download_dir": download_dir or defaults.get("download_dir"), - "api_key_env": defaults.get("api_key_env"), - "db_url": defaults.get("db_url"), - }, - ] - # ods + types - if (ods_codes or "ods" in lowered) and (types is not None): - ods_list = ods_codes if ods_codes else (csv_paths if csv_paths else None) - return [ - { - "command": "ods_to_uprn", - "ods": ods_list, - "download_dir": defaults.get("download_dir"), - "db_url": defaults.get("db_url"), - }, - { - "command": "download_assets", - "uprn_from_previous_csvs": True, - "types": types, - "download_dir": download_dir or defaults.get("download_dir"), - "api_key_env": defaults.get("api_key_env"), - "db_url": defaults.get("db_url"), - }, - ] - - # plain output area listing (no types mentioned) - if oa_codes or ("output area" in lowered or "output areas" in lowered): - return [ - { - "command": "uprns_by_output_area", - "output_area": oa_codes - if oa_codes - else (csv_paths[0] if csv_paths else None), - "download_dir": defaults.get("download_dir"), - "db_url": defaults.get("db_url"), - } - ] - - # plain ods mapping (no types) - if ods_codes or 
"ods" in lowered: - return [ - { - "command": "ods_to_uprn", - "ods": ods_codes - if ods_codes - else (csv_paths[0] if csv_paths else None), - "download_dir": defaults.get("download_dir"), - "db_url": defaults.get("db_url"), - } - ] - - # direct UPRN download - if uprns and ( - "download" in lowered - or "assets" in lowered - or "point" in lowered - or "rgb" in lowered - or "image" in lowered - or "lidar" in lowered - ): - return [ - { - "command": "download_assets", - "uprn": uprns, - "types": types, - "download_dir": download_dir or defaults.get("download_dir"), - "api_key_env": defaults.get("api_key_env"), - "db_url": endpoint_url or defaults.get("db_url"), - } - ] - - return None - - -# ====================================================================================== -# LLM multi-step planner (primary) – falls back to heuristics if invalid/empty. -# ====================================================================================== +def _post_ollama( + base_url: str, payload: dict[str, Any], timeout_s: float = 120.0 +) -> dict[str, Any]: + url = base_url.rstrip("/") + "/api/chat" + r = requests.post(url, json=payload, timeout=(5.0, timeout_s)) + r.raise_for_status() + return r.json() -def llm_plan( - nl: str, - defaults: dict[str, Any], - base_url: str, - model_id: str, - temperature: float, - top_p: float, - num_predict: int, - num_ctx: int | None, - keep_alive: str | None, - force_json: bool, -) -> list[StepSpec] | None: - messages = [ - {"role": "system", "content": PLAN_SYSTEM_PROMPT}, - {"role": "user", "content": nl}, - ] - try: - resp = ollama_chat( - base_url=base_url, - model=model_id, - messages=messages, - temperature=temperature, - top_p=top_p, - num_predict=num_predict, - num_ctx=num_ctx, - keep_alive=keep_alive, - force_json=force_json, - ) - content = None - if isinstance(resp.get("message"), dict): - content = resp["message"].get("content") - if not content and isinstance(resp.get("response"), str): - content = resp.get("response") - if 
not content: - return None - plan_obj = None - try: - plan_obj = json.loads(content) - except Exception: - plan_obj = _extract_first_json(content) - if not plan_obj or not isinstance(plan_obj, dict): - return None - steps_raw = plan_obj.get("steps") - if not isinstance(steps_raw, list) or not steps_raw: - return None - steps: list[StepSpec] = [] - for s in steps_raw: - if not isinstance(s, dict): - continue - cmd = s.get("command") - if cmd not in {"download_assets", "ods_to_uprn", "uprns_by_output_area"}: - continue - step: StepSpec = { - "command": cmd, # type: ignore[assignment] - } - for key in [ - "uprn", - "ods", - "output_area", - "types", - "sensor", - "download_dir", - "api_key_env", - "db_url", - ]: - if key in s: - step[key] = s[key] # type: ignore[index, assignment] - if s.get("uprn_from_previous_csvs"): - step["uprn_from_previous_csvs"] = True # type: ignore[index] - # Inject defaults where appropriate - if "api_key_env" not in step and defaults.get("api_key_env"): - step["api_key_env"] = defaults["api_key_env"] # type: ignore[index] - if "download_dir" not in step and defaults.get("download_dir"): - step["download_dir"] = defaults["download_dir"] # type: ignore[index] - steps.append(step) - return steps or None - except Exception: +def _extract_first_json(text: str) -> dict | None: + start = text.find("{") + if start == -1: return None + depth = 0 + for i, c in enumerate(text[start:], start=start): + if c == "{": + depth += 1 + elif c == "}": + depth -= 1 + if depth == 0: + try: + return json.loads(text[start : i + 1]) + except Exception: + return None + return None -# ====================================================================================== -# Optional: LLM Planner fallback (uses your Ollama server same as nl_query_cli.py) -# Produces a single-step spec; we then upgrade it into a plan if the NL implies multi. 
-# ====================================================================================== +def _extract_content_variants(resp: dict[str, Any]) -> str: + # Try common shapes returned by Ollama and proxies + if isinstance(resp.get("message"), dict): + c = resp["message"].get("content") + if c: + return c + # OpenAI/vLLM/LM Studio style + choices = resp.get("choices") + if isinstance(choices, list) and choices: + first = choices[0] or {} + msg = first.get("message") or {} + c = msg.get("content") or first.get("text") + if isinstance(c, str) and c: + return c + # Some servers use top-level 'response' + c = resp.get("response") + if isinstance(c, str) and c: + return c + # Rare: top-level 'content' + c = resp.get("content") + if isinstance(c, str) and c: + return c + # Some wrappers return {'messages': [{'content': ...}]} + msgs = resp.get("messages") + if isinstance(msgs, list) and msgs and isinstance(msgs[-1], dict): + c = msgs[-1].get("content") + if isinstance(c, str) and c: + return c + return "" def ollama_chat( base_url: str, model: str, messages: list[dict[str, str]], + *, temperature: float = 0.0, top_p: float = 0.95, num_predict: int = 256, @@ -646,105 +210,209 @@ def ollama_chat( keep_alive: str | None = None, force_json: bool = True, timeout_s: float = 120.0, -) -> dict[str, Any]: - url = base_url.rstrip("/") + "/api/chat" - payload: dict[str, Any] = { - "model": model, - "messages": messages, - "stream": False, - "options": { - "temperature": float(temperature), - "top_p": float(top_p), - "num_predict": int(num_predict), - }, - } - if num_ctx is not None: - payload["options"]["num_ctx"] = int(num_ctx) - if keep_alive: - payload["keep_alive"] = str(keep_alive) - if force_json: - payload["format"] = "json" - r = requests.post(url, json=payload, timeout=(5.0, timeout_s)) - r.raise_for_status() - return r.json() - - -def llm_route_to_spec(nl: str, base_url: str, model_id: str, **opts) -> StepSpec | None: + retries: int = 2, + logger: logging.Logger | None = 
None, +) -> str: + """Return assistant content as a string. Retries with/without JSON forcing.""" + last_err: Exception | None = None + for attempt in range(retries + 1): + payload: dict[str, Any] = { + "model": model, + "messages": messages, + "stream": False, + "options": { + "temperature": float(temperature), + "top_p": float(top_p), + "num_predict": int(num_predict), + }, + } + use_json = force_json if attempt == 0 else False + if num_ctx is not None: + payload["options"]["num_ctx"] = int(num_ctx) + if keep_alive: + payload["keep_alive"] = str(keep_alive) + if use_json: + payload["format"] = "json" + try: + resp = _post_ollama(base_url, payload, timeout_s=timeout_s) + content = _extract_content_variants(resp) + if logger and logger.isEnabledFor(logging.DEBUG): + logger.debug("Ollama raw keys: %s", list(resp.keys())) + logger.debug("Ollama content preview: %r", content[:200]) + if content: + return content + last_err = RuntimeError("empty content from Ollama") + except Exception as e: + last_err = e + # brief backoff + time.sleep(0.2 * (attempt + 1)) + if last_err: + raise last_err + raise RuntimeError("Ollama returned no content.") + + +def llm_plan(state: WFState) -> list[StepSpec]: messages = [ - {"role": "system", "content": SYSTEM_ROUTER_PROMPT}, - {"role": "user", "content": nl}, + {"role": "system", "content": PLAN_SYSTEM_PROMPT}, + {"role": "user", "content": state.nl}, ] - resp = ollama_chat(base_url, model_id, messages, **opts) - content = None - if isinstance(resp.get("message"), dict): - content = resp["message"].get("content") - if not content and isinstance(resp.get("response"), str): - content = resp["response"] - if not content: - return None - obj = _extract_first_json(content) or None - if not obj or "command" not in obj: - return None - # Normalize to StepSpec - step: StepSpec = { - "command": obj["command"], - "uprn": obj.get("uprn"), - "ods": obj.get("ods"), - "output_area": obj.get("output_area"), - "sensor": obj.get("sensor"), - "types": 
obj.get("types"), - "download_dir": obj.get("download_dir"), - "api_key_env": obj.get("api_key_env"), - "db_url": obj.get("db_url"), - } - return step - - -def upgrade_single_spec_to_plan( - nl: str, spec: StepSpec, defaults: dict[str, Any] -) -> list[StepSpec]: - """ - If NL implies multi-stage (e.g., output area + assets, or ODS + assets), - expand the single routed spec into a 2-step plan. Otherwise return [spec]. - """ - lowered = nl.lower() - types = spec.get("types") or _map_types_from_text(lowered) - # output area -> assets - if spec.get("command") == "uprns_by_output_area" and ( - types or "point cloud" in lowered or "download" in lowered - ): - return [ - spec, - { - "command": "download_assets", - "uprn_from_previous_csvs": True, - "types": types or POINTCLOUD_BOTH, - "download_dir": spec.get("download_dir") - or defaults.get("download_dir"), - "api_key_env": spec.get("api_key_env") or defaults.get("api_key_env"), - "db_url": spec.get("db_url") or defaults.get("db_url"), - }, - ] - # ODS -> assets - if spec.get("command") == "ods_to_uprn" and (types is not None): - return [ - spec, + # Try calling the planner; on failure, DO NOT abort. Fall through to synthesis. + try: + content = ollama_chat( + base_url=state.base_url, + model=state.model_id, + messages=messages, + temperature=state.temperature, + top_p=state.top_p, + num_predict=state.num_predict, + num_ctx=state.num_ctx, + keep_alive=state.keep_alive, + force_json=state.force_json, + retries=2, + logger=logging.getLogger(__name__), + ) + except Exception as e: + logging.getLogger(__name__).warning("Planner LLM call failed: %s", e) + content = "" + + obj: dict | None + try: + obj = json.loads(content) + except Exception: + obj = _extract_first_json(content) + if not obj or not isinstance(obj, dict): + # LAST-DITCH guardrail: if the LLM truly failed to produce JSON but the NL + # clearly requests assets with ODS/OA, synthesize a minimal two-step plan + # rather than crashing. 
This is NOT a heuristic router; it's a fail-safe + # to keep execution usable when the model returns empty text. + lowered = state.nl.lower() + ods = re.findall(r"\b[A-Z]\d{5}\b", state.nl) + oa = re.findall(r"\bE\d{8}\b", state.nl) + implies_assets = any( + w in lowered + for w in ( + "asset", + "assets", + "download", + "point cloud", + "rgb", + "image", + "lidar", + ) + ) + if implies_assets and (ods or oa): + if ods: + return [ + {"command": "ods_to_uprn", "ods": ods}, + {"command": "download_assets", "uprn_from_previous_csvs": True}, + ] + if oa: + return [ + {"command": "uprns_by_output_area", "output_area": oa}, + {"command": "download_assets", "uprn_from_previous_csvs": True}, + ] + # If we get here, there is nothing actionable — report the original issue. + raise RuntimeError("LLM did not return a valid JSON object.") + + steps_raw = obj.get("steps") + if not isinstance(steps_raw, list) or not steps_raw: + raise RuntimeError("LLM returned an empty steps list.") + + steps: list[StepSpec] = [] + for s in steps_raw: + if not isinstance(s, dict): + continue + cmd = s.get("command") + if cmd not in {"download_assets", "ods_to_uprn", "uprns_by_output_area"}: + continue + step: StepSpec = {"command": cmd} # type: ignore[assignment] + for key in [ + "uprn", + "ods", + "output_area", + "types", + "sensor", + "download_dir", + "api_key_env", + "db_url", + "uprn_from_previous_csvs", + ]: + if key in s: + step[key] = s[key] # type: ignore[index] + steps.append(step) + + # Ensure two-stage plan if assets implied + lowered = state.nl.lower() + implies_assets = any( + w in lowered + for w in ["asset", "assets", "download", "point cloud", "rgb", "image", "lidar"] + ) + has_mapping = any( + st.get("command") in {"ods_to_uprn", "uprns_by_output_area"} for st in steps + ) + has_download = any(st.get("command") == "download_assets" for st in steps) + if implies_assets and has_mapping and not has_download: + steps.append( { "command": "download_assets", 
"uprn_from_previous_csvs": True, - "types": types, - "download_dir": spec.get("download_dir") - or defaults.get("download_dir"), - "api_key_env": spec.get("api_key_env") or defaults.get("api_key_env"), - "db_url": spec.get("db_url") or defaults.get("db_url"), - }, - ] - return [spec] + } + ) + + return steps -# ====================================================================================== +# ============================================================================ # Execution helpers -# ====================================================================================== +# ============================================================================ + + +def _build_argv(spec: StepSpec, py: str, qa_path: str) -> list[str]: + cmd = [sys.executable if not py else py, qa_path] + command = spec.get("command") + if command == "download_assets": + uprn = spec.get("uprn") + if isinstance(uprn, list): + uprn_list = [str(x) for x in uprn] + elif isinstance(uprn, str): + uprn_list = [uprn] + else: + raise ValueError( + "download_assets requires 'uprn' unless 'uprn_from_previous_csvs' is used earlier." 
+ ) + cmd += ["--uprn"] + uprn_list + if spec.get("sensor"): + cmd += ["--sensor", str(spec["sensor"])] + if spec.get("types"): + cmd += ["--types", ",".join(spec["types"])] + elif command == "ods_to_uprn": + ods = spec.get("ods") + if isinstance(ods, list): + ods_list = [str(x) for x in ods] + elif isinstance(ods, str): + ods_list = [ods] + else: + raise ValueError("ods_to_uprn requires 'ods'.") + cmd += ["--ods"] + ods_list + elif command == "uprns_by_output_area": + oa = spec.get("output_area") + if isinstance(oa, list): + oa_list = [str(x) for x in oa] + elif isinstance(oa, str): + oa_list = [oa] + else: + raise ValueError("uprns_by_output_area requires 'output_area'.") + cmd += ["--output-area"] + oa_list + else: + raise ValueError(f"Unsupported command: {command!r}") + + if spec.get("db_url"): + cmd += ["--db-url", str(spec["db_url"])] + if spec.get("download_dir"): + cmd += ["--download-dir", str(spec["download_dir"])] + if spec.get("api_key_env"): + cmd += ["--api-key-env", str(spec["api_key_env"])] + return cmd def run_query_assist_step( @@ -764,7 +432,7 @@ def run_query_assist_step( p = subprocess.Popen( argv, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, env=env ) - captured_lines = [] + captured_lines: list[str] = [] try: assert p.stdout is not None for line in p.stdout: @@ -775,94 +443,76 @@ def run_query_assist_step( return rc, "".join(captured_lines) -def materialize_previous_uprn_csvs(state: WFState, step_idx: int) -> list[str]: - """ - Return list of CSV paths produced by earlier steps. - Uses parsed logs; if nothing parsed but we are in ODS->UPRN flow, default - to /downloads/ods_to_uprn.csv (query_assist behavior). - """ +def _find_csvs_emitted(stream_text: str) -> list[str]: + """Parse query_assist.py logs to discover created CSVs.""" + csvs: list[str] = [] + patterns = [ + r"Saved CSV for .*? → ([^\s]+\.csv)", + r"Saved CSV for .*? -> ([^\s]+\.csv)", + r"Saved ODS.?UPRN CSV .*? → ([^\s]+\.csv)", + r"Saved ODS.?UPRN CSV .*? 
-> ([^\s]+\.csv)", + ] + for pat in patterns: + for m in re.finditer(pat, stream_text): + csvs.append(m.group(1)) + # Deduplicate preserving order + seen: set[str] = set() + out: list[str] = [] + for pth in csvs: + if pth not in seen: + out.append(pth) + seen.add(pth) + return out + + +def materialize_previous_uprn_csvs(state: WFState) -> list[str]: + """Return list of CSV paths produced by earlier steps.""" from_logs = state.artifacts.get("csvs", []) if from_logs: return from_logs - - # fallback heuristic for ODS mapping - dl_base = state.plan[0].get("download_dir") or os.path.join( - os.getcwd(), "downloads" - ) + # Fallback: common default path from ODS mapping + dl_base = os.path.join(os.getcwd(), "downloads") candidate = os.path.join(dl_base, "ods_to_uprn.csv") if os.path.isfile(candidate): return [candidate] return [] -# ====================================================================================== -# LangGraph nodes -# ====================================================================================== - - -def node_plan(state: WFState) -> WFState: - defaults = { - "download_dir": None, # leave None -> query_assist default ./downloads - "api_key_env": "API_KEY", # aligns with query_assist default - "db_url": None, # use query_assist default unless user overrides - } - # Primary: LLM multi-step planner - plan = llm_plan( - state.nl, - defaults, - base_url=state.base_url, - model_id=state.model_id, - temperature=state.temperature, - top_p=state.top_p, - num_predict=state.num_predict, - num_ctx=state.num_ctx, - keep_alive=state.keep_alive, - force_json=state.force_json, - ) - # Fallback: legacy heuristics - if not plan: - plan = heuristic_plan(state.nl, defaults) - if not plan: - state.plan = [] - state.log.append("No actionable plan could be inferred.") - return state - state.plan = plan - return state +# ============================================================================ +# LangGraph nodes (used only for the execution loop, not for 
pre-plan) +# ============================================================================ def node_execute(state: WFState) -> WFState: if state.current >= len(state.plan): - return state # nothing to do + return state step = state.plan[state.current] - # If this step takes UPRNs from previous CSVs, resolve them now + # Inject UPRNs from previous CSVs if requested if step.get("uprn_from_previous_csvs"): - csvs = materialize_previous_uprn_csvs(state, state.current) + csvs = materialize_previous_uprn_csvs(state) if not csvs: state.log.append("No CSVs found from previous step(s).") - # Fail this step state.current = len(state.plan) return state - step = {**step} + step = dict(step) step.pop("uprn_from_previous_csvs", None) step["uprn"] = csvs + state.plan[state.current] = step # persist the resolved step - # Execute rc, captured = run_query_assist_step( step, state.py_exe, state.qa_path, state.dry_run ) state.log.append(captured) - # Parse any emitted CSV artifacts for downstream use newly_found = _find_csvs_emitted(captured) if newly_found: - extant = state.artifacts.get("csvs", []) - state.artifacts["csvs"] = list(dict.fromkeys(extant + newly_found)) + existing = state.artifacts.get("csvs", []) + state.artifacts["csvs"] = list(dict.fromkeys(existing + newly_found)) - # Advance or stop on error if rc != 0: state.log.append(f"Step {state.current} returned non-zero exit {rc}.") - state.current = len(state.plan) # abort + state.current = len(state.plan) else: state.current += 1 return state @@ -877,16 +527,16 @@ def node_check_done(state: WFState) -> str: return "execute" -# ====================================================================================== -# CLI / main -# ====================================================================================== +# ============================================================================ +# UI helpers +# ============================================================================ def _render_box(title: str, body: str) 
-> str: term_width = shutil.get_terminal_size(fallback=(100, 24)).columns max_width = max(60, min(term_width - 2, 100)) wrap_width = max_width - 4 - body_lines = [] + body_lines: list[str] = [] for para in body.splitlines(): if not para.strip(): body_lines.append("") @@ -905,9 +555,26 @@ def _render_box(title: str, body: str) -> str: return "\n".join([top, sep, content, bot]) +def _print_plan(plan: list[StepSpec], level: int) -> None: + # Print ONLY when we are at INFO (-v). Not at WARNING/DEBUG. + if level == logging.INFO: + print("Plan:") + for i, step in enumerate(plan, 1): + step_disp = { + k: v for k, v in step.items() if k != "uprn_from_previous_csvs" + } + print(f" {i}. {json.dumps(step_disp, ensure_ascii=False)}") + print() + + +# ============================================================================ +# CLI +# ============================================================================ + + def parse_args() -> argparse.Namespace: ap = argparse.ArgumentParser( - description="LangGraph NL workflow CLI for query_assist.py (multi-stage capable)" + description="LangGraph NL workflow CLI for query_assist.py (LLM-only planner)" ) ap.add_argument("--model-id", default="gpt-oss:20b", help="Ollama model name/tag") ap.add_argument( @@ -934,13 +601,12 @@ def parse_args() -> argparse.Namespace: ap.add_argument( "-v", "--verbose", action="count", default=0, help="-v=info, -vv=debug" ) - ap.add_argument( - "--plan-only", action="store_true", help="Only show the compiled plan and exit" - ) + ap.add_argument("--plan-only", action="store_true", dest="plan_only") + ap.set_defaults(plan_only=False) return ap.parse_args() -def main(): +def main() -> None: args = parse_args() # Logging level @@ -952,35 +618,29 @@ def main(): level = logging.WARNING logging.basicConfig(level=level, format="%(levelname)s: %(message)s") - # Intro banner + # Intro banner (print at INFO or DEBUG) if level <= logging.INFO: body = ( - "- Parses NL to a multi-step plan (heuristics + optional 
LLM).\n" - "- Executes steps via LangGraph with artifact passing.\n" - "- Detects CSVs emitted by query_assist.py and feeds them forward.\n" - "- Supports dry-run and plan-only modes for auditability." + "- LLM-only planner (no heuristics).\n" + "- Plan is printed FIRST and only at INFO (-v).\n" + "- Two-stage ODS/OA -> assets enforced when assets are implied.\n" + "- Dry-run and plan-only for auditability." ) print(_render_box(f"LangGraph NL Workflow — {args.model_id}", body)) - # Build the LangGraph + # Build the LangGraph for execution loop only builder = StateGraph(WFState) - builder.add_node("plan", node_plan) builder.add_node("execute", node_execute) - builder.add_edge(START, "plan") - builder.add_conditional_edges( - "plan", lambda s: "execute" if s.plan else END, {"execute": "execute", END: END} - ) + builder.add_edge(START, "execute") builder.add_conditional_edges( "execute", node_check_done, {"execute": "execute", END: END} ) memory = MemorySaver() graph = builder.compile(checkpointer=memory) - # One-shot or REPL def run_once(nl: str) -> int: st = WFState( nl=nl, - plan=[], dry_run=bool(args.dry_run), qa_path=args.query_assist_path, base_url=args.base_url, @@ -994,52 +654,36 @@ def run_once(nl: str) -> int: verbose_level=level, max_steps=args.max_steps, ) - # PLAN - st = _coerce_wfstate( - graph.invoke( - st, config={"configurable": {"thread_id": f"tid-{time.time_ns()}"}} - ) - ) - st = node_plan(st) - if not st.plan: - logging.warning("No actionable plan produced for: %r", nl) - if st.log: - print("\n".join(st.log)) - return 0 - # Print plan only at INFO or DEBUG verbosity - if level <= logging.INFO: - print("Plan:") - for i, step in enumerate(st.plan): - step_disp = { - k: v for k, v in step.items() if k != "uprn_from_previous_csvs" - } - print(f" {i+1}. 
{json.dumps(step_disp, ensure_ascii=False)}") - print() + # === PLAN (LLM-only) === + try: + st.plan = llm_plan(st) + except Exception as e: + logging.warning("Planning failed: %s", e) + return 1 + + # Print plan FIRST, ONLY at INFO + _print_plan(st.plan, level) if args.plan_only: return 0 - # EXECUTE + # === EXECUTE === while st.current < len(st.plan) and st.current < st.max_steps: - status = node_check_done(st) - if status == END: + # Run the node explicitly so we fully control when execution starts + st = graph.invoke( + st, config={"configurable": {"thread_id": f"tid-{time.time_ns()}"}} + ) + if node_check_done(st) == END: break - st = node_execute(st) - if st.current >= len(st.plan): - # Re-check termination - if node_check_done(st) == END: - break - # Final logs (already streamed, but attach any notes) - if st.log: - trailing = [ - line - for line in st.log - if "[dry-run]" in line - or "No CSVs found" in line - or "non-zero exit" in line - ] - if trailing: - print("\n".join(trailing)) + + # Emit trailing notes if any + trailing = [ + ln + for ln in st.log + if any(k in ln for k in ("[dry-run]", "No CSVs found", "non-zero exit")) + ] + if trailing: + print("\n".join(trailing)) return 0 try: From 7f5cbb0a1ed483d46af5f9634e025bc2450606cb Mon Sep 17 00:00:00 2001 From: gnathoi Date: Tue, 26 Aug 2025 17:09:51 +0100 Subject: [PATCH 12/19] eod update --- examples/nl_query_graph.py | 600 +++++++++++++++++++------------------ 1 file changed, 306 insertions(+), 294 deletions(-) diff --git a/examples/nl_query_graph.py b/examples/nl_query_graph.py index 38e1ccb..9186a07 100644 --- a/examples/nl_query_graph.py +++ b/examples/nl_query_graph.py @@ -1,25 +1,17 @@ #!/usr/bin/env python3 """ -nl_query_graph.py — LLM-only planner, plan-first, two-stage ODS - -A lean rewrite of the previous LangGraph CLI that: -- Prints the plan FIRST (and ONLY when running at INFO, i.e., with -v) -- Uses an LLM-only planner (no heuristics) with a stronger prompt -- Guarantees a two-stage plan 
for ODS/Output Area when the request implies assets -- Keeps dry-run and plan-only modes -- Preserves artifact detection (CSV) and feeds them into download steps -- Adds robust retries and graceful fallback when Ollama returns empty/invalid content -- Understands both Ollama-native and OpenAI-style response shapes (choices[0].message.content) - -Requirements: - pip install langgraph requests - -Notes: -- We keep LangGraph for structure/checkpointing, but we do not auto-run it before - printing the plan. We explicitly call plan -> print -> execute. -- The LLM prompt is opinionated to produce two steps when the user asks for - assets, even if types are not specified (types omitted === all types). +nl_query_graph.py — LLM-only planner, plan-first, OA/ODS→assets, supports direct UPRN + +Fixes: +- Correct LangGraph conditional mapping: return label strings ("execute"/"end") + and map {"execute": "execute", "end": END}. This eliminates the infinite loop. +- Remove the external invoke loop; let LangGraph run to END in a single invoke. +- Allow single-step plans when UPRNs are provided directly in the NL query. +- Keep CSV auto-handoff and robust Ollama fallback (/api/chat then /api/generate). 
+ +Requirements: pip install langgraph requests """ + from __future__ import annotations import argparse @@ -33,17 +25,15 @@ import sys import textwrap import time -from dataclasses import dataclass, field from typing import Any, Literal, TypedDict -# Third-party import requests from langgraph.checkpoint.memory import MemorySaver from langgraph.graph import END, START, StateGraph -# ============================================================================ -# Strong LLM planning prompt (LLM-only) -# ============================================================================ +# ───────────────────────────────────────────────────────────────────────────── +# Planner prompt +# ───────────────────────────────────────────────────────────────────────────── PLAN_SYSTEM_PROMPT = r""" You are a planning assistant that converts a natural language request about retrieving built environment asset data into an ordered execution plan for @@ -97,9 +87,9 @@ """ -# ============================================================================ -# Types & State -# ============================================================================ +# ───────────────────────────────────────────────────────────────────────────── +# Types +# ───────────────────────────────────────────────────────────────────────────── class StepSpec(TypedDict, total=False): command: Literal["download_assets", "ods_to_uprn", "uprns_by_output_area"] uprn: list[str] | str | None @@ -113,34 +103,31 @@ class StepSpec(TypedDict, total=False): uprn_from_previous_csvs: bool -@dataclass -class WFState: +class WFState(TypedDict, total=False): nl: str - plan: list[StepSpec] = field(default_factory=list) - current: int = 0 - artifacts: dict[str, Any] = field(default_factory=dict) # e.g., {"csvs": [...]} - log: list[str] = field(default_factory=list) - dry_run: bool = False - py_exe: str = sys.executable - qa_path: str = os.path.join(os.path.dirname(__file__), "query_assist.py") - base_url: str = 
os.environ.get("OLLAMA_HOST", "http://localhost:11434") - model_id: str = "gpt-oss:20b" - temperature: float = 0.0 - top_p: float = 0.95 - num_predict: int = 256 - num_ctx: int | None = None - keep_alive: str | None = None - force_json: bool = True - verbose_level: int = logging.INFO - max_steps: int = 8 - - -# ============================================================================ -# LLM plumbing (robust to sparse/odd Ollama responses) -# ============================================================================ - - -def _post_ollama( + plan: list[StepSpec] + current: int + artifacts: dict[str, Any] + log: list[str] + dry_run: bool + py_exe: str + qa_path: str + base_url: str + model_id: str + temperature: float + top_p: float + num_predict: int + num_ctx: int | None + keep_alive: str | None + force_json: bool + verbose_level: int + max_steps: int + + +# ───────────────────────────────────────────────────────────────────────────── +# Ollama client (robust) +# ───────────────────────────────────────────────────────────────────────────── +def _ollama_chat( base_url: str, payload: dict[str, Any], timeout_s: float = 120.0 ) -> dict[str, Any]: url = base_url.rstrip("/") + "/api/chat" @@ -149,31 +136,20 @@ def _post_ollama( return r.json() -def _extract_first_json(text: str) -> dict | None: - start = text.find("{") - if start == -1: - return None - depth = 0 - for i, c in enumerate(text[start:], start=start): - if c == "{": - depth += 1 - elif c == "}": - depth -= 1 - if depth == 0: - try: - return json.loads(text[start : i + 1]) - except Exception: - return None - return None +def _ollama_generate( + base_url: str, payload: dict[str, Any], timeout_s: float = 120.0 +) -> dict[str, Any]: + url = base_url.rstrip("/") + "/api/generate" + r = requests.post(url, json=payload, timeout=(5.0, timeout_s)) + r.raise_for_status() + return r.json() def _extract_content_variants(resp: dict[str, Any]) -> str: - # Try common shapes returned by Ollama and proxies if 
isinstance(resp.get("message"), dict): c = resp["message"].get("content") if c: return c - # OpenAI/vLLM/LM Studio style choices = resp.get("choices") if isinstance(choices, list) and choices: first = choices[0] or {} @@ -181,15 +157,12 @@ def _extract_content_variants(resp: dict[str, Any]) -> str: c = msg.get("content") or first.get("text") if isinstance(c, str) and c: return c - # Some servers use top-level 'response' c = resp.get("response") if isinstance(c, str) and c: return c - # Rare: top-level 'content' c = resp.get("content") if isinstance(c, str) and c: return c - # Some wrappers return {'messages': [{'content': ...}]} msgs = resp.get("messages") if isinstance(msgs, list) and msgs and isinstance(msgs[-1], dict): c = msgs[-1].get("content") @@ -198,25 +171,48 @@ def _extract_content_variants(resp: dict[str, Any]) -> str: return "" -def ollama_chat( +def _extract_first_json(text: str) -> dict | None: + if not isinstance(text, str): + return None + s = text.find("{") + if s == -1: + return None + depth = 0 + for i, ch in enumerate(text[s:], start=s): + if ch == "{": + depth += 1 + elif ch == "}": + depth -= 1 + if depth == 0: + try: + return json.loads(text[s : i + 1]) + except Exception: + return None + return None + + +def ollama_plan( + *, base_url: str, model: str, - messages: list[dict[str, str]], - *, - temperature: float = 0.0, - top_p: float = 0.95, - num_predict: int = 256, - num_ctx: int | None = None, - keep_alive: str | None = None, - force_json: bool = True, + system_prompt: str, + user_prompt: str, + temperature: float, + top_p: float, + num_predict: int, + num_ctx: int | None, + keep_alive: str | None, + force_json: bool, timeout_s: float = 120.0, retries: int = 2, - logger: logging.Logger | None = None, ) -> str: - """Return assistant content as a string. 
Retries with/without JSON forcing.""" - last_err: Exception | None = None + messages = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt}, + ] + # Try /api/chat with JSON then without for attempt in range(retries + 1): - payload: dict[str, Any] = { + payload = { "model": model, "messages": messages, "stream": False, @@ -226,68 +222,74 @@ def ollama_chat( "num_predict": int(num_predict), }, } - use_json = force_json if attempt == 0 else False if num_ctx is not None: payload["options"]["num_ctx"] = int(num_ctx) if keep_alive: payload["keep_alive"] = str(keep_alive) - if use_json: + if attempt == 0 and force_json: payload["format"] = "json" try: - resp = _post_ollama(base_url, payload, timeout_s=timeout_s) - content = _extract_content_variants(resp) - if logger and logger.isEnabledFor(logging.DEBUG): - logger.debug("Ollama raw keys: %s", list(resp.keys())) - logger.debug("Ollama content preview: %r", content[:200]) - if content: - return content - last_err = RuntimeError("empty content from Ollama") - except Exception as e: - last_err = e - # brief backoff - time.sleep(0.2 * (attempt + 1)) - if last_err: - raise last_err - raise RuntimeError("Ollama returned no content.") + resp = _ollama_chat(base_url, payload, timeout_s=timeout_s) + c = _extract_content_variants(resp) + if c: + return c + except Exception: + pass + time.sleep(0.15 * (attempt + 1)) + # Fallback /api/generate + try: + prompt = ( + f"<>\n{system_prompt}\n<>\n\nUser:\n{user_prompt}\n\nAssistant:" + ) + payload = { + "model": model, + "prompt": prompt, + "stream": False, + "options": { + "temperature": float(temperature), + "top_p": float(top_p), + "num_predict": int(num_predict), + }, + } + if force_json: + payload["format"] = "json" + resp = _ollama_generate(base_url, payload, timeout_s=timeout_s) + c = _extract_content_variants(resp) + if c: + return c + except Exception: + pass + return "" +# 
───────────────────────────────────────────────────────────────────────────── +# Planning +# ───────────────────────────────────────────────────────────────────────────── def llm_plan(state: WFState) -> list[StepSpec]: - messages = [ - {"role": "system", "content": PLAN_SYSTEM_PROMPT}, - {"role": "user", "content": state.nl}, - ] - # Try calling the planner; on failure, DO NOT abort. Fall through to synthesis. - try: - content = ollama_chat( - base_url=state.base_url, - model=state.model_id, - messages=messages, - temperature=state.temperature, - top_p=state.top_p, - num_predict=state.num_predict, - num_ctx=state.num_ctx, - keep_alive=state.keep_alive, - force_json=state.force_json, - retries=2, - logger=logging.getLogger(__name__), - ) - except Exception as e: - logging.getLogger(__name__).warning("Planner LLM call failed: %s", e) - content = "" + content = ollama_plan( + base_url=state["base_url"], + model=state["model_id"], + system_prompt=PLAN_SYSTEM_PROMPT, + user_prompt=state["nl"], + temperature=state.get("temperature", 0.0), + top_p=state.get("top_p", 0.95), + num_predict=state.get("num_predict", 256), + num_ctx=state.get("num_ctx"), + keep_alive=state.get("keep_alive"), + force_json=state.get("force_json", True), + ) - obj: dict | None - try: - obj = json.loads(content) - except Exception: - obj = _extract_first_json(content) - if not obj or not isinstance(obj, dict): - # LAST-DITCH guardrail: if the LLM truly failed to produce JSON but the NL - # clearly requests assets with ODS/OA, synthesize a minimal two-step plan - # rather than crashing. This is NOT a heuristic router; it's a fail-safe - # to keep execution usable when the model returns empty text. 
- lowered = state.nl.lower() - ods = re.findall(r"\b[A-Z]\d{5}\b", state.nl) - oa = re.findall(r"\bE\d{8}\b", state.nl) + obj: dict | None = None + if content: + try: + obj = json.loads(content) + except Exception: + obj = _extract_first_json(content) + + # If LLM output is unusable, synthesize from the NL string + if not obj or not isinstance(obj, dict) or not isinstance(obj.get("steps"), list): + nl = state["nl"] + lowered = nl.lower() implies_assets = any( w in lowered for w in ( @@ -300,26 +302,29 @@ def llm_plan(state: WFState) -> list[StepSpec]: "lidar", ) ) - if implies_assets and (ods or oa): - if ods: - return [ - {"command": "ods_to_uprn", "ods": ods}, - {"command": "download_assets", "uprn_from_previous_csvs": True}, - ] - if oa: - return [ - {"command": "uprns_by_output_area", "output_area": oa}, - {"command": "download_assets", "uprn_from_previous_csvs": True}, - ] - # If we get here, there is nothing actionable — report the original issue. - raise RuntimeError("LLM did not return a valid JSON object.") - - steps_raw = obj.get("steps") - if not isinstance(steps_raw, list) or not steps_raw: - raise RuntimeError("LLM returned an empty steps list.") + # Detect UPRNs (10–14 digit sequences are typical); be conservative + uprns = re.findall(r"\b\d{10,14}\b", nl) + ods = re.findall(r"\b[A-Z]\d{5}\b", nl) + oa = re.findall(r"\bE\d{8}\b", nl) + if implies_assets and uprns: + return [{"command": "download_assets", "uprn": uprns}] + if implies_assets and ods: + return [ + {"command": "ods_to_uprn", "ods": ods}, + {"command": "download_assets", "uprn_from_previous_csvs": True}, + ] + if implies_assets and oa: + return [ + {"command": "uprns_by_output_area", "output_area": oa}, + {"command": "download_assets", "uprn_from_previous_csvs": True}, + ] + raise RuntimeError( + "Planner produced no actionable steps; provide UPRN(s), ODS or Output Area." 
+ ) + # Normalize steps from LLM JSON steps: list[StepSpec] = [] - for s in steps_raw: + for s in obj["steps"]: if not isinstance(s, dict): continue cmd = s.get("command") @@ -341,35 +346,34 @@ def llm_plan(state: WFState) -> list[StepSpec]: step[key] = s[key] # type: ignore[index] steps.append(step) - # Ensure two-stage plan if assets implied - lowered = state.nl.lower() + # Enforce download step only when a mapping step exists (don’t force for direct UPRN) + lowered = state["nl"].lower() implies_assets = any( w in lowered - for w in ["asset", "assets", "download", "point cloud", "rgb", "image", "lidar"] + for w in ("asset", "assets", "download", "point cloud", "rgb", "image", "lidar") ) has_mapping = any( st.get("command") in {"ods_to_uprn", "uprns_by_output_area"} for st in steps ) has_download = any(st.get("command") == "download_assets" for st in steps) if implies_assets and has_mapping and not has_download: - steps.append( - { - "command": "download_assets", - "uprn_from_previous_csvs": True, - } - ) + steps.append({"command": "download_assets", "uprn_from_previous_csvs": True}) + + # If a download step exists but lacks UPRNs, consume prior CSVs + for st in steps: + if st.get("command") == "download_assets" and not st.get("uprn"): + st["uprn_from_previous_csvs"] = True return steps -# ============================================================================ +# ───────────────────────────────────────────────────────────────────────────── # Execution helpers -# ============================================================================ - - +# ───────────────────────────────────────────────────────────────────────────── def _build_argv(spec: StepSpec, py: str, qa_path: str) -> list[str]: - cmd = [sys.executable if not py else py, qa_path] + cmd = [py or sys.executable, qa_path] command = spec.get("command") + if command == "download_assets": uprn = spec.get("uprn") if isinstance(uprn, list): @@ -378,7 +382,7 @@ def _build_argv(spec: StepSpec, py: str, 
qa_path: str) -> list[str]: uprn_list = [uprn] else: raise ValueError( - "download_assets requires 'uprn' unless 'uprn_from_previous_csvs' is used earlier." + "download_assets requires 'uprn' unless 'uprn_from_previous_csvs' is set." ) cmd += ["--uprn"] + uprn_list if spec.get("sensor"): @@ -422,116 +426,122 @@ def run_query_assist_step( dry_run: bool, env: dict[str, str] | None = None, ) -> tuple[int, str]: - """Execute a single step via subprocess, stream logs, and return (rc, captured_text).""" argv = _build_argv(step, py_exe, qa_path) printable = " ".join(shlex.quote(x) for x in argv) + logging.info("Executing step: %s", json.dumps(step, ensure_ascii=False)) logging.info("Command: %s", printable) if dry_run: return 0, f"[dry-run] {printable}\n" - p = subprocess.Popen( - argv, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, env=env - ) - captured_lines: list[str] = [] try: - assert p.stdout is not None - for line in p.stdout: - sys.stdout.write(line) - captured_lines.append(line) - finally: - rc = p.wait() - return rc, "".join(captured_lines) - - -def _find_csvs_emitted(stream_text: str) -> list[str]: - """Parse query_assist.py logs to discover created CSVs.""" - csvs: list[str] = [] - patterns = [ - r"Saved CSV for .*? → ([^\s]+\.csv)", - r"Saved CSV for .*? -> ([^\s]+\.csv)", - r"Saved ODS.?UPRN CSV .*? → ([^\s]+\.csv)", - r"Saved ODS.?UPRN CSV .*? 
-> ([^\s]+\.csv)", + p = subprocess.Popen( + argv, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, env=env + ) + except FileNotFoundError as e: + return 127, f"[spawn-failed] {e}\n" + except Exception as e: + return 1, f"[spawn-failed] {e}\n" + + captured: list[str] = [] + assert p.stdout is not None + for line in p.stdout: + sys.stdout.write(line) # stream-through to console + captured.append(line) + rc = p.wait() + return rc, "".join(captured) + + +def _find_csvs_emitted(text: str) -> list[str]: + pats = [ + r"✔?\s*Saved\s+(?:ODS.?→?UPRN|ODS.?to.?UPRN)\s*CSV\s*[–\-→>]\s*([^\s]+\.csv)", + r"✔?\s*Saved\s+(?:OA.?→?UPRN|OA.?to.?UPRN|Output\s*Area.?→?UPRN)\s*CSV\s*[–\-→>]\s*([^\s]+\.csv)", + r"Saved\s*CSV\s*for\s*.*?[–\-→>]\s*([^\s]+\.csv)", + r"✔\s*Saved\s*ODS.?→?UPRN\s*CSV\s*→\s*([^\s]+\.csv)", ] - for pat in patterns: - for m in re.finditer(pat, stream_text): - csvs.append(m.group(1)) - # Deduplicate preserving order - seen: set[str] = set() - out: list[str] = [] - for pth in csvs: - if pth not in seen: - out.append(pth) - seen.add(pth) + out, seen = [], set() + for pat in pats: + for m in re.finditer(pat, text, flags=re.IGNORECASE): + path = m.group(1) + if path not in seen: + out.append(path) + seen.add(path) return out def materialize_previous_uprn_csvs(state: WFState) -> list[str]: - """Return list of CSV paths produced by earlier steps.""" - from_logs = state.artifacts.get("csvs", []) + from_logs = state.get("artifacts", {}).get("csvs", []) if from_logs: - return from_logs - # Fallback: common default path from ODS mapping - dl_base = os.path.join(os.getcwd(), "downloads") - candidate = os.path.join(dl_base, "ods_to_uprn.csv") - if os.path.isfile(candidate): - return [candidate] - return [] - - -# ============================================================================ -# LangGraph nodes (used only for the execution loop, not for pre-plan) -# ============================================================================ + return list(from_logs) 
+ candidates = [ + os.path.join(os.getcwd(), "downloads", "ods_to_uprn.csv"), + os.path.join(os.getcwd(), "downloads", "oa_to_uprn.csv"), + os.path.join(os.getcwd(), "downloads", "uprns_by_output_area.csv"), + ] + return [p for p in candidates if os.path.isfile(p)] +# ───────────────────────────────────────────────────────────────────────────── +# LangGraph nodes +# ───────────────────────────────────────────────────────────────────────────── def node_execute(state: WFState) -> WFState: - if state.current >= len(state.plan): + state.setdefault("artifacts", {}) + state.setdefault("log", []) + state.setdefault("current", 0) + + if state["current"] >= len(state.get("plan", [])): return state - step = state.plan[state.current] - # Inject UPRNs from previous CSVs if requested - if step.get("uprn_from_previous_csvs"): + step: StepSpec = state["plan"][state["current"]] + + # Inject CSVs for download_assets when needed + if step.get("command") == "download_assets" and not step.get("uprn"): csvs = materialize_previous_uprn_csvs(state) - if not csvs: - state.log.append("No CSVs found from previous step(s).") - state.current = len(state.plan) + if not csvs and step.get("uprn_from_previous_csvs"): + state["log"].append("No CSVs found from previous step(s).") + state["current"] = len(state["plan"]) return state - step = dict(step) - step.pop("uprn_from_previous_csvs", None) - step["uprn"] = csvs - state.plan[state.current] = step # persist the resolved step + if csvs: + step = dict(step) + step.pop("uprn_from_previous_csvs", None) + step["uprn"] = csvs + state["plan"][state["current"]] = step rc, captured = run_query_assist_step( - step, state.py_exe, state.qa_path, state.dry_run + step, + state.get("py_exe", sys.executable), + state["qa_path"], + state.get("dry_run", False), ) - state.log.append(captured) + state["log"].append(captured) - newly_found = _find_csvs_emitted(captured) - if newly_found: - existing = state.artifacts.get("csvs", []) - state.artifacts["csvs"] = 
list(dict.fromkeys(existing + newly_found)) + newly = _find_csvs_emitted(captured) + if newly: + existing = state["artifacts"].get("csvs", []) + state["artifacts"]["csvs"] = list(dict.fromkeys(list(existing) + newly)) if rc != 0: - state.log.append(f"Step {state.current} returned non-zero exit {rc}.") - state.current = len(state.plan) + state["log"].append(f"Step {state['current']} returned non-zero exit {rc}.") + state["current"] = len(state["plan"]) else: - state.current += 1 + state["current"] += 1 return state def node_check_done(state: WFState) -> str: - if state.current >= len(state.plan): - return END - if state.current >= state.max_steps: - state.log.append(f"Aborting: exceeded max_steps={state.max_steps}") - return END + # IMPORTANT: return string labels, not END sentinel + if state.get("current", 0) >= len(state.get("plan", [])): + return "end" + if state.get("current", 0) >= state.get("max_steps", 8): + state.setdefault("log", []).append( + f"Aborting: exceeded max_steps={state.get('max_steps', 8)}" + ) + return "end" return "execute" -# ============================================================================ +# ───────────────────────────────────────────────────────────────────────────── # UI helpers -# ============================================================================ - - +# ───────────────────────────────────────────────────────────────────────────── def _render_box(title: str, body: str) -> str: term_width = shutil.get_terminal_size(fallback=(100, 24)).columns max_width = max(60, min(term_width - 2, 100)) @@ -556,22 +566,17 @@ def _render_box(title: str, body: str) -> str: def _print_plan(plan: list[StepSpec], level: int) -> None: - # Print ONLY when we are at INFO (-v). Not at WARNING/DEBUG. if level == logging.INFO: print("Plan:") for i, step in enumerate(plan, 1): - step_disp = { - k: v for k, v in step.items() if k != "uprn_from_previous_csvs" - } - print(f" {i}. 
{json.dumps(step_disp, ensure_ascii=False)}") + display = {k: v for k, v in step.items() if k != "uprn_from_previous_csvs"} + print(f" {i}. {json.dumps(display, ensure_ascii=False)}") print() -# ============================================================================ +# ───────────────────────────────────────────────────────────────────────────── # CLI -# ============================================================================ - - +# ───────────────────────────────────────────────────────────────────────────── def parse_args() -> argparse.Namespace: ap = argparse.ArgumentParser( description="LangGraph NL workflow CLI for query_assist.py (LLM-only planner)" @@ -609,7 +614,7 @@ def parse_args() -> argparse.Namespace: def main() -> None: args = parse_args() - # Logging level + # Logging if args.verbose >= 2: level = logging.DEBUG elif args.verbose == 1: @@ -618,7 +623,7 @@ def main() -> None: level = logging.WARNING logging.basicConfig(level=level, format="%(levelname)s: %(message)s") - # Intro banner (print at INFO or DEBUG) + # Banner if level <= logging.INFO: body = ( "- LLM-only planner (no heuristics).\n" @@ -628,59 +633,66 @@ def main() -> None: ) print(_render_box(f"LangGraph NL Workflow — {args.model_id}", body)) - # Build the LangGraph for execution loop only - builder = StateGraph(WFState) + # Build LangGraph (dict state) — IMPORTANT: label mapping uses string keys + builder = StateGraph(dict) builder.add_node("execute", node_execute) builder.add_edge(START, "execute") builder.add_conditional_edges( - "execute", node_check_done, {"execute": "execute", END: END} + "execute", node_check_done, {"execute": "execute", "end": END} ) memory = MemorySaver() graph = builder.compile(checkpointer=memory) def run_once(nl: str) -> int: - st = WFState( - nl=nl, - dry_run=bool(args.dry_run), - qa_path=args.query_assist_path, - base_url=args.base_url, - model_id=args.model_id, - temperature=args.temperature, - top_p=args.top_p, - num_predict=args.num_predict, - 
num_ctx=args.num_ctx, - keep_alive=args.keep_alive, - force_json=(not args.no_force_json), - verbose_level=level, - max_steps=args.max_steps, - ) + st: WFState = { + "nl": nl, + "plan": [], + "current": 0, + "artifacts": {}, + "log": [], + "dry_run": bool(args.dry_run), + "py_exe": sys.executable, + "qa_path": args.query_assist_path, + "base_url": args.base_url, + "model_id": args.model_id, + "temperature": args.temperature, + "top_p": args.top_p, + "num_predict": args.num_predict, + "num_ctx": args.num_ctx, + "keep_alive": args.keep_alive, + "force_json": (not args.no_force_json), + "verbose_level": level, + "max_steps": args.max_steps, + } - # === PLAN (LLM-only) === + # PLAN try: - st.plan = llm_plan(st) + st["plan"] = llm_plan(st) except Exception as e: - logging.warning("Planning failed: %s", e) + logging.info("Planning failed: %s", e) return 1 - # Print plan FIRST, ONLY at INFO - _print_plan(st.plan, level) + _print_plan(st["plan"], level) if args.plan_only: return 0 - # === EXECUTE === - while st.current < len(st.plan) and st.current < st.max_steps: - # Run the node explicitly so we fully control when execution starts - st = graph.invoke( - st, config={"configurable": {"thread_id": f"tid-{time.time_ns()}"}} - ) - if node_check_done(st) == END: - break + # EXECUTE: single invoke to END (no external loop) + final_state = graph.invoke( + st, config={"configurable": {"thread_id": f"tid-{time.time_ns()}"}} + ) - # Emit trailing notes if any trailing = [ ln - for ln in st.log - if any(k in ln for k in ("[dry-run]", "No CSVs found", "non-zero exit")) + for ln in final_state.get("log", []) + if any( + k in ln + for k in ( + "[dry-run]", + "No CSVs found", + "non-zero exit", + "[spawn-failed]", + ) + ) ] if trailing: print("\n".join(trailing)) @@ -704,7 +716,7 @@ def run_once(nl: str) -> int: break rc = run_once(nl) if rc != 0: - logging.warning("Workflow exited with code %d", rc) + logging.info("Workflow exited with code %d", rc) except KeyboardInterrupt: print() 
logging.info("Interrupted.") From 4e51e57ac3fef23ec9830d35060ba305eb9ce275 Mon Sep 17 00:00:00 2001 From: gnathoi Date: Wed, 27 Aug 2025 14:30:37 +0100 Subject: [PATCH 13/19] change: working --- examples/nl_query_graph.py | 1360 ++++++++++++++++++++++-------------- 1 file changed, 830 insertions(+), 530 deletions(-) diff --git a/examples/nl_query_graph.py b/examples/nl_query_graph.py index 9186a07..51f9ea6 100644 --- a/examples/nl_query_graph.py +++ b/examples/nl_query_graph.py @@ -1,20 +1,36 @@ #!/usr/bin/env python3 """ -nl_query_graph.py — LLM-only planner, plan-first, OA/ODS→assets, supports direct UPRN - -Fixes: -- Correct LangGraph conditional mapping: return label strings ("execute"/"end") - and map {"execute": "execute", "end": END}. This eliminates the infinite loop. -- Remove the external invoke loop; let LangGraph run to END in a single invoke. -- Allow single-step plans when UPRNs are provided directly in the NL query. -- Keep CSV auto-handoff and robust Ollama fallback (/api/chat then /api/generate). 
- -Requirements: pip install langgraph requests +LangGraph NL workflow for query_assist.py — single- and two-stage queries + +Capabilities +------------ +• One-stage: download assets directly for given UPRN(s) / UPRN CSV +• Two-stage: (a) ODS → UPRN → download assets (b) Output Areas → UPRN → download assets +• Optional filters: sensor, types; optional overrides: download_dir, api_key_env, db_url +• LLM (Ollama) planner first; heuristic fallback; single-step router last +• Robust parsing of query_assist.py logs to discover emitted CSVs and feed them forward + +Examples +-------- +# Direct assets (one-stage) +python3 nl_query_graph.py -q "Download merged lidar for UPRN 5045394 to /data" + +# ODS → UPRN → assets (two-stage) +python3 nl_query_graph.py -q "For ODS G85013 download RGB and merged lidar to /data" + +# Output areas → UPRN → assets (two-stage) +python3 nl_query_graph.py -q "Get point clouds in output area E00004550" + +Notes +----- +• Requires: langgraph (pip install langgraph[all]), requests +• Uses Ollama for optional planning: set OLLAMA_HOST or --base-url if needed +• Defaults match query_assist.py: API key env var defaults to API_KEY, downloads to ./downloads """ - from __future__ import annotations import argparse +import dataclasses import json import logging import os @@ -27,386 +43,184 @@ import time from typing import Any, Literal, TypedDict +# Third-party import requests from langgraph.checkpoint.memory import MemorySaver from langgraph.graph import END, START, StateGraph -# ───────────────────────────────────────────────────────────────────────────── -# Planner prompt -# ───────────────────────────────────────────────────────────────────────────── -PLAN_SYSTEM_PROMPT = r""" -You are a planning assistant that converts a natural language request about -retrieving built environment asset data into an ordered execution plan for -query_assist.py. - -Available CLI commands: - 1. uprns_by_output_area – given output area code(s) yield UPRN CSV(s) - 2. 
ods_to_uprn – given ODS clinical practice code(s) yield UPRN CSV - 3. download_assets – given UPRN(s) (list or CSV path) download assets - -Asset type IRIs (use when mentioned or implied; omit "types" to mean ALL types): - did:rgb-image, did:lidar-pointcloud-merged, did:lidar-pointcloud-frame, - did:lidar-range-pano, did:lidar-reflectance-pano, did:lidar-signal-pano, - did:lidar-nearir-pano, did:ir-false-color-image, did:ir-temperature-array, - did:ir-count-image, did:celsius-temperature, did:relative-humidity - -Synonyms / interpretation guidance (LLM, be decisive): - "building"/"property" -> UPRN(s) - "practice"/"gp practice" -> ODS code - "point clouds" (plural) -> include merged and frame unless otherwise specified - "thermal image" -> did:ir-false-color-image unless arrays are explicitly requested - -CRITICAL RULES: - • If the user asks to get/download assets and provides ODS or Output Area, - ALWAYS output a TWO-STEP plan: the mapping step first, then a download_assets - step that consumes the CSVs from the previous step. Use - {"uprn_from_previous_csvs": true} on that download step. - • If the user says "all assets" or does not specify types, omit the "types" - field entirely on download_assets to indicate ALL types. - • Prefer being decisive and avoid asking questions. - -JSON output schema (emit ONLY one JSON object): -{ - "steps": [ - { - "command": "uprns_by_output_area"|"ods_to_uprn"|"download_assets", - "output_area": string|[string]|null, - "ods": string|[string]|null, - "uprn": string|[string]|null, - "types": [string]|null, - "sensor": string|null, - "download_dir": string|null, - "api_key_env": string|null, - "db_url": string|null, - "uprn_from_previous_csvs": true|false|null - }, ... 
- ] -} +# ====================================================================================== +# Prompts & canonical type mappings (kept consistent with nl_query_cli.py / README) +# ====================================================================================== -Do not include any prose. Output the JSON object only. -""" +SYSTEM_ROUTER_PROMPT = """You are a rigorous function-call router for a Python CLI named query_assist.py. +Supported commands and how to populate them: -# ───────────────────────────────────────────────────────────────────────────── -# Types -# ───────────────────────────────────────────────────────────────────────────── -class StepSpec(TypedDict, total=False): - command: Literal["download_assets", "ods_to_uprn", "uprns_by_output_area"] - uprn: list[str] | str | None - ods: list[str] | str | None - output_area: list[str] | str | None - sensor: str | None - types: list[str] | None - download_dir: str | None - api_key_env: str | None - db_url: str | None - uprn_from_previous_csvs: bool +1) download_assets + Required: uprn (string CSV path OR array of strings like ["5045394","200003455212"]) + Optional: sensor (string, e.g., "bess:OusterLidarSensor") + types (array of strings; each a type IRI, e.g., ["did:rgb-image","did:lidar-pointcloud-merged"]) + download_dir (string path) + api_key_env (string, name of env var with API key) + db_url (string URL to SPARQL endpoint) +2) ods_to_uprn + Required: ods (string CSV path OR array of strings like ["G85013","Q12345"]) -class WFState(TypedDict, total=False): - nl: str - plan: list[StepSpec] - current: int - artifacts: dict[str, Any] - log: list[str] - dry_run: bool - py_exe: str - qa_path: str - base_url: str - model_id: str - temperature: float - top_p: float - num_predict: int - num_ctx: int | None - keep_alive: str | None - force_json: bool - verbose_level: int - max_steps: int - - -# ───────────────────────────────────────────────────────────────────────────── -# Ollama client (robust) -# 
───────────────────────────────────────────────────────────────────────────── -def _ollama_chat( - base_url: str, payload: dict[str, Any], timeout_s: float = 120.0 -) -> dict[str, Any]: - url = base_url.rstrip("/") + "/api/chat" - r = requests.post(url, json=payload, timeout=(5.0, timeout_s)) - r.raise_for_status() - return r.json() +3) uprns_by_output_area + Required: output_area (string CSV path OR array of strings, e.g., ["E00004550","E00032882"]) +Schema (MUST output exactly one JSON object with these keys as needed): +{ + "command": "download_assets" | "ods_to_uprn" | "uprns_by_output_area", + "uprn": string | string[] | null, + "ods": string | string[] | null, + "output_area": string | string[] | null, + "sensor": string | null, + "types": string[] | null, + "download_dir": string | null, + "api_key_env": string | null, + "db_url": string | null +} -def _ollama_generate( - base_url: str, payload: dict[str, Any], timeout_s: float = 120.0 -) -> dict[str, Any]: - url = base_url.rstrip("/") + "/api/generate" - r = requests.post(url, json=payload, timeout=(5.0, timeout_s)) - r.raise_for_status() - return r.json() +Constraints: +- Return ONLY the JSON object. No prose, no markdown. +- If the user request implies asset types, map them to the supported IRIs if possible: + - RGB image -> "did:rgb-image" + - merged lidar point cloud -> "did:lidar-pointcloud-merged" + - lidar range panorama -> "did:lidar-range-pano" + - lidar reflectance panorama -> "did:lidar-reflectance-pano" + - lidar signal panorama -> "did:lidar-signal-pano" + - lidar near-infrared panorama -> "did:lidar-nearir-pano" + - IR false color -> "did:ir-false-color-image" + - IR temperature array -> "did:ir-temperature-array" + - IR counts -> "did:ir-count-image" + - temperature (no contentUrl) -> "did:celsius-temperature" + - relative humidity (no contentUrl) -> "did:relative-humidity" +- Prefer being decisive. When in doubt, infer sensible defaults. 
+""" + +TYPE_ALIASES = { + "rgb": "did:rgb-image", + "rgb image": "did:rgb-image", + "merged lidar": "did:lidar-pointcloud-merged", + "merged lidar point cloud": "did:lidar-pointcloud-merged", + "lidar point cloud": "did:lidar-pointcloud-frame", + "point cloud": "did:lidar-pointcloud-frame", + "point clouds": None, # expands to both merged + frame + "lidar range panorama": "did:lidar-range-pano", + "lidar reflectance panorama": "did:lidar-reflectance-pano", + "lidar signal panorama": "did:lidar-signal-pano", + "lidar nearir panorama": "did:lidar-nearir-pano", + "ir false color": "did:ir-false-color-image", + "ir temperature array": "did:ir-temperature-array", + "ir counts": "did:ir-count-image", + "temperature": "did:celsius-temperature", + "relative humidity": "did:relative-humidity", +} +POINTCLOUD_BOTH = ["did:lidar-pointcloud-merged", "did:lidar-pointcloud-frame"] -def _extract_content_variants(resp: dict[str, Any]) -> str: - if isinstance(resp.get("message"), dict): - c = resp["message"].get("content") - if c: - return c - choices = resp.get("choices") - if isinstance(choices, list) and choices: - first = choices[0] or {} - msg = first.get("message") or {} - c = msg.get("content") or first.get("text") - if isinstance(c, str) and c: - return c - c = resp.get("response") - if isinstance(c, str) and c: - return c - c = resp.get("content") - if isinstance(c, str) and c: - return c - msgs = resp.get("messages") - if isinstance(msgs, list) and msgs and isinstance(msgs[-1], dict): - c = msgs[-1].get("content") - if isinstance(c, str) and c: - return c - return "" +# ====================================================================================== +# Helpers +# ====================================================================================== + + +def _render_box(title: str, body: str) -> str: + term_width = shutil.get_terminal_size(fallback=(100, 24)).columns + max_width = max(60, min(term_width - 2, 100)) + wrap_width = max_width - 4 + body_lines = [] + for 
para in body.splitlines(): + if not para.strip(): + body_lines.append("") + else: + body_lines.extend(textwrap.wrap(para, width=wrap_width)) + title = title.strip() + title_line = f" {title} " + top = "┌" + "─" * (max_width - 2) + "┐" + sep = "├" + "─" * (max_width - 2) + "┤" + bot = "└" + "─" * (max_width - 2) + "┘" + if len(title_line) <= (max_width - 2): + left = (max_width - 2 - len(title_line)) // 2 + right = max_width - 2 - len(title_line) - left + top = "┌" + "─" * left + title_line + "─" * right + "┐" + content = "\n".join("│ " + line.ljust(max_width - 4) + " │" for line in body_lines) + return "\n".join([top, sep, content, bot]) def _extract_first_json(text: str) -> dict | None: - if not isinstance(text, str): - return None - s = text.find("{") - if s == -1: + start = text.find("{") + if start == -1: return None depth = 0 - for i, ch in enumerate(text[s:], start=s): - if ch == "{": + for i, c in enumerate(text[start:], start=start): + if c == "{": depth += 1 - elif ch == "}": + elif c == "}": depth -= 1 if depth == 0: try: - return json.loads(text[s : i + 1]) + return json.loads(text[start : i + 1]) except Exception: return None return None -def ollama_plan( - *, - base_url: str, - model: str, - system_prompt: str, - user_prompt: str, - temperature: float, - top_p: float, - num_predict: int, - num_ctx: int | None, - keep_alive: str | None, - force_json: bool, - timeout_s: float = 120.0, - retries: int = 2, -) -> str: - messages = [ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": user_prompt}, +def _find_csvs_emitted(stream_text: str) -> list[str]: + """Parse query_assist.py logs to discover created CSVs.""" + csvs: list[str] = [] + patterns = [ + r"Saved CSV for .*? → ([^\s]+\.csv)", + r"Saved CSV for .*? -> ([^\s]+\.csv)", + r"Saved ODS.?UPRN CSV .*? → ([^\s]+\.csv)", + r"Saved ODS.?UPRN CSV .*? 
-> ([^\s]+\.csv)", ] - # Try /api/chat with JSON then without - for attempt in range(retries + 1): - payload = { - "model": model, - "messages": messages, - "stream": False, - "options": { - "temperature": float(temperature), - "top_p": float(top_p), - "num_predict": int(num_predict), - }, - } - if num_ctx is not None: - payload["options"]["num_ctx"] = int(num_ctx) - if keep_alive: - payload["keep_alive"] = str(keep_alive) - if attempt == 0 and force_json: - payload["format"] = "json" - try: - resp = _ollama_chat(base_url, payload, timeout_s=timeout_s) - c = _extract_content_variants(resp) - if c: - return c - except Exception: - pass - time.sleep(0.15 * (attempt + 1)) - # Fallback /api/generate - try: - prompt = ( - f"<>\n{system_prompt}\n<>\n\nUser:\n{user_prompt}\n\nAssistant:" - ) - payload = { - "model": model, - "prompt": prompt, - "stream": False, - "options": { - "temperature": float(temperature), - "top_p": float(top_p), - "num_predict": int(num_predict), - }, - } - if force_json: - payload["format"] = "json" - resp = _ollama_generate(base_url, payload, timeout_s=timeout_s) - c = _extract_content_variants(resp) - if c: - return c - except Exception: - pass - return "" - - -# ───────────────────────────────────────────────────────────────────────────── -# Planning -# ───────────────────────────────────────────────────────────────────────────── -def llm_plan(state: WFState) -> list[StepSpec]: - content = ollama_plan( - base_url=state["base_url"], - model=state["model_id"], - system_prompt=PLAN_SYSTEM_PROMPT, - user_prompt=state["nl"], - temperature=state.get("temperature", 0.0), - top_p=state.get("top_p", 0.95), - num_predict=state.get("num_predict", 256), - num_ctx=state.get("num_ctx"), - keep_alive=state.get("keep_alive"), - force_json=state.get("force_json", True), - ) - - obj: dict | None = None - if content: - try: - obj = json.loads(content) - except Exception: - obj = _extract_first_json(content) - - # If LLM output is unusable, synthesize from the NL 
string - if not obj or not isinstance(obj, dict) or not isinstance(obj.get("steps"), list): - nl = state["nl"] - lowered = nl.lower() - implies_assets = any( - w in lowered - for w in ( - "asset", - "assets", - "download", - "point cloud", - "rgb", - "image", - "lidar", - ) - ) - # Detect UPRNs (10–14 digit sequences are typical); be conservative - uprns = re.findall(r"\b\d{10,14}\b", nl) - ods = re.findall(r"\b[A-Z]\d{5}\b", nl) - oa = re.findall(r"\bE\d{8}\b", nl) - if implies_assets and uprns: - return [{"command": "download_assets", "uprn": uprns}] - if implies_assets and ods: - return [ - {"command": "ods_to_uprn", "ods": ods}, - {"command": "download_assets", "uprn_from_previous_csvs": True}, - ] - if implies_assets and oa: - return [ - {"command": "uprns_by_output_area", "output_area": oa}, - {"command": "download_assets", "uprn_from_previous_csvs": True}, - ] - raise RuntimeError( - "Planner produced no actionable steps; provide UPRN(s), ODS or Output Area." - ) - - # Normalize steps from LLM JSON - steps: list[StepSpec] = [] - for s in obj["steps"]: - if not isinstance(s, dict): - continue - cmd = s.get("command") - if cmd not in {"download_assets", "ods_to_uprn", "uprns_by_output_area"}: - continue - step: StepSpec = {"command": cmd} # type: ignore[assignment] - for key in [ - "uprn", - "ods", - "output_area", - "types", - "sensor", - "download_dir", - "api_key_env", - "db_url", - "uprn_from_previous_csvs", - ]: - if key in s: - step[key] = s[key] # type: ignore[index] - steps.append(step) - - # Enforce download step only when a mapping step exists (don’t force for direct UPRN) - lowered = state["nl"].lower() - implies_assets = any( - w in lowered - for w in ("asset", "assets", "download", "point cloud", "rgb", "image", "lidar") - ) - has_mapping = any( - st.get("command") in {"ods_to_uprn", "uprns_by_output_area"} for st in steps - ) - has_download = any(st.get("command") == "download_assets" for st in steps) - if implies_assets and has_mapping and not 
has_download: - steps.append({"command": "download_assets", "uprn_from_previous_csvs": True}) + for pat in patterns: + for m in re.finditer(pat, stream_text): + csvs.append(m.group(1)) + # Deduplicate preserving order + seen: set[str] = set() + out: list[str] = [] + for p in csvs: + if p not in seen: + out.append(p) + seen.add(p) + return out - # If a download step exists but lacks UPRNs, consume prior CSVs - for st in steps: - if st.get("command") == "download_assets" and not st.get("uprn"): - st["uprn_from_previous_csvs"] = True - return steps +def _ensure_list_or_path(v: None | str | list[str]) -> list[str]: + """Convert (None | str | list[str]) to a flat argv-ready list.""" + if v is None: + return [] + if isinstance(v, list): + return [str(x) for x in v if str(x).strip()] + s = str(v).strip() + return [s] if s else [] -# ───────────────────────────────────────────────────────────────────────────── -# Execution helpers -# ───────────────────────────────────────────────────────────────────────────── -def _build_argv(spec: StepSpec, py: str, qa_path: str) -> list[str]: - cmd = [py or sys.executable, qa_path] +def _build_argv(spec: dict[str, Any], py: str, qa_path: str) -> list[str]: + cmd = [py, qa_path] command = spec.get("command") - if command == "download_assets": - uprn = spec.get("uprn") - if isinstance(uprn, list): - uprn_list = [str(x) for x in uprn] - elif isinstance(uprn, str): - uprn_list = [uprn] - else: - raise ValueError( - "download_assets requires 'uprn' unless 'uprn_from_previous_csvs' is set." 
- ) - cmd += ["--uprn"] + uprn_list + uprn = _ensure_list_or_path(spec.get("uprn")) + if not uprn: + raise ValueError("download_assets requires 'uprn'.") + cmd += ["--uprn"] + uprn if spec.get("sensor"): cmd += ["--sensor", str(spec["sensor"])] if spec.get("types"): cmd += ["--types", ",".join(spec["types"])] elif command == "ods_to_uprn": - ods = spec.get("ods") - if isinstance(ods, list): - ods_list = [str(x) for x in ods] - elif isinstance(ods, str): - ods_list = [ods] - else: + ods = _ensure_list_or_path(spec.get("ods")) + if not ods: raise ValueError("ods_to_uprn requires 'ods'.") - cmd += ["--ods"] + ods_list + cmd += ["--ods"] + ods elif command == "uprns_by_output_area": - oa = spec.get("output_area") - if isinstance(oa, list): - oa_list = [str(x) for x in oa] - elif isinstance(oa, str): - oa_list = [oa] - else: + oa = _ensure_list_or_path(spec.get("output_area")) + if not oa: raise ValueError("uprns_by_output_area requires 'output_area'.") - cmd += ["--output-area"] + oa_list + cmd += ["--output-area"] + oa else: raise ValueError(f"Unsupported command: {command!r}") @@ -419,167 +233,635 @@ def _build_argv(spec: StepSpec, py: str, qa_path: str) -> list[str]: return cmd +def _map_types_from_text(lowered: str) -> list[str] | None: + wants_pointclouds = re.search(r"\bpoint\s*clouds?\b", lowered) is not None + wants_merged = "merged lidar" in lowered or re.search( + r"merged\s+lidar\s+point\s*cloud", lowered + ) + wants_frame = "pointcloud frame" in lowered or "single frame" in lowered + + types: list[str] = [] + if wants_pointclouds: + types.extend(POINTCLOUD_BOTH) + if wants_merged: + types.append("did:lidar-pointcloud-merged") + if wants_frame: + types.append("did:lidar-pointcloud-frame") + + if "rgb" in lowered and "image" in lowered: + types.append("did:rgb-image") + if "range panorama" in lowered: + types.append("did:lidar-range-pano") + if "reflectance panorama" in lowered: + types.append("did:lidar-reflectance-pano") + if "signal panorama" in lowered: + 
types.append("did:lidar-signal-pano") + if "nearir" in lowered or "near-infrared" in lowered: + types.append("did:lidar-nearir-pano") + if "ir false" in lowered: + types.append("did:ir-false-color-image") + if ( + "ir temperature array" in lowered + or re.search(r"thermal\s+arrays?", lowered) + or re.search(r"temperature\s+arrays?", lowered) + ): + types.append("did:ir-temperature-array") + if re.search(r"thermal\s+images?", lowered): + types.append("did:ir-false-color-image") + + if not types: + return None + # Dedupe, preserve order + out: list[str] = [] + seen: set[str] = set() + for t in types: + if t not in seen: + out.append(t) + seen.add(t) + return out + + +# ====================================================================================== +# Planning (heuristics + optional LLM via Ollama) +# ====================================================================================== + + +def ollama_chat( + base_url: str, + model: str, + messages: list[dict[str, str]], + temperature: float = 0.0, + top_p: float = 0.95, + num_predict: int = 256, + num_ctx: int | None = None, + keep_alive: str | None = None, + force_json: bool = True, + timeout_s: float = 120.0, +) -> dict[str, Any]: + url = base_url.rstrip("/") + "/api/chat" + payload: dict[str, Any] = { + "model": model, + "messages": messages, + "stream": False, + "options": { + "temperature": float(temperature), + "top_p": float(top_p), + "num_predict": int(num_predict), + }, + } + if num_ctx is not None: + payload["options"]["num_ctx"] = int(num_ctx) + if keep_alive: + payload["keep_alive"] = str(keep_alive) + if force_json: + payload["format"] = "json" + r = requests.post(url, json=payload, timeout=(5.0, timeout_s)) + r.raise_for_status() + return r.json() + + +class StepSpec(TypedDict, total=False): + command: Literal["download_assets", "ods_to_uprn", "uprns_by_output_area"] + uprn: list[str] | str | None + ods: list[str] | str | None + output_area: list[str] | str | None + sensor: str | None + types: 
list[str] | None + download_dir: str | None + api_key_env: str | None + db_url: str | None + uprn_from_previous_csvs: bool + + +@dataclasses.dataclass +class WFState: + nl: str + plan: list[StepSpec] + current: int = 0 + artifacts: dict[str, Any] = dataclasses.field(default_factory=dict) + log: list[str] = dataclasses.field(default_factory=list) + actions: list[dict[str, Any]] = dataclasses.field(default_factory=list) + dry_run: bool = False + plan_only: bool = False + py_exe: str = sys.executable + qa_path: str = os.path.join(os.path.dirname(__file__), "query_assist.py") + base_url: str = os.environ.get("OLLAMA_HOST", "http://localhost:11434") + model_id: str = "gpt-oss:20b" + temperature: float = 0.0 + top_p: float = 0.95 + num_predict: int = 256 + num_ctx: int | None = None + keep_alive: str | None = None + force_json: bool = True + verbose_level: int = logging.INFO + max_steps: int = 8 + + +def heuristic_plan(nl: str, defaults: dict[str, Any]) -> list[StepSpec] | None: + text = nl.strip() + if not text: + return None + lowered = text.lower() + + csv_paths = re.findall(r'(?:(?:[A-Za-z]:)?[^\s"\'<>|]+\.csv)\b', text) + oa_codes = re.findall(r"\bE\d{8}\b", text) + ods_codes = re.findall(r"\b[A-Z]\d{5}\b", text) + uprns = re.findall(r"\b\d{6,}\b", text) + endpoint_match = re.search(r"(https?://[\w\.-:%/]+)", text) + endpoint_url = endpoint_match.group(1) if endpoint_match else None + + # Extract common options if present + download_dir = None + m = re.search(r"(?: to | into )\s+(/[^ ]+)", lowered) + if m: + download_dir = m.group(1) + + # Types + types = _map_types_from_text(lowered) + + # CSV containing UPRNs + if csv_paths and ("uprn" in lowered or "uprns" in lowered): + step: dict[str, Any] = { + "command": "download_assets", + "uprn": csv_paths if len(csv_paths) > 1 else csv_paths[0], + "types": types, + "download_dir": download_dir or defaults.get("download_dir"), + "db_url": endpoint_url or defaults.get("db_url"), + } + return [step] + + # Output area → 
assets + if (oa_codes or "output area" in lowered or "output areas" in lowered) and ( + types + or "point cloud" in lowered + or "assets" in lowered + or "download" in lowered + ): + oa_list: list[str] = [] + if oa_codes: + oa_list.extend(oa_codes) + if csv_paths and ("uprn" not in lowered): + oa_list.extend(csv_paths) + if not oa_list: + # No concrete OA identifiers provided; defer to LLM/router instead of emitting an invalid step + return None + second: dict[str, Any] = { + "command": "download_assets", + "uprn_from_previous_csvs": True, + "download_dir": download_dir or defaults.get("download_dir"), + "db_url": defaults.get("db_url"), + } + if types: # only include types if user implied them + second["types"] = types + return [ + { + "command": "uprns_by_output_area", + "output_area": oa_list + if oa_list + else (csv_paths[0] if csv_paths else None), + "download_dir": defaults.get("download_dir"), + "db_url": defaults.get("db_url"), + }, + second, + ] + + # ODS → assets + if (ods_codes or "ods" in lowered) and (types is not None): + ods_list: list[str] = [] + if ods_codes: + ods_list.extend(ods_codes) + if csv_paths: + ods_list.extend(csv_paths) + if not ods_list: + return None + return [ + { + "command": "ods_to_uprn", + "ods": ods_list, + "download_dir": defaults.get("download_dir"), + "db_url": defaults.get("db_url"), + }, + { + "command": "download_assets", + "uprn_from_previous_csvs": True, + "types": types, + "download_dir": download_dir or defaults.get("download_dir"), + "db_url": defaults.get("db_url"), + }, + ] + + # Plain OA listing + if oa_codes or ("output area" in lowered or "output areas" in lowered): + oa_list: list[str] = [] + if oa_codes: + oa_list.extend(oa_codes) + if csv_paths and ("uprn" not in lowered): + oa_list.extend(csv_paths) + if not oa_list: + return None + return [ + { + "command": "uprns_by_output_area", + "output_area": oa_list if len(oa_list) > 1 else oa_list[0], + "download_dir": defaults.get("download_dir"), + "db_url": 
defaults.get("db_url"), + } + ] + + # Plain ODS mapping + if ods_codes or "ods" in lowered: + ods_list: list[str] = [] + if ods_codes: + ods_list.extend(ods_codes) + if csv_paths: + ods_list.extend(csv_paths) + if not ods_list: + return None + return [ + { + "command": "ods_to_uprn", + "ods": ods_list if len(ods_list) > 1 else ods_list[0], + "download_dir": defaults.get("download_dir"), + "db_url": defaults.get("db_url"), + } + ] + + # Direct UPRN assets + if uprns and ( + "download" in lowered + or "assets" in lowered + or "point" in lowered + or "rgb" in lowered + or "image" in lowered + or "lidar" in lowered + ): + step: dict[str, Any] = { + "command": "download_assets", + "uprn": uprns, + "types": types, + "download_dir": download_dir or defaults.get("download_dir"), + "db_url": endpoint_url or defaults.get("db_url"), + } + return [step] + + return None + + +def llm_plan( + nl: str, + defaults: dict[str, Any], + base_url: str, + model_id: str, + temperature: float, + top_p: float, + num_predict: int, + num_ctx: int | None, + keep_alive: str | None, + force_json: bool, +) -> list[StepSpec] | None: + messages = [ + { + "role": "system", + "content": ( + "You are a planning assistant that compiles an ordered plan for query_assist.py.\n" + "Return JSON with a 'steps' array; use 'uprn_from_previous_csvs': true when a download_assets step should consume prior CSVs.\n" + "Types to use when implied: did:rgb-image, did:lidar-pointcloud-merged, did:lidar-pointcloud-frame, did:lidar-range-pano, did:lidar-reflectance-pano, did:lidar-signal-pano, did:lidar-nearir-pano, did:ir-false-color-image, did:ir-temperature-array, did:ir-count-image, did:celsius-temperature, did:relative-humidity." 
+ ), + }, + {"role": "user", "content": nl}, + ] + try: + resp = ollama_chat( + base_url=base_url, + model=model_id, + messages=messages, + temperature=temperature, + top_p=top_p, + num_predict=num_predict, + num_ctx=num_ctx, + keep_alive=keep_alive, + force_json=force_json, + ) + content = None + if isinstance(resp.get("message"), dict): + content = resp["message"].get("content") + if not content and isinstance(resp.get("response"), str): + content = resp["response"] + if not content: + return None + plan_obj = None + try: + plan_obj = json.loads(content) + except Exception: + plan_obj = _extract_first_json(content) + if not isinstance(plan_obj, dict): + return None + steps_raw = plan_obj.get("steps") + if not isinstance(steps_raw, list) or not steps_raw: + return None + steps: list[StepSpec] = [] + for s in steps_raw: + if not isinstance(s, dict): + continue + cmd = s.get("command") + if cmd not in {"download_assets", "ods_to_uprn", "uprns_by_output_area"}: + continue + step: StepSpec = {"command": cmd} + for key in [ + "uprn", + "ods", + "output_area", + "types", + "sensor", + "download_dir", + "api_key_env", + "db_url", + ]: + if key in s: + step[key] = s[key] # type: ignore[index] + if s.get("uprn_from_previous_csvs"): + step["uprn_from_previous_csvs"] = True + # Do not auto-insert api_key_env; query_assist.py defaults it. 
+ if "download_dir" not in step and defaults.get("download_dir"): + step["download_dir"] = defaults["download_dir"] + steps.append(step) + return steps or None + except Exception: + return None + + +def llm_route_to_spec(nl: str, base_url: str, model_id: str, **opts) -> StepSpec | None: + messages = [ + {"role": "system", "content": SYSTEM_ROUTER_PROMPT}, + {"role": "user", "content": nl}, + ] + try: + resp = ollama_chat(base_url, model_id, messages, **opts) + except Exception: + return None + content = None + if isinstance(resp.get("message"), dict): + content = resp["message"].get("content") + if not content and isinstance(resp.get("response"), str): + content = resp["response"] + if not content: + return None + obj = _extract_first_json(content) or None + if not obj or "command" not in obj: + return None + step: StepSpec = { + "command": obj["command"], + "uprn": obj.get("uprn"), + "ods": obj.get("ods"), + "output_area": obj.get("output_area"), + "sensor": obj.get("sensor"), + "types": obj.get("types"), + "download_dir": obj.get("download_dir"), + "api_key_env": obj.get("api_key_env"), + "db_url": obj.get("db_url"), + } + return step + + +def upgrade_single_spec_to_plan( + nl: str, spec: StepSpec, defaults: dict[str, Any] +) -> list[StepSpec]: + lowered = nl.lower() + types = spec.get("types") or _map_types_from_text(lowered) + # OA -> assets + if spec.get("command") == "uprns_by_output_area" and ( + types + or "point cloud" in lowered + or "download" in lowered + or "assets" in lowered + ): + second: dict[str, Any] = { + "command": "download_assets", + "uprn_from_previous_csvs": True, + "download_dir": spec.get("download_dir"), + "db_url": spec.get("db_url"), + } + if types: + second["types"] = types + return [spec, second] + # ODS -> assets + if spec.get("command") == "ods_to_uprn" and (types is not None): + return [ + spec, + { + "command": "download_assets", + "uprn_from_previous_csvs": True, + "types": types, + "download_dir": spec.get("download_dir"), + 
"api_key_env": spec.get("api_key_env"), + "db_url": spec.get("db_url"), + }, + ] + return [spec] + + +# ====================================================================================== +# Execution helpers & LangGraph nodes +# ====================================================================================== + + def run_query_assist_step( - step: StepSpec, - py_exe: str, - qa_path: str, - dry_run: bool, - env: dict[str, str] | None = None, + step: StepSpec, py_exe: str, qa_path: str, dry_run: bool ) -> tuple[int, str]: argv = _build_argv(step, py_exe, qa_path) printable = " ".join(shlex.quote(x) for x in argv) - logging.info("Executing step: %s", json.dumps(step, ensure_ascii=False)) logging.info("Command: %s", printable) if dry_run: return 0, f"[dry-run] {printable}\n" + p = subprocess.Popen( + argv, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True + ) + captured_lines: list[str] = [] try: - p = subprocess.Popen( - argv, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, env=env - ) - except FileNotFoundError as e: - return 127, f"[spawn-failed] {e}\n" - except Exception as e: - return 1, f"[spawn-failed] {e}\n" - - captured: list[str] = [] - assert p.stdout is not None - for line in p.stdout: - sys.stdout.write(line) # stream-through to console - captured.append(line) - rc = p.wait() - return rc, "".join(captured) - - -def _find_csvs_emitted(text: str) -> list[str]: - pats = [ - r"✔?\s*Saved\s+(?:ODS.?→?UPRN|ODS.?to.?UPRN)\s*CSV\s*[–\-→>]\s*([^\s]+\.csv)", - r"✔?\s*Saved\s+(?:OA.?→?UPRN|OA.?to.?UPRN|Output\s*Area.?→?UPRN)\s*CSV\s*[–\-→>]\s*([^\s]+\.csv)", - r"Saved\s*CSV\s*for\s*.*?[–\-→>]\s*([^\s]+\.csv)", - r"✔\s*Saved\s*ODS.?→?UPRN\s*CSV\s*→\s*([^\s]+\.csv)", - ] - out, seen = [], set() - for pat in pats: - for m in re.finditer(pat, text, flags=re.IGNORECASE): - path = m.group(1) - if path not in seen: - out.append(path) - seen.add(path) - return out + assert p.stdout is not None + for line in p.stdout: + sys.stdout.write(line) + 
captured_lines.append(line) + finally: + rc = p.wait() + return rc, "".join(captured_lines) def materialize_previous_uprn_csvs(state: WFState) -> list[str]: - from_logs = state.get("artifacts", {}).get("csvs", []) + from_logs = state.artifacts.get("csvs", []) if from_logs: - return list(from_logs) - candidates = [ - os.path.join(os.getcwd(), "downloads", "ods_to_uprn.csv"), - os.path.join(os.getcwd(), "downloads", "oa_to_uprn.csv"), - os.path.join(os.getcwd(), "downloads", "uprns_by_output_area.csv"), - ] - return [p for p in candidates if os.path.isfile(p)] + return from_logs + # Fallback to default ODS CSV location if present + dl_base = state.plan[0].get("download_dir") or os.path.join( + os.getcwd(), "downloads" + ) + candidate = os.path.join(dl_base, "ods_to_uprn.csv") + if os.path.isfile(candidate): + return [candidate] + return [] + + +def node_plan(state: WFState) -> WFState: + defaults = { + "download_dir": None, + "api_key_env": "API_KEY", + "db_url": None, + } + # 1) LLM multi-step plan + plan = llm_plan( + state.nl, + defaults, + base_url=state.base_url, + model_id=state.model_id, + temperature=state.temperature, + top_p=state.top_p, + num_predict=state.num_predict, + num_ctx=state.num_ctx, + keep_alive=state.keep_alive, + force_json=state.force_json, + ) + # 2) Heuristic plan + if not plan: + plan = heuristic_plan(state.nl, defaults) + # 3) LLM single-step → upgrade + if not plan: + spec = llm_route_to_spec( + state.nl, + base_url=state.base_url, + model_id=state.model_id, + temperature=state.temperature, + top_p=state.top_p, + num_predict=state.num_predict, + num_ctx=state.num_ctx, + keep_alive=state.keep_alive, + force_json=state.force_json, + ) + if spec: + plan = upgrade_single_spec_to_plan(state.nl, spec, defaults) + + # Validate required args; if invalid (e.g., output_area missing), try fallback routes + def _valid(p: list[StepSpec]) -> bool: + for st in p: + cmd = st.get("command") + if cmd == "uprns_by_output_area" and not _ensure_list_or_path( 
+ st.get("output_area") + ): + return False + if cmd == "ods_to_uprn" and not _ensure_list_or_path(st.get("ods")): + return False + if cmd == "download_assets": + if st.get("uprn_from_previous_csvs"): + continue + if not _ensure_list_or_path(st.get("uprn")): + return False + return True + + if plan and not _valid(plan): + plan = heuristic_plan(state.nl, defaults) + if plan and not _valid(plan): + spec = llm_route_to_spec( + state.nl, + state.base_url, + state.model_id, + temperature=state.temperature, + top_p=state.top_p, + num_predict=state.num_predict, + num_ctx=state.num_ctx, + keep_alive=state.keep_alive, + force_json=state.force_json, + ) + if spec: + plan = upgrade_single_spec_to_plan(state.nl, spec, defaults) + state.plan = plan or [] -# ───────────────────────────────────────────────────────────────────────────── -# LangGraph nodes -# ───────────────────────────────────────────────────────────────────────────── -def node_execute(state: WFState) -> WFState: - state.setdefault("artifacts", {}) - state.setdefault("log", []) - state.setdefault("current", 0) + # Pretty-print plan at INFO/DEBUG + if state.plan and state.verbose_level <= logging.INFO: + print("Plan:") + for i, step in enumerate(state.plan): + show = {k: v for k, v in step.items() if k != "uprn_from_previous_csvs"} + print(f" {i+1}. 
{json.dumps(show, ensure_ascii=False)}") + print() + if not state.plan: + state.log.append("No actionable plan could be inferred.") + return state - if state["current"] >= len(state.get("plan", [])): - return state - step: StepSpec = state["plan"][state["current"]] +def node_execute(state: WFState) -> WFState: + if state.current >= len(state.plan): + return state + step = state.plan[state.current] - # Inject CSVs for download_assets when needed - if step.get("command") == "download_assets" and not step.get("uprn"): + # If this step should consume prior CSVs, materialize them + if step.get("uprn_from_previous_csvs"): csvs = materialize_previous_uprn_csvs(state) - if not csvs and step.get("uprn_from_previous_csvs"): - state["log"].append("No CSVs found from previous step(s).") - state["current"] = len(state["plan"]) + if not csvs: + state.log.append("No CSVs found from previous step(s).") + state.current = len(state.plan) return state - if csvs: - step = dict(step) - step.pop("uprn_from_previous_csvs", None) - step["uprn"] = csvs - state["plan"][state["current"]] = step + step = dict(step) + step.pop("uprn_from_previous_csvs", None) + step["uprn"] = csvs rc, captured = run_query_assist_step( - step, - state.get("py_exe", sys.executable), - state["qa_path"], - state.get("dry_run", False), + step, state.py_exe, state.qa_path, state.dry_run ) - state["log"].append(captured) + state.log.append(captured) - newly = _find_csvs_emitted(captured) - if newly: - existing = state["artifacts"].get("csvs", []) - state["artifacts"]["csvs"] = list(dict.fromkeys(list(existing) + newly)) + newly_found = _find_csvs_emitted(captured) + if newly_found: + extant = state.artifacts.get("csvs", []) + state.artifacts["csvs"] = list(dict.fromkeys(extant + newly_found)) + + # Track executed action for summary + try: + argv_for_record = _build_argv(step, state.py_exe, state.qa_path) + except Exception: + argv_for_record = [] + state.actions.append( + { + "index": state.current + 1, + "command": 
step.get("command"), + "argv": argv_for_record, + "rc": rc, + "emitted_csvs": newly_found, + } + ) if rc != 0: - state["log"].append(f"Step {state['current']} returned non-zero exit {rc}.") - state["current"] = len(state["plan"]) + state.log.append(f"Step {state.current} returned non-zero exit {rc}.") + state.current = len(state.plan) else: - state["current"] += 1 + state.current += 1 return state -def node_check_done(state: WFState) -> str: - # IMPORTANT: return string labels, not END sentinel - if state.get("current", 0) >= len(state.get("plan", [])): - return "end" - if state.get("current", 0) >= state.get("max_steps", 8): - state.setdefault("log", []).append( - f"Aborting: exceeded max_steps={state.get('max_steps', 8)}" - ) - return "end" +def after_plan(state: WFState) -> str: + if not state.plan: + return END + if state.plan_only: + return END return "execute" -# ───────────────────────────────────────────────────────────────────────────── -# UI helpers -# ───────────────────────────────────────────────────────────────────────────── -def _render_box(title: str, body: str) -> str: - term_width = shutil.get_terminal_size(fallback=(100, 24)).columns - max_width = max(60, min(term_width - 2, 100)) - wrap_width = max_width - 4 - body_lines: list[str] = [] - for para in body.splitlines(): - if not para.strip(): - body_lines.append("") - else: - body_lines.extend(textwrap.wrap(para, width=wrap_width)) - title = title.strip() - title_line = f" {title} " - top = "┌" + "─" * (max_width - 2) + "┐" - sep = "├" + "─" * (max_width - 2) + "┤" - bot = "└" + "─" * (max_width - 2) + "┘" - if len(title_line) <= (max_width - 2): - left = (max_width - 2 - len(title_line)) // 2 - right = max_width - 2 - len(title_line) - left - top = "┌" + "─" * left + title_line + "─" * right + "┐" - content = "\n".join("│ " + line.ljust(max_width - 4) + " │" for line in body_lines) - return "\n".join([top, sep, content, bot]) +def check_done(state: WFState) -> str: + if state.current >= 
len(state.plan): + return END + if state.current >= state.max_steps: + state.log.append(f"Aborting: exceeded max_steps={state.max_steps}") + return END + return "execute" -def _print_plan(plan: list[StepSpec], level: int) -> None: - if level == logging.INFO: - print("Plan:") - for i, step in enumerate(plan, 1): - display = {k: v for k, v in step.items() if k != "uprn_from_previous_csvs"} - print(f" {i}. {json.dumps(display, ensure_ascii=False)}") - print() +# ====================================================================================== +# CLI +# ====================================================================================== -# ───────────────────────────────────────────────────────────────────────────── -# CLI -# ───────────────────────────────────────────────────────────────────────────── def parse_args() -> argparse.Namespace: ap = argparse.ArgumentParser( - description="LangGraph NL workflow CLI for query_assist.py (LLM-only planner)" + description="LangGraph NL workflow for query_assist.py (one- and two-stage)" ) ap.add_argument("--model-id", default="gpt-oss:20b", help="Ollama model name/tag") ap.add_argument( @@ -593,9 +875,16 @@ def parse_args() -> argparse.Namespace: help="Base URL of the Ollama server (or set OLLAMA_HOST)", ) ap.add_argument( - "--dry-run", action="store_true", help="Plan/print but do not execute" + "--dry-run", + action="store_true", + help="Plan & print commands but do not execute", + ) + ap.add_argument( + "--plan-only", action="store_true", help="Only compile/print the plan and exit" ) ap.add_argument("--once", "-q", help="Run a single NL query and exit") + + # Decoding/runtime knobs ap.add_argument("--temperature", type=float, default=0.0) ap.add_argument("--top-p", type=float, default=0.95) ap.add_argument("--num-predict", type=int, default=256) @@ -603,18 +892,18 @@ def parse_args() -> argparse.Namespace: ap.add_argument("--keep-alive", default=None) ap.add_argument("--no-force-json", action="store_true") 
ap.add_argument("--max-steps", type=int, default=8) + + # Logging controls ap.add_argument( "-v", "--verbose", action="count", default=0, help="-v=info, -vv=debug" ) - ap.add_argument("--plan-only", action="store_true", dest="plan_only") - ap.set_defaults(plan_only=False) return ap.parse_args() def main() -> None: args = parse_args() - # Logging + # Logging level if args.verbose >= 2: level = logging.DEBUG elif args.verbose == 1: @@ -623,100 +912,111 @@ def main() -> None: level = logging.WARNING logging.basicConfig(level=level, format="%(levelname)s: %(message)s") - # Banner + # Intro banner if level <= logging.INFO: body = ( - "- LLM-only planner (no heuristics).\n" - "- Plan is printed FIRST and only at INFO (-v).\n" - "- Two-stage ODS/OA -> assets enforced when assets are implied.\n" - "- Dry-run and plan-only for auditability." + "• Parses NL into a one- or two-stage plan (heuristics + optional LLM).\n" + "• Executes via LangGraph with artifact passing.\n" + "• Parses query_assist.py logs to capture CSVs for the next stage.\n" + "• Supports dry-run and plan-only modes." 
) print(_render_box(f"LangGraph NL Workflow — {args.model_id}", body)) - # Build LangGraph (dict state) — IMPORTANT: label mapping uses string keys - builder = StateGraph(dict) + # Build LangGraph + builder = StateGraph(WFState) + builder.add_node("plan", node_plan) builder.add_node("execute", node_execute) - builder.add_edge(START, "execute") + builder.add_edge(START, "plan") + builder.add_conditional_edges("plan", after_plan, {"execute": "execute", END: END}) builder.add_conditional_edges( - "execute", node_check_done, {"execute": "execute", "end": END} + "execute", check_done, {"execute": "execute", END: END} ) memory = MemorySaver() graph = builder.compile(checkpointer=memory) def run_once(nl: str) -> int: - st: WFState = { - "nl": nl, - "plan": [], - "current": 0, - "artifacts": {}, - "log": [], - "dry_run": bool(args.dry_run), - "py_exe": sys.executable, - "qa_path": args.query_assist_path, - "base_url": args.base_url, - "model_id": args.model_id, - "temperature": args.temperature, - "top_p": args.top_p, - "num_predict": args.num_predict, - "num_ctx": args.num_ctx, - "keep_alive": args.keep_alive, - "force_json": (not args.no_force_json), - "verbose_level": level, - "max_steps": args.max_steps, - } - - # PLAN - try: - st["plan"] = llm_plan(st) - except Exception as e: - logging.info("Planning failed: %s", e) - return 1 - - _print_plan(st["plan"], level) - if args.plan_only: - return 0 - - # EXECUTE: single invoke to END (no external loop) + st = WFState( + nl=nl, + plan=[], + dry_run=bool(args.dry_run), + plan_only=bool(args.plan_only), + qa_path=args.query_assist_path, + base_url=args.base_url, + model_id=args.model_id, + temperature=args.temperature, + top_p=args.top_p, + num_predict=args.num_predict, + num_ctx=args.num_ctx, + keep_alive=args.keep_alive, + force_json=(not args.no_force_json), + verbose_level=level, + max_steps=args.max_steps, + ) final_state = graph.invoke( st, config={"configurable": {"thread_id": f"tid-{time.time_ns()}"}} ) - + logs = ( + 
final_state.get("log") + if isinstance(final_state, dict) + else getattr(final_state, "log", []) + ) or [] trailing = [ - ln - for ln in final_state.get("log", []) + l + for l in logs if any( - k in ln - for k in ( + t in l + for t in ( "[dry-run]", "No CSVs found", "non-zero exit", - "[spawn-failed]", + "Planner", + "No actionable plan", ) ) ] if trailing: - print("\n".join(trailing)) - return 0 + print("\n" + "\n".join(l.strip() for l in trailing)) + actions = ( + final_state.get("actions") + if isinstance(final_state, dict) + else getattr(final_state, "actions", []) + ) or [] + if actions and level <= logging.INFO: + print("\nACTIONS (LangGraph Execution):") + for a in actions: + argv = " ".join(shlex.quote(x) for x in a.get("argv", [])) + rc = a.get("rc") + em = ", ".join(a.get("emitted_csvs", []) or []) + print( + f" {a.get('index')}. {a.get('command')} [rc={rc}]\n argv: {argv}" + ) + if em: + print(f" emitted CSVs: {em}") + return 0 if not any("non-zero exit" in l for l in logs) else 1 + # === Entry modes === try: if args.once: - sys.exit(run_once(args.once)) + rc = run_once(args.once) + if rc != 0: + logging.warning("Workflow exited with code %d", rc) + return if level <= logging.INFO: print( "LangGraph NL workflow for query_assist.py. Type 'exit' or Ctrl-D to quit." 
) while True: try: - nl = input("> ").strip() + nl = input("> ") except EOFError: break - if not nl: + if not nl.strip(): continue - if nl.lower() in {"exit", "quit"}: + if nl.strip().lower() in {"exit", "quit"}: break rc = run_once(nl) if rc != 0: - logging.info("Workflow exited with code %d", rc) + logging.warning("Workflow exited with code %d", rc) except KeyboardInterrupt: print() logging.info("Interrupted.") From 2235ff53fa7ff1ebbdbf4bddf9a7338cc4b1212f Mon Sep 17 00:00:00 2001 From: gnathoi Date: Wed, 27 Aug 2025 14:52:47 +0100 Subject: [PATCH 14/19] change working --- examples/nl_query_graph.py | 47 ++++++-------------------------------- 1 file changed, 7 insertions(+), 40 deletions(-) diff --git a/examples/nl_query_graph.py b/examples/nl_query_graph.py index 51f9ea6..7f2fe63 100644 --- a/examples/nl_query_graph.py +++ b/examples/nl_query_graph.py @@ -1,32 +1,5 @@ #!/usr/bin/env python3 -""" -LangGraph NL workflow for query_assist.py — single- and two-stage queries - -Capabilities ------------- -• One-stage: download assets directly for given UPRN(s) / UPRN CSV -• Two-stage: (a) ODS → UPRN → download assets (b) Output Areas → UPRN → download assets -• Optional filters: sensor, types; optional overrides: download_dir, api_key_env, db_url -• LLM (Ollama) planner first; heuristic fallback; single-step router last -• Robust parsing of query_assist.py logs to discover emitted CSVs and feed them forward - -Examples --------- -# Direct assets (one-stage) -python3 nl_query_graph.py -q "Download merged lidar for UPRN 5045394 to /data" - -# ODS → UPRN → assets (two-stage) -python3 nl_query_graph.py -q "For ODS G85013 download RGB and merged lidar to /data" - -# Output areas → UPRN → assets (two-stage) -python3 nl_query_graph.py -q "Get point clouds in output area E00004550" - -Notes ------ -• Requires: langgraph (pip install langgraph[all]), requests -• Uses Ollama for optional planning: set OLLAMA_HOST or --base-url if needed -• Defaults match query_assist.py: API 
key env var defaults to API_KEY, downloads to ./downloads -""" + from __future__ import annotations import argparse @@ -43,15 +16,10 @@ import time from typing import Any, Literal, TypedDict -# Third-party import requests from langgraph.checkpoint.memory import MemorySaver from langgraph.graph import END, START, StateGraph -# ====================================================================================== -# Prompts & canonical type mappings (kept consistent with nl_query_cli.py / README) -# ====================================================================================== - SYSTEM_ROUTER_PROMPT = """You are a rigorous function-call router for a Python CLI named query_assist.py. Supported commands and how to populate them: @@ -97,6 +65,9 @@ - IR counts -> "did:ir-count-image" - temperature (no contentUrl) -> "did:celsius-temperature" - relative humidity (no contentUrl) -> "did:relative-humidity" + - UPRNs are the UK OS Unique Property Reference Numbers. Queries may call them buildings or other built environment associated words. + - Output areas may be called OAs. + - ODS codes are unique identifiers for UK NHS buildings, hence words like medical, practice, hospital, etc... may be used. - Prefer being decisive. When in doubt, infer sensible defaults. 
""" @@ -121,10 +92,6 @@ POINTCLOUD_BOTH = ["did:lidar-pointcloud-merged", "did:lidar-pointcloud-frame"] -# ====================================================================================== -# Helpers -# ====================================================================================== - def _render_box(title: str, body: str) -> str: term_width = shutil.get_terminal_size(fallback=(100, 24)).columns @@ -915,12 +882,12 @@ def main() -> None: # Intro banner if level <= logging.INFO: body = ( - "• Parses NL into a one- or two-stage plan (heuristics + optional LLM).\n" + "• Parses natural language into a multi-stage plan to execute SPARQL and retrieve assets.\n" "• Executes via LangGraph with artifact passing.\n" - "• Parses query_assist.py logs to capture CSVs for the next stage.\n" + "• Optional filters: sensor, types; optional overrides: download_dir, api_key_env, db_url \n" "• Supports dry-run and plan-only modes." ) - print(_render_box(f"LangGraph NL Workflow — {args.model_id}", body)) + print(_render_box(f"Query Assist AI — {args.model_id}", body)) # Build LangGraph builder = StateGraph(WFState) From f339800bf236e982c8a69207482ad704349a3c5e Mon Sep 17 00:00:00 2001 From: gnathoi Date: Mon, 1 Sep 2025 15:45:02 +0100 Subject: [PATCH 15/19] change: tidy query_assist.py & nl_query_assist.py --- .../{nl_query_graph.py => nl_query_assist.py} | 40 +- examples/nl_query_cli.py | 1047 ----------------- examples/query_assist.py | 7 +- 3 files changed, 5 insertions(+), 1089 deletions(-) rename examples/{nl_query_graph.py => nl_query_assist.py} (95%) delete mode 100644 examples/nl_query_cli.py diff --git a/examples/nl_query_graph.py b/examples/nl_query_assist.py similarity index 95% rename from examples/nl_query_graph.py rename to examples/nl_query_assist.py index 7f2fe63..2b954ef 100644 --- a/examples/nl_query_graph.py +++ b/examples/nl_query_assist.py @@ -146,7 +146,6 @@ def _find_csvs_emitted(stream_text: str) -> list[str]: for pat in patterns: for m in 
re.finditer(pat, stream_text): csvs.append(m.group(1)) - # Deduplicate preserving order seen: set[str] = set() out: list[str] = [] for p in csvs: @@ -238,7 +237,7 @@ def _map_types_from_text(lowered: str) -> list[str] | None: if not types: return None - # Dedupe, preserve order + out: list[str] = [] seen: set[str] = set() for t in types: @@ -248,11 +247,6 @@ def _map_types_from_text(lowered: str) -> list[str] | None: return out -# ====================================================================================== -# Planning (heuristics + optional LLM via Ollama) -# ====================================================================================== - - def ollama_chat( base_url: str, model: str, @@ -337,16 +331,13 @@ def heuristic_plan(nl: str, defaults: dict[str, Any]) -> list[StepSpec] | None: endpoint_match = re.search(r"(https?://[\w\.-:%/]+)", text) endpoint_url = endpoint_match.group(1) if endpoint_match else None - # Extract common options if present download_dir = None m = re.search(r"(?: to | into )\s+(/[^ ]+)", lowered) if m: download_dir = m.group(1) - # Types types = _map_types_from_text(lowered) - # CSV containing UPRNs if csv_paths and ("uprn" in lowered or "uprns" in lowered): step: dict[str, Any] = { "command": "download_assets", @@ -357,7 +348,6 @@ def heuristic_plan(nl: str, defaults: dict[str, Any]) -> list[StepSpec] | None: } return [step] - # Output area → assets if (oa_codes or "output area" in lowered or "output areas" in lowered) and ( types or "point cloud" in lowered @@ -370,7 +360,6 @@ def heuristic_plan(nl: str, defaults: dict[str, Any]) -> list[StepSpec] | None: if csv_paths and ("uprn" not in lowered): oa_list.extend(csv_paths) if not oa_list: - # No concrete OA identifiers provided; defer to LLM/router instead of emitting an invalid step return None second: dict[str, Any] = { "command": "download_assets", @@ -378,7 +367,7 @@ def heuristic_plan(nl: str, defaults: dict[str, Any]) -> list[StepSpec] | None: "download_dir": 
download_dir or defaults.get("download_dir"), "db_url": defaults.get("db_url"), } - if types: # only include types if user implied them + if types: second["types"] = types return [ { @@ -392,7 +381,6 @@ def heuristic_plan(nl: str, defaults: dict[str, Any]) -> list[StepSpec] | None: second, ] - # ODS → assets if (ods_codes or "ods" in lowered) and (types is not None): ods_list: list[str] = [] if ods_codes: @@ -417,7 +405,6 @@ def heuristic_plan(nl: str, defaults: dict[str, Any]) -> list[StepSpec] | None: }, ] - # Plain OA listing if oa_codes or ("output area" in lowered or "output areas" in lowered): oa_list: list[str] = [] if oa_codes: @@ -435,7 +422,6 @@ def heuristic_plan(nl: str, defaults: dict[str, Any]) -> list[StepSpec] | None: } ] - # Plain ODS mapping if ods_codes or "ods" in lowered: ods_list: list[str] = [] if ods_codes: @@ -453,7 +439,6 @@ def heuristic_plan(nl: str, defaults: dict[str, Any]) -> list[StepSpec] | None: } ] - # Direct UPRN assets if uprns and ( "download" in lowered or "assets" in lowered @@ -545,10 +530,9 @@ def llm_plan( "db_url", ]: if key in s: - step[key] = s[key] # type: ignore[index] + step[key] = s[key] if s.get("uprn_from_previous_csvs"): step["uprn_from_previous_csvs"] = True - # Do not auto-insert api_key_env; query_assist.py defaults it. 
if "download_dir" not in step and defaults.get("download_dir"): step["download_dir"] = defaults["download_dir"] steps.append(step) @@ -595,7 +579,6 @@ def upgrade_single_spec_to_plan( ) -> list[StepSpec]: lowered = nl.lower() types = spec.get("types") or _map_types_from_text(lowered) - # OA -> assets if spec.get("command") == "uprns_by_output_area" and ( types or "point cloud" in lowered @@ -611,7 +594,6 @@ def upgrade_single_spec_to_plan( if types: second["types"] = types return [spec, second] - # ODS -> assets if spec.get("command") == "ods_to_uprn" and (types is not None): return [ spec, @@ -627,11 +609,6 @@ def upgrade_single_spec_to_plan( return [spec] -# ====================================================================================== -# Execution helpers & LangGraph nodes -# ====================================================================================== - - def run_query_assist_step( step: StepSpec, py_exe: str, qa_path: str, dry_run: bool ) -> tuple[int, str]: @@ -659,7 +636,6 @@ def materialize_previous_uprn_csvs(state: WFState) -> list[str]: from_logs = state.artifacts.get("csvs", []) if from_logs: return from_logs - # Fallback to default ODS CSV location if present dl_base = state.plan[0].get("download_dir") or os.path.join( os.getcwd(), "downloads" ) @@ -743,7 +719,6 @@ def _valid(p: list[StepSpec]) -> bool: state.plan = plan or [] - # Pretty-print plan at INFO/DEBUG if state.plan and state.verbose_level <= logging.INFO: print("Plan:") for i, step in enumerate(state.plan): @@ -760,7 +735,6 @@ def node_execute(state: WFState) -> WFState: return state step = state.plan[state.current] - # If this step should consume prior CSVs, materialize them if step.get("uprn_from_previous_csvs"): csvs = materialize_previous_uprn_csvs(state) if not csvs: @@ -781,7 +755,6 @@ def node_execute(state: WFState) -> WFState: extant = state.artifacts.get("csvs", []) state.artifacts["csvs"] = list(dict.fromkeys(extant + newly_found)) - # Track executed action for 
summary try: argv_for_record = _build_argv(step, state.py_exe, state.qa_path) except Exception: @@ -821,11 +794,6 @@ def check_done(state: WFState) -> str: return "execute" -# ====================================================================================== -# CLI -# ====================================================================================== - - def parse_args() -> argparse.Namespace: ap = argparse.ArgumentParser( description="LangGraph NL workflow for query_assist.py (one- and two-stage)" @@ -879,7 +847,6 @@ def main() -> None: level = logging.WARNING logging.basicConfig(level=level, format="%(levelname)s: %(message)s") - # Intro banner if level <= logging.INFO: body = ( "• Parses natural language into a multi-stage plan to execute SPARQL and retrieve assets.\n" @@ -961,7 +928,6 @@ def run_once(nl: str) -> int: print(f" emitted CSVs: {em}") return 0 if not any("non-zero exit" in l for l in logs) else 1 - # === Entry modes === try: if args.once: rc = run_once(args.once) diff --git a/examples/nl_query_cli.py b/examples/nl_query_cli.py deleted file mode 100644 index 7510af6..0000000 --- a/examples/nl_query_cli.py +++ /dev/null @@ -1,1047 +0,0 @@ -#!/usr/bin/env python3 - -from __future__ import annotations - -import argparse -import json -import logging -import os -import re -import shlex -import shutil -import subprocess -import sys -import textwrap -from typing import Any - -import requests - -# ------------------------------- -# System & Summary Prompts -# ------------------------------- - -SYSTEM_ROUTER_PROMPT = """You are a rigorous function-call router for a Python CLI named query_assist.py. 
- -Supported commands and how to populate them: - -1) download_assets - Required: uprn (string CSV path OR array of strings like ["5045394","200003455212"]) - Optional: sensor (string, e.g., "bess:OusterLidarSensor") - types (array of strings; each a type IRI, e.g., ["did:rgb-image","did:lidar-pointcloud-merged"]) - download_dir (string path) - api_key_env (string, name of env var with API key) - db_url (string URL to SPARQL endpoint) - -2) ods_to_uprn - Required: ods (string CSV path OR array of strings like ["G85013","Q12345"]) - -3) uprns_by_output_area - Required: output_area (string CSV path OR array of strings, e.g., ["E00004550","E00032882"]) - -Schema (MUST output exactly one JSON object with these keys as needed): -{ - "command": "download_assets" | "ods_to_uprn" | "uprns_by_output_area", - "uprn": string | string[] | null, - "ods": string | string[] | null, - "output_area": string | string[] | null, - "sensor": string | null, - "types": string[] | null, - "download_dir": string | null, - "api_key_env": string | null, - "db_url": string | null -} - -Constraints: -- Return ONLY the JSON object. No prose, no markdown. -- If the user request implies asset types, map them to the supported IRIs if possible: - - RGB image -> "did:rgb-image" - - merged lidar point cloud -> "did:lidar-pointcloud-merged" - - lidar range panorama -> "did:lidar-range-pano" - - lidar reflectance panorama -> "did:lidar-reflectance-pano" - - lidar signal panorama -> "did:lidar-signal-pano" - - lidar near-infrared panorama -> "did:lidar-nearir-pano" - - IR false color -> "did:ir-false-color-image" - - IR temperature array -> "did:ir-temperature-array" - - IR counts -> "did:ir-count-image" - - temperature (no contentUrl) -> "did:celsius-temperature" - - relative humidity (no contentUrl) -> "did:relative-humidity" -- Prefer being decisive. When in doubt, infer sensible defaults. 
-""" - -SUMMARY_SYSTEM_PROMPT = """You transform a raw natural-language user request about assets / UPRNs / ODS / output areas into a concise structured summary. - -Return ONLY a single JSON object with this exact schema (no prose): -{ - "bullets": string[] // 2-8 short bullet points capturing intent & extracted fields - ,"router_text": string // ONE concise imperative sentence for a routing model to decide the command - ,"extracted": { // best-effort extraction; omit keys you cannot infer - "uprn": string[] | null, - "ods": string[] | null, - "output_area": string[] | null, - "sensor": string | null, - "types": string[] | null, - "download_dir": string | null, - "api_key_env": string | null, - "db_url": string | null - } -} - -Guidance: -- Normalize UPRNs to digit strings. -- Keep ordering as given when sensible. -- For types, map descriptive phrases to IRIs per provided mapping when obvious. -- router_text should be minimal but sufficient (e.g., "Download merged lidar point cloud for UPRN 5045394"). -- If purely informational greeting with no actionable command, set bullets to ["No actionable request"], router_text="no-op" and extracted={}. -""" - -FEW_SHOTS: list[tuple[str, str]] = [ - ( - "Download the merged lidar point cloud for UPRN 5045394 into /data/assets. 
" - "Use MY_KEY as the env var for the API key.", - '{"command":"download_assets","uprn":["5045394"],"sensor":null,' - '"types":["did:lidar-pointcloud-merged"],"download_dir":"/data/assets",' - '"api_key_env":"MY_KEY","db_url":null,"ods":null,"output_area":null}', - ), - ( - "Map ODS G85013 to UPRNs.", - '{"command":"ods_to_uprn","ods":["G85013"],"uprn":null,"output_area":null,' - '"sensor":null,"types":null,"download_dir":null,"api_key_env":null,"db_url":null}', - ), - ( - "List all UPRNs in output areas E00004550 and E00032882.", - '{"command":"uprns_by_output_area","output_area":["E00004550","E00032882"],' - '"uprn":null,"ods":null,"sensor":null,"types":null,"download_dir":null,' - '"api_key_env":null,"db_url":null}', - ), - ( - "Get RGB images and merged lidar for UPRNs 5045394 and 200003455212 to /mnt/dl (API key var KEY2).", - '{"command":"download_assets","uprn":["5045394","200003455212"],"sensor":null,' - '"types":["did:rgb-image","did:lidar-pointcloud-merged"],"download_dir":"/mnt/dl",' - '"api_key_env":"KEY2","db_url":null,"ods":null,"output_area":null}', - ), - ( - "output areas E00004550 E00032882 E00063193 list uprns", - '{"command":"uprns_by_output_area","output_area":["E00004550","E00032882","E00063193"],' - '"uprn":null,"ods":null,"sensor":null,"types":null,"download_dir":null,' - '"api_key_env":null,"db_url":null}', - ), - ( - "ODS codes G85013 Q12345 map to uprns", - '{"command":"ods_to_uprn","ods":["G85013","Q12345"],"uprn":null,"output_area":null,' - '"sensor":null,"types":null,"download_dir":null,"api_key_env":null,"db_url":null}', - ), - ( - "5045394 merged lidar pointcloud now", - '{"command":"download_assets","uprn":["5045394"],"sensor":null,' - '"types":["did:lidar-pointcloud-merged"],"download_dir":null,' - '"api_key_env":null,"db_url":null,"ods":null,"output_area":null}', - ), - ( - "Download lidar range and reflectance panoramas for UPRN 5045394 sensor bess:OusterLidarSensor", - 
'{"command":"download_assets","uprn":["5045394"],"sensor":"bess:OusterLidarSensor",' - '"types":["did:lidar-range-pano","did:lidar-reflectance-pano"],"download_dir":null,' - '"api_key_env":null,"db_url":null,"ods":null,"output_area":null}', - ), - ( - "Give me temperature and humidity for UPRN 200003455212", - '{"command":"download_assets","uprn":["200003455212"],"sensor":null,' - '"types":["did:celsius-temperature","did:relative-humidity"],"download_dir":null,' - '"api_key_env":null,"db_url":null,"ods":null,"output_area":null}', - ), - ( - "Fetch IR false color and temperature array for 5045394", - '{"command":"download_assets","uprn":["5045394"],"sensor":null,' - '"types":["did:ir-false-color-image","did:ir-temperature-array"],"download_dir":null,' - '"api_key_env":null,"db_url":null,"ods":null,"output_area":null}', - ), - ( - "List UPRNs in output area E00004550 (single)", - '{"command":"uprns_by_output_area","output_area":["E00004550"],"uprn":null,' - '"ods":null,"sensor":null,"types":null,"download_dir":null,' - '"api_key_env":null,"db_url":null}', - ), - ( - "Map ODS G85013 and G85014 with endpoint override http://myhost:3030/ds/query", - '{"command":"ods_to_uprn","ods":["G85013","G85014"],"uprn":null,"output_area":null,' - '"sensor":null,"types":null,"download_dir":null,"api_key_env":null,' - '"db_url":"http://myhost:3030/ds/query"}', - ), - ( - "Download rgb image for 5045394 to /tmp/dl using key var APIKEY", - '{"command":"download_assets","uprn":["5045394"],"sensor":null,' - '"types":["did:rgb-image"],"download_dir":"/tmp/dl","api_key_env":"APIKEY",' - '"db_url":null,"ods":null,"output_area":null}', - ), - ( - "Get point cloud frame for UPRN 5045394", - '{"command":"download_assets","uprn":["5045394"],"sensor":null,' - '"types":["did:lidar-pointcloud-frame"],"download_dir":null,"api_key_env":null,' - '"db_url":null,"ods":null,"output_area":null}', - ), -] - -# ------------------------------- -# Helpers -# ------------------------------- - -log = 
logging.getLogger("nl_query_cli") - - -def extract_assistant_text_from_ollama(resp: dict[str, Any]) -> str: - """Extract assistant text from Ollama /api/chat or /api/generate.""" - msg = resp.get("message") - if isinstance(msg, dict): - content = msg.get("content") - if isinstance(content, str) and content.strip(): - return content - if isinstance(resp.get("response"), str) and resp["response"].strip(): - return resp["response"] - return json.dumps(resp, ensure_ascii=False) - - -def slice_first_json_object(text: str) -> str: - """Extract the first top-level JSON object {...} from text.""" - start = text.find("{") - if start == -1: - raise ValueError("No JSON object start found.") - depth = 0 - for i, c in enumerate(text[start:], start=start): - if c == "{": - depth += 1 - elif c == "}": - depth -= 1 - if depth == 0: - return text[start : i + 1] - raise ValueError("Unbalanced braces; JSON object not closed.") - - -def ensure_list_or_path(v: None | str | list[str]) -> list[str]: - """Convert (None | str | list) into a list of CLI tokens.""" - if v is None: - return [] - if isinstance(v, list): - return [str(x) for x in v if str(x).strip()] - s = str(v).strip() - if not s: - return [] - return [s] - - -def _find_csv_paths(text: str) -> list[str]: - """Return a list of CSV-like path tokens found in text.""" - # Accept absolute, relative, and bare filenames ending with .csv - return re.findall(r'(?:(?:[A-Za-z]:)?[^\s"\'<>|]+\.csv)\b', text) - - -def build_argv(spec: dict[str, Any], py: str, qa_path: str) -> list[str]: - """Map the JSON spec to query_assist.py argv.""" - cmd = [py, qa_path] - command = spec.get("command") - - db_url = spec.get("db_url") - download_dir = spec.get("download_dir") - api_key_env = spec.get("api_key_env") - sensor = spec.get("sensor") - types = spec.get("types") - - if command == "download_assets": - uprn = ensure_list_or_path(spec.get("uprn")) - if not uprn: - raise ValueError("download_assets requires 'uprn'.") - cmd += ["--uprn"] + uprn - 
if sensor: - cmd += ["--sensor", str(sensor)] - if types: - cmd += ["--types", ",".join([str(t) for t in types])] - elif command == "ods_to_uprn": - ods = ensure_list_or_path(spec.get("ods")) - if not ods: - raise ValueError("ods_to_uprn requires 'ods'.") - cmd += ["--ods"] + ods - elif command == "uprns_by_output_area": - oa = ensure_list_or_path(spec.get("output_area")) - if not oa: - raise ValueError("uprns_by_output_area requires 'output_area'.") - cmd += ["--output-area"] + oa - else: - raise ValueError(f"Unsupported command: {command!r}") - - if db_url: - cmd += ["--db-url", str(db_url)] - if download_dir: - cmd += ["--download-dir", str(download_dir)] - if api_key_env: - cmd += ["--api-key-env", str(api_key_env)] - - return cmd - - -def heuristic_parse(nl: str) -> dict[str, Any] | None: - """ - Heuristic parse for common patterns with CSV precedence: - - CSV + 'uprn' → download_assets (uprn=) - - CSV + 'output area' → uprns_by_output_area (output_area=) - - Output areas: codes E######## - - ODS: tokens like A12345 - - UPRN asset downloads: ≥6-digit tokens + keywords - """ - text = nl.strip() - if not text: - return None - lowered = text.lower() - - csv_paths = _find_csv_paths(text) - output_area_codes = re.findall(r"\bE\d{8}\b", text) - ods_codes = re.findall(r"\b[A-Z]\d{5}\b", text) - uprn_candidates = [t for t in re.findall(r"\b\d{6,}\b", text)] - - # Asset type hints - wants_merged = bool( - re.search(r"merged\s+(lidar\s+)?point\s*clouds?", lowered) - or "merged lidar" in lowered - ) - wants_rgb = ("rgb" in lowered) and ("image" in lowered or "images" in lowered) - - # Endpoint override - endpoint_match = re.search(r"(https?://[\w\.-:%/]+)", text) - endpoint_url = endpoint_match.group(1) if endpoint_match else None - - # --- CSV precedence --- - if csv_paths: - # If the user mentions UPRN(s), prefer treating the CSV as a UPRN list - if "uprn" in lowered: - types_list = [] - if wants_merged: - types_list.append("did:lidar-pointcloud-merged") - if 
wants_rgb: - types_list.append("did:rgb-image") - return { - "command": "download_assets", - "uprn": csv_paths, - "sensor": None, - "types": types_list or None, - "download_dir": None, - "api_key_env": None, - "db_url": endpoint_url, - "ods": None, - "output_area": None, - } - # Else, if they mention output areas explicitly, treat CSV as an OA list - if ( - "output area" in lowered - or "output areas" in lowered - or "oa" in lowered.split() - ): - return { - "command": "uprns_by_output_area", - "output_area": csv_paths, - "uprn": None, - "ods": None, - "sensor": None, - "types": None, - "download_dir": None, - "api_key_env": None, - "db_url": None, - } - # If ambiguous: assume UPRN list (safer/more common in this CLI) - return { - "command": "download_assets", - "uprn": csv_paths, - "sensor": None, - "types": ( - ["did:lidar-pointcloud-merged"] - if wants_merged - else (["did:rgb-image"] if wants_rgb else None) - ), - "download_dir": None, - "api_key_env": None, - "db_url": endpoint_url, - "ods": None, - "output_area": None, - } - - # --- Pure OA codes --- - if output_area_codes and ( - "output area" in lowered - or "output areas" in lowered - or len(output_area_codes) > 1 - ): - return { - "command": "uprns_by_output_area", - "output_area": output_area_codes, - "uprn": None, - "ods": None, - "sensor": None, - "types": None, - "download_dir": None, - "api_key_env": None, - "db_url": None, - } - - # --- ODS codes (only if no explicit UPRN numbers present) --- - if ods_codes and ("ods" in lowered or not uprn_candidates): - return { - "command": "ods_to_uprn", - "ods": ods_codes, - "uprn": None, - "output_area": None, - "sensor": None, - "types": None, - "download_dir": None, - "api_key_env": None, - "db_url": None, - } - - # --- UPRN download with optional types --- - asset_keywords = { - "download", - "get", - "asset", - "assets", - "lidar", - "image", - "images", - "point", - "pointcloud", - "point cloud", - } - if uprn_candidates and any(k in lowered for k in 
asset_keywords): - types_list = [] - if wants_merged: - types_list.append("did:lidar-pointcloud-merged") - if wants_rgb: - types_list.append("did:rgb-image") - return { - "command": "download_assets", - "uprn": uprn_candidates, - "sensor": None, - "types": types_list or None, - "download_dir": None, - "api_key_env": None, - "db_url": endpoint_url, - "ods": None, - "output_area": None, - } - - return None - - -def ollama_chat( - base_url: str, - model: str, - messages: list[dict[str, str]], - temperature: float = 0.0, - top_p: float = 0.95, - num_predict: int = 256, - num_ctx: int | None = None, - keep_alive: str | None = None, - request_timeout_s: float = 120.0, - force_json: bool = True, -) -> dict[str, Any]: - """Call Ollama's /api/chat and return parsed JSON.""" - url = base_url.rstrip("/") + "/api/chat" - payload: dict[str, Any] = { - "model": model, - "messages": messages, - "stream": False, - "options": { - "temperature": float(temperature), - "top_p": float(top_p), - "num_predict": int(num_predict), - }, - } - if num_ctx is not None: - payload["options"]["num_ctx"] = int(num_ctx) - if keep_alive: - payload["keep_alive"] = str(keep_alive) - if force_json: - payload["format"] = "json" - - resp = requests.post(url, json=payload, timeout=(5.0, request_timeout_s)) - resp.raise_for_status() - return resp.json() - - -# ------------------------------- -# Verbose-mode intro helpers -# ------------------------------- - - -def _fetch_model_intro(base_url: str, model_id: str) -> str: - """ - Ask the routing model (plain text) to describe its function briefly. - Safe: if the request fails, returns a static fallback. - """ - try: - resp = ollama_chat( - base_url=base_url, - model=model_id, - messages=[ - {"role": "system", "content": SYSTEM_ROUTER_PROMPT}, - { - "role": "user", - "content": ( - "Briefly describe your function in 4–7 short bullet points without JSON. 
" - "Focus on how natural-language input is turned into a structured command " - "for query_assist.py and what arguments you can infer." - ), - }, - ], - force_json=False, - temperature=0.0, - top_p=0.95, - num_predict=200, - ) - text = extract_assistant_text_from_ollama(resp).strip() - if text.startswith("{") and text.endswith("}"): - try: - obj = json.loads(text) - text = obj.get("description") or obj.get("text") or text - except Exception: - pass - return text - except Exception as e: - logging.getLogger("nl_query_cli").debug("Intro fetch failed: %s", e) - return ( - "- Routes natural-language queries to one of three commands: " - "download_assets, ods_to_uprn, uprns_by_output_area.\n" - "- Extracts UPRNs/ODS/Output Areas plus optional sensor, types, " - "download_dir, api_key_env, db_url.\n" - "- Maps asset phrases to canonical IRIs (e.g., merged lidar → did:lidar-pointcloud-merged).\n" - "- Builds argv for query_assist.py and executes it (unless --dry-run).\n" - "- Uses heuristics, few-shots, and optional summarization for robustness." 
- ) - - -def _render_box(title: str, body: str) -> str: - """Render a Unicode box with a title and wrapped body.""" - term_width = shutil.get_terminal_size(fallback=(100, 24)).columns - max_width = max(60, min(term_width - 2, 100)) - wrap_width = max_width - 4 - - body_lines = [] - for para in body.splitlines(): - if not para.strip(): - body_lines.append("") - else: - body_lines.extend(textwrap.wrap(para, width=wrap_width)) - - title = title.strip() - title_line = f" {title} " - top = "┌" + "─" * (max_width - 2) + "┐" - sep = "├" + "─" * (max_width - 2) + "┤" - bot = "└" + "─" * (max_width - 2) + "┘" - - if len(title_line) <= (max_width - 2): - left = (max_width - 2 - len(title_line)) // 2 - right = max_width - 2 - len(title_line) - left - top = "┌" + "─" * left + title_line + "─" * right + "┐" - - content = "\n".join("│ " + line.ljust(max_width - 4) + " │" for line in body_lines) - return "\n".join([top, sep, content, bot]) - - -def _print_intro_banner(base_url: str, model_id: str) -> None: - # Static standard description (avoid per-run model call for speed & determinism) - static_intro = ( - "- Parses user queries to detect intent (download assets, convert ODS to UPRN, or fetch UPRNs by output area).\n" - "- Extracts key identifiers (UPRN, ODS code, output area code) from text or file paths.\n" - '- Maps asset type phrases (e.g., "RGB image", "merged lidar point cloud") to predefined IRIs.\n' - "- Fills optional parameters (sensor, download directory, API key env, SPARQL endpoint) from context or heuristics.\n" - "- Uses heuristics + few-shot LLM + optional summarization to build argv for query_assist.py." 
- ) - banner = _render_box(f"query_assist.py Router — {model_id}", static_intro) - print(banner) - print() - - -# ------------------------------- -# Core turn runner -# ------------------------------- - - -def run_once( - base_url: str, - model_id: str, - nl: str, - qa_path: str, - py_exe: str, - dry_run: bool, - temperature: float, - top_p: float, - num_predict: int, - num_ctx: int | None, - keep_alive: str | None, - force_json: bool, - debug_model: bool, - summarize: bool, - summary_model: str, - summary_temperature: float, - show_summary: bool, -) -> int: - """ - One-shot turn: summarize (optional) → route spec → build argv → (dry) run query_assist.py. - Only DEBUG shows JSON specs and raw model content. - """ - original_nl = nl - # log.info("Request: %s", original_nl) - - # --- Summarization (independent of routing) --- - summary_router_text = None - summary_obj: dict[str, Any] | None = None - if summarize: - try: - sum_resp = ollama_chat( - base_url=base_url, - model=summary_model, - messages=[ - {"role": "system", "content": SUMMARY_SYSTEM_PROMPT}, - {"role": "user", "content": nl}, - ], - temperature=summary_temperature, - top_p=top_p, - num_predict=256, - num_ctx=num_ctx, - keep_alive=keep_alive, - force_json=True, - ) - if debug_model: - log.debug( - "Summarizer raw JSON response:\n%s", - json.dumps(sum_resp, indent=2, ensure_ascii=False), - ) - sum_content = extract_assistant_text_from_ollama(sum_resp) - log.debug("Summarizer extracted content: %r", sum_content) - - if sum_content.strip(): - try: - summary_obj = json.loads(sum_content) - except Exception: - try: - blob = slice_first_json_object(sum_content) - summary_obj = json.loads(blob) - except Exception: - summary_obj = None - - if not summary_obj: - # Heuristic fallback summary aligned with heuristic_parse - lowered = nl.lower() - fallback_bullets: list[str] = [] - extracted: dict[str, Any] = {} - - csv_paths = _find_csv_paths(nl) - oa_fb = re.findall(r"\bE\d{8}\b", nl) - ods_fb = 
re.findall(r"\b[A-Z]\d{5}\b", nl) - uprn_fb = re.findall(r"\b\d{6,}\b", nl) - - wants_merged = bool( - re.search(r"merged\s+(lidar\s+)?point\s*clouds?", lowered) - or "merged lidar" in lowered - ) - wants_rgb = ("rgb" in lowered) and ( - "image" in lowered or "images" in lowered - ) - - if csv_paths: - if "uprn" in lowered: - extracted["uprn"] = csv_paths - fallback_bullets.append(f"UPRNs CSV: {', '.join(csv_paths)}") - elif ( - ("output area" in lowered) - or ("output areas" in lowered) - or ("oa" in lowered.split()) - ): - extracted["output_area"] = csv_paths - fallback_bullets.append( - f"Output-area CSV: {', '.join(csv_paths)}" - ) - else: - extracted["uprn"] = csv_paths - fallback_bullets.append( - f"UPRNs CSV (assumed): {', '.join(csv_paths)}" - ) - - if not csv_paths: - if oa_fb: - extracted["output_area"] = oa_fb - fallback_bullets.append(f"Output areas: {', '.join(oa_fb)}") - if ods_fb and not uprn_fb: - extracted["ods"] = ods_fb - fallback_bullets.append(f"ODS: {', '.join(ods_fb)}") - if uprn_fb: - extracted["uprn"] = uprn_fb - fallback_bullets.append(f"UPRNs: {', '.join(uprn_fb)}") - - if wants_merged: - extracted["types"] = list( - set( - (extracted.get("types") or []) - + ["did:lidar-pointcloud-merged"] - ) - ) - fallback_bullets.append("Type: merged lidar pointcloud") - if wants_rgb: - extracted["types"] = list( - set((extracted.get("types") or []) + ["did:rgb-image"]) - ) - fallback_bullets.append("Type: RGB image") - - url_fb = re.search(r"(https?://[\w\.-:%/]+)", nl) - if url_fb: - extracted["db_url"] = url_fb.group(1) - fallback_bullets.append(f"Endpoint: {url_fb.group(1)}") - - if "uprn" in extracted: - router_text = ( - f"Download assets for UPRN(s) {', '.join(extracted['uprn'])}" - ) - elif "output_area" in extracted: - router_text = f"List UPRNs in output areas {', '.join(extracted['output_area'])}" - elif "ods" in extracted: - router_text = f"Map ODS {', '.join(extracted['ods'])} to UPRNs" - else: - router_text = "no-op" - if not 
fallback_bullets: - fallback_bullets.append("No actionable request") - - summary_obj = { - "bullets": fallback_bullets, - "router_text": router_text, - "extracted": extracted, - } - - bullets = summary_obj.get("bullets") or [] - summary_router_text = summary_obj.get("router_text") or None - if show_summary and bullets and bullets != ["No actionable request"]: - log.info("Summary: %s", " | ".join(bullets)) - - except Exception as e: - if show_summary: - log.info("Summary step failed: %s (continuing with original input)", e) - - candidate_text = summary_router_text or nl - - # --- Try summary-extracted direct spec first --- - if summary_obj and isinstance(summary_obj.get("extracted"), dict): - ex = summary_obj["extracted"] - inferred_command = None - if ex.get("ods"): - inferred_command = "ods_to_uprn" - if ex.get("output_area"): - inferred_command = "uprns_by_output_area" - if ex.get("uprn"): - inferred_command = "download_assets" - - if inferred_command: - spec_direct = { - "command": inferred_command, - "uprn": ex.get("uprn"), - "ods": ex.get("ods"), - "output_area": ex.get("output_area"), - "sensor": ex.get("sensor"), - "types": ex.get("types"), - "download_dir": ex.get("download_dir"), - "api_key_env": ex.get("api_key_env"), - "db_url": ex.get("db_url"), - } - if spec_direct.get("types") is None and "merged" in candidate_text.lower(): - spec_direct["types"] = ["did:lidar-pointcloud-merged"] - - try: - argv = build_argv(spec_direct, py_exe, qa_path) - log.debug( - "Router JSON (summary extracted):\n%s", - json.dumps(spec_direct, indent=2), - ) - log.info("Command: %s", " ".join([shlex.quote(x) for x in argv])) - if dry_run: - return 0 - proc = subprocess.run(argv) - return proc.returncode - except Exception: - pass # fall through - - # --- Heuristic fast path --- - heuristic_spec = heuristic_parse(candidate_text) - if heuristic_spec: - log.debug("Heuristic spec: %s", json.dumps(heuristic_spec)) - spec = heuristic_spec - argv = build_argv(spec, py_exe, qa_path) - 
log.info("Command: %s", " ".join([shlex.quote(x) for x in argv])) - if dry_run: - return 0 - proc = subprocess.run(argv) - return proc.returncode - - # --- Model routing --- - messages = [{"role": "system", "content": SYSTEM_ROUTER_PROMPT}] - for u, a in FEW_SHOTS: - messages.append({"role": "user", "content": u}) - messages.append({"role": "assistant", "content": a}) - messages.append({"role": "user", "content": candidate_text}) - - resp = ollama_chat( - base_url=base_url, - model=model_id, - messages=messages, - temperature=temperature, - top_p=top_p, - num_predict=num_predict, - num_ctx=num_ctx, - keep_alive=keep_alive, - force_json=force_json, - ) - if debug_model: - log.debug( - "Primary model raw JSON:\n%s", - json.dumps(resp, indent=2, ensure_ascii=False), - ) - - content = extract_assistant_text_from_ollama(resp) - log.debug("Primary model extracted content: %r", content) - - spec = None - if content.strip(): - try: - spec = json.loads(content) - except Exception: - try: - blob = slice_first_json_object(content) - spec = json.loads(blob) - except Exception: - spec = None - - if spec is None and force_json: - log.debug("Empty/invalid JSON content; retrying without format=json...") - resp2 = ollama_chat( - base_url=base_url, - model=model_id, - messages=messages, - temperature=temperature, - top_p=top_p, - num_predict=num_predict, - num_ctx=num_ctx, - keep_alive=keep_alive, - force_json=False, - ) - if debug_model: - log.debug( - "Secondary model raw JSON:\n%s", - json.dumps(resp2, indent=2, ensure_ascii=False), - ) - content2 = extract_assistant_text_from_ollama(resp2) - log.debug("Secondary model extracted content: %r", content2) - try: - spec = json.loads(content2) - except Exception: - try: - blob = slice_first_json_object(content2) - spec = json.loads(blob) - except Exception: - spec = None - - if spec is None: - heuristic_spec = heuristic_parse(nl) - if heuristic_spec: - log.debug( - "Fallback to heuristic after model failure: %s", - 
json.dumps(heuristic_spec), - ) - spec = heuristic_spec - - if not spec or spec.get("command") not in { - "download_assets", - "ods_to_uprn", - "uprns_by_output_area", - }: - log.warning( - "No actionable command inferred.\n" - "Try examples like:\n" - " • 'Download the merged lidar point cloud for UPRN 5045394'\n" - " • 'Map ODS G85013 to UPRNs'\n" - " • 'List all UPRNs in output areas E00004550 and E00032882'" - ) - log.debug("Raw model content:\n%s", content.strip()) - return 0 - - argv = build_argv(spec, py_exe, qa_path) - log.debug("Router JSON:\n%s", json.dumps(spec, indent=2)) - log.info("Command: %s", " ".join([shlex.quote(x) for x in argv])) - - if dry_run: - return 0 - - proc = subprocess.run(argv) - return proc.returncode - - -def main(): - ap = argparse.ArgumentParser( - description="Natural-language interface for query_assist.py using Ollama" - ) - ap.add_argument("--model-id", default="gpt-oss:20b", help="Ollama model name/tag") - ap.add_argument( - "--query-assist-path", - default=os.path.join(os.path.dirname(__file__), "query_assist.py"), - help="Path to query_assist.py", - ) - ap.add_argument( - "--base-url", - default=os.environ.get("OLLAMA_HOST", "http://localhost:11434"), - help="Base URL of the Ollama server (or set OLLAMA_HOST)", - ) - ap.add_argument( - "--dry-run", - action="store_true", - help="Only log the derived command; do not execute", - ) - ap.add_argument( - "--once", "-q", help="Run a single NL query and exit (non-interactive)" - ) - - # Decoding / runtime knobs - ap.add_argument( - "--temperature", type=float, default=0.0, help="Sampling temperature" - ) - ap.add_argument( - "--top-p", type=float, default=0.95, help="Nucleus sampling probability" - ) - ap.add_argument( - "--num-predict", type=int, default=256, help="Max new tokens to generate" - ) - ap.add_argument( - "--num-ctx", - type=int, - default=None, - help="Context window size hint (model-dependent)", - ) - ap.add_argument( - "--keep-alive", default=None, help="Ollama 
keep-alive (e.g., '5m', '30m', '0')" - ) - ap.add_argument( - "--no-force-json", - action="store_true", - help="Do not set format='json' in the chat call", - ) - - # Logging controls - ap.add_argument( - "-v", - "--verbose", - action="count", - default=0, - help="Increase verbosity ( -v = info, -vv = debug )", - ) - ap.add_argument( - "--debug-model", - action="store_true", - help="Force DEBUG and include raw model responses", - ) - - # Summarization controls - ap.add_argument( - "--no-summarize", - action="store_true", - help="Disable preliminary summarization step", - ) - ap.add_argument( - "--summary-model", - default=None, - help="Model ID for summarization (defaults to routing model)", - ) - ap.add_argument( - "--summary-temperature", - type=float, - default=0.0, - help="Temperature for summarization model", - ) - ap.add_argument( - "--hide-summary", action="store_true", help="Do not log summarization bullets" - ) - - args = ap.parse_args() - - # Configure logging - if args.debug_model or args.verbose >= 2: - level = logging.DEBUG - elif args.verbose == 1: - level = logging.INFO - else: - level = logging.WARNING - - logging.basicConfig(level=level, format="%(levelname)s: %(message)s") - - # Verbose-mode introductory banner - if level <= logging.INFO: - _print_intro_banner(args.base_url, args.model_id) - - py_exe = sys.executable - qa_path = args.query_assist_path - - try: - if args.once: - rc = run_once( - base_url=args.base_url, - model_id=args.model_id, - nl=args.once, - qa_path=qa_path, - py_exe=py_exe, - dry_run=args.dry_run, - temperature=args.temperature, - top_p=args.top_p, - num_predict=args.num_predict, - num_ctx=args.num_ctx, - keep_alive=args.keep_alive, - force_json=(not args.no_force_json), - debug_model=args.debug_model, - summarize=(not args.no_summarize), - summary_model=(args.summary_model or args.model_id), - summary_temperature=args.summary_temperature, - show_summary=not args.hide_summary, - ) - sys.exit(rc) - - if level <= logging.INFO: - 
print("NL router for query_assist.py. Type 'exit' or Ctrl-D to quit.") - while True: - try: - nl = input("> ").strip() - except EOFError: - break - if not nl: - continue - if nl.lower() in {"exit", "quit"}: - break - rc = run_once( - base_url=args.base_url, - model_id=args.model_id, - nl=nl, - qa_path=qa_path, - py_exe=py_exe, - dry_run=args.dry_run, - temperature=args.temperature, - top_p=args.top_p, - num_predict=args.num_predict, - num_ctx=args.num_ctx, - keep_alive=args.keep_alive, - force_json=(not args.no_force_json), - debug_model=args.debug_model, - summarize=(not args.no_summarize), - summary_model=(args.summary_model or args.model_id), - summary_temperature=args.summary_temperature, - show_summary=not args.hide_summary, - ) - if rc != 0: - log.warning("Subprocess exited with code %d", rc) - except KeyboardInterrupt: - # Cleanly handle Ctrl-C in REPL - print() - log.info("Interrupted.") - sys.exit(130) - except Exception as e: - log.error("Fatal error: %s", e) - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/examples/query_assist.py b/examples/query_assist.py index 6cb54fe..fd63b1f 100644 --- a/examples/query_assist.py +++ b/examples/query_assist.py @@ -111,7 +111,6 @@ def build_asset_query(uprn_list, args): quoted_uprns = ", ".join(f'"{u}"' for u in uprn_list) where.append(f" FILTER(str(?uprnValue) IN ({quoted_uprns}))") if args.types: - # Assuming types are full IRIs, wrap them in <> quoted_types = ", ".join(f"<{t.strip()}>" for t in args.types.split(",")) where.append(f" FILTER(?enum IN ({quoted_types}))") @@ -160,7 +159,6 @@ def build_ods_to_uprn_query(ods_list): def download_asset(url: str, save_dir: str, api_key: str): """Downloads a single asset from a URL to a specified directory.""" try: - # Use a client with a longer timeout for potentially large files with httpx.Client(timeout=120.0) as client: resp = client.get(url, headers={"x-api-key": api_key}) resp.raise_for_status() @@ -199,7 +197,7 @@ def main(): 
ods_list.extend(load_column_from_csv(entry, "ods")) else: ods_list.extend(o.strip() for o in entry.split(",") if o.strip()) - ods_list = sorted(list(dict.fromkeys(ods_list))) # Sort for consistent query + ods_list = sorted(list(dict.fromkeys(ods_list))) store = SPARQLStore(query_endpoint=args.db_url, returnFormat="json") q = build_ods_to_uprn_query(ods_list) @@ -277,7 +275,7 @@ def main(): logging.error( f"API key environment variable {args.api_key_env!r} is not set." ) - return # Exit gracefully + return store = SPARQLStore(query_endpoint=args.db_url, returnFormat="json") q = build_asset_query(uprn_list, args) @@ -294,7 +292,6 @@ def main(): if isinstance(phenomenon_time_obj, datetime): date_str = phenomenon_time_obj.strftime("%Y-%m-%d") else: - # Fallback for unexpected date formats date_str = str(phenomenon_time_obj).split("T")[0] asset_type_subdir = asset_subdir(enum_iri) From 4e2bcf4c138830708c0038958c27b40781879710 Mon Sep 17 00:00:00 2001 From: gnathoi Date: Wed, 29 Oct 2025 12:39:46 +0000 Subject: [PATCH 16/19] change: new triplestore endpoint --- examples/query_assist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/query_assist.py b/examples/query_assist.py index fd63b1f..ece4c0a 100644 --- a/examples/query_assist.py +++ b/examples/query_assist.py @@ -43,7 +43,7 @@ def parse_args(): ) parser.add_argument( "--db-url", - default="http://ec2-18-175-116-201.eu-west-2.compute.amazonaws.com:3030/didtriplestore/query", + default="http://ec2-3-10-233-191.eu-west-2.compute.amazonaws.com:3030/mytriplestore/query", help="SPARQL endpoint URL", ) parser.add_argument( From e38445cbd19bd84e846e8e42e04d995d5a76f6e7 Mon Sep 17 00:00:00 2001 From: gnathoi Date: Wed, 29 Oct 2025 12:45:53 +0000 Subject: [PATCH 17/19] change: readme --- CHANGELOG.md | 2 + examples/README.md | 225 ++++++++++-------- examples/br_decompress.py | 85 ------- .../get_all_assets_for_a_list_of_uprns.py | 119 --------- examples/get_all_assets_for_a_uprn.py | 105 -------- 
..._all_assets_for_a_uprn_made_by_a_sensor.py | 130 ---------- ...et_all_assets_of_type_for_list_of_uprns.py | 123 ---------- 7 files changed, 128 insertions(+), 661 deletions(-) delete mode 100644 examples/br_decompress.py delete mode 100644 examples/get_all_assets_for_a_list_of_uprns.py delete mode 100644 examples/get_all_assets_for_a_uprn.py delete mode 100644 examples/get_all_assets_for_a_uprn_made_by_a_sensor.py delete mode 100644 examples/get_all_assets_of_type_for_list_of_uprns.py diff --git a/CHANGELOG.md b/CHANGELOG.md index d56b042..5573c5f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,9 +10,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Add a `CODEOWNERS` file for ownership management. - Add a `CHANGELOG.md` file for ckear documentation of the development of the asset-api. +- Add `nl_query_assist.py` file for simple natural language querying. ### Changed - Refactor the query_assist.py output folder structure for dates of observations. +- New triplestore endpoint. --- diff --git a/examples/README.md b/examples/README.md index 741662d..853db94 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,6 +1,17 @@ # Python examples -This directory contains a few Python scripts that will load and provision the graph database with provided turtle (.ttl) files and execute queries against it. Some will also then use results from queries to download assets from the API. +This directory contains Python scripts for interacting with the DID triplestore (SPARQL endpoint) and the Asset API: + +1. `query_assist.py` – Structured CLI for downloading assets, listing UPRNs by output area, or mapping ODS→UPRN. +2. `nl_query_assist.py` – Natural language (NL) wrapper that plans and executes one- or two-stage workflows using `query_assist.py` underneath. + +The scripts can: + +- Discover UPRNs by output area codes. +- Map NHS ODS codes to UPRNs (with recommendation codes). 
+- Download assets for specified UPRNs with optional sensor and asset-type filters. +- Accept CSV file inputs for batch operations. +- Use NL instructions (via `nl_query_assist.py`) to infer plans automatically. ## Setup @@ -19,13 +30,11 @@ source venv/bin/activate ``` **Windows (CMD)** - ```bash venv\Scripts\activate.bat ``` **Windows (PowerShell)** - ```powershell venv\Scripts\Activate.ps1 ``` @@ -39,103 +48,47 @@ pip install -r requirements.txt ### 4. Set your API key **Temporary (session only)** - ```bash export API_KEY="your_api_key" ``` -**Permanent (automatic when Virtualenv is activated)** - -If you want this variable to be automatically set every time you activate your virtual environment, add the export line to the activate script inside your virtual environment. For example (macOS/Linux): - -Open `venv/bin/activate` and add the environment variable near the bottom (but before any final `unset` lines if present). +## Running the structured CLI (`query_assist.py`) -## Running scripts +`query_assist.py` supports three modes (mutually exclusive per invocation): -Once your environment is set up then you can run any of the `.py` files in the `/examples` directory. **Before you run a script** verify that the constants in the file are set correctly to match your local environment. +1. Asset download: `--uprn` (one/many or CSV path with column `uprn`) +2. Output area → UPRN listing: `--output-area` / `--oa` (codes or CSV path column `output_area`) +3. 
ODS → UPRN mapping: `--ods` (codes or CSV path column `ods`) -### Scripts +Optional filters / overrides: +- `--sensor bess:OusterLidarSensor` (or other supported sensor IRI) +- `--types did:rgb-image,did:lidar-pointcloud-merged` (comma separated IRIs) +- `--db-url http://host:3030/didtriplestore/query` (override SPARQL endpoint) +- `--download-dir /path/to/downloads` (default `./downloads`) +- `--api-key-env MY_KEY` (environment var containing API key; default `API_KEY`) -Granular scripts to improve legibility when viewing the SPARQL queries: -- `get_all_assets_for_a_list_of_uprns.py` -- `get_all_assets_for_a_uprn.py` -- `get_all_assets_for_a_uprn_made_by_a_sensor.py` -- `get_all_assets_of_type_for_list_of_uprns.py` - -Unified script: -- `query_assist.py` - -This unified script replaces and extends the above utilities by allowing you to: - -- Specify one or more UPRNs via **`--uprn`** (space- or comma-separated), or provide a CSV file path (column `uprn`) to `--uprn`. -- Specify one or more ODS codes via **`--ods`** (space- or comma-separated), or provide a CSV file path (column `ods`) to `--ods` for ODS→UPRN mapping. -- Specify one or more output-area IRIs or codes via **`--output-area`**/`--oa` (space- or comma-separated), or provide a CSV file path (column `output_area`) to list UPRNs by output area. -- Filter by **sensor** type (`--sensor`, e.g. `bess:OusterLidarSensor`). -- Filter by **asset type** (`--types`, e.g. `did:rgb-image,did:lidar-pointcloud-merged`). -- Override the **SPARQL endpoint** (`--db-url`). -- Change the **download directory** (`--download-dir`). -- Use a custom **API key** environment variable (`--api-key-env`). 
- -#### Supported sensors - -- `bess:PhidgetHumiditySensor` -- `bess:PhidgetTemperatureSensor` -- `bess:OusterLidarSensor` -- `bess:FlirOryxCamera` -- `bess:FlirA70Camera` - -#### Supported asset types - -- **Merged lidar point clouds**: `did:lidar-pointcloud-merged` -- **Pointcloud frame**: `did:lidar-pointcloud-frame` -- **Lidar range panorama images**: `did:lidar-range-pano` -- **Lidar reflectance for panorama**: `did:lidar-reflectance-pano` -- **Lidar signal intensity for panoramas**: `did:lidar-signal-pano` -- **Lidar Near Infrared for panoramas**: `did:lidar-nearir-pano` -- **Temperature in celsius** (no contentUrl): `did:celsius-temperature` -- **Relative humidity** (no contentUrl): `did:relative-humidity` -- **IR false colour**: `did:ir-false-color-image` -- **IR temperature array**: `did:ir-temperature-array` -- **IR counts**: `did:ir-count-image` -- **RGB image**: `did:rgb-image` - -Pointclouds are brotli compressed .pcd files. These can be decompressed using the Brotli CLI tool +Example usages: ```bash -brew install brotli -``` - -Or using the `br_decompress.py` script. 
- -```bash -python3 br_decompress.py --directory ./downloads -``` - -#### `query_assist.py` Usage - -```bash -# Single UPRN +# Single UPRN asset download python3 query_assist.py --uprn 100023334911 -# Multiple UPRNs (space-separated) +# Multiple UPRNs (space separated) python3 query_assist.py --uprn 100023334911 100023268138 -# Multiple UPRNs (comma-separated) -python3 query_assist.py --uprn 100023334911, 100023268138, 46251044 +# Multiple UPRNs (comma separated in one argument) +python3 query_assist.py --uprn 100023334911,100023268138,46251044 -# CSV-only for UPRNs +# CSV of UPRNs python3 query_assist.py --uprn path/to/uprns.csv -# ODS→UPRN mapping with recommendation code A (accepted) I (intervention recommended) +# ODS→UPRN mapping python3 query_assist.py --ods G85013 -# Output-area mode (single code) -python3 query_assist.py --output-area E00004550 - -# Output-area mode (multiple codes) +# Output areas → UPRN listing (mixed raw codes) python3 query_assist.py --output-area E00004550 E00032882 E00063193 E00047411 -# CSV-only for output-area +# CSV of output areas python3 query_assist.py --output-area path/to/areas.csv # Sensor filter @@ -154,7 +107,7 @@ python3 query_assist.py --uprn 5045394 --download-dir /data/assets export MY_KEY="..." python3 query_assist.py --uprn 5045394 --api-key-env MY_KEY -# A Few options at once +# Multiple options combined export MY_KEY="..." python3 query_assist.py \ --uprn 200003455212,5045394 \ @@ -165,39 +118,113 @@ python3 query_assist.py \ --api-key-env MY_KEY ``` -Run `python3 query_assist.py -h` to see the full list of command-line options and examples. +Run `python3 query_assist.py -h` for full help text. + +## Natural language workflow (`nl_query_assist.py`) + +`nl_query_assist.py` lets you describe tasks conversationally; it plans steps (e.g. output-area lookup → asset download) and calls `query_assist.py` accordingly. 
+ +Prerequisite: An [Ollama](https://ollama.com/) server must be running locally (or remotely) with the desired model already pulled. Set the server URL via `export OLLAMA_HOST=http://host:port` (defaults to `http://localhost:11434`). Pull a model first, e.g.: + +```bash +ollama pull gpt-oss:20b +``` + +If you use a different model tag, pass it with `--model-id`. + +Key flags: +- `--once "your NL request"` run a single NL instruction and exit. +- `--dry-run` plan and show commands without executing downloads. +- `--plan-only` output the inferred plan (JSON-like) and exit. +- `--model-id gpt-oss:20b` choose Ollama model (set `OLLAMA_HOST` to change server URL). +- Decoding knobs: `--temperature`, `--top-p`, `--num-predict`, `--num-ctx`, `--keep-alive`, `--no-force-json`. + +Interactive session: +```bash +python3 nl_query_assist.py +> download merged lidar point clouds and rgb images for UPRNs 5045394 and 200003455212 into /tmp/assets +``` + +Single command: +```bash +python3 nl_query_assist.py --once "list UPRNs in output areas E00004550 and E00032882 then download rgb images" +``` + +Dry run: +```bash +python3 nl_query_assist.py --dry-run --once "download point clouds for ODS G85013" +``` + +Verbose (show planning internals): +```bash +python3 nl_query_assist.py -vv --once "rgb images for UPRNs in areas E00004550,E00032882" +``` + +## Supported sensors + +- `bess:PhidgetHumiditySensor` +- `bess:PhidgetTemperatureSensor` +- `bess:OusterLidarSensor` +- `bess:FlirOryxCamera` +- `bess:FlirA70Camera` + +## Supported asset types + +- Merged lidar point clouds: `did:lidar-pointcloud-merged` +- Pointcloud frame: `did:lidar-pointcloud-frame` +- Lidar range panorama images: `did:lidar-range-pano` +- Lidar reflectance panorama images: `did:lidar-reflectance-pano` +- Lidar signal intensity panorama images: `did:lidar-signal-pano` +- Lidar Near Infrared panorama images: `did:lidar-nearir-pano` +- Temperature in celsius (no contentUrl): `did:celsius-temperature` +- Relative 
humidity (no contentUrl): `did:relative-humidity` +- IR false colour images: `did:ir-false-color-image` +- IR temperature arrays: `did:ir-temperature-array` +- IR counts images: `did:ir-count-image` +- RGB images: `did:rgb-image` + +Point clouds are now provided as LAZ (.laz) compressed files. Most point cloud processing tools (e.g. PDAL, CloudCompare, Potree converters) handle `.laz` directly—no manual decompression step is required. + +## Additional Data Information +### RGB -# Additional Data Information +sRGB images are optimised for computer vision tasks. Vehicles and humans are masked out automatically. Please report any unmasked person or vehicle (especially number plates) to [xRI](mailto:info@xri.online). -## RGB +### IR -sRGB images are provided in the API at a resolution optimised for computer vision tasks. Vehicles and humans are masked out using an automated process, if a user finds an unmasked person or vehicle (most critically the number plate), please report it to [xRI](mailto:info@xri.online). +Edge regions are masked due to sensor heating. In temperature arrays masked regions are NaN. Radiometric assumptions: +1. Pixel distance is currently a sensible hard-coded value (dynamic derivation from LiDAR is in progress). +2. Emissivity is assumed constant (typical building materials fall in $\epsilon \in [0.85, 0.93]$; dynamic estimation is in development). +3. Daytime data is reflectance dominated; radiometric temperatures are only provided for night hours (1h after sunset to 1h before sunrise). +4. Sky regions fall outside reliable radiometric interpretation and are excluded. -## IR +### LiDAR -The outermost regions of the IR images and temperature arrays have been masked out, this is due to hot edges due to the IR detector heating itself up during operation. In the temperature arrays the masked areas are NaN elements in the compressed numpy array. 
+Four 360° grayscale panoramas are provided: -Additionally when working with the IR data there are some assumptions to note about the way in which radiometic temperature pixels themselves are calculated. +- Near-infrared (NIR): captures near-infrared spectrum for vegetation and surface texture analysis. +- Range: distance (mm) from sensor to objects (depth map). +- Reflectance: intensity of returned signal (material/angle dependent). +- Signal strength: quality of LiDAR return (helps assess reliability & environmental conditions). -- The formula requires the distance of each pixel from the detector, currently this is a sensible hard coded value, we are in the process of calculating these distances from the lidar. -- Building materials tend to be in a narrow range of emissivities $\epsilon\in [0.85,0.93]$, we currently hard code a single sensible value for emissivity but are developing methods for estimating building materials dynamically. -- During the day, we are in a reflectance dominated regime due to the influence of the sun, radiometric temperatures calculated in this regime are not reliable. Thermal data is provided for the night hours only (1 hour after sunset to 1 hour before sunrise). -- The sky is an object outside the scope of the radiometric temperature calculation, this is a low reflectance, low emissivity regime that our radiometric temperature calculations cannot say anything meaningful about. +Point cloud modalities: -## LiDAR +- Merged point cloud: dense, registered aggregate from multiple frames using Iterative Closest Point (ICP). +- Single frame point cloud: most orthogonal frame (fallback if ICP merge is unusable). -We have four 360 degree grey scale panormas these are: +ICP can fail, producing dense but misaligned merged clouds; use the single frame as a fallback. -- Near-infrared (NIR) capturing light in the near-infrared spectrum (just beyond visible light). 
NIR is often used to assess vegetation health, surface properties, and for capturing detailed textures in low-light conditions. +## Troubleshooting -- The range modality provides the distance from the LiDAR sensor to objects in the environment. Each pixel in this image represents a distance measurement in millimeters, creating a depth map of the scene. +- Missing downloads? Ensure the API key environment variable (`API_KEY` or your override) is exported in the same shell session. +- Empty CSV outputs: Verify the codes (UPRN / ODS / Output area) exist in the triplestore and that `--db-url` is correct. +- Slow queries: Consider filtering with `--types` and/or `--sensor` to reduce result size. +- NL planning returns "No actionable plan": add explicit codes (e.g. UPRNs) or clarify intent ("download rgb images" vs. "rgb"). -- The reflectivity image captures the intensity of the LiDAR signal that bounces back to the sensor. Reflectivity depends on the surface material and angle of incidence, making it useful for distinguishing between materials or identifying road markings, signs, and other objects. +## License -- The signal strength or return signal intensity measures the quality of the LiDAR return. Stronger signals usually indicate clearer, more reliable measurements. It can also reflect surface properties and environmental conditions. +See the root `LICENSE` file for details. -We also have two pointcloud types one is a single frame that is closest to orthogonal to the UPRN, the other is a dense, orchstrated pointcloud created by merging many pointcloud frames on either side of the most orthogonal frame using the [Iterative Closes Point (ICP) registration algorithm](http://ki-www.cvl.iis.u-tokyo.ac.jp/class2013/2013w/paper/correspondingAndRegistration/03_Levoy.pdf). -ICP registration can also fail completely resulting in dense but unaligned pointclouds. The single centre frame is provided as a failback pointcloud in the event of an unusable merged pointcloud. 
diff --git a/examples/br_decompress.py b/examples/br_decompress.py deleted file mode 100644 index 09fbfa6..0000000 --- a/examples/br_decompress.py +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import logging -import os - -import brotli - -# Configure logging -logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" -) - - -def decompress_and_replace(input_file): - """ - Decompresses a Brotli-compressed .pcd.br file and replaces it with the decompressed .pcd file. - - :param input_file: Path to the Brotli-compressed .pcd.br file - """ - try: - output_file = input_file[:-3] - with open(input_file, "rb") as compressed_file: - compressed_data = compressed_file.read() - - decompressed_data = brotli.decompress(compressed_data) - - with open(output_file, "wb") as decompressed_file: - decompressed_file.write(decompressed_data) - - os.remove(input_file) - - logging.info(f"Decompressed and replaced: {input_file} -> {output_file}") - except Exception as e: - logging.error(f"Error processing {input_file}: {e}") - - -def find_and_replace_pcd_br(directory): - """ - Finds all Brotli-compressed .pcd.br files in the specified directory recursively, - decompresses them, and replaces the original files with the decompressed .pcd files. 
- - :param directory: Path to the directory to search for .pcd.br files - """ - if not os.path.exists(directory): - logging.error(f"Directory not found: {directory}") - return - - # Find all .pcd.br files - pcd_br_files = [] - for root, _, files in os.walk(directory): - for file in files: - if file.endswith(".pcd.br"): - pcd_br_files.append(os.path.join(root, file)) - - if not pcd_br_files: - logging.warning(f"No .pcd.br files found in directory: {directory}") - return - - logging.info(f"Found {len(pcd_br_files)} .pcd.br files to decompress.") - - for file_path in pcd_br_files: - decompress_and_replace(file_path) - - logging.info("All decompression and replacement tasks completed.") - - -def main(): - parser = argparse.ArgumentParser( - description="Decompress Brotli-compressed .pcd.br files." - ) - parser.add_argument( - "-d", - "--directory", - default=os.path.join(os.path.dirname(os.path.abspath(__file__)), "./downloads"), - help="Directory to search for .pcd.br files (default: ./downloads)", - ) - args = parser.parse_args() - - logging.info(f"Using directory: {args.directory}") - find_and_replace_pcd_br(args.directory) - - -if __name__ == "__main__": - main() diff --git a/examples/get_all_assets_for_a_list_of_uprns.py b/examples/get_all_assets_for_a_list_of_uprns.py deleted file mode 100644 index 0d30e6d..0000000 --- a/examples/get_all_assets_for_a_list_of_uprns.py +++ /dev/null @@ -1,119 +0,0 @@ -import logging -import os -import re - -import httpx -from rdflib.plugins.stores.sparqlstore import SPARQLStore -from rdflib.query import ResultRow - -# Configure logging -logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" -) - -# --- Configuration --------------------------------------------------------- - -# SPARQL endpoint -DB_URL = "http://ec2-18-175-116-201.eu-west-2.compute.amazonaws.com:3030/didtriplestore/query" -endpoint = SPARQLStore(query_endpoint=DB_URL, returnFormat="json") - -# Base download directory -here = 
os.path.dirname(os.path.abspath(__file__)) -DOWNLOAD_DIR = os.path.join(here, "downloads") -os.makedirs(DOWNLOAD_DIR, exist_ok=True) - -# Comma-separated list of UPRNs (note the space after each comma) -UPRNs = "200003455212, 5045394" - -# SPARQL query: return both the UPRN value and the asset URL -QUERY = f""" -PREFIX dob: -PREFIX so: -PREFIX sosa: -PREFIX prov: - -SELECT DISTINCT ?uprnValue ?contentUrl -WHERE {{ - # 1) Grab any resource (?res) carrying a contentUrl - ?res - so:contentUrl ?contentUrl . - - # 2) Crawl back through: - # – sosa:hasResult from an Observation - # – prov:generated / prov:used chains from Processing → DerivedResult → Result - # (including any number of chained DerivedResults) - ?res - ( - ^sosa:hasResult - | ^prov:generated / prov:used - )* - / sosa:hasFeatureOfInterest - / so:identifier - / so:value - ?uprnValue . - - # 3) Restrict to only the UPRNs you care about - FILTER (?uprnValue IN ({UPRNs})) -}} -""" - - -# Your API key from environment -API_KEY = os.getenv("API_KEY") -if not API_KEY: - raise ValueError( - "API_KEY environment variable is not set. Please set it to your API key." 
- ) - - -# --- Helper to download a single asset into a given folder ---------------- - - -def download_asset(url: str, save_dir: str) -> None: - try: - resp = httpx.get(url, headers={"x-api-key": API_KEY}) - resp.raise_for_status() - - # Derive filename from Content-Disposition or fallback to URL basename - cd_header = resp.headers.get("Content-Disposition", "") - m = re.search(r'filename="([^"]+)"', cd_header) - filename = m.group(1) if m else os.path.basename(url) - - # Ensure the target folder exists - os.makedirs(save_dir, exist_ok=True) - save_path = os.path.join(save_dir, filename) - - # Write out the file - with open(save_path, "wb") as f: - f.write(resp.content) - - logging.info(f"✔ Saved {url} → {save_path}") - except Exception as e: - logging.error(f"✖ Failed to download {url}: {e}") - - -# --- Main execution -------------------------------------------------------- - - -def main(): - # Run the SPARQL query - results = endpoint.query(QUERY) - - # Iterate and dispatch each download into its UPRN folder - for row in results: - if not isinstance(row, ResultRow): - continue - - uprn_val = str(row["uprnValue"]) - content_url = str(row["contentUrl"]) - uprn_folder = os.path.join(DOWNLOAD_DIR, uprn_val) - - logging.info(f"⤷ Downloading {content_url} into {uprn_folder}/ …") - download_asset(content_url, uprn_folder) - - -if __name__ == "__main__": - try: - main() - except Exception as e: - logging.error(f"Error: {e}") diff --git a/examples/get_all_assets_for_a_uprn.py b/examples/get_all_assets_for_a_uprn.py deleted file mode 100644 index d8db0cd..0000000 --- a/examples/get_all_assets_for_a_uprn.py +++ /dev/null @@ -1,105 +0,0 @@ -import logging -import os -import re - -import httpx -from rdflib.plugins.stores.sparqlstore import SPARQLStore -from rdflib.query import ResultRow - -# Configure logging -logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" -) - -# --- Configuration 
--------------------------------------------------------- - -# SPARQL endpoint -DB_URL = "http://ec2-18-175-116-201.eu-west-2.compute.amazonaws.com:3030/didtriplestore/query" -endpoint = SPARQLStore(query_endpoint=DB_URL, returnFormat="json") - -# Base download directory -here = os.path.dirname(os.path.abspath(__file__)) -DOWNLOAD_DIR = os.path.join(here, "downloads") -os.makedirs(DOWNLOAD_DIR, exist_ok=True) - - -# Query parameters -UPRN = "5045394" - -# SPARQL query: return all asset URLs for the given UPRN -QUERY = f""" -PREFIX dob: -PREFIX rdfs: -PREFIX sosa: -PREFIX so: -PREFIX owl: - - -SELECT DISTINCT ?contentUrl -WHERE {{ - ?result a sosa:Result ; - so:contentUrl ?contentUrl . - ?observation a sosa:Observation ; - sosa:hasResult ?result ; - sosa:hasFeatureOfInterest ?foi . - ?foi a sosa:FeatureOfInterest ; - so:identifier ?uprn . - ?uprn a dob:UPRNValue ; - so:value ?uprnValue . - FILTER(str(?uprnValue) = "{UPRN}") -}} -""" - -# Your API key from environment -API_KEY = os.getenv("API_KEY") -if not API_KEY: - raise ValueError( - "API_KEY environment variable is not set. Please set it to your API key." 
- ) - -# --- Helper to download a single asset into a given folder ---------------- - - -def download_asset(url: str, save_dir: str) -> None: - try: - resp = httpx.get(url, headers={"x-api-key": API_KEY}) - resp.raise_for_status() - - # Derive filename from Content-Disposition or fallback to URL basename - cd_header = resp.headers.get("Content-Disposition", "") - m = re.search(r'filename="([^"]+)"', cd_header) - filename = m.group(1) if m else os.path.basename(url) - - os.makedirs(save_dir, exist_ok=True) - path = os.path.join(save_dir, filename) - - with open(path, "wb") as f: - f.write(resp.content) - - logging.info(f"✔ Saved {url} → {path}") - except Exception as e: - logging.error(f"✖ Failed to download {url}: {e}") - - -# --- Main execution -------------------------------------------------------- - - -def main(): - # Run the SPARQL query - results = endpoint.query(QUERY) - - # Download each asset into the base download directory - for row in results: - if not isinstance(row, ResultRow): - continue - - content_url = str(row["contentUrl"]) - logging.info(f"⤷ Downloading {content_url} …") - download_asset(content_url, DOWNLOAD_DIR) - - -if __name__ == "__main__": - try: - main() - except Exception as e: - logging.error(f"Error: {e}") diff --git a/examples/get_all_assets_for_a_uprn_made_by_a_sensor.py b/examples/get_all_assets_for_a_uprn_made_by_a_sensor.py deleted file mode 100644 index 52061b7..0000000 --- a/examples/get_all_assets_for_a_uprn_made_by_a_sensor.py +++ /dev/null @@ -1,130 +0,0 @@ -import logging -import os -import re - -import httpx -from rdflib.plugins.stores.sparqlstore import SPARQLStore -from rdflib.query import ResultRow - -# Configure logging -logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" -) - -# --- Configuration --------------------------------------------------------- - -# SPARQL endpoint -DB_URL = "http://ec2-18-175-116-201.eu-west-2.compute.amazonaws.com:3030/didtriplestore/query" -endpoint 
= SPARQLStore(query_endpoint=DB_URL, returnFormat="json") - -# Base download directory -here = os.path.dirname(os.path.abspath(__file__)) -DOWNLOAD_DIR = os.path.join(here, "downloads") -os.makedirs(DOWNLOAD_DIR, exist_ok=True) - -# Query parameters -UPRN = "5045394" -SENSOR = "bess:OusterLidarSensor" - -# The available sensors are: -# - bess:PhidgetHumiditySensor -# - bess:PhidgetTemperatureSensor -# - bess:OusterLidarSensor -# - bess:FlirOryxCamera -# - bess:FlirA70Camera - -# SPARQL query: return the asset URL for the given UPRN and sensor -QUERY = f""" -PREFIX dob: -PREFIX rdfs: -PREFIX sosa: -PREFIX bess: -PREFIX so: -PREFIX prov: -PREFIX owl: - -SELECT DISTINCT ?uprnValue ?contentUrl -WHERE {{ - # 1) Grab any resource carrying a contentUrl - ?res - so:contentUrl ?contentUrl . - - # 2) Crawl back through either: - # - Observation → sosa:hasResult - # - Processing → DerivedResult → (prov:generated / prov:used) - # (any number of times) - ?res - ( - ^sosa:hasResult - | ^prov:generated / prov:used - )* - ?obs . - - # 3) Now we’re at the Observation; pull out sensor & UPRN - ?obs a sosa:Observation ; - sosa:madeBySensor ?sensor ; - sosa:hasFeatureOfInterest/so:identifier/so:value ?uprnValue . - - # 4) Filter on specific sensor type and UPRN - ?sensor a {SENSOR} . - FILTER(str(?uprnValue) = "{UPRN}") -}} -""" - -# Your API key from environment -API_KEY = os.getenv("API_KEY") -if not API_KEY: - raise ValueError( - "API_KEY environment variable is not set. Please set it to your API key." 
- ) - - -# --- Helper to download a single asset into a given folder ---------------- - - -def download_asset(url: str, save_dir: str) -> None: - try: - resp = httpx.get(url, headers={"x-api-key": API_KEY}) - resp.raise_for_status() - - # Derive filename from Content-Disposition or fallback to URL basename - cd = resp.headers.get("Content-Disposition", "") - m = re.search(r'filename="([^"]+)"', cd) - filename = m.group(1) if m else os.path.basename(url) - - os.makedirs(save_dir, exist_ok=True) - path = os.path.join(save_dir, filename) - - with open(path, "wb") as f: - f.write(resp.content) - - logging.info(f"✔ Saved {url} → {path}") - except Exception as e: - logging.error(f"✖ Failed to download {url}: {e}") - - -# --- Main execution -------------------------------------------------------- - - -def main(): - # Run the SPARQL query - results = endpoint.query(QUERY) - - # Dispatch each download into the UPRN folder - for row in results: - if not isinstance(row, ResultRow): - continue - - uprn_val = str(row["uprnValue"]) - content_url = str(row["contentUrl"]) - target_dir = os.path.join(DOWNLOAD_DIR, uprn_val) - - logging.info(f"⤷ Downloading {content_url} into {target_dir}/ …") - download_asset(content_url, target_dir) - - -if __name__ == "__main__": - try: - main() - except Exception as e: - logging.error(f"Error: {e}") diff --git a/examples/get_all_assets_of_type_for_list_of_uprns.py b/examples/get_all_assets_of_type_for_list_of_uprns.py deleted file mode 100644 index 84b9b0d..0000000 --- a/examples/get_all_assets_of_type_for_list_of_uprns.py +++ /dev/null @@ -1,123 +0,0 @@ -import logging -import os -import re - -import httpx -from rdflib.plugins.stores.sparqlstore import SPARQLStore -from rdflib.query import ResultRow - -# Configure logging -logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" -) - -# --- Configuration --------------------------------------------------------- - -# SPARQL endpoint -DB_URL = 
"http://ec2-18-175-116-201.eu-west-2.compute.amazonaws.com:3030/didtriplestore/query" -endpoint = SPARQLStore(query_endpoint=DB_URL, returnFormat="json") - -# Base download directory -here = os.path.dirname(os.path.abspath(__file__)) -DOWNLOAD_DIR = os.path.join(here, "downloads") -os.makedirs(DOWNLOAD_DIR, exist_ok=True) - -# Assets can be of the following types: -# - Merged lidar point clouds: https://w3id.org/dob/id/lidar-pointcloud-merged -# - Lidar range panorama images: https://w3id.org/dob/id/lidar-range-pano -# - Lidar reflectance for panorama: https://w3id.org/dob/id/lidar-reflectance-pano -# - Temperature in celsius: https://w3id.org/dob/id/celsius-temperature (no contentUrl) -# - Lidar signal intensity for panoramas: https://w3id.org/dob/id/lidar-signal-pano -# - Lidar Near Infrared for panoramas: https://w3id.org/dob/id/lidar-nearir-pano -# - Relative humidity: https://w3id.org/dob/id/relative-humidity (no contentUrl) -# - Pointcloud frame: https://w3id.org/dob/id/lidar-pointcloud-frame -# - IR false colour: https://w3id.org/dob/id/ir-false-color-image -# - IR temperature array: https://w3id.org/dob/id/ir-temperature-array -# - IR counts: https://w3id.org/dob/id/ir-count-image -# - RBG image: https://w3id.org/dob/id/rgb-image - -# Which UPRNs and which enums (types) to pull -UPRNs = "200003455212, 5045394" -TYPES = "did:rgb-image, did:lidar-pointcloud-merged, did:ir-temperature-array" - -# --- SPARQL: find any resource with your enum & contentUrl, then crawl back to UPRN --- -QUERY = f""" -PREFIX dob: -PREFIX sosa: -PREFIX so: -PREFIX prov: -PREFIX did: - -SELECT DISTINCT ?uprnValue ?contentUrl -WHERE {{ - # 1) Pick up any resource carrying the enum & contentUrl - ?res - dob:typeQualifier ?enum ; - so:contentUrl ?contentUrl . 
- FILTER(?enum IN ({TYPES})) - - # 2) Crawl back arbitrarily through DerivedResult→Processing→Result→Observation - # (and even chained DerivedResults) to get the UPRN literal - ?res - ( - ^prov:generated / prov:used - | ^sosa:hasResult - )* - / sosa:hasFeatureOfInterest - / so:identifier - / so:value - ?uprnValue . - - # 3) Only the UPRNs you care about - FILTER(?uprnValue IN ({UPRNs})) -}} -""" - -# Your API key from environment -API_KEY = os.getenv("API_KEY") -if not API_KEY: - raise RuntimeError("API_KEY environment variable is not set.") - -# --- Download helper ------------------------------------------------------- - - -def download_asset(url: str, save_dir: str) -> None: - try: - resp = httpx.get(url, headers={"x-api-key": API_KEY}, timeout=60) - resp.raise_for_status() - - # Derive filename - cd = resp.headers.get("Content-Disposition", "") - m = re.search(r'filename="([^"]+)"', cd) - fn = m.group(1) if m else os.path.basename(url) - - os.makedirs(save_dir, exist_ok=True) - path = os.path.join(save_dir, fn) - with open(path, "wb") as f: - f.write(resp.content) - - logging.info(f"✔ {url} → {path}") - except Exception as e: - logging.error(f"✖ Failed {url}: {e}") - - -# --- Main ------------------------------------------------------------------ - - -def main(): - results = endpoint.query(QUERY) - - for row in results: - if not isinstance(row, ResultRow): - continue - - uprn = str(row["uprnValue"]) - url = str(row["contentUrl"]) - folder = os.path.join(DOWNLOAD_DIR, uprn) - - logging.info(f"Downloading into {folder}/ ← {url}") - download_asset(url, folder) - - -if __name__ == "__main__": - main() From ce457a547e6bed41ed74851fcb4c12e0527f5164 Mon Sep 17 00:00:00 2001 From: gnathoi Date: Wed, 29 Oct 2025 12:49:27 +0000 Subject: [PATCH 18/19] remove: unused tests --- examples/README.md | 2 - examples/tests/test_br_decompression.py | 73 ------------------- examples/tests/test_download_scripts.py | 96 ------------------------- 3 files changed, 171 deletions(-) 
delete mode 100644 examples/tests/test_br_decompression.py delete mode 100644 examples/tests/test_download_scripts.py diff --git a/examples/README.md b/examples/README.md index 853db94..93e39bb 100644 --- a/examples/README.md +++ b/examples/README.md @@ -226,5 +226,3 @@ ICP can fail, producing dense but misaligned merged clouds; use the single frame ## License See the root `LICENSE` file for details. - - diff --git a/examples/tests/test_br_decompression.py b/examples/tests/test_br_decompression.py deleted file mode 100644 index 6eaf47c..0000000 --- a/examples/tests/test_br_decompression.py +++ /dev/null @@ -1,73 +0,0 @@ -import br_decompress as br -import brotli -import query_assist as qa - - -def _make_compressed_pair(): - """Return (raw_bytes, brotli_compressed_bytes).""" - raw = b"FOR UNIT TEST ONLY - pretend this is a PCD header\n" - return raw, brotli.compress(raw) - - -class _DummyResponse: - def __init__(self, data): - self.status_code = 200 - self.headers = {"Content-Disposition": 'attachment; filename="cloud.pcd.br"'} - self.content = data - - def raise_for_status(self): - pass - - -class _DummyStore: - """Fake rdflib SPARQLStore that yields exactly one result row.""" - - def __init__(self, *_, **__): - pass - - def query(self, *_): - return [ - { - "uprnValue": "999", - "contentUrl": "https://example.com/cloud.pcd.br", - "enum": "did:lidar-pointcloud-merged", - } - ] - - -def test_download_and_decompress_brotli(tmp_path, monkeypatch): - raw, compressed = _make_compressed_pair() - - monkeypatch.setattr(qa, "SPARQLStore", _DummyStore) - - monkeypatch.setattr(qa.httpx, "get", lambda *a, **k: _DummyResponse(compressed)) - - monkeypatch.setenv("API_KEY", "DUMMY") - - monkeypatch.setattr( - qa, - "parse_args", - lambda: qa.argparse.Namespace( - uprn=["999"], - ods=None, - sensor=None, - types=None, - output_area=None, - db_url="http://dummy", - download_dir=str(tmp_path), - api_key_env="API_KEY", - ), - ) - - qa.main() - - p_br = tmp_path / "999" / 
"lidar-pointcloud-merged" / "cloud.pcd.br" - assert p_br.is_file(), "compressed asset should have been saved by query_assist" - - br.find_and_replace_pcd_br(str(tmp_path)) - - p_raw = p_br.with_suffix("") - assert p_raw.is_file(), "decompressed .pcd should exist" - assert not p_br.exists(), ".pcd.br should have been removed" - - assert p_raw.read_bytes() == raw, "decompressed bytes should match original" diff --git a/examples/tests/test_download_scripts.py b/examples/tests/test_download_scripts.py deleted file mode 100644 index 324a8e9..0000000 --- a/examples/tests/test_download_scripts.py +++ /dev/null @@ -1,96 +0,0 @@ -import importlib -from pathlib import Path - -import pytest - - -class _DummyRow(dict): - """row['uprnValue'] / row['contentUrl'] lookup just like ResultRow""" - - def __getitem__(self, key): - return super().get(key) - - -class _DummyEndpoint: - """Replaces SPARQLStore instance inside each script.""" - - def __init__(self, rows): - self._rows = rows - - def query(self, *_): - return self._rows - - -def _fake_response(): - class _R: - status_code = 200 - headers = {"Content-Disposition": 'attachment; filename="file.bin"'} - content = b"DUMMY" - - def raise_for_status(self): - pass - - return _R() - - -@pytest.mark.parametrize( - "mod_name, expects_uprn_subfolder", - [ - ("examples.get_all_assets_for_a_list_of_uprns", True), - ("examples.get_all_assets_for_a_uprn", False), - ("examples.get_all_assets_for_a_uprn_made_by_a_sensor", True), - ("examples.get_all_assets_of_type_for_list_of_uprns", True), - ], -) -def test_script_downloads(tmp_path, monkeypatch, mod_name, expects_uprn_subfolder): - """Import the script as a module, monkey-patch, run main(), check the file.""" - - import httpx - - monkeypatch.setattr(httpx, "get", lambda *a, **k: _fake_response()) - - monkeypatch.setenv("API_KEY", "UNIT-TEST-KEY") - - mod = importlib.import_module(mod_name) - - monkeypatch.setattr(mod, "DOWNLOAD_DIR", str(tmp_path)) - - monkeypatch.setattr(mod, "ResultRow", 
_DummyRow) - - dummy_rows = [_DummyRow({"uprnValue": "999", "contentUrl": "https://x/y.bin"})] - if hasattr(mod, "endpoint"): - monkeypatch.setattr(mod, "endpoint", _DummyEndpoint(dummy_rows)) - else: - monkeypatch.setattr(mod, "endpoint", _DummyEndpoint(dummy_rows)) - - mod.main() - - if expects_uprn_subfolder: - expected = Path(tmp_path) / "999" / "file.bin" - else: - expected = Path(tmp_path) / "file.bin" - - assert expected.is_file(), f"{mod_name}: expected {expected} to exist" - - -@pytest.mark.parametrize( - "mod_name, substrings", - [ - ("examples.get_all_assets_for_a_list_of_uprns", ["200003455212", "5045394"]), - ("examples.get_all_assets_for_a_uprn", ["5045394"]), - ( - "examples.get_all_assets_for_a_uprn_made_by_a_sensor", - ["5045394", "bess:OusterLidarSensor"], - ), - ( - "examples.get_all_assets_of_type_for_list_of_uprns", - ["did:rgb-image", "lidar-pointcloud-merged"], - ), - ], -) -def test_query_contains_expected_literals(mod_name, substrings): - """Make sure the hard-coded constants really appear in the QUERY string.""" - mod = importlib.import_module(mod_name) - q = mod.QUERY - for s in substrings: - assert s in q, f"{mod_name}: missing {s} in QUERY" From e09b582daeb9d4a79f508da10d8a9ccc80ffb382 Mon Sep 17 00:00:00 2001 From: gnathoi Date: Wed, 29 Oct 2025 12:51:42 +0000 Subject: [PATCH 19/19] fix: pytest --- examples/tests/test_query_assist.py | 40 ++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/examples/tests/test_query_assist.py b/examples/tests/test_query_assist.py index a431aee..9f39c81 100644 --- a/examples/tests/test_query_assist.py +++ b/examples/tests/test_query_assist.py @@ -69,11 +69,19 @@ def __init__(self, *a, **k): pass def query(self, *_): + class _PhenomenonTime: + def __init__(self): + # fixed date for deterministic path + from datetime import datetime + + self.value = datetime(2024, 1, 2) + return [ { "uprnValue": "42", "contentUrl": "https://example.com/file.bin", "enum": 
"did:rgb-image", + "phenomenonTime": _PhenomenonTime(), } ] @@ -81,7 +89,22 @@ def query(self, *_): def test_cli_download_creates_nested_dir(tmp_path, monkeypatch): """Full happy-path run – ensures ///file.bin is created.""" monkeypatch.setattr(qa, "SPARQLStore", _DummyStore) - monkeypatch.setattr(qa.httpx, "get", lambda *a, **k: _dummy_http_response()) + + # Mock httpx.Client context manager used in download_asset + class _DummyClient: + def __init__(self, *a, **k): + pass + + def __enter__(self): + return self + + def __exit__(self, *a): + return False + + def get(self, *a, **k): + return _dummy_http_response() + + monkeypatch.setattr(qa.httpx, "Client", _DummyClient) monkeypatch.setenv("API_KEY", "FAKE-KEY") argv = ["query_assist", "--uprn", "42", "--download-dir", str(tmp_path)] @@ -102,12 +125,13 @@ def test_cli_download_creates_nested_dir(tmp_path, monkeypatch): qa.main() - expected = tmp_path / "42" / "rgb-image" / "file.bin" + # Includes date directory now (YYYY-MM-DD) + expected = tmp_path / "42" / "2024-01-02" / "rgb-image" / "file.bin" assert expected.is_file(), f"expected {expected} to exist" -def test_cli_fails_without_api_key(monkeypatch): - """Main should raise RuntimeError if API_KEY env var is missing.""" +def test_cli_missing_api_key_logs_error(monkeypatch, caplog): + """Modern behaviour: when API key missing, log error and return without raising.""" monkeypatch.setattr(qa, "SPARQLStore", _DummyStore) monkeypatch.delenv("API_KEY", raising=False) @@ -125,5 +149,9 @@ def test_cli_fails_without_api_key(monkeypatch): api_key_env="API_KEY", ), ) - with pytest.raises(RuntimeError, match="Env var 'API_KEY' is not set"): - qa.main() + caplog.clear() + qa.main() + assert any( + "API key environment variable 'API_KEY' is not set." in r.message + for r in caplog.records + ), "Expected error log for missing API key"