diff --git a/app/cli/core/inspector.py b/app/cli/core/inspector.py index 963071d..ff0d2f6 100644 --- a/app/cli/core/inspector.py +++ b/app/cli/core/inspector.py @@ -9,7 +9,7 @@ # Double-brace every dict/format literal inside the script so .format() ignores them. # Only {path} is a real substitution placeholder. _INSPECT_SCRIPT = """\ -import os, json, sys +import os, json, sys, traceback as _tb path = {path!r} ext = os.path.splitext(path)[1].lower() @@ -45,211 +45,226 @@ print(json.dumps(result)) sys.exit(0) -# ── pickle-based artifacts ────────────────────────────────────────────────── -import pickle - -with open(path, "rb") as f: - obj = pickle.load(f) - -module = type(obj).__module__ or "" -class_name = type(obj).__name__ - -# ── framework detection (order matters) ──────────────────────────────────── -framework = "generic" -extra = {{}} - -# sentence-transformers (check before transformers — it wraps HF models) +# ── pickle-based artifacts ────────────────────────────────────────── try: - from sentence_transformers import SentenceTransformer - if isinstance(obj, SentenceTransformer): - framework = "sentence_transformers" -except Exception: - pass - -# transformers PreTrainedModel -if framework == "generic": - try: - from transformers import PreTrainedModel - if isinstance(obj, PreTrainedModel): - framework = "transformers" - except Exception: - pass - -# PyTorch nn.Module -if framework == "generic": - try: - import torch - if isinstance(obj, torch.nn.Module): - framework = "pytorch" - except Exception: - pass - -# XGBoost -if framework == "generic": - try: - import xgboost as xgb - if isinstance(obj, (xgb.XGBModel, xgb.Booster)): - framework = "xgboost" - except Exception: - pass - -# LightGBM -if framework == "generic": - try: - import lightgbm as lgb - if isinstance(obj, (lgb.Booster, lgb.LGBMModel)): - framework = "lightgbm" - except Exception: - pass - -# CatBoost -if framework == "generic": - try: - from catboost import CatBoost - if isinstance(obj, 
CatBoost): - framework = "catboost" - except Exception: - pass + import pickle -# sklearn (last — many frameworks inherit from sklearn base) -if framework == "generic" and "sklearn" in module: - framework = "sklearn" - -# ── metadata extraction ───────────────────────────────────────────────────── -class_hierarchy = [] -input_hint = "unknown" -output_hint = "unknown" -feature_count = None -class_labels = None - -if framework == "sklearn": - if hasattr(obj, "steps"): - class_hierarchy = [type(s).__name__ for _, s in obj.steps] - else: - class_hierarchy = [class_name] - - if hasattr(obj, "n_features_in_"): - feature_count = int(obj.n_features_in_) - - if hasattr(obj, "classes_"): - class_labels = obj.classes_.tolist() - elif hasattr(obj, "steps"): - for _, step in obj.steps: - if hasattr(step, "classes_"): - class_labels = step.classes_.tolist() - break - - text_vectorizers = ("TfidfVectorizer", "CountVectorizer", "HashingVectorizer") - if class_hierarchy and class_hierarchy[0] in text_vectorizers: - input_hint = "raw text string" - elif feature_count: - input_hint = "array-like of shape (n, " + str(feature_count) + ")" - else: - input_hint = "array-like" - - if class_labels is not None: - output_hint = "integer class label (classes: " + str(class_labels) + ")" - else: - output_hint = "float or array" - -elif framework == "pytorch": - try: - import torch - layer_count = sum(1 for _ in obj.modules()) - extra["layer_count"] = layer_count - # Try to infer input/output shapes from named children - children = list(obj.named_children()) - if children: - extra["first_layer"] = type(children[0][1]).__name__ - extra["last_layer"] = type(children[-1][1]).__name__ - except Exception: - pass - input_hint = "torch.Tensor" - output_hint = "torch.Tensor" + with open(path, "rb") as f: + obj = pickle.load(f) -elif framework == "transformers": - try: - cfg = obj.config - extra["model_type"] = getattr(cfg, "model_type", "unknown") - extra["hidden_size"] = getattr(cfg, "hidden_size", 
None) - extra["num_labels"] = getattr(cfg, "num_labels", None) - extra["tokenizer_class"] = getattr(cfg, "tokenizer_class", None) - except Exception: - pass - input_hint = "dict with input_ids, attention_mask (tokenized)" - output_hint = "ModelOutput (logits or last_hidden_state)" + module = type(obj).__module__ or "" + class_name = type(obj).__name__ -elif framework == "xgboost": - try: - import xgboost as xgb - if isinstance(obj, xgb.XGBModel): - extra["n_estimators"] = getattr(obj, "n_estimators", None) - extra["objective"] = getattr(obj, "objective", None) - if hasattr(obj, "n_features_in_"): - feature_count = int(obj.n_features_in_) - elif isinstance(obj, xgb.Booster): - extra["num_trees"] = obj.num_trees() - except Exception: - pass - input_hint = "numpy array or pandas DataFrame" - output_hint = "numpy array of predictions" + # ── framework detection (order matters) ────────────────────────────────── + framework = "generic" + extra = {{}} -elif framework == "lightgbm": + # sentence-transformers (check before transformers — it wraps HF models) try: - import lightgbm as lgb - if isinstance(obj, lgb.LGBMModel): - extra["n_estimators"] = getattr(obj, "n_estimators", None) - extra["objective"] = getattr(obj, "objective", None) - if hasattr(obj, "n_features_in_"): - feature_count = int(obj.n_features_in_) - elif isinstance(obj, lgb.Booster): - extra["num_trees"] = obj.num_trees() + from sentence_transformers import SentenceTransformer + if isinstance(obj, SentenceTransformer): + framework = "sentence_transformers" except Exception: pass - input_hint = "numpy array or pandas DataFrame" - output_hint = "numpy array of predictions" -elif framework == "catboost": - try: - from catboost import CatBoost - extra["loss_function"] = obj.get_param("loss_function") - fc = obj.get_param("feature_count") or obj.get_param("num_features") - if fc is not None: - feature_count = int(fc) - except Exception: - pass - input_hint = "numpy array or pandas DataFrame" - output_hint = 
"numpy array of predictions" - -elif framework == "sentence_transformers": + # transformers PreTrainedModel + if framework == "generic": + try: + from transformers import PreTrainedModel + if isinstance(obj, PreTrainedModel): + framework = "transformers" + except Exception: + pass + + # PyTorch nn.Module + if framework == "generic": + try: + import torch + if isinstance(obj, torch.nn.Module): + framework = "pytorch" + except Exception: + pass + + # XGBoost + if framework == "generic": + try: + import xgboost as xgb + if isinstance(obj, (xgb.XGBModel, xgb.Booster)): + framework = "xgboost" + except Exception: + pass + + # LightGBM + if framework == "generic": + try: + import lightgbm as lgb + if isinstance(obj, (lgb.Booster, lgb.LGBMModel)): + framework = "lightgbm" + except Exception: + pass + + # CatBoost + if framework == "generic": + try: + from catboost import CatBoost + if isinstance(obj, CatBoost): + framework = "catboost" + except Exception: + pass + + # sklearn (last — many frameworks inherit from sklearn base) + if framework == "generic" and "sklearn" in module: + framework = "sklearn" + + # ── metadata extraction ───────────────────────────────────────────────── + class_hierarchy = [] + input_hint = "unknown" + output_hint = "unknown" + feature_count = None + class_labels = None + + if framework == "sklearn": + if hasattr(obj, "steps"): + class_hierarchy = [type(s).__name__ for _, s in obj.steps] + else: + class_hierarchy = [class_name] + + if hasattr(obj, "n_features_in_"): + feature_count = int(obj.n_features_in_) + + if hasattr(obj, "classes_"): + class_labels = obj.classes_.tolist() + elif hasattr(obj, "steps"): + for _, step in obj.steps: + if hasattr(step, "classes_"): + class_labels = step.classes_.tolist() + break + + text_vectorizers = ("TfidfVectorizer", "CountVectorizer", "HashingVectorizer") + if class_hierarchy and class_hierarchy[0] in text_vectorizers: + input_hint = "raw text string" + elif feature_count: + input_hint = "array-like of 
shape (n, " + str(feature_count) + ")" + else: + input_hint = "array-like" + + if class_labels is not None: + output_hint = "integer class label (classes: " + str(class_labels) + ")" + else: + output_hint = "float or array" + + elif framework == "pytorch": + try: + import torch + layer_count = sum(1 for _ in obj.modules()) + extra["layer_count"] = layer_count + children = list(obj.named_children()) + if children: + extra["first_layer"] = type(children[0][1]).__name__ + extra["last_layer"] = type(children[-1][1]).__name__ + except Exception: + pass + input_hint = "torch.Tensor" + output_hint = "torch.Tensor" + + elif framework == "transformers": + try: + cfg = obj.config + extra["model_type"] = getattr(cfg, "model_type", "unknown") + extra["hidden_size"] = getattr(cfg, "hidden_size", None) + extra["num_labels"] = getattr(cfg, "num_labels", None) + extra["tokenizer_class"] = getattr(cfg, "tokenizer_class", None) + except Exception: + pass + input_hint = "dict with input_ids, attention_mask (tokenized)" + output_hint = "ModelOutput (logits or last_hidden_state)" + + elif framework == "xgboost": + try: + import xgboost as xgb + if isinstance(obj, xgb.XGBModel): + extra["n_estimators"] = getattr(obj, "n_estimators", None) + extra["objective"] = getattr(obj, "objective", None) + if hasattr(obj, "n_features_in_"): + feature_count = int(obj.n_features_in_) + elif isinstance(obj, xgb.Booster): + extra["num_trees"] = obj.num_trees() + except Exception: + pass + input_hint = "numpy array or pandas DataFrame" + output_hint = "numpy array of predictions" + + elif framework == "lightgbm": + try: + import lightgbm as lgb + if isinstance(obj, lgb.LGBMModel): + extra["n_estimators"] = getattr(obj, "n_estimators", None) + extra["objective"] = getattr(obj, "objective", None) + if hasattr(obj, "n_features_in_"): + feature_count = int(obj.n_features_in_) + elif isinstance(obj, lgb.Booster): + extra["num_trees"] = obj.num_trees() + except Exception: + pass + input_hint = "numpy array or 
pandas DataFrame" + output_hint = "numpy array of predictions" + + elif framework == "catboost": + try: + from catboost import CatBoost + extra["loss_function"] = obj.get_param("loss_function") + fc = obj.get_param("feature_count") or obj.get_param("num_features") + if fc is not None: + feature_count = int(fc) + except Exception: + pass + input_hint = "numpy array or pandas DataFrame" + output_hint = "numpy array of predictions" + + elif framework == "sentence_transformers": + try: + extra["model_name"] = getattr(obj, "_model_card_text", None) or class_name + modules = list(obj.modules()) + for m in modules: + if hasattr(m, "word_embedding_dimension"): + extra["embedding_dim"] = m.word_embedding_dimension + break + except Exception: + pass + input_hint = "string or list of strings" + output_hint = "numpy array of shape (n, embedding_dim)" + + artifact_size_mb = round(os.path.getsize(path) / (1024 * 1024), 2) + print(json.dumps(dict( + framework=framework, + class_name=class_name, + class_hierarchy=class_hierarchy, + input_hint=input_hint, + output_hint=output_hint, + feature_count=feature_count, + class_labels=class_labels, + artifact_path=path, + artifact_size_mb=artifact_size_mb, + extra=extra, + ))) +except Exception as _exc: + _size = 0.0 try: - extra["model_name"] = getattr(obj, "_model_card_text", None) or class_name - modules = list(obj.modules()) - # embedding dim from the pooling layer - for m in modules: - if hasattr(m, "word_embedding_dimension"): - extra["embedding_dim"] = m.word_embedding_dimension - break + _size = round(os.path.getsize(path) / (1024 * 1024), 2) except Exception: pass - input_hint = "string or list of strings" - output_hint = "numpy array of shape (n, embedding_dim)" - -artifact_size_mb = round(os.path.getsize(path) / (1024 * 1024), 2) - -result = dict( - framework=framework, - class_name=class_name, - class_hierarchy=class_hierarchy, - input_hint=input_hint, - output_hint=output_hint, - feature_count=feature_count, - 
class_labels=class_labels, - artifact_path=path, - artifact_size_mb=artifact_size_mb, - extra=extra, -) -print(json.dumps(result)) + print(json.dumps(dict( + framework="unknown", + class_name="unknown", + class_hierarchy=[], + input_hint="unknown", + output_hint="unknown", + feature_count=None, + class_labels=None, + artifact_path=path, + artifact_size_mb=_size, + extra={{"inspection_warning": str(_exc), "traceback": _tb.format_exc()}}, + ))) """ @@ -284,4 +299,12 @@ def inspect_artifact(path: str) -> ArtifactMetadata: raise ValueError(f"Inspection failed:\n{result.stderr.strip()}") data = json.loads(result.stdout) - return ArtifactMetadata(**data) + meta = ArtifactMetadata(**data) + if "inspection_warning" in meta.extra: + import warnings + warnings.warn( + f"Artifact inspection encountered an error and returned partial metadata: " + f"{meta.extra['inspection_warning']}", + stacklevel=2, + ) + return meta diff --git a/docs/internal/issue-planner.md b/docs/internal/issue-planner.md new file mode 100644 index 0000000..be793a8 --- /dev/null +++ b/docs/internal/issue-planner.md @@ -0,0 +1,43 @@ +# Issue Planner + +One branch per issue. Branch naming: `fix/<number>-<slug>` for bugs, `feat/<number>-<slug>` for enhancements. 
+ +--- + +## Tier 1 — Bugs (fix in order) + +| # | Title | Branch | Status | +|---|-------|--------|--------| +| #14 | Inspector exits non-zero and discards all metadata on any exception | `fix/14-inspector-exception-handling` | ✅ Done | +| #15 | Inspector uses pickle.load for all formats, crashing on ONNX and PyTorch `.pt` | `fix/15-inspector-format-routing` | | +| #21 | `_parse_methods` conflates load() and predict() into one block | `fix/21-parse-methods` | | +| #20 | `_splice_methods` regex corrupts definition.py when class has helper methods | `fix/20-splice-methods` | | +| #23 | Reused temp dir across validation retries causes stale module state | `fix/23-validator-stale-module` | | +| #18 | sample_input passed as raw string to validate_pipeline, breaking numeric models | `fix/18-sample-input-type` | | +| #22 | write_scaffold crashes with KeyError when ArtifactMetadata fields are None | `fix/22-write-scaffold-none-fields` | | +| #19 | fix command exits immediately in CI / non-interactive mode | `fix/19-fix-command-ci-mode` | | + +## Tier 2 — Enhancements (after all bugs are closed) + +| # | Title | Branch | Status | +|---|-------|--------|--------| +| #16 | Add LLM interpretation stage between inspection and codegen | `feat/16-llm-interpretation-stage` | | +| #17 | Add --framework flag to deploy to override framework detection | `feat/17-framework-flag` | | +| #24 | Add --yes flag for CI mode | `feat/24-yes-flag` | | +| #25 | Include sample_input in generate(), fix(), and interpretation prompts | `feat/25-sample-input-in-prompts` | | +| #36 | Extend LLM interpretation to suggest sample_input | `feat/36-suggest-sample-input` | | +| #35 | Cache LLM generation results by artifact hash | `feat/35-llm-cache` | | +| #34 | Add GET /models/{name}/{version}/metadata endpoint | `feat/34-metadata-endpoint` | | +| #33 | Add inference-engine benchmark command | `feat/33-benchmark-command` | | +| #32 | Add --dry-run flag to fix command | `feat/32-fix-dry-run` | | +| #31 | 
Add POST /admin/reload endpoint | `feat/31-admin-reload` | | +| #30 | Add --explain flag to deploy | `feat/30-explain-flag` | | +| #28 | Add inference-engine snippets command | `feat/28-snippets-command` | | +| #27 | Add inference-engine export command | `feat/27-export-command` | | +| #26 | Add inference-engine package command | `feat/26-package-command` | | + +## Tier 3 — Security + +| # | Title | Branch | Status | +|---|-------|--------|--------| +| #42 | Path traversal in deploy via --name / --version flags | `fix/42-path-traversal` | | diff --git a/tests/test_cli_phase1_2.py b/tests/test_cli_phase1_2.py index 17a049b..3a14179 100644 --- a/tests/test_cli_phase1_2.py +++ b/tests/test_cli_phase1_2.py @@ -40,11 +40,16 @@ def test_inspect_missing_file(): def test_inspect_invalid_file(tmp_path): + import warnings from app.cli.core.inspector import inspect_artifact bad = tmp_path / "bad.pkl" bad.write_bytes(b"not a pickle") - with pytest.raises(ValueError, match="Inspection failed"): - inspect_artifact(str(bad)) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + meta = inspect_artifact(str(bad)) + assert meta.framework == "unknown" + assert "inspection_warning" in meta.extra + assert len(w) == 1 and "partial metadata" in str(w[0].message) # ---------------------------------------------------------------------------