diff --git a/.gitignore b/.gitignore index 79222a7..e45f63f 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ parseTable1.out/ # Large or local-only PDF fixtures /testpapers/OPEandRA.pdf +/papers/ diff --git a/R/inspect_paper_outputs.R b/R/inspect_paper_outputs.R index 0e0ca1a..03e8d26 100644 --- a/R/inspect_paper_outputs.R +++ b/R/inspect_paper_outputs.R @@ -235,8 +235,9 @@ show_paper_variable_candidates <- function(paper_dir, min_priority = NULL) { paper_table_inventory_df <- function(outputs) { records <- outputs$paper_table_inventory$tables %||% list() - rows <- lapply(records, function(record) { + rows <- Map(function(record, record_position) { data.frame( + table_index = as.integer(record_position - 1L), table_number = as.integer(record$table_number %||% NA_integer_), table_id = as.character(record$table_id %||% ""), table_category = as.character(record$table_category %||% ""), @@ -249,9 +250,10 @@ paper_table_inventory_df <- function(outputs) { evidence = paste(as.character(unlist(record$category_evidence %||% list(), use.names = FALSE)), collapse = " | "), stringsAsFactors = FALSE ) - }) + }, records, seq_along(records)) if (length(rows) == 0) { data.frame( + table_index = integer(), table_number = integer(), table_id = character(), table_category = character(), @@ -770,6 +772,7 @@ show_table_processing <- function(paper_dir, table_number = 1L, table_index = NU } else { cat(sprintf("Table processing for table_number=%s\n", as.integer(resolved_table_number))) } + cat(sprintf("table_index: %s\n", as.integer(table_index))) cat(sprintf("table_id: %s\n", definition$table_id %||% normalized$table_id %||% parsed$table_id %||% "")) if (!is.null(definition$title) && nzchar(definition$title)) { cat(sprintf("title: %s\n", definition$title)) @@ -848,6 +851,7 @@ show_parse_quality <- function(paper_dir, table_number = 1L, table_index = NULL) } else { cat(sprintf("Parse quality for table_number=%s\n", as.integer(resolved_table_number))) } + 
cat(sprintf("table_index: %s\n", as.integer(table_index))) cat(sprintf("table_id: %s\n", as.character(report$table_id %||% normalized$table_id %||% ""))) cat(sprintf("total_body_rows: %s\n", as.integer(summary$total_body_rows %||% 0L))) cat(sprintf("unknown_row_count: %s\n", as.integer(summary$unknown_row_count %||% 0L))) @@ -898,6 +902,7 @@ show_table_structure <- function(paper_dir, table_number = 1L, table_index = NUL } else { cat(sprintf("table_number: %s\n", as.integer(resolved_table_number))) } + cat(sprintf("table_index: %s\n", as.integer(table_index))) cat(sprintf("table_id: %s\n", definition$table_id %||% normalized$table_id %||% "")) if (!is.null(definition$title) && nzchar(definition$title)) { cat(sprintf("title: %s\n", definition$title)) @@ -990,20 +995,25 @@ llm_variable_plausibility_review_by_index <- function(outputs, table_index = 0L) if (length(reviews) == 0) { stop("No table_variable_plausibility_llm.json found for this paper.", call. = FALSE) } + definition <- table_definition_by_index(outputs, table_index) + target_table_id <- as.character(definition$table_id %||% "") + if (nzchar(target_table_id)) { + review_matches <- Filter( + function(x) identical(as.character(x$table_id %||% ""), target_table_id), + reviews + ) + if (length(review_matches) > 0) { + return(review_matches[[1]]) + } + } + idx <- as.integer(table_index) + 1L review <- reviews[[idx]] %||% NULL - if (!is.null(review)) { + if (!is.null(review) && !nzchar(as.character(review$table_id %||% ""))) { return(review) } - definition <- table_definition_by_index(outputs, table_index) - review_matches <- Filter( - function(x) identical(as.character(x$table_id %||% ""), as.character(definition$table_id %||% "")), - reviews - ) - if (length(review_matches) == 0) { - stop(sprintf("No variable-plausibility review found for table_index=%s.", table_index), call. = FALSE) - } - review_matches[[1]] + + stop(sprintf("No variable-plausibility review found for table_index=%s.", table_index), call. 
= FALSE) } llm_variable_plausibility_df <- function(outputs, table_number = NULL, table_index = NULL) { @@ -1064,10 +1074,13 @@ llm_variable_plausibility_df <- function(outputs, table_number = NULL, table_ind do.call(rbind, rows) } -show_variable_plausibility_review <- function(review, normalized_table, table_definition) { +show_variable_plausibility_review <- function(review, normalized_table, table_definition, table_index = NULL) { cleaned_rows <- normalized_table$metadata$cleaned_rows %||% list() cat(sprintf("Variable plausibility review for table_id=%s\n", review$table_id %||% table_definition$table_id %||% "")) + if (!is.null(table_index)) { + cat(sprintf("table_index: %s\n", as.integer(table_index))) + } if (!is.null(table_definition$title) && nzchar(table_definition$title)) { cat(sprintf("title: %s\n", table_definition$title)) } @@ -1162,7 +1175,7 @@ show_llm_variable_plausibility <- function(paper_dir, table_number = 1L, table_i review <- llm_variable_plausibility_review_by_index(outputs, table_index = table_index) normalized <- normalized_table_by_index(outputs, table_index = table_index) definition <- table_definition_by_index(outputs, table_index = table_index) - show_variable_plausibility_review(review, normalized, definition) + show_variable_plausibility_review(review, normalized, definition, table_index = table_index) } summarize_llm_variable_plausibility_monitoring <- function(paper_dir, run_id = NULL) { @@ -1309,6 +1322,7 @@ show_table_context <- function(paper_dir, table_number = 1L, table_index = NULL, } else { cat(sprintf("Table context for table_number=%s\n", as.integer(resolved_table_number))) } + cat(sprintf("table_index: %s\n", as.integer(table_index))) if (!is.null(context$table_label)) { cat(sprintf("Label: %s\n", context$table_label)) } diff --git a/README.md b/README.md index 77445ff..1851e2c 100644 --- a/README.md +++ b/README.md @@ -208,22 +208,22 @@ For paper-level inspection there is also: source("R/inspect_paper_outputs.R") 
summarize_table_processing("outputs/papers/cobaltpaper") show_paper_table_inventory("outputs/papers/cobaltpaper") -show_table_structure("outputs/papers/cobaltpaper", table_number = 1L) -show_table_processing("outputs/papers/cobaltpaper", table_number = 1L) +show_table_structure("outputs/papers/cobaltpaper", table_index = 0L) +show_table_processing("outputs/papers/cobaltpaper", table_index = 0L) show_paper_variable_candidates("outputs/papers/cobaltpaper") show_paper_variable_mentions("outputs/papers/cobaltpaper", source_type = "text_based", mention_role = "variable") -show_table_context("outputs/papers/cobaltpaper", table_number = 1L) +show_table_context("outputs/papers/cobaltpaper", table_index = 0L) ``` -Use `table_number` for public inspection. Extraction-order indices are retained only as low-level provenance/debug handles. +Use `table_index` for parser-output debugging and batch evaluation. It is zero-based and lines up with JSON array positions and debug artifacts such as `table_0/`. Use `table_number` only when you intentionally want the table labeled that way in the paper, such as `Table 1`. 
If `review-variable-plausibility` has been run: ```r source("R/inspect_paper_outputs.R") outputs <- load_paper_outputs("outputs/papers/cobaltpaper") -llm_variable_plausibility_df(outputs, table_number = 1L) -show_llm_variable_plausibility("outputs/papers/cobaltpaper", table_number = 1L) +llm_variable_plausibility_df(outputs, table_index = 0L) +show_llm_variable_plausibility("outputs/papers/cobaltpaper", table_index = 0L) list_llm_variable_plausibility_debug_runs("outputs/papers/cobaltpaper") summarize_llm_variable_plausibility_monitoring("outputs/papers/cobaltpaper") ``` @@ -382,14 +382,15 @@ Minimum OpenAI setup: ```bash export LLM_PROVIDER=openai export OPENAI_API_KEY=your_api_key_here -export OPENAI_MODEL=gpt-4.1-mini -export LLM_TEMPERATURE=0 +export OPENAI_MODEL=gpt-5.5 export LLM_TIMEOUT_SECONDS=60 export LLM_MAX_RETRIES=2 export LLM_DEBUG=false export LLM_SDK_DEBUG=false ``` +The OpenAI client intentionally omits `temperature` for `gpt-5.5` and other reasoning-model IDs that reject custom sampling parameters. + Minimum Qwen setup: ```bash diff --git a/docs/design/llm_integration.md b/docs/design/llm_integration.md index fa50c7f..6161e65 100644 --- a/docs/design/llm_integration.md +++ b/docs/design/llm_integration.md @@ -91,14 +91,15 @@ Qwen uses: ```bash export LLM_PROVIDER=openai export OPENAI_API_KEY=your_api_key_here -export OPENAI_MODEL=gpt-4.1-mini -export LLM_TEMPERATURE=0 +export OPENAI_MODEL=gpt-5.5 export LLM_TIMEOUT_SECONDS=60 export LLM_MAX_RETRIES=2 export LLM_DEBUG=false export LLM_SDK_DEBUG=false ``` +For `gpt-5.5` and other OpenAI reasoning-model IDs that reject custom sampling parameters, the client omits `temperature` and uses provider defaults. 
+ Required for OpenAI: - `OPENAI_API_KEY` diff --git a/docs/implementation/llm_setup.md b/docs/implementation/llm_setup.md index 96cb600..20d01c0 100644 --- a/docs/implementation/llm_setup.md +++ b/docs/implementation/llm_setup.md @@ -36,8 +36,7 @@ OpenAI: ```bash export LLM_PROVIDER=openai export OPENAI_API_KEY=your_api_key_here -export OPENAI_MODEL=gpt-4.1-mini -export LLM_TEMPERATURE=0 +export OPENAI_MODEL=gpt-5.5 export LLM_TIMEOUT_SECONDS=60 export LLM_MAX_RETRIES=2 export LLM_DEBUG=false @@ -64,8 +63,7 @@ OpenAI: ```powershell $env:LLM_PROVIDER = "openai" $env:OPENAI_API_KEY = "your_api_key_here" -$env:OPENAI_MODEL = "gpt-4.1-mini" -$env:LLM_TEMPERATURE = "0" +$env:OPENAI_MODEL = "gpt-5.5" $env:LLM_TIMEOUT_SECONDS = "60" $env:LLM_MAX_RETRIES = "2" $env:LLM_DEBUG = "false" @@ -91,6 +89,8 @@ Meaning of the two debug flags: write timestamped variable-plausibility debug JSON artifacts to disk during `review-variable-plausibility` - `LLM_SDK_DEBUG=true` enable verbose provider/SDK logging in the terminal +- `LLM_TEMPERATURE` + used by Qwen and OpenAI model IDs that accept custom sampling; omitted automatically for `gpt-5.5` and other OpenAI reasoning-model IDs that reject custom sampling parameters ## Install Requirement diff --git a/docs/r_visualization.md b/docs/r_visualization.md index 7f2287c..45dcee4 100644 --- a/docs/r_visualization.md +++ b/docs/r_visualization.md @@ -29,20 +29,20 @@ Public functions: - `load_paper_outputs(paper_dir)` - `summarize_table_processing(paper_dir)` - `show_paper_table_inventory(paper_dir)` -- `show_table_processing(paper_dir, table_number = 1L)` -- `show_parse_quality(paper_dir, table_number = 1L)` +- `show_table_processing(paper_dir, table_index = 0L)` or `show_table_processing(paper_dir, table_number = 1L)` +- `show_parse_quality(paper_dir, table_index = 0L)` or `show_parse_quality(paper_dir, table_number = 1L)` - `summarize_table1_continuations(paper_dir)` - `show_merged_table1(paper_dir, group_index = 0L, max_rows = 30L)` - 
`show_paper_variable_mentions(paper_dir, role_hint = NULL, source_type = NULL, mention_role = NULL)` - `show_paper_variable_candidates(paper_dir, min_priority = NULL)` - `show_paper_visuals(paper_dir, visual_kind = NULL)` - `show_paper_references(paper_dir, reference_kind = NULL, reference_label = NULL, resolution_status = NULL)` -- `show_table_structure(paper_dir, table_number = 1L, max_rows = NULL)` -- `llm_variable_plausibility_df(outputs, table_number = NULL)` -- `show_llm_variable_plausibility(paper_dir, table_number = 1L)` +- `show_table_structure(paper_dir, table_index = 0L, max_rows = NULL)` or `show_table_structure(paper_dir, table_number = 1L, max_rows = NULL)` +- `llm_variable_plausibility_df(outputs, table_index = 0L)` or `llm_variable_plausibility_df(outputs, table_number = 1L)` +- `show_llm_variable_plausibility(paper_dir, table_index = 0L)` or `show_llm_variable_plausibility(paper_dir, table_number = 1L)` - `list_llm_variable_plausibility_debug_runs(paper_dir)` - `summarize_llm_variable_plausibility_monitoring(paper_dir, run_id = NULL)` -- `show_table_context(paper_dir, table_number = 1L, match_type = NULL)` +- `show_table_context(paper_dir, table_index = 0L, match_type = NULL)` or `show_table_context(paper_dir, table_number = 1L, match_type = NULL)` These helpers use the same per-paper output directory written by `table1-parser parse` and, when run, `table1-parser review-variable-plausibility`. 
@@ -54,22 +54,26 @@ source("R/inspect_paper_outputs.R") x <- load_paper_outputs("outputs/papers/cobaltpaper") summarize_table_processing("outputs/papers/cobaltpaper") show_paper_table_inventory("outputs/papers/cobaltpaper") -show_table_processing("outputs/papers/cobaltpaper", table_number = 1L) -show_parse_quality("outputs/papers/cobaltpaper", table_number = 1L) +show_table_processing("outputs/papers/cobaltpaper", table_index = 0L) +show_parse_quality("outputs/papers/cobaltpaper", table_index = 0L) summarize_table1_continuations("outputs/papers/cobaltpaper") show_merged_table1("outputs/papers/cobaltpaper", group_index = 0L, max_rows = 20L) show_paper_variable_candidates("outputs/papers/cobaltpaper") show_paper_variable_mentions("outputs/papers/cobaltpaper", source_type = "text_based", mention_role = "variable") show_paper_visuals("outputs/papers/cobaltpaper", visual_kind = "figure") show_paper_references("outputs/papers/cobaltpaper", resolution_status = "resolved") -show_table_structure("outputs/papers/cobaltpaper", table_number = 1L) -llm_variable_plausibility_df(x) -show_llm_variable_plausibility("outputs/papers/cobaltpaper", table_number = 1L) +show_table_structure("outputs/papers/cobaltpaper", table_index = 0L) +llm_variable_plausibility_df(x, table_index = 0L) +show_llm_variable_plausibility("outputs/papers/cobaltpaper", table_index = 0L) list_llm_variable_plausibility_debug_runs("outputs/papers/cobaltpaper") summarize_llm_variable_plausibility_monitoring("outputs/papers/cobaltpaper") -show_table_context("outputs/papers/cobaltpaper", table_number = 1L, match_type = "table_reference") +show_table_context("outputs/papers/cobaltpaper", table_index = 0L, match_type = "table_reference") ``` +`table_index` is the parser artifact position and is zero-based. It lines up with files and directories such as `table_0_context.json` and `llm_variable_plausibility_debug/<run_id>/table_0/`. + +`table_number` is the number printed in the paper caption, such as `Table 1`. 
For many simple papers `table_index = 0L` and `table_number = 1L` select the same table, but they are not interchangeable. Do not use `table_number = 0L` to mean the first parser table. + What these are for: - `show_table_structure(...)` @@ -115,12 +119,13 @@ What these are for: table1-parser parse testpapers/OPEandRA.pdf ``` -Use `table_number` in public inspection helpers. The parser may keep extraction-order indices internally for provenance, but the paper's table number is the conceptual selector. +Use `table_index` for parser-output debugging and batch evaluation. It is the safest selector when comparing R output with JSON arrays, table context files, and LLM debug directories. Use `table_number` only when you intentionally want the table labeled that way in the paper. ```r source("R/inspect_paper_outputs.R") -show_table_structure("outputs/papers/OPEandRA", table_number = 1L) -show_parse_quality("outputs/papers/OPEandRA", table_number = 1L) +show_paper_table_inventory("outputs/papers/OPEandRA") +show_table_structure("outputs/papers/OPEandRA", table_index = 0L) +show_parse_quality("outputs/papers/OPEandRA", table_index = 0L) ``` 2. 
Run the optional variable-plausibility review with debug tracing enabled: @@ -147,7 +152,7 @@ Rscript R/visualize_table_from_json.R outputs/papers/OPEandRA/llm_variable_plaus source("R/inspect_paper_outputs.R") options(width = 200) -show_llm_variable_plausibility("outputs/papers/OPEandRA", table_number = 1L) +show_llm_variable_plausibility("outputs/papers/OPEandRA", table_index = 0L) ``` ## Notes diff --git a/table1_parser/llm/client.py b/table1_parser/llm/client.py index 5f60179..dfa9f91 100644 --- a/table1_parser/llm/client.py +++ b/table1_parser/llm/client.py @@ -114,12 +114,15 @@ def structured_completion( if response_model is None: raise LLMConfigurationError("OpenAIClient requires a Pydantic response_model for structured parsing.") try: - response = self._client.responses.parse( - model=self.model, - input=prompt, - temperature=self.temperature, - text_format=response_model, - ) + request_kwargs: dict[str, Any] = { + "model": self.model, + "input": prompt, + "text_format": response_model, + } + model_name = self.model.lower() + if not (model_name.startswith("gpt-5") or re.match(r"^o\d", model_name)): + request_kwargs["temperature"] = self.temperature + response = self._client.responses.parse(**request_kwargs) except Exception as exc: raise LLMProviderError(f"OpenAI structured completion failed: {exc}") from exc diff --git a/tests/test_llm_client.py b/tests/test_llm_client.py index 2009438..a864393 100644 --- a/tests/test_llm_client.py +++ b/tests/test_llm_client.py @@ -74,7 +74,7 @@ def test_settings_reads_llm_environment_variables(monkeypatch) -> None: """Settings should read the documented LLM environment variables directly.""" monkeypatch.setenv("LLM_PROVIDER", "openai") monkeypatch.setenv("OPENAI_API_KEY", "test-key") - monkeypatch.setenv("OPENAI_MODEL", "gpt-4.1-mini") + monkeypatch.setenv("OPENAI_MODEL", "gpt-5.5") monkeypatch.setenv("LLM_TEMPERATURE", "0.1") monkeypatch.setenv("LLM_TIMEOUT_SECONDS", "30") monkeypatch.setenv("LLM_MAX_RETRIES", "4") @@ 
-85,8 +85,8 @@ def test_settings_reads_llm_environment_variables(monkeypatch) -> None: assert settings.llm_provider == "openai" assert settings.openai_api_key == "test-key" - assert settings.openai_model == "gpt-4.1-mini" - assert settings.llm_model == "gpt-4.1-mini" + assert settings.openai_model == "gpt-5.5" + assert settings.llm_model == "gpt-5.5" assert settings.llm_temperature == 0.1 assert settings.llm_timeout_seconds == 30 assert settings.llm_max_retries == 4 @@ -116,7 +116,7 @@ def test_build_llm_client_requires_openai_configuration(monkeypatch) -> None: """Missing required OpenAI settings should fail with a clear configuration error.""" monkeypatch.delenv("OPENAI_API_KEY", raising=False) monkeypatch.delenv("OPENAI_MODEL", raising=False) - settings = Settings(llm_provider="openai", openai_model="gpt-4.1-mini") + settings = Settings(llm_provider="openai", openai_model="gpt-5.5") with pytest.raises(LLMConfigurationError) as exc_info: build_llm_client(settings=settings) @@ -141,7 +141,7 @@ def test_build_llm_client_returns_openai_client_with_fake_sdk(monkeypatch) -> No settings = Settings( llm_provider="openai", openai_api_key="test-key", - openai_model="gpt-4.1-mini", + openai_model="openai-chat-model", llm_temperature=0.2, llm_timeout_seconds=15, llm_max_retries=3, @@ -159,11 +159,34 @@ def test_build_llm_client_returns_openai_client_with_fake_sdk(monkeypatch) -> No assert client._client.api_key == "test-key" # type: ignore[attr-defined] assert client._client.timeout == 15 # type: ignore[attr-defined] assert client._client.max_retries == 3 # type: ignore[attr-defined] - assert client._client.responses.calls[0]["model"] == "gpt-4.1-mini" # type: ignore[attr-defined] + assert client._client.responses.calls[0]["model"] == "openai-chat-model" # type: ignore[attr-defined] + assert client._client.responses.calls[0]["temperature"] == 0.2 # type: ignore[attr-defined] assert client.sdk_debug is False assert client.embeds_output_schema_in_prompt is False +def 
test_openai_client_omits_temperature_for_gpt_5_5(monkeypatch) -> None: + """GPT-5.5 rejects temperature, so the OpenAI request should use model defaults.""" + monkeypatch.setitem(sys.modules, "openai", SimpleNamespace(OpenAI=_FakeOpenAI)) + settings = Settings( + llm_provider="openai", + openai_api_key="test-key", + openai_model="gpt-5.5", + llm_temperature=0.2, + ) + + client = build_llm_client(settings=settings) + client.structured_completion( + "prompt text", + LLMVariablePlausibilityTableReview.model_json_schema(), + response_model=LLMVariablePlausibilityTableReview, + ) + + call = client._client.responses.calls[0] # type: ignore[attr-defined] + assert call["model"] == "gpt-5.5" + assert "temperature" not in call + + def test_build_llm_client_returns_qwen_client_and_parses_json_response() -> None: """The provider builder should construct a Qwen client and parse JSON text output.""" client = build_llm_client( @@ -230,7 +253,7 @@ def test_build_llm_client_separates_sdk_debug_from_artifact_debug(monkeypatch) - settings=Settings( llm_provider="openai", openai_api_key="test-key", - openai_model="gpt-4.1-mini", + openai_model="gpt-5.5", llm_debug=True, llm_sdk_debug=False, ) @@ -243,7 +266,7 @@ def test_build_llm_client_separates_sdk_debug_from_artifact_debug(monkeypatch) - settings=Settings( llm_provider="openai", openai_api_key="test-key", - openai_model="gpt-4.1-mini", + openai_model="gpt-5.5", llm_debug=False, llm_sdk_debug=True, ) @@ -264,7 +287,7 @@ def test_openai_client_raises_provider_error_when_no_parsed_payload(monkeypatch) settings=Settings( llm_provider="openai", openai_api_key="test-key", - openai_model="gpt-4.1-mini", + openai_model="gpt-5.5", ) ) diff --git a/tests/test_r_inspection.py b/tests/test_r_inspection.py index fde763a..8360d16 100644 --- a/tests/test_r_inspection.py +++ b/tests/test_r_inspection.py @@ -592,7 +592,7 @@ def _write_sample_variable_plausibility_debug_run( { "report_timestamp": "2026-03-24T10:15:00Z", "provider": "openai", - 
"model": "gpt-4.1-mini", + "model": "gpt-5.5", "items": [ { "table_id": "tbl-1", @@ -931,6 +931,117 @@ def test_r_inspection_shows_variable_plausibility_review(tmp_path) -> None: assert "score=0.970" in result.stdout +def test_r_inspection_matches_variable_plausibility_review_by_table_id(tmp_path) -> None: + """LLM review helpers should not treat a compact review list as table-index aligned.""" + if not _r_dependencies_available(): + return + + paper_dir = tmp_path / "variable_review_sparse" / "papers" / "paper" + _write_sample_paper_outputs(paper_dir, include_variable_review=False, include_processing_status=True) + + normalized_tables = json.loads((paper_dir / "normalized_tables.json").read_text(encoding="utf-8")) + table_definitions = json.loads((paper_dir / "table_definitions.json").read_text(encoding="utf-8")) + parsed_tables = json.loads((paper_dir / "parsed_tables.json").read_text(encoding="utf-8")) + table_profiles = json.loads((paper_dir / "table_profiles.json").read_text(encoding="utf-8")) + + second_variable = _make_variable("BMI", "BMI", "continuous", 1, 1) + normalized_tables.append( + { + **normalized_tables[0], + "table_id": "tbl-2", + "title": "Table 2", + "caption": "Other characteristics", + "metadata": { + "cleaned_rows": [ + ["Characteristic", "Overall"], + ["BMI", "28.1"], + ], + "table_number": 2, + }, + } + ) + table_definitions.append( + { + "table_id": "tbl-2", + "title": "Table 2", + "caption": "Other characteristics", + "variables": [second_variable], + "column_definition": { + "grouping_label": None, + "grouping_name": None, + "columns": [_make_column(1, "Overall", "Overall", "overall")], + "confidence": 0.9, + }, + "notes": [], + "overall_confidence": 0.9, + } + ) + parsed_tables.append(_make_parsed_table("tbl-2", "Table 2", "Other characteristics", variables=[second_variable])) + table_profiles.append({"table_id": "tbl-2", "table_family": "descriptive_characteristics", "confidence": 0.9}) + + _write_json(paper_dir / 
"normalized_tables.json", normalized_tables) + _write_json(paper_dir / "table_definitions.json", table_definitions) + _write_json(paper_dir / "parsed_tables.json", parsed_tables) + _write_json(paper_dir / "table_profiles.json", table_profiles) + _write_json( + paper_dir / "table_variable_plausibility_llm.json", + [ + { + "table_id": "tbl-2", + "variables": [ + { + "variable_name": "BMI", + "variable_label": "BMI", + "variable_type": "continuous", + "row_start": 1, + "row_end": 1, + "levels": [], + "plausibility_score": 0.99, + "plausibility_note": "", + } + ], + "notes": [], + "overall_plausibility": 0.99, + } + ], + ) + + missing_review_result = subprocess.run( + [ + "Rscript", + "-e", + ( + f'source("{R_SCRIPT}"); ' + f'show_llm_variable_plausibility("{paper_dir}", table_index = 0L)' + ), + ], + capture_output=True, + text=True, + cwd=REPO_ROOT, + check=False, + ) + assert missing_review_result.returncode != 0 + assert "No variable-plausibility review found for table_index=0" in missing_review_result.stderr + + matched_review_result = subprocess.run( + [ + "Rscript", + "-e", + ( + f'source("{R_SCRIPT}"); ' + f'show_llm_variable_plausibility("{paper_dir}", table_index = 1L)' + ), + ], + capture_output=True, + text=True, + cwd=REPO_ROOT, + check=False, + ) + assert matched_review_result.returncode == 0, matched_review_result.stderr + assert "table_index: 1" in matched_review_result.stdout + assert "BMI" in matched_review_result.stdout + + def test_r_compare_normalized_rows_to_definition_supports_table_number(tmp_path) -> None: """The standalone R helper should compare normalized labels to definition labels by paper table number.""" if not _r_dependencies_available(): diff --git a/tests/test_schemas.py b/tests/test_schemas.py index 68b3df2..f94ec27 100644 --- a/tests/test_schemas.py +++ b/tests/test_schemas.py @@ -235,7 +235,7 @@ def test_llm_variable_plausibility_monitoring_creation_and_serialization() -> No report = LLMVariablePlausibilityMonitoringReport( 
report_timestamp="2026-03-24T10:15:00Z", provider="openai", - model="gpt-4.1-mini", + model="gpt-5.5", items=[ LLMVariablePlausibilityCallRecord( table_id="tbl-1",