From c56de934ef800c1bcccd160c038894b944602f7a Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Thu, 30 Apr 2026 06:03:57 -0700
Subject: [PATCH 01/10] feat: vocabulary evaluator

---
 evals/vocabulary_evaluator.ipynb              |  67 +-
 .../learning_commons_evaluators/__init__.py   |  14 +
 .../evaluators/__init__.py                    |   6 +
 .../evaluators/vocabulary.py                  | 251 +++++++
 .../schemas/vocabulary.py                     |  58 ++
 .../_generated_vocabulary_settings.py         | 404 +++++++++++
 .../settings/vocabulary/__init__.py           |   0
 .../settings/vocabulary/contracts.toml        | 648 ++++++++++++++++++
 .../tests/contract_tests/test_vocabulary.py   | 128 ++++
 .../python/tests/contract_tests/vocabulary.py | 137 ++++
 .../tests/evaluators/test_vocabulary.py       | 293 ++++++++
 .../tests/settings/test_load_settings.py      |   4 +-
 sdks/settings/vocabulary/contracts.toml       | 648 ++++++++++++++++++
 sdks/settings/vocabulary/settings.toml        | 410 +++++++++++
 14 files changed, 3059 insertions(+), 9 deletions(-)
 create mode 100644 sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py
 create mode 100644 sdks/python/src/learning_commons_evaluators/schemas/vocabulary.py
 create mode 100644 sdks/python/src/learning_commons_evaluators/settings/_generated_vocabulary_settings.py
 create mode 100644 sdks/python/src/learning_commons_evaluators/settings/vocabulary/__init__.py
 create mode 100644 sdks/python/src/learning_commons_evaluators/settings/vocabulary/contracts.toml
 create mode 100644 sdks/python/tests/contract_tests/test_vocabulary.py
 create mode 100644 sdks/python/tests/contract_tests/vocabulary.py
 create mode 100644 sdks/python/tests/evaluators/test_vocabulary.py
 create mode 100644 sdks/settings/vocabulary/contracts.toml
 create mode 100644 sdks/settings/vocabulary/settings.toml

diff --git a/evals/vocabulary_evaluator.ipynb b/evals/vocabulary_evaluator.ipynb
index 892ecdf..ce9deda 100644
--- a/evals/vocabulary_evaluator.ipynb
+++ b/evals/vocabulary_evaluator.ipynb
@@ -83,7 +83,8 @@
     "from langchain_google_genai import ChatGoogleGenerativeAI\n",
     "from langchain_openai import ChatOpenAI\n",
     "from pydantic import BaseModel, Field\n",
-    "from textstat import textstat as ts"
+    "from textstat import textstat as ts\n",
+    "from capture import reset_captures, capture_llm, capture_case, build_contract_toml\n"
    ]
   },
   {
@@ -152,7 +153,7 @@
     "# Define the model to be used for student background knowledge generation\n",
     "BK_MODEL = \"gpt-4o-2024-11-20\"\n",
     "BK_TEMPERATURE = 0\n",
-    "student_bk_model = ChatOpenAI(model=BK_MODEL, temperature=BK_TEMPERATURE)"
+    "student_bk_model = ChatOpenAI(model=BK_MODEL, temperature=BK_TEMPERATURE)\n"
    ]
   },
   {
@@ -184,7 +185,7 @@
     "    \"\"\"Use the background knowledge prompt from the prompts file.\"\"\"\n",
     "    prompt = prompts.bk_prompt.format(text=text, grade=grade)\n",
     "\n",
-    "    return student_bk_model.invoke(prompt).content"
+    "    return capture_llm(\"background_knowledge\", student_bk_model).invoke(prompt).content\n"
    ]
   },
   {
@@ -231,7 +232,7 @@
     "        \"fk_level\",\n",
     "    ],\n",
     "    \"outputParser\": JsonOutputParser(pydantic_object=Output),\n",
-    "}"
+    "}\n"
    ]
   },
   {
@@ -374,7 +375,7 @@
     "    \n",
     "    # For GRADES_3_4, complexity_score already exists as a string - no changes needed\n",
     "\n",
-    "    return output"
+    "    return output\n"
    ]
   },
   {
@@ -421,13 +422,13 @@
     "    )\n",
     "\n",
     "    # Invoke the chain\n",
-    "    chain = prompt | get_vocab_model_for_grade(grade) | JsonOutputParser()\n",
+    "    chain = prompt | capture_llm(\"vocab_complexity\", get_vocab_model_for_grade(grade)) | JsonOutputParser()\n",
     "\n",
     "    # Get output and normalize it\n",
     "    output = chain.invoke(dataset)\n",
     "    output = normalize_complexity_output(output)\n",
     "\n",
-    "    return output"
+    "    return output\n"
    ]
   },
   {
@@ -481,6 +482,58 @@
     "prettify_vocab_complexity_output(vocabulary_complexity_output)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "reset_captures()\n",
+    "text = \"\"\"\n",
+    "Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.\n",
+    "Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.\n",
+    "His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.\n",
+    "\"\"\"\n",
+    "input = {\"text\": text, \"grade\": 3}\n",
+    "result = predict_text_complexity_level(**input)\n",
+    "\n",
+    "capture = capture_case(\n",
+    "    name=\"marco_polo_grade3\",\n",
+    "    description=\"Marco Polo passage, grade 3 (grades 3-4 Gemini path)\",\n",
+    "    input=input,\n",
+    "    llm_call_captures=[\"background_knowledge\", \"vocab_complexity\"],\n",
+    "    expected_result=result,\n",
+    ")\n",
+    "\n",
+    "print(build_contract_toml(capture))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "reset_captures()\n",
+    "text = \"\"\"\n",
+    "Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.\n",
+    "\n",
+    "Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.\n",
+    "\"\"\"\n",
+    "input = {\"text\": text, \"grade\": 7}\n",
+    "result = predict_text_complexity_level(**input)\n",
+    "\n",
+    "capture = capture_case(\n",
+    "    name=\"hurricanes_grade7\",\n",
+    "    description=\"Hurricane formation passage, grade 7 (grades 5-12 GPT path)\",\n",
+    "    input=input,\n",
+    "    llm_call_captures=[\"background_knowledge\", \"vocab_complexity\"],\n",
+    "    expected_result=result,\n",
+    ")\n",
+    "\n",
+    "print(build_contract_toml(capture))"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
diff --git a/sdks/python/src/learning_commons_evaluators/__init__.py b/sdks/python/src/learning_commons_evaluators/__init__.py
index 5dab77b..180fea6 100644
--- a/sdks/python/src/learning_commons_evaluators/__init__.py
+++ b/sdks/python/src/learning_commons_evaluators/__init__.py
@@ -35,6 +35,8 @@
     ConventionalityEvaluator,
     InputT,
     OutputT,
+    VocabularyEvaluationInput,
+    VocabularyEvaluator,
 )
 from learning_commons_evaluators.evaluators.conventionality import (
     ConventionalityEvaluationInput,
@@ -82,6 +84,12 @@
 from learning_commons_evaluators.schemas.text_complexity import (
     TextComplexityEvaluationInput,
 )
+from learning_commons_evaluators.schemas.vocabulary import (
+    VOCABULARY_SUPPORTED_GRADES,
+    VocabularyEvaluationSettings,
+    VocabularyOutputGrades34,
+    VocabularyOutputOtherGrades,
+)
 
 __all__ = [
     "__description__",
@@ -129,6 +137,12 @@
     "TextInputField",
     "TokenUsage",
     "ValidationError",
+    "VOCABULARY_SUPPORTED_GRADES",
+    "VocabularyEvaluationInput",
+    "VocabularyEvaluationSettings",
+    "VocabularyEvaluator",
+    "VocabularyOutputGrades34",
+    "VocabularyOutputOtherGrades",
     "create_config",
     "create_config_no_telemetry",
     "create_config_telemetry_with_full_input",
diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/__init__.py b/sdks/python/src/learning_commons_evaluators/evaluators/__init__.py
index 50821c6..d42cd15 100644
--- a/sdks/python/src/learning_commons_evaluators/evaluators/__init__.py
+++ b/sdks/python/src/learning_commons_evaluators/evaluators/__init__.py
@@ -11,6 +11,10 @@
     ConventionalityEvaluationInput,
     ConventionalityEvaluator,
 )
+from learning_commons_evaluators.evaluators.vocabulary import (
+    VocabularyEvaluationInput,
+    VocabularyEvaluator,
+)
 
 __all__ = [
     "BaseEvaluator",
@@ -18,4 +22,6 @@
     "ConventionalityEvaluator",
     "InputT",
     "OutputT",
+    "VocabularyEvaluationInput",
+    "VocabularyEvaluator",
 ]
diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py b/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py
new file mode 100644
index 0000000..3f00797
--- /dev/null
+++ b/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py
@@ -0,0 +1,251 @@
+"""Vocabulary evaluator: evaluates text for vocabulary complexity relative to grade level."""
+
+from __future__ import annotations
+
+from typing import ClassVar
+
+import textstat
+from langchain_core.output_parsers import JsonOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from pydantic import Field
+
+from learning_commons_evaluators.evaluators.base import BaseEvaluator
+from learning_commons_evaluators.schemas.common_inputs import (
+    GradeInputField,
+    TextInputField,
+)
+from learning_commons_evaluators.schemas.config import PromptSettings
+from learning_commons_evaluators.schemas.evaluator import (
+    EvaluationExplanation,
+    EvaluationInput,
+)
+from learning_commons_evaluators.schemas.metadata import (
+    EvaluationMetadata,
+    EvaluatorMetadata,
+)
+from learning_commons_evaluators.schemas.text_complexity import (
+    TextComplexityAnswer,
+    TextComplexityResult,
+)
+from learning_commons_evaluators.schemas.vocabulary import (
+    VocabularyEvaluationSettings,
+    VocabularyOutputGrades34,
+    VocabularyOutputOtherGrades,
+)
+from learning_commons_evaluators.settings._generated_vocabulary_settings import (
+    CONFIG as _VOCABULARY_CONFIG,
+)
+
+_INPUT_SETTINGS = _VOCABULARY_CONFIG.evaluator_metadata.inputs
+
+# Grades 3–4 use the Gemini-based rubric prompt; grades 5–12 use the GPT-based annotation guide prompt.
+_GRADES_3_4 = frozenset({3, 4})
+
+
+class VocabularyEvaluationInput(EvaluationInput):
+    """Input for a vocabulary evaluation.
+
+    Constraints (allowed grade levels) are sourced from
+    ``[[evaluator_metadata.inputs]]`` in evaluator settings and applied
+    automatically — callers supply raw values, not field objects.
+
+    Example::
+
+        inp = VocabularyEvaluationInput(text="The quick brown fox...", grade_level=5)
+    """
+
+    _input_settings: ClassVar[dict] = _INPUT_SETTINGS
+
+    text: TextInputField = Field(description="The text to evaluate.")
+    grade_level: GradeInputField = Field(description="The grade level of the text (3–12).")
+
+    def __init__(self, *, text: str, grade_level: int, **kwargs):
+        super().__init__(text=text, grade_level=grade_level, **kwargs)
+
+
+class VocabularyEvaluator(
+    BaseEvaluator[VocabularyEvaluationInput, TextComplexityResult, VocabularyEvaluationSettings]
+):
+    """Evaluates text for vocabulary complexity relative to the target grade level.
+
+    The evaluation runs in two steps:
+    1. **Background knowledge** – the model generates a short assumption about what
+       students at the target grade already know about the text's topic.  This
+       provides context that keeps the complexity rating from penalising familiar
+       domain words.
+    2. **Vocabulary complexity** – a grade-specific prompt + model produces the
+       final score and reasoning.  Grades 3–4 use a Gemini model and grades 5–12 a
+       GPT model; both are parsed into a rubric label plus a word-level breakdown,
+       but only grades 3–4 expose the breakdown in the explanation details.
+
+    Supported grades: 3–12.
+    """
+
+    metadata: EvaluatorMetadata = _VOCABULARY_CONFIG.evaluator_metadata
+    default_evaluation_settings: VocabularyEvaluationSettings = (
+        _VOCABULARY_CONFIG.evaluation_settings
+    )
+
+    def evaluate_impl(
+        self,
+        input: VocabularyEvaluationInput,
+        evaluation_settings: VocabularyEvaluationSettings,
+        evaluation_metadata: EvaluationMetadata,
+    ) -> TextComplexityResult:
+        """Run the two-step vocabulary evaluation and return a TextComplexityResult.
+
+        Grade validation is handled by the framework before this method is called:
+        ``VocabularyEvaluationInput`` constrains ``grade_level`` to the
+        ``allowed_grades`` declared for the input in the evaluator settings (3–12,
+        mirrored by ``VOCABULARY_SUPPORTED_GRADES``), so ``BaseEvaluator.evaluate``
+        raises before reaching here for unsupported grades.
+        """
+        ps_bk = evaluation_settings.prompt_settings_step_background_knowledge
+        ps_34 = evaluation_settings.prompt_settings_step_vocab_grades_3_4
+        ps_og = evaluation_settings.prompt_settings_step_vocab_other_grades
+        assert ps_bk is not None and ps_34 is not None and ps_og is not None
+
+        grade = input.grade_level.value
+        fk_score = round(textstat.flesch_kincaid_grade(input.text.value), 2)
+        prompts = _VOCABULARY_CONFIG.prompts
+
+        # ── Step 1: background knowledge ──────────────────────────────────────
+        # parser_output_type=None → execute_prompt_chain_step returns plain str.
+        bk_template = ChatPromptTemplate.from_messages(
+            [("human", prompts["background_knowledge_prompt"])]
+        )
+        background_knowledge: str = self.execute_prompt_chain_step(
+            step_name="background_knowledge",
+            prompt_settings=ps_bk,
+            evaluation_metadata=evaluation_metadata,
+            template=bk_template,
+            chain_inputs={"text": input.text.value, "grade": grade},
+            parser_output_type=None,
+        )
+
+        # ── Step 2: vocabulary complexity (grade-specific) ────────────────────
+        if grade in _GRADES_3_4:
+            answer, explanation = self._evaluate_grades_3_4(
+                input=input,
+                grade=grade,
+                fk_score=fk_score,
+                background_knowledge=background_knowledge,
+                evaluation_metadata=evaluation_metadata,
+                prompts=prompts,
+                prompt_settings_vocab=ps_34,
+            )
+        else:
+            answer, explanation = self._evaluate_other_grades(
+                input=input,
+                grade=grade,
+                background_knowledge=background_knowledge,
+                evaluation_metadata=evaluation_metadata,
+                prompts=prompts,
+                prompt_settings_vocab=ps_og,
+            )
+
+        return TextComplexityResult(
+            answer=answer,
+            explanation=explanation,
+            metadata=evaluation_metadata,
+        )
+
+    # ── Private helpers ───────────────────────────────────────────────────────
+
+    def _evaluate_grades_3_4(
+        self,
+        *,
+        input: VocabularyEvaluationInput,
+        grade: int,
+        fk_score: float,
+        background_knowledge: str,
+        evaluation_metadata: EvaluationMetadata,
+        prompts: dict,
+        prompt_settings_vocab: PromptSettings,
+    ) -> tuple[TextComplexityAnswer, EvaluationExplanation]:
+        """Run the grades 3–4 vocabulary complexity step.
+
+        Returns a rubric-label score and a word-breakdown explanation.
+        """
+        parser = JsonOutputParser(pydantic_object=VocabularyOutputGrades34)
+        template = ChatPromptTemplate.from_messages(
+            [
+                ("system", prompts["vocab_grades_3_4_system_prompt"]),
+                ("human", prompts["vocab_grades_3_4_user_prompt"]),
+            ]
+        ).partial(format_instructions=parser.get_format_instructions())
+
+        output = self.execute_prompt_chain_step(
+            step_name="vocab_complexity",
+            prompt_settings=prompt_settings_vocab,
+            evaluation_metadata=evaluation_metadata,
+            template=template,
+            chain_inputs={
+                "text": input.text.value,
+                "student_grade_level": grade,
+                "student_background_knowledge": background_knowledge,
+                "fk_level": fk_score,
+            },
+            parser_output_type=VocabularyOutputGrades34,
+        )
+        assert isinstance(output, VocabularyOutputGrades34)
+
+        # Normalise the score string: the prompt may return spaces ("very complex")
+        # but TextComplexityAnswer expects underscores ("very_complex").
+        score = output.complexity_score.lower().replace(" ", "_")
+        answer = TextComplexityAnswer.from_score(score)
+        explanation = EvaluationExplanation(
+            summary=output.reasoning,
+            details={
+                "tier_2_words": output.tier_2_words,
+                "tier_3_words": output.tier_3_words,
+                "archaic_words": output.archaic_words,
+                "other_complex_words": output.other_complex_words,
+            },
+        )
+        return answer, explanation
+
+    def _evaluate_other_grades(
+        self,
+        *,
+        input: VocabularyEvaluationInput,
+        grade: int,
+        background_knowledge: str,
+        evaluation_metadata: EvaluationMetadata,
+        prompts: dict,
+        prompt_settings_vocab: PromptSettings,
+    ) -> tuple[TextComplexityAnswer, EvaluationExplanation]:
+        """Run the grades 5–12 vocabulary complexity step.
+
+        Returns a string-label score and a reasoning-only explanation.
+        The OTHER_GRADES prompt uses the same ``Output``-style schema as grades 3–4,
+        so the LLM also returns a word-level breakdown; it is parsed but not surfaced.
+        """
+        parser = JsonOutputParser(pydantic_object=VocabularyOutputOtherGrades)
+        template = ChatPromptTemplate.from_messages(
+            [
+                ("system", prompts["vocab_other_grades_system_prompt"]),
+                ("human", prompts["vocab_other_grades_user_prompt"]),
+            ]
+        ).partial(format_instructions=parser.get_format_instructions())
+
+        output = self.execute_prompt_chain_step(
+            step_name="vocab_complexity",
+            prompt_settings=prompt_settings_vocab,
+            evaluation_metadata=evaluation_metadata,
+            template=template,
+            chain_inputs={
+                "text": input.text.value,
+                "student_grade_level": grade,
+                "student_background_knowledge": background_knowledge,
+            },
+            parser_output_type=VocabularyOutputOtherGrades,
+        )
+        assert isinstance(output, VocabularyOutputOtherGrades)
+
+        # Normalise the score string: the prompt may return spaces ("slightly complex")
+        # but TextComplexityAnswer expects underscores ("slightly_complex").
+        score = output.complexity_score.lower().replace(" ", "_")
+        answer = TextComplexityAnswer.from_score(score)
+        explanation = EvaluationExplanation(summary=output.reasoning, details={})
+        return answer, explanation
diff --git a/sdks/python/src/learning_commons_evaluators/schemas/vocabulary.py b/sdks/python/src/learning_commons_evaluators/schemas/vocabulary.py
new file mode 100644
index 0000000..7e3e2e9
--- /dev/null
+++ b/sdks/python/src/learning_commons_evaluators/schemas/vocabulary.py
@@ -0,0 +1,58 @@
+"""Vocabulary evaluator schemas."""
+
+from pydantic import BaseModel, Field
+
+from learning_commons_evaluators.schemas.config import (
+    EvaluationSettings,
+    PromptSettings,
+)
+
+# Grades supported by the vocabulary evaluator.
+# Kept here for backwards-compatibility; the actual constraint is now enforced
+# via allowed_grades in [[evaluator_metadata.inputs]] in the vocabulary evaluator settings.
+VOCABULARY_SUPPORTED_GRADES: frozenset[int] = frozenset(range(3, 13))
+
+
+class VocabularyEvaluationSettings(EvaluationSettings):
+    """Settings for a vocabulary complexity evaluation.
+
+    Requires three prompt step configurations:
+    - ``prompt_settings_step_background_knowledge``: generates a student background knowledge
+      assumption (used as context for the vocabulary complexity step).
+    - ``prompt_settings_step_vocab_grades_3_4``: vocabulary complexity for grades 3–4.
+    - ``prompt_settings_step_vocab_other_grades``: vocabulary complexity for grades 5–12.
+    """
+
+    prompt_settings_step_background_knowledge: PromptSettings | None = None
+    prompt_settings_step_vocab_grades_3_4: PromptSettings | None = None
+    prompt_settings_step_vocab_other_grades: PromptSettings | None = None
+
+
+# Note: avoid class docstrings here — Pydantic adds them as a root ``description``
+# key in ``model_json_schema()``, which changes ``JsonOutputParser`` format
+# instructions and breaks vocabulary contract tests that snapshot prompts without
+# that key (see ``contracts.toml``). Document behavior on fields / comments.
+
+
+class VocabularyOutputGrades34(BaseModel):
+    # LLM output for grades 3–4: rubric label + word breakdown; mirrors evals notebook Output.
+    tier_2_words: str = Field(description="List of Tier 2 words")
+    tier_3_words: str = Field(description="List of Tier 3 words")
+    archaic_words: str = Field(description="List of Archaic words")
+    other_complex_words: str = Field(description="List of Other Complex words")
+    complexity_score: str = Field(
+        description="the complexity of the text, one of: slightly complex, moderately complex, very complex, or exceedingly complex"
+    )
+    reasoning: str = Field(description="your reasoning for your answer")
+
+
+class VocabularyOutputOtherGrades(BaseModel):
+    # Same shape as grades 3–4; complexity_score is normalised to underscores before TextComplexityAnswer.
+    tier_2_words: str = Field(description="List of Tier 2 words")
+    tier_3_words: str = Field(description="List of Tier 3 words")
+    archaic_words: str = Field(description="List of Archaic words")
+    other_complex_words: str = Field(description="List of Other Complex words")
+    complexity_score: str = Field(
+        description="the complexity of the text, one of: slightly complex, moderately complex, very complex, or exceedingly complex"
+    )
+    reasoning: str = Field(description="your reasoning for your answer")
diff --git a/sdks/python/src/learning_commons_evaluators/settings/_generated_vocabulary_settings.py b/sdks/python/src/learning_commons_evaluators/settings/_generated_vocabulary_settings.py
new file mode 100644
index 0000000..ee4650c
--- /dev/null
+++ b/sdks/python/src/learning_commons_evaluators/settings/_generated_vocabulary_settings.py
@@ -0,0 +1,404 @@
+# !! AUTO-GENERATED — do not edit directly.
+# Source: sdks/settings/vocabulary/settings.toml
+# Regenerate : python scripts/generate_settings.py
+# Staleness check: python scripts/generate_settings.py --check
+
+from __future__ import annotations
+
+from learning_commons_evaluators.schemas.config import LlmProvider, PromptSettings
+from learning_commons_evaluators.schemas.input_specs import GradeInputSpec, TextInputSpec
+from learning_commons_evaluators.schemas.metadata import EvaluatorMaturity, EvaluatorMetadata
+from learning_commons_evaluators.schemas.vocabulary import VocabularyEvaluationSettings
+from learning_commons_evaluators.settings.load_settings import EvaluatorSettingsResult
+
+# ── Evaluator metadata ────────────────────────────────────────────────────────
+
+_EVALUATOR_METADATA = EvaluatorMetadata(
+    id='vocabulary',
+    version='0.1',
+    name='Vocabulary',
+    description='Vocabulary Complexity Evaluator',
+    maturity=EvaluatorMaturity.alpha,
+    inputs={
+        'text': TextInputSpec(
+            name='text',
+            description='The text to evaluate for vocabulary complexity.',
+        ),
+        'grade_level': GradeInputSpec(
+            name='grade_level',
+            description='The grade level of the text (3–12).',
+            allowed_grades=[3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
+        ),
+    },
+)
+
+# ── Prompt templates ──────────────────────────────────────────────────────────
+
+_PROMPTS: dict[str, str] = {
+    'background_knowledge_prompt': """Review the following text, which is an educational text written for students in the following grade band: {grade}.
+
+Your job is to give me a background knowledge assumption; that is: what topics, if any, from the text students are likely to be familiar with based on a standard progression of topics in US public school education, as well as topics, if any the student is not likely to be familiar with.
+
+Make sure your response is concise (between 1 - 3 lines max) and is about the topics themselves, not about any other aspect of the text (e.g. flowery language, complicated sentence structure, etc.).
+
+Here's an example:
+[START EXAMPLE]
+Grade Band: 11th
+Text: I went to the woods because I wished to live deliberately, to front only the essential facts of life, and see if I could not
+learn what it had to teach, and not, when I came to die, discover that I had not lived. I did not wish to live what was
+not life, living is so dear; nor did I wish to practise resignation, unless it was quite necessary. I wanted to live deep and suck out all the marrow of life, to live so sturdily and Spartan-like as to put to rout all that was not life, to cut a broad swath and shave close, to drive life into a corner, and reduce it to its lowest terms, and, if it proved to be mean, why then to get the whole and genuine meanness of it, and publish its meanness to the world; or if it were sublime, to
+know it by experience, and be able to give a true account of it in my next excursion. For most men, it appears to me,
+are in a strange uncertainty about it, whether it is of the devil or of God, and have somewhat hastily concluded that it
+is the chief end of man here to “glorify God and enjoy him forever.”
+
+Background Knowledge Assumption: Assume they’ve studied American Transcendentalists like Thoreau and Emerson, including the mid-19th-century context of nature-focused philosophy.
+[END EXAMPLE]
+
+You should assume that the student is an average US public school who is learning from common core curriculum. When you respond, just respond with the background knowledge assumption and nothing else.
+
+You can use the following list of topics that we know are covered for each grade level, although use your best judgement if you know there are other topics out there that students are likely to have covered. And this doesn't cover higher grade levels, so you'll have to again use your judgement for, say, what background knowledge a 9th grader is likely to have:
+[BEGIN TOPICS]
+[
+    K: [
+        "Toys and Play", "Weather Wonders", "Trees are Alive", "Enjoying and Appreciating Trees",
+        "The Five Senses: How do our senses help us learn?", "Once Upon a Farm: What makes a good story?",
+        "America, Then and Now: How has life in America changed over time?", "The Continents: What makes the world fascinating?",
+        "Needs of Plants and Animals", "Pushes and Pulls", "Sunlight and Weather", "Learning and Working Together",
+        "How Do People Learn and Work Together?", "Where Do We Live?", "What Does it Mean to Be an American?",
+        "How Has Our World Changed?", "Why Do People Have Jobs?"
+    ],
+    1: [
+        "Tools and Work", "A Study of the Sun, Moon, and Stars", "Birds' Amazing Bodies", "Caring for Birds",
+        "A World of Books: How do books change lives around the world?", "Creature Features: What can we discover about animals’ unique features?",
+        "Powerful Forces: How do people respond to the powerful force of the wind?", "Cinderella Stories: Why do people around the world admire Cinderella?",
+        "Animal and Plant Defenses", "Light and Sounds", "Spinning Earth", "Our Place in the World",
+        "What Are the Rights and Responsibilities of Citizens?", "How Can We Describe Where We Live?",
+        "How Do We Celebrate Our Country?", "How Does the Past Shape Our Lives?", "Why Do People Work?"
+    ],
+    2: [
+        "Schools and Community", "Fossils Tell of Earth's Changes", "The Secret World of Pollination", "Providing for Pollinators",
+        "A Season of Change: How does change impact people and nature?", "The American West: What was life like in the West for early Americans?",
+        "Civil Rights Heroes: How can people respond to injustice?", "Good Eating: How does food nourish us?",
+        "Plant and Animal Relationships", "Properties of Matter", "Changing Landforms", "Exploring Who We Are",
+        "Why Is It Important to Learn About the Past?", "How Does Geography Help Us Understand Our World?",
+        "How Do We Get What We Want and Need?", "Why Do We Need Government?", "How Can People Make a Difference in Our World?"
+    ],
+    "3": [
+        "Overcoming Learning Challenges Near and Far", "Adaptations and the Wide World of Frogs", "Exploring Literary Classics",
+        "Water Around the World", "Ocean/Sea Exploration", "Outer Space", "Immigration", "Art/Being an Artist",
+        "Balancing Forces", "Inheritance and Traits", "Environments and Survival", "Weather and Climate",
+        "Communities", "Why Does It Matter Where We Live?", "What Is Our Relationship With Our Environment?",
+        "What Makes a Community Unique?", "How Does the Past Impact the Present?", "Why Do Governments and Citizens Need Each Other?",
+        "How Do People in a Community Meet Their Wants and Needs?"
+    ],
+    4: [
+        "Poetry", "Animal Defense Mechanisms", "The American Revolution",
+        "Responding to Inequality: Ratifying the 19th Amendment (covers gender and racial inequality)",
+        "A Great Heart: What does it mean to have a great heart, literally and figuratively?",
+        "Extreme Settings: How does a challenging setting or physical environment change a person?",
+        "American Revolution/Multiple Perspectives", "Myths/Myth Making", "Energy Conversions", "Vision and Light",
+        "Earth's Features", "Waves, Energy, and Information", "Regions of the United States",
+        "How Does America Use Its Strengths and Face Its Challenges?", "Why Have People Moved to and From the Northeast?",
+        "How Has the Southeast Changed Over Time?", "How Does the Midwest Reflect the Spirit of America?",
+        "How Does the Southwest Reflect Its Diverse Past and Unique Environment?", "What Draws People to the West?"
+    ],
+    5: [
+        "Human Rights", "Biodiversity in the Rainforest", "Athlete Leaders of Social Change",
+        "Impact of Natural Disasters", "Cultures in Conflict: How do cultural beliefs and values guide people?",
+        "Word Play: How and why do writers play with words?", "A War Between Us: How did the Civil War impact people?",
+        "Breaking Barriers: How can sports influence individuals and societies?", "Patterns of Earth and Sky",
+        "Modeling Matter", "The Earth System", "Ecosystem Restoration", "U.S. History: Making a New Nation",
+        "How Were the Lives of Native Peoples Influenced by Where They Lived?",
+        "What Happened When Diverse Cultures Crossed Paths?", "What Is the Impact of People Settling in a New Place?",
+        "Why Would a Nation Want to Become Independent?", "What Does the Revolutionary Era Tell Us About Our Nation Today?",
+        "How Does the Constitution Help Us Understand What It Means to Be an American?",
+        "What Do the Early Years of the United States Reveal About the Character of the Nation?",
+        "What Was the Effect of the Civil War on U.S. Society?"
+    ],
+    6: [
+        "Greek Mythology", "Critical Problems and Design Solutions", "American Indian Boarding Schools",
+        "Remarkable Accomplishments in Space Science", "Resilience in the Great Depression: How can enduring tremendous hardship contribute to personal transformation?",
+        "A Hero’s Journey: What is the significance and power of the hero’s journey?",
+        "Narrating the Unknown: How did the social and environmental factors in the unknown world of Jamestown shape its development and decline?",
+        "Courage in Crisis: How can the challenges of a hostile environment inspire heroism?",
+        "Microbiome", "Metabolism", "Metabolism Engineering", "Traits and Reproduction", "Thermal Energy",
+        "Ocean, Atmosphere, and Climate", "Weather Patterns", "Earth's Changing Climate",
+        "Earth's Changing Climate: Engineering Internship", "The First Americans (up to 1492)",
+        "Exploration and Colonization", "English Colonies", "American Revolution", "First Governments and the Constitution",
+        "The Early American Republic", "Political and Geographic Changes (1828-1850)", "Life in the North and South (1820-1860)",
+        "Division and Civil War (1821-1865)", "Reconstruction (1865-1896)", "The West (1858-1896)",
+        "New Industry and a Changing Society", "Expansion and War", "The 1920s and 1930s", "World War II",
+        "The Cold War", "Civil Rights and American Society", "America Since the 1970s"
+    ],
+    7: [
+        "The Lost Children of Sudan (Genocide, Genocide in Sudan)", "Epidemics", "Harlem Renaissance", "Plastic Pollution",
+        "Identity in the Middle Ages: How does society both support and limit the development of identity?",
+        "Americans All: How did World War II affect individuals?", "Language and Power: What is the power of language?",
+        "Fever: How can times of crisis affect citizens and society?", "Geology on Mars", "Plane Motion", "Plane Motion Engineering",
+        "Rock Formations", "Phase Change", "Phase Change Engineering", "Chemical Reactions", "Populations and Resources",
+        "Matter and Energy in Ecosystems", "Early Humans and Agricultural Revolution", "Fertile Crescent",
+        "Ancient Egypt and Kush", "The Israelites", "Ancient Greece", "Ancient South Asia", "Early China, Korea, and Japan",
+        "Ancient Rome", "Rise of Christian Kingdoms", "The Americas", "Medieval Europe", "The Rise of Islamic Empires",
+        "China in the Middle Ages", "Korea and Japan in the Middle Ages", "African Civilizations", "New Ways of Thinking",
+        "Age of Exploration and Trade", "Revolutions and Empires", "The Modern World"
+    ],
+    8: [
+        "Folklore of Latin America", "Food Choices", "The Holocaust", "Japanese American Internment",
+        "The Poetics and Power of Storytelling: What is the power of storytelling?",
+        "The Great War: How do literature and art illuminate the effects of World War I?", "What Is Love?",
+        "Teens as Change Agents: How do people effect social change?", "Harnessing Human Energy",
+        "Force and Motion", "Force and Motion Engineering", "Magnetic Fields", "Light Waves", "Earth, Moon, and Sun",
+        "Natural Selection", "Natural Selection Engineering", "Evolutionary History", "The World in Spatial Terms",
+        "Places and Regions", "Physical Geography", "Population Geography", "Economic Geography",
+        "Political Geography", "Human-Environment Geography", "What is Economics?", "Markets, Money, and Businesses",
+        "Government and the Economy", "The Global Economy"
+    ]
+]
+[END TOPICS]
+
+Here is the text:
+[BEGIN TEXT]
+{text}
+[END TEXT]
+""",
+    'vocab_grades_3_4_system_prompt': """You are an expert curriculum designer. Your job is to rate the complexity of a text's vocabulary relative to the grade level.
+
+You will be given a rubric (with levels from least to most complex: slightly complex, moderately complex, very complex, exceedingly complex) as well as guidelines for interpreting the rubric.
+IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meaning, etc.)
+
+**Resource 1: Qualitative Text Complexity rubric (SAP)**
+1.  **Level 1: Slightly complex**
+    *   Original Definition: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. A very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0.
+    *   Summary definition: Overall, vocabulary is easy to understand and does not impede comprehension of the bulk of the text (including main idea and supporting claims). 1-2 quick pauses for processing by the student are ok here!
+2.  **Level 2: Moderately complex**
+    *   Original Definition: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK
+    *   Summary definition: Overall, vocabulary generally allows students to comprehend the bulk of the text with little difficulty, though there may be occasional pauses for clarification. Several quick pauses or occasional prolonged pauses may occur.
+3.  **Level 3: Very complex**
+    *   Original Definition: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic
+    *   Summary definition: Overall, vocabulary often presents challenges that may slow down comprehension but does not completely block the comprehension of the bulk of the text.
+4.  **Level 4: Exceedingly complex**
+    *   Original Definition: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading.
+    *   Summary definition: Overall, vocabulary is so complex that it makes comprehension of the bulk of the text very challenging and requires careful effort to interpret.
+
+**Resource 2: Flesch-Kincaid Grade Level**
+Use the Flesch-Kincaid (FK) Grade Level as light guidance of the approximate grade level based on readability. The metric alone does not provide final information of vocabulary complexity, but a ballpark of the difficulty of the entire text.
+*   grade 2-3: 1.98-5.34
+*   grade 4-5: 4.51-7.73
+*   grade 6-8: 6.51-10.34
+*   grade 9-10: 8.32-12.12
+*   grade 11-College: 10.34-14.2
+
+**Guidelines for Interpretation and Reasoning**
+
+Your reasoning is the most critical part of your analysis. It's not enough to simply count complex words. You must analyze their impact on a student at the specified grade level. Use the following principles to guide your judgment:
+
+1.  **Density and Cumulative Effect:** Do not just count complex words; evaluate their concentration. A short text with a high density of challenging Tier 2 words (e.g., `peculiar`, `mischievous`, `courageous` for a 4th grader) can be more overwhelming than a longer text with a few scattered Tier 3 words. A constant barrage of unfamiliar words can elevate complexity from `very` to `exceedingly`.
+2.  **Contextual Scaffolding:** Assess how the text supports new vocabulary.
+    *   Are new, complex terms explicitly defined or explained with simple examples (e.g., "volume... to see if it is big enough to hold a liter of food")?
+    *   Is the surrounding language simple and conversational, making the meaning of new words easier to infer?
+    *   Strong scaffolding can lower the complexity rating. A text with many Tier 3 words that are well-explained might only be `moderately complex`.
+3.  **Abstract vs. Concrete Vocabulary:** Differentiate between words for abstract concepts and words for concrete things. A text built on abstract Tier 2 words (e.g., `relationships`, `performance`, `non-physical`) can be more challenging than a text that introduces Tier 3 labels for concrete things or people (e.g., `Sumerians`, `polonium`).
+4.  **Conceptual Load:** Consider the cognitive load of the vocabulary. A list of many new, multi-syllabic, conceptually-heavy terms (e.g., `Paleolithic`, `Mesolithic`, `Neolithic` for a 3rd grader) can be `very complex` even if the terms are briefly defined, because the student must process multiple new concepts at once.
+5.  **Calibrating the Top Levels:** Be precise in your use of `very complex` vs. `exceedingly complex`.
+    *   **Very complex:** The vocabulary creates significant hurdles and slows the reader down, but the main ideas of the text are still accessible with effort.
+    *   **Exceedingly complex:** The vocabulary is so dense, technical, or abstract that it acts as a barrier, making it nearly impossible for the target student to grasp the bulk of the text's meaning without extensive outside help. Reserve this for texts saturated with advanced terminology.
+6.  **Consider Background Knowledge:** Pay close attention to the provided `student_background_knowledge`. Do not classify a word as complex if the student is likely to be familiar with it (e.g., 'oxygen' for a 3rd grader who has learned about the human body).
+
+**Final Analysis Format**
+
+Provide these information as your final analysis:
+1.  **Complex vocabulary:**
+    *   Tier 2 words: Words that are commonly used in academic settings and more complex than colloquial, or everyday language and often have multiple meanings.
+    *   Tier 3 words: Overly academic or domain-specific words.
+    *   Archaic words: Words, or uses of words that are not commonly used in modern conversational language. E.g., “The jury retired to deliberate on their verdict." The use of “retire” to mean withdrawing to a private place is an archaic use.
+    *   Other complex words: All other words that can increase complexity of the text (e.g., idioms, unfamiliar proper nouns that function as vocabulary).
+2.  **Vocabulary complexity:** one of: slightly complex, moderately complex, very complex, exceedingly complex
+3.  **Your reasoning of the complexity:** A detailed explanation of your rating, referencing the principles above.
+""",
+    'vocab_grades_3_4_user_prompt': """Below is the text you need to evaluate. Let's think step by step in order to predict the output of the vocabulary complexity task.
+
+- It is intended for grade {student_grade_level}.
+
+- You can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: {student_background_knowledge}
+
+- Text Flesch-Kincaid grade level: {fk_level}
+
+- Text to evaluate: [BEGIN TEXT]
+{text}
+[END TEXT]
+
+{format_instructions}
+""",
+    'vocab_other_grades_system_prompt': """You are an expert curriculum designer. Your job involves reading text snippets intended for students in K-12 and evaluating the complexity of the vocabulary in each text.
+
+You will be given a rubric (with options 1, 2, 3, 4) as well as guidelines for interpreting the rubric.
+
+IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meainng, etc.)
+IMPORTANT: Rely on the supplied rubric and annotation guidelines along. Do not introduce any new crtieria for evaluating the complexity of a text's vocabulary.
+
+Please first reason out loud about the vocabulary complexity of the text and then provide an answer between 1 and 4 (whole numbers only). Provide the answer as an integer (not a float).
+""",
+    'vocab_other_grades_user_prompt': """Your job is to rate the complexity of a text's vocabulary (relative to the intended level of the text) according to a rubric and annotation guide. Stick to the rubric and annotation guide exactly — do not introduce any additional criteria or lenses for judging the complexity of the text.
+
+[BEGIN ANNOTATION GUIDE AND RUBRIC]
+Instructions
+For the following task, please assume that:
+    - The student is on grade level and proficient in all core content areas, including reading fluency, comprehension, science, & social studies. (example).
+    - The student is moving through a common progression of topics (detailed here).
+    - The student is fluent in speaking English.
+    - The student has an "average" amount of background knowledge on topics not commonly covered in curriculum.
+    - The student will use this material for independent reading/work, without direct instruction.
+    - The text is reasonable for the given grade level.
+
+Please do not consider the presence of figurative language when scoring Vocabulary. For example: with a phrase like "kicked the bucket," consider only the qualities of the words themselves ("kicked", "the" and "bucket").
+
+Please do be sure to consider:
+- all of the different types of vocabulary (listed below)
+- the overall proportion of complex words in the text - including repeated complex words.
+- the resulting holistic complexity of the vocabulary (described in the Summary section below).
+
+Level 1:
+Rubric: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. That said, a very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0.
+
+Level 2:
+Rubric: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK, but if it's very low, the text is probably level 1.
+
+Level 3:
+Rubric: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic
+
+Level 4:
+Rubric: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading
+
+And here are some relevant definitions:
+    - Conversational: Everyday language.
+    - Familiar: Words that the student is likely to have seen/heard, from everyday life or their curriculum. Reminder: assume an "average" level of background knowledge.
+    - Unfamiliar: Words the student has probably not heard, or are being used in an unfamiliar way.
+        - For ex: 4th graders are familiar with the word "table" but may not be familiar with the use of the word with respect to data ("a table of data").
+        - Note:
+            - Words with in-line definitions (via appositives, or because they can be easily inferred from other parts of the text) should be evaluated as less unfamiliar.
+            - For ex: "The pharaoh, a powerful ruler of ancient Egypt, was buried in a grand tomb."
+                - The word "pharaoh" might be unfamiliar or subject-specific, but since is defined within the text, you can consider it a more familiar word.
+        - Unfamiliar proper nouns:
+            - A person's name, even if unfamiliar, generally does not add to complexity.
+            - Other unfamiliar proper nouns (eg locations, organizations) do add to complexity.
+
+- Subject-specific: Words that are specific to a subject or field of study that are essential for understanding concepts and engaging with the content.
+- Overly-academic: Words that are excessively formal, complex, or specialized.
+    - For ex: "The agrarian societal structure of the Neolithic Revolution precipitated a paradigm shift in agriculture"
+- Archaic: A word that was common in the past but is now rarely/almost never used. Could also be a word used in an archaic way.
+    - For ex: "After a long day of court proceedings, the jury 'retired' to deliberate on their verdict."
+        - The word "retire" meaning to stop working may be familiar to a student, but "retire" meaning "withdrawing to a private place" is an archaic use.
+
+
+Examples
+The student is on-grade-level:
+- Consider a 6th grade passage about earth systems. Per NGSS standards, students are introduced to earth systems starting in 2nd grade. They encounter words like: wind, water, river, lake, solids, and liquids. For our rating purposes, we would assume most students following 2nd have encountered these words. In 5th grade, they dive more fully into earth systems concepts, learning vocabulary words like geosphere, sediment, biosphere, atmosphere, ecosystems, organisms and climate. While rating, we would consider the words listed in the NGSS standards as more familiar following that grade level.  If the same passage were intended for 3rd graders, though, then the subject-specific vocabulary is likely to be unfamiliar.
+
+Figurative Language
+- Kicked the bucket.
+- The pen is mightier than the sword.
+- The classroom was a zoo.
+- He ran faster than the speed of light.
+[END ANNOTATION GUIDE AND RUBRIC]
+
+Here are a couple examples of texts that have already been scored along with justification for their scores, which you can use as exemplars:
+[BEGIN EXAMPLES]
+
+*** EXAMPLE 1 ***
+The following text was intended for grade level 11 and received a complexity level of 1.
+
+Here is the background knowledge assumption for that text: N/A
+
+Here is the text:
+// START TEXT //
+"In a recent lecture, "Is Nothing Sacred?", Salman Rushdie, one of the most censored authors of our time, talked about the importance of books. He grew up in a household in India where books were as sacred as bread. If anyone in the household dropped a piece of bread or a book, the person not only picked it up, but also kissed the object by way of apologizing for clumsy disrespect.
+
+He goes on to say that he had kissed many books before he had kissed a girl. Bread and books were for his household, and for many like his, food for the body and the soul. This image of the kissing of the book one had accidentally dropped made an impression on me. It speaks to the love and respect many people have for them.
+
+I grew up in a small town in New Mexico, and we had very few books in our household. The first one I remember reading was my catechism book. Before I went to school to learn English, my mother taught me catechism in Spanish.
+
+I remember the questions and answers I had to learn, and I remember the well-thumbed, frayed volume which was sacred to me.
+
+Growing up with few books in the house created in me a desire and a need for them. When I started school, I remember visiting the one room library of our town and standing in front of the dusty shelves. In reality there were only a few shelves and not over a thousand books, but I wanted to read them all. There was food for my soul in the books, that much I realized."
+// END TEXT //
+
+Here is the reasoning for that complexity level:
+// START REASONING //
+This text is a 1 for vocabulary, because the vocabulary that is used is familiar and accessible for a proficient 11th grader. Most of the words used in the text are very common everyday vocabulary for describing growing up, family life, and the importance of reading. A few examples of these very common words are: small town, book, school, learn, food, kissed, image, respect, love, speaks. There are many more in the text. In this text there are only a few "juicier" or more complex words, you can think of those as words that are less familiar, have a more abstract or nuanced meaning, or carry a very large concept. Less commonly spoken words that were used in the text were: frayed, volume, censored, clumsy, sacred. These are still well within reach of a proficient 11th grader, and would still be considered familiar, because they will have encountered them in past reading or academic studies. In the text there are a couple of words that are outliers, but they are not essential to the understanding of the larger text. One of these words or hyphenated compound phrase is well-frayed. A compound phrase is a phrase consisting of multiple words that work together to create a specific meaning or idea, often acting as a single unit in a sentence. If the meaning of individual words is familiar, it is typically quite easy for proficient readers to generalize the larger meaning that the author is implying with their word choice. In this case, proficient students will be accustomed to the phrase well, with the secondary meaning of very, rather than a description of positivity or health; and they will be accustomed to the use frayed, as in worn, aged, or damaged from use. Making the leap to identify the meaning of "well-frayed" as a book that is very used, will take only moments for a proficient 11th grader. 
Another word that stands out in the text is the word catechism, which might be new for many students based on their personal background or location, but a full understanding of what a catechism book contains is not essential for understanding the paragraph or whole text. The reader can make it through using minimum context clues to know that the catechism must be something important to his family. The type of book he learned to read before going to school is not critical for comprehension, it's enough to understand that reading was so important in his family, his mother started instruction before he even started school. Additionally, it's important to know that having one unknown word for an 11th grade reading, does not merit a rating higher than one.
+
+It is worth noting that another reason this text is a 1, is that the content or topic of the passage is so familiar and covered extensively in K-12 education, i.e. reading is important, loving books, growing up; that coupled with the simple vocabulary choices, getting to the meaning of the overall text, and even the paragraphs, would be incredibly easy for a proficient 11th grader.
+// END REASONING //
+*** EXAMPLE 2 ***
+The following text was intended for grade level 5 and received a complexity level of 2.
+
+Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with the concept of natural disasters, including hurricanes, and basic atmospheric concepts like high and low pressure from their studies on weather and climate. They may not be familiar with the specific formation processes of hurricanes or the global terminology differences (hurricane, typhoon, cyclone).
+
+Here is the text:
+// START TEXT //
+Great whirling storms roar out of the oceans in many parts of the world. They are called by several names—hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
+
+Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.
+// END TEXT //
+
+Here is the reasoning for that complexity level:
+// START REASONING //
+I scored this a 2 because of the density of subject-specific vocabulary related to weather and climate, which is often covered in lower grade levels. This adds to the complexity above a 1, but it is not a level 3 because of the familiarity with the topic, which implies some familiarity with the vocabulary as well. The specific formation process and the vocabulary used to explain the processes are also subject-specfiic but not famliar, which would make the second paragraph a level 3 in the rubric language, but when considering the language used in the overall SUMMARY below the rubric, this new content and vocabulary would cause quick pauses and/or occasional prolonged pauses but would not cause the reader to slow down to due to challenging overall comprehension of the key ideas and supporting claims. This is especially the case because the second paragraph builds upon prior knowledge and familiar vocabulary use, so it is not entirely new information and vocabulary. While there is subject-specific vocabulary used, overly academic vocabulary is NOT used and is more conversational in nature, such as "great whiring storms" and "born" / "giving birth" to storm  (although this is the way storms are described!) rather than more technical terms which made comprehension easier due to the accessibility of the vocabulary (even if used in other contexts before reading this text). Words such as "a lot" and "bigger" are more conversational, and while technical, unfamiliar words are provided, such as "hurricane," "typhoon," and "cyclone," knowing and understanding their differences is not necessary to grasp the main idea. The processes by which they are formed are what need to be retained while reading the entire text, and familiarity with the bulk of the vocabulary used would allow for that to happen without too much struggle to make meaning of it. 
Additionally, the text does not contain any archaic vocabulary or ambiguous words, which prevents it from reaching a rating of 4, although it is not necessary that they text have such vocabulary to meet a level 4, the frequent inclusion of such vocabulary makes it more likely to land at least a 3 or 4.
+// END REASONING //
+
+*** EXAMPLE 3 ***
+The following text was intended for grade level 6 and received a complexity level of 3.
+
+Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with basic Earth science concepts such as rocks, minerals, and fossils, as well as natural processes like volcanic eruptions and earthquakes. They may not be familiar with more advanced topics like plate tectonics or the specific branches of geology such as mineralogy, petrology, and seismology.
+
+Here is the text:
+// START TEXT //
+Geology is the scientific study of Earth. Geologists study the planet—its formation, its internal structure, its materials, its chemical and physical processes, and its history. Mountains, valleys, plains, sea floors, minerals, rocks, fossils, and the processes that create and destroy each of these are all the domain of the geologist. Geology is divided into two broad categories of study: physical geology and historical geology.
+
+Physical geology is concerned with the processes occurring on or below the surface of Earth and the materials on which they operate. These processes include volcanic eruptions, landslides, earthquakes, and floods. Materials include rocks, air, seawater, soils, and sediment. Physical geology further divides into more specific branches, each of which deals with its own part of Earth's materials, landforms, and processes. Mineralogy and petrology investigate the composition and origin of minerals and rocks. Volcanologists study lava, rocks, and gases on live, dormant, and extinct volcanoes. Seismologists use instruments to monitor and predict earthquakes and volcanic eruptions.
+
+Historical geology is concerned with the chronology of events, both physical and biological, that have taken place in Earth's history. Paleontologists study fossils (remains of ancient life) for evidence of the evolution of life on Earth. Fossils not only relate evolution, but also speak of the environment in which the organism lived. Corals in rocks at the top of the Grand Canyon in Arizona, for example, show a shallow sea flooded the area around 290 million years ago. In addition, by determining the ages and types of rocks around the world, geologists piece together continental and oceanic history over the past few billion years. Plate tectonics (the study of the movement of the sections of Earth's crust) adds to Earth's story with details of the changing configuration of the continents and oceans.
+// END TEXT //
+
+Here is the reasoning for that complexity level:
+// START REASONING //
+To determine the complexity rating of this text based on the vocabulary present, I used the annotation guide, scoring rubric, and examples to set the expectations for rating. During the first read of the text, I "bolded" and categorized the more challenging vocabulary words according to the following complexity groupings: archaic, unfamiliar, archaic, subject-specific, and/or overly academic. On the second read, I considered the main idea or "gist" that students need to acquire understanding of. I then referenced the previously mentioned tools–annotation guide, scoring rubric, and examples to remind myself of the expectations for rating.  I agreed that readers would have familiarity with basic concepts of geology; however, I also considered the definitions provided for words such as Geology, Geologists, Physical Geology, Historical Geology, Mineralogy, and Petrology. I considered how students might pause for clarification and for how long. After reviewing the Annotation Guide while considering, I narrowed the rating down because the definitions provided throughout the text of more complex words should make the meaning of the text more accessible for readers, which is why although the words are subject-specific, I rated this text as a 3 instead of a 2-less complex or a 4–more complex. I read the text one final time to ensure clarity around my rating, scored and wrote the justification.
+// END REASONING //
+[END EXAMPLES]
+
+Below is the text you need to evaluate. It is intended for grade {student_grade_level}.
+
+As you read the text, you can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: {student_background_knowledge}
+
+[BEGIN TEXT]
+{text}
+[END TEXT]
+
+In your response, when specifying the level of complexity, be sure to use only a single integer (e.g. 2) and don't include any other text (e.g. don't say "level 2").
+
+{format_instructions}
+""",
+}
+
+# ── Evaluation settings ───────────────────────────────────────────────────────
+
+_EVALUATION_SETTINGS = VocabularyEvaluationSettings(  # one PromptSettings (provider, model, temperature) per step
+    prompt_settings_step_background_knowledge=PromptSettings(  # background-knowledge step
+        provider_type=LlmProvider.OPENAI,
+        model='gpt-4o-2024-11-20',
+        temperature=0.0,
+    ),
+    prompt_settings_step_vocab_grades_3_4=PromptSettings(  # grades 3-4 vocabulary step (only step on GOOGLE)
+        provider_type=LlmProvider.GOOGLE,
+        model='gemini-2.5-pro',
+        temperature=0.0,
+    ),
+    prompt_settings_step_vocab_other_grades=PromptSettings(provider_type=LlmProvider.OPENAI, model='gpt-4.1', temperature=0.0),
+)  # every step above pins temperature to 0.0
+
+# ── Public config object (imported by evaluator modules) ──────────────────────
+
+CONFIG: EvaluatorSettingsResult[VocabularyEvaluationSettings] = EvaluatorSettingsResult(
+    evaluator_metadata=_EVALUATOR_METADATA,  # module-level metadata object (defined or imported above)
+    evaluation_settings=_EVALUATION_SETTINGS,  # per-step provider/model/temperature
+    prompts=_PROMPTS,  # prompt templates keyed by name; they contain {placeholder} fields such as {text}
+)
diff --git a/sdks/python/src/learning_commons_evaluators/settings/vocabulary/__init__.py b/sdks/python/src/learning_commons_evaluators/settings/vocabulary/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/sdks/python/src/learning_commons_evaluators/settings/vocabulary/contracts.toml b/sdks/python/src/learning_commons_evaluators/settings/vocabulary/contracts.toml
new file mode 100644
index 0000000..d1c55a8
--- /dev/null
+++ b/sdks/python/src/learning_commons_evaluators/settings/vocabulary/contracts.toml
@@ -0,0 +1,648 @@
+# Contract tests for the Vocabulary evaluator.
+#
+# This file captures the exact LLM interactions for known inputs so that tests
+# can verify the SDK sends the same requests as the notebook and produces the
+# same results from the same responses.
+#
+# HOW TO REGENERATE
+# -----------------
+# Run the notebook "evals/vocabulary_evaluator.ipynb" with valid API keys.
+# The final cells output a TOML block; paste it in here to replace the
+# recorded case data below.
+#
+# WHAT BELONGS HERE
+# -----------------
+# Only data that originates from external sources:
+#   - input:            what the caller provides to the evaluator
+#   - prompt_steps:     what the SDK sends to the LLM (fully formatted messages,
+#                       model, temperature) and the raw response the LLM returned.
+#                       Steps must be listed in call order:
+#                         1. background_knowledge  (plain text response)
+#                         2. vocab_complexity       (JSON response)
+#   - expected_result:  the structured output parsed from the vocab_complexity
+#                       llm_response (notebook format), before SDK result mapping.
+#
+# Deterministic values computed by library code (FK score, format_instructions)
+# are NOT stored as standalone fields — they appear inside the fully formatted
+# prompts, where they are implicitly verified by the prompt-request assertion.
+#
+# TWO CASES ARE INCLUDED
+# -----------------------
+# marco_polo_grade3  — grade 3, uses the grades 3–4 path (Gemini + rubric labels
+#                      + word-level breakdown in expected_result).
+# hurricanes_grade7  — grade 7, uses the grades 5–12 path (GPT + integer score
+#                      in expected_result).
+
+# ===========================================================================
+# Case 1: marco_polo_grade3  (grades 3–4 path)
+# ===========================================================================
+
+[cases.marco_polo_grade3]
+description = "Marco Polo passage, grade 3 (grades 3-4 Gemini path)"
+
+[cases.marco_polo_grade3.input]
+text = '''
+
+Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.
+Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.
+His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.
+'''
+grade_level = 3
+
+[cases.marco_polo_grade3.prompt_steps.background_knowledge]
+model = "gpt-4o-2024-11-20"
+temperature = 0.0
+system_prompt = ""
+user_prompt = '''
+
+Review the following text, which is an educational text written for students in the following grade band: 3.
+
+Your job is to give me a background knowledge assumption; that is: what topics, if any, from the text students are likely to be familiar with based on a standard progression of topics in US public school education, as well as topics, if any the student is not likely to be familiar with.
+
+Make sure your response is concise (between 1 - 3 lines max) and is about the topics themselves, not about any other aspect of the text (e.g. flowery language, complicated sentence structure, etc.).
+
+Here's an example:
+[START EXAMPLE]
+Grade Band: 11th
+Text: I went to the woods because I wished to live deliberately, to front only the essential facts of life, and see if I could not
+learn what it had to teach, and not, when I came to die, discover that I had not lived. I did not wish to live what was
+not life, living is so dear; nor did I wish to practise resignation, unless it was quite necessary. I wanted to live deep and suck out all the marrow of life, to live so sturdily and Spartan-like as to put to rout all that was not life, to cut a broad swath and shave close, to drive life into a corner, and reduce it to its lowest terms, and, if it proved to be mean, why then to get the whole and genuine meanness of it, and publish its meanness to the world; or if it were sublime, to
+know it by experience, and be able to give a true account of it in my next excursion. For most men, it appears to me,
+are in a strange uncertainty about it, whether it is of the devil or of God, and have somewhat hastily concluded that it
+is the chief end of man here to “glorify God and enjoy him forever.”
+
+Background Knowledge Assumption: Assume they’ve studied American Transcendentalists like Thoreau and Emerson, including the mid-19th-century context of nature-focused philosophy.
+[END EXAMPLE]
+
+You should assume that the student is an average US public school who is learning from common core curriculum. When you respond, just respond with the background knowledge assumption and nothing else.
+
+You can use the following list of topics that we know are covered for each grade level, although use your best judgement if you know there are other topics out there that students are likely to have covered. And this doesn't cover higher grade levels, so you'll have to again use your judgement for, say, what background knowledge a 9th grader is likely to have:
+[BEGIN TOPICS]
+[
+    K: [
+        "Toys and Play", "Weather Wonders", "Trees are Alive", "Enjoying and Appreciating Trees",
+        "The Five Senses: How do our senses help us learn?", "Once Upon a Farm: What makes a good story?",
+        "America, Then and Now: How has life in America changed over time?", "The Continents: What makes the world fascinating?",
+        "Needs of Plants and Animals", "Pushes and Pulls", "Sunlight and Weather", "Learning and Working Together",
+        "How Do People Learn and Work Together?", "Where Do We Live?", "What Does it Mean to Be an American?",
+        "How Has Our World Changed?", "Why Do People Have Jobs?"
+    ],
+    1: [
+        "Tools and Work", "A Study of the Sun, Moon, and Stars", "Birds' Amazing Bodies", "Caring for Birds",
+        "A World of Books: How do books change lives around the world?", "Creature Features: What can we discover about animals’ unique features?",
+        "Powerful Forces: How do people respond to the powerful force of the wind?", "Cinderella Stories: Why do people around the world admire Cinderella?",
+        "Animal and Plant Defenses", "Light and Sounds", "Spinning Earth", "Our Place in the World",
+        "What Are the Rights and Responsibilities of Citizens?", "How Can We Describe Where We Live?",
+        "How Do We Celebrate Our Country?", "How Does the Past Shape Our Lives?", "Why Do People Work?"
+    ],
+    2: [
+        "Schools and Community", "Fossils Tell of Earth's Changes", "The Secret World of Pollination", "Providing for Pollinators",
+        "A Season of Change: How does change impact people and nature?", "The American West: What was life like in the West for early Americans?",
+        "Civil Rights Heroes: How can people respond to injustice?", "Good Eating: How does food nourish us?",
+        "Plant and Animal Relationships", "Properties of Matter", "Changing Landforms", "Exploring Who We Are",
+        "Why Is It Important to Learn About the Past?", "How Does Geography Help Us Understand Our World?",
+        "How Do We Get What We Want and Need?", "Why Do We Need Government?", "How Can People Make a Difference in Our World?"
+    ],
+    "3": [
+        "Overcoming Learning Challenges Near and Far", "Adaptations and the Wide World of Frogs", "Exploring Literary Classics",
+        "Water Around the World", "Ocean/Sea Exploration", "Outer Space", "Immigration", "Art/Being an Artist",
+        "Balancing Forces", "Inheritance and Traits", "Environments and Survival", "Weather and Climate",
+        "Communities", "Why Does It Matter Where We Live?", "What Is Our Relationship With Our Environment?",
+        "What Makes a Community Unique?", "How Does the Past Impact the Present?", "Why Do Governments and Citizens Need Each Other?",
+        "How Do People in a Community Meet Their Wants and Needs?"
+    ],
+    4: [
+        "Poetry", "Animal Defense Mechanisms", "The American Revolution",
+        "Responding to Inequality: Ratifying the 19th Amendment (covers gender and racial inequality)",
+        "A Great Heart: What does it mean to have a great heart, literally and figuratively?",
+        "Extreme Settings: How does a challenging setting or physical environment change a person?",
+        "American Revolution/Multiple Perspectives", "Myths/Myth Making", "Energy Conversions", "Vision and Light",
+        "Earth's Features", "Waves, Energy, and Information", "Regions of the United States",
+        "How Does America Use Its Strengths and Face Its Challenges?", "Why Have People Moved to and From the Northeast?",
+        "How Has the Southeast Changed Over Time?", "How Does the Midwest Reflect the Spirit of America?",
+        "How Does the Southwest Reflect Its Diverse Past and Unique Environment?", "What Draws People to the West?"
+    ],
+    5: [
+        "Human Rights", "Biodiversity in the Rainforest", "Athlete Leaders of Social Change",
+        "Impact of Natural Disasters", "Cultures in Conflict: How do cultural beliefs and values guide people?",
+        "Word Play: How and why do writers play with words?", "A War Between Us: How did the Civil War impact people?",
+        "Breaking Barriers: How can sports influence individuals and societies?", "Patterns of Earth and Sky",
+        "Modeling Matter", "The Earth System", "Ecosystem Restoration", "U.S. History: Making a New Nation",
+        "How Were the Lives of Native Peoples Influenced by Where They Lived?",
+        "What Happened When Diverse Cultures Crossed Paths?", "What Is the Impact of People Settling in a New Place?",
+        "Why Would a Nation Want to Become Independent?", "What Does the Revolutionary Era Tell Us About Our Nation Today?",
+        "How Does the Constitution Help Us Understand What It Means to Be an American?",
+        "What Do the Early Years of the United States Reveal About the Character of the Nation?",
+        "What Was the Effect of the Civil War on U.S. Society?"
+    ],
+    6: [
+        "Greek Mythology", "Critical Problems and Design Solutions", "American Indian Boarding Schools",
+        "Remarkable Accomplishments in Space Science", "Resilience in the Great Depression: How can enduring tremendous hardship contribute to personal transformation?",
+        "A Hero’s Journey: What is the significance and power of the hero’s journey?",
+        "Narrating the Unknown: How did the social and environmental factors in the unknown world of Jamestown shape its development and decline?",
+        "Courage in Crisis: How can the challenges of a hostile environment inspire heroism?",
+        "Microbiome", "Metabolism", "Metabolism Engineering", "Traits and Reproduction", "Thermal Energy",
+        "Ocean, Atmosphere, and Climate", "Weather Patterns", "Earth's Changing Climate",
+        "Earth's Changing Climate: Engineering Internship", "The First Americans (up to 1492)",
+        "Exploration and Colonization", "English Colonies", "American Revolution", "First Governments and the Constitution",
+        "The Early American Republic", "Political and Geographic Changes (1828-1850)", "Life in the North and South (1820-1860)",
+        "Division and Civil War (1821-1865)", "Reconstruction (1865-1896)", "The West (1858-1896)",
+        "New Industry and a Changing Society", "Expansion and War", "The 1920s and 1930s", "World War II",
+        "The Cold War", "Civil Rights and American Society", "America Since the 1970s"
+    ],
+    7: [
+        "The Lost Children of Sudan (Genocide, Genocide in Sudan)", "Epidemics", "Harlem Renaissance", "Plastic Pollution",
+        "Identity in the Middle Ages: How does society both support and limit the development of identity?",
+        "Americans All: How did World War II affect individuals?", "Language and Power: What is the power of language?",
+        "Fever: How can times of crisis affect citizens and society?", "Geology on Mars", "Plane Motion", "Plane Motion Engineering",
+        "Rock Formations", "Phase Change", "Phase Change Engineering", "Chemical Reactions", "Populations and Resources",
+        "Matter and Energy in Ecosystems", "Early Humans and Agricultural Revolution", "Fertile Crescent",
+        "Ancient Egypt and Kush", "The Israelites", "Ancient Greece", "Ancient South Asia", "Early China, Korea, and Japan",
+        "Ancient Rome", "Rise of Christian Kingdoms", "The Americas", "Medieval Europe", "The Rise of Islamic Empires",
+        "China in the Middle Ages", "Korea and Japan in the Middle Ages", "African Civilizations", "New Ways of Thinking",
+        "Age of Exploration and Trade", "Revolutions and Empires", "The Modern World"
+    ],
+    8: [
+        "Folklore of Latin America", "Food Choices", "The Holocaust", "Japanese American Internment",
+        "The Poetics and Power of Storytelling: What is the power of storytelling?",
+        "The Great War: How do literature and art illuminate the effects of World War I?", "What Is Love?",
+        "Teens as Change Agents: How do people effect social change?", "Harnessing Human Energy",
+        "Force and Motion", "Force and Motion Engineering", "Magnetic Fields", "Light Waves", "Earth, Moon, and Sun",
+        "Natural Selection", "Natural Selection Engineering", "Evolutionary History", "The World in Spatial Terms",
+        "Places and Regions", "Physical Geography", "Population Geography", "Economic Geography",
+        "Political Geography", "Human-Environment Geography", "What is Economics?", "Markets, Money, and Businesses",
+        "Government and the Economy", "The Global Economy"
+    ]
+]
+[END TOPICS]
+
+Here is the text:
+[BEGIN TEXT]
+
+Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.
+Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.
+His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.
+
+[END TEXT]
+'''
+llm_response = '''
+Background Knowledge Assumption: Students are likely familiar with basic geography (e.g., continents, oceans, and major regions like Asia and Europe) and the concept of trade. They are unlikely to be familiar with Marco Polo, the Mongol Dynasty, Kublai Khan, or the historical origins of pasta.'''
+
+[cases.marco_polo_grade3.prompt_steps.vocab_complexity]
+model = "gemini-2.5-pro"
+temperature = 0.0
+system_prompt = '''
+
+You are an expert curriculum designer. Your job is to rate the complexity of a text's vocabulary relative to the grade level.
+
+You will be given a rubric (with levels from least to most complex: slightly complex, moderately complex, very complex, exceedingly complex) as well as guidelines for interpreting the rubric.
+IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meaning, etc.)
+
+**Resource 1: Qualitative Text Complexity rubric (SAP)**
+1.  **Level 1: Slightly complex**
+    *   Original Definition: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. A very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0.
+    *   Summary definition: Overall, vocabulary is easy to understand and does not impede comprehension of the bulk of the text (including main idea and supporting claims). 1-2 quick pauses for processing by the student are ok here!
+2.  **Level 2: Moderately complex**
+    *   Original Definition: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK
+    *   Summary definition: Overall, vocabulary generally allows students to comprehend the bulk of the text with little difficulty, though there may be occasional pauses for clarification. Several quick pauses or occasional prolonged pauses may occur.
+3.  **Level 3: Very complex**
+    *   Original Definition: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic
+    *   Summary definition: Overall, vocabulary often presents challenges that may slow down comprehension but does not completely block the comprehension of the bulk of the text.
+4.  **Level 4: Exceedingly complex**
+    *   Original Definition: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading.
+    *   Summary definition: Overall, vocabulary is so complex that it makes comprehension of the bulk of the text very challenging and requires careful effort to interpret.
+
+**Resource 2: Flesch-Kincaid Grade Level**
+Use the Flesch-Kincaid (FK) Grade Level as light guidance of the approximate grade level based on readability. The metric alone does not provide final information of vocabulary complexity, but a ballpark of the difficulty of the entire text.
+*   grade 2-3: 1.98-5.34
+*   grade 4-5: 4.51-7.73
+*   grade 6-8: 6.51-10.34
+*   grade 9-10: 8.32-12.12
+*   grade 11-College: 10.34-14.2
+
+**Guidelines for Interpretation and Reasoning**
+
+Your reasoning is the most critical part of your analysis. It's not enough to simply count complex words. You must analyze their impact on a student at the specified grade level. Use the following principles to guide your judgment:
+
+1.  **Density and Cumulative Effect:** Do not just count complex words; evaluate their concentration. A short text with a high density of challenging Tier 2 words (e.g., `peculiar`, `mischievous`, `courageous` for a 4th grader) can be more overwhelming than a longer text with a few scattered Tier 3 words. A constant barrage of unfamiliar words can elevate complexity from `very` to `exceedingly`.
+2.  **Contextual Scaffolding:** Assess how the text supports new vocabulary.
+    *   Are new, complex terms explicitly defined or explained with simple examples (e.g., "volume... to see if it is big enough to hold a liter of food")?
+    *   Is the surrounding language simple and conversational, making the meaning of new words easier to infer?
+    *   Strong scaffolding can lower the complexity rating. A text with many Tier 3 words that are well-explained might only be `moderately complex`.
+3.  **Abstract vs. Concrete Vocabulary:** Differentiate between words for abstract concepts and words for concrete things. A text built on abstract Tier 2 words (e.g., `relationships`, `performance`, `non-physical`) can be more challenging than a text that introduces Tier 3 labels for concrete things or people (e.g., `Sumerians`, `polonium`).
+4.  **Conceptual Load:** Consider the cognitive load of the vocabulary. A list of many new, multi-syllabic, conceptually-heavy terms (e.g., `Paleolithic`, `Mesolithic`, `Neolithic` for a 3rd grader) can be `very complex` even if the terms are briefly defined, because the student must process multiple new concepts at once.
+5.  **Calibrating the Top Levels:** Be precise in your use of `very complex` vs. `exceedingly complex`.
+    *   **Very complex:** The vocabulary creates significant hurdles and slows the reader down, but the main ideas of the text are still accessible with effort.
+    *   **Exceedingly complex:** The vocabulary is so dense, technical, or abstract that it acts as a barrier, making it nearly impossible for the target student to grasp the bulk of the text's meaning without extensive outside help. Reserve this for texts saturated with advanced terminology.
+6.  **Consider Background Knowledge:** Pay close attention to the provided `student_background_knowledge`. Do not classify a word as complex if the student is likely to be familiar with it (e.g., 'oxygen' for a 3rd grader who has learned about the human body).
+
+**Final Analysis Format**
+
+Provide these information as your final analysis:
+1.  **Complex vocabulary:**
+    *   Tier 2 words: Words that are commonly used in academic settings and more complex than colloquial, or everyday language and often have multiple meanings.
+    *   Tier 3 words: Overly academic or domain-specific words.
+    *   Archaic words: Words, or uses of words that are not commonly used in modern conversational language. E.g., “The jury retired to deliberate on their verdict." The use of “retire” to mean withdrawing to a private place is an archaic use.
+    *   Other complex words: All other words that can increase complexity of the text (e.g., idioms, unfamiliar proper nouns that function as vocabulary).
+2.  **Vocabulary complexity:** one of: slightly complex, moderately complex, very complex, exceedingly complex
+3.  **Your reasoning of the complexity:** A detailed explanation of your rating, referencing the principles above.
+'''
+user_prompt = '''
+
+Below is the text you need to evaluate. Let's think step by step in order to predict the output of the vocabulary complexity task.
+
+- It is intended for grade 3.
+
+- You can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: Background Knowledge Assumption: Students are likely familiar with basic geography (e.g., continents, oceans, and major regions like Asia and Europe) and the concept of trade. They are unlikely to be familiar with Marco Polo, the Mongol Dynasty, Kublai Khan, or the historical origins of pasta.
+
+- Text Flesch-Kincaid grade level: 6.59
+
+- Text to evaluate: [BEGIN TEXT]
+
+Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.
+Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.
+His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.
+
+[END TEXT]
+
+STRICT OUTPUT FORMAT:
+- Return only the JSON value that conforms to the schema. Do not include any additional text, explanations, headings, or separators.
+- Do not wrap the JSON in Markdown or code fences (no ``` or ```json).
+- Do not prepend or append any text (e.g., do not write "Here is the JSON:").
+- The response must be a single top-level JSON value exactly as required by the schema (object/array/etc.), with no trailing commas or comments.
+
+The output should be formatted as a JSON instance that conforms to the JSON schema below.
+
+As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]} the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.
+
+Here is the output schema (shown in a code block for readability only — do not include any backticks or Markdown in your output):
+```
+{"properties": {"tier_2_words": {"description": "List of Tier 2 words", "title": "Tier 2 Words", "type": "string"}, "tier_3_words": {"description": "List of Tier 3 words", "title": "Tier 3 Words", "type": "string"}, "archaic_words": {"description": "List of Archaic words", "title": "Archaic Words", "type": "string"}, "other_complex_words": {"description": "List of Other Complex words", "title": "Other Complex Words", "type": "string"}, "complexity_score": {"description": "the complexity of the text, one of: slightly complex, moderately complex, very complex, or exceedingly complex", "title": "Complexity Score", "type": "string"}, "reasoning": {"description": "your reasoning for your answer", "title": "Reasoning", "type": "string"}}, "required": ["tier_2_words", "tier_3_words", "archaic_words", "other_complex_words", "complexity_score", "reasoning"]}
+```
+'''
+llm_response = '''
+```json
+{
+  "tier_2_words": "served, nevertheless, survived, scholars, described, treasures, wealthy",
+  "tier_3_words": "Mongol Dynasty, Kublai Khan, ivory, jade, porcelain, silk",
+  "archaic_words": "",
+  "other_complex_words": "Venice, Ayas, Tabriz, Kerman, Beijing, Far East, near East",
+  "complexity_score": "very complex",
+  "reasoning": "The vocabulary is rated as 'very complex' primarily due to the high density and conceptual load of unfamiliar proper nouns and domain-specific terms, presented with no contextual scaffolding. For a 3rd grader, the text introduces a large number of new names in rapid succession (Mongol Dynasty, Venice, Ayas, Tabriz, Kerman, Beijing, Kublai Khan) which function as challenging vocabulary. Additionally, the text lists specific, unfamiliar goods (ivory, jade, porcelain, silk) without any description. This barrage of new, undefined terms creates a significant cognitive load that would slow comprehension considerably, even if the student can grasp the basic gist of the story. While the main idea might be accessible, understanding the details of the journey and its significance is made very challenging by the vocabulary, aligning with the 'very complex' rating."
+}
+```'''
+
+[cases.marco_polo_grade3.expected_result]
+tier_2_words = '''
+served, nevertheless, survived, scholars, described, treasures, wealthy'''
+tier_3_words = '''
+Mongol Dynasty, Kublai Khan, ivory, jade, porcelain, silk'''
+archaic_words = ""
+other_complex_words = '''
+Venice, Ayas, Tabriz, Kerman, Beijing, Far East, near East'''
+complexity_score = '''
+very complex'''
+reasoning = '''
+The vocabulary is rated as 'very complex' primarily due to the high density and conceptual load of unfamiliar proper nouns and domain-specific terms, presented with no contextual scaffolding. For a 3rd grader, the text introduces a large number of new names in rapid succession (Mongol Dynasty, Venice, Ayas, Tabriz, Kerman, Beijing, Kublai Khan) which function as challenging vocabulary. Additionally, the text lists specific, unfamiliar goods (ivory, jade, porcelain, silk) without any description. This barrage of new, undefined terms creates a significant cognitive load that would slow comprehension considerably, even if the student can grasp the basic gist of the story. While the main idea might be accessible, understanding the details of the journey and its significance is made very challenging by the vocabulary, aligning with the 'very complex' rating.'''
+
+# ===========================================================================
+# Case 2: hurricanes_grade7  (grades 5–12 path)
+# ===========================================================================
+
+[cases.hurricanes_grade7]
+description = "Hurricane formation passage, grade 7 (grades 5-12 GPT path)"
+
+[cases.hurricanes_grade7.input]
+text = '''
+
+Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
+
+Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.
+'''
+grade_level = 7
+
+[cases.hurricanes_grade7.prompt_steps.background_knowledge]
+model = "gpt-4o-2024-11-20"
+temperature = 0.0
+system_prompt = ""
+user_prompt = '''
+
+Review the following text, which is an educational text written for students in the following grade band: 7.
+
+Your job is to give me a background knowledge assumption; that is: what topics, if any, from the text students are likely to be familiar with based on a standard progression of topics in US public school education, as well as topics, if any the student is not likely to be familiar with.
+
+Make sure your response is concise (between 1 - 3 lines max) and is about the topics themselves, not about any other aspect of the text (e.g. flowery language, complicated sentence structure, etc.).
+
+Here's an example:
+[START EXAMPLE]
+Grade Band: 11th
+Text: I went to the woods because I wished to live deliberately, to front only the essential facts of life, and see if I could not
+learn what it had to teach, and not, when I came to die, discover that I had not lived. I did not wish to live what was
+not life, living is so dear; nor did I wish to practise resignation, unless it was quite necessary. I wanted to live deep and suck out all the marrow of life, to live so sturdily and Spartan-like as to put to rout all that was not life, to cut a broad swath and shave close, to drive life into a corner, and reduce it to its lowest terms, and, if it proved to be mean, why then to get the whole and genuine meanness of it, and publish its meanness to the world; or if it were sublime, to
+know it by experience, and be able to give a true account of it in my next excursion. For most men, it appears to me,
+are in a strange uncertainty about it, whether it is of the devil or of God, and have somewhat hastily concluded that it
+is the chief end of man here to “glorify God and enjoy him forever.”
+
+Background Knowledge Assumption: Assume they’ve studied American Transcendentalists like Thoreau and Emerson, including the mid-19th-century context of nature-focused philosophy.
+[END EXAMPLE]
+
+You should assume that the student is an average US public school who is learning from common core curriculum. When you respond, just respond with the background knowledge assumption and nothing else.
+
+You can use the following list of topics that we know are covered for each grade level, although use your best judgement if you know there are other topics out there that students are likely to have covered. And this doesn't cover higher grade levels, so you'll have to again use your judgement for, say, what background knowledge a 9th grader is likely to have:
+[BEGIN TOPICS]
+[
+    K: [
+        "Toys and Play", "Weather Wonders", "Trees are Alive", "Enjoying and Appreciating Trees",
+        "The Five Senses: How do our senses help us learn?", "Once Upon a Farm: What makes a good story?",
+        "America, Then and Now: How has life in America changed over time?", "The Continents: What makes the world fascinating?",
+        "Needs of Plants and Animals", "Pushes and Pulls", "Sunlight and Weather", "Learning and Working Together",
+        "How Do People Learn and Work Together?", "Where Do We Live?", "What Does it Mean to Be an American?",
+        "How Has Our World Changed?", "Why Do People Have Jobs?"
+    ],
+    1: [
+        "Tools and Work", "A Study of the Sun, Moon, and Stars", "Birds' Amazing Bodies", "Caring for Birds",
+        "A World of Books: How do books change lives around the world?", "Creature Features: What can we discover about animals’ unique features?",
+        "Powerful Forces: How do people respond to the powerful force of the wind?", "Cinderella Stories: Why do people around the world admire Cinderella?",
+        "Animal and Plant Defenses", "Light and Sounds", "Spinning Earth", "Our Place in the World",
+        "What Are the Rights and Responsibilities of Citizens?", "How Can We Describe Where We Live?",
+        "How Do We Celebrate Our Country?", "How Does the Past Shape Our Lives?", "Why Do People Work?"
+    ],
+    2: [
+        "Schools and Community", "Fossils Tell of Earth's Changes", "The Secret World of Pollination", "Providing for Pollinators",
+        "A Season of Change: How does change impact people and nature?", "The American West: What was life like in the West for early Americans?",
+        "Civil Rights Heroes: How can people respond to injustice?", "Good Eating: How does food nourish us?",
+        "Plant and Animal Relationships", "Properties of Matter", "Changing Landforms", "Exploring Who We Are",
+        "Why Is It Important to Learn About the Past?", "How Does Geography Help Us Understand Our World?",
+        "How Do We Get What We Want and Need?", "Why Do We Need Government?", "How Can People Make a Difference in Our World?"
+    ],
+    "3": [
+        "Overcoming Learning Challenges Near and Far", "Adaptations and the Wide World of Frogs", "Exploring Literary Classics",
+        "Water Around the World", "Ocean/Sea Exploration", "Outer Space", "Immigration", "Art/Being an Artist",
+        "Balancing Forces", "Inheritance and Traits", "Environments and Survival", "Weather and Climate",
+        "Communities", "Why Does It Matter Where We Live?", "What Is Our Relationship With Our Environment?",
+        "What Makes a Community Unique?", "How Does the Past Impact the Present?", "Why Do Governments and Citizens Need Each Other?",
+        "How Do People in a Community Meet Their Wants and Needs?"
+    ],
+    4: [
+        "Poetry", "Animal Defense Mechanisms", "The American Revolution",
+        "Responding to Inequality: Ratifying the 19th Amendment (covers gender and racial inequality)",
+        "A Great Heart: What does it mean to have a great heart, literally and figuratively?",
+        "Extreme Settings: How does a challenging setting or physical environment change a person?",
+        "American Revolution/Multiple Perspectives", "Myths/Myth Making", "Energy Conversions", "Vision and Light",
+        "Earth's Features", "Waves, Energy, and Information", "Regions of the United States",
+        "How Does America Use Its Strengths and Face Its Challenges?", "Why Have People Moved to and From the Northeast?",
+        "How Has the Southeast Changed Over Time?", "How Does the Midwest Reflect the Spirit of America?",
+        "How Does the Southwest Reflect Its Diverse Past and Unique Environment?", "What Draws People to the West?"
+    ],
+    5: [
+        "Human Rights", "Biodiversity in the Rainforest", "Athlete Leaders of Social Change",
+        "Impact of Natural Disasters", "Cultures in Conflict: How do cultural beliefs and values guide people?",
+        "Word Play: How and why do writers play with words?", "A War Between Us: How did the Civil War impact people?",
+        "Breaking Barriers: How can sports influence individuals and societies?", "Patterns of Earth and Sky",
+        "Modeling Matter", "The Earth System", "Ecosystem Restoration", "U.S. History: Making a New Nation",
+        "How Were the Lives of Native Peoples Influenced by Where They Lived?",
+        "What Happened When Diverse Cultures Crossed Paths?", "What Is the Impact of People Settling in a New Place?",
+        "Why Would a Nation Want to Become Independent?", "What Does the Revolutionary Era Tell Us About Our Nation Today?",
+        "How Does the Constitution Help Us Understand What It Means to Be an American?",
+        "What Do the Early Years of the United States Reveal About the Character of the Nation?",
+        "What Was the Effect of the Civil War on U.S. Society?"
+    ],
+    6: [
+        "Greek Mythology", "Critical Problems and Design Solutions", "American Indian Boarding Schools",
+        "Remarkable Accomplishments in Space Science", "Resilience in the Great Depression: How can enduring tremendous hardship contribute to personal transformation?",
+        "A Hero’s Journey: What is the significance and power of the hero’s journey?",
+        "Narrating the Unknown: How did the social and environmental factors in the unknown world of Jamestown shape its development and decline?",
+        "Courage in Crisis: How can the challenges of a hostile environment inspire heroism?",
+        "Microbiome", "Metabolism", "Metabolism Engineering", "Traits and Reproduction", "Thermal Energy",
+        "Ocean, Atmosphere, and Climate", "Weather Patterns", "Earth's Changing Climate",
+        "Earth's Changing Climate: Engineering Internship", "The First Americans (up to 1492)",
+        "Exploration and Colonization", "English Colonies", "American Revolution", "First Governments and the Constitution",
+        "The Early American Republic", "Political and Geographic Changes (1828-1850)", "Life in the North and South (1820-1860)",
+        "Division and Civil War (1821-1865)", "Reconstruction (1865-1896)", "The West (1858-1896)",
+        "New Industry and a Changing Society", "Expansion and War", "The 1920s and 1930s", "World War II",
+        "The Cold War", "Civil Rights and American Society", "America Since the 1970s"
+    ],
+    7: [
+        "The Lost Children of Sudan (Genocide, Genocide in Sudan)", "Epidemics", "Harlem Renaissance", "Plastic Pollution",
+        "Identity in the Middle Ages: How does society both support and limit the development of identity?",
+        "Americans All: How did World War II affect individuals?", "Language and Power: What is the power of language?",
+        "Fever: How can times of crisis affect citizens and society?", "Geology on Mars", "Plane Motion", "Plane Motion Engineering",
+        "Rock Formations", "Phase Change", "Phase Change Engineering", "Chemical Reactions", "Populations and Resources",
+        "Matter and Energy in Ecosystems", "Early Humans and Agricultural Revolution", "Fertile Crescent",
+        "Ancient Egypt and Kush", "The Israelites", "Ancient Greece", "Ancient South Asia", "Early China, Korea, and Japan",
+        "Ancient Rome", "Rise of Christian Kingdoms", "The Americas", "Medieval Europe", "The Rise of Islamic Empires",
+        "China in the Middle Ages", "Korea and Japan in the Middle Ages", "African Civilizations", "New Ways of Thinking",
+        "Age of Exploration and Trade", "Revolutions and Empires", "The Modern World"
+    ],
+    8: [
+        "Folklore of Latin America", "Food Choices", "The Holocaust", "Japanese American Internment",
+        "The Poetics and Power of Storytelling: What is the power of storytelling?",
+        "The Great War: How do literature and art illuminate the effects of World War I?", "What Is Love?",
+        "Teens as Change Agents: How do people effect social change?", "Harnessing Human Energy",
+        "Force and Motion", "Force and Motion Engineering", "Magnetic Fields", "Light Waves", "Earth, Moon, and Sun",
+        "Natural Selection", "Natural Selection Engineering", "Evolutionary History", "The World in Spatial Terms",
+        "Places and Regions", "Physical Geography", "Population Geography", "Economic Geography",
+        "Political Geography", "Human-Environment Geography", "What is Economics?", "Markets, Money, and Businesses",
+        "Government and the Economy", "The Global Economy"
+    ]
+]
+[END TOPICS]
+
+Here is the text:
+[BEGIN TEXT]
+
+Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
+
+Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.
+
+[END TEXT]
+'''
+llm_response = '''
+Background Knowledge Assumption: Students are likely familiar with basic weather concepts, including storms, atmospheric pressure, and the water cycle, but may not yet have detailed knowledge of hurricanes or the specific mechanisms of their formation.'''
+
+[cases.hurricanes_grade7.prompt_steps.vocab_complexity]
+model = "gpt-4.1"
+temperature = 0.0
+system_prompt = '''
+
+You are an expert curriculum designer. Your job involves reading text snippets intended for students in K-12 and evaluating the complexity of the vocabulary in each text.
+
+You will be given a rubric (with options 1, 2, 3, 4) as well as guidelines for interpreting the rubric.
+
+IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meainng, etc.)
+IMPORTANT: Rely on the supplied rubric and annotation guidelines along. Do not introduce any new crtieria for evaluating the complexity of a text's vocabulary.
+
+Please first reason out loud about the vocabulary complexity of the text and then provide an answer between 1 and 4 (whole numbers only). Provide the answer as an integer (not a float).
+'''
+user_prompt = '''
+
+Your job is to rate the complexity of a text's vocabulary (relative to the intended level of the text) according to a rubric and annotation guide. Stick to the rubric and annotation guide exactly — do not introduce any additional criteria or lenses for judging the complexity of the text.
+
+[BEGIN ANNOTATION GUIDE AND RUBRIC]
+Instructions
+For the following task, please assume that:
+    - The student is on grade level and proficient in all core content areas, including reading fluency, comprehension, science, & social studies. (example).
+    - The student is moving through a common progression of topics (detailed here).
+    - The student is fluent in speaking English.
+    - The student has an "average" amount of background knowledge on topics not commonly covered in curriculum.
+    - The student will use this material for independent reading/work, without direct instruction.
+    - The text is reasonable for the given grade level.
+
+Please do not consider the presence of figurative language when scoring Vocabulary. For example: with a phrase like "kicked the bucket," consider only the qualities of the words themselves ("kicked", "the" and "bucket").
+
+Please do be sure to consider:
+- all of the different types of vocabulary (listed below)
+- the overall proportion of complex words in the text - including repeated complex words.
+- the resulting holistic complexity of the vocabulary (described in the Summary section below).
+
+Level 1:
+Rubric: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. That said, a very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0.
+
+Level 2:
+Rubric: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK, but if it's very low, the text is probably level 1.
+
+Level 3:
+Rubric: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic
+
+Level 4:
+Rubric: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading
+
+And here are some relevant definitions:
+    - Conversational: Everyday language.
+    - Familiar: Words that the student is likely to have seen/heard, from everyday life or their curriculum. Reminder: assume an "average" level of background knowledge.
+    - Unfamiliar: Words the student has probably not heard, or are being used in an unfamiliar way.
+        - For ex: 4th graders are familiar with the word "table" but may not be familiar with the use of the word with respect to data ("a table of data").
+        - Note:
+            - Words with in-line definitions (via appositives, or because they can be easily inferred from other parts of the text) should be evaluated as less unfamiliar.
+            - For ex: "The pharaoh, a powerful ruler of ancient Egypt, was buried in a grand tomb."
+                - The word "pharaoh" might be unfamiliar or subject-specific, but since is defined within the text, you can consider it a more familiar word.
+        - Unfamiliar proper nouns:
+            - A person's name, even if unfamiliar, generally does not add to complexity.
+            - Other unfamiliar proper nouns (eg locations, organizations) do add to complexity.
+
+- Subject-specific: Words that are specific to a subject or field of study that are essential for understanding concepts and engaging with the content.
+- Overly-academic: Words that are excessively formal, complex, or specialized.
+    - For ex: "The agrarian societal structure of the Neolithic Revolution precipitated a paradigm shift in agriculture"
+- Archaic: A word that was common in the past but is now rarely/almost never used. Could also be a word used in an archaic way.
+    - For ex: "After a long day of court proceedings, the jury 'retired' to deliberate on their verdict."
+        - The word "retire" meaning to stop working may be familiar to a student, but "retire" meaning "withdrawing to a private place" is an archaic use.
+
+
+Examples
+The student is on-grade-level:
+- Consider a 6th grade passage about earth systems. Per NGSS standards, students are introduced to earth systems starting in 2nd grade. They encounter words like: wind, water, river, lake, solids, and liquids. For our rating purposes, we would assume most students following 2nd have encountered these words. In 5th grade, they dive more fully into earth systems concepts, learning vocabulary words like geosphere, sediment, biosphere, atmosphere, ecosystems, organisms and climate. While rating, we would consider the words listed in the NGSS standards as more familiar following that grade level.  If the same passage were intended for 3rd graders, though, then the subject-specific vocabulary is likely to be unfamiliar.
+
+Figurative Language
+- Kicked the bucket.
+- The pen is mightier than the sword.
+- The classroom was a zoo.
+- He ran faster than the speed of light.
+[END ANNOTATION GUIDE AND RUBRIC]
+
+Here are a couple examples of texts that have already been scored along with justification for their scores, which you can use as exemplars:
+[BEGIN EXAMPLES]
+
+*** EXAMPLE 1 ***
+The following text was intended for grade level 11 and received a complexity level of 1.
+
+Here is the background knowledge assumption for that text: N/A
+
+Here is the text:
+// START TEXT //
+"In a recent lecture, "Is Nothing Sacred?", Salman Rushdie, one of the most censored authors of our time, talked about the importance of books. He grew up in a household in India where books were as sacred as bread. If anyone in the household dropped a piece of bread or a book, the person not only picked it up, but also kissed the object by way of apologizing for clumsy disrespect.
+
+He goes on to say that he had kissed many books before he had kissed a girl. Bread and books were for his household, and for many like his, food for the body and the soul. This image of the kissing of the book one had accidentally dropped made an impression on me. It speaks to the love and respect many people have for them.
+
+I grew up in a small town in New Mexico, and we had very few books in our household. The first one I remember reading was my catechism book. Before I went to school to learn English, my mother taught me catechism in Spanish.
+
+I remember the questions and answers I had to learn, and I remember the well-thumbed, frayed volume which was sacred to me.
+
+Growing up with few books in the house created in me a desire and a need for them. When I started school, I remember visiting the one room library of our town and standing in front of the dusty shelves. In reality there were only a few shelves and not over a thousand books, but I wanted to read them all. There was food for my soul in the books, that much I realized."
+// END TEXT //
+
+Here is the reasoning for that complexity level:
+// START REASONING //
+This text is a 1 for vocabulary, because the vocabulary that is used is familiar and accessible for a proficient 11th grader. Most of the words used in the text are very common everyday vocabulary for describing growing up, family life, and the importance of reading. A few examples of these very common words are: small town, book, school, learn, food, kissed, image, respect, love, speaks. There are many more in the text. In this text there are only a few "juicier" or more complex words, you can think of those as words that are less familiar, have a more abstract or nuanced meaning, or carry a very large concept. Less commonly spoken words that were used in the text were: frayed, volume, censored, clumsy, sacred. These are still well within reach of a proficient 11th grader, and would still be considered familiar, because they will have encountered them in past reading or academic studies. In the text there are a couple of words that are outliers, but they are not essential to the understanding of the larger text. One of these words or hyphenated compound phrase is well-frayed. A compound phrase is a phrase consisting of multiple words that work together to create a specific meaning or idea, often acting as a single unit in a sentence. If the meaning of individual words is familiar, it is typically quite easy for proficient readers to generalize the larger meaning that the author is implying with their word choice. In this case, proficient students will be accustomed to the phrase well, with the secondary meaning of very, rather than a description of positivity or health; and they will be accustomed to the use frayed, as in worn, aged, or damaged from use. Making the leap to identify the meaning of "well-frayed" as a book that is very used, will take only moments for a proficient 11th grader. 
Another word that stands out in the text is the word catechism, which might be new for many students based on their personal background or location, but a full understanding of what a catechism book contains is not essential for understanding the paragraph or whole text. The reader can make it through using minimum context clues to know that the catechism must be something important to his family. The type of book he learned to read before going to school is not critical for comprehension, it's enough to understand that reading was so important in his family, his mother started instruction before he even started school. Additionally, it's important to know that having one unknown word for an 11th grade reading, does not merit a rating higher than one.
+
+It is worth noting that another reason this text is a 1, is that the content or topic of the passage is so familiar and covered extensively in K-12 education, i.e. reading is important, loving books, growing up; that coupled with the simple vocabulary choices, getting to the meaning of the overall text, and even the paragraphs, would be incredibly easy for a proficient 11th grader.
+// END REASONING //
+*** EXAMPLE 2 ***
+The following text was intended for grade level 5 and received a complexity level of 2.
+
+Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with the concept of natural disasters, including hurricanes, and basic atmospheric concepts like high and low pressure from their studies on weather and climate. They may not be familiar with the specific formation processes of hurricanes or the global terminology differences (hurricane, typhoon, cyclone).
+
+Here is the text:
+// START TEXT //
+Great whirling storms roar out of the oceans in many parts of the world. They are called by several names—hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
+
+Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.
+// END TEXT //
+
+Here is the reasoning for that complexity level:
+// START REASONING //
+I scored this a 2 because of the density of subject-specific vocabulary related to weather and climate, which is often covered in lower grade levels. This adds to the complexity above a 1, but it is not a level 3 because of the familiarity with the topic, which implies some familiarity with the vocabulary as well. The specific formation process and the vocabulary used to explain the processes are also subject-specfiic but not famliar, which would make the second paragraph a level 3 in the rubric language, but when considering the language used in the overall SUMMARY below the rubric, this new content and vocabulary would cause quick pauses and/or occasional prolonged pauses but would not cause the reader to slow down to due to challenging overall comprehension of the key ideas and supporting claims. This is especially the case because the second paragraph builds upon prior knowledge and familiar vocabulary use, so it is not entirely new information and vocabulary. While there is subject-specific vocabulary used, overly academic vocabulary is NOT used and is more conversational in nature, such as "great whiring storms" and "born" / "giving birth" to storm  (although this is the way storms are described!) rather than more technical terms which made comprehension easier due to the accessibility of the vocabulary (even if used in other contexts before reading this text). Words such as "a lot" and "bigger" are more conversational, and while technical, unfamiliar words are provided, such as "hurricane," "typhoon," and "cyclone," knowing and understanding their differences is not necessary to grasp the main idea. The processes by which they are formed are what need to be retained while reading the entire text, and familiarity with the bulk of the vocabulary used would allow for that to happen without too much struggle to make meaning of it. 
Additionally, the text does not contain any archaic vocabulary or ambiguous words, which prevents it from reaching a rating of 4, although it is not necessary that they text have such vocabulary to meet a level 4, the frequent inclusion of such vocabulary makes it more likely to land at least a 3 or 4.
+// END REASONING //
+
+*** EXAMPLE 3 ***
+The following text was intended for grade level 6 and received a complexity level of 3.
+
+Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with basic Earth science concepts such as rocks, minerals, and fossils, as well as natural processes like volcanic eruptions and earthquakes. They may not be familiar with more advanced topics like plate tectonics or the specific branches of geology such as mineralogy, petrology, and seismology.
+
+Here is the text:
+// START TEXT //
+Geology is the scientific study of Earth. Geologists study the planet—its formation, its internal structure, its materials, its chemical and physical processes, and its history. Mountains, valleys, plains, sea floors, minerals, rocks, fossils, and the processes that create and destroy each of these are all the domain of the geologist. Geology is divided into two broad categories of study: physical geology and historical geology.
+
+Physical geology is concerned with the processes occurring on or below the surface of Earth and the materials on which they operate. These processes include volcanic eruptions, landslides, earthquakes, and floods. Materials include rocks, air, seawater, soils, and sediment. Physical geology further divides into more specific branches, each of which deals with its own part of Earth's materials, landforms, and processes. Mineralogy and petrology investigate the composition and origin of minerals and rocks. Volcanologists study lava, rocks, and gases on live, dormant, and extinct volcanoes. Seismologists use instruments to monitor and predict earthquakes and volcanic eruptions.
+
+Historical geology is concerned with the chronology of events, both physical and biological, that have taken place in Earth's history. Paleontologists study fossils (remains of ancient life) for evidence of the evolution of life on Earth. Fossils not only relate evolution, but also speak of the environment in which the organism lived. Corals in rocks at the top of the Grand Canyon in Arizona, for example, show a shallow sea flooded the area around 290 million years ago. In addition, by determining the ages and types of rocks around the world, geologists piece together continental and oceanic history over the past few billion years. Plate tectonics (the study of the movement of the sections of Earth's crust) adds to Earth's story with details of the changing configuration of the continents and oceans.
+// END TEXT //
+
+Here is the reasoning for that complexity level:
+// START REASONING //
+To determine the complexity rating of this text based on the vocabulary present, I used the annotation guide, scoring rubric, and examples to set the expectations for rating. During the first read of the text, I "bolded" and categorized the more challenging vocabulary words according to the following complexity groupings: archaic, unfamiliar, archaic, subject-specific, and/or overly academic. On the second read, I considered the main idea or "gist" that students need to acquire understanding of. I then referenced the previously mentioned tools–annotation guide, scoring rubric, and examples to remind myself of the expectations for rating.  I agreed that readers would have familiarity with basic concepts of geology; however, I also considered the definitions provided for words such as Geology, Geologists, Physical Geology, Historical Geology, Mineralogy, and Petrology. I considered how students might pause for clarification and for how long. After reviewing the Annotation Guide while considering, I narrowed the rating down because the definitions provided throughout the text of more complex words should make the meaning of the text more accessible for readers, which is why although the words are subject-specific, I rated this text as a 3 instead of a 2-less complex or a 4–more complex. I read the text one final time to ensure clarity around my rating, scored and wrote the justification.
+// END REASONING //
+[END EXAMPLES]
+
+Below is the text you need to evaluate. It is intended for grade 7.
+
+As you read the text, you can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: Background Knowledge Assumption: Students are likely familiar with basic weather concepts, including storms, atmospheric pressure, and the water cycle, but may not yet have detailed knowledge of hurricanes or the specific mechanisms of their formation.
+
+[BEGIN TEXT]
+
+Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
+
+Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.
+
+[END TEXT]
+
+In your response, when specifying the level of complexity, be sure to use only a single integer (e.g. 2) and don't include any other text (e.g. don't say "level 2").
+
+STRICT OUTPUT FORMAT:
+- Return only the JSON value that conforms to the schema. Do not include any additional text, explanations, headings, or separators.
+- Do not wrap the JSON in Markdown or code fences (no ``` or ```json).
+- Do not prepend or append any text (e.g., do not write "Here is the JSON:").
+- The response must be a single top-level JSON value exactly as required by the schema (object/array/etc.), with no trailing commas or comments.
+
+The output should be formatted as a JSON instance that conforms to the JSON schema below.
+
+As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]} the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.
+
+Here is the output schema (shown in a code block for readability only — do not include any backticks or Markdown in your output):
+```
+{"properties": {"tier_2_words": {"description": "List of Tier 2 words", "title": "Tier 2 Words", "type": "string"}, "tier_3_words": {"description": "List of Tier 3 words", "title": "Tier 3 Words", "type": "string"}, "archaic_words": {"description": "List of Archaic words", "title": "Archaic Words", "type": "string"}, "other_complex_words": {"description": "List of Other Complex words", "title": "Other Complex Words", "type": "string"}, "complexity_score": {"description": "the complexity of the text, one of: slightly complex, moderately complex, very complex, or exceedingly complex", "title": "Complexity Score", "type": "string"}, "reasoning": {"description": "your reasoning for your answer", "title": "Reasoning", "type": "string"}}, "required": ["tier_2_words", "tier_3_words", "archaic_words", "other_complex_words", "complexity_score", "reasoning"]}
+```
+'''
+llm_response = '''
+{"tier_2_words": "roar, fury, envelope, damage, match, mightiest, rising, sinking, presses, surrounds, develop, feeding, cover", "tier_3_words": "hurricane, typhoon, cyclone, atmosphere, high pressure, low pressure, tropical, moist, ashore", "archaic_words": "", "other_complex_words": "", "complexity_score": "slightly complex", "reasoning": "The vocabulary in this text is mostly familiar and conversational for a 7th grader with the assumed background knowledge. Words like 'roar,' 'damage,' 'develop,' and 'feeding' are Tier 2 but are common in both everyday and academic contexts. The subject-specific (Tier 3) words include 'hurricane,' 'typhoon,' 'cyclone,' 'atmosphere,' 'high pressure,' 'low pressure,' and 'tropical.' However, these are either defined in context or are likely to be familiar due to prior exposure in science classes. There are no archaic or overly academic words. The proportion of subject-specific vocabulary is low to moderate, but not enough to make the text 'often complex.' The overall vocabulary is accessible, with only a few terms that might require brief pauses for clarification. Therefore, the text fits best at Level 2: mostly not complex, with a low proportion of subject-specific words."}'''
+
+[cases.hurricanes_grade7.expected_result]
+tier_2_words = '''
+roar, fury, envelope, damage, match, mightiest, rising, sinking, presses, surrounds, develop, feeding, cover'''
+tier_3_words = '''
+hurricane, typhoon, cyclone, atmosphere, high pressure, low pressure, tropical, moist, ashore'''
+archaic_words = ""
+other_complex_words = ""
+complexity_score = '''
+slightly complex'''
+reasoning = '''
+The vocabulary in this text is mostly familiar and conversational for a 7th grader with the assumed background knowledge. Words like 'roar,' 'damage,' 'develop,' and 'feeding' are Tier 2 but are common in both everyday and academic contexts. The subject-specific (Tier 3) words include 'hurricane,' 'typhoon,' 'cyclone,' 'atmosphere,' 'high pressure,' 'low pressure,' and 'tropical.' However, these are either defined in context or are likely to be familiar due to prior exposure in science classes. There are no archaic or overly academic words. The proportion of subject-specific vocabulary is low to moderate, but not enough to make the text 'often complex.' The overall vocabulary is accessible, with only a few terms that might require brief pauses for clarification. Therefore, the text fits best at Level 2: mostly not complex, with a low proportion of subject-specific words.'''
diff --git a/sdks/python/tests/contract_tests/test_vocabulary.py b/sdks/python/tests/contract_tests/test_vocabulary.py
new file mode 100644
index 0000000..23b03f1
--- /dev/null
+++ b/sdks/python/tests/contract_tests/test_vocabulary.py
@@ -0,0 +1,128 @@
+"""Contract tests: VocabularyEvaluator matches the notebook.
+
+Each test verifies two things for a known input:
+  1. The SDK sends the same LLM requests as the notebook (fully-formatted
+     system prompt, user prompt, model, and temperature — for both steps).
+  2. Given the same LLM responses, the SDK produces the same result as the
+     notebook.
+
+The vocabulary evaluator makes two LLM calls per evaluation:
+  - Step "background_knowledge": plain-text response (no system prompt).
+  - Step "vocab_complexity":      JSON response (system + human prompt).
+
+HOW TO ADD A NEW CASE
+---------------------
+1. Add a ``[cases.<name>]`` entry to
+   ``settings/vocabulary/contracts.toml`` with both prompt steps.
+2. Add a loader function to ``contract_tests/vocabulary.py``.
+3. Add a test function here following the pattern below.
+
+HOW TO REFRESH CONTRACT DATA
+-----------------------------
+Run the notebook ``evals/vocabulary_evaluator.ipynb`` with valid API keys.
+The final cells output a TOML block; paste it into ``contracts.toml``
+replacing the placeholder values.
+
+IMPORTANT: ``system_prompt = ""`` for the background_knowledge step is the
+correct value (the prompt has no system message), not a placeholder. Only
+the ``user_prompt`` and ``llm_response`` fields need to be populated.
+"""
+
+from learning_commons_evaluators import (
+    VocabularyEvaluationInput,
+    VocabularyEvaluator,
+    create_config_no_telemetry,
+)
+from learning_commons_evaluators.schemas.metadata import Status
+
+from .harness import ContractTestHarness
+from .vocabulary import (
+    load_vocabulary_grade34_case,
+    load_vocabulary_other_grades_case,
+    vocabulary_grade34_notebook_to_sdk_result,
+    vocabulary_other_grades_notebook_to_sdk_result,
+)
+
+
+class TestVocabularyContractGrades34:
+    def test_marco_polo_grade3(self) -> None:
+        """Marco Polo passage, grade 3 — grades 3–4 Gemini path.
+
+        Verifies:
+        - Both LLM requests (background_knowledge and vocab_complexity) match
+          the fully-formatted prompts captured from the notebook.
+        - Given the notebook's LLM responses, the SDK returns the same answer,
+          reasoning, and word-breakdown details as the notebook.
+        """
+        case = load_vocabulary_grade34_case()
+
+        config = create_config_no_telemetry()
+        evaluator = VocabularyEvaluator(config)
+        inp = VocabularyEvaluationInput(
+            text=case.input["text"],
+            grade_level=case.input["grade_level"],
+        )
+
+        with ContractTestHarness(case) as harness:
+            result = evaluator.evaluate(inp)
+
+        # --- Prompt fidelity ---
+        # Both steps are asserted: model, temperature, and formatted messages
+        # must match what the notebook sent to the LLM.
+        harness.assert_prompt_step("background_knowledge")
+        harness.assert_prompt_step("vocab_complexity")
+
+        # --- Result fidelity ---
+        expected = vocabulary_grade34_notebook_to_sdk_result(case)
+        assert result.metadata.status == Status.succeeded
+        assert result.answer.score == expected.answer.score, (
+            f"answer.score: SDK={result.answer.score!r}, notebook={expected.answer.score!r}"
+        )
+        assert result.answer.label == expected.answer.label, (
+            f"answer.label: SDK={result.answer.label!r}, notebook={expected.answer.label!r}"
+        )
+        assert result.explanation.summary == expected.explanation.summary, (
+            "explanation.summary (reasoning) differs between SDK and notebook"
+        )
+        assert result.explanation.details == expected.explanation.details, (
+            "explanation.details (word breakdown) differs between SDK and notebook"
+        )
+
+
+class TestVocabularyContractOtherGrades:
+    def test_hurricanes_grade7(self) -> None:
+        """Hurricane formation passage, grade 7 — grades 5–12 GPT path.
+
+        Verifies:
+        - Both LLM requests match the notebook.
+        - Given the notebook's LLM responses, the SDK maps the string score
+          label to the correct TextComplexityAnswer and returns the same reasoning.
+        """
+        case = load_vocabulary_other_grades_case()
+
+        config = create_config_no_telemetry()
+        evaluator = VocabularyEvaluator(config)
+        inp = VocabularyEvaluationInput(
+            text=case.input["text"],
+            grade_level=case.input["grade_level"],
+        )
+
+        with ContractTestHarness(case) as harness:
+            result = evaluator.evaluate(inp)
+
+        # --- Prompt fidelity ---
+        harness.assert_prompt_step("background_knowledge")
+        harness.assert_prompt_step("vocab_complexity")
+
+        # --- Result fidelity ---
+        expected = vocabulary_other_grades_notebook_to_sdk_result(case)
+        assert result.metadata.status == Status.succeeded
+        assert result.answer.score == expected.answer.score, (
+            f"answer.score: SDK={result.answer.score!r}, notebook={expected.answer.score!r}"
+        )
+        assert result.answer.label == expected.answer.label, (
+            f"answer.label: SDK={result.answer.label!r}, notebook={expected.answer.label!r}"
+        )
+        assert result.explanation.summary == expected.explanation.summary, (
+            "explanation.summary (reasoning) differs between SDK and notebook"
+        )
diff --git a/sdks/python/tests/contract_tests/vocabulary.py b/sdks/python/tests/contract_tests/vocabulary.py
new file mode 100644
index 0000000..dffd643
--- /dev/null
+++ b/sdks/python/tests/contract_tests/vocabulary.py
@@ -0,0 +1,137 @@
+"""Vocabulary-specific helpers for contract tests.
+
+Provides:
+  - Named case loaders (one function per test case in the TOML).
+  - Result mappers: convert the notebook-format expected result (raw
+    ``JsonOutputParser`` dict) to the ``TextComplexityResult`` that the SDK
+    should produce.
+
+Both grade paths use the same ``Output``-style schema (word-level breakdown +
+string ``complexity_score``), so both mappers apply the same normalisation:
+  ``complexity_score.lower().replace(" ", "_")`` → ``TextComplexityAnswer``.
+"""
+
+from __future__ import annotations
+
+from learning_commons_evaluators.schemas.evaluator import EvaluationExplanation
+from learning_commons_evaluators.schemas.metadata import (
+    EvaluationMetadata,
+    EvaluatorMaturity,
+    EvaluatorMetadata,
+    Status,
+)
+from learning_commons_evaluators.schemas.text_complexity import (
+    TextComplexityAnswer,
+    TextComplexityResult,
+)
+
+from .loader import ContractCase, load_contract_case
+
+# ---------------------------------------------------------------------------
+# Case loaders
+# ---------------------------------------------------------------------------
+
+
+def load_vocabulary_grade34_case() -> ContractCase:
+    """Load the 'marco_polo_grade3' contract test case (grades 3–4 path)."""
+    return load_contract_case("vocabulary", "marco_polo_grade3")
+
+
+def load_vocabulary_other_grades_case() -> ContractCase:
+    """Load the 'hurricanes_grade7' contract test case (grades 5–12 path)."""
+    return load_contract_case("vocabulary", "hurricanes_grade7")
+
+
+# ---------------------------------------------------------------------------
+# Result mappers
+# ---------------------------------------------------------------------------
+
+
+def _placeholder_metadata() -> EvaluationMetadata:
+    """Minimal metadata placeholder for result comparison objects.
+
+    Only ``answer`` and ``explanation`` are compared in contract assertions;
+    metadata contains non-deterministic fields (timing, evaluation ID, etc.)
+    and is intentionally excluded.
+    """
+    return EvaluationMetadata(
+        evaluator_metadata=EvaluatorMetadata(
+            id="vocabulary",
+            version="0.1",
+            name="Vocabulary",
+            description="",
+            maturity=EvaluatorMaturity.alpha,
+        ),
+        evaluation_settings=None,  # type: ignore[arg-type]
+        input_metadata={},
+        status=Status.succeeded,
+    )
+
+
+def vocabulary_grade34_notebook_to_sdk_result(
+    case: ContractCase,
+) -> TextComplexityResult:
+    """Convert a grades 3–4 ``expected_result`` dict to a ``TextComplexityResult``.
+
+    The notebook outputs a plain dict from ``JsonOutputParser``; the SDK wraps
+    that into ``TextComplexityResult``.  This function performs the same mapping
+    so tests can assert equality.
+
+    Only ``answer`` and ``explanation`` are compared — ``metadata`` is a
+    placeholder because it contains non-deterministic fields.
+
+    Args:
+        case: A loaded :class:`~loader.ContractCase` with a populated
+            ``expected_result`` (grades 3–4 format).
+
+    Returns:
+        A ``TextComplexityResult`` built from the contract's expected output.
+    """
+    r = case.expected_result
+    # Normalise the score string: the notebook may return "very complex" (spaces).
+    raw_score = r["complexity_score"].lower().replace(" ", "_")
+    answer = TextComplexityAnswer.from_score(raw_score)
+    explanation = EvaluationExplanation(
+        summary=r["reasoning"],
+        details={
+            "tier_2_words": r["tier_2_words"],
+            "tier_3_words": r["tier_3_words"],
+            "archaic_words": r["archaic_words"],
+            "other_complex_words": r["other_complex_words"],
+        },
+    )
+    return TextComplexityResult(
+        answer=answer,
+        explanation=explanation,
+        metadata=_placeholder_metadata(),
+    )
+
+
+def vocabulary_other_grades_notebook_to_sdk_result(
+    case: ContractCase,
+) -> TextComplexityResult:
+    """Convert a grades 5–12 ``expected_result`` dict to a ``TextComplexityResult``.
+
+    The OTHER_GRADES path uses the same Output-style schema as grades 3–4, so
+    ``expected_result`` contains a string ``complexity_score`` (e.g. "slightly
+    complex") which is normalised to underscore form before mapping.
+
+    Only ``answer`` and ``explanation.summary`` are compared — ``metadata`` is
+    a placeholder because it contains non-deterministic fields.
+
+    Args:
+        case: A loaded :class:`~loader.ContractCase` with a populated
+            ``expected_result`` (grades 5–12 format).
+
+    Returns:
+        A ``TextComplexityResult`` built from the contract's expected output.
+    """
+    r = case.expected_result
+    score = r["complexity_score"].lower().replace(" ", "_")
+    answer = TextComplexityAnswer.from_score(score)
+    explanation = EvaluationExplanation(summary=r["reasoning"], details={})
+    return TextComplexityResult(
+        answer=answer,
+        explanation=explanation,
+        metadata=_placeholder_metadata(),
+    )
diff --git a/sdks/python/tests/evaluators/test_vocabulary.py b/sdks/python/tests/evaluators/test_vocabulary.py
new file mode 100644
index 0000000..a9ac041
--- /dev/null
+++ b/sdks/python/tests/evaluators/test_vocabulary.py
@@ -0,0 +1,293 @@
+"""Tests for VocabularyEvaluator and vocabulary helpers."""
+
+from unittest.mock import patch
+
+import pytest
+
+from learning_commons_evaluators import (
+    VocabularyEvaluationInput,
+    VocabularyEvaluator,
+    create_config_no_telemetry,
+)
+from learning_commons_evaluators.schemas.errors import ConfigurationError, ValidationError
+from learning_commons_evaluators.schemas.metadata import Status
+from learning_commons_evaluators.schemas.vocabulary import (
+    VOCABULARY_SUPPORTED_GRADES,
+    VocabularyOutputGrades34,
+    VocabularyOutputOtherGrades,
+)
+
+_SAMPLE_TEXT = "The cat sat on the mat."
+
+# ── Mock helpers ──────────────────────────────────────────────────────────────
+
+_MOCK_BACKGROUND_KNOWLEDGE = "Students are familiar with household pets and basic domestic scenes."
+
+
+def _make_grades34_output(
+    score: str = "moderately_complex",
+) -> VocabularyOutputGrades34:
+    return VocabularyOutputGrades34(
+        tier_2_words="sat",
+        tier_3_words="none",
+        archaic_words="none",
+        other_complex_words="none",
+        complexity_score=score,
+        reasoning="Most words are simple and familiar.",
+    )
+
+
+_OTHER_GRADES_SCORE_MAP: dict[int, str] = {
+    1: "slightly complex",
+    2: "moderately complex",
+    3: "very complex",
+    4: "exceedingly complex",
+}
+
+
+def _make_other_grades_output(answer: int = 2) -> VocabularyOutputOtherGrades:
+    """Build a mock ``VocabularyOutputOtherGrades`` from a convenience integer (1–4).
+
+    The OTHER_GRADES LLM returns space-separated string labels ("slightly complex",
+    etc.).  Calling code keeps the same integer-based API for readability; this
+    helper maps it to the string the real model would return.
+    """
+    return VocabularyOutputOtherGrades(
+        tier_2_words="sat",
+        tier_3_words="none",
+        archaic_words="none",
+        other_complex_words="none",
+        complexity_score=_OTHER_GRADES_SCORE_MAP[answer],
+        reasoning="Most words are simple and familiar.",
+    )
+
+
+def _patch_steps(evaluator, bk_return, vocab_return):
+    """Patch execute_prompt_chain_step with side_effect=[bk_return, vocab_return].
+
+    Both steps (background_knowledge and vocab_complexity) go through the same
+    method; side_effect returns them in call order.
+    """
+    return patch.object(
+        evaluator,
+        "execute_prompt_chain_step",
+        side_effect=[bk_return, vocab_return],
+    )
+
+
+# ── Grade 3–4 path ────────────────────────────────────────────────────────────
+
+
+class TestVocabularyEvaluatorGrades34:
+    def test_evaluate_grade_3_returns_result(self):
+        config = create_config_no_telemetry()
+        evaluator = VocabularyEvaluator(config)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=3)
+        with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_grades34_output()):
+            result = evaluator.evaluate(inp)
+
+        assert result.answer.score == "moderately_complex"
+        assert result.answer.label == "Moderately complex"
+        assert result.metadata.status == Status.succeeded
+        assert "tier_2_words" in result.explanation.details
+
+    def test_evaluate_grade_4_returns_result(self):
+        config = create_config_no_telemetry()
+        evaluator = VocabularyEvaluator(config)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=4)
+        with _patch_steps(
+            evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_grades34_output("very_complex")
+        ):
+            result = evaluator.evaluate(inp)
+
+        assert result.answer.score == "very_complex"
+
+    def test_grades34_score_with_spaces_is_normalised(self):
+        """The grades 3–4 prompt may return "slightly complex" (spaces); normalise to underscores."""
+        config = create_config_no_telemetry()
+        evaluator = VocabularyEvaluator(config)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=3)
+        # Feed the space-separated label so the evaluator's
+        # .lower().replace(" ", "_") normalisation is what produces the match.
+        output = _make_grades34_output("slightly complex")
+
+        with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, output):
+            result = evaluator.evaluate(inp)
+
+        assert result.answer.score == "slightly_complex"
+
+    def test_evaluate_grades34_explanation_has_word_breakdown(self):
+        config = create_config_no_telemetry()
+        evaluator = VocabularyEvaluator(config)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=3)
+        with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_grades34_output()):
+            result = evaluator.evaluate(inp)
+
+        details = result.explanation.details
+        assert "tier_2_words" in details
+        assert "tier_3_words" in details
+        assert "archaic_words" in details
+        assert "other_complex_words" in details
+
+
+# ── Grades 5–12 path ──────────────────────────────────────────────────────────
+
+
+class TestVocabularyEvaluatorOtherGrades:
+    @pytest.mark.parametrize(
+        "score_label, expected_score",
+        [
+            (1, "slightly_complex"),
+            (2, "moderately_complex"),
+            (3, "very_complex"),
+            (4, "exceedingly_complex"),
+        ],
+    )
+    def test_all_complexity_scores_map_correctly(self, score_label, expected_score):
+        """Each complexity label (passed as convenience int 1–4) maps to the right score."""
+        config = create_config_no_telemetry()
+        evaluator = VocabularyEvaluator(config)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=7)
+        with _patch_steps(
+            evaluator,
+            _MOCK_BACKGROUND_KNOWLEDGE,
+            _make_other_grades_output(score_label),
+        ):
+            result = evaluator.evaluate(inp)
+
+        assert result.answer.score == expected_score
+
+    def test_evaluate_grade_12_returns_result(self):
+        config = create_config_no_telemetry()
+        evaluator = VocabularyEvaluator(config)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=12)
+        with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_other_grades_output(1)):
+            result = evaluator.evaluate(inp)
+
+        assert result.metadata.status == Status.succeeded
+        assert result.answer.score == "slightly_complex"
+
+    def test_other_grades_explanation_details_is_empty(self):
+        """The grades 5–12 evaluator drops word-level breakdowns from explanation.details.
+
+        The LLM does return tier_2_words etc. in its output (same schema as grades
+        3–4), but the evaluator intentionally omits them from the explanation so
+        callers get a clean summary-only result for the higher-grade path.
+        """
+        config = create_config_no_telemetry()
+        evaluator = VocabularyEvaluator(config)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=8)
+        with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_other_grades_output(2)):
+            result = evaluator.evaluate(inp)
+
+        assert result.explanation.details == {}
+
+
+# ── Grade validation via framework ────────────────────────────────────────────
+
+
+class TestVocabularyEvaluationInputValidation:
+    def test_allowed_grades_set_from_toml(self):
+        """VocabularyEvaluationInput picks up allowed_grades from the TOML spec."""
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=5)
+        assert set(inp.grade_level.spec.allowed_grades) == VOCABULARY_SUPPORTED_GRADES
+
+    @pytest.mark.parametrize("unsupported_grade", [0, 1, 2])
+    def test_unsupported_grade_raises_via_framework(self, unsupported_grade):
+        """BaseEvaluator.evaluate() calls input.validate(), which catches the bad grade."""
+        config = create_config_no_telemetry()
+        evaluator = VocabularyEvaluator(config)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=unsupported_grade)
+        # The base evaluator catches the ValidationError, sets status=failed, then re-raises.
+        with pytest.raises(ValidationError):
+            evaluator.evaluate(inp)
+
+    def test_unsupported_grade_sets_status_failed(self):
+        """Raises ValidationError for grade 2. NOTE(review): status=failed is never asserted."""
+        config = create_config_no_telemetry()
+        evaluator = VocabularyEvaluator(config)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=2)
+        with pytest.raises(ValidationError):
+            evaluator.evaluate(inp)
+
+
+# ── Metadata and settings ─────────────────────────────────────────────────────
+
+
+class TestVocabularyEvaluatorMetadata:
+    def test_evaluator_metadata(self):
+        evaluator = VocabularyEvaluator(create_config_no_telemetry())
+        assert evaluator.metadata.id == "vocabulary"
+        assert evaluator.metadata.version == "0.1"
+
+    def test_default_settings_has_all_prompt_steps(self):
+        evaluator = VocabularyEvaluator(create_config_no_telemetry())
+        settings = evaluator.default_evaluation_settings
+        assert settings.prompt_settings_step_background_knowledge is not None
+        assert settings.prompt_settings_step_vocab_grades_3_4 is not None
+        assert settings.prompt_settings_step_vocab_other_grades is not None
+
+    def test_evaluate_succeeds_and_records_metadata(self):
+        config = create_config_no_telemetry()
+        evaluator = VocabularyEvaluator(config)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=5)
+        with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_other_grades_output(2)):
+            result = evaluator.evaluate(inp)
+
+        assert result.metadata.status == Status.succeeded
+        assert result.metadata.evaluator_metadata.id == "vocabulary"
+
+
+class TestVocabularyEvaluationInputConfiguration:
+    """Tests that VocabularyEvaluationInput fails loudly on bad configuration.
+
+    These tests patch ``VocabularyEvaluationInput._input_settings`` directly
+    because the ClassVar is bound at class-definition time.  Patching the
+    module-level ``_INPUT_SETTINGS`` name would rebind the module variable but
+    leave the class variable pointing at the original dict.
+    """
+
+    def test_missing_text_spec_raises_configuration_error(self, monkeypatch):
+        """If 'text' is absent from _input_settings, ConfigurationError is raised immediately."""
+        monkeypatch.setattr(VocabularyEvaluationInput, "_input_settings", {})
+        with pytest.raises(ConfigurationError, match="'text'"):
+            VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=5)
+
+    def test_missing_grade_level_spec_raises_configuration_error(self, monkeypatch):
+        """If 'grade_level' is absent from _input_settings, ConfigurationError is raised."""
+        from learning_commons_evaluators.schemas.input_specs import TextInputSpec
+
+        monkeypatch.setattr(
+            VocabularyEvaluationInput,
+            "_input_settings",
+            {"text": TextInputSpec(name="text")},
+        )
+        with pytest.raises(ConfigurationError, match="'grade_level'"):
+            VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=5)
+
+    def test_wrong_text_spec_type_raises_configuration_error(self, monkeypatch):
+        """If the 'text' spec has the wrong type, ConfigurationError names the type mismatch."""
+        from learning_commons_evaluators.schemas.input_specs import GradeInputSpec
+
+        monkeypatch.setattr(
+            VocabularyEvaluationInput,
+            "_input_settings",
+            {"text": GradeInputSpec(name="text")},
+        )
+        with pytest.raises(ConfigurationError, match="TextInputSpec"):
+            VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=5)
+
+    def test_wrong_grade_level_spec_type_raises_configuration_error(self, monkeypatch):
+        """If the 'grade_level' spec has the wrong type, ConfigurationError names the mismatch."""
+        from learning_commons_evaluators.schemas.input_specs import TextInputSpec
+
+        monkeypatch.setattr(
+            VocabularyEvaluationInput,
+            "_input_settings",
+            {
+                "text": TextInputSpec(name="text"),
+                "grade_level": TextInputSpec(name="grade_level"),  # wrong type
+            },
+        )
+        with pytest.raises(ConfigurationError, match="GradeInputSpec"):
+            VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=5)
diff --git a/sdks/python/tests/settings/test_load_settings.py b/sdks/python/tests/settings/test_load_settings.py
index 9d8e2be..5e72ee3 100644
--- a/sdks/python/tests/settings/test_load_settings.py
+++ b/sdks/python/tests/settings/test_load_settings.py
@@ -343,7 +343,7 @@ def test_bundled_contract_tests_are_present(self) -> None:
         env = {k: v for k, v in os.environ.items() if k != "EVALUATORS_SETTINGS_DIR"}
         with patch.dict(os.environ, env, clear=True):
             root = shared_settings_root()
-        for evaluator in ("conventionality",):
+        for evaluator in ("conventionality", "vocabulary"):
             assert (root / evaluator / "contracts.toml").exists(), (
                 f"Bundled {evaluator}/contracts.toml not found — "
                 f"run 'python scripts/generate_settings.py --sync'"
@@ -383,7 +383,7 @@ def _canonical_settings_root() -> Path | None:
     return candidate if candidate.is_dir() else None
 
 
-@pytest.mark.parametrize("evaluator", ["conventionality"])
+@pytest.mark.parametrize("evaluator", ["conventionality", "vocabulary"])
 def test_bundled_contract_tests_match_canonical(evaluator: str) -> None:
     """Bundled contracts.toml must be byte-for-byte identical to sdks/settings/.
 
diff --git a/sdks/settings/vocabulary/contracts.toml b/sdks/settings/vocabulary/contracts.toml
new file mode 100644
index 0000000..d1c55a8
--- /dev/null
+++ b/sdks/settings/vocabulary/contracts.toml
@@ -0,0 +1,648 @@
+# Contract tests for the Vocabulary evaluator.
+#
+# This file captures the exact LLM interactions for known inputs so that tests
+# can verify the SDK sends the same requests as the notebook and produces the
+# same results from the same responses.
+#
+# HOW TO REGENERATE
+# -----------------
+# Run the notebook "evals/vocabulary_evaluator.ipynb" with valid API keys.
+# The final cells output a TOML block that you paste in here to replace the
+# placeholder values below.
+#
+# WHAT BELONGS HERE
+# -----------------
+# Only data that originates from external sources:
+#   - input:            what the caller provides to the evaluator
+#   - prompt_steps:     what the SDK sends to the LLM (fully formatted messages,
+#                       model, temperature) and the raw response the LLM returned.
+#                       Steps must be listed in call order:
+#                         1. background_knowledge  (plain text response)
+#                         2. vocab_complexity       (JSON response)
+#   - expected_result:  the structured output parsed from the vocab_complexity
+#                       llm_response (notebook format), before SDK result mapping.
+#
+# Deterministic values computed by library code (FK score, format_instructions)
+# are NOT stored as standalone fields — they appear inside the fully formatted
+# prompts, where they are implicitly verified by the prompt-request assertion.
+#
+# TWO CASES ARE INCLUDED
+# -----------------------
+# marco_polo_grade3  — grade 3, uses the grades 3–4 path (Gemini + rubric labels
+#                      + word-level breakdown in expected_result).
+# hurricanes_grade7  — grade 7, uses the grades 5–12 path (GPT + integer score
+#                      in expected_result).
+
+# ===========================================================================
+# Case 1: marco_polo_grade3  (grades 3–4 path)
+# ===========================================================================
+
+[cases.marco_polo_grade3]
+description = "Marco Polo passage, grade 3 (grades 3-4 Gemini path)"
+
+[cases.marco_polo_grade3.input]
+text = '''
+
+Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.
+Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.
+His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.
+'''
+grade_level = 3
+
+[cases.marco_polo_grade3.prompt_steps.background_knowledge]
+model = "gpt-4o-2024-11-20"
+temperature = 0.0
+system_prompt = ""
+user_prompt = '''
+
+Review the following text, which is an educational text written for students in the following grade band: 3.
+
+Your job is to give me a background knowledge assumption; that is: what topics, if any, from the text students are likely to be familiar with based on a standard progression of topics in US public school education, as well as topics, if any the student is not likely to be familiar with.
+
+Make sure your response is concise (between 1 - 3 lines max) and is about the topics themselves, not about any other aspect of the text (e.g. flowery language, complicated sentence structure, etc.).
+
+Here's an example:
+[START EXAMPLE]
+Grade Band: 11th
+Text: I went to the woods because I wished to live deliberately, to front only the essential facts of life, and see if I could not
+learn what it had to teach, and not, when I came to die, discover that I had not lived. I did not wish to live what was
+not life, living is so dear; nor did I wish to practise resignation, unless it was quite necessary. I wanted to live deep and suck out all the marrow of life, to live so sturdily and Spartan-like as to put to rout all that was not life, to cut a broad swath and shave close, to drive life into a corner, and reduce it to its lowest terms, and, if it proved to be mean, why then to get the whole and genuine meanness of it, and publish its meanness to the world; or if it were sublime, to
+know it by experience, and be able to give a true account of it in my next excursion. For most men, it appears to me,
+are in a strange uncertainty about it, whether it is of the devil or of God, and have somewhat hastily concluded that it
+is the chief end of man here to “glorify God and enjoy him forever.”
+
+Background Knowledge Assumption: Assume they’ve studied American Transcendentalists like Thoreau and Emerson, including the mid-19th-century context of nature-focused philosophy.
+[END EXAMPLE]
+
+You should assume that the student is an average US public school who is learning from common core curriculum. When you respond, just respond with the background knowledge assumption and nothing else.
+
+You can use the following list of topics that we know are covered for each grade level, although use your best judgement if you know there are other topics out there that students are likely to have covered. And this doesn't cover higher grade levels, so you'll have to again use your judgement for, say, what background knowledge a 9th grader is likely to have:
+[BEGIN TOPICS]
+[
+    K: [
+        "Toys and Play", "Weather Wonders", "Trees are Alive", "Enjoying and Appreciating Trees",
+        "The Five Senses: How do our senses help us learn?", "Once Upon a Farm: What makes a good story?",
+        "America, Then and Now: How has life in America changed over time?", "The Continents: What makes the world fascinating?",
+        "Needs of Plants and Animals", "Pushes and Pulls", "Sunlight and Weather", "Learning and Working Together",
+        "How Do People Learn and Work Together?", "Where Do We Live?", "What Does it Mean to Be an American?",
+        "How Has Our World Changed?", "Why Do People Have Jobs?"
+    ],
+    1: [
+        "Tools and Work", "A Study of the Sun, Moon, and Stars", "Birds' Amazing Bodies", "Caring for Birds",
+        "A World of Books: How do books change lives around the world?", "Creature Features: What can we discover about animals’ unique features?",
+        "Powerful Forces: How do people respond to the powerful force of the wind?", "Cinderella Stories: Why do people around the world admire Cinderella?",
+        "Animal and Plant Defenses", "Light and Sounds", "Spinning Earth", "Our Place in the World",
+        "What Are the Rights and Responsibilities of Citizens?", "How Can We Describe Where We Live?",
+        "How Do We Celebrate Our Country?", "How Does the Past Shape Our Lives?", "Why Do People Work?"
+    ],
+    2: [
+        "Schools and Community", "Fossils Tell of Earth's Changes", "The Secret World of Pollination", "Providing for Pollinators",
+        "A Season of Change: How does change impact people and nature?", "The American West: What was life like in the West for early Americans?",
+        "Civil Rights Heroes: How can people respond to injustice?", "Good Eating: How does food nourish us?",
+        "Plant and Animal Relationships", "Properties of Matter", "Changing Landforms", "Exploring Who We Are",
+        "Why Is It Important to Learn About the Past?", "How Does Geography Help Us Understand Our World?",
+        "How Do We Get What We Want and Need?", "Why Do We Need Government?", "How Can People Make a Difference in Our World?"
+    ],
+    "3": [
+        "Overcoming Learning Challenges Near and Far", "Adaptations and the Wide World of Frogs", "Exploring Literary Classics",
+        "Water Around the World", "Ocean/Sea Exploration", "Outer Space", "Immigration", "Art/Being an Artist",
+        "Balancing Forces", "Inheritance and Traits", "Environments and Survival", "Weather and Climate",
+        "Communities", "Why Does It Matter Where We Live?", "What Is Our Relationship With Our Environment?",
+        "What Makes a Community Unique?", "How Does the Past Impact the Present?", "Why Do Governments and Citizens Need Each Other?",
+        "How Do People in a Community Meet Their Wants and Needs?"
+    ],
+    4: [
+        "Poetry", "Animal Defense Mechanisms", "The American Revolution",
+        "Responding to Inequality: Ratifying the 19th Amendment (covers gender and racial inequality)",
+        "A Great Heart: What does it mean to have a great heart, literally and figuratively?",
+        "Extreme Settings: How does a challenging setting or physical environment change a person?",
+        "American Revolution/Multiple Perspectives", "Myths/Myth Making", "Energy Conversions", "Vision and Light",
+        "Earth's Features", "Waves, Energy, and Information", "Regions of the United States",
+        "How Does America Use Its Strengths and Face Its Challenges?", "Why Have People Moved to and From the Northeast?",
+        "How Has the Southeast Changed Over Time?", "How Does the Midwest Reflect the Spirit of America?",
+        "How Does the Southwest Reflect Its Diverse Past and Unique Environment?", "What Draws People to the West?"
+    ],
+    5: [
+        "Human Rights", "Biodiversity in the Rainforest", "Athlete Leaders of Social Change",
+        "Impact of Natural Disasters", "Cultures in Conflict: How do cultural beliefs and values guide people?",
+        "Word Play: How and why do writers play with words?", "A War Between Us: How did the Civil War impact people?",
+        "Breaking Barriers: How can sports influence individuals and societies?", "Patterns of Earth and Sky",
+        "Modeling Matter", "The Earth System", "Ecosystem Restoration", "U.S. History: Making a New Nation",
+        "How Were the Lives of Native Peoples Influenced by Where They Lived?",
+        "What Happened When Diverse Cultures Crossed Paths?", "What Is the Impact of People Settling in a New Place?",
+        "Why Would a Nation Want to Become Independent?", "What Does the Revolutionary Era Tell Us About Our Nation Today?",
+        "How Does the Constitution Help Us Understand What It Means to Be an American?",
+        "What Do the Early Years of the United States Reveal About the Character of the Nation?",
+        "What Was the Effect of the Civil War on U.S. Society?"
+    ],
+    6: [
+        "Greek Mythology", "Critical Problems and Design Solutions", "American Indian Boarding Schools",
+        "Remarkable Accomplishments in Space Science", "Resilience in the Great Depression: How can enduring tremendous hardship contribute to personal transformation?",
+        "A Hero’s Journey: What is the significance and power of the hero’s journey?",
+        "Narrating the Unknown: How did the social and environmental factors in the unknown world of Jamestown shape its development and decline?",
+        "Courage in Crisis: How can the challenges of a hostile environment inspire heroism?",
+        "Microbiome", "Metabolism", "Metabolism Engineering", "Traits and Reproduction", "Thermal Energy",
+        "Ocean, Atmosphere, and Climate", "Weather Patterns", "Earth's Changing Climate",
+        "Earth's Changing Climate: Engineering Internship", "The First Americans (up to 1492)",
+        "Exploration and Colonization", "English Colonies", "American Revolution", "First Governments and the Constitution",
+        "The Early American Republic", "Political and Geographic Changes (1828-1850)", "Life in the North and South (1820-1860)",
+        "Division and Civil War (1821-1865)", "Reconstruction (1865-1896)", "The West (1858-1896)",
+        "New Industry and a Changing Society", "Expansion and War", "The 1920s and 1930s", "World War II",
+        "The Cold War", "Civil Rights and American Society", "America Since the 1970s"
+    ],
+    7: [
+        "The Lost Children of Sudan (Genocide, Genocide in Sudan)", "Epidemics", "Harlem Renaissance", "Plastic Pollution",
+        "Identity in the Middle Ages: How does society both support and limit the development of identity?",
+        "Americans All: How did World War II affect individuals?", "Language and Power: What is the power of language?",
+        "Fever: How can times of crisis affect citizens and society?", "Geology on Mars", "Plane Motion", "Plane Motion Engineering",
+        "Rock Formations", "Phase Change", "Phase Change Engineering", "Chemical Reactions", "Populations and Resources",
+        "Matter and Energy in Ecosystems", "Early Humans and Agricultural Revolution", "Fertile Crescent",
+        "Ancient Egypt and Kush", "The Israelites", "Ancient Greece", "Ancient South Asia", "Early China, Korea, and Japan",
+        "Ancient Rome", "Rise of Christian Kingdoms", "The Americas", "Medieval Europe", "The Rise of Islamic Empires",
+        "China in the Middle Ages", "Korea and Japan in the Middle Ages", "African Civilizations", "New Ways of Thinking",
+        "Age of Exploration and Trade", "Revolutions and Empires", "The Modern World"
+    ],
+    8: [
+        "Folklore of Latin America", "Food Choices", "The Holocaust", "Japanese American Internment",
+        "The Poetics and Power of Storytelling: What is the power of storytelling?",
+        "The Great War: How do literature and art illuminate the effects of World War I?", "What Is Love?",
+        "Teens as Change Agents: How do people effect social change?", "Harnessing Human Energy",
+        "Force and Motion", "Force and Motion Engineering", "Magnetic Fields", "Light Waves", "Earth, Moon, and Sun",
+        "Natural Selection", "Natural Selection Engineering", "Evolutionary History", "The World in Spatial Terms",
+        "Places and Regions", "Physical Geography", "Population Geography", "Economic Geography",
+        "Political Geography", "Human-Environment Geography", "What is Economics?", "Markets, Money, and Businesses",
+        "Government and the Economy", "The Global Economy"
+    ]
+]
+[END TOPICS]
+
+Here is the text:
+[BEGIN TEXT]
+
+Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.
+Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.
+His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.
+
+[END TEXT]
+'''
+llm_response = '''
+Background Knowledge Assumption: Students are likely familiar with basic geography (e.g., continents, oceans, and major regions like Asia and Europe) and the concept of trade. They are unlikely to be familiar with Marco Polo, the Mongol Dynasty, Kublai Khan, or the historical origins of pasta.'''
+
+[cases.marco_polo_grade3.prompt_steps.vocab_complexity]
+model = "gemini-2.5-pro"
+temperature = 0.0
+system_prompt = '''
+
+You are an expert curriculum designer. Your job is to rate the complexity of a text's vocabulary relative to the grade level.
+
+You will be given a rubric (with levels from least to most complex: slightly complex, moderately complex, very complex, exceedingly complex) as well as guidelines for interpreting the rubric.
+IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meaning, etc.)
+
+**Resource 1: Qualitative Text Complexity rubric (SAP)**
+1.  **Level 1: Slightly complex**
+    *   Original Definition: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. A very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0.
+    *   Summary definition: Overall, vocabulary is easy to understand and does not impede comprehension of the bulk of the text (including main idea and supporting claims). 1-2 quick pauses for processing by the student are ok here!
+2.  **Level 2: Moderately complex**
+    *   Original Definition: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK
+    *   Summary definition: Overall, vocabulary generally allows students to comprehend the bulk of the text with little difficulty, though there may be occasional pauses for clarification. Several quick pauses or occasional prolonged pauses may occur.
+3.  **Level 3: Very complex**
+    *   Original Definition: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic
+    *   Summary definition: Overall, vocabulary often presents challenges that may slow down comprehension but does not completely block the comprehension of the bulk of the text.
+4.  **Level 4: Exceedingly complex**
+    *   Original Definition: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading.
+    *   Summary definition: Overall, vocabulary is so complex that it makes comprehension of the bulk of the text very challenging and requires careful effort to interpret.
+
+**Resource 2: Flesch-Kincaid Grade Level**
+Use the Flesch-Kincaid (FK) Grade Level as light guidance of the approximate grade level based on readability. The metric alone does not provide final information of vocabulary complexity, but a ballpark of the difficulty of the entire text.
+*   grade 2-3: 1.98-5.34
+*   grade 4-5: 4.51-7.73
+*   grade 6-8: 6.51-10.34
+*   grade 9-10: 8.32-12.12
+*   grade 11-College: 10.34-14.2
+
+**Guidelines for Interpretation and Reasoning**
+
+Your reasoning is the most critical part of your analysis. It's not enough to simply count complex words. You must analyze their impact on a student at the specified grade level. Use the following principles to guide your judgment:
+
+1.  **Density and Cumulative Effect:** Do not just count complex words; evaluate their concentration. A short text with a high density of challenging Tier 2 words (e.g., `peculiar`, `mischievous`, `courageous` for a 4th grader) can be more overwhelming than a longer text with a few scattered Tier 3 words. A constant barrage of unfamiliar words can elevate complexity from `very` to `exceedingly`.
+2.  **Contextual Scaffolding:** Assess how the text supports new vocabulary.
+    *   Are new, complex terms explicitly defined or explained with simple examples (e.g., "volume... to see if it is big enough to hold a liter of food")?
+    *   Is the surrounding language simple and conversational, making the meaning of new words easier to infer?
+    *   Strong scaffolding can lower the complexity rating. A text with many Tier 3 words that are well-explained might only be `moderately complex`.
+3.  **Abstract vs. Concrete Vocabulary:** Differentiate between words for abstract concepts and words for concrete things. A text built on abstract Tier 2 words (e.g., `relationships`, `performance`, `non-physical`) can be more challenging than a text that introduces Tier 3 labels for concrete things or people (e.g., `Sumerians`, `polonium`).
+4.  **Conceptual Load:** Consider the cognitive load of the vocabulary. A list of many new, multi-syllabic, conceptually-heavy terms (e.g., `Paleolithic`, `Mesolithic`, `Neolithic` for a 3rd grader) can be `very complex` even if the terms are briefly defined, because the student must process multiple new concepts at once.
+5.  **Calibrating the Top Levels:** Be precise in your use of `very complex` vs. `exceedingly complex`.
+    *   **Very complex:** The vocabulary creates significant hurdles and slows the reader down, but the main ideas of the text are still accessible with effort.
+    *   **Exceedingly complex:** The vocabulary is so dense, technical, or abstract that it acts as a barrier, making it nearly impossible for the target student to grasp the bulk of the text's meaning without extensive outside help. Reserve this for texts saturated with advanced terminology.
+6.  **Consider Background Knowledge:** Pay close attention to the provided `student_background_knowledge`. Do not classify a word as complex if the student is likely to be familiar with it (e.g., 'oxygen' for a 3rd grader who has learned about the human body).
+
+**Final Analysis Format**
+
+Provide these information as your final analysis:
+1.  **Complex vocabulary:**
+    *   Tier 2 words: Words that are commonly used in academic settings and more complex than colloquial, or everyday language and often have multiple meanings.
+    *   Tier 3 words: Overly academic or domain-specific words.
+    *   Archaic words: Words, or uses of words that are not commonly used in modern conversational language. E.g., “The jury retired to deliberate on their verdict." The use of “retire” to mean withdrawing to a private place is an archaic use.
+    *   Other complex words: All other words that can increase complexity of the text (e.g., idioms, unfamiliar proper nouns that function as vocabulary).
+2.  **Vocabulary complexity:** one of: slightly complex, moderately complex, very complex, exceedingly complex
+3.  **Your reasoning of the complexity:** A detailed explanation of your rating, referencing the principles above.
+'''
+user_prompt = '''
+
+Below is the text you need to evaluate. Let's think step by step in order to predict the output of the vocabulary complexity task.
+
+- It is intended for grade 3.
+
+- You can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: Background Knowledge Assumption: Students are likely familiar with basic geography (e.g., continents, oceans, and major regions like Asia and Europe) and the concept of trade. They are unlikely to be familiar with Marco Polo, the Mongol Dynasty, Kublai Khan, or the historical origins of pasta.
+
+- Text Flesch-Kincaid grade level: 6.59
+
+- Text to evaluate: [BEGIN TEXT]
+
+Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.
+Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.
+His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.
+
+[END TEXT]
+
+STRICT OUTPUT FORMAT:
+- Return only the JSON value that conforms to the schema. Do not include any additional text, explanations, headings, or separators.
+- Do not wrap the JSON in Markdown or code fences (no ``` or ```json).
+- Do not prepend or append any text (e.g., do not write "Here is the JSON:").
+- The response must be a single top-level JSON value exactly as required by the schema (object/array/etc.), with no trailing commas or comments.
+
+The output should be formatted as a JSON instance that conforms to the JSON schema below.
+
+As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]} the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.
+
+Here is the output schema (shown in a code block for readability only — do not include any backticks or Markdown in your output):
+```
+{"properties": {"tier_2_words": {"description": "List of Tier 2 words", "title": "Tier 2 Words", "type": "string"}, "tier_3_words": {"description": "List of Tier 3 words", "title": "Tier 3 Words", "type": "string"}, "archaic_words": {"description": "List of Archaic words", "title": "Archaic Words", "type": "string"}, "other_complex_words": {"description": "List of Other Complex words", "title": "Other Complex Words", "type": "string"}, "complexity_score": {"description": "the complexity of the text, one of: slightly complex, moderately complex, very complex, or exceedingly complex", "title": "Complexity Score", "type": "string"}, "reasoning": {"description": "your reasoning for your answer", "title": "Reasoning", "type": "string"}}, "required": ["tier_2_words", "tier_3_words", "archaic_words", "other_complex_words", "complexity_score", "reasoning"]}
+```
+'''
+llm_response = '''
+```json
+{
+  "tier_2_words": "served, nevertheless, survived, scholars, described, treasures, wealthy",
+  "tier_3_words": "Mongol Dynasty, Kublai Khan, ivory, jade, porcelain, silk",
+  "archaic_words": "",
+  "other_complex_words": "Venice, Ayas, Tabriz, Kerman, Beijing, Far East, near East",
+  "complexity_score": "very complex",
+  "reasoning": "The vocabulary is rated as 'very complex' primarily due to the high density and conceptual load of unfamiliar proper nouns and domain-specific terms, presented with no contextual scaffolding. For a 3rd grader, the text introduces a large number of new names in rapid succession (Mongol Dynasty, Venice, Ayas, Tabriz, Kerman, Beijing, Kublai Khan) which function as challenging vocabulary. Additionally, the text lists specific, unfamiliar goods (ivory, jade, porcelain, silk) without any description. This barrage of new, undefined terms creates a significant cognitive load that would slow comprehension considerably, even if the student can grasp the basic gist of the story. While the main idea might be accessible, understanding the details of the journey and its significance is made very challenging by the vocabulary, aligning with the 'very complex' rating."
+}
+```'''
+
+[cases.marco_polo_grade3.expected_result]
+tier_2_words = '''
+served, nevertheless, survived, scholars, described, treasures, wealthy'''
+tier_3_words = '''
+Mongol Dynasty, Kublai Khan, ivory, jade, porcelain, silk'''
+archaic_words = ""
+other_complex_words = '''
+Venice, Ayas, Tabriz, Kerman, Beijing, Far East, near East'''
+complexity_score = '''
+very complex'''
+reasoning = '''
+The vocabulary is rated as 'very complex' primarily due to the high density and conceptual load of unfamiliar proper nouns and domain-specific terms, presented with no contextual scaffolding. For a 3rd grader, the text introduces a large number of new names in rapid succession (Mongol Dynasty, Venice, Ayas, Tabriz, Kerman, Beijing, Kublai Khan) which function as challenging vocabulary. Additionally, the text lists specific, unfamiliar goods (ivory, jade, porcelain, silk) without any description. This barrage of new, undefined terms creates a significant cognitive load that would slow comprehension considerably, even if the student can grasp the basic gist of the story. While the main idea might be accessible, understanding the details of the journey and its significance is made very challenging by the vocabulary, aligning with the 'very complex' rating.'''
+
+# ===========================================================================
+# Case 2: hurricanes_grade7  (grades 5–12 path)
+# ===========================================================================
+
+[cases.hurricanes_grade7]
+description = "Hurricane formation passage, grade 7 (grades 5-12 GPT path)"
+
+[cases.hurricanes_grade7.input]
+text = '''
+
+Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
+
+Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.
+'''
+grade_level = 7
+
+[cases.hurricanes_grade7.prompt_steps.background_knowledge]
+model = "gpt-4o-2024-11-20"
+temperature = 0.0
+system_prompt = ""
+user_prompt = '''
+
+Review the following text, which is an educational text written for students in the following grade band: 7.
+
+Your job is to give me a background knowledge assumption; that is: what topics, if any, from the text students are likely to be familiar with based on a standard progression of topics in US public school education, as well as topics, if any the student is not likely to be familiar with.
+
+Make sure your response is concise (between 1 - 3 lines max) and is about the topics themselves, not about any other aspect of the text (e.g. flowery language, complicated sentence structure, etc.).
+
+Here's an example:
+[START EXAMPLE]
+Grade Band: 11th
+Text: I went to the woods because I wished to live deliberately, to front only the essential facts of life, and see if I could not
+learn what it had to teach, and not, when I came to die, discover that I had not lived. I did not wish to live what was
+not life, living is so dear; nor did I wish to practise resignation, unless it was quite necessary. I wanted to live deep and suck out all the marrow of life, to live so sturdily and Spartan-like as to put to rout all that was not life, to cut a broad swath and shave close, to drive life into a corner, and reduce it to its lowest terms, and, if it proved to be mean, why then to get the whole and genuine meanness of it, and publish its meanness to the world; or if it were sublime, to
+know it by experience, and be able to give a true account of it in my next excursion. For most men, it appears to me,
+are in a strange uncertainty about it, whether it is of the devil or of God, and have somewhat hastily concluded that it
+is the chief end of man here to “glorify God and enjoy him forever.”
+
+Background Knowledge Assumption: Assume they’ve studied American Transcendentalists like Thoreau and Emerson, including the mid-19th-century context of nature-focused philosophy.
+[END EXAMPLE]
+
+You should assume that the student is an average US public school who is learning from common core curriculum. When you respond, just respond with the background knowledge assumption and nothing else.
+
+You can use the following list of topics that we know are covered for each grade level, although use your best judgement if you know there are other topics out there that students are likely to have covered. And this doesn't cover higher grade levels, so you'll have to again use your judgement for, say, what background knowledge a 9th grader is likely to have:
+[BEGIN TOPICS]
+[
+    K: [
+        "Toys and Play", "Weather Wonders", "Trees are Alive", "Enjoying and Appreciating Trees",
+        "The Five Senses: How do our senses help us learn?", "Once Upon a Farm: What makes a good story?",
+        "America, Then and Now: How has life in America changed over time?", "The Continents: What makes the world fascinating?",
+        "Needs of Plants and Animals", "Pushes and Pulls", "Sunlight and Weather", "Learning and Working Together",
+        "How Do People Learn and Work Together?", "Where Do We Live?", "What Does it Mean to Be an American?",
+        "How Has Our World Changed?", "Why Do People Have Jobs?"
+    ],
+    1: [
+        "Tools and Work", "A Study of the Sun, Moon, and Stars", "Birds' Amazing Bodies", "Caring for Birds",
+        "A World of Books: How do books change lives around the world?", "Creature Features: What can we discover about animals’ unique features?",
+        "Powerful Forces: How do people respond to the powerful force of the wind?", "Cinderella Stories: Why do people around the world admire Cinderella?",
+        "Animal and Plant Defenses", "Light and Sounds", "Spinning Earth", "Our Place in the World",
+        "What Are the Rights and Responsibilities of Citizens?", "How Can We Describe Where We Live?",
+        "How Do We Celebrate Our Country?", "How Does the Past Shape Our Lives?", "Why Do People Work?"
+    ],
+    2: [
+        "Schools and Community", "Fossils Tell of Earth's Changes", "The Secret World of Pollination", "Providing for Pollinators",
+        "A Season of Change: How does change impact people and nature?", "The American West: What was life like in the West for early Americans?",
+        "Civil Rights Heroes: How can people respond to injustice?", "Good Eating: How does food nourish us?",
+        "Plant and Animal Relationships", "Properties of Matter", "Changing Landforms", "Exploring Who We Are",
+        "Why Is It Important to Learn About the Past?", "How Does Geography Help Us Understand Our World?",
+        "How Do We Get What We Want and Need?", "Why Do We Need Government?", "How Can People Make a Difference in Our World?"
+    ],
+    "3": [
+        "Overcoming Learning Challenges Near and Far", "Adaptations and the Wide World of Frogs", "Exploring Literary Classics",
+        "Water Around the World", "Ocean/Sea Exploration", "Outer Space", "Immigration", "Art/Being an Artist",
+        "Balancing Forces", "Inheritance and Traits", "Environments and Survival", "Weather and Climate",
+        "Communities", "Why Does It Matter Where We Live?", "What Is Our Relationship With Our Environment?",
+        "What Makes a Community Unique?", "How Does the Past Impact the Present?", "Why Do Governments and Citizens Need Each Other?",
+        "How Do People in a Community Meet Their Wants and Needs?"
+    ],
+    4: [
+        "Poetry", "Animal Defense Mechanisms", "The American Revolution",
+        "Responding to Inequality: Ratifying the 19th Amendment (covers gender and racial inequality)",
+        "A Great Heart: What does it mean to have a great heart, literally and figuratively?",
+        "Extreme Settings: How does a challenging setting or physical environment change a person?",
+        "American Revolution/Multiple Perspectives", "Myths/Myth Making", "Energy Conversions", "Vision and Light",
+        "Earth's Features", "Waves, Energy, and Information", "Regions of the United States",
+        "How Does America Use Its Strengths and Face Its Challenges?", "Why Have People Moved to and From the Northeast?",
+        "How Has the Southeast Changed Over Time?", "How Does the Midwest Reflect the Spirit of America?",
+        "How Does the Southwest Reflect Its Diverse Past and Unique Environment?", "What Draws People to the West?"
+    ],
+    5: [
+        "Human Rights", "Biodiversity in the Rainforest", "Athlete Leaders of Social Change",
+        "Impact of Natural Disasters", "Cultures in Conflict: How do cultural beliefs and values guide people?",
+        "Word Play: How and why do writers play with words?", "A War Between Us: How did the Civil War impact people?",
+        "Breaking Barriers: How can sports influence individuals and societies?", "Patterns of Earth and Sky",
+        "Modeling Matter", "The Earth System", "Ecosystem Restoration", "U.S. History: Making a New Nation",
+        "How Were the Lives of Native Peoples Influenced by Where They Lived?",
+        "What Happened When Diverse Cultures Crossed Paths?", "What Is the Impact of People Settling in a New Place?",
+        "Why Would a Nation Want to Become Independent?", "What Does the Revolutionary Era Tell Us About Our Nation Today?",
+        "How Does the Constitution Help Us Understand What It Means to Be an American?",
+        "What Do the Early Years of the United States Reveal About the Character of the Nation?",
+        "What Was the Effect of the Civil War on U.S. Society?"
+    ],
+    6: [
+        "Greek Mythology", "Critical Problems and Design Solutions", "American Indian Boarding Schools",
+        "Remarkable Accomplishments in Space Science", "Resilience in the Great Depression: How can enduring tremendous hardship contribute to personal transformation?",
+        "A Hero’s Journey: What is the significance and power of the hero’s journey?",
+        "Narrating the Unknown: How did the social and environmental factors in the unknown world of Jamestown shape its development and decline?",
+        "Courage in Crisis: How can the challenges of a hostile environment inspire heroism?",
+        "Microbiome", "Metabolism", "Metabolism Engineering", "Traits and Reproduction", "Thermal Energy",
+        "Ocean, Atmosphere, and Climate", "Weather Patterns", "Earth's Changing Climate",
+        "Earth's Changing Climate: Engineering Internship", "The First Americans (up to 1492)",
+        "Exploration and Colonization", "English Colonies", "American Revolution", "First Governments and the Constitution",
+        "The Early American Republic", "Political and Geographic Changes (1828-1850)", "Life in the North and South (1820-1860)",
+        "Division and Civil War (1821-1865)", "Reconstruction (1865-1896)", "The West (1858-1896)",
+        "New Industry and a Changing Society", "Expansion and War", "The 1920s and 1930s", "World War II",
+        "The Cold War", "Civil Rights and American Society", "America Since the 1970s"
+    ],
+    7: [
+        "The Lost Children of Sudan (Genocide, Genocide in Sudan)", "Epidemics", "Harlem Renaissance", "Plastic Pollution",
+        "Identity in the Middle Ages: How does society both support and limit the development of identity?",
+        "Americans All: How did World War II affect individuals?", "Language and Power: What is the power of language?",
+        "Fever: How can times of crisis affect citizens and society?", "Geology on Mars", "Plane Motion", "Plane Motion Engineering",
+        "Rock Formations", "Phase Change", "Phase Change Engineering", "Chemical Reactions", "Populations and Resources",
+        "Matter and Energy in Ecosystems", "Early Humans and Agricultural Revolution", "Fertile Crescent",
+        "Ancient Egypt and Kush", "The Israelites", "Ancient Greece", "Ancient South Asia", "Early China, Korea, and Japan",
+        "Ancient Rome", "Rise of Christian Kingdoms", "The Americas", "Medieval Europe", "The Rise of Islamic Empires",
+        "China in the Middle Ages", "Korea and Japan in the Middle Ages", "African Civilizations", "New Ways of Thinking",
+        "Age of Exploration and Trade", "Revolutions and Empires", "The Modern World"
+    ],
+    8: [
+        "Folklore of Latin America", "Food Choices", "The Holocaust", "Japanese American Internment",
+        "The Poetics and Power of Storytelling: What is the power of storytelling?",
+        "The Great War: How do literature and art illuminate the effects of World War I?", "What Is Love?",
+        "Teens as Change Agents: How do people effect social change?", "Harnessing Human Energy",
+        "Force and Motion", "Force and Motion Engineering", "Magnetic Fields", "Light Waves", "Earth, Moon, and Sun",
+        "Natural Selection", "Natural Selection Engineering", "Evolutionary History", "The World in Spatial Terms",
+        "Places and Regions", "Physical Geography", "Population Geography", "Economic Geography",
+        "Political Geography", "Human-Environment Geography", "What is Economics?", "Markets, Money, and Businesses",
+        "Government and the Economy", "The Global Economy"
+    ]
+]
+[END TOPICS]
+
+Here is the text:
+[BEGIN TEXT]
+
+Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
+
+Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.
+
+[END TEXT]
+'''
+llm_response = '''
+Background Knowledge Assumption: Students are likely familiar with basic weather concepts, including storms, atmospheric pressure, and the water cycle, but may not yet have detailed knowledge of hurricanes or the specific mechanisms of their formation.'''
+
+[cases.hurricanes_grade7.prompt_steps.vocab_complexity]
+model = "gpt-4.1"
+temperature = 0.0
+system_prompt = '''
+
+You are an expert curriculum designer. Your job involves reading text snippets intended for students in K-12 and evaluating the complexity of the vocabulary in each text.
+
+You will be given a rubric (with options 1, 2, 3, 4) as well as guidelines for interpreting the rubric.
+
+IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meainng, etc.)
+IMPORTANT: Rely on the supplied rubric and annotation guidelines along. Do not introduce any new crtieria for evaluating the complexity of a text's vocabulary.
+
+Please first reason out loud about the vocabulary complexity of the text and then provide an answer between 1 and 4 (whole numbers only). Provide the answer as an integer (not a float).
+'''
+user_prompt = '''
+
+Your job is to rate the complexity of a text's vocabulary (relative to the intended level of the text) according to a rubric and annotation guide. Stick to the rubric and annotation guide exactly — do not introduce any additional criteria or lenses for judging the complexity of the text.
+
+[BEGIN ANNOTATION GUIDE AND RUBRIC]
+Instructions
+For the following task, please assume that:
+    - The student is on grade level and proficient in all core content areas, including reading fluency, comprehension, science, & social studies. (example).
+    - The student is moving through a common progression of topics (detailed here).
+    - The student is fluent in speaking English.
+    - The student has an "average" amount of background knowledge on topics not commonly covered in curriculum.
+    - The student will use this material for independent reading/work, without direct instruction.
+    - The text is reasonable for the given grade level.
+
+Please do not consider the presence of figurative language when scoring Vocabulary. For example: with a phrase like "kicked the bucket," consider only the qualities of the words themselves ("kicked", "the" and "bucket").
+
+Please do be sure to consider:
+- all of the different types of vocabulary (listed below)
+- the overall proportion of complex words in the text - including repeated complex words.
+- the resulting holistic complexity of the vocabulary (described in the Summary section below).
+
+Level 1:
+Rubric: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. That said, a very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0.
+
+Level 2:
+Rubric: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK, but if it's very low, the text is probably level 1.
+
+Level 3:
+Rubric: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic
+
+Level 4:
+Rubric: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading
+
+And here are some relevant definitions:
+    - Conversational: Everyday language.
+    - Familiar: Words that the student is likely to have seen/heard, from everyday life or their curriculum. Reminder: assume an "average" level of background knowledge.
+    - Unfamiliar: Words the student has probably not heard, or are being used in an unfamiliar way.
+        - For ex: 4th graders are familiar with the word "table" but may not be familiar with the use of the word with respect to data ("a table of data").
+        - Note:
+            - Words with in-line definitions (via appositives, or because they can be easily inferred from other parts of the text) should be evaluated as less unfamiliar.
+            - For ex: "The pharaoh, a powerful ruler of ancient Egypt, was buried in a grand tomb."
+                - The word "pharaoh" might be unfamiliar or subject-specific, but since is defined within the text, you can consider it a more familiar word.
+        - Unfamiliar proper nouns:
+            - A person's name, even if unfamiliar, generally does not add to complexity.
+            - Other unfamiliar proper nouns (eg locations, organizations) do add to complexity.
+
+- Subject-specific: Words that are specific to a subject or field of study that are essential for understanding concepts and engaging with the content.
+- Overly-academic: Words that are excessively formal, complex, or specialized.
+    - For ex: "The agrarian societal structure of the Neolithic Revolution precipitated a paradigm shift in agriculture"
+- Archaic: A word that was common in the past but is now rarely/almost never used. Could also be a word used in an archaic way.
+    - For ex: "After a long day of court proceedings, the jury 'retired' to deliberate on their verdict."
+        - The word "retire" meaning to stop working may be familiar to a student, but "retire" meaning "withdrawing to a private place" is an archaic use.
+
+
+Examples
+The student is on-grade-level:
+- Consider a 6th grade passage about earth systems. Per NGSS standards, students are introduced to earth systems starting in 2nd grade. They encounter words like: wind, water, river, lake, solids, and liquids. For our rating purposes, we would assume most students following 2nd have encountered these words. In 5th grade, they dive more fully into earth systems concepts, learning vocabulary words like geosphere, sediment, biosphere, atmosphere, ecosystems, organisms and climate. While rating, we would consider the words listed in the NGSS standards as more familiar following that grade level.  If the same passage were intended for 3rd graders, though, then the subject-specific vocabulary is likely to be unfamiliar.
+
+Figurative Language
+- Kicked the bucket.
+- The pen is mightier than the sword.
+- The classroom was a zoo.
+- He ran faster than the speed of light.
+[END ANNOTATION GUIDE AND RUBRIC]
+
+Here are a couple examples of texts that have already been scored along with justification for their scores, which you can use as exemplars:
+[BEGIN EXAMPLES]
+
+*** EXAMPLE 1 ***
+The following text was intended for grade level 11 and received a complexity level of 1.
+
+Here is the background knowledge assumption for that text: N/A
+
+Here is the text:
+// START TEXT //
+"In a recent lecture, "Is Nothing Sacred?", Salman Rushdie, one of the most censored authors of our time, talked about the importance of books. He grew up in a household in India where books were as sacred as bread. If anyone in the household dropped a piece of bread or a book, the person not only picked it up, but also kissed the object by way of apologizing for clumsy disrespect.
+
+He goes on to say that he had kissed many books before he had kissed a girl. Bread and books were for his household, and for many like his, food for the body and the soul. This image of the kissing of the book one had accidentally dropped made an impression on me. It speaks to the love and respect many people have for them.
+
+I grew up in a small town in New Mexico, and we had very few books in our household. The first one I remember reading was my catechism book. Before I went to school to learn English, my mother taught me catechism in Spanish.
+
+I remember the questions and answers I had to learn, and I remember the well-thumbed, frayed volume which was sacred to me.
+
+Growing up with few books in the house created in me a desire and a need for them. When I started school, I remember visiting the one room library of our town and standing in front of the dusty shelves. In reality there were only a few shelves and not over a thousand books, but I wanted to read them all. There was food for my soul in the books, that much I realized."
+// END TEXT //
+
+Here is the reasoning for that complexity level:
+// START REASONING //
+This text is a 1 for vocabulary, because the vocabulary that is used is familiar and accessible for a proficient 11th grader. Most of the words used in the text are very common everyday vocabulary for describing growing up, family life, and the importance of reading. A few examples of these very common words are: small town, book, school, learn, food, kissed, image, respect, love, speaks. There are many more in the text. In this text there are only a few "juicier" or more complex words, you can think of those as words that are less familiar, have a more abstract or nuanced meaning, or carry a very large concept. Less commonly spoken words that were used in the text were: frayed, volume, censored, clumsy, sacred. These are still well within reach of a proficient 11th grader, and would still be considered familiar, because they will have encountered them in past reading or academic studies. In the text there are a couple of words that are outliers, but they are not essential to the understanding of the larger text. One of these words or hyphenated compound phrase is well-frayed. A compound phrase is a phrase consisting of multiple words that work together to create a specific meaning or idea, often acting as a single unit in a sentence. If the meaning of individual words is familiar, it is typically quite easy for proficient readers to generalize the larger meaning that the author is implying with their word choice. In this case, proficient students will be accustomed to the phrase well, with the secondary meaning of very, rather than a description of positivity or health; and they will be accustomed to the use frayed, as in worn, aged, or damaged from use. Making the leap to identify the meaning of "well-frayed" as a book that is very used, will take only moments for a proficient 11th grader. 
Another word that stands out in the text is the word catechism, which might be new for many students based on their personal background or location, but a full understanding of what a catechism book contains is not essential for understanding the paragraph or whole text. The reader can make it through using minimum context clues to know that the catechism must be something important to his family. The type of book he learned to read before going to school is not critical for comprehension, it's enough to understand that reading was so important in his family, his mother started instruction before he even started school. Additionally, it's important to know that having one unknown word for an 11th grade reading, does not merit a rating higher than one.
+
+It is worth noting that another reason this text is a 1, is that the content or topic of the passage is so familiar and covered extensively in K-12 education, i.e. reading is important, loving books, growing up; that coupled with the simple vocabulary choices, getting to the meaning of the overall text, and even the paragraphs, would be incredibly easy for a proficient 11th grader.
+// END REASONING //
+*** EXAMPLE 2 ***
+The following text was intended for grade level 5 and received a complexity level of 2.
+
+Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with the concept of natural disasters, including hurricanes, and basic atmospheric concepts like high and low pressure from their studies on weather and climate. They may not be familiar with the specific formation processes of hurricanes or the global terminology differences (hurricane, typhoon, cyclone).
+
+Here is the text:
+// START TEXT //
+Great whirling storms roar out of the oceans in many parts of the world. They are called by several names—hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
+
+Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.
+// END TEXT //
+
+Here is the reasoning for that complexity level:
+// START REASONING //
+I scored this a 2 because of the density of subject-specific vocabulary related to weather and climate, which is often covered in lower grade levels. This adds to the complexity above a 1, but it is not a level 3 because of the familiarity with the topic, which implies some familiarity with the vocabulary as well. The specific formation process and the vocabulary used to explain the processes are also subject-specfiic but not famliar, which would make the second paragraph a level 3 in the rubric language, but when considering the language used in the overall SUMMARY below the rubric, this new content and vocabulary would cause quick pauses and/or occasional prolonged pauses but would not cause the reader to slow down to due to challenging overall comprehension of the key ideas and supporting claims. This is especially the case because the second paragraph builds upon prior knowledge and familiar vocabulary use, so it is not entirely new information and vocabulary. While there is subject-specific vocabulary used, overly academic vocabulary is NOT used and is more conversational in nature, such as "great whiring storms" and "born" / "giving birth" to storm  (although this is the way storms are described!) rather than more technical terms which made comprehension easier due to the accessibility of the vocabulary (even if used in other contexts before reading this text). Words such as "a lot" and "bigger" are more conversational, and while technical, unfamiliar words are provided, such as "hurricane," "typhoon," and "cyclone," knowing and understanding their differences is not necessary to grasp the main idea. The processes by which they are formed are what need to be retained while reading the entire text, and familiarity with the bulk of the vocabulary used would allow for that to happen without too much struggle to make meaning of it. 
Additionally, the text does not contain any archaic vocabulary or ambiguous words, which prevents it from reaching a rating of 4, although it is not necessary that they text have such vocabulary to meet a level 4, the frequent inclusion of such vocabulary makes it more likely to land at least a 3 or 4.
+// END REASONING //
+
+*** EXAMPLE 3 ***
+The following text was intended for grade level 6 and received a complexity level of 3.
+
+Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with basic Earth science concepts such as rocks, minerals, and fossils, as well as natural processes like volcanic eruptions and earthquakes. They may not be familiar with more advanced topics like plate tectonics or the specific branches of geology such as mineralogy, petrology, and seismology.
+
+Here is the text:
+// START TEXT //
+Geology is the scientific study of Earth. Geologists study the planet—its formation, its internal structure, its materials, its chemical and physical processes, and its history. Mountains, valleys, plains, sea floors, minerals, rocks, fossils, and the processes that create and destroy each of these are all the domain of the geologist. Geology is divided into two broad categories of study: physical geology and historical geology.
+
+Physical geology is concerned with the processes occurring on or below the surface of Earth and the materials on which they operate. These processes include volcanic eruptions, landslides, earthquakes, and floods. Materials include rocks, air, seawater, soils, and sediment. Physical geology further divides into more specific branches, each of which deals with its own part of Earth's materials, landforms, and processes. Mineralogy and petrology investigate the composition and origin of minerals and rocks. Volcanologists study lava, rocks, and gases on live, dormant, and extinct volcanoes. Seismologists use instruments to monitor and predict earthquakes and volcanic eruptions.
+
+Historical geology is concerned with the chronology of events, both physical and biological, that have taken place in Earth's history. Paleontologists study fossils (remains of ancient life) for evidence of the evolution of life on Earth. Fossils not only relate evolution, but also speak of the environment in which the organism lived. Corals in rocks at the top of the Grand Canyon in Arizona, for example, show a shallow sea flooded the area around 290 million years ago. In addition, by determining the ages and types of rocks around the world, geologists piece together continental and oceanic history over the past few billion years. Plate tectonics (the study of the movement of the sections of Earth's crust) adds to Earth's story with details of the changing configuration of the continents and oceans.
+// END TEXT //
+
+Here is the reasoning for that complexity level:
+// START REASONING //
+To determine the complexity rating of this text based on the vocabulary present, I used the annotation guide, scoring rubric, and examples to set the expectations for rating. During the first read of the text, I "bolded" and categorized the more challenging vocabulary words according to the following complexity groupings: archaic, unfamiliar, archaic, subject-specific, and/or overly academic. On the second read, I considered the main idea or "gist" that students need to acquire understanding of. I then referenced the previously mentioned tools–annotation guide, scoring rubric, and examples to remind myself of the expectations for rating.  I agreed that readers would have familiarity with basic concepts of geology; however, I also considered the definitions provided for words such as Geology, Geologists, Physical Geology, Historical Geology, Mineralogy, and Petrology. I considered how students might pause for clarification and for how long. After reviewing the Annotation Guide while considering, I narrowed the rating down because the definitions provided throughout the text of more complex words should make the meaning of the text more accessible for readers, which is why although the words are subject-specific, I rated this text as a 3 instead of a 2-less complex or a 4–more complex. I read the text one final time to ensure clarity around my rating, scored and wrote the justification.
+// END REASONING //
+[END EXAMPLES]
+
+Below is the text you need to evaluate. It is intended for grade 7.
+
+As you read the text, you can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: Background Knowledge Assumption: Students are likely familiar with basic weather concepts, including storms, atmospheric pressure, and the water cycle, but may not yet have detailed knowledge of hurricanes or the specific mechanisms of their formation.
+
+[BEGIN TEXT]
+
+Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
+
+Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.
+
+[END TEXT]
+
+In your response, when specifying the level of complexity, be sure to use only a single integer (e.g. 2) and don't include any other text (e.g. don't say "level 2").
+
+STRICT OUTPUT FORMAT:
+- Return only the JSON value that conforms to the schema. Do not include any additional text, explanations, headings, or separators.
+- Do not wrap the JSON in Markdown or code fences (no ``` or ```json).
+- Do not prepend or append any text (e.g., do not write "Here is the JSON:").
+- The response must be a single top-level JSON value exactly as required by the schema (object/array/etc.), with no trailing commas or comments.
+
+The output should be formatted as a JSON instance that conforms to the JSON schema below.
+
+As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]} the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.
+
+Here is the output schema (shown in a code block for readability only — do not include any backticks or Markdown in your output):
+```
+{"properties": {"tier_2_words": {"description": "List of Tier 2 words", "title": "Tier 2 Words", "type": "string"}, "tier_3_words": {"description": "List of Tier 3 words", "title": "Tier 3 Words", "type": "string"}, "archaic_words": {"description": "List of Archaic words", "title": "Archaic Words", "type": "string"}, "other_complex_words": {"description": "List of Other Complex words", "title": "Other Complex Words", "type": "string"}, "complexity_score": {"description": "the complexity of the text, one of: slightly complex, moderately complex, very complex, or exceedingly complex", "title": "Complexity Score", "type": "string"}, "reasoning": {"description": "your reasoning for your answer", "title": "Reasoning", "type": "string"}}, "required": ["tier_2_words", "tier_3_words", "archaic_words", "other_complex_words", "complexity_score", "reasoning"]}
+```
+'''
+llm_response = '''
+{"tier_2_words": "roar, fury, envelope, damage, match, mightiest, rising, sinking, presses, surrounds, develop, feeding, cover", "tier_3_words": "hurricane, typhoon, cyclone, atmosphere, high pressure, low pressure, tropical, moist, ashore", "archaic_words": "", "other_complex_words": "", "complexity_score": "slightly complex", "reasoning": "The vocabulary in this text is mostly familiar and conversational for a 7th grader with the assumed background knowledge. Words like 'roar,' 'damage,' 'develop,' and 'feeding' are Tier 2 but are common in both everyday and academic contexts. The subject-specific (Tier 3) words include 'hurricane,' 'typhoon,' 'cyclone,' 'atmosphere,' 'high pressure,' 'low pressure,' and 'tropical.' However, these are either defined in context or are likely to be familiar due to prior exposure in science classes. There are no archaic or overly academic words. The proportion of subject-specific vocabulary is low to moderate, but not enough to make the text 'often complex.' The overall vocabulary is accessible, with only a few terms that might require brief pauses for clarification. Therefore, the text fits best at Level 2: mostly not complex, with a low proportion of subject-specific words."}'''
+
+[cases.hurricanes_grade7.expected_result]
+tier_2_words = '''
+roar, fury, envelope, damage, match, mightiest, rising, sinking, presses, surrounds, develop, feeding, cover'''
+tier_3_words = '''
+hurricane, typhoon, cyclone, atmosphere, high pressure, low pressure, tropical, moist, ashore'''
+archaic_words = ""
+other_complex_words = ""
+complexity_score = '''
+slightly complex'''
+reasoning = '''
+The vocabulary in this text is mostly familiar and conversational for a 7th grader with the assumed background knowledge. Words like 'roar,' 'damage,' 'develop,' and 'feeding' are Tier 2 but are common in both everyday and academic contexts. The subject-specific (Tier 3) words include 'hurricane,' 'typhoon,' 'cyclone,' 'atmosphere,' 'high pressure,' 'low pressure,' and 'tropical.' However, these are either defined in context or are likely to be familiar due to prior exposure in science classes. There are no archaic or overly academic words. The proportion of subject-specific vocabulary is low to moderate, but not enough to make the text 'often complex.' The overall vocabulary is accessible, with only a few terms that might require brief pauses for clarification. Therefore, the text fits best at Level 2: mostly not complex, with a low proportion of subject-specific words.'''
diff --git a/sdks/settings/vocabulary/settings.toml b/sdks/settings/vocabulary/settings.toml
new file mode 100644
index 0000000..6ea3ca9
--- /dev/null
+++ b/sdks/settings/vocabulary/settings.toml
@@ -0,0 +1,410 @@
+[evaluator_metadata]
+id = "vocabulary"
+version = "0.1"
+name = "Vocabulary"
+description = "Vocabulary Complexity Evaluator"
+maturity = "alpha"
+
+[[evaluator_metadata.inputs]]
+name = "text"
+type = "TextInputField"
+description = "The text to evaluate for vocabulary complexity."
+
+[[evaluator_metadata.inputs]]
+name = "grade_level"
+type = "GradeInputField"
+description = "The grade level of the text (3–12)."
+allowed_grades = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+
+# ── Prompt templates ───────────────────────────────────────────────────────────
+#
+# Five prompt keys, in three groups, are used by the evaluator:
+#   background_knowledge_prompt  – step 1 (all grades): produces a plain-text
+#                                  student background knowledge assumption.
+#   vocab_grades_3_4_system_prompt / vocab_grades_3_4_user_prompt  – step 2 for
+#                                  grades 3–4: rubric-based, returns a string label
+#                                  and word-level breakdown.
+#   vocab_other_grades_system_prompt / vocab_other_grades_user_prompt  – step 2
+#                                  for grades 5–12: annotation-guide-based, returns
+#                                  a string complexity_score and reasoning.
+#
+# These prompts are the source of truth for what the evaluator sends to the LLM.
+# They match the corresponding prompts in evals/prompts/vocab_prompts.py.
+# If you update a prompt here, update the notebook file too (or vice versa).
+
+[prompts]
+background_knowledge_prompt = """
+Review the following text, which is an educational text written for students in the following grade band: {grade}.
+
+Your job is to give me a background knowledge assumption; that is: what topics, if any, from the text students are likely to be familiar with based on a standard progression of topics in US public school education, as well as topics, if any, the student is not likely to be familiar with.
+
+Make sure your response is concise (between 1 - 3 lines max) and is about the topics themselves, not about any other aspect of the text (e.g. flowery language, complicated sentence structure, etc.).
+
+Here's an example:
+[START EXAMPLE]
+Grade Band: 11th
+Text: I went to the woods because I wished to live deliberately, to front only the essential facts of life, and see if I could not
+learn what it had to teach, and not, when I came to die, discover that I had not lived. I did not wish to live what was
+not life, living is so dear; nor did I wish to practise resignation, unless it was quite necessary. I wanted to live deep and suck out all the marrow of life, to live so sturdily and Spartan-like as to put to rout all that was not life, to cut a broad swath and shave close, to drive life into a corner, and reduce it to its lowest terms, and, if it proved to be mean, why then to get the whole and genuine meanness of it, and publish its meanness to the world; or if it were sublime, to
+know it by experience, and be able to give a true account of it in my next excursion. For most men, it appears to me,
+are in a strange uncertainty about it, whether it is of the devil or of God, and have somewhat hastily concluded that it
+is the chief end of man here to “glorify God and enjoy him forever.”
+
+Background Knowledge Assumption: Assume they’ve studied American Transcendentalists like Thoreau and Emerson, including the mid-19th-century context of nature-focused philosophy.
+[END EXAMPLE]
+
+You should assume that the student is an average US public school student who is learning from common core curriculum. When you respond, just respond with the background knowledge assumption and nothing else.
+
+You can use the following list of topics that we know are covered for each grade level, although use your best judgement if you know there are other topics out there that students are likely to have covered. And this doesn't cover higher grade levels, so you'll have to again use your judgement for, say, what background knowledge a 9th grader is likely to have:
+[BEGIN TOPICS]
+[
+    K: [
+        "Toys and Play", "Weather Wonders", "Trees are Alive", "Enjoying and Appreciating Trees",
+        "The Five Senses: How do our senses help us learn?", "Once Upon a Farm: What makes a good story?",
+        "America, Then and Now: How has life in America changed over time?", "The Continents: What makes the world fascinating?",
+        "Needs of Plants and Animals", "Pushes and Pulls", "Sunlight and Weather", "Learning and Working Together",
+        "How Do People Learn and Work Together?", "Where Do We Live?", "What Does it Mean to Be an American?",
+        "How Has Our World Changed?", "Why Do People Have Jobs?"
+    ],
+    1: [
+        "Tools and Work", "A Study of the Sun, Moon, and Stars", "Birds' Amazing Bodies", "Caring for Birds",
+        "A World of Books: How do books change lives around the world?", "Creature Features: What can we discover about animals’ unique features?",
+        "Powerful Forces: How do people respond to the powerful force of the wind?", "Cinderella Stories: Why do people around the world admire Cinderella?",
+        "Animal and Plant Defenses", "Light and Sounds", "Spinning Earth", "Our Place in the World",
+        "What Are the Rights and Responsibilities of Citizens?", "How Can We Describe Where We Live?",
+        "How Do We Celebrate Our Country?", "How Does the Past Shape Our Lives?", "Why Do People Work?"
+    ],
+    2: [
+        "Schools and Community", "Fossils Tell of Earth's Changes", "The Secret World of Pollination", "Providing for Pollinators",
+        "A Season of Change: How does change impact people and nature?", "The American West: What was life like in the West for early Americans?",
+        "Civil Rights Heroes: How can people respond to injustice?", "Good Eating: How does food nourish us?",
+        "Plant and Animal Relationships", "Properties of Matter", "Changing Landforms", "Exploring Who We Are",
+        "Why Is It Important to Learn About the Past?", "How Does Geography Help Us Understand Our World?",
+        "How Do We Get What We Want and Need?", "Why Do We Need Government?", "How Can People Make a Difference in Our World?"
+    ],
+    3: [
+        "Overcoming Learning Challenges Near and Far", "Adaptations and the Wide World of Frogs", "Exploring Literary Classics",
+        "Water Around the World", "Ocean/Sea Exploration", "Outer Space", "Immigration", "Art/Being an Artist",
+        "Balancing Forces", "Inheritance and Traits", "Environments and Survival", "Weather and Climate",
+        "Communities", "Why Does It Matter Where We Live?", "What Is Our Relationship With Our Environment?",
+        "What Makes a Community Unique?", "How Does the Past Impact the Present?", "Why Do Governments and Citizens Need Each Other?",
+        "How Do People in a Community Meet Their Wants and Needs?"
+    ],
+    4: [
+        "Poetry", "Animal Defense Mechanisms", "The American Revolution",
+        "Responding to Inequality: Ratifying the 19th Amendment (covers gender and racial inequality)",
+        "A Great Heart: What does it mean to have a great heart, literally and figuratively?",
+        "Extreme Settings: How does a challenging setting or physical environment change a person?",
+        "American Revolution/Multiple Perspectives", "Myths/Myth Making", "Energy Conversions", "Vision and Light",
+        "Earth's Features", "Waves, Energy, and Information", "Regions of the United States",
+        "How Does America Use Its Strengths and Face Its Challenges?", "Why Have People Moved to and From the Northeast?",
+        "How Has the Southeast Changed Over Time?", "How Does the Midwest Reflect the Spirit of America?",
+        "How Does the Southwest Reflect Its Diverse Past and Unique Environment?", "What Draws People to the West?"
+    ],
+    5: [
+        "Human Rights", "Biodiversity in the Rainforest", "Athlete Leaders of Social Change",
+        "Impact of Natural Disasters", "Cultures in Conflict: How do cultural beliefs and values guide people?",
+        "Word Play: How and why do writers play with words?", "A War Between Us: How did the Civil War impact people?",
+        "Breaking Barriers: How can sports influence individuals and societies?", "Patterns of Earth and Sky",
+        "Modeling Matter", "The Earth System", "Ecosystem Restoration", "U.S. History: Making a New Nation",
+        "How Were the Lives of Native Peoples Influenced by Where They Lived?",
+        "What Happened When Diverse Cultures Crossed Paths?", "What Is the Impact of People Settling in a New Place?",
+        "Why Would a Nation Want to Become Independent?", "What Does the Revolutionary Era Tell Us About Our Nation Today?",
+        "How Does the Constitution Help Us Understand What It Means to Be an American?",
+        "What Do the Early Years of the United States Reveal About the Character of the Nation?",
+        "What Was the Effect of the Civil War on U.S. Society?"
+    ],
+    6: [
+        "Greek Mythology", "Critical Problems and Design Solutions", "American Indian Boarding Schools",
+        "Remarkable Accomplishments in Space Science", "Resilience in the Great Depression: How can enduring tremendous hardship contribute to personal transformation?",
+        "A Hero’s Journey: What is the significance and power of the hero’s journey?",
+        "Narrating the Unknown: How did the social and environmental factors in the unknown world of Jamestown shape its development and decline?",
+        "Courage in Crisis: How can the challenges of a hostile environment inspire heroism?",
+        "Microbiome", "Metabolism", "Metabolism Engineering", "Traits and Reproduction", "Thermal Energy",
+        "Ocean, Atmosphere, and Climate", "Weather Patterns", "Earth's Changing Climate",
+        "Earth's Changing Climate: Engineering Internship", "The First Americans (up to 1492)",
+        "Exploration and Colonization", "English Colonies", "American Revolution", "First Governments and the Constitution",
+        "The Early American Republic", "Political and Geographic Changes (1828-1850)", "Life in the North and South (1820-1860)",
+        "Division and Civil War (1821-1865)", "Reconstruction (1865-1896)", "The West (1858-1896)",
+        "New Industry and a Changing Society", "Expansion and War", "The 1920s and 1930s", "World War II",
+        "The Cold War", "Civil Rights and American Society", "America Since the 1970s"
+    ],
+    7: [
+        "The Lost Children of Sudan (Genocide, Genocide in Sudan)", "Epidemics", "Harlem Renaissance", "Plastic Pollution",
+        "Identity in the Middle Ages: How does society both support and limit the development of identity?",
+        "Americans All: How did World War II affect individuals?", "Language and Power: What is the power of language?",
+        "Fever: How can times of crisis affect citizens and society?", "Geology on Mars", "Plane Motion", "Plane Motion Engineering",
+        "Rock Formations", "Phase Change", "Phase Change Engineering", "Chemical Reactions", "Populations and Resources",
+        "Matter and Energy in Ecosystems", "Early Humans and Agricultural Revolution", "Fertile Crescent",
+        "Ancient Egypt and Kush", "The Israelites", "Ancient Greece", "Ancient South Asia", "Early China, Korea, and Japan",
+        "Ancient Rome", "Rise of Christian Kingdoms", "The Americas", "Medieval Europe", "The Rise of Islamic Empires",
+        "China in the Middle Ages", "Korea and Japan in the Middle Ages", "African Civilizations", "New Ways of Thinking",
+        "Age of Exploration and Trade", "Revolutions and Empires", "The Modern World"
+    ],
+    8: [
+        "Folklore of Latin America", "Food Choices", "The Holocaust", "Japanese American Internment",
+        "The Poetics and Power of Storytelling: What is the power of storytelling?",
+        "The Great War: How do literature and art illuminate the effects of World War I?", "What Is Love?",
+        "Teens as Change Agents: How do people effect social change?", "Harnessing Human Energy",
+        "Force and Motion", "Force and Motion Engineering", "Magnetic Fields", "Light Waves", "Earth, Moon, and Sun",
+        "Natural Selection", "Natural Selection Engineering", "Evolutionary History", "The World in Spatial Terms",
+        "Places and Regions", "Physical Geography", "Population Geography", "Economic Geography",
+        "Political Geography", "Human-Environment Geography", "What is Economics?", "Markets, Money, and Businesses",
+        "Government and the Economy", "The Global Economy"
+    ]
+]
+[END TOPICS]
+
+Here is the text:
+[BEGIN TEXT]
+{text}
+[END TEXT]
+"""
+
+vocab_grades_3_4_system_prompt = """
+You are an expert curriculum designer. Your job is to rate the complexity of a text's vocabulary relative to the grade level.
+
+You will be given a rubric (with levels from least to most complex: slightly complex, moderately complex, very complex, exceedingly complex) as well as guidelines for interpreting the rubric.
+IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meaning, etc.)
+
+**Resource 1: Qualitative Text Complexity rubric (SAP)**
+1.  **Level 1: Slightly complex**
+    *   Original Definition: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. A very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0.
+    *   Summary definition: Overall, vocabulary is easy to understand and does not impede comprehension of the bulk of the text (including main idea and supporting claims). 1-2 quick pauses for processing by the student are ok here!
+2.  **Level 2: Moderately complex**
+    *   Original Definition: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK
+    *   Summary definition: Overall, vocabulary generally allows students to comprehend the bulk of the text with little difficulty, though there may be occasional pauses for clarification. Several quick pauses or occasional prolonged pauses may occur.
+3.  **Level 3: Very complex**
+    *   Original Definition: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic
+    *   Summary definition: Overall, vocabulary often presents challenges that may slow down comprehension but does not completely block the comprehension of the bulk of the text.
+4.  **Level 4: Exceedingly complex**
+    *   Original Definition: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading.
+    *   Summary definition: Overall, vocabulary is so complex that it makes comprehension of the bulk of the text very challenging and requires careful effort to interpret.
+
+**Resource 2: Flesch-Kincaid Grade Level**
+Use the Flesch-Kincaid (FK) Grade Level as light guidance of the approximate grade level based on readability. The metric alone does not provide final information of vocabulary complexity, but a ballpark of the difficulty of the entire text.
+*   grade 2-3: 1.98-5.34
+*   grade 4-5: 4.51-7.73
+*   grade 6-8: 6.51-10.34
+*   grade 9-10: 8.32-12.12
+*   grade 11-College: 10.34-14.2
+
+**Guidelines for Interpretation and Reasoning**
+
+Your reasoning is the most critical part of your analysis. It's not enough to simply count complex words. You must analyze their impact on a student at the specified grade level. Use the following principles to guide your judgment:
+
+1.  **Density and Cumulative Effect:** Do not just count complex words; evaluate their concentration. A short text with a high density of challenging Tier 2 words (e.g., `peculiar`, `mischievous`, `courageous` for a 4th grader) can be more overwhelming than a longer text with a few scattered Tier 3 words. A constant barrage of unfamiliar words can elevate complexity from `very` to `exceedingly`.
+2.  **Contextual Scaffolding:** Assess how the text supports new vocabulary.
+    *   Are new, complex terms explicitly defined or explained with simple examples (e.g., "volume... to see if it is big enough to hold a liter of food")?
+    *   Is the surrounding language simple and conversational, making the meaning of new words easier to infer?
+    *   Strong scaffolding can lower the complexity rating. A text with many Tier 3 words that are well-explained might only be `moderately complex`.
+3.  **Abstract vs. Concrete Vocabulary:** Differentiate between words for abstract concepts and words for concrete things. A text built on abstract Tier 2 words (e.g., `relationships`, `performance`, `non-physical`) can be more challenging than a text that introduces Tier 3 labels for concrete things or people (e.g., `Sumerians`, `polonium`).
+4.  **Conceptual Load:** Consider the cognitive load of the vocabulary. A list of many new, multi-syllabic, conceptually-heavy terms (e.g., `Paleolithic`, `Mesolithic`, `Neolithic` for a 3rd grader) can be `very complex` even if the terms are briefly defined, because the student must process multiple new concepts at once.
+5.  **Calibrating the Top Levels:** Be precise in your use of `very complex` vs. `exceedingly complex`.
+    *   **Very complex:** The vocabulary creates significant hurdles and slows the reader down, but the main ideas of the text are still accessible with effort.
+    *   **Exceedingly complex:** The vocabulary is so dense, technical, or abstract that it acts as a barrier, making it nearly impossible for the target student to grasp the bulk of the text's meaning without extensive outside help. Reserve this for texts saturated with advanced terminology.
+6.  **Consider Background Knowledge:** Pay close attention to the provided `student_background_knowledge`. Do not classify a word as complex if the student is likely to be familiar with it (e.g., 'oxygen' for a 3rd grader who has learned about the human body).
+
+**Final Analysis Format**
+
+Provide the following information as your final analysis:
+1.  **Complex vocabulary:**
+    *   Tier 2 words: Words that are commonly used in academic settings and more complex than colloquial, or everyday language and often have multiple meanings.
+    *   Tier 3 words: Overly academic or domain-specific words.
+    *   Archaic words: Words, or uses of words, that are not commonly used in modern conversational language. E.g., “The jury retired to deliberate on their verdict.” The use of “retire” to mean withdrawing to a private place is an archaic use.
+    *   Other complex words: All other words that can increase complexity of the text (e.g., idioms, unfamiliar proper nouns that function as vocabulary).
+2.  **Vocabulary complexity:** one of: slightly complex, moderately complex, very complex, exceedingly complex
+3.  **Your reasoning of the complexity:** A detailed explanation of your rating, referencing the principles above.
+"""
+
+vocab_grades_3_4_user_prompt = """
+Below is the text you need to evaluate. Let's think step by step in order to predict the output of the vocabulary complexity task.
+
+- It is intended for grade {student_grade_level}.
+
+- You can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: {student_background_knowledge}
+
+- Text Flesch-Kincaid grade level: {fk_level}
+
+- Text to evaluate: [BEGIN TEXT]
+{text}
+[END TEXT]
+
+{format_instructions}
+"""
+
+vocab_other_grades_system_prompt = """
+You are an expert curriculum designer. Your job involves reading text snippets intended for students in K-12 and evaluating the complexity of the vocabulary in each text.
+
+You will be given a rubric (with options 1, 2, 3, 4) as well as guidelines for interpreting the rubric.
+
+IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meaning, etc.)
+IMPORTANT: Rely on the supplied rubric and annotation guidelines alone. Do not introduce any new criteria for evaluating the complexity of a text's vocabulary.
+
+Please first reason out loud about the vocabulary complexity of the text and then provide an answer between 1 and 4 (whole numbers only). Provide the answer as an integer (not a float).
+"""
+
+vocab_other_grades_user_prompt = """
+Your job is to rate the complexity of a text's vocabulary (relative to the intended level of the text) according to a rubric and annotation guide. Stick to the rubric and annotation guide exactly — do not introduce any additional criteria or lenses for judging the complexity of the text.
+
+[BEGIN ANNOTATION GUIDE AND RUBRIC]
+Instructions
+For the following task, please assume that:
+    - The student is on grade level and proficient in all core content areas, including reading fluency, comprehension, science, & social studies. (example).
+    - The student is moving through a common progression of topics (detailed here).
+    - The student is fluent in speaking English.
+    - The student has an "average" amount of background knowledge on topics not commonly covered in curriculum.
+    - The student will use this material for independent reading/work, without direct instruction.
+    - The text is reasonable for the given grade level.
+
+Please do not consider the presence of figurative language when scoring Vocabulary. For example: with a phrase like "kicked the bucket," consider only the qualities of the words themselves ("kicked", "the" and "bucket").
+
+Please do be sure to consider:
+- all of the different types of vocabulary (listed below)
+- the overall proportion of complex words in the text - including repeated complex words.
+- the resulting holistic complexity of the vocabulary (described in the Summary section below).
+
+Level 1:
+Rubric: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. That said, a very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0.
+
+Level 2:
+Rubric: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK, but if it's very low, the text is probably level 1.
+
+Level 3:
+Rubric: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic
+
+Level 4:
+Rubric: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading
+
+And here are some relevant definitions:
+    - Conversational: Everyday language.
+    - Familiar: Words that the student is likely to have seen/heard, from everyday life or their curriculum. Reminder: assume an "average" level of background knowledge.
+    - Unfamiliar: Words the student has probably not heard, or are being used in an unfamiliar way.
+        - For ex: 4th graders are familiar with the word "table" but may not be familiar with the use of the word with respect to data ("a table of data").
+        - Note:
+            - Words with in-line definitions (via appositives, or because they can be easily inferred from other parts of the text) should be evaluated as less unfamiliar.
+            - For ex: "The pharaoh, a powerful ruler of ancient Egypt, was buried in a grand tomb."
+                - The word "pharaoh" might be unfamiliar or subject-specific, but since it is defined within the text, you can consider it a more familiar word.
+        - Unfamiliar proper nouns:
+            - A person's name, even if unfamiliar, generally does not add to complexity.
+            - Other unfamiliar proper nouns (eg locations, organizations) do add to complexity.
+
+- Subject-specific: Words that are specific to a subject or field of study that are essential for understanding concepts and engaging with the content.
+- Overly-academic: Words that are excessively formal, complex, or specialized.
+    - For ex: "The agrarian societal structure of the Neolithic Revolution precipitated a paradigm shift in agriculture"
+- Archaic: A word that was common in the past but is now rarely/almost never used. Could also be a word used in an archaic way.
+    - For ex: "After a long day of court proceedings, the jury 'retired' to deliberate on their verdict."
+        - The word "retire" meaning to stop working may be familiar to a student, but "retire" meaning "withdrawing to a private place" is an archaic use.
+
+
+Examples
+The student is on-grade-level:
+- Consider a 6th grade passage about earth systems. Per NGSS standards, students are introduced to earth systems starting in 2nd grade. They encounter words like: wind, water, river, lake, solids, and liquids. For our rating purposes, we would assume most students following 2nd grade have encountered these words. In 5th grade, they dive more fully into earth systems concepts, learning vocabulary words like geosphere, sediment, biosphere, atmosphere, ecosystems, organisms and climate. While rating, we would consider the words listed in the NGSS standards as more familiar following that grade level. If the same passage were intended for 3rd graders, though, then the subject-specific vocabulary is likely to be unfamiliar.
+
+Figurative Language
+- Kicked the bucket.
+- The pen is mightier than the sword.
+- The classroom was a zoo.
+- He ran faster than the speed of light.
+[END ANNOTATION GUIDE AND RUBRIC]
+
+Here are a couple examples of texts that have already been scored along with justification for their scores, which you can use as exemplars:
+[BEGIN EXAMPLES]
+
+*** EXAMPLE 1 ***
+The following text was intended for grade level 11 and received a complexity level of 1.
+
+Here is the background knowledge assumption for that text: N/A
+
+Here is the text:
+// START TEXT //
+"In a recent lecture, "Is Nothing Sacred?", Salman Rushdie, one of the most censored authors of our time, talked about the importance of books. He grew up in a household in India where books were as sacred as bread. If anyone in the household dropped a piece of bread or a book, the person not only picked it up, but also kissed the object by way of apologizing for clumsy disrespect.
+
+He goes on to say that he had kissed many books before he had kissed a girl. Bread and books were for his household, and for many like his, food for the body and the soul. This image of the kissing of the book one had accidentally dropped made an impression on me. It speaks to the love and respect many people have for them.
+
+I grew up in a small town in New Mexico, and we had very few books in our household. The first one I remember reading was my catechism book. Before I went to school to learn English, my mother taught me catechism in Spanish.
+
+I remember the questions and answers I had to learn, and I remember the well-thumbed, frayed volume which was sacred to me.
+
+Growing up with few books in the house created in me a desire and a need for them. When I started school, I remember visiting the one room library of our town and standing in front of the dusty shelves. In reality there were only a few shelves and not over a thousand books, but I wanted to read them all. There was food for my soul in the books, that much I realized."
+// END TEXT //
+
+Here is the reasoning for that complexity level:
+// START REASONING //
+This text is a 1 for vocabulary, because the vocabulary that is used is familiar and accessible for a proficient 11th grader. Most of the words used in the text are very common everyday vocabulary for describing growing up, family life, and the importance of reading. A few examples of these very common words are: small town, book, school, learn, food, kissed, image, respect, love, speaks. There are many more in the text. In this text there are only a few "juicier" or more complex words, you can think of those as words that are less familiar, have a more abstract or nuanced meaning, or carry a very large concept. Less commonly spoken words that were used in the text were: frayed, volume, censored, clumsy, sacred. These are still well within reach of a proficient 11th grader, and would still be considered familiar, because they will have encountered them in past reading or academic studies. In the text there are a couple of words that are outliers, but they are not essential to the understanding of the larger text. One of these words or hyphenated compound phrase is well-frayed. A compound phrase is a phrase consisting of multiple words that work together to create a specific meaning or idea, often acting as a single unit in a sentence. If the meaning of individual words is familiar, it is typically quite easy for proficient readers to generalize the larger meaning that the author is implying with their word choice. In this case, proficient students will be accustomed to the phrase well, with the secondary meaning of very, rather than a description of positivity or health; and they will be accustomed to the use frayed, as in worn, aged, or damaged from use. Making the leap to identify the meaning of "well-frayed" as a book that is very used, will take only moments for a proficient 11th grader. 
Another word that stands out in the text is the word catechism, which might be new for many students based on their personal background or location, but a full understanding of what a catechism book contains is not essential for understanding the paragraph or whole text. The reader can make it through using minimum context clues to know that the catechism must be something important to his family. The type of book he learned to read before going to school is not critical for comprehension, it's enough to understand that reading was so important in his family, his mother started instruction before he even started school. Additionally, it's important to know that having one unknown word for an 11th grade reading, does not merit a rating higher than one.
+
+It is worth noting that another reason this text is a 1, is that the content or topic of the passage is so familiar and covered extensively in K-12 education, i.e. reading is important, loving books, growing up; that coupled with the simple vocabulary choices, getting to the meaning of the overall text, and even the paragraphs, would be incredibly easy for a proficient 11th grader.
+// END REASONING //
+*** EXAMPLE 2 ***
+The following text was intended for grade level 5 and received a complexity level of 2.
+
+Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with the concept of natural disasters, including hurricanes, and basic atmospheric concepts like high and low pressure from their studies on weather and climate. They may not be familiar with the specific formation processes of hurricanes or the global terminology differences (hurricane, typhoon, cyclone).
+
+Here is the text:
+// START TEXT //
+Great whirling storms roar out of the oceans in many parts of the world. They are called by several names—hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
+
+Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.
+// END TEXT //
+
+Here is the reasoning for that complexity level:
+// START REASONING //
+I scored this a 2 because of the density of subject-specific vocabulary related to weather and climate, which is often covered in lower grade levels. This adds to the complexity above a 1, but it is not a level 3 because of the familiarity with the topic, which implies some familiarity with the vocabulary as well. The specific formation process and the vocabulary used to explain the processes are also subject-specific but not familiar, which would make the second paragraph a level 3 in the rubric language, but when considering the language used in the overall SUMMARY below the rubric, this new content and vocabulary would cause quick pauses and/or occasional prolonged pauses but would not cause the reader to slow down due to challenging overall comprehension of the key ideas and supporting claims. This is especially the case because the second paragraph builds upon prior knowledge and familiar vocabulary use, so it is not entirely new information and vocabulary. While there is subject-specific vocabulary used, overly academic vocabulary is NOT used and is more conversational in nature, such as "great whirling storms" and "born" / "giving birth" to storm  (although this is the way storms are described!) rather than more technical terms which made comprehension easier due to the accessibility of the vocabulary (even if used in other contexts before reading this text). Words such as "a lot" and "bigger" are more conversational, and while technical, unfamiliar words are provided, such as "hurricane," "typhoon," and "cyclone," knowing and understanding their differences is not necessary to grasp the main idea. The processes by which they are formed are what need to be retained while reading the entire text, and familiarity with the bulk of the vocabulary used would allow for that to happen without too much struggle to make meaning of it. 
Additionally, the text does not contain any archaic vocabulary or ambiguous words, which prevents it from reaching a rating of 4, although it is not necessary that the text have such vocabulary to meet a level 4, the frequent inclusion of such vocabulary makes it more likely to land at least a 3 or 4.
+// END REASONING //
+
+*** EXAMPLE 3 ***
+The following text was intended for grade level 6 and received a complexity level of 3.
+
+Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with basic Earth science concepts such as rocks, minerals, and fossils, as well as natural processes like volcanic eruptions and earthquakes. They may not be familiar with more advanced topics like plate tectonics or the specific branches of geology such as mineralogy, petrology, and seismology.
+
+Here is the text:
+// START TEXT //
+Geology is the scientific study of Earth. Geologists study the planet—its formation, its internal structure, its materials, its chemical and physical processes, and its history. Mountains, valleys, plains, sea floors, minerals, rocks, fossils, and the processes that create and destroy each of these are all the domain of the geologist. Geology is divided into two broad categories of study: physical geology and historical geology.
+
+Physical geology is concerned with the processes occurring on or below the surface of Earth and the materials on which they operate. These processes include volcanic eruptions, landslides, earthquakes, and floods. Materials include rocks, air, seawater, soils, and sediment. Physical geology further divides into more specific branches, each of which deals with its own part of Earth's materials, landforms, and processes. Mineralogy and petrology investigate the composition and origin of minerals and rocks. Volcanologists study lava, rocks, and gases on live, dormant, and extinct volcanoes. Seismologists use instruments to monitor and predict earthquakes and volcanic eruptions.
+
+Historical geology is concerned with the chronology of events, both physical and biological, that have taken place in Earth's history. Paleontologists study fossils (remains of ancient life) for evidence of the evolution of life on Earth. Fossils not only relate evolution, but also speak of the environment in which the organism lived. Corals in rocks at the top of the Grand Canyon in Arizona, for example, show a shallow sea flooded the area around 290 million years ago. In addition, by determining the ages and types of rocks around the world, geologists piece together continental and oceanic history over the past few billion years. Plate tectonics (the study of the movement of the sections of Earth's crust) adds to Earth's story with details of the changing configuration of the continents and oceans.
+// END TEXT //
+
+Here is the reasoning for that complexity level:
+// START REASONING //
+To determine the complexity rating of this text based on the vocabulary present, I used the annotation guide, scoring rubric, and examples to set the expectations for rating. During the first read of the text, I "bolded" and categorized the more challenging vocabulary words according to the following complexity groupings: archaic, unfamiliar, archaic, subject-specific, and/or overly academic. On the second read, I considered the main idea or "gist" that students need to acquire understanding of. I then referenced the previously mentioned tools–annotation guide, scoring rubric, and examples to remind myself of the expectations for rating.  I agreed that readers would have familiarity with basic concepts of geology; however, I also considered the definitions provided for words such as Geology, Geologists, Physical Geology, Historical Geology, Mineralogy, and Petrology. I considered how students might pause for clarification and for how long. After reviewing the Annotation Guide while considering, I narrowed the rating down because the definitions provided throughout the text of more complex words should make the meaning of the text more accessible for readers, which is why although the words are subject-specific, I rated this text as a 3 instead of a 2-less complex or a 4–more complex. I read the text one final time to ensure clarity around my rating, scored and wrote the justification.
+// END REASONING //
+[END EXAMPLES]
+
+Below is the text you need to evaluate. It is intended for grade {student_grade_level}.
+
+As you read the text, you can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: {student_background_knowledge}
+
+[BEGIN TEXT]
+{text}
+[END TEXT]
+
+In your response, when specifying the level of complexity, be sure to use only a single integer (e.g. 2) and don't include any other text (e.g. don't say "level 2").
+
+{format_instructions}
+"""
+
+# ── Evaluation settings ────────────────────────────────────────────────────────
+#
+# Three LLM steps, each with its own provider and model:
+#   background_knowledge  – GPT-4o for the background knowledge generation step
+#   vocab_grades_3_4      – Gemini for the grades 3–4 complexity step
+#   vocab_other_grades    – GPT-4.1 for the grades 5–12 complexity step
+
+[evaluation_settings]
+
+[evaluation_settings.prompt_settings_step_background_knowledge]
+type = "OPENAI"
+model = "gpt-4o-2024-11-20"
+temperature = 0
+
+[evaluation_settings.prompt_settings_step_vocab_grades_3_4]
+type = "GOOGLE"
+model = "gemini-2.5-pro"
+temperature = 0
+
+[evaluation_settings.prompt_settings_step_vocab_other_grades]
+type = "OPENAI"
+model = "gpt-4.1"
+temperature = 0

From 4a6ec4da8e7f647d5054fad65545e9b9c6bc210d Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Tue, 12 May 2026 20:24:54 -0700
Subject: [PATCH 02/10] chore: update vocabulary settings to use provider_type
 instead of type for prompt settings

---
 sdks/settings/vocabulary/settings.toml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sdks/settings/vocabulary/settings.toml b/sdks/settings/vocabulary/settings.toml
index 6ea3ca9..3760c19 100644
--- a/sdks/settings/vocabulary/settings.toml
+++ b/sdks/settings/vocabulary/settings.toml
@@ -395,16 +395,16 @@ In your response, when specifying the level of complexity, be sure to use only a
 [evaluation_settings]
 
 [evaluation_settings.prompt_settings_step_background_knowledge]
-type = "OPENAI"
+provider_type = "OPENAI"
 model = "gpt-4o-2024-11-20"
 temperature = 0
 
 [evaluation_settings.prompt_settings_step_vocab_grades_3_4]
-type = "GOOGLE"
+provider_type = "GOOGLE"
 model = "gemini-2.5-pro"
 temperature = 0
 
 [evaluation_settings.prompt_settings_step_vocab_other_grades]
-type = "OPENAI"
+provider_type = "OPENAI"
 model = "gpt-4.1"
 temperature = 0

From 1ff22514b7cc303a45c23e53289cd361a3a1678e Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Tue, 12 May 2026 22:20:31 -0700
Subject: [PATCH 03/10] chore: fix capture and contract tests

---
 evals/vocabulary_evaluator.ipynb              | 1817 +++++++++++------
 sdks/python/scripts/capture.py                |   18 +-
 .../evaluators/vocabulary.py                  |   14 +-
 .../_generated_vocabulary_settings.py         |    4 +-
 .../settings/vocabulary/contracts.toml        |   61 +-
 .../tests/contract_tests/test_vocabulary.py   |   10 +-
 .../python/tests/contract_tests/vocabulary.py |    2 +-
 .../tests/evaluators/test_vocabulary.py       |   44 +-
 sdks/settings/vocabulary/contracts.toml       |   61 +-
 sdks/settings/vocabulary/settings.toml        |    2 +-
 10 files changed, 1346 insertions(+), 687 deletions(-)

diff --git a/evals/vocabulary_evaluator.ipynb b/evals/vocabulary_evaluator.ipynb
index ce9deda..23aa63b 100644
--- a/evals/vocabulary_evaluator.ipynb
+++ b/evals/vocabulary_evaluator.ipynb
@@ -1,581 +1,1246 @@
 {
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Vocabulary Evaluator (Early Release)\n",
-    "\n",
-    "**The Vocabulary Evaluator** gives developers the fine-grained insight they need but can’t get from traditional tools. It helps determine whether texts use words that align with grade-level expectations and support growth in academic language. This ensures students are consistently exposed to the kinds of vocabulary that build knowledge and enable them to fully engage with grade-level texts.\n",
-    "\n",
-    "By understanding what makes a text difficult for a student to read, edtech companies and educators are better equipped to ensure students get the right text for their needs, along with the right instructional supports.\n",
-    "\n",
-    "You can use this evaluator to help ensure AI-generated texts are sufficiently complex for the grade level and their intended purpose.\n",
-    "\n",
-    "1. It estimates a student’s background knowledge given the selected grade level.\n",
-    "2. It uses the background knowledge estimate as a starting point to evaluate the complexity of a passage’s vocabulary."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {},
-     "inputWidgets": {},
-     "nuid": "5d3ac2f5-40b2-4b2c-afdb-c7400e603af3",
-     "showTitle": false,
-     "tableResultSettingsMap": {},
-     "title": ""
-    }
-   },
-   "source": [
-    "### Install & Load necessary packages"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {
-      "byteLimit": 2048000,
-      "rowLimit": 10000
-     },
-     "inputWidgets": {},
-     "nuid": "f24faefe-74b5-4c6c-9052-b41d1e9b73cf",
-     "showTitle": false,
-     "tableResultSettingsMap": {},
-     "title": ""
-    }
-   },
-   "outputs": [],
-   "source": [
-    "%pip install -qU pydantic textstat langchain langchain_openai langchain-google-genai"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {
-      "byteLimit": 2048000,
-      "rowLimit": 10000
-     },
-     "inputWidgets": {},
-     "nuid": "b243b748-0ed3-4d19-ab1b-d43638d2113e",
-     "showTitle": false,
-     "tableResultSettingsMap": {},
-     "title": ""
-    }
-   },
-   "outputs": [],
-   "source": [
-    "# Load packages\n",
-    "import getpass\n",
-    "import os\n",
-    "\n",
-    "from dotenv import load_dotenv\n",
-    "from langchain_core.messages import SystemMessage\n",
-    "from langchain_core.output_parsers import JsonOutputParser\n",
-    "from langchain_core.prompts import ChatPromptTemplate\n",
-    "from langchain_core.prompts.chat import HumanMessagePromptTemplate\n",
-    "from langchain_google_genai import ChatGoogleGenerativeAI\n",
-    "from langchain_openai import ChatOpenAI\n",
-    "from pydantic import BaseModel, Field\n",
-    "from textstat import textstat as ts\n",
-    "from capture import reset_captures, capture_llm, capture_case, build_contract_toml\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {},
-     "inputWidgets": {},
-     "nuid": "12709664-6132-48a8-b621-73658b4fea90",
-     "showTitle": false,
-     "tableResultSettingsMap": {},
-     "title": ""
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Vocabulary Evaluator (Early Release)\n",
+        "\n",
+        "**The Vocabulary Evaluator** gives developers the fine-grained insight they need but can’t get from traditional tools. It helps determine whether texts use words that align with grade-level expectations and support growth in academic language. This ensures students are consistently exposed to the kinds of vocabulary that build knowledge and enable them to fully engage with grade-level texts.\n",
+        "\n",
+        "By understanding what makes a text difficult for a student to read, edtech companies and educators are better equipped to ensure students get the right text for their needs, along with the right instructional supports.\n",
+        "\n",
+        "You can use this evaluator to help ensure AI-generated texts are sufficiently complex for the grade level and their intended purpose.\n",
+        "\n",
+        "1. It estimates a student’s background knowledge given the selected grade level.\n",
+        "2. It uses the background knowledge estimate as a starting point to evaluate the complexity of a passage’s vocabulary."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "application/vnd.databricks.v1+cell": {
+          "cellMetadata": {},
+          "inputWidgets": {},
+          "nuid": "5d3ac2f5-40b2-4b2c-afdb-c7400e603af3",
+          "showTitle": false,
+          "tableResultSettingsMap": {},
+          "title": ""
+        }
+      },
+      "source": [
+        "### Install & Load necessary packages"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {
+        "application/vnd.databricks.v1+cell": {
+          "cellMetadata": {
+            "byteLimit": 2048000,
+            "rowLimit": 10000
+          },
+          "inputWidgets": {},
+          "nuid": "f24faefe-74b5-4c6c-9052-b41d1e9b73cf",
+          "showTitle": false,
+          "tableResultSettingsMap": {},
+          "title": ""
+        }
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Note: you may need to restart the kernel to use updated packages.\n"
+          ]
+        }
+      ],
+      "source": [
+        "%pip install -qU pydantic textstat langchain langchain_openai langchain-google-genai"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 2,
+      "metadata": {
+        "application/vnd.databricks.v1+cell": {
+          "cellMetadata": {
+            "byteLimit": 2048000,
+            "rowLimit": 10000
+          },
+          "inputWidgets": {},
+          "nuid": "b243b748-0ed3-4d19-ab1b-d43638d2113e",
+          "showTitle": false,
+          "tableResultSettingsMap": {},
+          "title": ""
+        }
+      },
+      "outputs": [],
+      "source": [
+        "# Load packages\n",
+        "import getpass\n",
+        "import os\n",
+        "import sys\n",
+        "from pathlib import Path\n",
+        "\n",
+        "# capture.py lives under sdks/python/scripts/ (not an installable package).\n",
+        "_scripts = None\n",
+        "for _root in [Path.cwd(), *Path.cwd().parents]:\n",
+        "    _cap = _root / \"sdks\" / \"python\" / \"scripts\" / \"capture.py\"\n",
+        "    if _cap.is_file():\n",
+        "        _scripts = _cap.parent\n",
+        "        break\n",
+        "if _scripts is None:\n",
+        "    raise FileNotFoundError(\n",
+        "        \"sdks/python/scripts/capture.py not found; set the kernel cwd to the repo root or evals/.\"\n",
+        "    )\n",
+        "sys.path.insert(0, str(_scripts))\n",
+        "\n",
+        "from dotenv import load_dotenv\n",
+        "from langchain_core.messages import SystemMessage\n",
+        "from langchain_core.output_parsers import JsonOutputParser\n",
+        "from langchain_core.prompts import ChatPromptTemplate\n",
+        "from langchain_core.prompts.chat import HumanMessagePromptTemplate\n",
+        "from langchain_google_genai import ChatGoogleGenerativeAI\n",
+        "from langchain_openai import ChatOpenAI\n",
+        "from pydantic import BaseModel, Field\n",
+        "from textstat import textstat as ts\n",
+        "from capture import reset_captures, capture_llm, capture_case, build_contract_toml\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "application/vnd.databricks.v1+cell": {
+          "cellMetadata": {},
+          "inputWidgets": {},
+          "nuid": "12709664-6132-48a8-b621-73658b4fea90",
+          "showTitle": false,
+          "tableResultSettingsMap": {},
+          "title": ""
+        }
+      },
+      "source": [
+        "### Set up the evaluator's model and prompts"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 3,
+      "metadata": {
+        "application/vnd.databricks.v1+cell": {
+          "cellMetadata": {
+            "byteLimit": 2048000,
+            "rowLimit": 10000
+          },
+          "inputWidgets": {},
+          "nuid": "eeffec80-067e-47de-a234-7dcd5078ea53",
+          "showTitle": false,
+          "tableResultSettingsMap": {},
+          "title": ""
+        }
+      },
+      "outputs": [],
+      "source": [
+        "from prompts import vocab_prompts as prompts\n",
+        "\n",
+        "# Set your api keys in your environment, .env file, or enter when prompted.\n",
+        "# os.environ['GOOGLE_API_KEY'] = 'YOUR API KEY'\n",
+        "# os.environ['OPENAI_API_KEY'] = 'YOUR API KEY'\n",
+        "load_dotenv()\n",
+        "\n",
+        "if not os.environ.get(\"OPENAI_API_KEY\"):\n",
+        "    os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter your OpenAI API key: \")\n",
+        "\n",
+        "if not os.environ.get(\"GOOGLE_API_KEY\"):\n",
+        "    os.environ[\"GOOGLE_API_KEY\"] = getpass.getpass(\"Enter your Google API key: \")\n",
+        "\n",
+        "# Grades supported by this evaluator\n",
+        "SUPPORTED_GRADES = range(3, 13)  # 3 through 12 inclusive\n",
+        "\n",
+        "VOCAB_TEMPERATURE = 0\n",
+        "# Define the model to be used for vocabulary complexity for grades 3 and 4\n",
+        "VOCAB_MODEL_GRADES_3_4 = \"gemini-2.5-pro\"\n",
+        "vocab_complexity_model_grades_3_4 = ChatGoogleGenerativeAI(\n",
+        "    model=VOCAB_MODEL_GRADES_3_4, temperature=VOCAB_TEMPERATURE\n",
+        ")\n",
+        "\n",
+        "# Define the model to be used for vocabulary complexity for all other grades\n",
+        "VOCAB_MODEL_OTHER_GRADES = \"gpt-4.1\"\n",
+        "vocab_complexity_model_other_grades = ChatOpenAI(\n",
+        "    model=VOCAB_MODEL_OTHER_GRADES, temperature=VOCAB_TEMPERATURE\n",
+        ")\n",
+        "\n",
+        "# Define the model to be used for student background knowledge generation\n",
+        "BK_MODEL = \"gpt-4o-2024-11-20\"\n",
+        "BK_TEMPERATURE = 0\n",
+        "student_bk_model = ChatOpenAI(model=BK_MODEL, temperature=BK_TEMPERATURE)\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### Set up student background knowledge generator"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 4,
+      "metadata": {
+        "application/vnd.databricks.v1+cell": {
+          "cellMetadata": {
+            "byteLimit": 2048000,
+            "rowLimit": 10000
+          },
+          "inputWidgets": {},
+          "nuid": "3fc5ec95-f619-439e-9286-663f3d843502",
+          "showTitle": false,
+          "tableResultSettingsMap": {},
+          "title": ""
+        }
+      },
+      "outputs": [],
+      "source": [
+        "def get_background_knowledge_assumption(text, grade):\n",
+        "    \"\"\"Use the background knowledge prompt from the prompts file.\"\"\"\n",
+        "    prompt = prompts.bk_prompt.format(text=text, grade=grade)\n",
+        "\n",
+        "    return capture_llm(\"background_knowledge\", student_bk_model).invoke(prompt).content\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### Set up the input variables and output format"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 5,
+      "metadata": {
+        "application/vnd.databricks.v1+cell": {
+          "cellMetadata": {
+            "byteLimit": 2048000,
+            "rowLimit": 10000
+          },
+          "inputWidgets": {},
+          "nuid": "04d83e90-b786-4f2d-83d6-8c69c2cf176b",
+          "showTitle": false,
+          "tableResultSettingsMap": {},
+          "title": ""
+        }
+      },
+      "outputs": [],
+      "source": [
+        "class Output(BaseModel):\n",
+        "    tier_2_words: str = Field(description=\"List of Tier 2 words\")\n",
+        "    tier_3_words: str = Field(description=\"List of Tier 3 words\")\n",
+        "    archaic_words: str = Field(description=\"List of Archaic words\")\n",
+        "    other_complex_words: str = Field(description=\"List of Other Complex words\")\n",
+        "    complexity_score: str = Field(\n",
+        "        description=\"the complexity of the text, one of: slightly complex, moderately complex, very complex, or exceedingly complex\"\n",
+        "    )\n",
+        "    reasoning: str = Field(description=\"your reasoning for your answer\")\n",
+        "\n",
+        "\n",
+        "prompt_vars = {\n",
+        "    \"inputVars\": [\n",
+        "        \"text\",\n",
+        "        \"student_grade_level\",\n",
+        "        \"student_background_knowledge\",\n",
+        "        \"fk_level\",\n",
+        "    ],\n",
+        "    \"outputParser\": JsonOutputParser(pydantic_object=Output),\n",
+        "}\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### Helper functions"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 6,
+      "metadata": {
+        "application/vnd.databricks.v1+cell": {
+          "cellMetadata": {
+            "byteLimit": 2048000,
+            "rowLimit": 10000
+          },
+          "inputWidgets": {},
+          "nuid": "23047ec3-79b2-4e45-b452-873eae41b030",
+          "showTitle": false,
+          "tableResultSettingsMap": {},
+          "title": ""
+        }
+      },
+      "outputs": [],
+      "source": [
+        "import textwrap\n",
+        "\n",
+        "\n",
+        "def calculate_fk_score(text) -> float:\n",
+        "    \"\"\"\n",
+        "    Calculate the Flesch-Kincaid Grade Level\n",
+        "    \"\"\"\n",
+        "    fk_score = round(ts.flesch_kincaid_grade(text), 2)\n",
+        "\n",
+        "    return fk_score\n",
+        "\n",
+        "\n",
+        "def prepare_text_for_complexity_prediction(text, grade):\n",
+        "    \"\"\"\n",
+        "    Enrich the text and grade given by user with additional features for complexity prediction.\n",
+        "    \"\"\"\n",
+        "    dataset = {\n",
+        "        \"text\": text,\n",
+        "        \"student_grade_level\": grade,\n",
+        "        \"fk_level\": calculate_fk_score(text),\n",
+        "        \"student_background_knowledge\": get_background_knowledge_assumption(\n",
+        "            text, grade\n",
+        "        ),\n",
+        "    }\n",
+        "\n",
+        "    return dataset\n",
+        "\n",
+        "\n",
+        "def prettify_vocab_complexity_output(vocab_complexity_output):\n",
+        "    output = f\"\"\"\n",
+        "        ========================= Complexity Score ========================\n",
+        "        {vocab_complexity_output.get('complexity_score') or vocab_complexity_output.get('answer') or 'N/A'}\n",
+        "\n",
+        "        ========================= Complexity Score Reasoning ==============\n",
+        "        {textwrap.fill(vocab_complexity_output.get('reasoning', 'N/A'), width=80)}\n",
+        "\n",
+        "        ========================  Complex words  ==========================\n",
+        "        * Tier 2 words: {textwrap.fill(vocab_complexity_output.get('tier_2_words', 'N/A'), width=65)}\n",
+        "        * Tier 3 words: {textwrap.fill(vocab_complexity_output.get('tier_3_words', 'N/A'), width=65)}\n",
+        "        * Archaic words: {textwrap.fill(vocab_complexity_output.get('archaic_words', 'N/A'), width=65)}\n",
+        "        * Other complex words: {textwrap.fill(vocab_complexity_output.get('other_complex_words', 'N/A'), width=60)}\"\"\"\n",
+        "\n",
+        "    print(textwrap.dedent(output).strip())"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 7,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "def get_prompts_for_grade(grade: int) -> dict:\n",
+        "    \"\"\"\n",
+        "    Returns the appropriate SYSTEM_PROMPT and USER_PROMPT for the given grade.\n",
+        "    \n",
+        "    Args:\n",
+        "        grade: Grade level (3-12)\n",
+        "    \n",
+        "    Returns:\n",
+        "        dict with keys 'SYSTEM_PROMPT' and 'USER_PROMPT'\n",
+        "    \"\"\"\n",
+        "    if grade == 3 or grade == 4:\n",
+        "        return prompts.GRADE_SPECIFIC_PROMPTS[\"GRADES_3_4\"]\n",
+        "    else:  # 5-12\n",
+        "        return prompts.GRADE_SPECIFIC_PROMPTS[\"OTHER_GRADES\"]\n",
+        "\n",
+        "\n",
+        "def get_vocab_model_for_grade(grade: int):\n",
+        "    \"\"\"\n",
+        "    Returns the appropriate vocabulary complexity model for the given grade.\n",
+        "\n",
+        "    Grades 3 & 4 use Gemini (gemini-2.5-pro), which was validated against\n",
+        "    the GRADES_3_4 prompt. All other grades use GPT-4.1, which was validated\n",
+        "    against the OTHER_GRADES prompt.\n",
+        "\n",
+        "    Args:\n",
+        "        grade: Grade level (3-12)\n",
+        "\n",
+        "    Returns:\n",
+        "        A LangChain chat model instance\n",
+        "    \"\"\"\n",
+        "    if grade == 3 or grade == 4:\n",
+        "        return vocab_complexity_model_grades_3_4\n",
+        "    else:  # 5-12\n",
+        "        return vocab_complexity_model_other_grades\n",
+        "\n",
+        "\n",
+        "def normalize_complexity_output(output: dict) -> dict:\n",
+        "    \"\"\"\n",
+        "    Normalize complexity output to use consistent string labels.\n",
+        "    Converts integer 'answer' (from OTHER_GRADES) to string 'complexity_score'.\n",
+        "    \n",
+        "    Args:\n",
+        "        output: Raw output from the model\n",
+        "    \n",
+        "    Returns:\n",
+        "        Normalized output with 'complexity_score' field\n",
+        "    \"\"\"\n",
+        "    mapping = {\n",
+        "        1: \"Slightly Complex\",\n",
+        "        2: \"Moderately Complex\",\n",
+        "        3: \"Very Complex\",\n",
+        "        4: \"Exceedingly Complex\"\n",
+        "    }\n",
+        "\n",
+        "    # Handle 'answer' field from OTHER_GRADES (will be int or string int)\n",
+        "    if 'answer' in output:\n",
+        "        value = output['answer']\n",
+        "        # Convert int or string int to proper complexity label\n",
+        "        if isinstance(value, str) and value.isdigit():\n",
+        "            value = int(value)\n",
+        "        output['complexity_score'] = mapping.get(value, str(value))\n",
+        "    \n",
+        "    # For GRADES_3_4, complexity_score already exists as a string - no changes needed\n",
+        "\n",
+        "    return output\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 8,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "def predict_text_complexity_level(text, grade):\n",
+        "    \"\"\"\n",
+        "    Predict the text complexity level as well as the complex words and reasoning.\n",
+        "\n",
+        "    Args:\n",
+        "        text: The text to evaluate.\n",
+        "        grade: Grade level. Must be between 3 and 12 inclusive.\n",
+        "\n",
+        "    Raises:\n",
+        "        ValueError: If grade is not in SUPPORTED_GRADES.\n",
+        "    \"\"\"\n",
+        "    if grade not in SUPPORTED_GRADES:\n",
+        "        raise ValueError(\n",
+        "            f\"Grade {grade} is not supported. This evaluator supports grades \"\n",
+        "            f\"{min(SUPPORTED_GRADES)}-{max(SUPPORTED_GRADES)}.\"\n",
+        "        )\n",
+        "\n",
+        "    dataset = prepare_text_for_complexity_prediction(text, grade)\n",
+        "\n",
+        "    # Get grade-specific prompts and model\n",
+        "    grade_prompts = get_prompts_for_grade(grade)\n",
+        "\n",
+        "    # Use grade-specific prompts\n",
+        "    messages = [\n",
+        "        SystemMessage(content=grade_prompts['SYSTEM_PROMPT']),\n",
+        "        HumanMessagePromptTemplate.from_template(grade_prompts['USER_PROMPT']),\n",
+        "    ]\n",
+        "\n",
+        "    # Prepare chat prompt\n",
+        "    prompt = ChatPromptTemplate(\n",
+        "        messages,\n",
+        "        input_variables=prompt_vars[\"inputVars\"],\n",
+        "        partial_variables={\n",
+        "            \"format_instructions\": prompt_vars[\"outputParser\"].get_format_instructions()\n",
+        "        },\n",
+        "    )\n",
+        "\n",
+        "    # Invoke the chain\n",
+        "    chain = prompt | capture_llm(\"vocab_complexity\", get_vocab_model_for_grade(grade)) | JsonOutputParser()\n",
+        "\n",
+        "    # Get output and normalize it\n",
+        "    output = chain.invoke(dataset)\n",
+        "    output = normalize_complexity_output(output)\n",
+        "\n",
+        "    return output\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "application/vnd.databricks.v1+cell": {
+          "cellMetadata": {},
+          "inputWidgets": {},
+          "nuid": "4437f840-a00b-47cf-a7fa-86d882f0f6af",
+          "showTitle": false,
+          "tableResultSettingsMap": {},
+          "title": ""
+        }
+      },
+      "source": [
+        "# Test out examples"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 9,
+      "metadata": {
+        "application/vnd.databricks.v1+cell": {
+          "cellMetadata": {
+            "byteLimit": 2048000,
+            "rowLimit": 10000
+          },
+          "inputWidgets": {},
+          "nuid": "767fe555-ef65-46fb-8da3-54f18565cb07",
+          "showTitle": false,
+          "tableResultSettingsMap": {},
+          "title": ""
+        }
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "========================= Complexity Score ========================\n",
+            "        very complex\n",
+            "\n",
+            "        ========================= Complexity Score Reasoning ==============\n",
+            "        The vocabulary is very complex for a 3rd grader, a conclusion supported by the\n",
+            "Flesch-Kincaid grade level of 6.59. The primary difficulty stems from the high\n",
+            "density of unfamiliar proper nouns which function as challenging vocabulary\n",
+            "(e.g., 'Mongol Dynasty', 'Kublai Khan', 'Venice', 'Tabriz'). These terms are\n",
+            "central to the narrative but are introduced without any contextual scaffolding\n",
+            "or explanation, creating a significant conceptual load. Furthermore, the text\n",
+            "incorporates several Tier 2 words like 'scholars', 'court', and 'wealthy', and\n",
+            "Tier 3 words like 'dynasty' and 'porcelain'. The cumulative effect of these\n",
+            "words will often present challenges and significantly slow down comprehension\n",
+            "for a student at this grade level, making it difficult to grasp the details of\n",
+            "the text even if the main idea of a journey is accessible.\n",
+            "\n",
+            "        ========================  Complex words  ==========================\n",
+            "        * Tier 2 words: served, court, passengers, crew, survived, scholars, described,\n",
+            "treasures, wealthy, trading\n",
+            "        * Tier 3 words: dynasty, ivory, jade, porcelain\n",
+            "        * Archaic words: \n",
+            "        * Other complex words: Marco Polo, Mongol Dynasty, Venice, Mediterranean Sea, Ayas,\n",
+            "Tabriz, Kerman, Beijing, Kublai Khan, Far East\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Set the text to evaluate and the target grade level for the vocabulary-complexity check\n",
+        "\n",
+        "# Clear ID = 2204\n",
+        "text = \"\"\"\n",
+        "Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.\n",
+        "Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.\n",
+        "His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.\n",
+        "\"\"\"\n",
+        "\n",
+        "grade_level = 3\n",
+        "\n",
+        "vocabulary_complexity_output = predict_text_complexity_level(text, grade_level)\n",
+        "\n",
+        "# Pretty-print the output\n",
+        "prettify_vocab_complexity_output(vocabulary_complexity_output)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 10,
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "[cases.marco_polo_grade3]\n",
+            "description = \"Marco Polo passage, grade 3 (grades 3-4 Gemini path)\"\n",
+            "\n",
+            "[cases.marco_polo_grade3.input]\n",
+            "text = '''\n",
+            "Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.\n",
+            "Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.\n",
+            "His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.'''\n",
+            "grade = 3\n",
+            "\n",
+            "[cases.marco_polo_grade3.prompt_steps.background_knowledge]\n",
+            "model = \"gpt-4o-2024-11-20\"\n",
+            "temperature = 0.0\n",
+            "system_prompt = \"\"\n",
+            "user_prompt = '''\n",
+            "\n",
+            "Review the following text, which is an educational text written for students in the following grade band: 3.\n",
+            "\n",
+            "Your job is to give me a background knowledge assumption; that is: what topics, if any, from the text students are likely to be familiar with based on a standard progression of topics in US public school education, as well as topics, if any the student is not likely to be familiar with.\n",
+            "\n",
+            "Make sure your response is concise (between 1 - 3 lines max) and is about the topics themselves, not about any other aspect of the text (e.g. flowery language, complicated sentence structure, etc.).\n",
+            "\n",
+            "Here's an example:\n",
+            "[START EXAMPLE]\n",
+            "Grade Band: 11th\n",
+            "Text: I went to the woods because I wished to live deliberately, to front only the essential facts of life, and see if I could not\n",
+            "learn what it had to teach, and not, when I came to die, discover that I had not lived. I did not wish to live what was\n",
+            "not life, living is so dear; nor did I wish to practise resignation, unless it was quite necessary. I wanted to live deep and suck out all the marrow of life, to live so sturdily and Spartan-like as to put to rout all that was not life, to cut a broad swath and shave close, to drive life into a corner, and reduce it to its lowest terms, and, if it proved to be mean, why then to get the whole and genuine meanness of it, and publish its meanness to the world; or if it were sublime, to\n",
+            "know it by experience, and be able to give a true account of it in my next excursion. For most men, it appears to me,\n",
+            "are in a strange uncertainty about it, whether it is of the devil or of God, and have somewhat hastily concluded that it\n",
+            "is the chief end of man here to “glorify God and enjoy him forever.”\n",
+            "\n",
+            "Background Knowledge Assumption: Assume they’ve studied American Transcendentalists like Thoreau and Emerson, including the mid-19th-century context of nature-focused philosophy.\n",
+            "[END EXAMPLE]\n",
+            "\n",
+            "You should assume that the student is an average US public school who is learning from common core curriculum. When you respond, just respond with the background knowledge assumption and nothing else.\n",
+            "\n",
+            "You can use the following list of topics that we know are covered for each grade level, although use your best judgement if you know there are other topics out there that students are likely to have covered. And this doesn't cover higher grade levels, so you'll have to again use your judgement for, say, what background knowledge a 9th grader is likely to have:\n",
+            "[BEGIN TOPICS]\n",
+            "[\n",
+            "    K: [\n",
+            "        \"Toys and Play\", \"Weather Wonders\", \"Trees are Alive\", \"Enjoying and Appreciating Trees\",\n",
+            "        \"The Five Senses: How do our senses help us learn?\", \"Once Upon a Farm: What makes a good story?\",\n",
+            "        \"America, Then and Now: How has life in America changed over time?\", \"The Continents: What makes the world fascinating?\",\n",
+            "        \"Needs of Plants and Animals\", \"Pushes and Pulls\", \"Sunlight and Weather\", \"Learning and Working Together\",\n",
+            "        \"How Do People Learn and Work Together?\", \"Where Do We Live?\", \"What Does it Mean to Be an American?\",\n",
+            "        \"How Has Our World Changed?\", \"Why Do People Have Jobs?\"\n",
+            "    ],\n",
+            "    1: [\n",
+            "        \"Tools and Work\", \"A Study of the Sun, Moon, and Stars\", \"Birds' Amazing Bodies\", \"Caring for Birds\",\n",
+            "        \"A World of Books: How do books change lives around the world?\", \"Creature Features: What can we discover about animals’ unique features?\",\n",
+            "        \"Powerful Forces: How do people respond to the powerful force of the wind?\", \"Cinderella Stories: Why do people around the world admire Cinderella?\",\n",
+            "        \"Animal and Plant Defenses\", \"Light and Sounds\", \"Spinning Earth\", \"Our Place in the World\",\n",
+            "        \"What Are the Rights and Responsibilities of Citizens?\", \"How Can We Describe Where We Live?\",\n",
+            "        \"How Do We Celebrate Our Country?\", \"How Does the Past Shape Our Lives?\", \"Why Do People Work?\"\n",
+            "    ],\n",
+            "    2: [\n",
+            "        \"Schools and Community\", \"Fossils Tell of Earth's Changes\", \"The Secret World of Pollination\", \"Providing for Pollinators\",\n",
+            "        \"A Season of Change: How does change impact people and nature?\", \"The American West: What was life like in the West for early Americans?\",\n",
+            "        \"Civil Rights Heroes: How can people respond to injustice?\", \"Good Eating: How does food nourish us?\",\n",
+            "        \"Plant and Animal Relationships\", \"Properties of Matter\", \"Changing Landforms\", \"Exploring Who We Are\",\n",
+            "        \"Why Is It Important to Learn About the Past?\", \"How Does Geography Help Us Understand Our World?\",\n",
+            "        \"How Do We Get What We Want and Need?\", \"Why Do We Need Government?\", \"How Can People Make a Difference in Our World?\"\n",
+            "    ],\n",
+            "    \"3\": [\n",
+            "        \"Overcoming Learning Challenges Near and Far\", \"Adaptations and the Wide World of Frogs\", \"Exploring Literary Classics\",\n",
+            "        \"Water Around the World\", \"Ocean/Sea Exploration\", \"Outer Space\", \"Immigration\", \"Art/Being an Artist\",\n",
+            "        \"Balancing Forces\", \"Inheritance and Traits\", \"Environments and Survival\", \"Weather and Climate\",\n",
+            "        \"Communities\", \"Why Does It Matter Where We Live?\", \"What Is Our Relationship With Our Environment?\",\n",
+            "        \"What Makes a Community Unique?\", \"How Does the Past Impact the Present?\", \"Why Do Governments and Citizens Need Each Other?\",\n",
+            "        \"How Do People in a Community Meet Their Wants and Needs?\"\n",
+            "    ],\n",
+            "    4: [\n",
+            "        \"Poetry\", \"Animal Defense Mechanisms\", \"The American Revolution\",\n",
+            "        \"Responding to Inequality: Ratifying the 19th Amendment (covers gender and racial inequality)\",\n",
+            "        \"A Great Heart: What does it mean to have a great heart, literally and figuratively?\",\n",
+            "        \"Extreme Settings: How does a challenging setting or physical environment change a person?\",\n",
+            "        \"American Revolution/Multiple Perspectives\", \"Myths/Myth Making\", \"Energy Conversions\", \"Vision and Light\",\n",
+            "        \"Earth's Features\", \"Waves, Energy, and Information\", \"Regions of the United States\",\n",
+            "        \"How Does America Use Its Strengths and Face Its Challenges?\", \"Why Have People Moved to and From the Northeast?\",\n",
+            "        \"How Has the Southeast Changed Over Time?\", \"How Does the Midwest Reflect the Spirit of America?\",\n",
+            "        \"How Does the Southwest Reflect Its Diverse Past and Unique Environment?\", \"What Draws People to the West?\"\n",
+            "    ],\n",
+            "    5: [\n",
+            "        \"Human Rights\", \"Biodiversity in the Rainforest\", \"Athlete Leaders of Social Change\",\n",
+            "        \"Impact of Natural Disasters\", \"Cultures in Conflict: How do cultural beliefs and values guide people?\",\n",
+            "        \"Word Play: How and why do writers play with words?\", \"A War Between Us: How did the Civil War impact people?\",\n",
+            "        \"Breaking Barriers: How can sports influence individuals and societies?\", \"Patterns of Earth and Sky\",\n",
+            "        \"Modeling Matter\", \"The Earth System\", \"Ecosystem Restoration\", \"U.S. History: Making a New Nation\",\n",
+            "        \"How Were the Lives of Native Peoples Influenced by Where They Lived?\",\n",
+            "        \"What Happened When Diverse Cultures Crossed Paths?\", \"What Is the Impact of People Settling in a New Place?\",\n",
+            "        \"Why Would a Nation Want to Become Independent?\", \"What Does the Revolutionary Era Tell Us About Our Nation Today?\",\n",
+            "        \"How Does the Constitution Help Us Understand What It Means to Be an American?\",\n",
+            "        \"What Do the Early Years of the United States Reveal About the Character of the Nation?\",\n",
+            "        \"What Was the Effect of the Civil War on U.S. Society?\"\n",
+            "    ],\n",
+            "    6: [\n",
+            "        \"Greek Mythology\", \"Critical Problems and Design Solutions\", \"American Indian Boarding Schools\",\n",
+            "        \"Remarkable Accomplishments in Space Science\", \"Resilience in the Great Depression: How can enduring tremendous hardship contribute to personal transformation?\",\n",
+            "        \"A Hero’s Journey: What is the significance and power of the hero’s journey?\",\n",
+            "        \"Narrating the Unknown: How did the social and environmental factors in the unknown world of Jamestown shape its development and decline?\",\n",
+            "        \"Courage in Crisis: How can the challenges of a hostile environment inspire heroism?\",\n",
+            "        \"Microbiome\", \"Metabolism\", \"Metabolism Engineering\", \"Traits and Reproduction\", \"Thermal Energy\",\n",
+            "        \"Ocean, Atmosphere, and Climate\", \"Weather Patterns\", \"Earth's Changing Climate\",\n",
+            "        \"Earth's Changing Climate: Engineering Internship\", \"The First Americans (up to 1492)\",\n",
+            "        \"Exploration and Colonization\", \"English Colonies\", \"American Revolution\", \"First Governments and the Constitution\",\n",
+            "        \"The Early American Republic\", \"Political and Geographic Changes (1828-1850)\", \"Life in the North and South (1820-1860)\",\n",
+            "        \"Division and Civil War (1821-1865)\", \"Reconstruction (1865-1896)\", \"The West (1858-1896)\",\n",
+            "        \"New Industry and a Changing Society\", \"Expansion and War\", \"The 1920s and 1930s\", \"World War II\",\n",
+            "        \"The Cold War\", \"Civil Rights and American Society\", \"America Since the 1970s\"\n",
+            "    ],\n",
+            "    7: [\n",
+            "        \"The Lost Children of Sudan (Genocide, Genocide in Sudan)\", \"Epidemics\", \"Harlem Renaissance\", \"Plastic Pollution\",\n",
+            "        \"Identity in the Middle Ages: How does society both support and limit the development of identity?\",\n",
+            "        \"Americans All: How did World War II affect individuals?\", \"Language and Power: What is the power of language?\",\n",
+            "        \"Fever: How can times of crisis affect citizens and society?\", \"Geology on Mars\", \"Plane Motion\", \"Plane Motion Engineering\",\n",
+            "        \"Rock Formations\", \"Phase Change\", \"Phase Change Engineering\", \"Chemical Reactions\", \"Populations and Resources\",\n",
+            "        \"Matter and Energy in Ecosystems\", \"Early Humans and Agricultural Revolution\", \"Fertile Crescent\",\n",
+            "        \"Ancient Egypt and Kush\", \"The Israelites\", \"Ancient Greece\", \"Ancient South Asia\", \"Early China, Korea, and Japan\",\n",
+            "        \"Ancient Rome\", \"Rise of Christian Kingdoms\", \"The Americas\", \"Medieval Europe\", \"The Rise of Islamic Empires\",\n",
+            "        \"China in the Middle Ages\", \"Korea and Japan in the Middle Ages\", \"African Civilizations\", \"New Ways of Thinking\",\n",
+            "        \"Age of Exploration and Trade\", \"Revolutions and Empires\", \"The Modern World\"\n",
+            "    ],\n",
+            "    8: [\n",
+            "        \"Folklore of Latin America\", \"Food Choices\", \"The Holocaust\", \"Japanese American Internment\",\n",
+            "        \"The Poetics and Power of Storytelling: What is the power of storytelling?\",\n",
+            "        \"The Great War: How do literature and art illuminate the effects of World War I?\", \"What Is Love?\",\n",
+            "        \"Teens as Change Agents: How do people effect social change?\", \"Harnessing Human Energy\",\n",
+            "        \"Force and Motion\", \"Force and Motion Engineering\", \"Magnetic Fields\", \"Light Waves\", \"Earth, Moon, and Sun\",\n",
+            "        \"Natural Selection\", \"Natural Selection Engineering\", \"Evolutionary History\", \"The World in Spatial Terms\",\n",
+            "        \"Places and Regions\", \"Physical Geography\", \"Population Geography\", \"Economic Geography\",\n",
+            "        \"Political Geography\", \"Human-Environment Geography\", \"What is Economics?\", \"Markets, Money, and Businesses\",\n",
+            "        \"Government and the Economy\", \"The Global Economy\"\n",
+            "    ]\n",
+            "]\n",
+            "[END TOPICS]\n",
+            "\n",
+            "Here is the text:\n",
+            "[BEGIN TEXT]\n",
+            "Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.\n",
+            "Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.\n",
+            "His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.\n",
+            "[END TEXT]\n",
+            "'''\n",
+            "llm_response = '''\n",
+            "Background Knowledge Assumption: Students are likely familiar with basic geography (e.g., continents, oceans, and major regions like Asia and Europe) and the concept of historical exploration. They are unlikely to be familiar with the Mongol Dynasty, Kublai Khan, or Marco Polo's specific travels and contributions.'''\n",
+            "\n",
+            "[cases.marco_polo_grade3.prompt_steps.vocab_complexity]\n",
+            "model = \"gemini-2.5-pro\"\n",
+            "temperature = 0.0\n",
+            "system_prompt = '''\n",
+            "\n",
+            "You are an expert curriculum designer. Your job is to rate the complexity of a text's vocabulary relative to the grade level.\n",
+            "\n",
+            "You will be given a rubric (with levels from least to most complex: slightly complex, moderately complex, very complex, exceedingly complex) as well as guidelines for interpreting the rubric.\n",
+            "IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meaning, etc.)\n",
+            "\n",
+            "**Resource 1: Qualitative Text Complexity rubric (SAP)**\n",
+            "1.  **Level 1: Slightly complex**\n",
+            "    *   Original Definition: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. A very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0.\n",
+            "    *   Summary definition: Overall, vocabulary is easy to understand and does not impede comprehension of the bulk of the text (including main idea and supporting claims). 1-2 quick pauses for processing by the student are ok here!\n",
+            "2.  **Level 2: Moderately complex**\n",
+            "    *   Original Definition: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK\n",
+            "    *   Summary definition: Overall, vocabulary generally allows students to comprehend the bulk of the text with little difficulty, though there may be occasional pauses for clarification. Several quick pauses or occasional prolonged pauses may occur.\n",
+            "3.  **Level 3: Very complex**\n",
+            "    *   Original Definition: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic\n",
+            "    *   Summary definition: Overall, vocabulary often presents challenges that may slow down comprehension but does not completely block the comprehension of the bulk of the text.\n",
+            "4.  **Level 4: Exceedingly complex**\n",
+            "    *   Original Definition: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading.\n",
+            "    *   Summary definition: Overall, vocabulary is so complex that it makes comprehension of the bulk of the text very challenging and requires careful effort to interpret.\n",
+            "\n",
+            "**Resource 2: Flesch-Kincaid Grade Level**\n",
+            "Use the Flesch-Kincaid (FK) Grade Level as light guidance of the approximate grade level based on readability. The metric alone does not provide final information of vocabulary complexity, but a ballpark of the difficulty of the entire text.\n",
+            "*   grade 2-3: 1.98-5.34\n",
+            "*   grade 4-5: 4.51-7.73\n",
+            "*   grade 6-8: 6.51-10.34\n",
+            "*   grade 9-10: 8.32-12.12\n",
+            "*   grade 11-College: 10.34-14.2\n",
+            "\n",
+            "**Guidelines for Interpretation and Reasoning**\n",
+            "\n",
+            "Your reasoning is the most critical part of your analysis. It's not enough to simply count complex words. You must analyze their impact on a student at the specified grade level. Use the following principles to guide your judgment:\n",
+            "\n",
+            "1.  **Density and Cumulative Effect:** Do not just count complex words; evaluate their concentration. A short text with a high density of challenging Tier 2 words (e.g., `peculiar`, `mischievous`, `courageous` for a 4th grader) can be more overwhelming than a longer text with a few scattered Tier 3 words. A constant barrage of unfamiliar words can elevate complexity from `very` to `exceedingly`.\n",
+            "2.  **Contextual Scaffolding:** Assess how the text supports new vocabulary.\n",
+            "    *   Are new, complex terms explicitly defined or explained with simple examples (e.g., \"volume... to see if it is big enough to hold a liter of food\")?\n",
+            "    *   Is the surrounding language simple and conversational, making the meaning of new words easier to infer?\n",
+            "    *   Strong scaffolding can lower the complexity rating. A text with many Tier 3 words that are well-explained might only be `moderately complex`.\n",
+            "3.  **Abstract vs. Concrete Vocabulary:** Differentiate between words for abstract concepts and words for concrete things. A text built on abstract Tier 2 words (e.g., `relationships`, `performance`, `non-physical`) can be more challenging than a text that introduces Tier 3 labels for concrete things or people (e.g., `Sumerians`, `polonium`).\n",
+            "4.  **Conceptual Load:** Consider the cognitive load of the vocabulary. A list of many new, multi-syllabic, conceptually-heavy terms (e.g., `Paleolithic`, `Mesolithic`, `Neolithic` for a 3rd grader) can be `very complex` even if the terms are briefly defined, because the student must process multiple new concepts at once.\n",
+            "5.  **Calibrating the Top Levels:** Be precise in your use of `very complex` vs. `exceedingly complex`.\n",
+            "    *   **Very complex:** The vocabulary creates significant hurdles and slows the reader down, but the main ideas of the text are still accessible with effort.\n",
+            "    *   **Exceedingly complex:** The vocabulary is so dense, technical, or abstract that it acts as a barrier, making it nearly impossible for the target student to grasp the bulk of the text's meaning without extensive outside help. Reserve this for texts saturated with advanced terminology.\n",
+            "6.  **Consider Background Knowledge:** Pay close attention to the provided `student_background_knowledge`. Do not classify a word as complex if the student is likely to be familiar with it (e.g., 'oxygen' for a 3rd grader who has learned about the human body).\n",
+            "\n",
+            "**Final Analysis Format**\n",
+            "\n",
+            "Provide these information as your final analysis:\n",
+            "1.  **Complex vocabulary:**\n",
+            "    *   Tier 2 words: Words that are commonly used in academic settings and more complex than colloquial, or everyday language and often have multiple meanings.\n",
+            "    *   Tier 3 words: Overly academic or domain-specific words.\n",
+            "    *   Archaic words: Words, or uses of words that are not commonly used in modern conversational language. E.g., “The jury retired to deliberate on their verdict.\" The use of “retire” to mean withdrawing to a private place is an archaic use.\n",
+            "    *   Other complex words: All other words that can increase complexity of the text (e.g., idioms, unfamiliar proper nouns that function as vocabulary).\n",
+            "2.  **Vocabulary complexity:** one of: slightly complex, moderately complex, very complex, exceedingly complex\n",
+            "3.  **Your reasoning of the complexity:** A detailed explanation of your rating, referencing the principles above.\n",
+            "'''\n",
+            "user_prompt = '''\n",
+            "\n",
+            "Below is the text you need to evaluate. Let's think step by step in order to predict the output of the vocabulary complexity task.\n",
+            "\n",
+            "- It is intended for grade 3.\n",
+            "\n",
+            "- You can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: Background Knowledge Assumption: Students are likely familiar with basic geography (e.g., continents, oceans, and major regions like Asia and Europe) and the concept of historical exploration. They are unlikely to be familiar with the Mongol Dynasty, Kublai Khan, or Marco Polo's specific travels and contributions.\n",
+            "\n",
+            "- Text Flesch-Kincaid grade level: 6.59\n",
+            "\n",
+            "- Text to evaluate: [BEGIN TEXT]\n",
+            "Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.\n",
+            "Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.\n",
+            "His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.\n",
+            "[END TEXT]\n",
+            "\n",
+            "STRICT OUTPUT FORMAT:\n",
+            "- Return only the JSON value that conforms to the schema. Do not include any additional text, explanations, headings, or separators.\n",
+            "- Do not wrap the JSON in Markdown or code fences (no ``` or ```json).\n",
+            "- Do not prepend or append any text (e.g., do not write \"Here is the JSON:\").\n",
+            "- The response must be a single top-level JSON value exactly as required by the schema (object/array/etc.), with no trailing commas or comments.\n",
+            "\n",
+            "The output should be formatted as a JSON instance that conforms to the JSON schema below.\n",
+            "\n",
+            "As an example, for the schema {\"properties\": {\"foo\": {\"title\": \"Foo\", \"description\": \"a list of strings\", \"type\": \"array\", \"items\": {\"type\": \"string\"}}}, \"required\": [\"foo\"]} the object {\"foo\": [\"bar\", \"baz\"]} is a well-formatted instance of the schema. The object {\"properties\": {\"foo\": [\"bar\", \"baz\"]}} is not well-formatted.\n",
+            "\n",
+            "Here is the output schema (shown in a code block for readability only — do not include any backticks or Markdown in your output):\n",
+            "```\n",
+            "{\"properties\": {\"tier_2_words\": {\"description\": \"List of Tier 2 words\", \"title\": \"Tier 2 Words\", \"type\": \"string\"}, \"tier_3_words\": {\"description\": \"List of Tier 3 words\", \"title\": \"Tier 3 Words\", \"type\": \"string\"}, \"archaic_words\": {\"description\": \"List of Archaic words\", \"title\": \"Archaic Words\", \"type\": \"string\"}, \"other_complex_words\": {\"description\": \"List of Other Complex words\", \"title\": \"Other Complex Words\", \"type\": \"string\"}, \"complexity_score\": {\"description\": \"the complexity of the text, one of: slightly complex, moderately complex, very complex, or exceedingly complex\", \"title\": \"Complexity Score\", \"type\": \"string\"}, \"reasoning\": {\"description\": \"your reasoning for your answer\", \"title\": \"Reasoning\", \"type\": \"string\"}}, \"required\": [\"tier_2_words\", \"tier_3_words\", \"archaic_words\", \"other_complex_words\", \"complexity_score\", \"reasoning\"]}\n",
+            "```\n",
+            "'''\n",
+            "llm_response = '''\n",
+            "```json\n",
+            "{\n",
+            "  \"tier_2_words\": \"served, nevertheless, scholars, described, treasures, wealthy, trading, survived, passengers\",\n",
+            "  \"tier_3_words\": \"Mongol Dynasty, Kublai Khan, ivory, jade, porcelain\",\n",
+            "  \"archaic_words\": \"None\",\n",
+            "  \"other_complex_words\": \"Venice, Ayas, Tabriz, Kerman, Mediterranean Sea, Far East, near East\",\n",
+            "  \"complexity_score\": \"very complex\",\n",
+            "  \"reasoning\": \"The vocabulary is rated as very complex due to the high density and conceptual load of unfamiliar terms for a 3rd grader, combined with a near-total lack of contextual scaffolding. The text introduces a large number of proper nouns that function as new vocabulary concepts (Mongol Dynasty, Kublai Khan, Venice, Ayas, Tabriz, Kerman) in a very short space. In addition, it includes several Tier 3 words for unfamiliar goods (ivory, jade, porcelain) and challenging Tier 2 words (nevertheless, scholars). These terms are presented without definition or explanation, requiring the student to process multiple new concepts simultaneously. This density of unfamiliar vocabulary will often present challenges and significantly slow down comprehension, even if the student can grasp the general story of travel and trade. The cumulative effect of these words pushes the text beyond moderately complex into the very complex category for this grade level.\"\n",
+            "}\n",
+            "```'''\n",
+            "\n",
+            "[cases.marco_polo_grade3.expected_result]\n",
+            "tier_2_words = '''\n",
+            "served, nevertheless, scholars, described, treasures, wealthy, trading, survived, passengers'''\n",
+            "tier_3_words = '''\n",
+            "Mongol Dynasty, Kublai Khan, ivory, jade, porcelain'''\n",
+            "archaic_words = '''\n",
+            "None'''\n",
+            "other_complex_words = '''\n",
+            "Venice, Ayas, Tabriz, Kerman, Mediterranean Sea, Far East, near East'''\n",
+            "complexity_score = '''\n",
+            "very complex'''\n",
+            "reasoning = '''\n",
+            "The vocabulary is rated as very complex due to the high density and conceptual load of unfamiliar terms for a 3rd grader, combined with a near-total lack of contextual scaffolding. The text introduces a large number of proper nouns that function as new vocabulary concepts (Mongol Dynasty, Kublai Khan, Venice, Ayas, Tabriz, Kerman) in a very short space. In addition, it includes several Tier 3 words for unfamiliar goods (ivory, jade, porcelain) and challenging Tier 2 words (nevertheless, scholars). These terms are presented without definition or explanation, requiring the student to process multiple new concepts simultaneously. This density of unfamiliar vocabulary will often present challenges and significantly slow down comprehension, even if the student can grasp the general story of travel and trade. The cumulative effect of these words pushes the text beyond moderately complex into the very complex category for this grade level.'''\n",
+            "\n"
+          ]
+        }
+      ],
+      "source": [
+        "reset_captures()\n",
+        "text = \"\"\"\n",
+        "Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.\n",
+        "Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.\n",
+        "His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.\n",
+        "\"\"\".strip()\n",
+        "grade_level = 3\n",
+        "result = predict_text_complexity_level(text, grade_level)\n",
+        "contract_input = {\"text\": text, \"grade\": grade_level}\n",
+        "\n",
+        "capture = capture_case(\n",
+        "    name=\"marco_polo_grade3\",\n",
+        "    description=\"Marco Polo passage, grade 3 (grades 3-4 Gemini path)\",\n",
+        "    input=contract_input,\n",
+        "    llm_call_captures=[\"background_knowledge\", \"vocab_complexity\"],\n",
+        "    expected_result=result,\n",
+        ")\n",
+        "\n",
+        "print(build_contract_toml(capture))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 11,
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "[cases.hurricanes_grade7]\n",
+            "description = \"Hurricane formation passage, grade 7 (grades 5-12 GPT path)\"\n",
+            "\n",
+            "[cases.hurricanes_grade7.input]\n",
+            "text = '''\n",
+            "Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.\n",
+            "\n",
+            "Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.'''\n",
+            "grade = 7\n",
+            "\n",
+            "[cases.hurricanes_grade7.prompt_steps.background_knowledge]\n",
+            "model = \"gpt-4o-2024-11-20\"\n",
+            "temperature = 0.0\n",
+            "system_prompt = \"\"\n",
+            "user_prompt = '''\n",
+            "\n",
+            "Review the following text, which is an educational text written for students in the following grade band: 7.\n",
+            "\n",
+            "Your job is to give me a background knowledge assumption; that is: what topics, if any, from the text students are likely to be familiar with based on a standard progression of topics in US public school education, as well as topics, if any the student is not likely to be familiar with.\n",
+            "\n",
+            "Make sure your response is concise (between 1 - 3 lines max) and is about the topics themselves, not about any other aspect of the text (e.g. flowery language, complicated sentence structure, etc.).\n",
+            "\n",
+            "Here's an example:\n",
+            "[START EXAMPLE]\n",
+            "Grade Band: 11th\n",
+            "Text: I went to the woods because I wished to live deliberately, to front only the essential facts of life, and see if I could not\n",
+            "learn what it had to teach, and not, when I came to die, discover that I had not lived. I did not wish to live what was\n",
+            "not life, living is so dear; nor did I wish to practise resignation, unless it was quite necessary. I wanted to live deep and suck out all the marrow of life, to live so sturdily and Spartan-like as to put to rout all that was not life, to cut a broad swath and shave close, to drive life into a corner, and reduce it to its lowest terms, and, if it proved to be mean, why then to get the whole and genuine meanness of it, and publish its meanness to the world; or if it were sublime, to\n",
+            "know it by experience, and be able to give a true account of it in my next excursion. For most men, it appears to me,\n",
+            "are in a strange uncertainty about it, whether it is of the devil or of God, and have somewhat hastily concluded that it\n",
+            "is the chief end of man here to “glorify God and enjoy him forever.”\n",
+            "\n",
+            "Background Knowledge Assumption: Assume they’ve studied American Transcendentalists like Thoreau and Emerson, including the mid-19th-century context of nature-focused philosophy.\n",
+            "[END EXAMPLE]\n",
+            "\n",
+            "You should assume that the student is an average US public school who is learning from common core curriculum. When you respond, just respond with the background knowledge assumption and nothing else.\n",
+            "\n",
+            "You can use the following list of topics that we know are covered for each grade level, although use your best judgement if you know there are other topics out there that students are likely to have covered. And this doesn't cover higher grade levels, so you'll have to again use your judgement for, say, what background knowledge a 9th grader is likely to have:\n",
+            "[BEGIN TOPICS]\n",
+            "[\n",
+            "    K: [\n",
+            "        \"Toys and Play\", \"Weather Wonders\", \"Trees are Alive\", \"Enjoying and Appreciating Trees\",\n",
+            "        \"The Five Senses: How do our senses help us learn?\", \"Once Upon a Farm: What makes a good story?\",\n",
+            "        \"America, Then and Now: How has life in America changed over time?\", \"The Continents: What makes the world fascinating?\",\n",
+            "        \"Needs of Plants and Animals\", \"Pushes and Pulls\", \"Sunlight and Weather\", \"Learning and Working Together\",\n",
+            "        \"How Do People Learn and Work Together?\", \"Where Do We Live?\", \"What Does it Mean to Be an American?\",\n",
+            "        \"How Has Our World Changed?\", \"Why Do People Have Jobs?\"\n",
+            "    ],\n",
+            "    1: [\n",
+            "        \"Tools and Work\", \"A Study of the Sun, Moon, and Stars\", \"Birds' Amazing Bodies\", \"Caring for Birds\",\n",
+            "        \"A World of Books: How do books change lives around the world?\", \"Creature Features: What can we discover about animals’ unique features?\",\n",
+            "        \"Powerful Forces: How do people respond to the powerful force of the wind?\", \"Cinderella Stories: Why do people around the world admire Cinderella?\",\n",
+            "        \"Animal and Plant Defenses\", \"Light and Sounds\", \"Spinning Earth\", \"Our Place in the World\",\n",
+            "        \"What Are the Rights and Responsibilities of Citizens?\", \"How Can We Describe Where We Live?\",\n",
+            "        \"How Do We Celebrate Our Country?\", \"How Does the Past Shape Our Lives?\", \"Why Do People Work?\"\n",
+            "    ],\n",
+            "    2: [\n",
+            "        \"Schools and Community\", \"Fossils Tell of Earth's Changes\", \"The Secret World of Pollination\", \"Providing for Pollinators\",\n",
+            "        \"A Season of Change: How does change impact people and nature?\", \"The American West: What was life like in the West for early Americans?\",\n",
+            "        \"Civil Rights Heroes: How can people respond to injustice?\", \"Good Eating: How does food nourish us?\",\n",
+            "        \"Plant and Animal Relationships\", \"Properties of Matter\", \"Changing Landforms\", \"Exploring Who We Are\",\n",
+            "        \"Why Is It Important to Learn About the Past?\", \"How Does Geography Help Us Understand Our World?\",\n",
+            "        \"How Do We Get What We Want and Need?\", \"Why Do We Need Government?\", \"How Can People Make a Difference in Our World?\"\n",
+            "    ],\n",
+            "    \"3\": [\n",
+            "        \"Overcoming Learning Challenges Near and Far\", \"Adaptations and the Wide World of Frogs\", \"Exploring Literary Classics\",\n",
+            "        \"Water Around the World\", \"Ocean/Sea Exploration\", \"Outer Space\", \"Immigration\", \"Art/Being an Artist\",\n",
+            "        \"Balancing Forces\", \"Inheritance and Traits\", \"Environments and Survival\", \"Weather and Climate\",\n",
+            "        \"Communities\", \"Why Does It Matter Where We Live?\", \"What Is Our Relationship With Our Environment?\",\n",
+            "        \"What Makes a Community Unique?\", \"How Does the Past Impact the Present?\", \"Why Do Governments and Citizens Need Each Other?\",\n",
+            "        \"How Do People in a Community Meet Their Wants and Needs?\"\n",
+            "    ],\n",
+            "    4: [\n",
+            "        \"Poetry\", \"Animal Defense Mechanisms\", \"The American Revolution\",\n",
+            "        \"Responding to Inequality: Ratifying the 19th Amendment (covers gender and racial inequality)\",\n",
+            "        \"A Great Heart: What does it mean to have a great heart, literally and figuratively?\",\n",
+            "        \"Extreme Settings: How does a challenging setting or physical environment change a person?\",\n",
+            "        \"American Revolution/Multiple Perspectives\", \"Myths/Myth Making\", \"Energy Conversions\", \"Vision and Light\",\n",
+            "        \"Earth's Features\", \"Waves, Energy, and Information\", \"Regions of the United States\",\n",
+            "        \"How Does America Use Its Strengths and Face Its Challenges?\", \"Why Have People Moved to and From the Northeast?\",\n",
+            "        \"How Has the Southeast Changed Over Time?\", \"How Does the Midwest Reflect the Spirit of America?\",\n",
+            "        \"How Does the Southwest Reflect Its Diverse Past and Unique Environment?\", \"What Draws People to the West?\"\n",
+            "    ],\n",
+            "    5: [\n",
+            "        \"Human Rights\", \"Biodiversity in the Rainforest\", \"Athlete Leaders of Social Change\",\n",
+            "        \"Impact of Natural Disasters\", \"Cultures in Conflict: How do cultural beliefs and values guide people?\",\n",
+            "        \"Word Play: How and why do writers play with words?\", \"A War Between Us: How did the Civil War impact people?\",\n",
+            "        \"Breaking Barriers: How can sports influence individuals and societies?\", \"Patterns of Earth and Sky\",\n",
+            "        \"Modeling Matter\", \"The Earth System\", \"Ecosystem Restoration\", \"U.S. History: Making a New Nation\",\n",
+            "        \"How Were the Lives of Native Peoples Influenced by Where They Lived?\",\n",
+            "        \"What Happened When Diverse Cultures Crossed Paths?\", \"What Is the Impact of People Settling in a New Place?\",\n",
+            "        \"Why Would a Nation Want to Become Independent?\", \"What Does the Revolutionary Era Tell Us About Our Nation Today?\",\n",
+            "        \"How Does the Constitution Help Us Understand What It Means to Be an American?\",\n",
+            "        \"What Do the Early Years of the United States Reveal About the Character of the Nation?\",\n",
+            "        \"What Was the Effect of the Civil War on U.S. Society?\"\n",
+            "    ],\n",
+            "    6: [\n",
+            "        \"Greek Mythology\", \"Critical Problems and Design Solutions\", \"American Indian Boarding Schools\",\n",
+            "        \"Remarkable Accomplishments in Space Science\", \"Resilience in the Great Depression: How can enduring tremendous hardship contribute to personal transformation?\",\n",
+            "        \"A Hero’s Journey: What is the significance and power of the hero’s journey?\",\n",
+            "        \"Narrating the Unknown: How did the social and environmental factors in the unknown world of Jamestown shape its development and decline?\",\n",
+            "        \"Courage in Crisis: How can the challenges of a hostile environment inspire heroism?\",\n",
+            "        \"Microbiome\", \"Metabolism\", \"Metabolism Engineering\", \"Traits and Reproduction\", \"Thermal Energy\",\n",
+            "        \"Ocean, Atmosphere, and Climate\", \"Weather Patterns\", \"Earth's Changing Climate\",\n",
+            "        \"Earth's Changing Climate: Engineering Internship\", \"The First Americans (up to 1492)\",\n",
+            "        \"Exploration and Colonization\", \"English Colonies\", \"American Revolution\", \"First Governments and the Constitution\",\n",
+            "        \"The Early American Republic\", \"Political and Geographic Changes (1828-1850)\", \"Life in the North and South (1820-1860)\",\n",
+            "        \"Division and Civil War (1821-1865)\", \"Reconstruction (1865-1896)\", \"The West (1858-1896)\",\n",
+            "        \"New Industry and a Changing Society\", \"Expansion and War\", \"The 1920s and 1930s\", \"World War II\",\n",
+            "        \"The Cold War\", \"Civil Rights and American Society\", \"America Since the 1970s\"\n",
+            "    ],\n",
+            "    7: [\n",
+            "        \"The Lost Children of Sudan (Genocide, Genocide in Sudan)\", \"Epidemics\", \"Harlem Renaissance\", \"Plastic Pollution\",\n",
+            "        \"Identity in the Middle Ages: How does society both support and limit the development of identity?\",\n",
+            "        \"Americans All: How did World War II affect individuals?\", \"Language and Power: What is the power of language?\",\n",
+            "        \"Fever: How can times of crisis affect citizens and society?\", \"Geology on Mars\", \"Plane Motion\", \"Plane Motion Engineering\",\n",
+            "        \"Rock Formations\", \"Phase Change\", \"Phase Change Engineering\", \"Chemical Reactions\", \"Populations and Resources\",\n",
+            "        \"Matter and Energy in Ecosystems\", \"Early Humans and Agricultural Revolution\", \"Fertile Crescent\",\n",
+            "        \"Ancient Egypt and Kush\", \"The Israelites\", \"Ancient Greece\", \"Ancient South Asia\", \"Early China, Korea, and Japan\",\n",
+            "        \"Ancient Rome\", \"Rise of Christian Kingdoms\", \"The Americas\", \"Medieval Europe\", \"The Rise of Islamic Empires\",\n",
+            "        \"China in the Middle Ages\", \"Korea and Japan in the Middle Ages\", \"African Civilizations\", \"New Ways of Thinking\",\n",
+            "        \"Age of Exploration and Trade\", \"Revolutions and Empires\", \"The Modern World\"\n",
+            "    ],\n",
+            "    8: [\n",
+            "        \"Folklore of Latin America\", \"Food Choices\", \"The Holocaust\", \"Japanese American Internment\",\n",
+            "        \"The Poetics and Power of Storytelling: What is the power of storytelling?\",\n",
+            "        \"The Great War: How do literature and art illuminate the effects of World War I?\", \"What Is Love?\",\n",
+            "        \"Teens as Change Agents: How do people effect social change?\", \"Harnessing Human Energy\",\n",
+            "        \"Force and Motion\", \"Force and Motion Engineering\", \"Magnetic Fields\", \"Light Waves\", \"Earth, Moon, and Sun\",\n",
+            "        \"Natural Selection\", \"Natural Selection Engineering\", \"Evolutionary History\", \"The World in Spatial Terms\",\n",
+            "        \"Places and Regions\", \"Physical Geography\", \"Population Geography\", \"Economic Geography\",\n",
+            "        \"Political Geography\", \"Human-Environment Geography\", \"What is Economics?\", \"Markets, Money, and Businesses\",\n",
+            "        \"Government and the Economy\", \"The Global Economy\"\n",
+            "    ]\n",
+            "]\n",
+            "[END TOPICS]\n",
+            "\n",
+            "Here is the text:\n",
+            "[BEGIN TEXT]\n",
+            "Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.\n",
+            "\n",
+            "Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.\n",
+            "[END TEXT]\n",
+            "'''\n",
+            "llm_response = '''\n",
+            "Background Knowledge Assumption: Students are likely familiar with basic weather concepts, including storms, air pressure, and the water cycle, but may not yet have detailed knowledge of hurricanes, their formation, or their global naming conventions.'''\n",
+            "\n",
+            "[cases.hurricanes_grade7.prompt_steps.vocab_complexity]\n",
+            "model = \"gpt-4.1\"\n",
+            "temperature = 0.0\n",
+            "system_prompt = '''\n",
+            "\n",
+            "You are an expert curriculum designer. Your job involves reading text snippets intended for students in K-12 and evaluating the complexity of the vocabulary in each text.\n",
+            "\n",
+            "You will be given a rubric (with options 1, 2, 3, 4) as well as guidelines for interpreting the rubric.\n",
+            "\n",
+            "IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meainng, etc.)\n",
+            "IMPORTANT: Rely on the supplied rubric and annotation guidelines along. Do not introduce any new crtieria for evaluating the complexity of a text's vocabulary.\n",
+            "\n",
+            "Please first reason out loud about the vocabulary complexity of the text and then provide an answer between 1 and 4 (whole numbers only). Provide the answer as an integer (not a float).\n",
+            "'''\n",
+            "user_prompt = '''\n",
+            "\n",
+            "Your job is to rate the complexity of a text's vocabulary (relative to the intended level of the text) according to a rubric and annotation guide. Stick to the rubric and annotation guide exactly — do not introduce any additional criteria or lenses for judging the complexity of the text.\n",
+            "\n",
+            "[BEGIN ANNOTATION GUIDE AND RUBRIC]\n",
+            "Instructions\n",
+            "For the following task, please assume that:\n",
+            "    - The student is on grade level and proficient in all core content areas, including reading fluency, comprehension, science, & social studies. (example).\n",
+            "    - The student is moving through a common progression of topics (detailed here).\n",
+            "    - The student is fluent in speaking English.\n",
+            "    - The student has an \"average\" amount of background knowledge on topics not commonly covered in curriculum.\n",
+            "    - The student will use this material for independent reading/work, without direct instruction.\n",
+            "    - The text is reasonable for the given grade level.\n",
+            "\n",
+            "Please do not consider the presence of figurative language when scoring Vocabulary. For example: with a phrase like \"kicked the bucket,\" consider only the qualities of the words themselves (\"kicked\", \"the\" and \"bucket\").\n",
+            "\n",
+            "Please do be sure to consider:\n",
+            "- all of the different types of vocabulary (listed below)\n",
+            "- the overall proportion of complex words in the text - including repeated complex words.\n",
+            "- the resulting holistic complexity of the vocabulary (described in the Summary section below).\n",
+            "\n",
+            "Level 1:\n",
+            "Rubric: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. That said, a very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0.\n",
+            "\n",
+            "Level 2:\n",
+            "Rubric: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK, but if it's very low, the text is probably level 1.\n",
+            "\n",
+            "Level 3:\n",
+            "Rubric: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic\n",
+            "\n",
+            "Level 4:\n",
+            "Rubric: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading\n",
+            "\n",
+            "And here are some relevant definitions:\n",
+            "    - Conversational: Everyday language.\n",
+            "    - Familiar: Words that the student is likely to have seen/heard, from everyday life or their curriculum. Reminder: assume an \"average\" level of background knowledge.\n",
+            "    - Unfamiliar: Words the student has probably not heard, or are being used in an unfamiliar way.\n",
+            "        - For ex: 4th graders are familiar with the word \"table\" but may not be familiar with the use of the word with respect to data (\"a table of data\").\n",
+            "        - Note:\n",
+            "            - Words with in-line definitions (via appositives, or because they can be easily inferred from other parts of the text) should be evaluated as less unfamiliar.\n",
+            "            - For ex: \"The pharaoh, a powerful ruler of ancient Egypt, was buried in a grand tomb.\"\n",
+            "                - The word \"pharaoh\" might be unfamiliar or subject-specific, but since is defined within the text, you can consider it a more familiar word.\n",
+            "        - Unfamiliar proper nouns:\n",
+            "            - A person's name, even if unfamiliar, generally does not add to complexity.\n",
+            "            - Other unfamiliar proper nouns (eg locations, organizations) do add to complexity.\n",
+            "\n",
+            "- Subject-specific: Words that are specific to a subject or field of study that are essential for understanding concepts and engaging with the content.\n",
+            "- Overly-academic: Words that are excessively formal, complex, or specialized.\n",
+            "    - For ex: \"The agrarian societal structure of the Neolithic Revolution precipitated a paradigm shift in agriculture\"\n",
+            "- Archaic: A word that was common in the past but is now rarely/almost never used. Could also be a word used in an archaic way.\n",
+            "    - For ex: \"After a long day of court proceedings, the jury 'retired' to deliberate on their verdict.\"\n",
+            "        - The word \"retire\" meaning to stop working may be familiar to a student, but \"retire\" meaning \"withdrawing to a private place\" is an archaic use.\n",
+            "\n",
+            "\n",
+            "Examples\n",
+            "The student is on-grade-level:\n",
+            "- Consider a 6th grade passage about earth systems. Per NGSS standards, students are introduced to earth systems starting in 2nd grade. They encounter words like: wind, water, river, lake, solids, and liquids. For our rating purposes, we would assume most students following 2nd have encountered these words. In 5th grade, they dive more fully into earth systems concepts, learning vocabulary words like geosphere, sediment, biosphere, atmosphere, ecosystems, organisms and climate. While rating, we would consider the words listed in the NGSS standards as more familiar following that grade level.  If the same passage were intended for 3rd graders, though, then the subject-specific vocabulary is likely to be unfamiliar.\n",
+            "\n",
+            "Figurative Language\n",
+            "- Kicked the bucket.\n",
+            "- The pen is mightier than the sword.\n",
+            "- The classroom was a zoo.\n",
+            "- He ran faster than the speed of light.\n",
+            "[END ANNOTATION GUIDE AND RUBRIC]\n",
+            "\n",
+            "Here are a couple examples of texts that have already been scored along with justification for their scores, which you can use as exemplars:\n",
+            "[BEGIN EXAMPLES]\n",
+            "\n",
+            "*** EXAMPLE 1 ***\n",
+            "The following text was intended for grade level 11 and received a complexity level of 1.\n",
+            "\n",
+            "Here is the background knowledge assumption for that text: N/A\n",
+            "\n",
+            "Here is the text:\n",
+            "// START TEXT //\n",
+            "\"In a recent lecture, \"Is Nothing Sacred?\", Salman Rushdie, one of the most censored authors of our time, talked about the importance of books. He grew up in a household in India where books were as sacred as bread. If anyone in the household dropped a piece of bread or a book, the person not only picked it up, but also kissed the object by way of apologizing for clumsy disrespect.\n",
+            "\n",
+            "He goes on to say that he had kissed many books before he had kissed a girl. Bread and books were for his household, and for many like his, food for the body and the soul. This image of the kissing of the book one had accidentally dropped made an impression on me. It speaks to the love and respect many people have for them.\n",
+            "\n",
+            "I grew up in a small town in New Mexico, and we had very few books in our household. The first one I remember reading was my catechism book. Before I went to school to learn English, my mother taught me catechism in Spanish.\n",
+            "\n",
+            "I remember the questions and answers I had to learn, and I remember the well-thumbed, frayed volume which was sacred to me.\n",
+            "\n",
+            "Growing up with few books in the house created in me a desire and a need for them. When I started school, I remember visiting the one room library of our town and standing in front of the dusty shelves. In reality there were only a few shelves and not over a thousand books, but I wanted to read them all. There was food for my soul in the books, that much I realized.\"\n",
+            "// END TEXT //\n",
+            "\n",
+            "Here is the reasoning for that complexity level:\n",
+            "// START REASONING //\n",
+            "This text is a 1 for vocabulary, because the vocabulary that is used is familiar and accessible for a proficient 11th grader. Most of the words used in the text are very common everyday vocabulary for describing growing up, family life, and the importance of reading. A few examples of these very common words are: small town, book, school, learn, food, kissed, image, respect, love, speaks. There are many more in the text. In this text there are only a few \"juicier\" or more complex words, you can think of those as words that are less familiar, have a more abstract or nuanced meaning, or carry a very large concept. Less commonly spoken words that were used in the text were: frayed, volume, censored, clumsy, sacred. These are still well within reach of a proficient 11th grader, and would still be considered familiar, because they will have encountered them in past reading or academic studies. In the text there are a couple of words that are outliers, but they are not essential to the understanding of the larger text. One of these words or hyphenated compound phrase is well-frayed. A compound phrase is a phrase consisting of multiple words that work together to create a specific meaning or idea, often acting as a single unit in a sentence. If the meaning of individual words is familiar, it is typically quite easy for proficient readers to generalize the larger meaning that the author is implying with their word choice. In this case, proficient students will be accustomed to the phrase well, with the secondary meaning of very, rather than a description of positivity or health; and they will be accustomed to the use frayed, as in worn, aged, or damaged from use. Making the leap to identify the meaning of \"well-frayed\" as a book that is very used, will take only moments for a proficient 11th grader. 
Another word that stands out in the text is the word catechism, which might be new for many students based on their personal background or location, but a full understanding of what a catechism book contains is not essential for understanding the paragraph or whole text. The reader can make it through using minimum context clues to know that the catechism must be something important to his family. The type of book he learned to read before going to school is not critical for comprehension, it's enough to understand that reading was so important in his family, his mother started instruction before he even started school. Additionally, it's important to know that having one unknown word for an 11th grade reading, does not merit a rating higher than one.\n",
+            "\n",
+            "It is worth noting that another reason this text is a 1, is that the content or topic of the passage is so familiar and covered extensively in K-12 education, i.e. reading is important, loving books, growing up; that coupled with the simple vocabulary choices, getting to the meaning of the overall text, and even the paragraphs, would be incredibly easy for a proficient 11th grader.\n",
+            "// END REASONING //\n",
+            "*** EXAMPLE 2 ***\n",
+            "The following text was intended for grade level 5 and received a complexity level of 2.\n",
+            "\n",
+            "Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with the concept of natural disasters, including hurricanes, and basic atmospheric concepts like high and low pressure from their studies on weather and climate. They may not be familiar with the specific formation processes of hurricanes or the global terminology differences (hurricane, typhoon, cyclone).\n",
+            "\n",
+            "Here is the text:\n",
+            "// START TEXT //\n",
+            "Great whirling storms roar out of the oceans in many parts of the world. They are called by several names—hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.\n",
+            "\n",
+            "Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.\n",
+            "// END TEXT //\n",
+            "\n",
+            "Here is the reasoning for that complexity level:\n",
+            "// START REASONING //\n",
+            "I scored this a 2 because of the density of subject-specific vocabulary related to weather and climate, which is often covered in lower grade levels. This adds to the complexity above a 1, but it is not a level 3 because of the familiarity with the topic, which implies some familiarity with the vocabulary as well. The specific formation process and the vocabulary used to explain the processes are also subject-specfiic but not famliar, which would make the second paragraph a level 3 in the rubric language, but when considering the language used in the overall SUMMARY below the rubric, this new content and vocabulary would cause quick pauses and/or occasional prolonged pauses but would not cause the reader to slow down to due to challenging overall comprehension of the key ideas and supporting claims. This is especially the case because the second paragraph builds upon prior knowledge and familiar vocabulary use, so it is not entirely new information and vocabulary. While there is subject-specific vocabulary used, overly academic vocabulary is NOT used and is more conversational in nature, such as \"great whiring storms\" and \"born\" / \"giving birth\" to storm  (although this is the way storms are described!) rather than more technical terms which made comprehension easier due to the accessibility of the vocabulary (even if used in other contexts before reading this text). Words such as \"a lot\" and \"bigger\" are more conversational, and while technical, unfamiliar words are provided, such as \"hurricane,\" \"typhoon,\" and \"cyclone,\" knowing and understanding their differences is not necessary to grasp the main idea. The processes by which they are formed are what need to be retained while reading the entire text, and familiarity with the bulk of the vocabulary used would allow for that to happen without too much struggle to make meaning of it. 
Additionally, the text does not contain any archaic vocabulary or ambiguous words, which prevents it from reaching a rating of 4, although it is not necessary that they text have such vocabulary to meet a level 4, the frequent inclusion of such vocabulary makes it more likely to land at least a 3 or 4.\n",
+            "// END REASONING //\n",
+            "\n",
+            "*** EXAMPLE 3 ***\n",
+            "The following text was intended for grade level 6 and received a complexity level of 3.\n",
+            "\n",
+            "Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with basic Earth science concepts such as rocks, minerals, and fossils, as well as natural processes like volcanic eruptions and earthquakes. They may not be familiar with more advanced topics like plate tectonics or the specific branches of geology such as mineralogy, petrology, and seismology.\n",
+            "\n",
+            "Here is the text:\n",
+            "// START TEXT //\n",
+            "Geology is the scientific study of Earth. Geologists study the planet—its formation, its internal structure, its materials, its chemical and physical processes, and its history. Mountains, valleys, plains, sea floors, minerals, rocks, fossils, and the processes that create and destroy each of these are all the domain of the geologist. Geology is divided into two broad categories of study: physical geology and historical geology.\n",
+            "\n",
+            "Physical geology is concerned with the processes occurring on or below the surface of Earth and the materials on which they operate. These processes include volcanic eruptions, landslides, earthquakes, and floods. Materials include rocks, air, seawater, soils, and sediment. Physical geology further divides into more specific branches, each of which deals with its own part of Earth's materials, landforms, and processes. Mineralogy and petrology investigate the composition and origin of minerals and rocks. Volcanologists study lava, rocks, and gases on live, dormant, and extinct volcanoes. Seismologists use instruments to monitor and predict earthquakes and volcanic eruptions.\n",
+            "\n",
+            "Historical geology is concerned with the chronology of events, both physical and biological, that have taken place in Earth's history. Paleontologists study fossils (remains of ancient life) for evidence of the evolution of life on Earth. Fossils not only relate evolution, but also speak of the environment in which the organism lived. Corals in rocks at the top of the Grand Canyon in Arizona, for example, show a shallow sea flooded the area around 290 million years ago. In addition, by determining the ages and types of rocks around the world, geologists piece together continental and oceanic history over the past few billion years. Plate tectonics (the study of the movement of the sections of Earth's crust) adds to Earth's story with details of the changing configuration of the continents and oceans.\n",
+            "// END TEXT //\n",
+            "\n",
+            "Here is the reasoning for that complexity level:\n",
+            "// START REASONING //\n",
+            "To determine the complexity rating of this text based on the vocabulary present, I used the annotation guide, scoring rubric, and examples to set the expectations for rating. During the first read of the text, I \"bolded\" and categorized the more challenging vocabulary words according to the following complexity groupings: archaic, unfamiliar, archaic, subject-specific, and/or overly academic. On the second read, I considered the main idea or \"gist\" that students need to acquire understanding of. I then referenced the previously mentioned tools–annotation guide, scoring rubric, and examples to remind myself of the expectations for rating.  I agreed that readers would have familiarity with basic concepts of geology; however, I also considered the definitions provided for words such as Geology, Geologists, Physical Geology, Historical Geology, Mineralogy, and Petrology. I considered how students might pause for clarification and for how long. After reviewing the Annotation Guide while considering, I narrowed the rating down because the definitions provided throughout the text of more complex words should make the meaning of the text more accessible for readers, which is why although the words are subject-specific, I rated this text as a 3 instead of a 2-less complex or a 4–more complex. I read the text one final time to ensure clarity around my rating, scored and wrote the justification.\n",
+            "// END REASONING //\n",
+            "[END EXAMPLES]\n",
+            "\n",
+            "Below is the text you need to evaluate. It is intended for grade 7.\n",
+            "\n",
+            "As you read the text, you can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: Background Knowledge Assumption: Students are likely familiar with basic weather concepts, including storms, air pressure, and the water cycle, but may not yet have detailed knowledge of hurricanes, their formation, or their global naming conventions.\n",
+            "\n",
+            "[BEGIN TEXT]\n",
+            "Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.\n",
+            "\n",
+            "Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.\n",
+            "[END TEXT]\n",
+            "\n",
+            "In your response, when specifying the level of complexity, be sure to use only a single integer (e.g. 2) and don't include any other text (e.g. don't say \"level 2\").\n",
+            "\n",
+            "STRICT OUTPUT FORMAT:\n",
+            "- Return only the JSON value that conforms to the schema. Do not include any additional text, explanations, headings, or separators.\n",
+            "- Do not wrap the JSON in Markdown or code fences (no ``` or ```json).\n",
+            "- Do not prepend or append any text (e.g., do not write \"Here is the JSON:\").\n",
+            "- The response must be a single top-level JSON value exactly as required by the schema (object/array/etc.), with no trailing commas or comments.\n",
+            "\n",
+            "The output should be formatted as a JSON instance that conforms to the JSON schema below.\n",
+            "\n",
+            "As an example, for the schema {\"properties\": {\"foo\": {\"title\": \"Foo\", \"description\": \"a list of strings\", \"type\": \"array\", \"items\": {\"type\": \"string\"}}}, \"required\": [\"foo\"]} the object {\"foo\": [\"bar\", \"baz\"]} is a well-formatted instance of the schema. The object {\"properties\": {\"foo\": [\"bar\", \"baz\"]}} is not well-formatted.\n",
+            "\n",
+            "Here is the output schema (shown in a code block for readability only — do not include any backticks or Markdown in your output):\n",
+            "```\n",
+            "{\"properties\": {\"tier_2_words\": {\"description\": \"List of Tier 2 words\", \"title\": \"Tier 2 Words\", \"type\": \"string\"}, \"tier_3_words\": {\"description\": \"List of Tier 3 words\", \"title\": \"Tier 3 Words\", \"type\": \"string\"}, \"archaic_words\": {\"description\": \"List of Archaic words\", \"title\": \"Archaic Words\", \"type\": \"string\"}, \"other_complex_words\": {\"description\": \"List of Other Complex words\", \"title\": \"Other Complex Words\", \"type\": \"string\"}, \"complexity_score\": {\"description\": \"the complexity of the text, one of: slightly complex, moderately complex, very complex, or exceedingly complex\", \"title\": \"Complexity Score\", \"type\": \"string\"}, \"reasoning\": {\"description\": \"your reasoning for your answer\", \"title\": \"Reasoning\", \"type\": \"string\"}}, \"required\": [\"tier_2_words\", \"tier_3_words\", \"archaic_words\", \"other_complex_words\", \"complexity_score\", \"reasoning\"]}\n",
+            "```\n",
+            "'''\n",
+            "llm_response = '''\n",
+            "{\"tier_2_words\": \"roar, fury, envelope, damage, mightiest, develop, match, familiar, surface, rising, sinking, pressing\", \"tier_3_words\": \"hurricane, typhoon, cyclone, atmosphere, high pressure, low pressure, tropical, moist\", \"archaic_words\": \"\", \"other_complex_words\": \"ashore\", \"complexity_score\": \"slightly complex\", \"reasoning\": \"The vocabulary in this text is mostly contemporary, conversational, and familiar for a 7th grader with the assumed background knowledge. Words like 'hurricane,' 'typhoon,' 'cyclone,' 'atmosphere,' 'high pressure,' and 'low pressure' are subject-specific (Tier 3), but these are introduced with context and are not overly academic or dense. There are a few Tier 2 words such as 'roar,' 'fury,' 'envelope,' and 'damage,' but these are likely familiar from prior curriculum or everyday language. The word 'ashore' may be less familiar, but it is not enough to raise the overall complexity. There are no archaic or overly academic words. The proportion of complex words is low, and the text is accessible for independent reading at this grade level. Therefore, the vocabulary complexity is best described as 'slightly complex,' corresponding to a rubric level 2.\"}'''\n",
+            "\n",
+            "[cases.hurricanes_grade7.expected_result]\n",
+            "tier_2_words = '''\n",
+            "roar, fury, envelope, damage, mightiest, develop, match, familiar, surface, rising, sinking, pressing'''\n",
+            "tier_3_words = '''\n",
+            "hurricane, typhoon, cyclone, atmosphere, high pressure, low pressure, tropical, moist'''\n",
+            "archaic_words = \"\"\n",
+            "other_complex_words = '''\n",
+            "ashore'''\n",
+            "complexity_score = '''\n",
+            "slightly complex'''\n",
+            "reasoning = '''\n",
+            "The vocabulary in this text is mostly contemporary, conversational, and familiar for a 7th grader with the assumed background knowledge. Words like 'hurricane,' 'typhoon,' 'cyclone,' 'atmosphere,' 'high pressure,' and 'low pressure' are subject-specific (Tier 3), but these are introduced with context and are not overly academic or dense. There are a few Tier 2 words such as 'roar,' 'fury,' 'envelope,' and 'damage,' but these are likely familiar from prior curriculum or everyday language. The word 'ashore' may be less familiar, but it is not enough to raise the overall complexity. There are no archaic or overly academic words. The proportion of complex words is low, and the text is accessible for independent reading at this grade level. Therefore, the vocabulary complexity is best described as 'slightly complex,' corresponding to a rubric level 2.'''\n",
+            "\n"
+          ]
+        }
+      ],
+      "source": [
+        "reset_captures()\n",
+        "text = \"\"\"\n",
+        "Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.\n",
+        "\n",
+        "Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.\n",
+        "\"\"\".strip()\n",
+        "grade_level = 7\n",
+        "result = predict_text_complexity_level(text, grade_level)\n",
+        "contract_input = {\"text\": text, \"grade\": grade_level}\n",
+        "\n",
+        "capture = capture_case(\n",
+        "    name=\"hurricanes_grade7\",\n",
+        "    description=\"Hurricane formation passage, grade 7 (grades 5-12 GPT path)\",\n",
+        "    input=contract_input,\n",
+        "    llm_call_captures=[\"background_knowledge\", \"vocab_complexity\"],\n",
+        "    expected_result=result,\n",
+        ")\n",
+        "\n",
+        "print(build_contract_toml(capture))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "You can copy or edit the above cell to test out different texts and grade levels."
+      ]
     }
-   },
-   "source": [
-    "### Set up the evaluator's model and prompts"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {
-      "byteLimit": 2048000,
-      "rowLimit": 10000
-     },
-     "inputWidgets": {},
-     "nuid": "eeffec80-067e-47de-a234-7dcd5078ea53",
-     "showTitle": false,
-     "tableResultSettingsMap": {},
-     "title": ""
-    }
-   },
-   "outputs": [],
-   "source": [
-    "from prompts import vocab_prompts as prompts\n",
-    "\n",
-    "# Set your api keys in your environment, .env file, or enter when prompted.\n",
-    "# os.environ['GOOGLE_API_KEY'] = 'YOUR API KEY'\n",
-    "# os.environ['OPENAI_API_KEY'] = 'YOUR API KEY'\n",
-    "load_dotenv()\n",
-    "\n",
-    "if not os.environ.get(\"OPENAI_API_KEY\"):\n",
-    "    os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter your OpenAI API key: \")\n",
-    "\n",
-    "if not os.environ.get(\"GOOGLE_API_KEY\"):\n",
-    "    os.environ[\"GOOGLE_API_KEY\"] = getpass.getpass(\"Enter your Google API key: \")\n",
-    "\n",
-    "# Grades supported by this evaluator\n",
-    "SUPPORTED_GRADES = range(3, 13)  # 3 through 12 inclusive\n",
-    "\n",
-    "VOCAB_TEMPERATURE = 0\n",
-    "# Define the model to be used for vocabulary complexity for grades 3 and 4\n",
-    "VOCAB_MODEL_GRADES_3_4 = \"gemini-2.5-pro\"\n",
-    "vocab_complexity_model_grades_3_4 = ChatGoogleGenerativeAI(\n",
-    "    model=VOCAB_MODEL_GRADES_3_4, temperature=VOCAB_TEMPERATURE\n",
-    ")\n",
-    "\n",
-    "# Define the model to be used for vocabulary complexity for all other grades\n",
-    "VOCAB_MODEL_OTHER_GRADES = \"gpt-4.1\"\n",
-    "vocab_complexity_model_other_grades = ChatOpenAI(\n",
-    "    model=VOCAB_MODEL_OTHER_GRADES, temperature=VOCAB_TEMPERATURE\n",
-    ")\n",
-    "\n",
-    "# Define the model to be used for student background knowledge generation\n",
-    "BK_MODEL = \"gpt-4o-2024-11-20\"\n",
-    "BK_TEMPERATURE = 0\n",
-    "student_bk_model = ChatOpenAI(model=BK_MODEL, temperature=BK_TEMPERATURE)\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Set up student background knowledge generator"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {
-      "byteLimit": 2048000,
-      "rowLimit": 10000
-     },
-     "inputWidgets": {},
-     "nuid": "3fc5ec95-f619-439e-9286-663f3d843502",
-     "showTitle": false,
-     "tableResultSettingsMap": {},
-     "title": ""
-    }
-   },
-   "outputs": [],
-   "source": [
-    "def get_background_knowledge_assumption(text, grade):\n",
-    "    \"\"\"Use the background knowledge prompt from the prompts file.\"\"\"\n",
-    "    prompt = prompts.bk_prompt.format(text=text, grade=grade)\n",
-    "\n",
-    "    return capture_llm(\"background_knowledge\", student_bk_model).invoke(prompt).content\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Set up the input variables and output format"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {
-      "byteLimit": 2048000,
-      "rowLimit": 10000
-     },
-     "inputWidgets": {},
-     "nuid": "04d83e90-b786-4f2d-83d6-8c69c2cf176b",
-     "showTitle": false,
-     "tableResultSettingsMap": {},
-     "title": ""
+  ],
+  "metadata": {
+    "application/vnd.databricks.v1+notebook": {
+      "computePreferences": null,
+      "dashboards": [],
+      "environmentMetadata": {
+        "base_environment": "",
+        "environment_version": "3"
+      },
+      "inputWidgetPreferences": null,
+      "language": "python",
+      "notebookMetadata": {
+        "pythonIndentUnit": 4
+      },
+      "notebookName": "(Final Ship) Vocabulary Evaluator - Sep release (Using LangChain to instrument)",
+      "widgets": {}
+    },
+    "kernelspec": {
+      "display_name": ".venv (3.14.4)",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.14.4"
     }
-   },
-   "outputs": [],
-   "source": [
-    "class Output(BaseModel):\n",
-    "    tier_2_words: str = Field(description=\"List of Tier 2 words\")\n",
-    "    tier_3_words: str = Field(description=\"List of Tier 3 words\")\n",
-    "    archaic_words: str = Field(description=\"List of Archaic words\")\n",
-    "    other_complex_words: str = Field(description=\"List of Other Complex words\")\n",
-    "    complexity_score: str = Field(\n",
-    "        description=\"the complexity of the text, one of: slightly complex, moderately complex, very complex, or exceedingly complex\"\n",
-    "    )\n",
-    "    reasoning: str = Field(description=\"your reasoning for your answer\")\n",
-    "\n",
-    "\n",
-    "prompt_vars = {\n",
-    "    \"inputVars\": [\n",
-    "        \"text\",\n",
-    "        \"student_grade_level\",\n",
-    "        \"student_background_knowledge\",\n",
-    "        \"fk_level\",\n",
-    "    ],\n",
-    "    \"outputParser\": JsonOutputParser(pydantic_object=Output),\n",
-    "}\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Helper functions"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {
-      "byteLimit": 2048000,
-      "rowLimit": 10000
-     },
-     "inputWidgets": {},
-     "nuid": "23047ec3-79b2-4e45-b452-873eae41b030",
-     "showTitle": false,
-     "tableResultSettingsMap": {},
-     "title": ""
-    }
-   },
-   "outputs": [],
-   "source": [
-    "import textwrap\n",
-    "\n",
-    "\n",
-    "def calculate_fk_score(text) -> float:\n",
-    "    \"\"\"\n",
-    "    Calculate the Flesch-Kincaid Grade Level\n",
-    "    \"\"\"\n",
-    "    fk_score = round(ts.flesch_kincaid_grade(text), 2)\n",
-    "\n",
-    "    return fk_score\n",
-    "\n",
-    "\n",
-    "def prepare_text_for_complexity_prediction(text, grade):\n",
-    "    \"\"\"\n",
-    "    Enrich the text and grade given by user with additional features for complexity prediction.\n",
-    "    \"\"\"\n",
-    "    dataset = {\n",
-    "        \"text\": text,\n",
-    "        \"student_grade_level\": grade,\n",
-    "        \"fk_level\": calculate_fk_score(text),\n",
-    "        \"student_background_knowledge\": get_background_knowledge_assumption(\n",
-    "            text, grade\n",
-    "        ),\n",
-    "    }\n",
-    "\n",
-    "    return dataset\n",
-    "\n",
-    "\n",
-    "def prettify_vocab_complexity_output(vocab_complexity_output):\n",
-    "    output = f\"\"\"\n",
-    "        ========================= Complexity Score ========================\n",
-    "        {vocab_complexity_output.get('complexity_score') or vocab_complexity_output.get('answer') or 'N/A'}\n",
-    "\n",
-    "        ========================= Complexity Score Reasoning ==============\n",
-    "        {textwrap.fill(vocab_complexity_output.get('reasoning', 'N/A'), width=80)}\n",
-    "\n",
-    "        ========================  Complex words  ==========================\n",
-    "        * Tier 2 words: {textwrap.fill(vocab_complexity_output.get('tier_2_words', 'N/A'), width=65)}\n",
-    "        * Tier 3 words: {textwrap.fill(vocab_complexity_output.get('tier_3_words', 'N/A'), width=65)}\n",
-    "        * Archaic words: {textwrap.fill(vocab_complexity_output.get('archaic_words', 'N/A'), width=65)}\n",
-    "        * Other complex words: {textwrap.fill(vocab_complexity_output.get('other_complex_words', 'N/A'), width=60)}\"\"\"\n",
-    "\n",
-    "    print(textwrap.dedent(output).strip())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def get_prompts_for_grade(grade: int) -> dict:\n",
-    "    \"\"\"\n",
-    "    Returns the appropriate SYSTEM_PROMPT and USER_PROMPT for the given grade.\n",
-    "    \n",
-    "    Args:\n",
-    "        grade: Grade level (3-12)\n",
-    "    \n",
-    "    Returns:\n",
-    "        dict with keys 'SYSTEM_PROMPT' and 'USER_PROMPT'\n",
-    "    \"\"\"\n",
-    "    if grade == 3 or grade == 4:\n",
-    "        return prompts.GRADE_SPECIFIC_PROMPTS[\"GRADES_3_4\"]\n",
-    "    else:  # 5-12\n",
-    "        return prompts.GRADE_SPECIFIC_PROMPTS[\"OTHER_GRADES\"]\n",
-    "\n",
-    "\n",
-    "def get_vocab_model_for_grade(grade: int):\n",
-    "    \"\"\"\n",
-    "    Returns the appropriate vocabulary complexity model for the given grade.\n",
-    "\n",
-    "    Grades 3 & 4 use Gemini (gemini-2.5-pro), which was validated against\n",
-    "    the GRADES_3_4 prompt. All other grades use GPT-4.1, which was validated\n",
-    "    against the OTHER_GRADES prompt.\n",
-    "\n",
-    "    Args:\n",
-    "        grade: Grade level (3-12)\n",
-    "\n",
-    "    Returns:\n",
-    "        A LangChain chat model instance\n",
-    "    \"\"\"\n",
-    "    if grade == 3 or grade == 4:\n",
-    "        return vocab_complexity_model_grades_3_4\n",
-    "    else:  # 5-12\n",
-    "        return vocab_complexity_model_other_grades\n",
-    "\n",
-    "\n",
-    "def normalize_complexity_output(output: dict) -> dict:\n",
-    "    \"\"\"\n",
-    "    Normalize complexity output to use consistent string labels.\n",
-    "    Converts integer 'answer' (from OTHER_GRADES) to string 'complexity_score'.\n",
-    "    \n",
-    "    Args:\n",
-    "        output: Raw output from the model\n",
-    "    \n",
-    "    Returns:\n",
-    "        Normalized output with 'complexity_score' field\n",
-    "    \"\"\"\n",
-    "    mapping = {\n",
-    "        1: \"Slightly Complex\",\n",
-    "        2: \"Moderately Complex\",\n",
-    "        3: \"Very Complex\",\n",
-    "        4: \"Exceedingly Complex\"\n",
-    "    }\n",
-    "\n",
-    "    # Handle 'answer' field from OTHER_GRADES (will be int or string int)\n",
-    "    if 'answer' in output:\n",
-    "        value = output['answer']\n",
-    "        # Convert int or string int to proper complexity label\n",
-    "        if isinstance(value, str) and value.isdigit():\n",
-    "            value = int(value)\n",
-    "        output['complexity_score'] = mapping.get(value, str(value))\n",
-    "    \n",
-    "    # For GRADES_3_4, complexity_score already exists as a string - no changes needed\n",
-    "\n",
-    "    return output\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def predict_text_complexity_level(text, grade):\n",
-    "    \"\"\"\n",
-    "    Predict the text complexity level as well as the complex words and reasoning.\n",
-    "\n",
-    "    Args:\n",
-    "        text: The text to evaluate.\n",
-    "        grade: Grade level. Must be between 3 and 12 inclusive.\n",
-    "\n",
-    "    Raises:\n",
-    "        ValueError: If grade is not in SUPPORTED_GRADES.\n",
-    "    \"\"\"\n",
-    "    if grade not in SUPPORTED_GRADES:\n",
-    "        raise ValueError(\n",
-    "            f\"Grade {grade} is not supported. This evaluator supports grades \"\n",
-    "            f\"{min(SUPPORTED_GRADES)}-{max(SUPPORTED_GRADES)}.\"\n",
-    "        )\n",
-    "\n",
-    "    dataset = prepare_text_for_complexity_prediction(text, grade)\n",
-    "\n",
-    "    # Get grade-specific prompts and model\n",
-    "    grade_prompts = get_prompts_for_grade(grade)\n",
-    "\n",
-    "    # Use grade-specific prompts\n",
-    "    messages = [\n",
-    "        SystemMessage(content=grade_prompts['SYSTEM_PROMPT']),\n",
-    "        HumanMessagePromptTemplate.from_template(grade_prompts['USER_PROMPT']),\n",
-    "    ]\n",
-    "\n",
-    "    # Prepare chat prompt\n",
-    "    prompt = ChatPromptTemplate(\n",
-    "        messages,\n",
-    "        input_variables=prompt_vars[\"inputVars\"],\n",
-    "        partial_variables={\n",
-    "            \"format_instructions\": prompt_vars[\"outputParser\"].get_format_instructions()\n",
-    "        },\n",
-    "    )\n",
-    "\n",
-    "    # Invoke the chain\n",
-    "    chain = prompt | capture_llm(\"vocab_complexity\", get_vocab_model_for_grade(grade)) | JsonOutputParser()\n",
-    "\n",
-    "    # Get output and normalize it\n",
-    "    output = chain.invoke(dataset)\n",
-    "    output = normalize_complexity_output(output)\n",
-    "\n",
-    "    return output\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {},
-     "inputWidgets": {},
-     "nuid": "4437f840-a00b-47cf-a7fa-86d882f0f6af",
-     "showTitle": false,
-     "tableResultSettingsMap": {},
-     "title": ""
-    }
-   },
-   "source": [
-    "# Test out examples"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {
-      "byteLimit": 2048000,
-      "rowLimit": 10000
-     },
-     "inputWidgets": {},
-     "nuid": "767fe555-ef65-46fb-8da3-54f18565cb07",
-     "showTitle": false,
-     "tableResultSettingsMap": {},
-     "title": ""
-    }
-   },
-   "outputs": [],
-   "source": [
-    "# Add your text & the grade level you want to evaluate for vocabulary complexity\n",
-    "\n",
-    "# Clear ID = 2204\n",
-    "text = \"\"\"\n",
-    "Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.\n",
-    "Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.\n",
-    "His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.\n",
-    "\"\"\"\n",
-    "\n",
-    "grade_level = 3\n",
-    "\n",
-    "vocabulary_complexity_output = predict_text_complexity_level(text, grade_level)\n",
-    "\n",
-    "# Pretty Print the output\n",
-    "prettify_vocab_complexity_output(vocabulary_complexity_output)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "reset_captures()\n",
-    "text = \"\"\"\n",
-    "Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.\n",
-    "Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.\n",
-    "His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.\n",
-    "\"\"\"\n",
-    "input = {\"text\": text, \"grade\": 3}\n",
-    "result = predict_text_complexity_level(**input)\n",
-    "\n",
-    "capture = capture_case(\n",
-    "    name=\"marco_polo_grade3\",\n",
-    "    description=\"Marco Polo passage, grade 3 (grades 3-4 Gemini path)\",\n",
-    "    input=input,\n",
-    "    llm_call_captures=[\"background_knowledge\", \"vocab_complexity\"],\n",
-    "    expected_result=result,\n",
-    ")\n",
-    "\n",
-    "print(build_contract_toml(capture))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "reset_captures()\n",
-    "text = \"\"\"\n",
-    "Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.\n",
-    "\n",
-    "Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.\n",
-    "\"\"\"\n",
-    "input = {\"text\": text, \"grade\": 7}\n",
-    "result = predict_text_complexity_level(**input)\n",
-    "\n",
-    "capture = capture_case(\n",
-    "    name=\"hurricanes_grade7\",\n",
-    "    description=\"Hurricane formation passage, grade 7 (grades 5-12 GPT path)\",\n",
-    "    input=input,\n",
-    "    llm_call_captures=[\"background_knowledge\", \"vocab_complexity\"],\n",
-    "    expected_result=result,\n",
-    ")\n",
-    "\n",
-    "print(build_contract_toml(capture))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "You can copy or edit the above cell to test out different texts and grade levels."
-   ]
-  }
- ],
- "metadata": {
-  "application/vnd.databricks.v1+notebook": {
-   "computePreferences": null,
-   "dashboards": [],
-   "environmentMetadata": {
-    "base_environment": "",
-    "environment_version": "3"
-   },
-   "inputWidgetPreferences": null,
-   "language": "python",
-   "notebookMetadata": {
-    "pythonIndentUnit": 4
-   },
-   "notebookName": "(Final Ship) Vocabulary Evaluator - Sep release (Using LangChain to instrument)",
-   "widgets": {}
-  },
-  "kernelspec": {
-   "display_name": ".venv",
-   "language": "python",
-   "name": "python3"
   },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.13.5"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 0
+  "nbformat": 4,
+  "nbformat_minor": 0
 }
diff --git a/sdks/python/scripts/capture.py b/sdks/python/scripts/capture.py
index 196d811..b2c158c 100644
--- a/sdks/python/scripts/capture.py
+++ b/sdks/python/scripts/capture.py
@@ -25,6 +25,10 @@
            description="…",                 # optional human-readable label
        )
 
+   String values in ``input`` (e.g. ``text``) are ``.strip()``'d before writing
+   TOML so contract ``[input]`` matches common evaluator behavior. Run your chains
+   on the same stripped strings so the captured ``user_prompt`` matches ``[input]``.
+
 3. Print the TOML block and paste it into ``contracts.toml`` (for example,
    ``sdks/settings/<evaluator>/contracts.toml``):
 
@@ -159,7 +163,8 @@ def capture_case(
     Args:
         name:              Case identifier used as the TOML key (e.g. ``"marco_polo_grade3"``).
         input:             The evaluator's input dict (e.g. ``{"text": ..., "grade": 4}``).
-                           Keys are written as-is to the ``[input]`` TOML section.
+                           String values are ``.strip()``'d before writing TOML;
+                           keys and non-string values are written as-is.
         llm_call_captures: Ordered list of capture prefixes to include as
                            ``prompt_steps`` in the TOML.  Must match the prefixes
                            passed to ``capture_llm()`` during this run, in call order.
@@ -185,7 +190,7 @@ def capture_case(
     """
     data: dict[str, Any] = dict(_captures)
     data["name"] = name
-    data["input"] = dict(input)
+    data["input"] = _strip_string_values(dict(input))
     data["llm_call_captures"] = llm_call_captures
     if expected_result is not None:
         # Normalise to a plain dict so capture_case() is always fully serializable.
@@ -223,6 +228,11 @@ def build_contract_toml(*cases: dict[str, Any]) -> str:
 # ---------------------------------------------------------------------------
 
 
+def _strip_string_values(inp: dict[str, Any]) -> dict[str, Any]:
+    """Return a shallow copy of *inp* with leading/trailing whitespace stripped from top-level str values."""
+    return {k: v.strip() if isinstance(v, str) else v for k, v in inp.items()}
+
+
 def _extract_text_content(content: Any) -> str:
     """Extract a plain-text string from an AIMessage content value.
 
@@ -291,9 +301,7 @@ def _build_case(c: dict[str, Any]) -> str:
     # ── input section ────────────────────────────────────────────────────────
     lines.append(f"[cases.{name}.input]")
     for field, val in c.get("input", {}).items():
-        # Do NOT strip text — stripping would make input.text differ from the
-        # text that capture_llm used when formatting the user_prompt, causing
-        # the contract test's prompt-fidelity assertion to fail.
+        # Written as-is: ``capture_case`` already ``.strip()``'d the string fields.
         lines.append(f"{field} = {_toml_value(val)}")
     lines.append("")
 
diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py b/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py
index 3f00797..cf2a308 100644
--- a/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py
+++ b/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py
@@ -4,7 +4,7 @@
 
 from typing import ClassVar
 
-import textstat
+import textstat  # type: ignore[import-untyped]
 from langchain_core.output_parsers import JsonOutputParser
 from langchain_core.prompts import ChatPromptTemplate
 from pydantic import Field
@@ -51,16 +51,16 @@ class VocabularyEvaluationInput(EvaluationInput):
 
     Example::
 
-        inp = VocabularyEvaluationInput(text="The quick brown fox...", grade_level=5)
+        inp = VocabularyEvaluationInput(text="The quick brown fox...", grade=5)
     """
 
     _input_settings: ClassVar[dict] = _INPUT_SETTINGS
 
     text: TextInputField = Field(description="The text to evaluate.")
-    grade_level: GradeInputField = Field(description="The grade level of the text (3–12).")
+    grade: GradeInputField = Field(description="The grade level of the text (3–12).")
 
-    def __init__(self, *, text: str, grade_level: int, **kwargs):
-        super().__init__(text=text, grade_level=grade_level, **kwargs)
+    def __init__(self, *, text: str, grade: int, **kwargs):
+        super().__init__(text=text, grade=grade, **kwargs)
 
 
 class VocabularyEvaluator(
@@ -95,7 +95,7 @@ def evaluate_impl(
         """Run the two-step vocabulary evaluation and return a TextComplexityResult.
 
         Grade validation is handled by the framework before this method is called:
-        ``VocabularyEvaluationInput`` automatically constrains ``grade_level`` to
+        ``VocabularyEvaluationInput`` automatically constrains ``grade`` to
         :data:`~learning_commons_evaluators.schemas.vocabulary.VOCABULARY_SUPPORTED_GRADES`
         (3–12), so ``BaseEvaluator.evaluate`` raises before reaching here for
         unsupported grades.
@@ -105,7 +105,7 @@ def evaluate_impl(
         ps_og = evaluation_settings.prompt_settings_step_vocab_other_grades
         assert ps_bk is not None and ps_34 is not None and ps_og is not None
 
-        grade = input.grade_level.value
+        grade = input.grade.value
         fk_score = round(textstat.flesch_kincaid_grade(input.text.value), 2)
         prompts = _VOCABULARY_CONFIG.prompts
 
diff --git a/sdks/python/src/learning_commons_evaluators/settings/_generated_vocabulary_settings.py b/sdks/python/src/learning_commons_evaluators/settings/_generated_vocabulary_settings.py
index ee4650c..82562f8 100644
--- a/sdks/python/src/learning_commons_evaluators/settings/_generated_vocabulary_settings.py
+++ b/sdks/python/src/learning_commons_evaluators/settings/_generated_vocabulary_settings.py
@@ -24,8 +24,8 @@
             name='text',
             description='The text to evaluate for vocabulary complexity.',
         ),
-        'grade_level': GradeInputSpec(
-            name='grade_level',
+        'grade': GradeInputSpec(
+            name='grade',
             description='The grade level of the text (3–12).',
             allowed_grades=[3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
         ),
diff --git a/sdks/python/src/learning_commons_evaluators/settings/vocabulary/contracts.toml b/sdks/python/src/learning_commons_evaluators/settings/vocabulary/contracts.toml
index d1c55a8..a734162 100644
--- a/sdks/python/src/learning_commons_evaluators/settings/vocabulary/contracts.toml
+++ b/sdks/python/src/learning_commons_evaluators/settings/vocabulary/contracts.toml
@@ -42,12 +42,10 @@ description = "Marco Polo passage, grade 3 (grades 3-4 Gemini path)"
 
 [cases.marco_polo_grade3.input]
 text = '''
-
 Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.
 Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.
-His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.
-'''
-grade_level = 3
+His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.'''
+grade = 3
 
 [cases.marco_polo_grade3.prompt_steps.background_knowledge]
 model = "gpt-4o-2024-11-20"
@@ -178,15 +176,13 @@ You can use the following list of topics that we know are covered for each grade
 
 Here is the text:
 [BEGIN TEXT]
-
 Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.
 Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.
 His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.
-
 [END TEXT]
 '''
 llm_response = '''
-Background Knowledge Assumption: Students are likely familiar with basic geography (e.g., continents, oceans, and major regions like Asia and Europe) and the concept of trade. They are unlikely to be familiar with Marco Polo, the Mongol Dynasty, Kublai Khan, or the historical origins of pasta.'''
+Background Knowledge Assumption: Students are likely familiar with basic geography (e.g., continents, oceans, and major regions like Asia and Europe) and the concept of historical exploration. They are unlikely to be familiar with the Mongol Dynasty, Kublai Khan, or Marco Polo's specific travels and contributions.'''
 
 [cases.marco_polo_grade3.prompt_steps.vocab_complexity]
 model = "gemini-2.5-pro"
@@ -253,16 +249,14 @@ Below is the text you need to evaluate. Let's think step by step in order to pre
 
 - It is intended for grade 3.
 
-- You can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: Background Knowledge Assumption: Students are likely familiar with basic geography (e.g., continents, oceans, and major regions like Asia and Europe) and the concept of trade. They are unlikely to be familiar with Marco Polo, the Mongol Dynasty, Kublai Khan, or the historical origins of pasta.
+- You can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: Background Knowledge Assumption: Students are likely familiar with basic geography (e.g., continents, oceans, and major regions like Asia and Europe) and the concept of historical exploration. They are unlikely to be familiar with the Mongol Dynasty, Kublai Khan, or Marco Polo's specific travels and contributions.
 
 - Text Flesch-Kincaid grade level: 6.59
 
 - Text to evaluate: [BEGIN TEXT]
-
 Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.
 Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.
 His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.
-
 [END TEXT]
 
 STRICT OUTPUT FORMAT:
@@ -283,27 +277,29 @@ Here is the output schema (shown in a code block for readability only — do not
 llm_response = '''
 ```json
 {
-  "tier_2_words": "served, nevertheless, survived, scholars, described, treasures, wealthy",
-  "tier_3_words": "Mongol Dynasty, Kublai Khan, ivory, jade, porcelain, silk",
-  "archaic_words": "",
-  "other_complex_words": "Venice, Ayas, Tabriz, Kerman, Beijing, Far East, near East",
+  "tier_2_words": "served, nevertheless, scholars, described, treasures, wealthy, trading, survived, passengers",
+  "tier_3_words": "Mongol Dynasty, Kublai Khan, ivory, jade, porcelain",
+  "archaic_words": "None",
+  "other_complex_words": "Venice, Ayas, Tabriz, Kerman, Mediterranean Sea, Far East, near East",
   "complexity_score": "very complex",
-  "reasoning": "The vocabulary is rated as 'very complex' primarily due to the high density and conceptual load of unfamiliar proper nouns and domain-specific terms, presented with no contextual scaffolding. For a 3rd grader, the text introduces a large number of new names in rapid succession (Mongol Dynasty, Venice, Ayas, Tabriz, Kerman, Beijing, Kublai Khan) which function as challenging vocabulary. Additionally, the text lists specific, unfamiliar goods (ivory, jade, porcelain, silk) without any description. This barrage of new, undefined terms creates a significant cognitive load that would slow comprehension considerably, even if the student can grasp the basic gist of the story. While the main idea might be accessible, understanding the details of the journey and its significance is made very challenging by the vocabulary, aligning with the 'very complex' rating."
+  "reasoning": "The vocabulary is rated as very complex due to the high density and conceptual load of unfamiliar terms for a 3rd grader, combined with a near-total lack of contextual scaffolding. The text introduces a large number of proper nouns that function as new vocabulary concepts (Mongol Dynasty, Kublai Khan, Venice, Ayas, Tabriz, Kerman) in a very short space. In addition, it includes several Tier 3 words for unfamiliar goods (ivory, jade, porcelain) and challenging Tier 2 words (nevertheless, scholars). These terms are presented without definition or explanation, requiring the student to process multiple new concepts simultaneously. This density of unfamiliar vocabulary will often present challenges and significantly slow down comprehension, even if the student can grasp the general story of travel and trade. The cumulative effect of these words pushes the text beyond moderately complex into the very complex category for this grade level."
 }
 ```'''
 
 [cases.marco_polo_grade3.expected_result]
 tier_2_words = '''
-served, nevertheless, survived, scholars, described, treasures, wealthy'''
+served, nevertheless, scholars, described, treasures, wealthy, trading, survived, passengers'''
 tier_3_words = '''
-Mongol Dynasty, Kublai Khan, ivory, jade, porcelain, silk'''
-archaic_words = ""
+Mongol Dynasty, Kublai Khan, ivory, jade, porcelain'''
+archaic_words = '''
+None'''
 other_complex_words = '''
-Venice, Ayas, Tabriz, Kerman, Beijing, Far East, near East'''
+Venice, Ayas, Tabriz, Kerman, Mediterranean Sea, Far East, near East'''
 complexity_score = '''
 very complex'''
 reasoning = '''
-The vocabulary is rated as 'very complex' primarily due to the high density and conceptual load of unfamiliar proper nouns and domain-specific terms, presented with no contextual scaffolding. For a 3rd grader, the text introduces a large number of new names in rapid succession (Mongol Dynasty, Venice, Ayas, Tabriz, Kerman, Beijing, Kublai Khan) which function as challenging vocabulary. Additionally, the text lists specific, unfamiliar goods (ivory, jade, porcelain, silk) without any description. This barrage of new, undefined terms creates a significant cognitive load that would slow comprehension considerably, even if the student can grasp the basic gist of the story. While the main idea might be accessible, understanding the details of the journey and its significance is made very challenging by the vocabulary, aligning with the 'very complex' rating.'''
+The vocabulary is rated as very complex due to the high density and conceptual load of unfamiliar terms for a 3rd grader, combined with a near-total lack of contextual scaffolding. The text introduces a large number of proper nouns that function as new vocabulary concepts (Mongol Dynasty, Kublai Khan, Venice, Ayas, Tabriz, Kerman) in a very short space. In addition, it includes several Tier 3 words for unfamiliar goods (ivory, jade, porcelain) and challenging Tier 2 words (nevertheless, scholars). These terms are presented without definition or explanation, requiring the student to process multiple new concepts simultaneously. This density of unfamiliar vocabulary will often present challenges and significantly slow down comprehension, even if the student can grasp the general story of travel and trade. The cumulative effect of these words pushes the text beyond moderately complex into the very complex category for this grade level.'''
+
 
 # ===========================================================================
 # Case 2: hurricanes_grade7  (grades 5–12 path)
@@ -314,12 +310,10 @@ description = "Hurricane formation passage, grade 7 (grades 5-12 GPT path)"
 
 [cases.hurricanes_grade7.input]
 text = '''
-
 Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
 
-Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.
-'''
-grade_level = 7
+Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.'''
+grade = 7
 
 [cases.hurricanes_grade7.prompt_steps.background_knowledge]
 model = "gpt-4o-2024-11-20"
@@ -450,15 +444,13 @@ You can use the following list of topics that we know are covered for each grade
 
 Here is the text:
 [BEGIN TEXT]
-
 Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
 
 Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.
-
 [END TEXT]
 '''
 llm_response = '''
-Background Knowledge Assumption: Students are likely familiar with basic weather concepts, including storms, atmospheric pressure, and the water cycle, but may not yet have detailed knowledge of hurricanes or the specific mechanisms of their formation.'''
+Background Knowledge Assumption: Students are likely familiar with basic weather concepts, including storms, air pressure, and the water cycle, but may not yet have detailed knowledge of hurricanes, their formation, or their global naming conventions.'''
 
 [cases.hurricanes_grade7.prompt_steps.vocab_complexity]
 model = "gpt-4.1"
@@ -605,14 +597,12 @@ To determine the complexity rating of this text based on the vocabulary present,
 
 Below is the text you need to evaluate. It is intended for grade 7.
 
-As you read the text, you can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: Background Knowledge Assumption: Students are likely familiar with basic weather concepts, including storms, atmospheric pressure, and the water cycle, but may not yet have detailed knowledge of hurricanes or the specific mechanisms of their formation.
+As you read the text, you can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: Background Knowledge Assumption: Students are likely familiar with basic weather concepts, including storms, air pressure, and the water cycle, but may not yet have detailed knowledge of hurricanes, their formation, or their global naming conventions.
 
 [BEGIN TEXT]
-
 Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
 
 Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.
-
 [END TEXT]
 
 In your response, when specifying the level of complexity, be sure to use only a single integer (e.g. 2) and don't include any other text (e.g. don't say "level 2").
@@ -633,16 +623,17 @@ Here is the output schema (shown in a code block for readability only — do not
 ```
 '''
 llm_response = '''
-{"tier_2_words": "roar, fury, envelope, damage, match, mightiest, rising, sinking, presses, surrounds, develop, feeding, cover", "tier_3_words": "hurricane, typhoon, cyclone, atmosphere, high pressure, low pressure, tropical, moist, ashore", "archaic_words": "", "other_complex_words": "", "complexity_score": "slightly complex", "reasoning": "The vocabulary in this text is mostly familiar and conversational for a 7th grader with the assumed background knowledge. Words like 'roar,' 'damage,' 'develop,' and 'feeding' are Tier 2 but are common in both everyday and academic contexts. The subject-specific (Tier 3) words include 'hurricane,' 'typhoon,' 'cyclone,' 'atmosphere,' 'high pressure,' 'low pressure,' and 'tropical.' However, these are either defined in context or are likely to be familiar due to prior exposure in science classes. There are no archaic or overly academic words. The proportion of subject-specific vocabulary is low to moderate, but not enough to make the text 'often complex.' The overall vocabulary is accessible, with only a few terms that might require brief pauses for clarification. Therefore, the text fits best at Level 2: mostly not complex, with a low proportion of subject-specific words."}'''
+{"tier_2_words": "roar, fury, envelope, damage, mightiest, develop, match, familiar, surface, rising, sinking, pressing", "tier_3_words": "hurricane, typhoon, cyclone, atmosphere, high pressure, low pressure, tropical, moist", "archaic_words": "", "other_complex_words": "ashore", "complexity_score": "slightly complex", "reasoning": "The vocabulary in this text is mostly contemporary, conversational, and familiar for a 7th grader with the assumed background knowledge. Words like 'hurricane,' 'typhoon,' 'cyclone,' 'atmosphere,' 'high pressure,' and 'low pressure' are subject-specific (Tier 3), but these are introduced with context and are not overly academic or dense. There are a few Tier 2 words such as 'roar,' 'fury,' 'envelope,' and 'damage,' but these are likely familiar from prior curriculum or everyday language. The word 'ashore' may be less familiar, but it is not enough to raise the overall complexity. There are no archaic or overly academic words. The proportion of complex words is low, and the text is accessible for independent reading at this grade level. Therefore, the vocabulary complexity is best described as 'slightly complex,' corresponding to a rubric level 2."}'''
 
 [cases.hurricanes_grade7.expected_result]
 tier_2_words = '''
-roar, fury, envelope, damage, match, mightiest, rising, sinking, presses, surrounds, develop, feeding, cover'''
+roar, fury, envelope, damage, mightiest, develop, match, familiar, surface, rising, sinking, pressing'''
 tier_3_words = '''
-hurricane, typhoon, cyclone, atmosphere, high pressure, low pressure, tropical, moist, ashore'''
+hurricane, typhoon, cyclone, atmosphere, high pressure, low pressure, tropical, moist'''
 archaic_words = ""
-other_complex_words = ""
+other_complex_words = '''
+ashore'''
 complexity_score = '''
 slightly complex'''
 reasoning = '''
-The vocabulary in this text is mostly familiar and conversational for a 7th grader with the assumed background knowledge. Words like 'roar,' 'damage,' 'develop,' and 'feeding' are Tier 2 but are common in both everyday and academic contexts. The subject-specific (Tier 3) words include 'hurricane,' 'typhoon,' 'cyclone,' 'atmosphere,' 'high pressure,' 'low pressure,' and 'tropical.' However, these are either defined in context or are likely to be familiar due to prior exposure in science classes. There are no archaic or overly academic words. The proportion of subject-specific vocabulary is low to moderate, but not enough to make the text 'often complex.' The overall vocabulary is accessible, with only a few terms that might require brief pauses for clarification. Therefore, the text fits best at Level 2: mostly not complex, with a low proportion of subject-specific words.'''
+The vocabulary in this text is mostly contemporary, conversational, and familiar for a 7th grader with the assumed background knowledge. Words like 'hurricane,' 'typhoon,' 'cyclone,' 'atmosphere,' 'high pressure,' and 'low pressure' are subject-specific (Tier 3), but these are introduced with context and are not overly academic or dense. There are a few Tier 2 words such as 'roar,' 'fury,' 'envelope,' and 'damage,' but these are likely familiar from prior curriculum or everyday language. The word 'ashore' may be less familiar, but it is not enough to raise the overall complexity. There are no archaic or overly academic words. The proportion of complex words is low, and the text is accessible for independent reading at this grade level. Therefore, the vocabulary complexity is best described as 'slightly complex,' corresponding to a rubric level 2.'''
diff --git a/sdks/python/tests/contract_tests/test_vocabulary.py b/sdks/python/tests/contract_tests/test_vocabulary.py
index 23b03f1..1cc5baf 100644
--- a/sdks/python/tests/contract_tests/test_vocabulary.py
+++ b/sdks/python/tests/contract_tests/test_vocabulary.py
@@ -21,7 +21,11 @@
 -----------------------------
 Run the notebook ``evals/vocabulary_evaluator.ipynb`` with valid API keys.
 The final cells output a TOML block; paste it into ``contracts.toml``
-replacing the placeholder values.
+replacing the placeholder values (canonical copy under ``sdks/settings/vocabulary/``).
+
+Then run ``make sync-settings`` from ``sdks/python`` so the bundled copy at
+``src/learning_commons_evaluators/settings/vocabulary/contracts.toml`` matches it;
+contract tests read the bundled copy, so editing ``sdks/settings/`` alone is not enough.
 
 IMPORTANT: ``system_prompt = ""`` for the background_knowledge step is the
 correct value (the prompt has no system message), not a placeholder. Only
@@ -60,7 +64,7 @@ def test_marco_polo_grade3(self) -> None:
         evaluator = VocabularyEvaluator(config)
         inp = VocabularyEvaluationInput(
             text=case.input["text"],
-            grade_level=case.input["grade_level"],
+            grade=case.input["grade"],
         )
 
         with ContractTestHarness(case) as harness:
@@ -104,7 +108,7 @@ def test_hurricanes_grade7(self) -> None:
         evaluator = VocabularyEvaluator(config)
         inp = VocabularyEvaluationInput(
             text=case.input["text"],
-            grade_level=case.input["grade_level"],
+            grade=case.input["grade"],
         )
 
         with ContractTestHarness(case) as harness:
diff --git a/sdks/python/tests/contract_tests/vocabulary.py b/sdks/python/tests/contract_tests/vocabulary.py
index dffd643..ada2fb0 100644
--- a/sdks/python/tests/contract_tests/vocabulary.py
+++ b/sdks/python/tests/contract_tests/vocabulary.py
@@ -59,7 +59,7 @@ def _placeholder_metadata() -> EvaluationMetadata:
             id="vocabulary",
             version="0.1",
             name="Vocabulary",
-            description="",
+            description="Contract test placeholder.",
             maturity=EvaluatorMaturity.alpha,
         ),
         evaluation_settings=None,  # type: ignore[arg-type]
diff --git a/sdks/python/tests/evaluators/test_vocabulary.py b/sdks/python/tests/evaluators/test_vocabulary.py
index a9ac041..5d257d0 100644
--- a/sdks/python/tests/evaluators/test_vocabulary.py
+++ b/sdks/python/tests/evaluators/test_vocabulary.py
@@ -82,7 +82,7 @@ class TestVocabularyEvaluatorGrades34:
     def test_evaluate_grade_3_returns_result(self):
         config = create_config_no_telemetry()
         evaluator = VocabularyEvaluator(config)
-        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=3)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=3)
         with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_grades34_output()):
             result = evaluator.evaluate(inp)
 
@@ -94,7 +94,7 @@ def test_evaluate_grade_3_returns_result(self):
     def test_evaluate_grade_4_returns_result(self):
         config = create_config_no_telemetry()
         evaluator = VocabularyEvaluator(config)
-        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=4)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=4)
         with _patch_steps(
             evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_grades34_output("very_complex")
         ):
@@ -106,7 +106,7 @@ def test_grades34_score_with_spaces_is_normalised(self):
         """The grades 3–4 prompt may return "slightly complex" (spaces); normalise to underscores."""
         config = create_config_no_telemetry()
         evaluator = VocabularyEvaluator(config)
-        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=3)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=3)
         # The evaluator calls .lower().replace(" ", "_") before from_score(),
         # so we verify space-separated scores survive the normalisation path.
         output = _make_grades34_output("slightly_complex")
@@ -119,7 +119,7 @@ def test_grades34_score_with_spaces_is_normalised(self):
     def test_evaluate_grades34_explanation_has_word_breakdown(self):
         config = create_config_no_telemetry()
         evaluator = VocabularyEvaluator(config)
-        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=3)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=3)
         with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_grades34_output()):
             result = evaluator.evaluate(inp)
 
@@ -147,7 +147,7 @@ def test_all_complexity_scores_map_correctly(self, score_label, expected_score):
         """Each complexity label (passed as convenience int 1–4) maps to the right score."""
         config = create_config_no_telemetry()
         evaluator = VocabularyEvaluator(config)
-        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=7)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=7)
         with _patch_steps(
             evaluator,
             _MOCK_BACKGROUND_KNOWLEDGE,
@@ -160,7 +160,7 @@ def test_all_complexity_scores_map_correctly(self, score_label, expected_score):
     def test_evaluate_grade_12_returns_result(self):
         config = create_config_no_telemetry()
         evaluator = VocabularyEvaluator(config)
-        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=12)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=12)
         with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_other_grades_output(1)):
             result = evaluator.evaluate(inp)
 
@@ -176,7 +176,7 @@ def test_other_grades_explanation_details_is_empty(self):
         """
         config = create_config_no_telemetry()
         evaluator = VocabularyEvaluator(config)
-        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=8)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=8)
         with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_other_grades_output(2)):
             result = evaluator.evaluate(inp)
 
@@ -189,15 +189,15 @@ def test_other_grades_explanation_details_is_empty(self):
 class TestVocabularyEvaluationInputValidation:
     def test_allowed_grades_set_from_toml(self):
         """VocabularyEvaluationInput picks up allowed_grades from the TOML spec."""
-        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=5)
-        assert set(inp.grade_level.spec.allowed_grades) == VOCABULARY_SUPPORTED_GRADES
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=5)
+        assert set(inp.grade.spec.allowed_grades) == VOCABULARY_SUPPORTED_GRADES
 
     @pytest.mark.parametrize("unsupported_grade", [0, 1, 2])
     def test_unsupported_grade_raises_via_framework(self, unsupported_grade):
         """BaseEvaluator.evaluate() calls input.validate(), which catches the bad grade."""
         config = create_config_no_telemetry()
         evaluator = VocabularyEvaluator(config)
-        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=unsupported_grade)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=unsupported_grade)
         # The base evaluator catches the ValidationError, sets status=failed, then re-raises.
         with pytest.raises(ValidationError):
             evaluator.evaluate(inp)
@@ -206,7 +206,7 @@ def test_unsupported_grade_sets_status_failed(self):
         """Metadata status is set to failed when grade validation fails."""
         config = create_config_no_telemetry()
         evaluator = VocabularyEvaluator(config)
-        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=2)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=2)
         with pytest.raises(ValidationError):
             evaluator.evaluate(inp)
 
@@ -230,7 +230,7 @@ def test_default_settings_has_all_prompt_steps(self):
     def test_evaluate_succeeds_and_records_metadata(self):
         config = create_config_no_telemetry()
         evaluator = VocabularyEvaluator(config)
-        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=5)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=5)
         with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_other_grades_output(2)):
             result = evaluator.evaluate(inp)
 
@@ -251,10 +251,10 @@ def test_missing_text_spec_raises_configuration_error(self, monkeypatch):
         """If 'text' is absent from _input_settings, ConfigurationError is raised immediately."""
         monkeypatch.setattr(VocabularyEvaluationInput, "_input_settings", {})
         with pytest.raises(ConfigurationError, match="'text'"):
-            VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=5)
+            VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=5)
 
-    def test_missing_grade_level_spec_raises_configuration_error(self, monkeypatch):
-        """If 'grade_level' is absent from _input_settings, ConfigurationError is raised."""
+    def test_missing_grade_spec_raises_configuration_error(self, monkeypatch):
+        """If 'grade' is absent from _input_settings, ConfigurationError is raised."""
         from learning_commons_evaluators.schemas.input_specs import TextInputSpec
 
         monkeypatch.setattr(
@@ -262,8 +262,8 @@ def test_missing_grade_level_spec_raises_configuration_error(self, monkeypatch):
             "_input_settings",
             {"text": TextInputSpec(name="text")},
         )
-        with pytest.raises(ConfigurationError, match="'grade_level'"):
-            VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=5)
+        with pytest.raises(ConfigurationError, match="'grade'"):
+            VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=5)
 
     def test_wrong_text_spec_type_raises_configuration_error(self, monkeypatch):
         """If the 'text' spec has the wrong type, ConfigurationError names the type mismatch."""
@@ -275,10 +275,10 @@ def test_wrong_text_spec_type_raises_configuration_error(self, monkeypatch):
             {"text": GradeInputSpec(name="text")},
         )
         with pytest.raises(ConfigurationError, match="TextInputSpec"):
-            VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=5)
+            VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=5)
 
-    def test_wrong_grade_level_spec_type_raises_configuration_error(self, monkeypatch):
-        """If the 'grade_level' spec has the wrong type, ConfigurationError names the mismatch."""
+    def test_wrong_grade_spec_type_raises_configuration_error(self, monkeypatch):
+        """If the 'grade' spec has the wrong type, ConfigurationError names the mismatch."""
         from learning_commons_evaluators.schemas.input_specs import TextInputSpec
 
         monkeypatch.setattr(
@@ -286,8 +286,8 @@ def test_wrong_grade_level_spec_type_raises_configuration_error(self, monkeypatc
             "_input_settings",
             {
                 "text": TextInputSpec(name="text"),
-                "grade_level": TextInputSpec(name="grade_level"),  # wrong type
+                "grade": TextInputSpec(name="grade"),  # wrong type
             },
         )
         with pytest.raises(ConfigurationError, match="GradeInputSpec"):
-            VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade_level=5)
+            VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=5)
diff --git a/sdks/settings/vocabulary/contracts.toml b/sdks/settings/vocabulary/contracts.toml
index d1c55a8..a734162 100644
--- a/sdks/settings/vocabulary/contracts.toml
+++ b/sdks/settings/vocabulary/contracts.toml
@@ -42,12 +42,10 @@ description = "Marco Polo passage, grade 3 (grades 3-4 Gemini path)"
 
 [cases.marco_polo_grade3.input]
 text = '''
-
 Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.
 Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.
-His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.
-'''
-grade_level = 3
+His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.'''
+grade = 3
 
 [cases.marco_polo_grade3.prompt_steps.background_knowledge]
 model = "gpt-4o-2024-11-20"
@@ -178,15 +176,13 @@ You can use the following list of topics that we know are covered for each grade
 
 Here is the text:
 [BEGIN TEXT]
-
 Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.
 Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.
 His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.
-
 [END TEXT]
 '''
 llm_response = '''
-Background Knowledge Assumption: Students are likely familiar with basic geography (e.g., continents, oceans, and major regions like Asia and Europe) and the concept of trade. They are unlikely to be familiar with Marco Polo, the Mongol Dynasty, Kublai Khan, or the historical origins of pasta.'''
+Background Knowledge Assumption: Students are likely familiar with basic geography (e.g., continents, oceans, and major regions like Asia and Europe) and the concept of historical exploration. They are unlikely to be familiar with the Mongol Dynasty, Kublai Khan, or Marco Polo's specific travels and contributions.'''
 
 [cases.marco_polo_grade3.prompt_steps.vocab_complexity]
 model = "gemini-2.5-pro"
@@ -253,16 +249,14 @@ Below is the text you need to evaluate. Let's think step by step in order to pre
 
 - It is intended for grade 3.
 
-- You can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: Background Knowledge Assumption: Students are likely familiar with basic geography (e.g., continents, oceans, and major regions like Asia and Europe) and the concept of trade. They are unlikely to be familiar with Marco Polo, the Mongol Dynasty, Kublai Khan, or the historical origins of pasta.
+- You can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: Background Knowledge Assumption: Students are likely familiar with basic geography (e.g., continents, oceans, and major regions like Asia and Europe) and the concept of historical exploration. They are unlikely to be familiar with the Mongol Dynasty, Kublai Khan, or Marco Polo's specific travels and contributions.
 
 - Text Flesch-Kincaid grade level: 6.59
 
 - Text to evaluate: [BEGIN TEXT]
-
 Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.
 Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.
 His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.
-
 [END TEXT]
 
 STRICT OUTPUT FORMAT:
@@ -283,27 +277,29 @@ Here is the output schema (shown in a code block for readability only — do not
 llm_response = '''
 ```json
 {
-  "tier_2_words": "served, nevertheless, survived, scholars, described, treasures, wealthy",
-  "tier_3_words": "Mongol Dynasty, Kublai Khan, ivory, jade, porcelain, silk",
-  "archaic_words": "",
-  "other_complex_words": "Venice, Ayas, Tabriz, Kerman, Beijing, Far East, near East",
+  "tier_2_words": "served, nevertheless, scholars, described, treasures, wealthy, trading, survived, passengers",
+  "tier_3_words": "Mongol Dynasty, Kublai Khan, ivory, jade, porcelain",
+  "archaic_words": "None",
+  "other_complex_words": "Venice, Ayas, Tabriz, Kerman, Mediterranean Sea, Far East, near East",
   "complexity_score": "very complex",
-  "reasoning": "The vocabulary is rated as 'very complex' primarily due to the high density and conceptual load of unfamiliar proper nouns and domain-specific terms, presented with no contextual scaffolding. For a 3rd grader, the text introduces a large number of new names in rapid succession (Mongol Dynasty, Venice, Ayas, Tabriz, Kerman, Beijing, Kublai Khan) which function as challenging vocabulary. Additionally, the text lists specific, unfamiliar goods (ivory, jade, porcelain, silk) without any description. This barrage of new, undefined terms creates a significant cognitive load that would slow comprehension considerably, even if the student can grasp the basic gist of the story. While the main idea might be accessible, understanding the details of the journey and its significance is made very challenging by the vocabulary, aligning with the 'very complex' rating."
+  "reasoning": "The vocabulary is rated as very complex due to the high density and conceptual load of unfamiliar terms for a 3rd grader, combined with a near-total lack of contextual scaffolding. The text introduces a large number of proper nouns that function as new vocabulary concepts (Mongol Dynasty, Kublai Khan, Venice, Ayas, Tabriz, Kerman) in a very short space. In addition, it includes several Tier 3 words for unfamiliar goods (ivory, jade, porcelain) and challenging Tier 2 words (nevertheless, scholars). These terms are presented without definition or explanation, requiring the student to process multiple new concepts simultaneously. This density of unfamiliar vocabulary will often present challenges and significantly slow down comprehension, even if the student can grasp the general story of travel and trade. The cumulative effect of these words pushes the text beyond moderately complex into the very complex category for this grade level."
 }
 ```'''
 
 [cases.marco_polo_grade3.expected_result]
 tier_2_words = '''
-served, nevertheless, survived, scholars, described, treasures, wealthy'''
+served, nevertheless, scholars, described, treasures, wealthy, trading, survived, passengers'''
 tier_3_words = '''
-Mongol Dynasty, Kublai Khan, ivory, jade, porcelain, silk'''
-archaic_words = ""
+Mongol Dynasty, Kublai Khan, ivory, jade, porcelain'''
+archaic_words = '''
+None'''
 other_complex_words = '''
-Venice, Ayas, Tabriz, Kerman, Beijing, Far East, near East'''
+Venice, Ayas, Tabriz, Kerman, Mediterranean Sea, Far East, near East'''
 complexity_score = '''
 very complex'''
 reasoning = '''
-The vocabulary is rated as 'very complex' primarily due to the high density and conceptual load of unfamiliar proper nouns and domain-specific terms, presented with no contextual scaffolding. For a 3rd grader, the text introduces a large number of new names in rapid succession (Mongol Dynasty, Venice, Ayas, Tabriz, Kerman, Beijing, Kublai Khan) which function as challenging vocabulary. Additionally, the text lists specific, unfamiliar goods (ivory, jade, porcelain, silk) without any description. This barrage of new, undefined terms creates a significant cognitive load that would slow comprehension considerably, even if the student can grasp the basic gist of the story. While the main idea might be accessible, understanding the details of the journey and its significance is made very challenging by the vocabulary, aligning with the 'very complex' rating.'''
+The vocabulary is rated as very complex due to the high density and conceptual load of unfamiliar terms for a 3rd grader, combined with a near-total lack of contextual scaffolding. The text introduces a large number of proper nouns that function as new vocabulary concepts (Mongol Dynasty, Kublai Khan, Venice, Ayas, Tabriz, Kerman) in a very short space. In addition, it includes several Tier 3 words for unfamiliar goods (ivory, jade, porcelain) and challenging Tier 2 words (nevertheless, scholars). These terms are presented without definition or explanation, requiring the student to process multiple new concepts simultaneously. This density of unfamiliar vocabulary will often present challenges and significantly slow down comprehension, even if the student can grasp the general story of travel and trade. The cumulative effect of these words pushes the text beyond moderately complex into the very complex category for this grade level.'''
+
 
 # ===========================================================================
 # Case 2: hurricanes_grade7  (grades 5–12 path)
@@ -314,12 +310,10 @@ description = "Hurricane formation passage, grade 7 (grades 5-12 GPT path)"
 
 [cases.hurricanes_grade7.input]
 text = '''
-
 Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
 
-Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.
-'''
-grade_level = 7
+Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.'''
+grade = 7
 
 [cases.hurricanes_grade7.prompt_steps.background_knowledge]
 model = "gpt-4o-2024-11-20"
@@ -450,15 +444,13 @@ You can use the following list of topics that we know are covered for each grade
 
 Here is the text:
 [BEGIN TEXT]
-
 Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
 
 Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.
-
 [END TEXT]
 '''
 llm_response = '''
-Background Knowledge Assumption: Students are likely familiar with basic weather concepts, including storms, atmospheric pressure, and the water cycle, but may not yet have detailed knowledge of hurricanes or the specific mechanisms of their formation.'''
+Background Knowledge Assumption: Students are likely familiar with basic weather concepts, including storms, air pressure, and the water cycle, but may not yet have detailed knowledge of hurricanes, their formation, or their global naming conventions.'''
 
 [cases.hurricanes_grade7.prompt_steps.vocab_complexity]
 model = "gpt-4.1"
@@ -605,14 +597,12 @@ To determine the complexity rating of this text based on the vocabulary present,
 
 Below is the text you need to evaluate. It is intended for grade 7.
 
-As you read the text, you can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: Background Knowledge Assumption: Students are likely familiar with basic weather concepts, including storms, atmospheric pressure, and the water cycle, but may not yet have detailed knowledge of hurricanes or the specific mechanisms of their formation.
+As you read the text, you can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: Background Knowledge Assumption: Students are likely familiar with basic weather concepts, including storms, air pressure, and the water cycle, but may not yet have detailed knowledge of hurricanes, their formation, or their global naming conventions.
 
 [BEGIN TEXT]
-
 Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
 
 Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.
-
 [END TEXT]
 
 In your response, when specifying the level of complexity, be sure to use only a single integer (e.g. 2) and don't include any other text (e.g. don't say "level 2").
@@ -633,16 +623,17 @@ Here is the output schema (shown in a code block for readability only — do not
 ```
 '''
 llm_response = '''
-{"tier_2_words": "roar, fury, envelope, damage, match, mightiest, rising, sinking, presses, surrounds, develop, feeding, cover", "tier_3_words": "hurricane, typhoon, cyclone, atmosphere, high pressure, low pressure, tropical, moist, ashore", "archaic_words": "", "other_complex_words": "", "complexity_score": "slightly complex", "reasoning": "The vocabulary in this text is mostly familiar and conversational for a 7th grader with the assumed background knowledge. Words like 'roar,' 'damage,' 'develop,' and 'feeding' are Tier 2 but are common in both everyday and academic contexts. The subject-specific (Tier 3) words include 'hurricane,' 'typhoon,' 'cyclone,' 'atmosphere,' 'high pressure,' 'low pressure,' and 'tropical.' However, these are either defined in context or are likely to be familiar due to prior exposure in science classes. There are no archaic or overly academic words. The proportion of subject-specific vocabulary is low to moderate, but not enough to make the text 'often complex.' The overall vocabulary is accessible, with only a few terms that might require brief pauses for clarification. Therefore, the text fits best at Level 2: mostly not complex, with a low proportion of subject-specific words."}'''
+{"tier_2_words": "roar, fury, envelope, damage, mightiest, develop, match, familiar, surface, rising, sinking, pressing", "tier_3_words": "hurricane, typhoon, cyclone, atmosphere, high pressure, low pressure, tropical, moist", "archaic_words": "", "other_complex_words": "ashore", "complexity_score": "slightly complex", "reasoning": "The vocabulary in this text is mostly contemporary, conversational, and familiar for a 7th grader with the assumed background knowledge. Words like 'hurricane,' 'typhoon,' 'cyclone,' 'atmosphere,' 'high pressure,' and 'low pressure' are subject-specific (Tier 3), but these are introduced with context and are not overly academic or dense. There are a few Tier 2 words such as 'roar,' 'fury,' 'envelope,' and 'damage,' but these are likely familiar from prior curriculum or everyday language. The word 'ashore' may be less familiar, but it is not enough to raise the overall complexity. There are no archaic or overly academic words. The proportion of complex words is low, and the text is accessible for independent reading at this grade level. Therefore, the vocabulary complexity is best described as 'slightly complex,' corresponding to a rubric level 2."}'''
 
 [cases.hurricanes_grade7.expected_result]
 tier_2_words = '''
-roar, fury, envelope, damage, match, mightiest, rising, sinking, presses, surrounds, develop, feeding, cover'''
+roar, fury, envelope, damage, mightiest, develop, match, familiar, surface, rising, sinking, pressing'''
 tier_3_words = '''
-hurricane, typhoon, cyclone, atmosphere, high pressure, low pressure, tropical, moist, ashore'''
+hurricane, typhoon, cyclone, atmosphere, high pressure, low pressure, tropical, moist'''
 archaic_words = ""
-other_complex_words = ""
+other_complex_words = '''
+ashore'''
 complexity_score = '''
 slightly complex'''
 reasoning = '''
-The vocabulary in this text is mostly familiar and conversational for a 7th grader with the assumed background knowledge. Words like 'roar,' 'damage,' 'develop,' and 'feeding' are Tier 2 but are common in both everyday and academic contexts. The subject-specific (Tier 3) words include 'hurricane,' 'typhoon,' 'cyclone,' 'atmosphere,' 'high pressure,' 'low pressure,' and 'tropical.' However, these are either defined in context or are likely to be familiar due to prior exposure in science classes. There are no archaic or overly academic words. The proportion of subject-specific vocabulary is low to moderate, but not enough to make the text 'often complex.' The overall vocabulary is accessible, with only a few terms that might require brief pauses for clarification. Therefore, the text fits best at Level 2: mostly not complex, with a low proportion of subject-specific words.'''
+The vocabulary in this text is mostly contemporary, conversational, and familiar for a 7th grader with the assumed background knowledge. Words like 'hurricane,' 'typhoon,' 'cyclone,' 'atmosphere,' 'high pressure,' and 'low pressure' are subject-specific (Tier 3), but these are introduced with context and are not overly academic or dense. There are a few Tier 2 words such as 'roar,' 'fury,' 'envelope,' and 'damage,' but these are likely familiar from prior curriculum or everyday language. The word 'ashore' may be less familiar, but it is not enough to raise the overall complexity. There are no archaic or overly academic words. The proportion of complex words is low, and the text is accessible for independent reading at this grade level. Therefore, the vocabulary complexity is best described as 'slightly complex,' corresponding to a rubric level 2.'''
diff --git a/sdks/settings/vocabulary/settings.toml b/sdks/settings/vocabulary/settings.toml
index 3760c19..295444f 100644
--- a/sdks/settings/vocabulary/settings.toml
+++ b/sdks/settings/vocabulary/settings.toml
@@ -11,7 +11,7 @@ type = "TextInputField"
 description = "The text to evaluate for vocabulary complexity."
 
 [[evaluator_metadata.inputs]]
-name = "grade_level"
+name = "grade"
 type = "GradeInputField"
 description = "The grade level of the text (3–12)."
 allowed_grades = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

From dd8d1f4d0562509b57d17bc6bdf67eadb540301a Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Tue, 12 May 2026 22:29:07 -0700
Subject: [PATCH 04/10] chore: vocabulary settings are required

---
 .../learning_commons_evaluators/evaluators/vocabulary.py    | 3 ---
 .../src/learning_commons_evaluators/schemas/vocabulary.py   | 6 +++---
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py b/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py
index cf2a308..1cc4ea4 100644
--- a/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py
+++ b/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py
@@ -103,7 +103,6 @@ def evaluate_impl(
         ps_bk = evaluation_settings.prompt_settings_step_background_knowledge
         ps_34 = evaluation_settings.prompt_settings_step_vocab_grades_3_4
         ps_og = evaluation_settings.prompt_settings_step_vocab_other_grades
-        assert ps_bk is not None and ps_34 is not None and ps_og is not None
 
         grade = input.grade.value
         fk_score = round(textstat.flesch_kincaid_grade(input.text.value), 2)
@@ -188,7 +187,6 @@ def _evaluate_grades_3_4(
             },
             parser_output_type=VocabularyOutputGrades34,
         )
-        assert isinstance(output, VocabularyOutputGrades34)
 
         # Normalise the score string: the prompt may return spaces ("very complex")
         # but TextComplexityAnswer expects underscores ("very_complex").
@@ -241,7 +239,6 @@ def _evaluate_other_grades(
             },
             parser_output_type=VocabularyOutputOtherGrades,
         )
-        assert isinstance(output, VocabularyOutputOtherGrades)
 
         # Normalise the score string: the prompt may return spaces ("slightly complex")
         # but TextComplexityAnswer expects underscores ("slightly_complex").
diff --git a/sdks/python/src/learning_commons_evaluators/schemas/vocabulary.py b/sdks/python/src/learning_commons_evaluators/schemas/vocabulary.py
index 7e3e2e9..aa6c3b1 100644
--- a/sdks/python/src/learning_commons_evaluators/schemas/vocabulary.py
+++ b/sdks/python/src/learning_commons_evaluators/schemas/vocabulary.py
@@ -23,9 +23,9 @@ class VocabularyEvaluationSettings(EvaluationSettings):
     - ``prompt_settings_step_vocab_other_grades``: vocabulary complexity for grades 5–12.
     """
 
-    prompt_settings_step_background_knowledge: PromptSettings | None = None
-    prompt_settings_step_vocab_grades_3_4: PromptSettings | None = None
-    prompt_settings_step_vocab_other_grades: PromptSettings | None = None
+    prompt_settings_step_background_knowledge: PromptSettings
+    prompt_settings_step_vocab_grades_3_4: PromptSettings
+    prompt_settings_step_vocab_other_grades: PromptSettings
 
 
 # Note: avoid class docstrings here — Pydantic adds them as a root ``description``

From 3d0e87065f60b3a4cbcd88389e59abf316f2b8bf Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Tue, 12 May 2026 22:46:43 -0700
Subject: [PATCH 05/10] feat: eval instance settings overrides

---
 sdks/python/README.md                         | 43 ++++++++++++++++++-
 .../evaluators/base.py                        | 20 +++++++--
 sdks/python/tests/evaluators/test_base.py     | 23 ++++++++++
 3 files changed, 81 insertions(+), 5 deletions(-)

diff --git a/sdks/python/README.md b/sdks/python/README.md
index 15dc866..9a1e80c 100644
--- a/sdks/python/README.md
+++ b/sdks/python/README.md
@@ -230,6 +230,42 @@ config = create_config(
 )
 ```
 
+### Per-instance default evaluation settings
+
+Every `BaseEvaluator` subclass defines **class-level** `default_evaluation_settings`
+(the bundled evaluators load these from generated settings). You can override that
+default for a single evaluator instance by passing the keyword-only argument
+`default_evaluation_settings=...` to the constructor:
+
+```python
+from learning_commons_evaluators import ConventionalityEvaluator, create_config
+
+config = create_config(...)
+# Start from the bundled defaults, then change what your deployment needs (models,
+# temperatures, etc. live on nested PromptSettings).
+settings = ConventionalityEvaluator.default_evaluation_settings.model_copy(deep=True)
+settings.prompt_settings_step_conventionality_evaluation = (
+    settings.prompt_settings_step_conventionality_evaluation.model_copy(
+        update={"temperature": 0.2}
+    )
+)
+evaluator = ConventionalityEvaluator(
+    config,
+    default_evaluation_settings=settings,
+)
+
+# Uses the instance default (a deep copy is taken inside evaluate)
+result = evaluator.evaluate(input)
+
+# Per-call override still wins
+result = evaluator.evaluate(input, evaluation_settings=other_settings)
+```
+
+If you omit `default_evaluation_settings` at construction, the instance falls back to
+the subclass's class-level attribute, same as before. Whenever you call `evaluate()`
+without `evaluation_settings`, the SDK runs on a `model_copy(deep=True)` of the resolved
+default, so the default object itself (instance- or class-level) is not mutated by a run.
+
 ### Logging
 
 The SDK uses Python's standard `logging` module. By default, `EvaluatorConfig` uses the
@@ -294,7 +330,10 @@ On evaluation failure, `metadata.status` and `error_details` are set on the in-m
 
 ## Creating custom evaluators
 
-Extend `BaseEvaluator` to create custom evaluators:
+Extend `BaseEvaluator` to create custom evaluators. Set **class-level**
+`default_evaluation_settings` for the usual defaults; callers may still construct
+`MyEvaluator(config, default_evaluation_settings=...)` to pin different defaults on a
+specific instance (see [Per-instance default evaluation settings](#per-instance-default-evaluation-settings)).
 
 ```python
 from learning_commons_evaluators import BaseEvaluator, EvaluatorConfig
@@ -329,6 +368,8 @@ class MyEvaluator(BaseEvaluator[MyInput, EvaluationResult, MySettings]):
         return EvaluationResult(answer=..., explanation=..., metadata=evaluation_metadata)
 ```
 
+If you override `__init__` on the subclass, accept the same keyword-only `default_evaluation_settings` argument and forward it: `super().__init__(config, default_evaluation_settings=default_evaluation_settings)`.
+
 ## License
 
 MIT
diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/base.py b/sdks/python/src/learning_commons_evaluators/evaluators/base.py
index 5bedf05..5a89076 100644
--- a/sdks/python/src/learning_commons_evaluators/evaluators/base.py
+++ b/sdks/python/src/learning_commons_evaluators/evaluators/base.py
@@ -49,14 +49,25 @@ class BaseEvaluator(ABC, Generic[InputT, OutputT, SettingsT]):
     """
     Abstract base class for all evaluators.
     Subclasses must set metadata, default_evaluation_settings, and implement evaluate_impl().
+
+    Pass ``default_evaluation_settings`` at construction to override the class-level
+    defaults for that instance (used when :meth:`evaluate` is called without
+    ``evaluation_settings``).
     """
 
     config: EvaluatorConfig
     metadata: EvaluatorMetadata
     default_evaluation_settings: SettingsT
 
-    def __init__(self, config: EvaluatorConfig) -> None:
+    def __init__(
+        self,
+        config: EvaluatorConfig,
+        *,
+        default_evaluation_settings: SettingsT | None = None,
+    ) -> None:
         self.config = config
+        if default_evaluation_settings is not None:
+            self.default_evaluation_settings = default_evaluation_settings
         # TODO: validate config
 
     def evaluate(
@@ -68,13 +79,14 @@ def evaluate(
 
         Validates the input, delegates to :meth:`evaluate_impl`, records timing
         and status on the returned metadata, and logs start/end events via the
-        configured logger.  If ``evaluation_settings`` is ``None``, the
-        evaluator's :attr:`default_evaluation_settings` is used.
+        configured logger.  If ``evaluation_settings`` is ``None``, a deep copy of
+        the instance's :attr:`default_evaluation_settings` is used (from the
+        constructor keyword when given, otherwise the subclass class attribute).
 
         Args:
             input: Typed input for this evaluator.
             evaluation_settings: Optional override for evaluation settings.
-                Defaults to :attr:`default_evaluation_settings`.
+                Defaults to :attr:`default_evaluation_settings` (constructor or class).
 
         Returns:
             A typed result whose ``metadata.status`` is
diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py
index b453853..a2718c4 100644
--- a/sdks/python/tests/evaluators/test_base.py
+++ b/sdks/python/tests/evaluators/test_base.py
@@ -115,6 +115,15 @@ class TestBaseEvaluatorInit:
     def test_config_is_stored(self, config):
         assert _StubEvaluator(config).config is config
 
+    def test_constructor_default_evaluation_settings_overrides_class_default(self, config):
+        instance_default = _StubSettings(marker=99)
+        ev = _StubEvaluator(config, default_evaluation_settings=instance_default)
+        assert ev.default_evaluation_settings is instance_default
+
+    def test_omitted_constructor_default_falls_back_to_class_attribute(self, config):
+        ev = _StubEvaluator(config)
+        assert ev.default_evaluation_settings is _StubEvaluator.default_evaluation_settings
+
 
 # ---------------------------------------------------------------------------
 # evaluate()
@@ -133,6 +142,20 @@ def test_passes_explicit_evaluation_settings(self, stub_evaluator):
         assert result.metadata.evaluation_settings.marker == 42
         assert result.explanation.details.get("marker") == 42
 
+    def test_constructor_default_used_when_evaluate_settings_omitted(self, config):
+        ev = _StubEvaluator(config, default_evaluation_settings=_StubSettings(marker=77))
+        result = ev.evaluate(_stub_input())
+        assert result.metadata.evaluation_settings.marker == 77
+        assert result.explanation.details.get("marker") == 77
+
+    def test_evaluate_explicit_settings_override_constructor_default(self, config):
+        ev = _StubEvaluator(
+            config,
+            default_evaluation_settings=_StubSettings(marker=1),
+        )
+        result = ev.evaluate(_stub_input(), evaluation_settings=_StubSettings(marker=2))
+        assert result.explanation.details.get("marker") == 2
+
 
 class TestEvaluateInputMetadata:
     """``input_metadata`` on :class:`EvaluationMetadata` always comes from ``input.input_metadata()``."""

From bf383cc6c3b1396eb1163fcd5b6ecd988c48d3c3 Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Tue, 12 May 2026 23:08:17 -0700
Subject: [PATCH 06/10] chore: addressing PR comments

---
 .../src/learning_commons_evaluators/evaluators/vocabulary.py  | 3 ++-
 sdks/python/tests/evaluators/test_vocabulary.py               | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py b/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py
index 1cc4ea4..780496b 100644
--- a/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py
+++ b/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py
@@ -76,7 +76,8 @@ class VocabularyEvaluator(
     2. **Vocabulary complexity** – a grade-specific prompt + model produces the
        final score and reasoning.  Grades 3–4 use a Gemini model and return a
        rubric label plus a word-level breakdown; grades 5–12 use a GPT model and
-       return an integer score (1–4).
+       return a complexity label (e.g. ``"slightly complex"``). The results are
+       normalised and mapped to :class:`TextComplexityAnswer`.
 
     Supported grades: 3–12.
     """
diff --git a/sdks/python/tests/evaluators/test_vocabulary.py b/sdks/python/tests/evaluators/test_vocabulary.py
index 5d257d0..7aefb0c 100644
--- a/sdks/python/tests/evaluators/test_vocabulary.py
+++ b/sdks/python/tests/evaluators/test_vocabulary.py
@@ -108,8 +108,8 @@ def test_grades34_score_with_spaces_is_normalised(self):
         evaluator = VocabularyEvaluator(config)
         inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=3)
         # The evaluator calls .lower().replace(" ", "_") before from_score(),
-        # so we verify space-separated scores survive the normalisation path.
-        output = _make_grades34_output("slightly_complex")
+        # so we feed a space-separated label and assert it survives the path.
+        output = _make_grades34_output("slightly complex")
 
         with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, output):
             result = evaluator.evaluate(inp)

From 918a659be11ba69aba45e4abc9afcd87b828f27c Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Tue, 12 May 2026 23:10:48 -0700
Subject: [PATCH 07/10] chore: restore vocabulary notebook

---
 evals/vocabulary_evaluator.ipynb | 1764 +++++++++---------------------
 1 file changed, 523 insertions(+), 1241 deletions(-)

diff --git a/evals/vocabulary_evaluator.ipynb b/evals/vocabulary_evaluator.ipynb
index 23aa63b..892ecdf 100644
--- a/evals/vocabulary_evaluator.ipynb
+++ b/evals/vocabulary_evaluator.ipynb
@@ -1,1246 +1,528 @@
 {
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "# Vocabulary Evaluator (Early Release)\n",
-        "\n",
-        "**The Vocabulary Evaluator** gives developers the fine-grained insight they need but can’t get from traditional tools. It helps determine whether texts use words that align with grade-level expectations and support growth in academic language. This ensures students are consistently exposed to the kinds of vocabulary that build knowledge and enable them to fully engage with grade-level texts.\n",
-        "\n",
-        "By understanding what makes a text difficult for a student to read, edtech companies and educators are better equipped to ensure students get the right text for their needs, along with the right instructional supports.\n",
-        "\n",
-        "You can use this evaluator to help ensure AI-generated texts are sufficiently complex for the grade level and their intended purpose.\n",
-        "\n",
-        "1. It estimates a student’s background knowledge given the selected grade level.\n",
-        "2. It uses the background knowledge estimate as a starting point to evaluate the complexity of a passage’s vocabulary."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "application/vnd.databricks.v1+cell": {
-          "cellMetadata": {},
-          "inputWidgets": {},
-          "nuid": "5d3ac2f5-40b2-4b2c-afdb-c7400e603af3",
-          "showTitle": false,
-          "tableResultSettingsMap": {},
-          "title": ""
-        }
-      },
-      "source": [
-        "### Install & Load necessary packages"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 1,
-      "metadata": {
-        "application/vnd.databricks.v1+cell": {
-          "cellMetadata": {
-            "byteLimit": 2048000,
-            "rowLimit": 10000
-          },
-          "inputWidgets": {},
-          "nuid": "f24faefe-74b5-4c6c-9052-b41d1e9b73cf",
-          "showTitle": false,
-          "tableResultSettingsMap": {},
-          "title": ""
-        }
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Note: you may need to restart the kernel to use updated packages.\n"
-          ]
-        }
-      ],
-      "source": [
-        "%pip install -qU pydantic textstat langchain langchain_openai langchain-google-genai"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 2,
-      "metadata": {
-        "application/vnd.databricks.v1+cell": {
-          "cellMetadata": {
-            "byteLimit": 2048000,
-            "rowLimit": 10000
-          },
-          "inputWidgets": {},
-          "nuid": "b243b748-0ed3-4d19-ab1b-d43638d2113e",
-          "showTitle": false,
-          "tableResultSettingsMap": {},
-          "title": ""
-        }
-      },
-      "outputs": [],
-      "source": [
-        "# Load packages\n",
-        "import getpass\n",
-        "import os\n",
-        "import sys\n",
-        "from pathlib import Path\n",
-        "\n",
-        "# capture.py lives under sdks/python/scripts/ (not an installable package).\n",
-        "_scripts = None\n",
-        "for _root in [Path.cwd(), *Path.cwd().parents]:\n",
-        "    _cap = _root / \"sdks\" / \"python\" / \"scripts\" / \"capture.py\"\n",
-        "    if _cap.is_file():\n",
-        "        _scripts = _cap.parent\n",
-        "        break\n",
-        "if _scripts is None:\n",
-        "    raise FileNotFoundError(\n",
-        "        \"sdks/python/scripts/capture.py not found; set the kernel cwd to the repo root or evals/.\"\n",
-        "    )\n",
-        "sys.path.insert(0, str(_scripts))\n",
-        "\n",
-        "from dotenv import load_dotenv\n",
-        "from langchain_core.messages import SystemMessage\n",
-        "from langchain_core.output_parsers import JsonOutputParser\n",
-        "from langchain_core.prompts import ChatPromptTemplate\n",
-        "from langchain_core.prompts.chat import HumanMessagePromptTemplate\n",
-        "from langchain_google_genai import ChatGoogleGenerativeAI\n",
-        "from langchain_openai import ChatOpenAI\n",
-        "from pydantic import BaseModel, Field\n",
-        "from textstat import textstat as ts\n",
-        "from capture import reset_captures, capture_llm, capture_case, build_contract_toml\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "application/vnd.databricks.v1+cell": {
-          "cellMetadata": {},
-          "inputWidgets": {},
-          "nuid": "12709664-6132-48a8-b621-73658b4fea90",
-          "showTitle": false,
-          "tableResultSettingsMap": {},
-          "title": ""
-        }
-      },
-      "source": [
-        "### Set up the evaluator's model and prompts"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 3,
-      "metadata": {
-        "application/vnd.databricks.v1+cell": {
-          "cellMetadata": {
-            "byteLimit": 2048000,
-            "rowLimit": 10000
-          },
-          "inputWidgets": {},
-          "nuid": "eeffec80-067e-47de-a234-7dcd5078ea53",
-          "showTitle": false,
-          "tableResultSettingsMap": {},
-          "title": ""
-        }
-      },
-      "outputs": [],
-      "source": [
-        "from prompts import vocab_prompts as prompts\n",
-        "\n",
-        "# Set your api keys in your environment, .env file, or enter when prompted.\n",
-        "# os.environ['GOOGLE_API_KEY'] = 'YOUR API KEY'\n",
-        "# os.environ['OPENAI_API_KEY'] = 'YOUR API KEY'\n",
-        "load_dotenv()\n",
-        "\n",
-        "if not os.environ.get(\"OPENAI_API_KEY\"):\n",
-        "    os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter your OpenAI API key: \")\n",
-        "\n",
-        "if not os.environ.get(\"GOOGLE_API_KEY\"):\n",
-        "    os.environ[\"GOOGLE_API_KEY\"] = getpass.getpass(\"Enter your Google API key: \")\n",
-        "\n",
-        "# Grades supported by this evaluator\n",
-        "SUPPORTED_GRADES = range(3, 13)  # 3 through 12 inclusive\n",
-        "\n",
-        "VOCAB_TEMPERATURE = 0\n",
-        "# Define the model to be used for vocabulary complexity for grades 3 and 4\n",
-        "VOCAB_MODEL_GRADES_3_4 = \"gemini-2.5-pro\"\n",
-        "vocab_complexity_model_grades_3_4 = ChatGoogleGenerativeAI(\n",
-        "    model=VOCAB_MODEL_GRADES_3_4, temperature=VOCAB_TEMPERATURE\n",
-        ")\n",
-        "\n",
-        "# Define the model to be used for vocabulary complexity for all other grades\n",
-        "VOCAB_MODEL_OTHER_GRADES = \"gpt-4.1\"\n",
-        "vocab_complexity_model_other_grades = ChatOpenAI(\n",
-        "    model=VOCAB_MODEL_OTHER_GRADES, temperature=VOCAB_TEMPERATURE\n",
-        ")\n",
-        "\n",
-        "# Define the model to be used for student background knowledge generation\n",
-        "BK_MODEL = \"gpt-4o-2024-11-20\"\n",
-        "BK_TEMPERATURE = 0\n",
-        "student_bk_model = ChatOpenAI(model=BK_MODEL, temperature=BK_TEMPERATURE)\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "### Set up student background knowledge generator"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 4,
-      "metadata": {
-        "application/vnd.databricks.v1+cell": {
-          "cellMetadata": {
-            "byteLimit": 2048000,
-            "rowLimit": 10000
-          },
-          "inputWidgets": {},
-          "nuid": "3fc5ec95-f619-439e-9286-663f3d843502",
-          "showTitle": false,
-          "tableResultSettingsMap": {},
-          "title": ""
-        }
-      },
-      "outputs": [],
-      "source": [
-        "def get_background_knowledge_assumption(text, grade):\n",
-        "    \"\"\"Use the background knowledge prompt from the prompts file.\"\"\"\n",
-        "    prompt = prompts.bk_prompt.format(text=text, grade=grade)\n",
-        "\n",
-        "    return capture_llm(\"background_knowledge\", student_bk_model).invoke(prompt).content\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "### Set up the input variables and output format"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 5,
-      "metadata": {
-        "application/vnd.databricks.v1+cell": {
-          "cellMetadata": {
-            "byteLimit": 2048000,
-            "rowLimit": 10000
-          },
-          "inputWidgets": {},
-          "nuid": "04d83e90-b786-4f2d-83d6-8c69c2cf176b",
-          "showTitle": false,
-          "tableResultSettingsMap": {},
-          "title": ""
-        }
-      },
-      "outputs": [],
-      "source": [
-        "class Output(BaseModel):\n",
-        "    tier_2_words: str = Field(description=\"List of Tier 2 words\")\n",
-        "    tier_3_words: str = Field(description=\"List of Tier 3 words\")\n",
-        "    archaic_words: str = Field(description=\"List of Archaic words\")\n",
-        "    other_complex_words: str = Field(description=\"List of Other Complex words\")\n",
-        "    complexity_score: str = Field(\n",
-        "        description=\"the complexity of the text, one of: slightly complex, moderately complex, very complex, or exceedingly complex\"\n",
-        "    )\n",
-        "    reasoning: str = Field(description=\"your reasoning for your answer\")\n",
-        "\n",
-        "\n",
-        "prompt_vars = {\n",
-        "    \"inputVars\": [\n",
-        "        \"text\",\n",
-        "        \"student_grade_level\",\n",
-        "        \"student_background_knowledge\",\n",
-        "        \"fk_level\",\n",
-        "    ],\n",
-        "    \"outputParser\": JsonOutputParser(pydantic_object=Output),\n",
-        "}\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "### Helper functions"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 6,
-      "metadata": {
-        "application/vnd.databricks.v1+cell": {
-          "cellMetadata": {
-            "byteLimit": 2048000,
-            "rowLimit": 10000
-          },
-          "inputWidgets": {},
-          "nuid": "23047ec3-79b2-4e45-b452-873eae41b030",
-          "showTitle": false,
-          "tableResultSettingsMap": {},
-          "title": ""
-        }
-      },
-      "outputs": [],
-      "source": [
-        "import textwrap\n",
-        "\n",
-        "\n",
-        "def calculate_fk_score(text) -> float:\n",
-        "    \"\"\"\n",
-        "    Calculate the Flesch-Kincaid Grade Level\n",
-        "    \"\"\"\n",
-        "    fk_score = round(ts.flesch_kincaid_grade(text), 2)\n",
-        "\n",
-        "    return fk_score\n",
-        "\n",
-        "\n",
-        "def prepare_text_for_complexity_prediction(text, grade):\n",
-        "    \"\"\"\n",
-        "    Enrich the text and grade given by user with additional features for complexity prediction.\n",
-        "    \"\"\"\n",
-        "    dataset = {\n",
-        "        \"text\": text,\n",
-        "        \"student_grade_level\": grade,\n",
-        "        \"fk_level\": calculate_fk_score(text),\n",
-        "        \"student_background_knowledge\": get_background_knowledge_assumption(\n",
-        "            text, grade\n",
-        "        ),\n",
-        "    }\n",
-        "\n",
-        "    return dataset\n",
-        "\n",
-        "\n",
-        "def prettify_vocab_complexity_output(vocab_complexity_output):\n",
-        "    output = f\"\"\"\n",
-        "        ========================= Complexity Score ========================\n",
-        "        {vocab_complexity_output.get('complexity_score') or vocab_complexity_output.get('answer') or 'N/A'}\n",
-        "\n",
-        "        ========================= Complexity Score Reasoning ==============\n",
-        "        {textwrap.fill(vocab_complexity_output.get('reasoning', 'N/A'), width=80)}\n",
-        "\n",
-        "        ========================  Complex words  ==========================\n",
-        "        * Tier 2 words: {textwrap.fill(vocab_complexity_output.get('tier_2_words', 'N/A'), width=65)}\n",
-        "        * Tier 3 words: {textwrap.fill(vocab_complexity_output.get('tier_3_words', 'N/A'), width=65)}\n",
-        "        * Archaic words: {textwrap.fill(vocab_complexity_output.get('archaic_words', 'N/A'), width=65)}\n",
-        "        * Other complex words: {textwrap.fill(vocab_complexity_output.get('other_complex_words', 'N/A'), width=60)}\"\"\"\n",
-        "\n",
-        "    print(textwrap.dedent(output).strip())"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 7,
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "def get_prompts_for_grade(grade: int) -> dict:\n",
-        "    \"\"\"\n",
-        "    Returns the appropriate SYSTEM_PROMPT and USER_PROMPT for the given grade.\n",
-        "    \n",
-        "    Args:\n",
-        "        grade: Grade level (3-12)\n",
-        "    \n",
-        "    Returns:\n",
-        "        dict with keys 'SYSTEM_PROMPT' and 'USER_PROMPT'\n",
-        "    \"\"\"\n",
-        "    if grade == 3 or grade == 4:\n",
-        "        return prompts.GRADE_SPECIFIC_PROMPTS[\"GRADES_3_4\"]\n",
-        "    else:  # 5-12\n",
-        "        return prompts.GRADE_SPECIFIC_PROMPTS[\"OTHER_GRADES\"]\n",
-        "\n",
-        "\n",
-        "def get_vocab_model_for_grade(grade: int):\n",
-        "    \"\"\"\n",
-        "    Returns the appropriate vocabulary complexity model for the given grade.\n",
-        "\n",
-        "    Grades 3 & 4 use Gemini (gemini-2.5-pro), which was validated against\n",
-        "    the GRADES_3_4 prompt. All other grades use GPT-4.1, which was validated\n",
-        "    against the OTHER_GRADES prompt.\n",
-        "\n",
-        "    Args:\n",
-        "        grade: Grade level (3-12)\n",
-        "\n",
-        "    Returns:\n",
-        "        A LangChain chat model instance\n",
-        "    \"\"\"\n",
-        "    if grade == 3 or grade == 4:\n",
-        "        return vocab_complexity_model_grades_3_4\n",
-        "    else:  # 5-12\n",
-        "        return vocab_complexity_model_other_grades\n",
-        "\n",
-        "\n",
-        "def normalize_complexity_output(output: dict) -> dict:\n",
-        "    \"\"\"\n",
-        "    Normalize complexity output to use consistent string labels.\n",
-        "    Converts integer 'answer' (from OTHER_GRADES) to string 'complexity_score'.\n",
-        "    \n",
-        "    Args:\n",
-        "        output: Raw output from the model\n",
-        "    \n",
-        "    Returns:\n",
-        "        Normalized output with 'complexity_score' field\n",
-        "    \"\"\"\n",
-        "    mapping = {\n",
-        "        1: \"Slightly Complex\",\n",
-        "        2: \"Moderately Complex\",\n",
-        "        3: \"Very Complex\",\n",
-        "        4: \"Exceedingly Complex\"\n",
-        "    }\n",
-        "\n",
-        "    # Handle 'answer' field from OTHER_GRADES (will be int or string int)\n",
-        "    if 'answer' in output:\n",
-        "        value = output['answer']\n",
-        "        # Convert int or string int to proper complexity label\n",
-        "        if isinstance(value, str) and value.isdigit():\n",
-        "            value = int(value)\n",
-        "        output['complexity_score'] = mapping.get(value, str(value))\n",
-        "    \n",
-        "    # For GRADES_3_4, complexity_score already exists as a string - no changes needed\n",
-        "\n",
-        "    return output\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 8,
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "def predict_text_complexity_level(text, grade):\n",
-        "    \"\"\"\n",
-        "    Predict the text complexity level as well as the complex words and reasoning.\n",
-        "\n",
-        "    Args:\n",
-        "        text: The text to evaluate.\n",
-        "        grade: Grade level. Must be between 3 and 12 inclusive.\n",
-        "\n",
-        "    Raises:\n",
-        "        ValueError: If grade is not in SUPPORTED_GRADES.\n",
-        "    \"\"\"\n",
-        "    if grade not in SUPPORTED_GRADES:\n",
-        "        raise ValueError(\n",
-        "            f\"Grade {grade} is not supported. This evaluator supports grades \"\n",
-        "            f\"{min(SUPPORTED_GRADES)}-{max(SUPPORTED_GRADES)}.\"\n",
-        "        )\n",
-        "\n",
-        "    dataset = prepare_text_for_complexity_prediction(text, grade)\n",
-        "\n",
-        "    # Get grade-specific prompts and model\n",
-        "    grade_prompts = get_prompts_for_grade(grade)\n",
-        "\n",
-        "    # Use grade-specific prompts\n",
-        "    messages = [\n",
-        "        SystemMessage(content=grade_prompts['SYSTEM_PROMPT']),\n",
-        "        HumanMessagePromptTemplate.from_template(grade_prompts['USER_PROMPT']),\n",
-        "    ]\n",
-        "\n",
-        "    # Prepare chat prompt\n",
-        "    prompt = ChatPromptTemplate(\n",
-        "        messages,\n",
-        "        input_variables=prompt_vars[\"inputVars\"],\n",
-        "        partial_variables={\n",
-        "            \"format_instructions\": prompt_vars[\"outputParser\"].get_format_instructions()\n",
-        "        },\n",
-        "    )\n",
-        "\n",
-        "    # Invoke the chain\n",
-        "    chain = prompt | capture_llm(\"vocab_complexity\", get_vocab_model_for_grade(grade)) | JsonOutputParser()\n",
-        "\n",
-        "    # Get output and normalize it\n",
-        "    output = chain.invoke(dataset)\n",
-        "    output = normalize_complexity_output(output)\n",
-        "\n",
-        "    return output\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "application/vnd.databricks.v1+cell": {
-          "cellMetadata": {},
-          "inputWidgets": {},
-          "nuid": "4437f840-a00b-47cf-a7fa-86d882f0f6af",
-          "showTitle": false,
-          "tableResultSettingsMap": {},
-          "title": ""
-        }
-      },
-      "source": [
-        "# Test out examples"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 9,
-      "metadata": {
-        "application/vnd.databricks.v1+cell": {
-          "cellMetadata": {
-            "byteLimit": 2048000,
-            "rowLimit": 10000
-          },
-          "inputWidgets": {},
-          "nuid": "767fe555-ef65-46fb-8da3-54f18565cb07",
-          "showTitle": false,
-          "tableResultSettingsMap": {},
-          "title": ""
-        }
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "========================= Complexity Score ========================\n",
-            "        very complex\n",
-            "\n",
-            "        ========================= Complexity Score Reasoning ==============\n",
-            "        The vocabulary is very complex for a 3rd grader, a conclusion supported by the\n",
-            "Flesch-Kincaid grade level of 6.59. The primary difficulty stems from the high\n",
-            "density of unfamiliar proper nouns which function as challenging vocabulary\n",
-            "(e.g., 'Mongol Dynasty', 'Kublai Khan', 'Venice', 'Tabriz'). These terms are\n",
-            "central to the narrative but are introduced without any contextual scaffolding\n",
-            "or explanation, creating a significant conceptual load. Furthermore, the text\n",
-            "incorporates several Tier 2 words like 'scholars', 'court', and 'wealthy', and\n",
-            "Tier 3 words like 'dynasty' and 'porcelain'. The cumulative effect of these\n",
-            "words will often present challenges and significantly slow down comprehension\n",
-            "for a student at this grade level, making it difficult to grasp the details of\n",
-            "the text even if the main idea of a journey is accessible.\n",
-            "\n",
-            "        ========================  Complex words  ==========================\n",
-            "        * Tier 2 words: served, court, passengers, crew, survived, scholars, described,\n",
-            "treasures, wealthy, trading\n",
-            "        * Tier 3 words: dynasty, ivory, jade, porcelain\n",
-            "        * Archaic words: \n",
-            "        * Other complex words: Marco Polo, Mongol Dynasty, Venice, Mediterranean Sea, Ayas,\n",
-            "Tabriz, Kerman, Beijing, Kublai Khan, Far East\n"
-          ]
-        }
-      ],
-      "source": [
-        "# Add your text & the grade level you want to evaluate for vocabulary complexity\n",
-        "\n",
-        "# Clear ID = 2204\n",
-        "text = \"\"\"\n",
-        "Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.\n",
-        "Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.\n",
-        "His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.\n",
-        "\"\"\"\n",
-        "\n",
-        "grade_level = 3\n",
-        "\n",
-        "vocabulary_complexity_output = predict_text_complexity_level(text, grade_level)\n",
-        "\n",
-        "# Pretty Print the output\n",
-        "prettify_vocab_complexity_output(vocabulary_complexity_output)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 10,
-      "metadata": {},
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "[cases.marco_polo_grade3]\n",
-            "description = \"Marco Polo passage, grade 3 (grades 3-4 Gemini path)\"\n",
-            "\n",
-            "[cases.marco_polo_grade3.input]\n",
-            "text = '''\n",
-            "Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.\n",
-            "Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.\n",
-            "His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.'''\n",
-            "grade = 3\n",
-            "\n",
-            "[cases.marco_polo_grade3.prompt_steps.background_knowledge]\n",
-            "model = \"gpt-4o-2024-11-20\"\n",
-            "temperature = 0.0\n",
-            "system_prompt = \"\"\n",
-            "user_prompt = '''\n",
-            "\n",
-            "Review the following text, which is an educational text written for students in the following grade band: 3.\n",
-            "\n",
-            "Your job is to give me a background knowledge assumption; that is: what topics, if any, from the text students are likely to be familiar with based on a standard progression of topics in US public school education, as well as topics, if any the student is not likely to be familiar with.\n",
-            "\n",
-            "Make sure your response is concise (between 1 - 3 lines max) and is about the topics themselves, not about any other aspect of the text (e.g. flowery language, complicated sentence structure, etc.).\n",
-            "\n",
-            "Here's an example:\n",
-            "[START EXAMPLE]\n",
-            "Grade Band: 11th\n",
-            "Text: I went to the woods because I wished to live deliberately, to front only the essential facts of life, and see if I could not\n",
-            "learn what it had to teach, and not, when I came to die, discover that I had not lived. I did not wish to live what was\n",
-            "not life, living is so dear; nor did I wish to practise resignation, unless it was quite necessary. I wanted to live deep and suck out all the marrow of life, to live so sturdily and Spartan-like as to put to rout all that was not life, to cut a broad swath and shave close, to drive life into a corner, and reduce it to its lowest terms, and, if it proved to be mean, why then to get the whole and genuine meanness of it, and publish its meanness to the world; or if it were sublime, to\n",
-            "know it by experience, and be able to give a true account of it in my next excursion. For most men, it appears to me,\n",
-            "are in a strange uncertainty about it, whether it is of the devil or of God, and have somewhat hastily concluded that it\n",
-            "is the chief end of man here to “glorify God and enjoy him forever.”\n",
-            "\n",
-            "Background Knowledge Assumption: Assume they’ve studied American Transcendentalists like Thoreau and Emerson, including the mid-19th-century context of nature-focused philosophy.\n",
-            "[END EXAMPLE]\n",
-            "\n",
-            "You should assume that the student is an average US public school who is learning from common core curriculum. When you respond, just respond with the background knowledge assumption and nothing else.\n",
-            "\n",
-            "You can use the following list of topics that we know are covered for each grade level, although use your best judgement if you know there are other topics out there that students are likely to have covered. And this doesn't cover higher grade levels, so you'll have to again use your judgement for, say, what background knowledge a 9th grader is likely to have:\n",
-            "[BEGIN TOPICS]\n",
-            "[\n",
-            "    K: [\n",
-            "        \"Toys and Play\", \"Weather Wonders\", \"Trees are Alive\", \"Enjoying and Appreciating Trees\",\n",
-            "        \"The Five Senses: How do our senses help us learn?\", \"Once Upon a Farm: What makes a good story?\",\n",
-            "        \"America, Then and Now: How has life in America changed over time?\", \"The Continents: What makes the world fascinating?\",\n",
-            "        \"Needs of Plants and Animals\", \"Pushes and Pulls\", \"Sunlight and Weather\", \"Learning and Working Together\",\n",
-            "        \"How Do People Learn and Work Together?\", \"Where Do We Live?\", \"What Does it Mean to Be an American?\",\n",
-            "        \"How Has Our World Changed?\", \"Why Do People Have Jobs?\"\n",
-            "    ],\n",
-            "    1: [\n",
-            "        \"Tools and Work\", \"A Study of the Sun, Moon, and Stars\", \"Birds' Amazing Bodies\", \"Caring for Birds\",\n",
-            "        \"A World of Books: How do books change lives around the world?\", \"Creature Features: What can we discover about animals’ unique features?\",\n",
-            "        \"Powerful Forces: How do people respond to the powerful force of the wind?\", \"Cinderella Stories: Why do people around the world admire Cinderella?\",\n",
-            "        \"Animal and Plant Defenses\", \"Light and Sounds\", \"Spinning Earth\", \"Our Place in the World\",\n",
-            "        \"What Are the Rights and Responsibilities of Citizens?\", \"How Can We Describe Where We Live?\",\n",
-            "        \"How Do We Celebrate Our Country?\", \"How Does the Past Shape Our Lives?\", \"Why Do People Work?\"\n",
-            "    ],\n",
-            "    2: [\n",
-            "        \"Schools and Community\", \"Fossils Tell of Earth's Changes\", \"The Secret World of Pollination\", \"Providing for Pollinators\",\n",
-            "        \"A Season of Change: How does change impact people and nature?\", \"The American West: What was life like in the West for early Americans?\",\n",
-            "        \"Civil Rights Heroes: How can people respond to injustice?\", \"Good Eating: How does food nourish us?\",\n",
-            "        \"Plant and Animal Relationships\", \"Properties of Matter\", \"Changing Landforms\", \"Exploring Who We Are\",\n",
-            "        \"Why Is It Important to Learn About the Past?\", \"How Does Geography Help Us Understand Our World?\",\n",
-            "        \"How Do We Get What We Want and Need?\", \"Why Do We Need Government?\", \"How Can People Make a Difference in Our World?\"\n",
-            "    ],\n",
-            "    \"3\": [\n",
-            "        \"Overcoming Learning Challenges Near and Far\", \"Adaptations and the Wide World of Frogs\", \"Exploring Literary Classics\",\n",
-            "        \"Water Around the World\", \"Ocean/Sea Exploration\", \"Outer Space\", \"Immigration\", \"Art/Being an Artist\",\n",
-            "        \"Balancing Forces\", \"Inheritance and Traits\", \"Environments and Survival\", \"Weather and Climate\",\n",
-            "        \"Communities\", \"Why Does It Matter Where We Live?\", \"What Is Our Relationship With Our Environment?\",\n",
-            "        \"What Makes a Community Unique?\", \"How Does the Past Impact the Present?\", \"Why Do Governments and Citizens Need Each Other?\",\n",
-            "        \"How Do People in a Community Meet Their Wants and Needs?\"\n",
-            "    ],\n",
-            "    4: [\n",
-            "        \"Poetry\", \"Animal Defense Mechanisms\", \"The American Revolution\",\n",
-            "        \"Responding to Inequality: Ratifying the 19th Amendment (covers gender and racial inequality)\",\n",
-            "        \"A Great Heart: What does it mean to have a great heart, literally and figuratively?\",\n",
-            "        \"Extreme Settings: How does a challenging setting or physical environment change a person?\",\n",
-            "        \"American Revolution/Multiple Perspectives\", \"Myths/Myth Making\", \"Energy Conversions\", \"Vision and Light\",\n",
-            "        \"Earth's Features\", \"Waves, Energy, and Information\", \"Regions of the United States\",\n",
-            "        \"How Does America Use Its Strengths and Face Its Challenges?\", \"Why Have People Moved to and From the Northeast?\",\n",
-            "        \"How Has the Southeast Changed Over Time?\", \"How Does the Midwest Reflect the Spirit of America?\",\n",
-            "        \"How Does the Southwest Reflect Its Diverse Past and Unique Environment?\", \"What Draws People to the West?\"\n",
-            "    ],\n",
-            "    5: [\n",
-            "        \"Human Rights\", \"Biodiversity in the Rainforest\", \"Athlete Leaders of Social Change\",\n",
-            "        \"Impact of Natural Disasters\", \"Cultures in Conflict: How do cultural beliefs and values guide people?\",\n",
-            "        \"Word Play: How and why do writers play with words?\", \"A War Between Us: How did the Civil War impact people?\",\n",
-            "        \"Breaking Barriers: How can sports influence individuals and societies?\", \"Patterns of Earth and Sky\",\n",
-            "        \"Modeling Matter\", \"The Earth System\", \"Ecosystem Restoration\", \"U.S. History: Making a New Nation\",\n",
-            "        \"How Were the Lives of Native Peoples Influenced by Where They Lived?\",\n",
-            "        \"What Happened When Diverse Cultures Crossed Paths?\", \"What Is the Impact of People Settling in a New Place?\",\n",
-            "        \"Why Would a Nation Want to Become Independent?\", \"What Does the Revolutionary Era Tell Us About Our Nation Today?\",\n",
-            "        \"How Does the Constitution Help Us Understand What It Means to Be an American?\",\n",
-            "        \"What Do the Early Years of the United States Reveal About the Character of the Nation?\",\n",
-            "        \"What Was the Effect of the Civil War on U.S. Society?\"\n",
-            "    ],\n",
-            "    6: [\n",
-            "        \"Greek Mythology\", \"Critical Problems and Design Solutions\", \"American Indian Boarding Schools\",\n",
-            "        \"Remarkable Accomplishments in Space Science\", \"Resilience in the Great Depression: How can enduring tremendous hardship contribute to personal transformation?\",\n",
-            "        \"A Hero’s Journey: What is the significance and power of the hero’s journey?\",\n",
-            "        \"Narrating the Unknown: How did the social and environmental factors in the unknown world of Jamestown shape its development and decline?\",\n",
-            "        \"Courage in Crisis: How can the challenges of a hostile environment inspire heroism?\",\n",
-            "        \"Microbiome\", \"Metabolism\", \"Metabolism Engineering\", \"Traits and Reproduction\", \"Thermal Energy\",\n",
-            "        \"Ocean, Atmosphere, and Climate\", \"Weather Patterns\", \"Earth's Changing Climate\",\n",
-            "        \"Earth's Changing Climate: Engineering Internship\", \"The First Americans (up to 1492)\",\n",
-            "        \"Exploration and Colonization\", \"English Colonies\", \"American Revolution\", \"First Governments and the Constitution\",\n",
-            "        \"The Early American Republic\", \"Political and Geographic Changes (1828-1850)\", \"Life in the North and South (1820-1860)\",\n",
-            "        \"Division and Civil War (1821-1865)\", \"Reconstruction (1865-1896)\", \"The West (1858-1896)\",\n",
-            "        \"New Industry and a Changing Society\", \"Expansion and War\", \"The 1920s and 1930s\", \"World War II\",\n",
-            "        \"The Cold War\", \"Civil Rights and American Society\", \"America Since the 1970s\"\n",
-            "    ],\n",
-            "    7: [\n",
-            "        \"The Lost Children of Sudan (Genocide, Genocide in Sudan)\", \"Epidemics\", \"Harlem Renaissance\", \"Plastic Pollution\",\n",
-            "        \"Identity in the Middle Ages: How does society both support and limit the development of identity?\",\n",
-            "        \"Americans All: How did World War II affect individuals?\", \"Language and Power: What is the power of language?\",\n",
-            "        \"Fever: How can times of crisis affect citizens and society?\", \"Geology on Mars\", \"Plane Motion\", \"Plane Motion Engineering\",\n",
-            "        \"Rock Formations\", \"Phase Change\", \"Phase Change Engineering\", \"Chemical Reactions\", \"Populations and Resources\",\n",
-            "        \"Matter and Energy in Ecosystems\", \"Early Humans and Agricultural Revolution\", \"Fertile Crescent\",\n",
-            "        \"Ancient Egypt and Kush\", \"The Israelites\", \"Ancient Greece\", \"Ancient South Asia\", \"Early China, Korea, and Japan\",\n",
-            "        \"Ancient Rome\", \"Rise of Christian Kingdoms\", \"The Americas\", \"Medieval Europe\", \"The Rise of Islamic Empires\",\n",
-            "        \"China in the Middle Ages\", \"Korea and Japan in the Middle Ages\", \"African Civilizations\", \"New Ways of Thinking\",\n",
-            "        \"Age of Exploration and Trade\", \"Revolutions and Empires\", \"The Modern World\"\n",
-            "    ],\n",
-            "    8: [\n",
-            "        \"Folklore of Latin America\", \"Food Choices\", \"The Holocaust\", \"Japanese American Internment\",\n",
-            "        \"The Poetics and Power of Storytelling: What is the power of storytelling?\",\n",
-            "        \"The Great War: How do literature and art illuminate the effects of World War I?\", \"What Is Love?\",\n",
-            "        \"Teens as Change Agents: How do people effect social change?\", \"Harnessing Human Energy\",\n",
-            "        \"Force and Motion\", \"Force and Motion Engineering\", \"Magnetic Fields\", \"Light Waves\", \"Earth, Moon, and Sun\",\n",
-            "        \"Natural Selection\", \"Natural Selection Engineering\", \"Evolutionary History\", \"The World in Spatial Terms\",\n",
-            "        \"Places and Regions\", \"Physical Geography\", \"Population Geography\", \"Economic Geography\",\n",
-            "        \"Political Geography\", \"Human-Environment Geography\", \"What is Economics?\", \"Markets, Money, and Businesses\",\n",
-            "        \"Government and the Economy\", \"The Global Economy\"\n",
-            "    ]\n",
-            "]\n",
-            "[END TOPICS]\n",
-            "\n",
-            "Here is the text:\n",
-            "[BEGIN TEXT]\n",
-            "Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.\n",
-            "Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.\n",
-            "His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.\n",
-            "[END TEXT]\n",
-            "'''\n",
-            "llm_response = '''\n",
-            "Background Knowledge Assumption: Students are likely familiar with basic geography (e.g., continents, oceans, and major regions like Asia and Europe) and the concept of historical exploration. They are unlikely to be familiar with the Mongol Dynasty, Kublai Khan, or Marco Polo's specific travels and contributions.'''\n",
-            "\n",
-            "[cases.marco_polo_grade3.prompt_steps.vocab_complexity]\n",
-            "model = \"gemini-2.5-pro\"\n",
-            "temperature = 0.0\n",
-            "system_prompt = '''\n",
-            "\n",
-            "You are an expert curriculum designer. Your job is to rate the complexity of a text's vocabulary relative to the grade level.\n",
-            "\n",
-            "You will be given a rubric (with levels from least to most complex: slightly complex, moderately complex, very complex, exceedingly complex) as well as guidelines for interpreting the rubric.\n",
-            "IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meaning, etc.)\n",
-            "\n",
-            "**Resource 1: Qualitative Text Complexity rubric (SAP)**\n",
-            "1.  **Level 1: Slightly complex**\n",
-            "    *   Original Definition: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. A very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0.\n",
-            "    *   Summary definition: Overall, vocabulary is easy to understand and does not impede comprehension of the bulk of the text (including main idea and supporting claims). 1-2 quick pauses for processing by the student are ok here!\n",
-            "2.  **Level 2: Moderately complex**\n",
-            "    *   Original Definition: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK\n",
-            "    *   Summary definition: Overall, vocabulary generally allows students to comprehend the bulk of the text with little difficulty, though there may be occasional pauses for clarification. Several quick pauses or occasional prolonged pauses may occur.\n",
-            "3.  **Level 3: Very complex**\n",
-            "    *   Original Definition: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic\n",
-            "    *   Summary definition: Overall, vocabulary often presents challenges that may slow down comprehension but does not completely block the comprehension of the bulk of the text.\n",
-            "4.  **Level 4: Exceedingly complex**\n",
-            "    *   Original Definition: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading.\n",
-            "    *   Summary definition: Overall, vocabulary is so complex that it makes comprehension of the bulk of the text very challenging and requires careful effort to interpret.\n",
-            "\n",
-            "**Resource 2: Flesch-Kincaid Grade Level**\n",
-            "Use the Flesch-Kincaid (FK) Grade Level as light guidance of the approximate grade level based on readability. The metric alone does not provide final information of vocabulary complexity, but a ballpark of the difficulty of the entire text.\n",
-            "*   grade 2-3: 1.98-5.34\n",
-            "*   grade 4-5: 4.51-7.73\n",
-            "*   grade 6-8: 6.51-10.34\n",
-            "*   grade 9-10: 8.32-12.12\n",
-            "*   grade 11-College: 10.34-14.2\n",
-            "\n",
-            "**Guidelines for Interpretation and Reasoning**\n",
-            "\n",
-            "Your reasoning is the most critical part of your analysis. It's not enough to simply count complex words. You must analyze their impact on a student at the specified grade level. Use the following principles to guide your judgment:\n",
-            "\n",
-            "1.  **Density and Cumulative Effect:** Do not just count complex words; evaluate their concentration. A short text with a high density of challenging Tier 2 words (e.g., `peculiar`, `mischievous`, `courageous` for a 4th grader) can be more overwhelming than a longer text with a few scattered Tier 3 words. A constant barrage of unfamiliar words can elevate complexity from `very` to `exceedingly`.\n",
-            "2.  **Contextual Scaffolding:** Assess how the text supports new vocabulary.\n",
-            "    *   Are new, complex terms explicitly defined or explained with simple examples (e.g., \"volume... to see if it is big enough to hold a liter of food\")?\n",
-            "    *   Is the surrounding language simple and conversational, making the meaning of new words easier to infer?\n",
-            "    *   Strong scaffolding can lower the complexity rating. A text with many Tier 3 words that are well-explained might only be `moderately complex`.\n",
-            "3.  **Abstract vs. Concrete Vocabulary:** Differentiate between words for abstract concepts and words for concrete things. A text built on abstract Tier 2 words (e.g., `relationships`, `performance`, `non-physical`) can be more challenging than a text that introduces Tier 3 labels for concrete things or people (e.g., `Sumerians`, `polonium`).\n",
-            "4.  **Conceptual Load:** Consider the cognitive load of the vocabulary. A list of many new, multi-syllabic, conceptually-heavy terms (e.g., `Paleolithic`, `Mesolithic`, `Neolithic` for a 3rd grader) can be `very complex` even if the terms are briefly defined, because the student must process multiple new concepts at once.\n",
-            "5.  **Calibrating the Top Levels:** Be precise in your use of `very complex` vs. `exceedingly complex`.\n",
-            "    *   **Very complex:** The vocabulary creates significant hurdles and slows the reader down, but the main ideas of the text are still accessible with effort.\n",
-            "    *   **Exceedingly complex:** The vocabulary is so dense, technical, or abstract that it acts as a barrier, making it nearly impossible for the target student to grasp the bulk of the text's meaning without extensive outside help. Reserve this for texts saturated with advanced terminology.\n",
-            "6.  **Consider Background Knowledge:** Pay close attention to the provided `student_background_knowledge`. Do not classify a word as complex if the student is likely to be familiar with it (e.g., 'oxygen' for a 3rd grader who has learned about the human body).\n",
-            "\n",
-            "**Final Analysis Format**\n",
-            "\n",
-            "Provide these information as your final analysis:\n",
-            "1.  **Complex vocabulary:**\n",
-            "    *   Tier 2 words: Words that are commonly used in academic settings and more complex than colloquial, or everyday language and often have multiple meanings.\n",
-            "    *   Tier 3 words: Overly academic or domain-specific words.\n",
-            "    *   Archaic words: Words, or uses of words that are not commonly used in modern conversational language. E.g., “The jury retired to deliberate on their verdict.\" The use of “retire” to mean withdrawing to a private place is an archaic use.\n",
-            "    *   Other complex words: All other words that can increase complexity of the text (e.g., idioms, unfamiliar proper nouns that function as vocabulary).\n",
-            "2.  **Vocabulary complexity:** one of: slightly complex, moderately complex, very complex, exceedingly complex\n",
-            "3.  **Your reasoning of the complexity:** A detailed explanation of your rating, referencing the principles above.\n",
-            "'''\n",
-            "user_prompt = '''\n",
-            "\n",
-            "Below is the text you need to evaluate. Let's think step by step in order to predict the output of the vocabulary complexity task.\n",
-            "\n",
-            "- It is intended for grade 3.\n",
-            "\n",
-            "- You can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: Background Knowledge Assumption: Students are likely familiar with basic geography (e.g., continents, oceans, and major regions like Asia and Europe) and the concept of historical exploration. They are unlikely to be familiar with the Mongol Dynasty, Kublai Khan, or Marco Polo's specific travels and contributions.\n",
-            "\n",
-            "- Text Flesch-Kincaid grade level: 6.59\n",
-            "\n",
-            "- Text to evaluate: [BEGIN TEXT]\n",
-            "Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.\n",
-            "Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.\n",
-            "His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.\n",
-            "[END TEXT]\n",
-            "\n",
-            "STRICT OUTPUT FORMAT:\n",
-            "- Return only the JSON value that conforms to the schema. Do not include any additional text, explanations, headings, or separators.\n",
-            "- Do not wrap the JSON in Markdown or code fences (no ``` or ```json).\n",
-            "- Do not prepend or append any text (e.g., do not write \"Here is the JSON:\").\n",
-            "- The response must be a single top-level JSON value exactly as required by the schema (object/array/etc.), with no trailing commas or comments.\n",
-            "\n",
-            "The output should be formatted as a JSON instance that conforms to the JSON schema below.\n",
-            "\n",
-            "As an example, for the schema {\"properties\": {\"foo\": {\"title\": \"Foo\", \"description\": \"a list of strings\", \"type\": \"array\", \"items\": {\"type\": \"string\"}}}, \"required\": [\"foo\"]} the object {\"foo\": [\"bar\", \"baz\"]} is a well-formatted instance of the schema. The object {\"properties\": {\"foo\": [\"bar\", \"baz\"]}} is not well-formatted.\n",
-            "\n",
-            "Here is the output schema (shown in a code block for readability only — do not include any backticks or Markdown in your output):\n",
-            "```\n",
-            "{\"properties\": {\"tier_2_words\": {\"description\": \"List of Tier 2 words\", \"title\": \"Tier 2 Words\", \"type\": \"string\"}, \"tier_3_words\": {\"description\": \"List of Tier 3 words\", \"title\": \"Tier 3 Words\", \"type\": \"string\"}, \"archaic_words\": {\"description\": \"List of Archaic words\", \"title\": \"Archaic Words\", \"type\": \"string\"}, \"other_complex_words\": {\"description\": \"List of Other Complex words\", \"title\": \"Other Complex Words\", \"type\": \"string\"}, \"complexity_score\": {\"description\": \"the complexity of the text, one of: slightly complex, moderately complex, very complex, or exceedingly complex\", \"title\": \"Complexity Score\", \"type\": \"string\"}, \"reasoning\": {\"description\": \"your reasoning for your answer\", \"title\": \"Reasoning\", \"type\": \"string\"}}, \"required\": [\"tier_2_words\", \"tier_3_words\", \"archaic_words\", \"other_complex_words\", \"complexity_score\", \"reasoning\"]}\n",
-            "```\n",
-            "'''\n",
-            "llm_response = '''\n",
-            "```json\n",
-            "{\n",
-            "  \"tier_2_words\": \"served, nevertheless, scholars, described, treasures, wealthy, trading, survived, passengers\",\n",
-            "  \"tier_3_words\": \"Mongol Dynasty, Kublai Khan, ivory, jade, porcelain\",\n",
-            "  \"archaic_words\": \"None\",\n",
-            "  \"other_complex_words\": \"Venice, Ayas, Tabriz, Kerman, Mediterranean Sea, Far East, near East\",\n",
-            "  \"complexity_score\": \"very complex\",\n",
-            "  \"reasoning\": \"The vocabulary is rated as very complex due to the high density and conceptual load of unfamiliar terms for a 3rd grader, combined with a near-total lack of contextual scaffolding. The text introduces a large number of proper nouns that function as new vocabulary concepts (Mongol Dynasty, Kublai Khan, Venice, Ayas, Tabriz, Kerman) in a very short space. In addition, it includes several Tier 3 words for unfamiliar goods (ivory, jade, porcelain) and challenging Tier 2 words (nevertheless, scholars). These terms are presented without definition or explanation, requiring the student to process multiple new concepts simultaneously. This density of unfamiliar vocabulary will often present challenges and significantly slow down comprehension, even if the student can grasp the general story of travel and trade. The cumulative effect of these words pushes the text beyond moderately complex into the very complex category for this grade level.\"\n",
-            "}\n",
-            "```'''\n",
-            "\n",
-            "[cases.marco_polo_grade3.expected_result]\n",
-            "tier_2_words = '''\n",
-            "served, nevertheless, scholars, described, treasures, wealthy, trading, survived, passengers'''\n",
-            "tier_3_words = '''\n",
-            "Mongol Dynasty, Kublai Khan, ivory, jade, porcelain'''\n",
-            "archaic_words = '''\n",
-            "None'''\n",
-            "other_complex_words = '''\n",
-            "Venice, Ayas, Tabriz, Kerman, Mediterranean Sea, Far East, near East'''\n",
-            "complexity_score = '''\n",
-            "very complex'''\n",
-            "reasoning = '''\n",
-            "The vocabulary is rated as very complex due to the high density and conceptual load of unfamiliar terms for a 3rd grader, combined with a near-total lack of contextual scaffolding. The text introduces a large number of proper nouns that function as new vocabulary concepts (Mongol Dynasty, Kublai Khan, Venice, Ayas, Tabriz, Kerman) in a very short space. In addition, it includes several Tier 3 words for unfamiliar goods (ivory, jade, porcelain) and challenging Tier 2 words (nevertheless, scholars). These terms are presented without definition or explanation, requiring the student to process multiple new concepts simultaneously. This density of unfamiliar vocabulary will often present challenges and significantly slow down comprehension, even if the student can grasp the general story of travel and trade. The cumulative effect of these words pushes the text beyond moderately complex into the very complex category for this grade level.'''\n",
-            "\n"
-          ]
-        }
-      ],
-      "source": [
-        "reset_captures()\n",
-        "text = \"\"\"\n",
-        "Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.\n",
-        "Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.\n",
-        "His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.\n",
-        "\"\"\".strip()\n",
-        "grade_level = 3\n",
-        "result = predict_text_complexity_level(text, grade_level)\n",
-        "contract_input = {\"text\": text, \"grade\": grade_level}\n",
-        "\n",
-        "capture = capture_case(\n",
-        "    name=\"marco_polo_grade3\",\n",
-        "    description=\"Marco Polo passage, grade 3 (grades 3-4 Gemini path)\",\n",
-        "    input=contract_input,\n",
-        "    llm_call_captures=[\"background_knowledge\", \"vocab_complexity\"],\n",
-        "    expected_result=result,\n",
-        ")\n",
-        "\n",
-        "print(build_contract_toml(capture))"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 11,
-      "metadata": {},
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "[cases.hurricanes_grade7]\n",
-            "description = \"Hurricane formation passage, grade 7 (grades 5-12 GPT path)\"\n",
-            "\n",
-            "[cases.hurricanes_grade7.input]\n",
-            "text = '''\n",
-            "Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.\n",
-            "\n",
-            "Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.'''\n",
-            "grade = 7\n",
-            "\n",
-            "[cases.hurricanes_grade7.prompt_steps.background_knowledge]\n",
-            "model = \"gpt-4o-2024-11-20\"\n",
-            "temperature = 0.0\n",
-            "system_prompt = \"\"\n",
-            "user_prompt = '''\n",
-            "\n",
-            "Review the following text, which is an educational text written for students in the following grade band: 7.\n",
-            "\n",
-            "Your job is to give me a background knowledge assumption; that is: what topics, if any, from the text students are likely to be familiar with based on a standard progression of topics in US public school education, as well as topics, if any the student is not likely to be familiar with.\n",
-            "\n",
-            "Make sure your response is concise (between 1 - 3 lines max) and is about the topics themselves, not about any other aspect of the text (e.g. flowery language, complicated sentence structure, etc.).\n",
-            "\n",
-            "Here's an example:\n",
-            "[START EXAMPLE]\n",
-            "Grade Band: 11th\n",
-            "Text: I went to the woods because I wished to live deliberately, to front only the essential facts of life, and see if I could not\n",
-            "learn what it had to teach, and not, when I came to die, discover that I had not lived. I did not wish to live what was\n",
-            "not life, living is so dear; nor did I wish to practise resignation, unless it was quite necessary. I wanted to live deep and suck out all the marrow of life, to live so sturdily and Spartan-like as to put to rout all that was not life, to cut a broad swath and shave close, to drive life into a corner, and reduce it to its lowest terms, and, if it proved to be mean, why then to get the whole and genuine meanness of it, and publish its meanness to the world; or if it were sublime, to\n",
-            "know it by experience, and be able to give a true account of it in my next excursion. For most men, it appears to me,\n",
-            "are in a strange uncertainty about it, whether it is of the devil or of God, and have somewhat hastily concluded that it\n",
-            "is the chief end of man here to “glorify God and enjoy him forever.”\n",
-            "\n",
-            "Background Knowledge Assumption: Assume they’ve studied American Transcendentalists like Thoreau and Emerson, including the mid-19th-century context of nature-focused philosophy.\n",
-            "[END EXAMPLE]\n",
-            "\n",
-            "You should assume that the student is an average US public school who is learning from common core curriculum. When you respond, just respond with the background knowledge assumption and nothing else.\n",
-            "\n",
-            "You can use the following list of topics that we know are covered for each grade level, although use your best judgement if you know there are other topics out there that students are likely to have covered. And this doesn't cover higher grade levels, so you'll have to again use your judgement for, say, what background knowledge a 9th grader is likely to have:\n",
-            "[BEGIN TOPICS]\n",
-            "[\n",
-            "    K: [\n",
-            "        \"Toys and Play\", \"Weather Wonders\", \"Trees are Alive\", \"Enjoying and Appreciating Trees\",\n",
-            "        \"The Five Senses: How do our senses help us learn?\", \"Once Upon a Farm: What makes a good story?\",\n",
-            "        \"America, Then and Now: How has life in America changed over time?\", \"The Continents: What makes the world fascinating?\",\n",
-            "        \"Needs of Plants and Animals\", \"Pushes and Pulls\", \"Sunlight and Weather\", \"Learning and Working Together\",\n",
-            "        \"How Do People Learn and Work Together?\", \"Where Do We Live?\", \"What Does it Mean to Be an American?\",\n",
-            "        \"How Has Our World Changed?\", \"Why Do People Have Jobs?\"\n",
-            "    ],\n",
-            "    1: [\n",
-            "        \"Tools and Work\", \"A Study of the Sun, Moon, and Stars\", \"Birds' Amazing Bodies\", \"Caring for Birds\",\n",
-            "        \"A World of Books: How do books change lives around the world?\", \"Creature Features: What can we discover about animals’ unique features?\",\n",
-            "        \"Powerful Forces: How do people respond to the powerful force of the wind?\", \"Cinderella Stories: Why do people around the world admire Cinderella?\",\n",
-            "        \"Animal and Plant Defenses\", \"Light and Sounds\", \"Spinning Earth\", \"Our Place in the World\",\n",
-            "        \"What Are the Rights and Responsibilities of Citizens?\", \"How Can We Describe Where We Live?\",\n",
-            "        \"How Do We Celebrate Our Country?\", \"How Does the Past Shape Our Lives?\", \"Why Do People Work?\"\n",
-            "    ],\n",
-            "    2: [\n",
-            "        \"Schools and Community\", \"Fossils Tell of Earth's Changes\", \"The Secret World of Pollination\", \"Providing for Pollinators\",\n",
-            "        \"A Season of Change: How does change impact people and nature?\", \"The American West: What was life like in the West for early Americans?\",\n",
-            "        \"Civil Rights Heroes: How can people respond to injustice?\", \"Good Eating: How does food nourish us?\",\n",
-            "        \"Plant and Animal Relationships\", \"Properties of Matter\", \"Changing Landforms\", \"Exploring Who We Are\",\n",
-            "        \"Why Is It Important to Learn About the Past?\", \"How Does Geography Help Us Understand Our World?\",\n",
-            "        \"How Do We Get What We Want and Need?\", \"Why Do We Need Government?\", \"How Can People Make a Difference in Our World?\"\n",
-            "    ],\n",
-            "    \"3\": [\n",
-            "        \"Overcoming Learning Challenges Near and Far\", \"Adaptations and the Wide World of Frogs\", \"Exploring Literary Classics\",\n",
-            "        \"Water Around the World\", \"Ocean/Sea Exploration\", \"Outer Space\", \"Immigration\", \"Art/Being an Artist\",\n",
-            "        \"Balancing Forces\", \"Inheritance and Traits\", \"Environments and Survival\", \"Weather and Climate\",\n",
-            "        \"Communities\", \"Why Does It Matter Where We Live?\", \"What Is Our Relationship With Our Environment?\",\n",
-            "        \"What Makes a Community Unique?\", \"How Does the Past Impact the Present?\", \"Why Do Governments and Citizens Need Each Other?\",\n",
-            "        \"How Do People in a Community Meet Their Wants and Needs?\"\n",
-            "    ],\n",
-            "    4: [\n",
-            "        \"Poetry\", \"Animal Defense Mechanisms\", \"The American Revolution\",\n",
-            "        \"Responding to Inequality: Ratifying the 19th Amendment (covers gender and racial inequality)\",\n",
-            "        \"A Great Heart: What does it mean to have a great heart, literally and figuratively?\",\n",
-            "        \"Extreme Settings: How does a challenging setting or physical environment change a person?\",\n",
-            "        \"American Revolution/Multiple Perspectives\", \"Myths/Myth Making\", \"Energy Conversions\", \"Vision and Light\",\n",
-            "        \"Earth's Features\", \"Waves, Energy, and Information\", \"Regions of the United States\",\n",
-            "        \"How Does America Use Its Strengths and Face Its Challenges?\", \"Why Have People Moved to and From the Northeast?\",\n",
-            "        \"How Has the Southeast Changed Over Time?\", \"How Does the Midwest Reflect the Spirit of America?\",\n",
-            "        \"How Does the Southwest Reflect Its Diverse Past and Unique Environment?\", \"What Draws People to the West?\"\n",
-            "    ],\n",
-            "    5: [\n",
-            "        \"Human Rights\", \"Biodiversity in the Rainforest\", \"Athlete Leaders of Social Change\",\n",
-            "        \"Impact of Natural Disasters\", \"Cultures in Conflict: How do cultural beliefs and values guide people?\",\n",
-            "        \"Word Play: How and why do writers play with words?\", \"A War Between Us: How did the Civil War impact people?\",\n",
-            "        \"Breaking Barriers: How can sports influence individuals and societies?\", \"Patterns of Earth and Sky\",\n",
-            "        \"Modeling Matter\", \"The Earth System\", \"Ecosystem Restoration\", \"U.S. History: Making a New Nation\",\n",
-            "        \"How Were the Lives of Native Peoples Influenced by Where They Lived?\",\n",
-            "        \"What Happened When Diverse Cultures Crossed Paths?\", \"What Is the Impact of People Settling in a New Place?\",\n",
-            "        \"Why Would a Nation Want to Become Independent?\", \"What Does the Revolutionary Era Tell Us About Our Nation Today?\",\n",
-            "        \"How Does the Constitution Help Us Understand What It Means to Be an American?\",\n",
-            "        \"What Do the Early Years of the United States Reveal About the Character of the Nation?\",\n",
-            "        \"What Was the Effect of the Civil War on U.S. Society?\"\n",
-            "    ],\n",
-            "    6: [\n",
-            "        \"Greek Mythology\", \"Critical Problems and Design Solutions\", \"American Indian Boarding Schools\",\n",
-            "        \"Remarkable Accomplishments in Space Science\", \"Resilience in the Great Depression: How can enduring tremendous hardship contribute to personal transformation?\",\n",
-            "        \"A Hero’s Journey: What is the significance and power of the hero’s journey?\",\n",
-            "        \"Narrating the Unknown: How did the social and environmental factors in the unknown world of Jamestown shape its development and decline?\",\n",
-            "        \"Courage in Crisis: How can the challenges of a hostile environment inspire heroism?\",\n",
-            "        \"Microbiome\", \"Metabolism\", \"Metabolism Engineering\", \"Traits and Reproduction\", \"Thermal Energy\",\n",
-            "        \"Ocean, Atmosphere, and Climate\", \"Weather Patterns\", \"Earth's Changing Climate\",\n",
-            "        \"Earth's Changing Climate: Engineering Internship\", \"The First Americans (up to 1492)\",\n",
-            "        \"Exploration and Colonization\", \"English Colonies\", \"American Revolution\", \"First Governments and the Constitution\",\n",
-            "        \"The Early American Republic\", \"Political and Geographic Changes (1828-1850)\", \"Life in the North and South (1820-1860)\",\n",
-            "        \"Division and Civil War (1821-1865)\", \"Reconstruction (1865-1896)\", \"The West (1858-1896)\",\n",
-            "        \"New Industry and a Changing Society\", \"Expansion and War\", \"The 1920s and 1930s\", \"World War II\",\n",
-            "        \"The Cold War\", \"Civil Rights and American Society\", \"America Since the 1970s\"\n",
-            "    ],\n",
-            "    7: [\n",
-            "        \"The Lost Children of Sudan (Genocide, Genocide in Sudan)\", \"Epidemics\", \"Harlem Renaissance\", \"Plastic Pollution\",\n",
-            "        \"Identity in the Middle Ages: How does society both support and limit the development of identity?\",\n",
-            "        \"Americans All: How did World War II affect individuals?\", \"Language and Power: What is the power of language?\",\n",
-            "        \"Fever: How can times of crisis affect citizens and society?\", \"Geology on Mars\", \"Plane Motion\", \"Plane Motion Engineering\",\n",
-            "        \"Rock Formations\", \"Phase Change\", \"Phase Change Engineering\", \"Chemical Reactions\", \"Populations and Resources\",\n",
-            "        \"Matter and Energy in Ecosystems\", \"Early Humans and Agricultural Revolution\", \"Fertile Crescent\",\n",
-            "        \"Ancient Egypt and Kush\", \"The Israelites\", \"Ancient Greece\", \"Ancient South Asia\", \"Early China, Korea, and Japan\",\n",
-            "        \"Ancient Rome\", \"Rise of Christian Kingdoms\", \"The Americas\", \"Medieval Europe\", \"The Rise of Islamic Empires\",\n",
-            "        \"China in the Middle Ages\", \"Korea and Japan in the Middle Ages\", \"African Civilizations\", \"New Ways of Thinking\",\n",
-            "        \"Age of Exploration and Trade\", \"Revolutions and Empires\", \"The Modern World\"\n",
-            "    ],\n",
-            "    8: [\n",
-            "        \"Folklore of Latin America\", \"Food Choices\", \"The Holocaust\", \"Japanese American Internment\",\n",
-            "        \"The Poetics and Power of Storytelling: What is the power of storytelling?\",\n",
-            "        \"The Great War: How do literature and art illuminate the effects of World War I?\", \"What Is Love?\",\n",
-            "        \"Teens as Change Agents: How do people effect social change?\", \"Harnessing Human Energy\",\n",
-            "        \"Force and Motion\", \"Force and Motion Engineering\", \"Magnetic Fields\", \"Light Waves\", \"Earth, Moon, and Sun\",\n",
-            "        \"Natural Selection\", \"Natural Selection Engineering\", \"Evolutionary History\", \"The World in Spatial Terms\",\n",
-            "        \"Places and Regions\", \"Physical Geography\", \"Population Geography\", \"Economic Geography\",\n",
-            "        \"Political Geography\", \"Human-Environment Geography\", \"What is Economics?\", \"Markets, Money, and Businesses\",\n",
-            "        \"Government and the Economy\", \"The Global Economy\"\n",
-            "    ]\n",
-            "]\n",
-            "[END TOPICS]\n",
-            "\n",
-            "Here is the text:\n",
-            "[BEGIN TEXT]\n",
-            "Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.\n",
-            "\n",
-            "Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.\n",
-            "[END TEXT]\n",
-            "'''\n",
-            "llm_response = '''\n",
-            "Background Knowledge Assumption: Students are likely familiar with basic weather concepts, including storms, air pressure, and the water cycle, but may not yet have detailed knowledge of hurricanes, their formation, or their global naming conventions.'''\n",
-            "\n",
-            "[cases.hurricanes_grade7.prompt_steps.vocab_complexity]\n",
-            "model = \"gpt-4.1\"\n",
-            "temperature = 0.0\n",
-            "system_prompt = '''\n",
-            "\n",
-            "You are an expert curriculum designer. Your job involves reading text snippets intended for students in K-12 and evaluating the complexity of the vocabulary in each text.\n",
-            "\n",
-            "You will be given a rubric (with options 1, 2, 3, 4) as well as guidelines for interpreting the rubric.\n",
-            "\n",
-            "IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meainng, etc.)\n",
-            "IMPORTANT: Rely on the supplied rubric and annotation guidelines along. Do not introduce any new crtieria for evaluating the complexity of a text's vocabulary.\n",
-            "\n",
-            "Please first reason out loud about the vocabulary complexity of the text and then provide an answer between 1 and 4 (whole numbers only). Provide the answer as an integer (not a float).\n",
-            "'''\n",
-            "user_prompt = '''\n",
-            "\n",
-            "Your job is to rate the complexity of a text's vocabulary (relative to the intended level of the text) according to a rubric and annotation guide. Stick to the rubric and annotation guide exactly — do not introduce any additional criteria or lenses for judging the complexity of the text.\n",
-            "\n",
-            "[BEGIN ANNOTATION GUIDE AND RUBRIC]\n",
-            "Instructions\n",
-            "For the following task, please assume that:\n",
-            "    - The student is on grade level and proficient in all core content areas, including reading fluency, comprehension, science, & social studies. (example).\n",
-            "    - The student is moving through a common progression of topics (detailed here).\n",
-            "    - The student is fluent in speaking English.\n",
-            "    - The student has an \"average\" amount of background knowledge on topics not commonly covered in curriculum.\n",
-            "    - The student will use this material for independent reading/work, without direct instruction.\n",
-            "    - The text is reasonable for the given grade level.\n",
-            "\n",
-            "Please do not consider the presence of figurative language when scoring Vocabulary. For example: with a phrase like \"kicked the bucket,\" consider only the qualities of the words themselves (\"kicked\", \"the\" and \"bucket\").\n",
-            "\n",
-            "Please do be sure to consider:\n",
-            "- all of the different types of vocabulary (listed below)\n",
-            "- the overall proportion of complex words in the text - including repeated complex words.\n",
-            "- the resulting holistic complexity of the vocabulary (described in the Summary section below).\n",
-            "\n",
-            "Level 1:\n",
-            "Rubric: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. That said, a very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0.\n",
-            "\n",
-            "Level 2:\n",
-            "Rubric: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK, but if it's very low, the text is probably level 1.\n",
-            "\n",
-            "Level 3:\n",
-            "Rubric: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic\n",
-            "\n",
-            "Level 4:\n",
-            "Rubric: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading\n",
-            "\n",
-            "And here are some relevant definitions:\n",
-            "    - Conversational: Everyday language.\n",
-            "    - Familiar: Words that the student is likely to have seen/heard, from everyday life or their curriculum. Reminder: assume an \"average\" level of background knowledge.\n",
-            "    - Unfamiliar: Words the student has probably not heard, or are being used in an unfamiliar way.\n",
-            "        - For ex: 4th graders are familiar with the word \"table\" but may not be familiar with the use of the word with respect to data (\"a table of data\").\n",
-            "        - Note:\n",
-            "            - Words with in-line definitions (via appositives, or because they can be easily inferred from other parts of the text) should be evaluated as less unfamiliar.\n",
-            "            - For ex: \"The pharaoh, a powerful ruler of ancient Egypt, was buried in a grand tomb.\"\n",
-            "                - The word \"pharaoh\" might be unfamiliar or subject-specific, but since is defined within the text, you can consider it a more familiar word.\n",
-            "        - Unfamiliar proper nouns:\n",
-            "            - A person's name, even if unfamiliar, generally does not add to complexity.\n",
-            "            - Other unfamiliar proper nouns (eg locations, organizations) do add to complexity.\n",
-            "\n",
-            "- Subject-specific: Words that are specific to a subject or field of study that are essential for understanding concepts and engaging with the content.\n",
-            "- Overly-academic: Words that are excessively formal, complex, or specialized.\n",
-            "    - For ex: \"The agrarian societal structure of the Neolithic Revolution precipitated a paradigm shift in agriculture\"\n",
-            "- Archaic: A word that was common in the past but is now rarely/almost never used. Could also be a word used in an archaic way.\n",
-            "    - For ex: \"After a long day of court proceedings, the jury 'retired' to deliberate on their verdict.\"\n",
-            "        - The word \"retire\" meaning to stop working may be familiar to a student, but \"retire\" meaning \"withdrawing to a private place\" is an archaic use.\n",
-            "\n",
-            "\n",
-            "Examples\n",
-            "The student is on-grade-level:\n",
-            "- Consider a 6th grade passage about earth systems. Per NGSS standards, students are introduced to earth systems starting in 2nd grade. They encounter words like: wind, water, river, lake, solids, and liquids. For our rating purposes, we would assume most students following 2nd have encountered these words. In 5th grade, they dive more fully into earth systems concepts, learning vocabulary words like geosphere, sediment, biosphere, atmosphere, ecosystems, organisms and climate. While rating, we would consider the words listed in the NGSS standards as more familiar following that grade level.  If the same passage were intended for 3rd graders, though, then the subject-specific vocabulary is likely to be unfamiliar.\n",
-            "\n",
-            "Figurative Language\n",
-            "- Kicked the bucket.\n",
-            "- The pen is mightier than the sword.\n",
-            "- The classroom was a zoo.\n",
-            "- He ran faster than the speed of light.\n",
-            "[END ANNOTATION GUIDE AND RUBRIC]\n",
-            "\n",
-            "Here are a couple examples of texts that have already been scored along with justification for their scores, which you can use as exemplars:\n",
-            "[BEGIN EXAMPLES]\n",
-            "\n",
-            "*** EXAMPLE 1 ***\n",
-            "The following text was intended for grade level 11 and received a complexity level of 1.\n",
-            "\n",
-            "Here is the background knowledge assumption for that text: N/A\n",
-            "\n",
-            "Here is the text:\n",
-            "// START TEXT //\n",
-            "\"In a recent lecture, \"Is Nothing Sacred?\", Salman Rushdie, one of the most censored authors of our time, talked about the importance of books. He grew up in a household in India where books were as sacred as bread. If anyone in the household dropped a piece of bread or a book, the person not only picked it up, but also kissed the object by way of apologizing for clumsy disrespect.\n",
-            "\n",
-            "He goes on to say that he had kissed many books before he had kissed a girl. Bread and books were for his household, and for many like his, food for the body and the soul. This image of the kissing of the book one had accidentally dropped made an impression on me. It speaks to the love and respect many people have for them.\n",
-            "\n",
-            "I grew up in a small town in New Mexico, and we had very few books in our household. The first one I remember reading was my catechism book. Before I went to school to learn English, my mother taught me catechism in Spanish.\n",
-            "\n",
-            "I remember the questions and answers I had to learn, and I remember the well-thumbed, frayed volume which was sacred to me.\n",
-            "\n",
-            "Growing up with few books in the house created in me a desire and a need for them. When I started school, I remember visiting the one room library of our town and standing in front of the dusty shelves. In reality there were only a few shelves and not over a thousand books, but I wanted to read them all. There was food for my soul in the books, that much I realized.\"\n",
-            "// END TEXT //\n",
-            "\n",
-            "Here is the reasoning for that complexity level:\n",
-            "// START REASONING //\n",
-            "This text is a 1 for vocabulary, because the vocabulary that is used is familiar and accessible for a proficient 11th grader. Most of the words used in the text are very common everyday vocabulary for describing growing up, family life, and the importance of reading. A few examples of these very common words are: small town, book, school, learn, food, kissed, image, respect, love, speaks. There are many more in the text. In this text there are only a few \"juicier\" or more complex words, you can think of those as words that are less familiar, have a more abstract or nuanced meaning, or carry a very large concept. Less commonly spoken words that were used in the text were: frayed, volume, censored, clumsy, sacred. These are still well within reach of a proficient 11th grader, and would still be considered familiar, because they will have encountered them in past reading or academic studies. In the text there are a couple of words that are outliers, but they are not essential to the understanding of the larger text. One of these words or hyphenated compound phrase is well-frayed. A compound phrase is a phrase consisting of multiple words that work together to create a specific meaning or idea, often acting as a single unit in a sentence. If the meaning of individual words is familiar, it is typically quite easy for proficient readers to generalize the larger meaning that the author is implying with their word choice. In this case, proficient students will be accustomed to the phrase well, with the secondary meaning of very, rather than a description of positivity or health; and they will be accustomed to the use frayed, as in worn, aged, or damaged from use. Making the leap to identify the meaning of \"well-frayed\" as a book that is very used, will take only moments for a proficient 11th grader. 
Another word that stands out in the text is the word catechism, which might be new for many students based on their personal background or location, but a full understanding of what a catechism book contains is not essential for understanding the paragraph or whole text. The reader can make it through using minimum context clues to know that the catechism must be something important to his family. The type of book he learned to read before going to school is not critical for comprehension, it's enough to understand that reading was so important in his family, his mother started instruction before he even started school. Additionally, it's important to know that having one unknown word for an 11th grade reading, does not merit a rating higher than one.\n",
-            "\n",
-            "It is worth noting that another reason this text is a 1, is that the content or topic of the passage is so familiar and covered extensively in K-12 education, i.e. reading is important, loving books, growing up; that coupled with the simple vocabulary choices, getting to the meaning of the overall text, and even the paragraphs, would be incredibly easy for a proficient 11th grader.\n",
-            "// END REASONING //\n",
-            "*** EXAMPLE 2 ***\n",
-            "The following text was intended for grade level 5 and received a complexity level of 2.\n",
-            "\n",
-            "Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with the concept of natural disasters, including hurricanes, and basic atmospheric concepts like high and low pressure from their studies on weather and climate. They may not be familiar with the specific formation processes of hurricanes or the global terminology differences (hurricane, typhoon, cyclone).\n",
-            "\n",
-            "Here is the text:\n",
-            "// START TEXT //\n",
-            "Great whirling storms roar out of the oceans in many parts of the world. They are called by several names—hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.\n",
-            "\n",
-            "Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.\n",
-            "// END TEXT //\n",
-            "\n",
-            "Here is the reasoning for that complexity level:\n",
-            "// START REASONING //\n",
-            "I scored this a 2 because of the density of subject-specific vocabulary related to weather and climate, which is often covered in lower grade levels. This adds to the complexity above a 1, but it is not a level 3 because of the familiarity with the topic, which implies some familiarity with the vocabulary as well. The specific formation process and the vocabulary used to explain the processes are also subject-specfiic but not famliar, which would make the second paragraph a level 3 in the rubric language, but when considering the language used in the overall SUMMARY below the rubric, this new content and vocabulary would cause quick pauses and/or occasional prolonged pauses but would not cause the reader to slow down to due to challenging overall comprehension of the key ideas and supporting claims. This is especially the case because the second paragraph builds upon prior knowledge and familiar vocabulary use, so it is not entirely new information and vocabulary. While there is subject-specific vocabulary used, overly academic vocabulary is NOT used and is more conversational in nature, such as \"great whiring storms\" and \"born\" / \"giving birth\" to storm  (although this is the way storms are described!) rather than more technical terms which made comprehension easier due to the accessibility of the vocabulary (even if used in other contexts before reading this text). Words such as \"a lot\" and \"bigger\" are more conversational, and while technical, unfamiliar words are provided, such as \"hurricane,\" \"typhoon,\" and \"cyclone,\" knowing and understanding their differences is not necessary to grasp the main idea. The processes by which they are formed are what need to be retained while reading the entire text, and familiarity with the bulk of the vocabulary used would allow for that to happen without too much struggle to make meaning of it. 
Additionally, the text does not contain any archaic vocabulary or ambiguous words, which prevents it from reaching a rating of 4, although it is not necessary that they text have such vocabulary to meet a level 4, the frequent inclusion of such vocabulary makes it more likely to land at least a 3 or 4.\n",
-            "// END REASONING //\n",
-            "\n",
-            "*** EXAMPLE 3 ***\n",
-            "The following text was intended for grade level 6 and received a complexity level of 3.\n",
-            "\n",
-            "Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with basic Earth science concepts such as rocks, minerals, and fossils, as well as natural processes like volcanic eruptions and earthquakes. They may not be familiar with more advanced topics like plate tectonics or the specific branches of geology such as mineralogy, petrology, and seismology.\n",
-            "\n",
-            "Here is the text:\n",
-            "// START TEXT //\n",
-            "Geology is the scientific study of Earth. Geologists study the planet—its formation, its internal structure, its materials, its chemical and physical processes, and its history. Mountains, valleys, plains, sea floors, minerals, rocks, fossils, and the processes that create and destroy each of these are all the domain of the geologist. Geology is divided into two broad categories of study: physical geology and historical geology.\n",
-            "\n",
-            "Physical geology is concerned with the processes occurring on or below the surface of Earth and the materials on which they operate. These processes include volcanic eruptions, landslides, earthquakes, and floods. Materials include rocks, air, seawater, soils, and sediment. Physical geology further divides into more specific branches, each of which deals with its own part of Earth's materials, landforms, and processes. Mineralogy and petrology investigate the composition and origin of minerals and rocks. Volcanologists study lava, rocks, and gases on live, dormant, and extinct volcanoes. Seismologists use instruments to monitor and predict earthquakes and volcanic eruptions.\n",
-            "\n",
-            "Historical geology is concerned with the chronology of events, both physical and biological, that have taken place in Earth's history. Paleontologists study fossils (remains of ancient life) for evidence of the evolution of life on Earth. Fossils not only relate evolution, but also speak of the environment in which the organism lived. Corals in rocks at the top of the Grand Canyon in Arizona, for example, show a shallow sea flooded the area around 290 million years ago. In addition, by determining the ages and types of rocks around the world, geologists piece together continental and oceanic history over the past few billion years. Plate tectonics (the study of the movement of the sections of Earth's crust) adds to Earth's story with details of the changing configuration of the continents and oceans.\n",
-            "// END TEXT //\n",
-            "\n",
-            "Here is the reasoning for that complexity level:\n",
-            "// START REASONING //\n",
-            "To determine the complexity rating of this text based on the vocabulary present, I used the annotation guide, scoring rubric, and examples to set the expectations for rating. During the first read of the text, I \"bolded\" and categorized the more challenging vocabulary words according to the following complexity groupings: archaic, unfamiliar, archaic, subject-specific, and/or overly academic. On the second read, I considered the main idea or \"gist\" that students need to acquire understanding of. I then referenced the previously mentioned tools–annotation guide, scoring rubric, and examples to remind myself of the expectations for rating.  I agreed that readers would have familiarity with basic concepts of geology; however, I also considered the definitions provided for words such as Geology, Geologists, Physical Geology, Historical Geology, Mineralogy, and Petrology. I considered how students might pause for clarification and for how long. After reviewing the Annotation Guide while considering, I narrowed the rating down because the definitions provided throughout the text of more complex words should make the meaning of the text more accessible for readers, which is why although the words are subject-specific, I rated this text as a 3 instead of a 2-less complex or a 4–more complex. I read the text one final time to ensure clarity around my rating, scored and wrote the justification.\n",
-            "// END REASONING //\n",
-            "[END EXAMPLES]\n",
-            "\n",
-            "Below is the text you need to evaluate. It is intended for grade 7.\n",
-            "\n",
-            "As you read the text, you can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: Background Knowledge Assumption: Students are likely familiar with basic weather concepts, including storms, air pressure, and the water cycle, but may not yet have detailed knowledge of hurricanes, their formation, or their global naming conventions.\n",
-            "\n",
-            "[BEGIN TEXT]\n",
-            "Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.\n",
-            "\n",
-            "Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.\n",
-            "[END TEXT]\n",
-            "\n",
-            "In your response, when specifying the level of complexity, be sure to use only a single integer (e.g. 2) and don't include any other text (e.g. don't say \"level 2\").\n",
-            "\n",
-            "STRICT OUTPUT FORMAT:\n",
-            "- Return only the JSON value that conforms to the schema. Do not include any additional text, explanations, headings, or separators.\n",
-            "- Do not wrap the JSON in Markdown or code fences (no ``` or ```json).\n",
-            "- Do not prepend or append any text (e.g., do not write \"Here is the JSON:\").\n",
-            "- The response must be a single top-level JSON value exactly as required by the schema (object/array/etc.), with no trailing commas or comments.\n",
-            "\n",
-            "The output should be formatted as a JSON instance that conforms to the JSON schema below.\n",
-            "\n",
-            "As an example, for the schema {\"properties\": {\"foo\": {\"title\": \"Foo\", \"description\": \"a list of strings\", \"type\": \"array\", \"items\": {\"type\": \"string\"}}}, \"required\": [\"foo\"]} the object {\"foo\": [\"bar\", \"baz\"]} is a well-formatted instance of the schema. The object {\"properties\": {\"foo\": [\"bar\", \"baz\"]}} is not well-formatted.\n",
-            "\n",
-            "Here is the output schema (shown in a code block for readability only — do not include any backticks or Markdown in your output):\n",
-            "```\n",
-            "{\"properties\": {\"tier_2_words\": {\"description\": \"List of Tier 2 words\", \"title\": \"Tier 2 Words\", \"type\": \"string\"}, \"tier_3_words\": {\"description\": \"List of Tier 3 words\", \"title\": \"Tier 3 Words\", \"type\": \"string\"}, \"archaic_words\": {\"description\": \"List of Archaic words\", \"title\": \"Archaic Words\", \"type\": \"string\"}, \"other_complex_words\": {\"description\": \"List of Other Complex words\", \"title\": \"Other Complex Words\", \"type\": \"string\"}, \"complexity_score\": {\"description\": \"the complexity of the text, one of: slightly complex, moderately complex, very complex, or exceedingly complex\", \"title\": \"Complexity Score\", \"type\": \"string\"}, \"reasoning\": {\"description\": \"your reasoning for your answer\", \"title\": \"Reasoning\", \"type\": \"string\"}}, \"required\": [\"tier_2_words\", \"tier_3_words\", \"archaic_words\", \"other_complex_words\", \"complexity_score\", \"reasoning\"]}\n",
-            "```\n",
-            "'''\n",
-            "llm_response = '''\n",
-            "{\"tier_2_words\": \"roar, fury, envelope, damage, mightiest, develop, match, familiar, surface, rising, sinking, pressing\", \"tier_3_words\": \"hurricane, typhoon, cyclone, atmosphere, high pressure, low pressure, tropical, moist\", \"archaic_words\": \"\", \"other_complex_words\": \"ashore\", \"complexity_score\": \"slightly complex\", \"reasoning\": \"The vocabulary in this text is mostly contemporary, conversational, and familiar for a 7th grader with the assumed background knowledge. Words like 'hurricane,' 'typhoon,' 'cyclone,' 'atmosphere,' 'high pressure,' and 'low pressure' are subject-specific (Tier 3), but these are introduced with context and are not overly academic or dense. There are a few Tier 2 words such as 'roar,' 'fury,' 'envelope,' and 'damage,' but these are likely familiar from prior curriculum or everyday language. The word 'ashore' may be less familiar, but it is not enough to raise the overall complexity. There are no archaic or overly academic words. The proportion of complex words is low, and the text is accessible for independent reading at this grade level. Therefore, the vocabulary complexity is best described as 'slightly complex,' corresponding to a rubric level 2.\"}'''\n",
-            "\n",
-            "[cases.hurricanes_grade7.expected_result]\n",
-            "tier_2_words = '''\n",
-            "roar, fury, envelope, damage, mightiest, develop, match, familiar, surface, rising, sinking, pressing'''\n",
-            "tier_3_words = '''\n",
-            "hurricane, typhoon, cyclone, atmosphere, high pressure, low pressure, tropical, moist'''\n",
-            "archaic_words = \"\"\n",
-            "other_complex_words = '''\n",
-            "ashore'''\n",
-            "complexity_score = '''\n",
-            "slightly complex'''\n",
-            "reasoning = '''\n",
-            "The vocabulary in this text is mostly contemporary, conversational, and familiar for a 7th grader with the assumed background knowledge. Words like 'hurricane,' 'typhoon,' 'cyclone,' 'atmosphere,' 'high pressure,' and 'low pressure' are subject-specific (Tier 3), but these are introduced with context and are not overly academic or dense. There are a few Tier 2 words such as 'roar,' 'fury,' 'envelope,' and 'damage,' but these are likely familiar from prior curriculum or everyday language. The word 'ashore' may be less familiar, but it is not enough to raise the overall complexity. There are no archaic or overly academic words. The proportion of complex words is low, and the text is accessible for independent reading at this grade level. Therefore, the vocabulary complexity is best described as 'slightly complex,' corresponding to a rubric level 2.'''\n",
-            "\n"
-          ]
-        }
-      ],
-      "source": [
-        "reset_captures()\n",
-        "text = \"\"\"\n",
-        "Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.\n",
-        "\n",
-        "Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.\n",
-        "\"\"\".strip()\n",
-        "grade_level = 7\n",
-        "result = predict_text_complexity_level(text, grade_level)\n",
-        "contract_input = {\"text\": text, \"grade\": grade_level}\n",
-        "\n",
-        "capture = capture_case(\n",
-        "    name=\"hurricanes_grade7\",\n",
-        "    description=\"Hurricane formation passage, grade 7 (grades 5-12 GPT path)\",\n",
-        "    input=contract_input,\n",
-        "    llm_call_captures=[\"background_knowledge\", \"vocab_complexity\"],\n",
-        "    expected_result=result,\n",
-        ")\n",
-        "\n",
-        "print(build_contract_toml(capture))"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "You can copy or edit the above cell to test out different texts and grade levels."
-      ]
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Vocabulary Evaluator (Early Release)\n",
+    "\n",
+    "**The Vocabulary Evaluator** gives developers the fine-grained insight they need but can’t get from traditional tools. It helps determine whether texts use words that align with grade-level expectations and support growth in academic language. This ensures students are consistently exposed to the kinds of vocabulary that build knowledge and enable them to fully engage with grade-level texts.\n",
+    "\n",
+    "By understanding what makes a text difficult for a student to read, edtech companies and educators are better equipped to ensure students get the right text for their needs, along with the right instructional supports.\n",
+    "\n",
+    "You can use this evaluator to help ensure AI-generated texts are sufficiently complex for the grade level and their intended purpose. It works in two steps:\n",
+    "\n",
+    "1. It estimates a student’s background knowledge given the selected grade level.\n",
+    "2. It uses the background knowledge estimate as a starting point to evaluate the complexity of a passage’s vocabulary."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "5d3ac2f5-40b2-4b2c-afdb-c7400e603af3",
+     "showTitle": false,
+     "tableResultSettingsMap": {},
+     "title": ""
+    }
+   },
+   "source": [
+    "### Install & load necessary packages"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "f24faefe-74b5-4c6c-9052-b41d1e9b73cf",
+     "showTitle": false,
+     "tableResultSettingsMap": {},
+     "title": ""
     }
-  ],
-  "metadata": {
-    "application/vnd.databricks.v1+notebook": {
-      "computePreferences": null,
-      "dashboards": [],
-      "environmentMetadata": {
-        "base_environment": "",
-        "environment_version": "3"
-      },
-      "inputWidgetPreferences": null,
-      "language": "python",
-      "notebookMetadata": {
-        "pythonIndentUnit": 4
-      },
-      "notebookName": "(Final Ship) Vocabulary Evaluator - Sep release (Using LangChain to instrument)",
-      "widgets": {}
-    },
-    "kernelspec": {
-      "display_name": ".venv (3.14.4)",
-      "language": "python",
-      "name": "python3"
-    },
-    "language_info": {
-      "codemirror_mode": {
-        "name": "ipython",
-        "version": 3
-      },
-      "file_extension": ".py",
-      "mimetype": "text/x-python",
-      "name": "python",
-      "nbconvert_exporter": "python",
-      "pygments_lexer": "ipython3",
-      "version": "3.14.4"
+   },
+   "outputs": [],
+   "source": [
+    "%pip install -qU pydantic textstat langchain langchain_openai langchain-google-genai"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "b243b748-0ed3-4d19-ab1b-d43638d2113e",
+     "showTitle": false,
+     "tableResultSettingsMap": {},
+     "title": ""
     }
+   },
+   "outputs": [],
+   "source": [
+    "# Load packages\n",
+    "import getpass\n",
+    "import os\n",
+    "\n",
+    "from dotenv import load_dotenv\n",
+    "from langchain_core.messages import SystemMessage\n",
+    "from langchain_core.output_parsers import JsonOutputParser\n",
+    "from langchain_core.prompts import ChatPromptTemplate\n",
+    "from langchain_core.prompts.chat import HumanMessagePromptTemplate\n",
+    "from langchain_google_genai import ChatGoogleGenerativeAI\n",
+    "from langchain_openai import ChatOpenAI\n",
+    "from pydantic import BaseModel, Field\n",
+    "from textstat import textstat as ts"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "12709664-6132-48a8-b621-73658b4fea90",
+     "showTitle": false,
+     "tableResultSettingsMap": {},
+     "title": ""
+    }
+   },
+   "source": [
+    "### Set up the evaluator's model and prompts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "eeffec80-067e-47de-a234-7dcd5078ea53",
+     "showTitle": false,
+     "tableResultSettingsMap": {},
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from prompts import vocab_prompts as prompts\n",
+    "\n",
+    "# Set your api keys in your environment, .env file, or enter when prompted.\n",
+    "# os.environ['GOOGLE_API_KEY'] = 'YOUR API KEY'\n",
+    "# os.environ['OPENAI_API_KEY'] = 'YOUR API KEY'\n",
+    "load_dotenv()\n",
+    "\n",
+    "if not os.environ.get(\"OPENAI_API_KEY\"):\n",
+    "    os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter your OpenAI API key: \")\n",
+    "\n",
+    "if not os.environ.get(\"GOOGLE_API_KEY\"):\n",
+    "    os.environ[\"GOOGLE_API_KEY\"] = getpass.getpass(\"Enter your Google API key: \")\n",
+    "\n",
+    "# Grades supported by this evaluator\n",
+    "SUPPORTED_GRADES = range(3, 13)  # 3 through 12 inclusive\n",
+    "\n",
+    "VOCAB_TEMPERATURE = 0\n",
+    "# Define the model to be used for vocabulary complexity for grades 3 and 4\n",
+    "VOCAB_MODEL_GRADES_3_4 = \"gemini-2.5-pro\"\n",
+    "vocab_complexity_model_grades_3_4 = ChatGoogleGenerativeAI(\n",
+    "    model=VOCAB_MODEL_GRADES_3_4, temperature=VOCAB_TEMPERATURE\n",
+    ")\n",
+    "\n",
+    "# Define the model to be used for vocabulary complexity for all other grades\n",
+    "VOCAB_MODEL_OTHER_GRADES = \"gpt-4.1\"\n",
+    "vocab_complexity_model_other_grades = ChatOpenAI(\n",
+    "    model=VOCAB_MODEL_OTHER_GRADES, temperature=VOCAB_TEMPERATURE\n",
+    ")\n",
+    "\n",
+    "# Define the model to be used for student background knowledge generation\n",
+    "BK_MODEL = \"gpt-4o-2024-11-20\"\n",
+    "BK_TEMPERATURE = 0\n",
+    "student_bk_model = ChatOpenAI(model=BK_MODEL, temperature=BK_TEMPERATURE)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Set up student background knowledge generator"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "3fc5ec95-f619-439e-9286-663f3d843502",
+     "showTitle": false,
+     "tableResultSettingsMap": {},
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def get_background_knowledge_assumption(text, grade):\n",
+    "    \"\"\"Use the background knowledge prompt from the prompts file.\"\"\"\n",
+    "    prompt = prompts.bk_prompt.format(text=text, grade=grade)\n",
+    "\n",
+    "    return student_bk_model.invoke(prompt).content"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Set up the input variables and output format"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "04d83e90-b786-4f2d-83d6-8c69c2cf176b",
+     "showTitle": false,
+     "tableResultSettingsMap": {},
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "class Output(BaseModel):\n",
+    "    tier_2_words: str = Field(description=\"List of Tier 2 words\")\n",
+    "    tier_3_words: str = Field(description=\"List of Tier 3 words\")\n",
+    "    archaic_words: str = Field(description=\"List of Archaic words\")\n",
+    "    other_complex_words: str = Field(description=\"List of Other Complex words\")\n",
+    "    complexity_score: str = Field(\n",
+    "        description=\"the complexity of the text, one of: slightly complex, moderately complex, very complex, or exceedingly complex\"\n",
+    "    )\n",
+    "    reasoning: str = Field(description=\"your reasoning for your answer\")\n",
+    "\n",
+    "\n",
+    "prompt_vars = {\n",
+    "    \"inputVars\": [\n",
+    "        \"text\",\n",
+    "        \"student_grade_level\",\n",
+    "        \"student_background_knowledge\",\n",
+    "        \"fk_level\",\n",
+    "    ],\n",
+    "    \"outputParser\": JsonOutputParser(pydantic_object=Output),\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Helper functions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "23047ec3-79b2-4e45-b452-873eae41b030",
+     "showTitle": false,
+     "tableResultSettingsMap": {},
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import textwrap\n",
+    "\n",
+    "\n",
+    "def calculate_fk_score(text) -> float:\n",
+    "    \"\"\"\n",
+    "    Calculate the Flesch-Kincaid Grade Level\n",
+    "    \"\"\"\n",
+    "    fk_score = round(ts.flesch_kincaid_grade(text), 2)\n",
+    "\n",
+    "    return fk_score\n",
+    "\n",
+    "\n",
+    "def prepare_text_for_complexity_prediction(text, grade):\n",
+    "    \"\"\"\n",
+    "    Enrich the text and grade given by user with additional features for complexity prediction.\n",
+    "    \"\"\"\n",
+    "    dataset = {\n",
+    "        \"text\": text,\n",
+    "        \"student_grade_level\": grade,\n",
+    "        \"fk_level\": calculate_fk_score(text),\n",
+    "        \"student_background_knowledge\": get_background_knowledge_assumption(\n",
+    "            text, grade\n",
+    "        ),\n",
+    "    }\n",
+    "\n",
+    "    return dataset\n",
+    "\n",
+    "\n",
+    "def prettify_vocab_complexity_output(vocab_complexity_output):\n",
+    "    output = f\"\"\"\n",
+    "        ========================= Complexity Score ========================\n",
+    "        {vocab_complexity_output.get('complexity_score') or vocab_complexity_output.get('answer') or 'N/A'}\n",
+    "\n",
+    "        ========================= Complexity Score Reasoning ==============\n",
+    "        {textwrap.fill(vocab_complexity_output.get('reasoning', 'N/A'), width=80)}\n",
+    "\n",
+    "        ========================  Complex words  ==========================\n",
+    "        * Tier 2 words: {textwrap.fill(vocab_complexity_output.get('tier_2_words', 'N/A'), width=65)}\n",
+    "        * Tier 3 words: {textwrap.fill(vocab_complexity_output.get('tier_3_words', 'N/A'), width=65)}\n",
+    "        * Archaic words: {textwrap.fill(vocab_complexity_output.get('archaic_words', 'N/A'), width=65)}\n",
+    "        * Other complex words: {textwrap.fill(vocab_complexity_output.get('other_complex_words', 'N/A'), width=60)}\"\"\"\n",
+    "\n",
+    "    print(textwrap.dedent(output).strip())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_prompts_for_grade(grade: int) -> dict:\n",
+    "    \"\"\"\n",
+    "    Returns the appropriate SYSTEM_PROMPT and USER_PROMPT for the given grade.\n",
+    "    \n",
+    "    Args:\n",
+    "        grade: Grade level (3-12)\n",
+    "    \n",
+    "    Returns:\n",
+    "        dict with keys 'SYSTEM_PROMPT' and 'USER_PROMPT'\n",
+    "    \"\"\"\n",
+    "    if grade == 3 or grade == 4:\n",
+    "        return prompts.GRADE_SPECIFIC_PROMPTS[\"GRADES_3_4\"]\n",
+    "    else:  # 5-12\n",
+    "        return prompts.GRADE_SPECIFIC_PROMPTS[\"OTHER_GRADES\"]\n",
+    "\n",
+    "\n",
+    "def get_vocab_model_for_grade(grade: int):\n",
+    "    \"\"\"\n",
+    "    Returns the appropriate vocabulary complexity model for the given grade.\n",
+    "\n",
+    "    Grades 3 & 4 use Gemini (gemini-2.5-pro), which was validated against\n",
+    "    the GRADES_3_4 prompt. All other grades use GPT-4.1, which was validated\n",
+    "    against the OTHER_GRADES prompt.\n",
+    "\n",
+    "    Args:\n",
+    "        grade: Grade level (3-12)\n",
+    "\n",
+    "    Returns:\n",
+    "        A LangChain chat model instance\n",
+    "    \"\"\"\n",
+    "    if grade == 3 or grade == 4:\n",
+    "        return vocab_complexity_model_grades_3_4\n",
+    "    else:  # 5-12\n",
+    "        return vocab_complexity_model_other_grades\n",
+    "\n",
+    "\n",
+    "def normalize_complexity_output(output: dict) -> dict:\n",
+    "    \"\"\"\n",
+    "    Normalize complexity output to use consistent string labels.\n",
+    "    Converts integer 'answer' (from OTHER_GRADES) to string 'complexity_score'.\n",
+    "    \n",
+    "    Args:\n",
+    "        output: Raw output from the model\n",
+    "    \n",
+    "    Returns:\n",
+    "        Normalized output with 'complexity_score' field\n",
+    "    \"\"\"\n",
+    "    mapping = {\n",
+    "        1: \"Slightly Complex\",\n",
+    "        2: \"Moderately Complex\",\n",
+    "        3: \"Very Complex\",\n",
+    "        4: \"Exceedingly Complex\"\n",
+    "    }\n",
+    "\n",
+    "    # Handle 'answer' field from OTHER_GRADES (will be int or string int)\n",
+    "    if 'answer' in output:\n",
+    "        value = output['answer']\n",
+    "        # Convert int or string int to proper complexity label\n",
+    "        if isinstance(value, str) and value.isdigit():\n",
+    "            value = int(value)\n",
+    "        output['complexity_score'] = mapping.get(value, str(value))\n",
+    "    \n",
+    "    # For GRADES_3_4, complexity_score already exists as a string - no changes needed\n",
+    "\n",
+    "    return output"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def predict_text_complexity_level(text, grade):\n",
+    "    \"\"\"\n",
+    "    Predict the text complexity level as well as the complex words and reasoning.\n",
+    "\n",
+    "    Args:\n",
+    "        text: The text to evaluate.\n",
+    "        grade: Grade level. Must be between 3 and 12 inclusive.\n",
+    "\n",
+    "    Raises:\n",
+    "        ValueError: If grade is not in SUPPORTED_GRADES.\n",
+    "    \"\"\"\n",
+    "    if grade not in SUPPORTED_GRADES:\n",
+    "        raise ValueError(\n",
+    "            f\"Grade {grade} is not supported. This evaluator supports grades \"\n",
+    "            f\"{min(SUPPORTED_GRADES)}-{max(SUPPORTED_GRADES)}.\"\n",
+    "        )\n",
+    "\n",
+    "    dataset = prepare_text_for_complexity_prediction(text, grade)\n",
+    "\n",
+    "    # Get grade-specific prompts (the model is selected when the chain is built below)\n",
+    "    grade_prompts = get_prompts_for_grade(grade)\n",
+    "\n",
+    "    # Use grade-specific prompts\n",
+    "    messages = [\n",
+    "        SystemMessage(content=grade_prompts['SYSTEM_PROMPT']),\n",
+    "        HumanMessagePromptTemplate.from_template(grade_prompts['USER_PROMPT']),\n",
+    "    ]\n",
+    "\n",
+    "    # Prepare chat prompt\n",
+    "    prompt = ChatPromptTemplate(\n",
+    "        messages,\n",
+    "        input_variables=prompt_vars[\"inputVars\"],\n",
+    "        partial_variables={\n",
+    "            \"format_instructions\": prompt_vars[\"outputParser\"].get_format_instructions()\n",
+    "        },\n",
+    "    )\n",
+    "\n",
+    "    # Build the chain: prompt -> grade-specific model -> JSON parser\n",
+    "    chain = prompt | get_vocab_model_for_grade(grade) | JsonOutputParser()\n",
+    "\n",
+    "    # Get output and normalize it\n",
+    "    output = chain.invoke(dataset)\n",
+    "    output = normalize_complexity_output(output)\n",
+    "\n",
+    "    return output"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "4437f840-a00b-47cf-a7fa-86d882f0f6af",
+     "showTitle": false,
+     "tableResultSettingsMap": {},
+     "title": ""
+    }
+   },
+   "source": [
+    "# Test out examples"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "767fe555-ef65-46fb-8da3-54f18565cb07",
+     "showTitle": false,
+     "tableResultSettingsMap": {},
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Add your text & the grade level you want to evaluate for vocabulary complexity\n",
+    "\n",
+    "# Clear ID = 2204\n",
+    "text = \"\"\"\n",
+    "Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.\n",
+    "Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.\n",
+    "His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.\n",
+    "\"\"\"\n",
+    "\n",
+    "grade_level = 3\n",
+    "\n",
+    "vocabulary_complexity_output = predict_text_complexity_level(text, grade_level)\n",
+    "\n",
+    "# Pretty Print the output\n",
+    "prettify_vocab_complexity_output(vocabulary_complexity_output)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can copy or edit the above cell to test out different texts and grade levels."
+   ]
+  }
+ ],
+ "metadata": {
+  "application/vnd.databricks.v1+notebook": {
+   "computePreferences": null,
+   "dashboards": [],
+   "environmentMetadata": {
+    "base_environment": "",
+    "environment_version": "3"
+   },
+   "inputWidgetPreferences": null,
+   "language": "python",
+   "notebookMetadata": {
+    "pythonIndentUnit": 4
+   },
+   "notebookName": "(Final Ship) Vocabulary Evaluator - Sep release (Using LangChain to instrument)",
+   "widgets": {}
+  },
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
   },
-  "nbformat": 4,
-  "nbformat_minor": 0
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
 }

From ffa652fe4279c99cf87f167f118ef25ae752a351 Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Wed, 13 May 2026 02:00:41 -0700
Subject: [PATCH 08/10] feat: base eval support for json normalizers

---
 .../evaluators/base.py                        | 19 +++++++
 sdks/python/tests/evaluators/test_base.py     | 55 +++++++++++++++++++
 2 files changed, 74 insertions(+)

diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/base.py b/sdks/python/src/learning_commons_evaluators/evaluators/base.py
index 5a89076..509b2e1 100644
--- a/sdks/python/src/learning_commons_evaluators/evaluators/base.py
+++ b/sdks/python/src/learning_commons_evaluators/evaluators/base.py
@@ -196,6 +196,7 @@ def execute_prompt_chain_step(
         template: Any,
         chain_inputs: dict[str, Any],
         parser_output_type: type[ParsedT],
+        json_dict_normalizer: Callable[[dict], dict] | None = None,
     ) -> ParsedT: ...
 
     def execute_prompt_chain_step(
@@ -206,6 +207,7 @@ def execute_prompt_chain_step(
         template: Any,
         chain_inputs: dict[str, Any],
         parser_output_type: type[BaseModel] | None = None,
+        json_dict_normalizer: Callable[[dict], dict] | None = None,
     ) -> BaseModel | str:
         """Run a prompt chain (template | LLM), record metadata, and return the result.
 
@@ -227,6 +229,12 @@ def execute_prompt_chain_step(
             chain_inputs: Variables to format the template and invoke the chain.
             parser_output_type: Pydantic model class for JSON parsing, or ``None``
                 to return the raw text response.
+            json_dict_normalizer: When set with ``parser_output_type``, parse the
+                model response as JSON into a plain dict (no Pydantic parse),
+                apply this function (e.g. notebook-style ``normalize_complexity_output``),
+                then validate with ``parser_output_type``. Format instructions for the
+                prompt should still be built from the same ``parser_output_type`` via
+                :class:`~langchain_core.output_parsers.JsonOutputParser`.
 
         Returns:
             Parsed instance of ``parser_output_type`` when it is a model class; plain
@@ -235,7 +243,10 @@ def execute_prompt_chain_step(
         Raises:
             ConfigurationError: No provider config for prompt_settings.provider_type.
             EvaluatorError: SDK errors, including :func:`~learning_commons_evaluators.schemas.errors.wrap_provider_error` output for LangChain or HTTP failures (typically :class:`~learning_commons_evaluators.schemas.errors.APIError` subclasses). Pydantic :exc:`pydantic.ValidationError` from output parsing is re-raised unchanged.
+            ValueError: If ``json_dict_normalizer`` is set but ``parser_output_type`` is omitted.
         """
+        if json_dict_normalizer is not None and parser_output_type is None:
+            raise ValueError("json_dict_normalizer requires parser_output_type to be set")
         # Populated after a successful LLM invoke so we can attach usage even if parsing fails.
         token_usage: TokenUsage | None = None
 
@@ -250,6 +261,14 @@ def _run_chain() -> BaseModel | str:
                     return str(ai_message.content)
                 from langchain_core.output_parsers.json import JsonOutputParser
 
+                if json_dict_normalizer is not None:
+                    loose = JsonOutputParser()
+                    parsed_dict = loose.invoke(ai_message)
+                    if not isinstance(parsed_dict, dict):
+                        parsed_dict = dict(parsed_dict)
+                    normalized = json_dict_normalizer(parsed_dict)
+                    return parser_output_type.model_validate(normalized)
+
                 parser = JsonOutputParser(pydantic_object=parser_output_type)
                 raw = parser.invoke(ai_message)
                 if isinstance(raw, parser_output_type):
diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py
index a2718c4..333e5c1 100644
--- a/sdks/python/tests/evaluators/test_base.py
+++ b/sdks/python/tests/evaluators/test_base.py
@@ -337,6 +337,25 @@ def _fake_llm(_pv):
             )
         assert out == "plain prose"
 
+    def test_json_dict_normalizer_without_parser_type_raises(
+        self, stub_evaluator, evaluation_metadata
+    ):
+        template = ChatPromptTemplate.from_messages([("human", "{input}")])
+        with pytest.raises(ValueError, match="json_dict_normalizer requires"):
+            stub_evaluator.execute_prompt_chain_step(
+                step_name="raw",
+                prompt_settings=PromptSettings(
+                    provider_type=LlmProvider.GOOGLE,
+                    model="gemini-2.0-flash",
+                    temperature=0.0,
+                ),
+                evaluation_metadata=evaluation_metadata,
+                template=template,
+                chain_inputs={"input": "Hello"},
+                parser_output_type=None,
+                json_dict_normalizer=lambda d: d,
+            )
+
     def test_returns_parsed_pydantic_output(self, stub_evaluator, evaluation_metadata):
         def _fake_llm(_pv):
             return AIMessage(content=_CHAIN_JSON)
@@ -359,6 +378,42 @@ def _fake_llm(_pv):
         assert result.label == "ok"
         assert result.score == 7
 
+    def test_json_dict_normalizer_parses_dict_then_normalizes_then_validates(
+        self, stub_evaluator, evaluation_metadata
+    ):
+        """Optional ``json_dict_normalizer``: loose JSON → dict → user fn → ``model_validate``."""
+
+        def _fake_llm(_pv):
+            return AIMessage(content='{"n": 1}')
+
+        class _Out(BaseModel):
+            n: int = Field(description="n")
+            doubled: int = Field(description="doubled")
+
+        def _double(d: dict) -> dict:
+            d = dict(d)
+            d["doubled"] = int(d["n"]) * 2
+            return d
+
+        template = ChatPromptTemplate.from_messages([("human", "{input}")])
+        with patch(_CHAIN_PATCH, return_value=_fake_llm):
+            result = stub_evaluator.execute_prompt_chain_step(
+                step_name="main",
+                prompt_settings=PromptSettings(
+                    provider_type=LlmProvider.GOOGLE,
+                    model="gemini-2.0-flash",
+                    temperature=0.0,
+                ),
+                evaluation_metadata=evaluation_metadata,
+                template=template,
+                chain_inputs={"input": "Hello"},
+                parser_output_type=_Out,
+                json_dict_normalizer=_double,
+            )
+        assert isinstance(result, _Out)
+        assert result.n == 1
+        assert result.doubled == 2
+
     def test_parser_returning_model_instance_short_circuits_model_validate(
         self, stub_evaluator, evaluation_metadata
     ):

From 0b449c8567ad6d5aa142e328c04acc637607483d Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Wed, 13 May 2026 02:01:04 -0700
Subject: [PATCH 09/10] chore: cleaner implementation of vocab

---
 .../learning_commons_evaluators/__init__.py   |  10 +-
 .../evaluators/__init__.py                    |   2 +
 .../evaluators/vocabulary.py                  | 132 ++++++------------
 .../schemas/vocabulary.py                     |  44 +++---
 .../settings/vocabulary/contracts.toml        |   6 +-
 .../tests/contract_tests/test_vocabulary.py   |  12 +-
 .../python/tests/contract_tests/vocabulary.py |  17 ++-
 .../tests/evaluators/test_vocabulary.py       |  72 +++++++---
 sdks/settings/vocabulary/contracts.toml       |   6 +-
 9 files changed, 152 insertions(+), 149 deletions(-)

diff --git a/sdks/python/src/learning_commons_evaluators/__init__.py b/sdks/python/src/learning_commons_evaluators/__init__.py
index 8594c42..b3208dc 100644
--- a/sdks/python/src/learning_commons_evaluators/__init__.py
+++ b/sdks/python/src/learning_commons_evaluators/__init__.py
@@ -86,10 +86,9 @@
     TextComplexityEvaluationInput,
 )
 from learning_commons_evaluators.schemas.vocabulary import (
-    VOCABULARY_SUPPORTED_GRADES,
+    VocabularyComplexityOutput,
     VocabularyEvaluationSettings,
-    VocabularyOutputGrades34,
-    VocabularyOutputOtherGrades,
+    normalize_complexity_output,
 )
 
 __all__ = [
@@ -139,12 +138,11 @@
     "TextInputField",
     "TokenUsage",
     "ValidationError",
-    "VOCABULARY_SUPPORTED_GRADES",
     "VocabularyEvaluationInput",
     "VocabularyEvaluationSettings",
     "VocabularyEvaluator",
-    "VocabularyOutputGrades34",
-    "VocabularyOutputOtherGrades",
+    "VocabularyComplexityOutput",
+    "normalize_complexity_output",
     "create_config",
     "create_config_no_telemetry",
     "create_config_telemetry_with_full_input",
diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/__init__.py b/sdks/python/src/learning_commons_evaluators/evaluators/__init__.py
index d42cd15..5bb9fc1 100644
--- a/sdks/python/src/learning_commons_evaluators/evaluators/__init__.py
+++ b/sdks/python/src/learning_commons_evaluators/evaluators/__init__.py
@@ -15,6 +15,7 @@
     VocabularyEvaluationInput,
     VocabularyEvaluator,
 )
+from learning_commons_evaluators.schemas.vocabulary import normalize_complexity_output
 
 __all__ = [
     "BaseEvaluator",
@@ -24,4 +25,5 @@
     "OutputT",
     "VocabularyEvaluationInput",
     "VocabularyEvaluator",
+    "normalize_complexity_output",
 ]
diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py b/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py
index 780496b..8b2f449 100644
--- a/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py
+++ b/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py
@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-from typing import ClassVar
+from typing import Any, ClassVar
 
 import textstat  # type: ignore[import-untyped]
 from langchain_core.output_parsers import JsonOutputParser
@@ -28,9 +28,9 @@
     TextComplexityResult,
 )
 from learning_commons_evaluators.schemas.vocabulary import (
+    VocabularyComplexityOutput,
     VocabularyEvaluationSettings,
-    VocabularyOutputGrades34,
-    VocabularyOutputOtherGrades,
+    normalize_complexity_output,
 )
 from learning_commons_evaluators.settings._generated_vocabulary_settings import (
     CONFIG as _VOCABULARY_CONFIG,
@@ -73,11 +73,11 @@ class VocabularyEvaluator(
        students at the target grade already know about the text's topic.  This
        provides context that keeps the complexity rating from penalising familiar
        domain words.
-    2. **Vocabulary complexity** – a grade-specific prompt + model produces the
-       final score and reasoning.  Grades 3–4 use a Gemini model and return a
-       rubric label plus a word-level breakdown; grades 5–12 use a GPT model and
-       return a complexity label (e.g. ``"slightly complex"``). The results are
-       normalised and mapped to :class:`TextComplexityAnswer`.
+    2. **Vocabulary complexity** – a grade-specific prompt + model produces JSON
+       matching the notebook ``Output`` shape. Raw JSON is passed through
+       :func:`~learning_commons_evaluators.schemas.vocabulary.normalize_complexity_output`
+       (same behaviour as the notebook), then validated. Grades 3–4 use Gemini;
+       grades 5–12 use GPT. Scores are normalised to :class:`TextComplexityAnswer`.
 
     Supported grades: 3–12.
     """
@@ -96,21 +96,20 @@ def evaluate_impl(
         """Run the two-step vocabulary evaluation and return a TextComplexityResult.
 
         Grade validation is handled by the framework before this method is called:
-        ``VocabularyEvaluationInput`` automatically constrains ``grade`` to
-        :data:`~learning_commons_evaluators.schemas.vocabulary.VOCABULARY_SUPPORTED_GRADES`
-        (3–12), so ``BaseEvaluator.evaluate`` raises before reaching here for
-        unsupported grades.
+        ``VocabularyEvaluationInput`` automatically constrains ``grade`` to the
+        evaluator's ``allowed_grades`` from settings (3–12), so
+        ``BaseEvaluator.evaluate`` raises before reaching here for unsupported grades.
         """
         ps_bk = evaluation_settings.prompt_settings_step_background_knowledge
         ps_34 = evaluation_settings.prompt_settings_step_vocab_grades_3_4
         ps_og = evaluation_settings.prompt_settings_step_vocab_other_grades
 
         grade = input.grade.value
-        fk_score = round(textstat.flesch_kincaid_grade(input.text.value), 2)
+        text = input.text.value
+        fk_score = round(textstat.flesch_kincaid_grade(text), 2)
         prompts = _VOCABULARY_CONFIG.prompts
 
         # ── Step 1: background knowledge ──────────────────────────────────────
-        # parser_output_type=None → execute_prompt_chain_step returns plain str.
         bk_template = ChatPromptTemplate.from_messages(
             [("human", prompts["background_knowledge_prompt"])]
         )
@@ -119,29 +118,32 @@ def evaluate_impl(
             prompt_settings=ps_bk,
             evaluation_metadata=evaluation_metadata,
             template=bk_template,
-            chain_inputs={"text": input.text.value, "grade": grade},
+            chain_inputs={"text": text, "grade": grade},
             parser_output_type=None,
         )
 
-        # ── Step 2: vocabulary complexity (grade-specific) ────────────────────
+        # ── Step 2: vocabulary complexity (grade-specific prompts, shared Output shape)
+        chain_inputs: dict[str, Any] = {
+            "text": text,  # reuse the local bound above, as step 1 does
+            "student_grade_level": grade,
+            "student_background_knowledge": background_knowledge,
+        }
         if grade in _GRADES_3_4:
-            answer, explanation = self._evaluate_grades_3_4(
-                input=input,
-                grade=grade,
-                fk_score=fk_score,
-                background_knowledge=background_knowledge,
+            chain_inputs["fk_level"] = fk_score
+            answer, explanation = self._run_vocab_complexity_chain(
+                chain_inputs=chain_inputs,
                 evaluation_metadata=evaluation_metadata,
-                prompts=prompts,
                 prompt_settings_vocab=ps_34,
+                system_prompt=prompts["vocab_grades_3_4_system_prompt"],
+                user_prompt_template=prompts["vocab_grades_3_4_user_prompt"],
             )
         else:
-            answer, explanation = self._evaluate_other_grades(
-                input=input,
-                grade=grade,
-                background_knowledge=background_knowledge,
+            answer, explanation = self._run_vocab_complexity_chain(
+                chain_inputs=chain_inputs,
                 evaluation_metadata=evaluation_metadata,
-                prompts=prompts,
                 prompt_settings_vocab=ps_og,
+                system_prompt=prompts["vocab_other_grades_system_prompt"],
+                user_prompt_template=prompts["vocab_other_grades_user_prompt"],
             )
 
         return TextComplexityResult(
@@ -150,28 +152,20 @@ def evaluate_impl(
             metadata=evaluation_metadata,
         )
 
-    # ── Private helpers ───────────────────────────────────────────────────────
-
-    def _evaluate_grades_3_4(
+    def _run_vocab_complexity_chain(
         self,
         *,
-        input: VocabularyEvaluationInput,
-        grade: int,
-        fk_score: float,
-        background_knowledge: str,
+        chain_inputs: dict[str, Any],
         evaluation_metadata: EvaluationMetadata,
-        prompts: dict,
         prompt_settings_vocab: PromptSettings,
+        system_prompt: str,
+        user_prompt_template: str,
     ) -> tuple[TextComplexityAnswer, EvaluationExplanation]:
-        """Run the grades 3–4 vocabulary complexity step.
-
-        Returns a rubric-label score and a word-breakdown explanation.
-        """
-        parser = JsonOutputParser(pydantic_object=VocabularyOutputGrades34)
+        parser = JsonOutputParser(pydantic_object=VocabularyComplexityOutput)
         template = ChatPromptTemplate.from_messages(
             [
-                ("system", prompts["vocab_grades_3_4_system_prompt"]),
-                ("human", prompts["vocab_grades_3_4_user_prompt"]),
+                ("system", system_prompt),
+                ("human", user_prompt_template),
             ]
         ).partial(format_instructions=parser.get_format_instructions())
 
@@ -180,13 +174,9 @@ def _evaluate_grades_3_4(
             prompt_settings=prompt_settings_vocab,
             evaluation_metadata=evaluation_metadata,
             template=template,
-            chain_inputs={
-                "text": input.text.value,
-                "student_grade_level": grade,
-                "student_background_knowledge": background_knowledge,
-                "fk_level": fk_score,
-            },
-            parser_output_type=VocabularyOutputGrades34,
+            chain_inputs=chain_inputs,
+            parser_output_type=VocabularyComplexityOutput,
+            json_dict_normalizer=normalize_complexity_output,
         )
 
         # Normalise the score string: the prompt may return spaces ("very complex")
@@ -203,47 +193,3 @@ def _evaluate_grades_3_4(
             },
         )
         return answer, explanation
-
-    def _evaluate_other_grades(
-        self,
-        *,
-        input: VocabularyEvaluationInput,
-        grade: int,
-        background_knowledge: str,
-        evaluation_metadata: EvaluationMetadata,
-        prompts: dict,
-        prompt_settings_vocab: PromptSettings,
-    ) -> tuple[TextComplexityAnswer, EvaluationExplanation]:
-        """Run the grades 5–12 vocabulary complexity step.
-
-        Returns a string-label score and a reasoning explanation.
-        The OTHER_GRADES prompt uses the same ``Output``-style schema as grades 3–4,
-        so the LLM returns a word-level breakdown and a string ``complexity_score``.
-        """
-        parser = JsonOutputParser(pydantic_object=VocabularyOutputOtherGrades)
-        template = ChatPromptTemplate.from_messages(
-            [
-                ("system", prompts["vocab_other_grades_system_prompt"]),
-                ("human", prompts["vocab_other_grades_user_prompt"]),
-            ]
-        ).partial(format_instructions=parser.get_format_instructions())
-
-        output = self.execute_prompt_chain_step(
-            step_name="vocab_complexity",
-            prompt_settings=prompt_settings_vocab,
-            evaluation_metadata=evaluation_metadata,
-            template=template,
-            chain_inputs={
-                "text": input.text.value,
-                "student_grade_level": grade,
-                "student_background_knowledge": background_knowledge,
-            },
-            parser_output_type=VocabularyOutputOtherGrades,
-        )
-
-        # Normalise the score string: the prompt may return spaces ("slightly complex")
-        # but TextComplexityAnswer expects underscores ("slightly_complex").
-        score = output.complexity_score.lower().replace(" ", "_")
-        answer = TextComplexityAnswer.from_score(score)
-        explanation = EvaluationExplanation(summary=output.reasoning, details={})
-        return answer, explanation
diff --git a/sdks/python/src/learning_commons_evaluators/schemas/vocabulary.py b/sdks/python/src/learning_commons_evaluators/schemas/vocabulary.py
index aa6c3b1..64cde9f 100644
--- a/sdks/python/src/learning_commons_evaluators/schemas/vocabulary.py
+++ b/sdks/python/src/learning_commons_evaluators/schemas/vocabulary.py
@@ -7,11 +7,6 @@
     PromptSettings,
 )
 
-# Grades supported by the vocabulary evaluator.
-# Kept here for backwards-compatibility; the actual constraint is now enforced
-# via allowed_grades in [[evaluator_metadata.inputs]] in the vocabulary evaluator settings.
-VOCABULARY_SUPPORTED_GRADES: frozenset[int] = frozenset(range(3, 13))
-
 
 class VocabularyEvaluationSettings(EvaluationSettings):
     """Settings for a vocabulary complexity evaluation.
@@ -34,8 +29,7 @@ class VocabularyEvaluationSettings(EvaluationSettings):
 # that key (see ``contracts.toml``). Document behavior on fields / comments.
 
 
-class VocabularyOutputGrades34(BaseModel):
-    # LLM output for grades 3–4: rubric label + word breakdown; mirrors evals notebook Output.
+class VocabularyComplexityOutput(BaseModel):
     tier_2_words: str = Field(description="List of Tier 2 words")
     tier_3_words: str = Field(description="List of Tier 3 words")
     archaic_words: str = Field(description="List of Archaic words")
@@ -46,13 +40,29 @@ class VocabularyOutputGrades34(BaseModel):
     reasoning: str = Field(description="your reasoning for your answer")
 
 
-class VocabularyOutputOtherGrades(BaseModel):
-    # Same shape as grades 3–4; complexity_score is normalised to underscores before TextComplexityAnswer.
-    tier_2_words: str = Field(description="List of Tier 2 words")
-    tier_3_words: str = Field(description="List of Tier 3 words")
-    archaic_words: str = Field(description="List of Archaic words")
-    other_complex_words: str = Field(description="List of Other Complex words")
-    complexity_score: str = Field(
-        description="the complexity of the text, one of: slightly complex, moderately complex, very complex, or exceedingly complex"
-    )
-    reasoning: str = Field(description="your reasoning for your answer")
+def normalize_complexity_output(output: dict) -> dict:
+    """Mirror ``evals/vocabulary_evaluator.ipynb`` ``normalize_complexity_output``.
+
+    Maps a rubric ``answer`` of 1–4 (number or decimal-digit string) to the notebook
+    ``complexity_score`` label; any other non-null ``answer`` is kept in string form.
+    An absent or ``None`` ``answer`` leaves ``complexity_score`` unchanged. Missing or
+    ``None`` word-list fields become ``\"\"`` so minimal JSON still validates.
+    """
+    result = dict(output)  # shallow copy; the caller's dict is not mutated
+    for key in ("tier_2_words", "tier_3_words", "archaic_words", "other_complex_words"):
+        if result.get(key) is None:
+            result[key] = ""
+    mapping = {
+        1: "Slightly Complex",
+        2: "Moderately Complex",
+        3: "Very Complex",
+        4: "Exceedingly Complex",
+    }
+    value = result.get("answer")
+    # ``isdecimal`` accepts only what ``int()`` can parse; ``isdigit`` also matches "²".
+    if isinstance(value, str) and value.strip().isdecimal():
+        value = int(value)
+    if value is not None:  # JSON ``null`` must not clobber an existing score
+        label = mapping.get(value) if isinstance(value, (int, float)) else None
+        result["complexity_score"] = label or str(value)
+    return result
diff --git a/sdks/python/src/learning_commons_evaluators/settings/vocabulary/contracts.toml b/sdks/python/src/learning_commons_evaluators/settings/vocabulary/contracts.toml
index a734162..8420e01 100644
--- a/sdks/python/src/learning_commons_evaluators/settings/vocabulary/contracts.toml
+++ b/sdks/python/src/learning_commons_evaluators/settings/vocabulary/contracts.toml
@@ -30,8 +30,10 @@
 # -----------------------
 # marco_polo_grade3  — grade 3, uses the grades 3–4 path (Gemini + rubric labels
 #                      + word-level breakdown in expected_result).
-# hurricanes_grade7  — grade 7, uses the grades 5–12 path (GPT + integer score
-#                      in expected_result).
+# hurricanes_grade7  — grade 7, uses the grades 5–12 path (GPT; expected_result
+#                      holds ``complexity_score`` as a string plus word-list fields,
+#                      matching the notebook ``Output`` shape. The SDK also accepts
+#                      legacy integer ``answer`` in raw JSON like ``normalize_complexity_output``.)
 
 # ===========================================================================
 # Case 1: marco_polo_grade3  (grades 3–4 path)
diff --git a/sdks/python/tests/contract_tests/test_vocabulary.py b/sdks/python/tests/contract_tests/test_vocabulary.py
index 1cc5baf..57364c7 100644
--- a/sdks/python/tests/contract_tests/test_vocabulary.py
+++ b/sdks/python/tests/contract_tests/test_vocabulary.py
@@ -3,8 +3,8 @@
 Each test verifies two things for a known input:
   1. The SDK sends the same LLM requests as the notebook (fully-formatted
      system prompt, user prompt, model, and temperature — for both steps).
-  2. Given the same LLM responses, the SDK produces the same result as the
-     notebook.
+  2. Given the same LLM responses, the SDK produces the same structured result
+     as the notebook (score, reasoning, and word-list fields where applicable).
 
 The vocabulary evaluator makes two LLM calls per evaluation:
   - Step "background_knowledge": plain-text response (no system prompt).
@@ -99,8 +99,9 @@ def test_hurricanes_grade7(self) -> None:
 
         Verifies:
         - Both LLM requests match the notebook.
-        - Given the notebook's LLM responses, the SDK maps the integer score
-          to the correct TextComplexityAnswer and returns the same reasoning.
+        - Given the notebook's LLM responses, the SDK maps ``complexity_score``
+          to the correct ``TextComplexityAnswer`` and returns the same reasoning
+          and word-breakdown details as the notebook.
         """
         case = load_vocabulary_other_grades_case()
 
@@ -130,3 +131,6 @@ def test_hurricanes_grade7(self) -> None:
         assert result.explanation.summary == expected.explanation.summary, (
             "explanation.summary (reasoning) differs between SDK and notebook"
         )
+        assert result.explanation.details == expected.explanation.details, (
+            "explanation.details (word breakdown) differs between SDK and notebook"
+        )
diff --git a/sdks/python/tests/contract_tests/vocabulary.py b/sdks/python/tests/contract_tests/vocabulary.py
index ada2fb0..26e56b1 100644
--- a/sdks/python/tests/contract_tests/vocabulary.py
+++ b/sdks/python/tests/contract_tests/vocabulary.py
@@ -114,10 +114,11 @@ def vocabulary_other_grades_notebook_to_sdk_result(
 
     The OTHER_GRADES path uses the same Output-style schema as grades 3–4, so
     ``expected_result`` contains a string ``complexity_score`` (e.g. "slightly
-    complex") which is normalised to underscore form before mapping.
+    complex") which is normalised to underscore form before mapping, plus the
+    same word-list fields as the notebook dict.
 
-    Only ``answer`` and ``explanation.summary`` are compared — ``metadata`` is
-    a placeholder because it contains non-deterministic fields.
+    Only ``answer`` and ``explanation`` are compared — ``metadata`` is a
+    placeholder because it contains non-deterministic fields.
 
     Args:
         case: A loaded :class:`~loader.ContractCase` with a populated
@@ -129,7 +130,15 @@ def vocabulary_other_grades_notebook_to_sdk_result(
     r = case.expected_result
     score = r["complexity_score"].lower().replace(" ", "_")
     answer = TextComplexityAnswer.from_score(score)
-    explanation = EvaluationExplanation(summary=r["reasoning"], details={})
+    explanation = EvaluationExplanation(
+        summary=r["reasoning"],
+        details={
+            "tier_2_words": r["tier_2_words"],
+            "tier_3_words": r["tier_3_words"],
+            "archaic_words": r["archaic_words"],
+            "other_complex_words": r["other_complex_words"],
+        },
+    )
     return TextComplexityResult(
         answer=answer,
         explanation=explanation,
diff --git a/sdks/python/tests/evaluators/test_vocabulary.py b/sdks/python/tests/evaluators/test_vocabulary.py
index 7aefb0c..37bd963 100644
--- a/sdks/python/tests/evaluators/test_vocabulary.py
+++ b/sdks/python/tests/evaluators/test_vocabulary.py
@@ -12,9 +12,8 @@
 from learning_commons_evaluators.schemas.errors import ConfigurationError, ValidationError
 from learning_commons_evaluators.schemas.metadata import Status
 from learning_commons_evaluators.schemas.vocabulary import (
-    VOCABULARY_SUPPORTED_GRADES,
-    VocabularyOutputGrades34,
-    VocabularyOutputOtherGrades,
+    VocabularyComplexityOutput,
+    normalize_complexity_output,
 )
 
 _SAMPLE_TEXT = "The cat sat on the mat."
@@ -26,8 +25,8 @@
 
 def _make_grades34_output(
     score: str = "moderately_complex",
-) -> VocabularyOutputGrades34:
-    return VocabularyOutputGrades34(
+) -> VocabularyComplexityOutput:
+    return VocabularyComplexityOutput(
         tier_2_words="sat",
         tier_3_words="none",
         archaic_words="none",
@@ -45,14 +44,10 @@ def _make_grades34_output(
 }
 
 
-def _make_other_grades_output(answer: int = 2) -> VocabularyOutputOtherGrades:
-    """Build a mock ``VocabularyOutputOtherGrades`` from a convenience integer (1–4).
+def _make_other_grades_output(answer: int = 2) -> VocabularyComplexityOutput:
+    """Build mock complexity output from a convenience integer rubric level (1–4)."""
 
-    The OTHER_GRADES LLM returns space-separated string labels ("slightly complex",
-    etc.).  Calling code keeps the same integer-based API for readability; this
-    helper maps it to the string the real model would return.
-    """
-    return VocabularyOutputOtherGrades(
+    return VocabularyComplexityOutput(
         tier_2_words="sat",
         tier_3_words="none",
         archaic_words="none",
@@ -167,20 +162,55 @@ def test_evaluate_grade_12_returns_result(self):
         assert result.metadata.status == Status.succeeded
         assert result.answer.score == "slightly_complex"
 
-    def test_other_grades_explanation_details_is_empty(self):
-        """The grades 5–12 evaluator drops word-level breakdowns from explanation.details.
-
-        The LLM does return tier_2_words etc. in its output (same schema as grades
-        3–4), but the evaluator intentionally omits them from the explanation so
-        callers get a clean summary-only result for the higher-grade path.
-        """
+    def test_other_grades_explanation_includes_word_breakdown(self):
+        """Grades 5–12 mirror the notebook: word lists live in ``explanation.details``."""
         config = create_config_no_telemetry()
         evaluator = VocabularyEvaluator(config)
         inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=8)
         with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_other_grades_output(2)):
             result = evaluator.evaluate(inp)
 
-        assert result.explanation.details == {}
+        details = result.explanation.details
+        assert details["tier_2_words"] == "sat"
+        assert details["tier_3_words"] == "none"
+        assert details["archaic_words"] == "none"
+        assert details["other_complex_words"] == "none"
+
+    def test_other_grades_legacy_integer_answer_normalizes_like_notebook(self):
+        """Normalize the raw dict first, then validate it (the notebook's order)."""
+        legacy_payload = {"answer": 3, "reasoning": "Dense technical terms throughout."}
+
+        normalized = normalize_complexity_output(legacy_payload)
+        parsed = VocabularyComplexityOutput.model_validate(normalized)
+
+        assert parsed.complexity_score == "Very Complex"
+        assert parsed.tier_2_words == ""
+        assert parsed.tier_3_words == ""
+
+    def test_other_grades_legacy_string_digit_answer(self):
+        parsed = VocabularyComplexityOutput.model_validate(
+            normalize_complexity_output({"answer": "2", "reasoning": "Accessible vocabulary."})
+        )
+        assert parsed.complexity_score == "Moderately Complex"
+
+
+class TestNormalizeComplexityOutput:
+    def test_preserves_complexity_score_when_answer_absent(self):
+        row = normalize_complexity_output(
+            {
+                "tier_2_words": "a",
+                "tier_3_words": "b",
+                "archaic_words": "c",
+                "other_complex_words": "d",
+                "complexity_score": "slightly complex",
+                "reasoning": "r",
+            }
+        )
+        assert row["complexity_score"] == "slightly complex"
+
+    def test_answer_overwrites_or_sets_complexity_score(self):
+        row = normalize_complexity_output({"answer": 1, "reasoning": "x"})
+        assert row["complexity_score"] == "Slightly Complex"
 
 
 # ── Grade validation via framework ────────────────────────────────────────────
@@ -190,7 +220,7 @@ class TestVocabularyEvaluationInputValidation:
     def test_allowed_grades_set_from_toml(self):
         """VocabularyEvaluationInput picks up allowed_grades from the TOML spec."""
         inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=5)
-        assert set(inp.grade.spec.allowed_grades) == VOCABULARY_SUPPORTED_GRADES
+        assert set(inp.grade.spec.allowed_grades) == frozenset(range(3, 13))
 
     @pytest.mark.parametrize("unsupported_grade", [0, 1, 2])
     def test_unsupported_grade_raises_via_framework(self, unsupported_grade):
diff --git a/sdks/settings/vocabulary/contracts.toml b/sdks/settings/vocabulary/contracts.toml
index a734162..8420e01 100644
--- a/sdks/settings/vocabulary/contracts.toml
+++ b/sdks/settings/vocabulary/contracts.toml
@@ -30,8 +30,10 @@
 # -----------------------
 # marco_polo_grade3  — grade 3, uses the grades 3–4 path (Gemini + rubric labels
 #                      + word-level breakdown in expected_result).
-# hurricanes_grade7  — grade 7, uses the grades 5–12 path (GPT + integer score
-#                      in expected_result).
+# hurricanes_grade7  — grade 7, uses the grades 5–12 path (GPT; expected_result
+#                      holds ``complexity_score`` as a string plus word-list fields,
+#                      matching the notebook ``Output`` shape. The SDK also accepts
+#                      legacy integer ``answer`` in raw JSON like ``normalize_complexity_output``.)
 
 # ===========================================================================
 # Case 1: marco_polo_grade3  (grades 3–4 path)

From c6ac4961ff2bf4de7457ea2954314c5792fc22d5 Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Wed, 13 May 2026 20:20:41 -0700
Subject: [PATCH 10/10] chore: same step name as typescript sdk + edge case
 unit test

---
 .../evaluators/vocabulary.py                  |  2 +-
 .../tests/evaluators/test_vocabulary.py       | 30 +++++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py b/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py
index 8b2f449..cde17df 100644
--- a/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py
+++ b/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py
@@ -170,7 +170,7 @@ def _run_vocab_complexity_chain(
         ).partial(format_instructions=parser.get_format_instructions())
 
         output = self.execute_prompt_chain_step(
-            step_name="vocab_complexity",
+            step_name="complexity_evaluation",
             prompt_settings=prompt_settings_vocab,
             evaluation_metadata=evaluation_metadata,
             template=template,
diff --git a/sdks/python/tests/evaluators/test_vocabulary.py b/sdks/python/tests/evaluators/test_vocabulary.py
index 37bd963..a6eacf0 100644
--- a/sdks/python/tests/evaluators/test_vocabulary.py
+++ b/sdks/python/tests/evaluators/test_vocabulary.py
@@ -193,6 +193,26 @@ def test_other_grades_legacy_string_digit_answer(self):
         )
         assert parsed.complexity_score == "Moderately Complex"
 
+    def test_other_grades_unexpected_digit_answer_raises(self):
+        """A rubric digit outside 1–4 survives as a bare string that ``from_score`` rejects."""
+        # Same score string that ``normalize_complexity_output({"answer": 9, ...})`` yields.
+        invalid_output = VocabularyComplexityOutput(
+            tier_2_words="sat",
+            tier_3_words="none",
+            archaic_words="none",
+            other_complex_words="none",
+            complexity_score="9",
+            reasoning="Model returned an invalid rubric code.",
+        )
+        evaluator = VocabularyEvaluator(create_config_no_telemetry())
+        grade7_input = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=7)
+        expected_error = pytest.raises(ValueError, match=r"Unknown text complexity score: '9'")
+        with (
+            _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, invalid_output),
+            expected_error,
+        ):
+            evaluator.evaluate(grade7_input)
+
 
 class TestNormalizeComplexityOutput:
     def test_preserves_complexity_score_when_answer_absent(self):
@@ -212,6 +232,16 @@ def test_answer_overwrites_or_sets_complexity_score(self):
         row = normalize_complexity_output({"answer": 1, "reasoning": "x"})
         assert row["complexity_score"] == "Slightly Complex"
 
+    def test_unexpected_digit_answer_falls_back_to_string_value(self):
+        """Digits outside the 1–4 rubric (int or string) yield ``str(value)`` as the score."""
+        cases = (
+            (5, "5"),
+            ("9", "9"),
+        )
+        for raw_answer, expected_score in cases:
+            row = normalize_complexity_output({"answer": raw_answer, "reasoning": "x"})
+            assert row["complexity_score"] == expected_score
+
 
 # ── Grade validation via framework ────────────────────────────────────────────