From 6df938dc31935142cc16a464627ce6b96d2d6457 Mon Sep 17 00:00:00 2001
From: arun kumar <arunkumar.ry1@gmail.com>
Date: Thu, 2 Apr 2026 13:27:55 +0530
Subject: [PATCH 1/7] # This is a combination of 4 commits. # This is the 1st
 commit message:

Add prompt caching support for Anthropic

# The commit message #2 will be skipped:

# uncommit unnecessary file

# The commit message #3 will be skipped:

# remove rubocop and flay fixes as they are unrelated to this issue

# The commit message #4 will be skipped:

# remove rubocop ignore for anthropic complete method
---
 .overcommit.yml                               |   6 -
 lib/ruby_llm/chat.rb                          |  25 +-
 lib/ruby_llm/message.rb                       |   7 +-
 lib/ruby_llm/provider.rb                      |   2 +-
 lib/ruby_llm/providers/anthropic.rb           |   7 +
 .../providers/anthropic/capabilities.rb       | 109 +++++
 lib/ruby_llm/providers/anthropic/chat.rb      |  24 +-
 lib/ruby_llm/streaming.rb                     |   2 +-
 spec/ruby_llm/chat_cache_point_spec.rb        |  58 +++
 .../generators/chat_ui_generator_spec.rb      | 457 ++++++++++++------
 spec/ruby_llm/message_spec.rb                 |  24 +
 .../anthropic/chat_cache_control_spec.rb      |  70 +++
 12 files changed, 608 insertions(+), 183 deletions(-)
 create mode 100644 spec/ruby_llm/chat_cache_point_spec.rb
 create mode 100644 spec/ruby_llm/providers/anthropic/chat_cache_control_spec.rb

diff --git a/.overcommit.yml b/.overcommit.yml
index bb45ad9e9..4633f1939 100644
--- a/.overcommit.yml
+++ b/.overcommit.yml
@@ -6,12 +6,6 @@ PreCommit:
 
   Flay:
     enabled: true
-    include:
-     - 'lib/ruby_llm/**/*.rb'
-    exclude:
-     - 'lib/ruby_llm/providers/**/*.rb'
-     - 'lib/ruby_llm/active_record/acts_as_legacy.rb'
-    mass_threshold: 70
 
   RSpec:
     enabled: true
diff --git a/lib/ruby_llm/chat.rb b/lib/ruby_llm/chat.rb
index afc572342..352dee9f6 100644
--- a/lib/ruby_llm/chat.rb
+++ b/lib/ruby_llm/chat.rb
@@ -32,20 +32,20 @@ def initialize(model: nil, provider: nil, assume_model_exists: false, context: n
       }
     end
 
-    def ask(message = nil, with: nil, &)
-      add_message role: :user, content: build_content(message, with)
+    def ask(message = nil, with: nil, cache_point: false, &)
+      add_message role: :user, content: build_content(message, with), cache_point: cache_point
       complete(&)
     end
 
     alias say ask
 
-    def with_instructions(instructions, append: false, replace: nil)
+    def with_instructions(instructions, append: false, replace: nil, cache_point: false)
       append ||= (replace == false) unless replace.nil?
 
       if append
-        append_system_instruction(instructions)
+        append_system_instruction(instructions, cache_point: cache_point)
       else
-        replace_system_instruction(instructions)
+        replace_system_instruction(instructions, cache_point: cache_point)
       end
 
       self
@@ -329,21 +329,16 @@ def content_like?(object)
       object.is_a?(Content) || object.is_a?(Content::Raw)
     end
 
-    def append_system_instruction(instructions)
+    def append_system_instruction(instructions, cache_point: false)
       system_messages, non_system_messages = @messages.partition { |msg| msg.role == :system }
-      system_messages << Message.new(role: :system, content: instructions)
+      system_messages << Message.new(role: :system, content: instructions, cache_point: cache_point)
       @messages = system_messages + non_system_messages
     end
 
-    def replace_system_instruction(instructions)
-      system_messages, non_system_messages = @messages.partition { |msg| msg.role == :system }
+    def replace_system_instruction(instructions, cache_point: false)
+      _, non_system_messages = @messages.partition { |msg| msg.role == :system }
 
-      if system_messages.empty?
-        system_messages = [Message.new(role: :system, content: instructions)]
-      else
-        system_messages.first.content = instructions
-        system_messages = [system_messages.first]
-      end
+      system_messages = [Message.new(role: :system, content: instructions, cache_point: cache_point)]
 
       @messages = system_messages + non_system_messages
     end
diff --git a/lib/ruby_llm/message.rb b/lib/ruby_llm/message.rb
index eefb93e55..dc88da741 100644
--- a/lib/ruby_llm/message.rb
+++ b/lib/ruby_llm/message.rb
@@ -5,7 +5,8 @@ module RubyLLM
   class Message
     ROLES = %i[system user assistant tool].freeze
 
-    attr_reader :role, :model_id, :tool_calls, :tool_call_id, :raw, :thinking, :tokens
+    attr_reader :role, :model_id, :tool_calls, :tool_call_id, :raw, :thinking, :tokens, :cache_point
+    alias cache_point? cache_point
     attr_writer :content
 
     def initialize(options = {})
@@ -24,6 +25,7 @@ def initialize(options = {})
       )
       @raw = options[:raw]
       @thinking = options[:thinking]
+      @cache_point = options.fetch(:cache_point, false)
 
       ensure_valid_role
     end
@@ -80,7 +82,8 @@ def to_h
         tool_calls: tool_calls,
         tool_call_id: tool_call_id,
         thinking: thinking&.text,
-        thinking_signature: thinking&.signature
+        thinking_signature: thinking&.signature,
+        cache_point: @cache_point || nil
       }.merge(tokens ? tokens.to_h : {}).compact
     end
 
diff --git a/lib/ruby_llm/provider.rb b/lib/ruby_llm/provider.rb
index a434ac114..db5e77447 100644
--- a/lib/ruby_llm/provider.rb
+++ b/lib/ruby_llm/provider.rb
@@ -257,7 +257,7 @@ def maybe_normalize_temperature(temperature, _model)
 
     def sync_response(connection, payload, additional_headers = {})
       response = connection.post completion_url, payload do |req|
-        req.headers = additional_headers.merge(req.headers) unless additional_headers.empty?
+        req.headers.merge!(additional_headers) unless additional_headers.empty?
       end
       parse_completion_response response
     end
diff --git a/lib/ruby_llm/providers/anthropic.rb b/lib/ruby_llm/providers/anthropic.rb
index a0686036f..f7e85ff32 100644
--- a/lib/ruby_llm/providers/anthropic.rb
+++ b/lib/ruby_llm/providers/anthropic.rb
@@ -22,6 +22,13 @@ def headers
         }
       end
 
+      # Adds the prompt-caching beta for cache points, appending to any caller-supplied anthropic-beta value.
+      def complete(messages, headers: {}, **kwargs, &block)
+        betas = [headers['anthropic-beta'], 'prompt-caching-2024-07-31'].compact.uniq.join(',')
+        headers = headers.merge('anthropic-beta' => betas) if messages.any?(&:cache_point?)
+        super(messages, headers: headers, **kwargs, &block) # rubocop:disable Style/SuperArguments
+      end
+
       class << self
         def capabilities
           Anthropic::Capabilities
diff --git a/lib/ruby_llm/providers/anthropic/capabilities.rb b/lib/ruby_llm/providers/anthropic/capabilities.rb
index 730f12301..27f6c850b 100644
--- a/lib/ruby_llm/providers/anthropic/capabilities.rb
+++ b/lib/ruby_llm/providers/anthropic/capabilities.rb
@@ -14,6 +14,115 @@ def supports_tool_choice?(_model_id)
         def supports_tool_parallel_control?(_model_id)
           true
         end
+
+        def supports_json_mode?(model_id)
+          !model_id.match?(/claude-[12]/)
+        end
+
+        def supports_structured_output?(model_id)
+          match = model_id.match(/claude-(?:sonnet|opus|haiku)-(\d+)-(\d+)/)
+          return false unless match
+
+          major = match[1].to_i
+          minor = match[2].to_i
+          major > 4 || (major == 4 && minor >= 5)
+        end
+
+        def supports_extended_thinking?(model_id)
+          model_id.match?(/claude-3-7-sonnet/)
+        end
+
+        def supports_prompt_caching?(model_id)
+          !model_id.match?(/claude-[12]/)
+        end
+
+        def model_family(model_id)
+          case model_id
+          when /claude-3-7-sonnet/  then 'claude-3-7-sonnet'
+          when /claude-3-5-sonnet/  then 'claude-3-5-sonnet'
+          when /claude-3-5-haiku/   then 'claude-3-5-haiku'
+          when /claude-3-opus/      then 'claude-3-opus'
+          when /claude-3-sonnet/    then 'claude-3-sonnet'
+          when /claude-3-haiku/     then 'claude-3-haiku'
+          else 'claude-2'
+          end
+        end
+
+        def model_type(_)
+          'chat'
+        end
+
+        PRICES = {
+          'claude-3-7-sonnet': { input: 3.0, output: 15.0 },
+          'claude-3-5-sonnet': { input: 3.0, output: 15.0 },
+          'claude-3-5-haiku': { input: 0.80, output: 4.0 },
+          'claude-3-opus': { input: 15.0, output: 75.0 },
+          'claude-3-haiku': { input: 0.25, output: 1.25 },
+          'claude-2': { input: 3.0, output: 15.0 }
+        }.freeze
+
+        def default_input_price
+          3.0
+        end
+
+        def default_output_price
+          15.0
+        end
+
+        def modalities_for(model_id)
+          modalities = {
+            input: ['text'],
+            output: ['text']
+          }
+
+          unless model_id.match?(/claude-[12]/)
+            modalities[:input] << 'image'
+            modalities[:input] << 'pdf'
+          end
+
+          modalities
+        end
+
+        # Capability tags for +model_id+: 'streaming' always, tools/batch unless the id
+        # matches claude-1/2, then structured_output, reasoning and citations by id.
+        def capabilities_for(model_id)
+          capabilities = ['streaming']
+          capabilities.push('function_calling', 'batch') unless model_id.match?(/claude-[12]/)
+
+          capabilities << 'structured_output' if supports_structured_output?(model_id)
+          capabilities << 'reasoning' if model_id.match?(/claude-3-7-sonnet|claude-(?:sonnet|opus|haiku)-4/)
+          # Model ids are hyphenated here (claude-3-5-...); a "3\.5"-only pattern never matched them.
+          capabilities << 'citations' if model_id.match?(/claude-3[-.]5|claude-3-7/)
+
+          capabilities
+        end
+
+        def pricing_for(model_id)
+          family = model_family(model_id)
+          prices = PRICES.fetch(family.to_sym, { input: default_input_price, output: default_output_price })
+
+          standard_pricing = {
+            input_per_million: prices[:input],
+            output_per_million: prices[:output]
+          }
+
+          batch_pricing = {
+            input_per_million: prices[:input] * 0.5,
+            output_per_million: prices[:output] * 0.5
+          }
+
+          if model_id.match?(/claude-3-7/)
+            standard_pricing[:reasoning_output_per_million] = prices[:output] * 2.5
+            batch_pricing[:reasoning_output_per_million] = prices[:output] * 1.25
+          end
+
+          {
+            text_tokens: {
+              standard: standard_pricing,
+              batch: batch_pricing
+            }
+          }
+        end
       end
     end
   end
diff --git a/lib/ruby_llm/providers/anthropic/chat.rb b/lib/ruby_llm/providers/anthropic/chat.rb
index 9926fe98b..24f009a48 100644
--- a/lib/ruby_llm/providers/anthropic/chat.rb
+++ b/lib/ruby_llm/providers/anthropic/chat.rb
@@ -41,11 +41,13 @@ def build_system_content(system_messages)
           system_messages.flat_map do |msg|
             content = msg.content
 
-            if content.is_a?(RubyLLM::Content::Raw)
-              content.value
-            else
-              Media.format_content(content)
-            end
+            formatted = content.is_a?(RubyLLM::Content::Raw) ? content.value : Media.format_content(content)
+
+            # A lone Hash block must be wrapped by hand: Array(hash) would explode it
+            # into [key, value] pairs instead of yielding a one-element list.
+            blocks = formatted.is_a?(Hash) ? [formatted] : Array(formatted)
+
+            msg.cache_point? ? inject_cache_control(blocks) : blocks
           end
         end
 
@@ -159,6 +161,7 @@ def format_basic_message_with_thinking(msg, thinking_enabled)
           end
 
           append_formatted_content(content_blocks, msg.content)
+          inject_cache_control(content_blocks) if msg.cache_point?
 
           {
             role: convert_role(msg.role),
@@ -228,6 +231,17 @@ def append_formatted_content(content_blocks, content)
           end
         end
 
+        # Adds cache_control: { type: 'ephemeral' } to the last block; mutates and returns +blocks+.
+        # No-op when the list is empty, the last block is not a Hash, or it is already tagged.
+        def inject_cache_control(blocks)
+          last = blocks.last
+          return blocks unless last.is_a?(Hash)
+          return blocks if last[:cache_control] || last['cache_control']
+
+          blocks[-1] = last.merge(cache_control: { type: 'ephemeral' })
+          blocks
+        end
+
         def convert_role(role)
           case role
           when :tool, :user then 'user'
diff --git a/lib/ruby_llm/streaming.rb b/lib/ruby_llm/streaming.rb
index a671f9cca..df7e82566 100644
--- a/lib/ruby_llm/streaming.rb
+++ b/lib/ruby_llm/streaming.rb
@@ -9,7 +9,7 @@ def stream_response(connection, payload, additional_headers = {}, &block)
       accumulator = StreamAccumulator.new
 
       response = connection.post stream_url, payload do |req|
-        req.headers = additional_headers.merge(req.headers) unless additional_headers.empty?
+        req.headers.merge!(additional_headers) unless additional_headers.empty?
         if faraday_1?
           req.options[:on_data] = handle_stream do |chunk|
             accumulator.add chunk
diff --git a/spec/ruby_llm/chat_cache_point_spec.rb b/spec/ruby_llm/chat_cache_point_spec.rb
new file mode 100644
index 000000000..2f730c1a4
--- /dev/null
+++ b/spec/ruby_llm/chat_cache_point_spec.rb
@@ -0,0 +1,58 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+RSpec.describe RubyLLM::Chat do
+  include_context 'with configured RubyLLM'
+
+  describe 'cache_point forwarding' do
+    let(:chat) { RubyLLM.chat }
+
+    shared_examples 'a method that supports cache_point' do |message_finder|
+      it 'sets cache_point? true when cache_point: true' do
+        action.call(cache_point: true)
+        message = message_finder.call(chat)
+        expect(message).not_to be_nil
+        expect(message.cache_point?).to be true
+      end
+
+      it 'sets cache_point? false when cache_point is omitted' do
+        action.call
+        message = message_finder.call(chat)
+        expect(message).not_to be_nil
+        expect(message.cache_point?).to be false
+      end
+    end
+
+    describe '#with_instructions' do
+      let(:action) { ->(opts = {}) { chat.with_instructions('Be helpful', **opts) } }
+
+      # The finder is passed as a shared-example argument; `let` values are not visible at group-definition time.
+      it_behaves_like 'a method that supports cache_point', ->(c) { c.messages.find { |m| m.role == :system } }
+
+      it 'sets cache_point? true on appended message only' do
+        chat.with_instructions('First instruction')
+        chat.with_instructions('Second instruction', append: true, cache_point: true)
+        system_msgs = chat.messages.select { |m| m.role == :system }
+        expect(system_msgs.last.cache_point?).to be true
+        expect(system_msgs.first.cache_point?).to be false
+      end
+
+      it 'preserves cache_point: true when replacing' do
+        chat.with_instructions('Old instruction', cache_point: false)
+        chat.with_instructions('New instruction', replace: true, cache_point: true)
+        system_msgs = chat.messages.select { |m| m.role == :system }
+        expect(system_msgs.size).to eq(1)
+        expect(system_msgs.first.cache_point?).to be true
+      end
+    end
+
+    describe '#ask' do
+      before { allow(chat).to receive(:complete) }
+
+      let(:action) { ->(opts = {}) { chat.ask('Hello', **opts) } }
+
+      it_behaves_like 'a method that supports cache_point', ->(c) { c.messages.find { |m| m.role == :user } }
+    end
+  end
+end
diff --git a/spec/ruby_llm/generators/chat_ui_generator_spec.rb b/spec/ruby_llm/generators/chat_ui_generator_spec.rb
index 02e7e0f6f..612729dd9 100644
--- a/spec/ruby_llm/generators/chat_ui_generator_spec.rb
+++ b/spec/ruby_llm/generators/chat_ui_generator_spec.rb
@@ -11,17 +11,194 @@
   let(:rails_root) { Rails.root }
   let(:template_path) { File.expand_path('../../fixtures/templates', __dir__) }
 
-  def expect_messages_helper_content(path)
-    messages_helper = File.read(path)
-    expect(messages_helper).to include('def default_model_display_name')
-    expect(messages_helper).not_to include('def llm_model_label(model)')
-    expect(messages_helper).to include('RubyLLM.models.find(RubyLLM.config.default_model).label')
-    expect(messages_helper).to include('def tool_result_partial(message)')
-    expect(messages_helper).to include('def tool_call_partial(tool_call)')
-    expect(messages_helper).not_to include('def model_display_name(model)')
-    expect(messages_helper).not_to include('def provider_display_name(model_or_provider)')
-    expect(messages_helper).not_to include('def parse_tool_payload(content)')
-    expect(messages_helper).not_to include('def llm_model_info(model)')
+  describe 'with default model names' do
+    let(:app_name) { 'test_app_default' }
+    let(:app_path) { File.join(Dir.tmpdir, app_name) }
+
+    before(:all) do # rubocop:disable RSpec/BeforeAfterAll
+      template_path = File.expand_path('../../fixtures/templates', __dir__)
+      GeneratorTestHelpers.cleanup_test_app(File.join(Dir.tmpdir, 'test_app_default'))
+      GeneratorTestHelpers.create_test_app('test_app_default',
+                                           template: 'default_models_template.rb',
+                                           template_path: template_path)
+    end
+
+    after(:all) do # rubocop:disable RSpec/BeforeAfterAll
+      GeneratorTestHelpers.cleanup_test_app(File.join(Dir.tmpdir, 'test_app_default'))
+    end
+
+    it 'creates controller files with default names' do
+      within_test_app(app_path) do
+        expect(File.exist?('app/controllers/chats_controller.rb')).to be true
+        expect(File.exist?('app/controllers/messages_controller.rb')).to be true
+        expect(File.exist?('app/controllers/models_controller.rb')).to be true
+        expect(File.exist?('app/helpers/messages_helper.rb')).to be true
+
+        messages_helper = File.read('app/helpers/messages_helper.rb')
+        expect(messages_helper).to include('def default_model_display_name')
+        expect(messages_helper).not_to include('def llm_model_label(model)')
+        expect(messages_helper).to include('RubyLLM.models.find(RubyLLM.config.default_model).label')
+        expect(messages_helper).to include('def tool_result_partial(message)')
+        expect(messages_helper).to include('def tool_call_partial(tool_call)')
+        expect(messages_helper).not_to include('def model_display_name(model)')
+        expect(messages_helper).not_to include('def provider_display_name(model_or_provider)')
+        expect(messages_helper).not_to include('def parse_tool_payload(content)')
+        expect(messages_helper).not_to include('def llm_model_info(model)')
+      end
+    end
+
+    it 'creates view files with default paths' do
+      within_test_app(app_path) do
+        # Chat views
+        expect(File.exist?('app/views/chats/index.html.erb')).to be true
+        expect(File.exist?('app/views/chats/new.html.erb')).to be true
+        expect(File.exist?('app/views/chats/show.html.erb')).to be true
+        expect(File.exist?('app/views/chats/_chat.html.erb')).to be true
+        expect(File.exist?('app/views/chats/_form.html.erb')).to be true
+
+        # Message views
+        expect(File.exist?('app/views/messages/_assistant.html.erb')).to be true
+        expect(File.exist?('app/views/messages/_user.html.erb')).to be true
+        expect(File.exist?('app/views/messages/_system.html.erb')).to be true
+        expect(File.exist?('app/views/messages/_tool.html.erb')).to be true
+        expect(File.exist?('app/views/messages/_error.html.erb')).to be true
+        expect(File.exist?('app/views/messages/_content.html.erb')).to be true
+        expect(File.exist?('app/views/messages/_tool_calls.html.erb')).to be true
+        expect(File.exist?('app/views/messages/tool_calls/_default.html.erb')).to be true
+        expect(File.exist?('app/views/messages/tool_results/_default.html.erb')).to be true
+        expect(File.exist?('app/views/messages/create.turbo_stream.erb')).to be true
+        expect(File.exist?('app/views/messages/_form.html.erb')).to be true
+
+        user_partial = File.read('app/views/messages/_user.html.erb')
+        expect(user_partial).to include('user.content')
+        expect(user_partial).to include('local_assigns[:message]')
+        assistant_partial = File.read('app/views/messages/_assistant.html.erb')
+        expect(assistant_partial).to include('assistant.content')
+        expect(assistant_partial).to include('local_assigns[:message]')
+        system_partial = File.read('app/views/messages/_system.html.erb')
+        expect(system_partial).to include('system.content')
+        expect(system_partial).to include('local_assigns[:message]')
+        tool_partial = File.read('app/views/messages/_tool.html.erb')
+        expect(tool_partial).to include('render tool_result_partial(tool), tool: tool')
+        tool_calls_partial = File.read('app/views/messages/_tool_calls.html.erb')
+        expect(tool_calls_partial).to include('tool_calls: tool_calls, tool_call: tool_call')
+        expect(tool_calls_partial).to include('local_assigns[:message]')
+        tool_results_default = File.read('app/views/messages/tool_results/_default.html.erb')
+        expect(tool_results_default).to include('tool.tool_error_message')
+        chat_form = File.read('app/views/chats/_form.html.erb')
+        expect(chat_form).to include('@chat_models.map')
+        expect(chat_form).to include('[model.label, model.id]')
+        expect(chat_form).to include('default_model_display_name')
+        create_stream = File.read('app/views/messages/create.turbo_stream.erb')
+        expect(create_stream).to include('turbo_stream.replace "new_message"')
+        expect(create_stream).to include('render "messages/form"')
+
+        # Model views
+        expect(File.exist?('app/views/models/index.html.erb')).to be true
+        expect(File.exist?('app/views/models/show.html.erb')).to be true
+        expect(File.exist?('app/views/models/_model.html.erb')).to be true
+        models_index = File.read('app/views/models/index.html.erb')
+        expect(models_index).to include('@models.each do |model_info|')
+        expect(models_index).to include('render "models/model",')
+      end
+    end
+
+    it 'uses scaffold-style inline styles by default' do
+      within_test_app(app_path) do
+        index_view = File.read('app/views/chats/index.html.erb')
+        expect(index_view).to include('<p style="color: green">')
+        expect(index_view).not_to include('text-green-700')
+      end
+    end
+
+    it 'creates job file with default name' do
+      within_test_app(app_path) do
+        expect(File.exist?('app/jobs/chat_response_job.rb')).to be true
+      end
+    end
+
+    it 'adds routes for default controllers' do
+      within_test_app(app_path) do
+        routes_content = File.read('config/routes.rb')
+        expect(routes_content).to include('resources :chats')
+        expect(routes_content).to include('resources :messages, only: [ :create ]')
+        expect(routes_content).to include('resources :models, only: [ :index, :show ]')
+      end
+    end
+
+    it 'adds broadcasting to message model' do
+      within_test_app(app_path) do
+        message_content = File.read('app/models/message.rb')
+
+        # Check the acts_as_message declaration
+        expect(message_content).to include('acts_as_message')
+
+        # Check broadcasting setup
+        expect(message_content).to include(%q(broadcasts_to ->(message) { "chat_#{message.chat_id}" }))
+        expect(message_content).to include('inserts_by: :append')
+
+        # Check broadcast_append_chunk method
+        expect(message_content).to include('def broadcast_append_chunk(content)')
+        expect(message_content).to include(%q(broadcast_append_to "chat_#{chat_id}"))
+        expect(message_content).to include(%q(target: "message_#{id}_content"))
+        expect(message_content).to include('content: ERB::Util.html_escape(content.to_s)')
+      end
+    end
+
+    it 'controllers reference correct model classes' do
+      within_test_app(app_path) do
+        chats_controller = File.read('app/controllers/chats_controller.rb')
+        expect(chats_controller).to include('class ChatsController')
+        expect(chats_controller).to include('Chat.find')
+        expect(chats_controller).to include('@chat = Chat.new')
+        expect(chats_controller).to include('@chat_models = available_chat_models')
+        expect(chats_controller).to include('prompt = params.dig(:chat, :prompt)')
+        expect(chats_controller).to include('if prompt.present?')
+        expect(chats_controller).to include('@chat = Chat.create!(model: params.dig(:chat, :model).presence)')
+        expect(chats_controller).not_to include('def model')
+        expect(chats_controller).not_to include('def prompt')
+
+        messages_controller = File.read('app/controllers/messages_controller.rb')
+        expect(messages_controller).to include('class MessagesController')
+        expect(messages_controller).to include('@chat = Chat.find(params[:chat_id])')
+        expect(messages_controller).to include('content = params.dig(:message, :content)')
+        expect(messages_controller).to include('if content.present?')
+        expect(messages_controller).to include('ChatResponseJob.perform_later')
+        expect(messages_controller).to include('format.turbo_stream')
+        expect(messages_controller).not_to include('def content')
+
+        models_controller = File.read('app/controllers/models_controller.rb')
+        expect(models_controller).to include('class ModelsController')
+        expect(models_controller).to include('@models = available_chat_models')
+
+        application_controller = File.read('app/controllers/application_controller.rb')
+        expect(application_controller).to include('def available_chat_models')
+        expect(application_controller).to include('sort_by { |model| [ model.provider.to_s, model.name.to_s ] }')
+      end
+    end
+
+    it 'job references correct model classes' do
+      within_test_app(app_path) do
+        job_content = File.read('app/jobs/chat_response_job.rb')
+        expect(job_content).to include('class ChatResponseJob')
+        expect(job_content).to include('chat = Chat.find(chat_id)')
+        expect(job_content).to include('chat.ask(content)')
+        expect(job_content).to include('message = chat.messages.last')
+      end
+    end
+
+    it 'chat functionality works correctly' do
+      within_test_app(app_path) do
+        test_script = <<~RUBY
+          ActiveJob::Base.queue_adapter = :inline
+          chat = Chat.create!
+          message = chat.messages.create!(role: :user, content: 'Test')
+          exit(message.chat_id == chat.id ? 0 : 1)
+        RUBY
+        success, output = run_rails_runner(test_script)
+        expect(success).to be(true), output
+      end
+    end
   end
 
   def expect_generated_view_set(
@@ -133,148 +310,122 @@ def expect_chat_script_to_succeed(script)
     expect(success).to be(true), output
   end
 
-  {
-    'with default model names' => {
-      app_name: 'test_app_default',
-      template_name: 'default_models_template.rb',
-      controller_example: 'creates controller files with default names',
-      controller_paths: %w[
-        app/controllers/chats_controller.rb
-        app/controllers/messages_controller.rb
-        app/controllers/models_controller.rb
-      ],
-      helper_path: 'app/helpers/messages_helper.rb',
-      view_example: 'creates view files with default paths',
-      view_options: {
-        base_path: 'app/views',
-        chats_target: 'new_message',
-        form_partial_path: 'messages/form',
-        model_index_collection: 'models',
-        model_partial_path: 'models/model'
-      },
-      job_file_example: 'creates job file with default name',
-      job_file_path: 'app/jobs/chat_response_job.rb',
-      routes_example: 'adds routes for default controllers',
-      namespaced_routes: false,
-      broadcasting_example: 'adds broadcasting to message model',
-      broadcasting_options: {
-        path: 'app/models/message.rb',
-        acts_as_message_lines: ['acts_as_message'],
-        broadcasts_to_line: "broadcasts_to ->(message) { \"chat_\#{message.chat_id}\" }",
-        broadcast_target_line: "broadcast_append_to \"chat_\#{chat_id}\"",
-        content_target_line: "target: \"message_\#{id}_content\""
-      },
-      controllers_example: 'controllers reference correct model classes',
-      chats_controller_path: 'app/controllers/chats_controller.rb',
-      chats_controller_expectations: [
-        'class ChatsController',
-        'Chat.find',
-        '@chat = Chat.new',
-        '@chat_models = available_chat_models',
-        'prompt = params.dig(:chat, :prompt)',
-        'if prompt.present?',
-        '@chat = Chat.create!(model: params.dig(:chat, :model).presence)'
-      ],
-      messages_controller_path: 'app/controllers/messages_controller.rb',
-      messages_controller_expectations: [
-        'class MessagesController',
-        '@chat = Chat.find(params[:chat_id])',
-        'content = params.dig(:message, :content)',
-        'if content.present?',
-        'ChatResponseJob.perform_later',
-        'format.turbo_stream'
-      ],
-      models_controller_path: 'app/controllers/models_controller.rb',
-      models_controller_expectations: [
-        'class ModelsController',
-        '@models = available_chat_models'
-      ],
-      job_example: 'job references correct model classes',
-      job_options: {
-        path: 'app/jobs/chat_response_job.rb',
-        class_name: 'class ChatResponseJob',
-        lookup_line: 'chat = Chat.find(chat_id)',
-        ask_line: 'chat.ask(content)',
-        last_message_line: 'message = chat.messages.last'
-      },
-      functionality_example: 'chat functionality works correctly',
-      functionality_script: <<~RUBY
-        ActiveJob::Base.queue_adapter = :inline
-        chat = Chat.create!
-        message = chat.messages.create!(role: :user, content: 'Test')
-        exit(message.chat_id == chat.id ? 0 : 1)
-      RUBY
-    },
-    'with namespaced model names' => {
-      app_name: 'test_app_namespaced',
-      template_name: 'namespaced_models_template.rb',
-      controller_example: 'creates controller files with namespaced paths',
-      controller_paths: %w[
-        app/controllers/llm/chats_controller.rb
-        app/controllers/llm/messages_controller.rb
-        app/controllers/llm/models_controller.rb
-      ],
-      helper_path: 'app/helpers/llm/messages_helper.rb',
-      view_example: 'creates view files with namespaced paths',
-      view_options: {
-        base_path: 'app/views/llm',
-        chats_target: 'new_llm_message',
-        form_partial_path: 'llm/messages/form',
-        model_index_collection: 'llm_models',
-        model_partial_path: 'llm/models/model'
-      },
-      job_file_example: 'creates job file with namespaced name',
-      job_file_path: 'app/jobs/llm_chat_response_job.rb',
-      routes_example: 'adds routes for namespaced controllers',
-      namespaced_routes: true,
-      broadcasting_example: 'adds broadcasting to namespaced message model',
-      broadcasting_options: {
-        path: 'app/models/llm/message.rb',
-        acts_as_message_lines: [
-          "acts_as_message chat: :llm_chat, chat_class: 'Llm::Chat'",
-          "tool_calls: :llm_tool_calls, tool_call_class: 'Llm::ToolCall'",
-          "model: :llm_model, model_class: 'Llm::Model'"
-        ],
-        broadcasts_to_line: "broadcasts_to ->(llm_message) { \"llm_chat_\#{llm_message.llm_chat_id}\" }",
-        broadcast_target_line: "broadcast_append_to \"llm_chat_\#{llm_chat_id}\"",
-        content_target_line: "target: \"llm_message_\#{id}_content\""
-      },
-      controllers_example: 'controllers reference correct namespaced model classes',
-      chats_controller_path: 'app/controllers/llm/chats_controller.rb',
-      chats_controller_expectations: [
-        'class Llm::ChatsController',
-        'Llm::Chat.find',
-        '@llm_chat = Llm::Chat.new',
-        '@chat_models = available_chat_models',
-        'prompt = params.dig(:llm_chat, :prompt)',
-        'if prompt.present?',
-        '@llm_chat = Llm::Chat.create!(model:',
-        'params.dig(:llm_chat, :model).presence)'
-      ],
-      messages_controller_path: 'app/controllers/llm/messages_controller.rb',
-      messages_controller_expectations: [
-        'class Llm::MessagesController',
-        '@llm_chat = Llm::Chat.find(params[:chat_id])',
-        'content = params.dig(:llm_message, :content)',
-        'if content.present?',
-        'LlmChatResponseJob.perform_later',
-        'format.turbo_stream'
-      ],
-      models_controller_path: 'app/controllers/llm/models_controller.rb',
-      models_controller_expectations: [
-        'class Llm::ModelsController',
-        '@llm_models = available_chat_models'
-      ],
-      job_example: 'job references correct namespaced model classes',
-      job_options: {
-        path: 'app/jobs/llm_chat_response_job.rb',
-        class_name: 'class LlmChatResponseJob',
-        lookup_line: 'llm_chat = Llm::Chat.find(llm_chat_id)',
-        ask_line: 'llm_chat.ask(content)',
-        last_message_line: 'llm_message = llm_chat.llm_messages.last'
-      },
-      extra_view_example: 'views use correct partial paths',
-      extra_view_assertions: lambda do
+        user_partial = File.read('app/views/llm/messages/_user.html.erb')
+        expect(user_partial).to include('user.content')
+        expect(user_partial).to include('local_assigns[:message]')
+        assistant_partial = File.read('app/views/llm/messages/_assistant.html.erb')
+        expect(assistant_partial).to include('assistant.content')
+        expect(assistant_partial).to include('local_assigns[:message]')
+        system_partial = File.read('app/views/llm/messages/_system.html.erb')
+        expect(system_partial).to include('system.content')
+        expect(system_partial).to include('local_assigns[:message]')
+        tool_partial = File.read('app/views/llm/messages/_tool.html.erb')
+        expect(tool_partial).to include('render tool_result_partial(tool), tool: tool')
+        tool_calls_partial = File.read('app/views/llm/messages/_tool_calls.html.erb')
+        expect(tool_calls_partial).to include('tool_calls: tool_calls, tool_call: tool_call')
+        expect(tool_calls_partial).to include('local_assigns[:message]')
+        tool_results_default = File.read('app/views/llm/messages/tool_results/_default.html.erb')
+        expect(tool_results_default).to include('tool.tool_error_message')
+        chat_form = File.read('app/views/llm/chats/_form.html.erb')
+        expect(chat_form).to include('@chat_models.map')
+        expect(chat_form).to include('[model.label, model.id]')
+        expect(chat_form).to include('default_model_display_name')
+        create_stream = File.read('app/views/llm/messages/create.turbo_stream.erb')
+        expect(create_stream).to include('turbo_stream.replace "new_llm_message"')
+        expect(create_stream).to include('render "llm/messages/form"')
+
+        # Model views
+        expect(File.exist?('app/views/llm/models/index.html.erb')).to be true
+        expect(File.exist?('app/views/llm/models/show.html.erb')).to be true
+        expect(File.exist?('app/views/llm/models/_model.html.erb')).to be true
+        models_index = File.read('app/views/llm/models/index.html.erb')
+        expect(models_index).to include('@llm_models.each do |model_info|')
+        expect(models_index).to include('render "llm/models/model",')
+      end
+    end
+
+    it 'creates job file with namespaced name' do
+      within_test_app(app_path) do
+        expect(File.exist?('app/jobs/llm_chat_response_job.rb')).to be true
+      end
+    end
+
+    it 'adds routes for namespaced controllers' do
+      within_test_app(app_path) do
+        routes_content = File.read('config/routes.rb')
+        expect(routes_content).to include('namespace :llm')
+        expect(routes_content).to include('resources :chats')
+        expect(routes_content).to include('resources :messages, only: [ :create ]')
+        expect(routes_content).to include('resources :models, only: [ :index, :show ]')
+      end
+    end
+
+    it 'adds broadcasting to namespaced message model' do
+      within_test_app(app_path) do
+        message_content = File.read('app/models/llm/message.rb')
+
+        # Check the acts_as_message declaration
+        expect(message_content).to include("acts_as_message chat: :llm_chat, chat_class: 'Llm::Chat'")
+        expect(message_content).to include("tool_calls: :llm_tool_calls, tool_call_class: 'Llm::ToolCall'")
+        expect(message_content).to include("model: :llm_model, model_class: 'Llm::Model'")
+
+        # Check broadcasting setup
+        expect(message_content).to include(%q(broadcasts_to ->(llm_message) { "llm_chat_#{llm_message.llm_chat_id}" }))
+        expect(message_content).to include('inserts_by: :append')
+
+        # Check broadcast_append_chunk method
+        expect(message_content).to include('def broadcast_append_chunk(content)')
+        expect(message_content).to include(%q(broadcast_append_to "llm_chat_#{llm_chat_id}"))
+        expect(message_content).to include(%q(target: "llm_message_#{id}_content"))
+        expect(message_content).to include('content: ERB::Util.html_escape(content.to_s)')
+      end
+    end
+
+    it 'controllers reference correct namespaced model classes' do
+      within_test_app(app_path) do
+        chats_controller = File.read('app/controllers/llm/chats_controller.rb')
+        expect(chats_controller).to include('class Llm::ChatsController')
+        expect(chats_controller).to include('Llm::Chat.find')
+        expect(chats_controller).to include('@llm_chat = Llm::Chat.new')
+        expect(chats_controller).to include('@chat_models = available_chat_models')
+        expect(chats_controller).to include('prompt = params.dig(:llm_chat, :prompt)')
+        expect(chats_controller).to include('if prompt.present?')
+        expect(chats_controller).to include('@llm_chat = Llm::Chat.create!(model:')
+        expect(chats_controller).to include('params.dig(:llm_chat, :model).presence)')
+        expect(chats_controller).not_to include('def model')
+        expect(chats_controller).not_to include('def prompt')
+
+        messages_controller = File.read('app/controllers/llm/messages_controller.rb')
+        expect(messages_controller).to include('class Llm::MessagesController')
+        expect(messages_controller).to include('@llm_chat = Llm::Chat.find(params[:chat_id])')
+        expect(messages_controller).to include('content = params.dig(:llm_message, :content)')
+        expect(messages_controller).to include('if content.present?')
+        expect(messages_controller).to include('LlmChatResponseJob.perform_later')
+        expect(messages_controller).to include('format.turbo_stream')
+        expect(messages_controller).not_to include('def content')
+
+        models_controller = File.read('app/controllers/llm/models_controller.rb')
+        expect(models_controller).to include('class Llm::ModelsController')
+        expect(models_controller).to include('@llm_models = available_chat_models')
+
+        application_controller = File.read('app/controllers/application_controller.rb')
+        expect(application_controller).to include('def available_chat_models')
+        expect(application_controller).to include('sort_by { |model| [ model.provider.to_s, model.name.to_s ] }')
+      end
+    end
+
+    it 'job references correct namespaced model classes' do
+      within_test_app(app_path) do
+        job_content = File.read('app/jobs/llm_chat_response_job.rb')
+        expect(job_content).to include('class LlmChatResponseJob')
+        expect(job_content).to include('llm_chat = Llm::Chat.find(llm_chat_id)')
+        expect(job_content).to include('llm_chat.ask(content)')
+        expect(job_content).to include('llm_message = llm_chat.llm_messages.last')
+      end
+    end
+
+    it 'views use correct partial paths' do
+      within_test_app(app_path) do
         show_view = File.read('app/views/llm/chats/show.html.erb')
         expect(show_view).to include('render')
         expect(show_view).to include('render "llm/messages/form"')
diff --git a/spec/ruby_llm/message_spec.rb b/spec/ruby_llm/message_spec.rb
index a787fd0e7..2672f911f 100644
--- a/spec/ruby_llm/message_spec.rb
+++ b/spec/ruby_llm/message_spec.rb
@@ -3,6 +3,30 @@
 require 'spec_helper'
 
 RSpec.describe RubyLLM::Message do
+  describe '#cache_point?' do
+    it 'returns false by default' do
+      message = described_class.new(role: :user, content: 'hello')
+      expect(message.cache_point?).to be false
+    end
+
+    it 'returns true when constructed with cache_point: true' do
+      message = described_class.new(role: :user, content: 'hello', cache_point: true)
+      expect(message.cache_point?).to be true
+    end
+  end
+
+  describe '#to_h' do
+    it 'omits cache_point key when false' do
+      message = described_class.new(role: :user, content: 'hello')
+      expect(message.to_h).not_to have_key(:cache_point)
+    end
+
+    it 'includes cache_point: true when set' do
+      message = described_class.new(role: :user, content: 'hello', cache_point: true)
+      expect(message.to_h[:cache_point]).to be true
+    end
+  end
+
   describe '#content' do
     it 'normalizes nil content to empty string for assistant tool-call messages' do
       tool_call = RubyLLM::ToolCall.new(id: 'call_1', name: 'weather', arguments: {})
diff --git a/spec/ruby_llm/providers/anthropic/chat_cache_control_spec.rb b/spec/ruby_llm/providers/anthropic/chat_cache_control_spec.rb
new file mode 100644
index 000000000..1fa15c8f5
--- /dev/null
+++ b/spec/ruby_llm/providers/anthropic/chat_cache_control_spec.rb
@@ -0,0 +1,70 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+RSpec.describe RubyLLM::Providers::Anthropic::Chat do
+  let(:model) { instance_double(RubyLLM::Model::Info, id: 'claude-sonnet-4-5', max_tokens: nil) }
+
+  def render(messages)
+    described_class.render_payload(
+      messages,
+      tools: {},
+      temperature: nil,
+      model: model,
+      stream: false,
+      schema: nil
+    )
+  end
+
+  describe 'cache_control injection' do
+    context 'with a system message where cache_point is true' do
+      it 'adds cache_control to the last system block' do
+        msg = RubyLLM::Message.new(role: :system, content: 'You are helpful.', cache_point: true)
+        payload = render([msg, RubyLLM::Message.new(role: :user, content: 'Hi')])
+
+        last_block = payload[:system].last
+        expect(last_block[:cache_control]).to eq(type: 'ephemeral')
+      end
+
+      it 'does not add cache_control when cache_point is false' do
+        msg = RubyLLM::Message.new(role: :system, content: 'You are helpful.')
+        payload = render([msg, RubyLLM::Message.new(role: :user, content: 'Hi')])
+
+        payload[:system].each do |block|
+          expect(block).not_to have_key(:cache_control)
+        end
+      end
+    end
+
+    context 'with a user message where cache_point is true' do
+      it 'adds cache_control to the last content block' do
+        msg = RubyLLM::Message.new(role: :user, content: 'Tell me a story.', cache_point: true)
+        payload = render([msg])
+
+        last_block = payload[:messages].first[:content].last
+        expect(last_block[:cache_control]).to eq(type: 'ephemeral')
+      end
+
+      it 'does not add cache_control when cache_point is false' do
+        msg = RubyLLM::Message.new(role: :user, content: 'Tell me a story.')
+        payload = render([msg])
+
+        payload[:messages].first[:content].each do |block|
+          expect(block).not_to have_key(:cache_control)
+        end
+      end
+    end
+
+    context 'when a Content::Raw block already contains cache_control' do
+      it 'does not duplicate when cache_control' do
+        raw = RubyLLM::Providers::Anthropic::Content.new('Cached system', cache: true)
+        msg = RubyLLM::Message.new(role: :system, content: raw, cache_point: true)
+        payload = render([msg, RubyLLM::Message.new(role: :user, content: 'Hi')])
+
+        blocks_with_cache = payload[:system].select { |b| b[:cache_control] }
+        expect(blocks_with_cache.length).to eq(1)
+        expect(blocks_with_cache.first[:cache_control]).to eq(type: 'ephemeral')
+      end
+    end
+  end
+end

From 4db646c390d4c059dbb400cc9aad561d807da1ad Mon Sep 17 00:00:00 2001
From: arun kumar <arunkumar.ry1@gmail.com>
Date: Thu, 2 Apr 2026 13:27:55 +0530
Subject: [PATCH 2/7] # This is a combination of 2 commits. # This is the 1st
 commit message:

Add prompt caching support for Anthropic

# The commit message #2 will be skipped:

# Add prompt caching support for Anthropic
---
 .kiro/specs/prompt-caching/.config.kiro       |   1 +
 .kiro/specs/prompt-caching/design.md          | 433 ++++++++++++++++++
 .kiro/specs/prompt-caching/requirements.md    |  83 ++++
 .kiro/specs/prompt-caching/tasks.md           | 193 ++++++++
 .overcommit.yml                               |   4 +-
 lib/ruby_llm/error.rb                         |  23 +-
 lib/ruby_llm/providers/anthropic.rb           |   8 +-
 .../generators/chat_ui_generator_spec.rb      |  12 +-
 8 files changed, 736 insertions(+), 21 deletions(-)
 create mode 100644 .kiro/specs/prompt-caching/.config.kiro
 create mode 100644 .kiro/specs/prompt-caching/design.md
 create mode 100644 .kiro/specs/prompt-caching/requirements.md
 create mode 100644 .kiro/specs/prompt-caching/tasks.md

diff --git a/.kiro/specs/prompt-caching/.config.kiro b/.kiro/specs/prompt-caching/.config.kiro
new file mode 100644
index 000000000..f34e7c838
--- /dev/null
+++ b/.kiro/specs/prompt-caching/.config.kiro
@@ -0,0 +1 @@
+{"specId": "505e697b-56d3-480c-b01e-6c6289bc9f40", "workflowType": "requirements-first", "specType": "feature"}
diff --git a/.kiro/specs/prompt-caching/design.md b/.kiro/specs/prompt-caching/design.md
new file mode 100644
index 000000000..69f8a3011
--- /dev/null
+++ b/.kiro/specs/prompt-caching/design.md
@@ -0,0 +1,433 @@
+# Design Document: Prompt Caching
+
+## Overview
+
+This feature adds prompt caching support to ruby_llm for Anthropic and Gemini providers. The goal is a minimal, ergonomic API that lets developers mark static portions of their prompts as cache points, reducing input token costs on repeated calls.
+
+The two providers implement caching very differently:
+
+- **Anthropic**: Cache points are expressed as `cache_control: { type: 'ephemeral' }` on the last content block of a message. No separate API call is needed — the provider handles caching transparently.
+- **Gemini**: Cache points trigger the Context Caching API, where static content is uploaded as a `cachedContent` resource and referenced by name in subsequent `generateContent` requests.
+
+Both approaches are unified behind the same Ruby API:
+
+```ruby
+# Anthropic
+chat = RubyLLM.chat(model: 'claude-3-5-sonnet')
+  .with_instructions(static_prefix, cache_point: true)
+  .with_instructions(session_config, append: true, cache_point: true)
+
+# Gemini
+chat = RubyLLM.chat(model: 'gemini-1.5-pro')
+  .with_instructions(large_system_prompt, cache_point: true)
+
+chat.ask(user_message)
+```
+
+Providers that don't support caching silently ignore the `cache_point` flag.
+
+---
+
+## Architecture
+
+```mermaid
+flowchart TD
+    User["Developer"] -->|"with_instructions(..., cache_point: true)"| Chat
+    Chat -->|"Message(cache_point: true)"| Messages["messages[]"]
+    Messages --> Provider["Provider#complete"]
+    Provider --> Anthropic["Anthropic::Chat\n#render_payload"]
+    Provider --> Gemini["Gemini::Chat\n#render_payload"]
+    Provider --> Other["Other Providers\n(ignore cache_point)"]
+
+    Anthropic -->|"inject cache_control on last block"| AnthropicAPI["Anthropic API"]
+    Gemini -->|"create/reuse cachedContent"| GeminiCacheAPI["Gemini Context Cache API"]
+    Gemini -->|"cachedContent: name"| GeminiAPI["Gemini generateContent API"]
+
+    AnthropicAPI -->|"cache_read_input_tokens\ncache_creation_input_tokens"| TokenParsing["Token Parsing"]
+    GeminiAPI -->|"cachedContentTokenCount"| TokenParsing
+    TokenParsing --> MessageTokens["Message#cached_tokens\nMessage#cache_creation_tokens"]
+```
+
+The change surface is intentionally small:
+
+1. `Message` gains a `cache_point` boolean attribute
+2. `Chat#with_instructions` and `Chat#ask` accept a `cache_point:` keyword
+3. `Anthropic::Chat` injects `cache_control` during payload formatting
+4. `Gemini::Chat` manages `cachedContent` lifecycle and payload construction
+5. Token parsing is already implemented — just needs the `Message` attribute wiring
+
+---
+
+## Components and Interfaces
+
+### Message
+
+Add `cache_point` as a boolean attribute with a predicate method:
+
+```ruby
+attr_reader :cache_point
+alias cache_point? cache_point
+
+def initialize(options = {})
+  # existing init...
+  @cache_point = options.fetch(:cache_point, false)
+end
+```
+
+`Message#to_h` should include `cache_point: true` only when set, to avoid polluting serialized output.
+
+### Chat
+
+Extend `with_instructions` to accept `cache_point:`:
+
+```ruby
+def with_instructions(instructions, append: false, replace: nil, cache_point: false)
+  # existing append/replace logic...
+  # pass cache_point: cache_point when constructing the Message
+end
+```
+
+Extend `ask` to accept `cache_point:` for marking the user message:
+
+```ruby
+def ask(message = nil, with: nil, cache_point: false, &)
+  add_message role: :user, content: build_content(message, with), cache_point: cache_point
+  complete(&)
+end
+```
+
+The internal `append_system_instruction` and `replace_system_instruction` helpers need to forward `cache_point` when constructing `Message` objects.
+
+### Anthropic::Chat
+
+In `build_system_content`, after building the content blocks for a system message, check `msg.cache_point?` and inject `cache_control` on the last block:
+
+```ruby
+def build_system_content(system_messages)
+  system_messages.flat_map do |msg|
+    blocks = # ... existing formatting ...
+    inject_cache_control(blocks) if msg.cache_point?
+    blocks
+  end
+end
+```
+
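+In `format_message` / `format_basic_message_with_thinking`, after building `content_blocks`, inject on the last block if `msg.cache_point?`. Both the system-message path above and these message formatters use the same `inject_cache_control` helper: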
+
+```ruby
+def inject_cache_control(blocks)
+  return blocks if blocks.empty?
+  last = blocks.last
+  # Don't duplicate if already present (e.g. Content::Raw with cache_control)
+  return blocks if last.is_a?(Hash) && last[:cache_control]
+  blocks[-1] = last.merge(cache_control: { type: 'ephemeral' })
+  blocks
+end
+```
+
+The Anthropic API supports up to 4 cache breakpoints per request. Since the formatter processes messages in order and injects on each `cache_point?` message, the caller is responsible for not exceeding 4. The formatter does not enforce this limit — if it is exceeded, the provider's own error response is surfaced to the caller unchanged.
+
+### Gemini::Chat
+
+Gemini caching is more involved. The `Chat` object needs to store the `cachedContent` name between calls:
+
+```ruby
+# In Chat#initialize
+@cached_content_name = nil  # Gemini session cache handle
+```
+
+The `render_payload` method gains awareness of caching:
+
+```ruby
+def render_payload(messages, tools:, temperature:, model:, stream: false,
+                   schema: nil, thinking: nil, tool_prefs: nil,
+                   cached_content_name: nil)
+  # If cached_content_name is set, split messages and use cachedContent field
+  # Otherwise, format all messages inline as today
+end
+```
+
+Because `render_payload` is a module function called by the provider infrastructure, the `cached_content_name` is passed in from the `Chat` object via the provider's `complete` method. The `Chat` object stores it as `@cached_content_name`.
+
+**Cache lifecycle in Gemini::Chat:**
+
+```
+complete() called
+  └─ has cache_point messages AND model supports caching?
+       ├─ YES: @cached_content_name present?
+       │         ├─ YES: use it in payload
+       │         │         └─ API returns 404? → recreate cache, retry
+       │         └─ NO: create cachedContent via Context Cache API
+       │                  └─ store name in @cached_content_name
+       └─ NO: send full inline payload (log warning if cache_point present but unsupported)
+```
+
+**Context Cache API call** (POST `v1beta/cachedContents`):
+
+```json
+{
+  "model": "models/gemini-1.5-pro",
+  "contents": [ /* static messages up to last cache_point */ ],
+  "ttl": "3600s"
+}
+```
+
+Response includes `name` (e.g. `cachedContents/abc123`), stored on `@cached_content_name`.
+
+**generateContent payload with cache:**
+
+```json
+{
+  "cachedContent": "cachedContents/abc123",
+  "contents": [ /* dynamic messages after last cache_point */ ],
+  "generationConfig": {}
+}
+```
+
+The TTL is configurable via `RubyLLM.config` or per-chat:
+
+```ruby
+chat.with_params(cache_ttl: 7200)  # override TTL in seconds
+```
+
+Default TTL: `3600` seconds.
+
+### Provider Capability Detection
+
+`Anthropic::Capabilities` gains `supports_prompt_caching?`:
+
+```ruby
+def supports_prompt_caching?(model_id)
+  !model_id.match?(/claude-[12]/)
+end
+```
+
+`Gemini::Capabilities` already has `supports_caching?` — this is reused as `supports_prompt_caching?` (alias or rename).
+
+Non-supporting providers (OpenAI, etc.) simply don't implement cache point injection in their formatters, so `cache_point` flags are silently ignored.
+
+---
+
+## Data Models
+
+### Message (updated)
+
+| Attribute | Type | Description |
+|---|---|---|
+| `role` | Symbol | `:system`, `:user`, `:assistant`, `:tool` |
+| `content` | String / Content / Content::Raw | Message body |
+| `cache_point` | Boolean | Whether this message is a cache breakpoint (default: `false`) |
+| `tokens` | Tokens | Token usage including `cached` and `cache_creation` |
+| ... | ... | existing attributes unchanged |
+
+### Tokens (unchanged)
+
+Already has `cached` and `cache_creation` attributes. No changes needed.
+
+### Chat (updated)
+
+| Attribute | Type | Description |
+|---|---|---|
+| `@cached_content_name` | String / nil | Gemini `cachedContent` resource name for session reuse |
+| ... | ... | existing attributes unchanged |
+
+### Gemini cachedContent resource
+
+Created via `POST v1beta/cachedContents`. Key fields:
+
+| Field | Type | Description |
+|---|---|---|
+| `name` | String | Resource name, e.g. `cachedContents/abc123` |
+| `model` | String | Model ID, e.g. `models/gemini-1.5-pro` |
+| `contents` | Array | Static message content blocks |
+| `ttl` | String | Duration string, e.g. `"3600s"` |
+| `expireTime` | String | ISO8601 timestamp (returned by API) |
+
+---
+
+## Correctness Properties
+
+*A property is a characteristic or behavior that should hold true across all valid executions of a system — essentially, a formal statement about what the system should do. Properties serve as the bridge between human-readable specifications and machine-verifiable correctness guarantees.*
+
+### Property 1: cache_point? reflects construction flag
+
+*For any* message created with `cache_point: true`, `cache_point?` returns `true`; for any message created without the flag (or with `cache_point: false`), `cache_point?` returns `false`.
+
+**Validates: Requirements 1.2, 1.3, 1.4**
+
+### Property 2: Non-cached payloads are unchanged (invariant)
+
+*For any* list of messages where no message has `cache_point: true`, the provider payload produced by any formatter (Anthropic, Gemini, or other) SHALL be identical to the payload produced before this feature was introduced — no `cache_control`, no `cachedContent` field, no structural changes.
+
+**Validates: Requirements 1.5, 2.5, 5.1**
+
+### Property 3: Anthropic cache_control injection
+
+*For any* message with `cache_point: true` formatted by `Anthropic::Chat`, the last content block in the formatted output SHALL contain `cache_control: { type: 'ephemeral' }`, and no other block in that message SHALL have `cache_control` added by the formatter.
+
+**Validates: Requirements 2.1, 2.2**
+
+### Property 4: Anthropic multiple cache points
+
+*For any* list of messages in which exactly N of them (N ≤ 4) have `cache_point: true`, the Anthropic-formatted payload SHALL contain exactly N content blocks with `cache_control: { type: 'ephemeral' }`, each being the last block of its respective cache-pointed message.
+
+**Validates: Requirements 2.3**
+
+### Property 5: Gemini static prefix identification
+
+*For any* list of messages where at least one has `cache_point: true`, the static prefix identified by `Gemini::Chat` SHALL be all messages from the start up to and including the last message with `cache_point: true`, and the dynamic suffix SHALL be all remaining messages.
+
+**Validates: Requirements 3.1**
+
+### Property 6: Gemini cached payload uses cachedContent field
+
+*For any* `Chat` with a non-nil `@cached_content_name`, the `generateContent` payload SHALL include `cachedContent: @cached_content_name` and SHALL NOT include the static prefix messages inline in `contents`.
+
+**Validates: Requirements 3.3**
+
+### Property 7: Gemini TTL configuration
+
+*For any* configured TTL value T, the `cachedContent` creation request SHALL include `ttl: "#{T}s"`. When no TTL is configured, the default SHALL be `"3600s"`.
+
+**Validates: Requirements 3.5**
+
+### Property 8: Gemini unsupported model degrades gracefully
+
+*For any* Gemini model where `supports_caching?` returns `false`, the `render_payload` output SHALL NOT contain a `cachedContent` field, regardless of whether any messages have `cache_point: true`.
+
+**Validates: Requirements 3.6, 5.2**
+
+### Property 9: Cache token parsing round-trip
+
+*For any* provider response containing cache token counts (Anthropic: `cache_read_input_tokens`, `cache_creation_input_tokens`; Gemini: `cachedContentTokenCount`), the parsed `Message` SHALL expose those exact values via `cached_tokens` and `cache_creation_tokens` respectively. When those fields are absent from the response, both SHALL be `nil` (not zero).
+
+**Validates: Requirements 4.1, 4.2, 4.3, 4.4, 4.5**
+
+### Property 10: Capability detection correctness
+
+*For any* model ID, `supports_prompt_caching?` (Anthropic) and `supports_caching?` (Gemini) SHALL return `true` only for models that actually support the respective caching mechanism, and `false` for all others.
+
+**Validates: Requirements 5.3**
+
+---
+
+## Error Handling
+
+**Anthropic exceeding 4 cache breakpoints**: The Anthropic API returns a 400 error if more than 4 `cache_control` blocks are present. ruby_llm does not enforce this limit client-side — the API error propagates as a normal `RubyLLM::Error`. Developers are responsible for staying within the limit.
+
+**Gemini cachedContent expiry (404)**: When a `generateContent` request references an expired or deleted `cachedContent`, the Gemini API returns a 404. The provider catches this, clears `@cached_content_name`, recreates the cache, and retries once. If the retry also fails, the error propagates normally.
+
+**Gemini cache creation failure**: If the `POST v1beta/cachedContents` call fails (e.g., content too short — Gemini requires a minimum token count), the error propagates as a `RubyLLM::Error`. The `@cached_content_name` remains nil.
+
+**Unsupported provider**: Any provider not implementing cache point injection simply ignores `cache_point` flags. No error, no warning.
+
+**Unsupported Gemini model**: The provider logs a `warn`-level message and falls back to the full inline payload. No error is raised.
+
+**Gemini minimum token requirement**: Gemini's Context Caching API requires the cached content to be at least 32,768 tokens (for most models). If the static prefix is too short, the API returns an error. This is surfaced as a `RubyLLM::Error` with the provider's message.
+
+---
+
+## Testing Strategy
+
+### Unit Tests
+
+Focus on specific examples, edge cases, and error conditions:
+
+- `Message` with and without `cache_point:` flag
+- `Anthropic::Chat#render_payload` with a single cache-pointed system message
+- `Anthropic::Chat#render_payload` with multiple cache-pointed messages
+- `Anthropic::Chat#render_payload` with `Content::Raw` already containing `cache_control` (no duplication)
+- `Gemini::Chat` static prefix extraction logic
+- `Gemini::Chat#render_payload` with `cached_content_name` set
+- `Gemini::Chat#render_payload` with unsupported model (no `cachedContent` field, warning logged)
+- Token parsing: Anthropic response with `cache_read_input_tokens` and `cache_creation_input_tokens`
+- Token parsing: Gemini response with `cachedContentTokenCount`
+- Token parsing: response without cache fields → `nil` values
+
+### Property-Based Tests
+
+Use [rantly](https://github.com/rantly-rb/rantly) or [prop_check](https://github.com/Qqwy/ruby-prop_check) for property-based testing. Each property test runs a minimum of 100 iterations.
+
+**Property 1 — cache_point? reflects construction flag**
+```
+# Feature: prompt-caching, Property 1: cache_point? reflects construction flag
+property: for all (role, content, flag) → Message.new(role:, content:, cache_point: flag).cache_point? == flag
+```
+
+**Property 2 — Non-cached payloads are unchanged**
+```
+# Feature: prompt-caching, Property 2: Non-cached payloads are unchanged (invariant)
+property: for all message lists with no cache_point messages →
+  Anthropic::Chat.render_payload(messages, ...) == render_payload_without_feature(messages, ...)
+```
+
+**Property 3 — Anthropic cache_control injection**
+```
+# Feature: prompt-caching, Property 3: Anthropic cache_control injection
+property: for all messages with cache_point: true →
+  last block of formatted message has cache_control: { type: 'ephemeral' }
+  AND no other block has cache_control added
+```
+
+**Property 4 — Anthropic multiple cache points**
+```
+# Feature: prompt-caching, Property 4: Anthropic multiple cache points
+property: for all message lists with N (1..4) cache-pointed messages →
+  count of cache_control blocks in payload == N
+```
+
+**Property 5 — Gemini static prefix identification**
+```
+# Feature: prompt-caching, Property 5: Gemini static prefix identification
+property: for all message lists with at least one cache_point message →
+  static_prefix == messages[0..last_cache_point_index]
+  dynamic_suffix == messages[(last_cache_point_index+1)..]
+```
+
+**Property 6 — Gemini cached payload uses cachedContent field**
+```
+# Feature: prompt-caching, Property 6: Gemini cached payload uses cachedContent field
+property: for all cached_content_name strings and dynamic message lists →
+  payload[:cachedContent] == cached_content_name
+  AND static messages are absent from payload[:contents]
+```
+
+**Property 7 — Gemini TTL configuration**
+```
+# Feature: prompt-caching, Property 7: Gemini TTL configuration
+property: for all positive integer TTL values T →
+  cache creation request body includes ttl: "#{T}s"
+```
+
+**Property 8 — Gemini unsupported model degrades gracefully**
+```
+# Feature: prompt-caching, Property 8: Gemini unsupported model degrades gracefully
+property: for all unsupported Gemini model IDs and any message list →
+  render_payload output does not contain :cachedContent key
+```
+
+**Property 9 — Cache token parsing round-trip**
+```
+# Feature: prompt-caching, Property 9: Cache token parsing round-trip
+property: for all non-negative integers (cached, cache_creation) →
+  parsed_message.cached_tokens == cached
+  AND parsed_message.cache_creation_tokens == cache_creation
+  AND for responses without those fields → both are nil
+```
+
+**Property 10 — Capability detection correctness**
+```
+# Feature: prompt-caching, Property 10: Capability detection correctness
+property: for all known Anthropic model IDs →
+  supports_prompt_caching?(id) == (id does not match /claude-[12]/)
+property: for all known Gemini model IDs →
+  supports_caching?(id) matches the known support matrix
+```
+
+### Integration Tests (VCR Cassettes)
+
+- Anthropic: full round-trip with `cache_point: true` on system message, verify `cache_creation_input_tokens` in response
+- Anthropic: second call to same chat, verify `cache_read_input_tokens` in response
+- Gemini: first call creates `cachedContent`, stores name on Chat
+- Gemini: second call reuses `cachedContent` name in payload
+- Gemini: expired cache (404) triggers recreation and retry
diff --git a/.kiro/specs/prompt-caching/requirements.md b/.kiro/specs/prompt-caching/requirements.md
new file mode 100644
index 000000000..ebe7b134d
--- /dev/null
+++ b/.kiro/specs/prompt-caching/requirements.md
@@ -0,0 +1,83 @@
+# Requirements Document
+
+## Introduction
+
+This feature adds prompt caching (token caching) support to the ruby_llm gem, starting with the Anthropic and Gemini providers. When sending a prompt through a RubyLLM chat, users can mark specific messages or content blocks as cache points. The provider caches the static portion of the prompt, which reduces input token costs on repeated calls; the dynamic portion is sent as usual, and the model's output is unaffected.
+
+Anthropic implements caching via a `cache_control` field on message content blocks. Gemini implements caching via its Context Caching API, where static content is uploaded separately and referenced by name in subsequent requests.
+
+## Glossary
+
+- **Cache_Point**: A marker applied to a message or content block indicating that the content up to and including that point should be cached by the provider.
+- **Cache_Control**: The Anthropic-specific JSON field (`{ type: 'ephemeral' }`) added to a content block to signal a cache breakpoint.
+- **Cached_Content**: The Gemini-specific resource created via the Context Caching API that stores static prompt content for reuse.
+- **Static_Content**: The portion of a prompt that remains constant across multiple requests and is eligible for caching.
+- **Dynamic_Content**: The portion of a prompt that changes between requests and is not cached.
+- **Provider**: An LLM API integration within ruby_llm (e.g., Anthropic, Gemini).
+- **Chat**: The `RubyLLM::Chat` object representing a conversation with an LLM.
+- **Message**: A single turn in a `Chat` conversation, with a role (system, user, assistant, tool) and content.
+- **Content_Block**: A discrete unit of content within a message (text, image, document, etc.) as formatted for a specific provider's API.
+- **Token_Usage**: The `RubyLLM::Tokens` object tracking input, output, cached, and cache_creation token counts for a response.
+
+---
+
+## Requirements
+
+### Requirement 1: Cache Point API on Messages
+
+**User Story:** As a developer, I want to mark a message as a cache point, so that the provider caches the prompt up to that message and I save on input tokens for repeated calls.
+
+#### Acceptance Criteria
+
+1. THE `Chat` SHALL provide a method to add a message with a cache point flag (e.g., `cache_point: true` on `ask` or `with_instructions`).
+2. THE `Message` SHALL store a boolean `cache_point` attribute indicating whether the message is marked as a cache breakpoint.
+3. WHEN a `Message` is created with `cache_point: true`, THE `Message` SHALL expose `cache_point?` returning `true`.
+4. WHEN a `Message` is created without a cache point flag, THE `Message` SHALL expose `cache_point?` returning `false`.
+5. FOR ALL messages without a cache point flag, THE `Chat` SHALL produce provider payloads identical to the current behavior (invariant: caching feature must not alter non-cached request payloads).
+
+### Requirement 2: Anthropic Cache Control Formatting
+
+**User Story:** As a developer using Anthropic models, I want cache points to be translated into `cache_control` fields on content blocks, so that Anthropic caches the static portion of my prompt.
+
+#### Acceptance Criteria
+
+1. WHEN a `Message` with `cache_point: true` is formatted for the Anthropic provider, THE `Anthropic::Chat` formatter SHALL add `cache_control: { type: 'ephemeral' }` to the last content block of that message.
+2. WHEN a system `Message` with `cache_point: true` is formatted for the Anthropic provider, THE `Anthropic::Chat` formatter SHALL add `cache_control: { type: 'ephemeral' }` to the last block of the system content array.
+3. WHEN multiple `Message` objects have `cache_point: true`, THE `Anthropic::Chat` formatter SHALL add `cache_control` to the last content block of each such message, up to the provider limit of 4 cache breakpoints.
+4. IF a `Message` has `cache_point: true` but its content is a `Content::Raw` block already containing `cache_control`, THEN THE `Anthropic::Chat` formatter SHALL preserve the existing `cache_control` without duplication.
+5. WHEN a `Message` does not have `cache_point: true`, THE `Anthropic::Chat` formatter SHALL NOT add `cache_control` to any of its content blocks.
+
+### Requirement 3: Gemini Context Caching Integration
+
+**User Story:** As a developer using Gemini models, I want cache points to use Gemini's Context Caching API, so that large static context is uploaded once and reused across requests.
+
+#### Acceptance Criteria
+
+1. WHEN a `Chat` request is made with at least one `Message` marked `cache_point: true` and the model supports context caching, THE `Gemini::Chat` formatter SHALL identify the contiguous static prefix of messages up to and including the last cache-pointed message.
+2. WHEN static content has not yet been cached for the current session, THE `Gemini` provider SHALL create a `cachedContent` resource via the Gemini Context Caching API containing the static messages.
+3. WHEN a `cachedContent` resource exists for the current session, THE `Gemini` provider SHALL reference it by name in the `generateContent` request payload using the `cachedContent` field instead of re-sending the static messages inline.
+4. WHEN the `cachedContent` resource has expired or is invalid, THE `Gemini` provider SHALL recreate the cache and retry the request.
+5. THE `Gemini` provider SHALL set a configurable TTL (time-to-live) on created `cachedContent` resources, defaulting to 3600 seconds (1 hour).
+6. WHEN a model does not support Gemini context caching, THE `Gemini` provider SHALL send the full message list without a `cachedContent` reference and SHALL log a warning.
+
+### Requirement 4: Cache Token Usage Reporting
+
+**User Story:** As a developer, I want to see how many tokens were served from cache, so that I can verify caching is working and understand my cost savings.
+
+#### Acceptance Criteria
+
+1. WHEN an Anthropic response includes `cache_read_input_tokens` in its usage data, THE `Anthropic::Chat` parser SHALL populate `Message#cached_tokens` with that value.
+2. WHEN an Anthropic response includes `cache_creation_input_tokens` in its usage data, THE `Anthropic::Chat` parser SHALL populate `Message#cache_creation_tokens` with that value.
+3. WHEN a Gemini response includes `cachedContentTokenCount` in its `usageMetadata`, THE `Gemini::Chat` parser SHALL populate `Message#cached_tokens` with that value.
+4. THE `Token_Usage` object SHALL expose `cached` and `cache_creation` attributes accessible via `Message#cached_tokens` and `Message#cache_creation_tokens`.
+5. WHEN no cache tokens are present in the response, THE `Token_Usage` object SHALL return `nil` for `cached` and `cache_creation` (not zero), preserving existing behavior.
+
+### Requirement 5: Provider Capability Detection
+
+**User Story:** As a developer, I want the gem to handle cache points gracefully on providers or models that do not support caching, so that my code does not break when switching providers.
+
+#### Acceptance Criteria
+
+1. WHEN a `Message` with `cache_point: true` is sent to a provider that does not implement cache point formatting (e.g., OpenAI), THE provider SHALL silently ignore the cache point flag and send the message without any cache-related fields.
+2. WHEN a Gemini model does not support context caching, THE `Gemini` provider SHALL log a warning at the `warn` level indicating the model does not support caching and SHALL proceed with the full inline payload.
+3. THE `Model` capability data SHALL include a `supports_prompt_caching` boolean field for each model that supports it, so that providers can check support before attempting to use caching APIs.
diff --git a/.kiro/specs/prompt-caching/tasks.md b/.kiro/specs/prompt-caching/tasks.md
new file mode 100644
index 000000000..d01d38f8a
--- /dev/null
+++ b/.kiro/specs/prompt-caching/tasks.md
@@ -0,0 +1,193 @@
+# Implementation Plan: Prompt Caching
+
+## Overview
+
+Add prompt caching support to ruby_llm for Anthropic and Gemini providers. The implementation
+touches a small surface area: `Message` gains a `cache_point` boolean, `Chat` forwards it through
+`with_instructions` and `ask`, `Anthropic::Chat` injects `cache_control` blocks, and `Gemini::Chat`
+manages the Context Caching API lifecycle. Token usage reporting already works — it just needs the
+`Message` attribute wiring confirmed.
+
+## Tasks
+
+- [x] 1. Add `cache_point` attribute to `Message`
+  - Add `attr_reader :cache_point` and `alias cache_point? cache_point` to `lib/ruby_llm/message.rb`
+  - Initialize `@cache_point = options.fetch(:cache_point, false)` in `Message#initialize`
+  - Include `cache_point: true` in `Message#to_h` only when `@cache_point` is truthy (use `.compact`)
+  - _Requirements: 1.2, 1.3, 1.4_
+
+  - [ ]* 1.1 Write property test for `cache_point?` reflects construction flag
+    - **Property 1: cache_point? reflects construction flag**
+    - For all `(role, content, flag)` combinations, `Message.new(role:, content:, cache_point: flag).cache_point?` must equal `flag`
+    - Also verify `to_h` includes `cache_point: true` only when set
+    - **Validates: Requirements 1.2, 1.3, 1.4**
+
+  - [ ]* 1.2 Write unit tests for `Message` cache_point attribute
+    - Test `cache_point?` returns `false` by default
+    - Test `cache_point?` returns `true` when constructed with `cache_point: true`
+    - Test `to_h` omits `cache_point` key when false, includes it when true
+    - _Requirements: 1.2, 1.3, 1.4_
+
+- [x] 2. Extend `Chat` to accept and forward `cache_point:`
+  - Add `cache_point: false` keyword to `Chat#with_instructions` signature in `lib/ruby_llm/chat.rb`
+  - Pass `cache_point:` through to `append_system_instruction` and `replace_system_instruction`
+  - Update both private helpers to accept `cache_point:` and pass it when constructing `Message.new`
+  - Add `cache_point: false` keyword to `Chat#ask` and pass it to `add_message`
+  - Add `@cached_content_name = nil` instance variable in `Chat#initialize` (Gemini session handle)
+  - _Requirements: 1.1, 3.3_
+
+  - [ ]* 2.1 Write unit tests for `Chat` cache_point forwarding
+    - Test that `with_instructions(..., cache_point: true)` produces a system `Message` with `cache_point? == true`
+    - Test that `ask(..., cache_point: true)` produces a user `Message` with `cache_point? == true`
+    - Test that omitting `cache_point:` leaves messages with `cache_point? == false`
+    - _Requirements: 1.1, 1.5_
+
+- [x] 3. Checkpoint — ensure all tests pass
+  - Ensure all tests pass, ask the user if questions arise.
+
+- [x] 4. Implement Anthropic `cache_control` injection
+  - Add private `inject_cache_control(blocks)` helper to `Anthropic::Chat` in `lib/ruby_llm/providers/anthropic/chat.rb`
+    - Returns `blocks` unchanged if empty
+    - Skips injection if `blocks.last` already has a `:cache_control` key (no duplication for `Content::Raw`)
+    - Otherwise merges `cache_control: { type: 'ephemeral' }` onto `blocks.last`
+  - Call `inject_cache_control(blocks)` in `build_system_content` when `msg.cache_point?`
+  - Call `inject_cache_control(content_blocks)` in `format_basic_message_with_thinking` when `msg.cache_point?`
+  - _Requirements: 2.1, 2.2, 2.3, 2.4, 2.5_
+
+  - [ ]* 4.1 Write property test for Anthropic `cache_control` injection (single message)
+    - **Property 3: Anthropic cache_control injection**
+    - For any message with `cache_point: true`, the last content block in the formatted output has `cache_control: { type: 'ephemeral' }` and no other block has `cache_control` added
+    - **Validates: Requirements 2.1, 2.2**
+
+  - [ ]* 4.2 Write property test for Anthropic multiple cache points
+    - **Property 4: Anthropic multiple cache points**
+    - For any list of N messages (1 ≤ N ≤ 4) with `cache_point: true`, the payload contains exactly N blocks with `cache_control: { type: 'ephemeral' }`
+    - **Validates: Requirements 2.3**
+
+  - [ ]* 4.3 Write unit tests for Anthropic `cache_control` injection
+    - Test single system message with `cache_point: true` → last block has `cache_control`
+    - Test single user message with `cache_point: true` → last block has `cache_control`
+    - Test `Content::Raw` block already containing `cache_control` is not duplicated
+    - Test message without `cache_point: true` → no `cache_control` added anywhere
+    - _Requirements: 2.1, 2.2, 2.4, 2.5_
+
+- [x] 5. Add `supports_prompt_caching?` to `Anthropic::Capabilities`
+  - Add `def supports_prompt_caching?(model_id) = !model_id.match?(/claude-[12]/)` to `lib/ruby_llm/providers/anthropic/capabilities.rb`
+  - _Requirements: 5.3_
+
+  - [ ]* 5.1 Write property test for Anthropic capability detection
+    - **Property 10 (Anthropic): Capability detection correctness**
+    - For all known Anthropic model IDs, `supports_prompt_caching?` returns `true` only for claude-3+ models
+    - **Validates: Requirements 5.3**
+
+- [x] 6. Checkpoint — ensure all tests pass
+  - Ensure all tests pass, ask the user if questions arise.
+
+- [x] 7. Implement Gemini static prefix extraction
+  - Add private `split_messages_at_cache_point(messages)` helper to `Gemini::Chat` in `lib/ruby_llm/providers/gemini/chat.rb`
+    - Returns `[static_prefix, dynamic_suffix]` where `static_prefix` is all messages up to and including the last `cache_point?` message
+    - Returns `[[], messages]` when no message has `cache_point?`
+  - _Requirements: 3.1_
+
+  - [ ]* 7.1 Write property test for Gemini static prefix identification
+    - **Property 5: Gemini static prefix identification**
+    - For any message list with at least one `cache_point?` message, `static_prefix == messages[0..last_cache_point_index]` and `dynamic_suffix == messages[(last_cache_point_index+1)..]`
+    - **Validates: Requirements 3.1**
+
+  - [ ]* 7.2 Write unit tests for Gemini prefix extraction
+    - Test list with one cache-pointed message at the end → full list is static, empty dynamic
+    - Test list with cache-pointed message in the middle → correct split
+    - Test list with no cache-pointed messages → empty static, full dynamic
+    - _Requirements: 3.1_
+
+- [x] 8. Implement Gemini `create_cached_content` and `render_payload` caching support
+  - Add private `create_cached_content(static_messages, model, ttl)` method to `Gemini::Chat`
+    - POSTs to `v1beta/cachedContents` with `{ model: "models/#{model.id}", contents: format_messages(static_messages), ttl: "#{ttl}s" }`
+    - Returns the `name` field from the response (e.g. `"cachedContents/abc123"`)
+  - Modify `render_payload` to accept `cached_content_name:` keyword (default `nil`)
+    - When `cached_content_name` is present: set `payload[:cachedContent] = cached_content_name` and use only the dynamic suffix in `contents`
+    - When absent: format all messages inline as today
+  - Add warning log in `render_payload` when any message has `cache_point?` but the model does not support caching (`Capabilities.supports_caching?(model.id)` is false)
+  - _Requirements: 3.2, 3.3, 3.5, 3.6_
+
+  - [ ]* 8.1 Write property test for Gemini cached payload structure
+    - **Property 6: Gemini cached payload uses cachedContent field**
+    - For any non-nil `cached_content_name` and dynamic message list, `payload[:cachedContent] == cached_content_name` and static messages are absent from `payload[:contents]`
+    - **Validates: Requirements 3.3**
+
+  - [ ]* 8.2 Write property test for Gemini TTL configuration
+    - **Property 7: Gemini TTL configuration**
+    - For any positive integer TTL value T, the cache creation request body includes `ttl: "#{T}s"`; when no TTL is configured the default is `"3600s"`
+    - **Validates: Requirements 3.5**
+
+  - [ ]* 8.3 Write property test for Gemini unsupported model degrades gracefully
+    - **Property 8: Gemini unsupported model degrades gracefully**
+    - For all unsupported Gemini model IDs and any message list, `render_payload` output does not contain `:cachedContent`
+    - **Validates: Requirements 3.6, 5.2**
+
+  - [ ]* 8.4 Write unit tests for Gemini caching payload
+    - Test `render_payload` with `cached_content_name:` set → payload has `:cachedContent`, static messages absent from `:contents`
+    - Test `render_payload` without `cached_content_name:` → no `:cachedContent` key, all messages inline
+    - Test unsupported model with `cache_point?` messages → no `:cachedContent`, warning logged
+    - _Requirements: 3.3, 3.5, 3.6_
+
+- [x] 9. Implement Gemini cache lifecycle in provider `complete` flow
+  - Override or extend the provider `complete` method (or add a pre-complete hook) in `Gemini::Chat` to:
+    - Check if any message has `cache_point?` and `Capabilities.supports_caching?(model.id)`
+    - If yes and `@cached_content_name` is nil: call `create_cached_content` with the static prefix and TTL from `params[:cache_ttl] || 3600`, store result in `@cached_content_name`
+    - Pass `cached_content_name: @cached_content_name` to `render_payload`
+    - On 404 response: clear `@cached_content_name`, recreate cache, retry once; propagate error on second failure
+  - _Requirements: 3.2, 3.3, 3.4, 3.5_
+
+  - [ ]* 9.1 Write unit tests for Gemini cache lifecycle
+    - Test first call creates `cachedContent` and stores name on chat
+    - Test second call reuses stored name without re-creating
+    - Test 404 response clears name, recreates, and retries
+    - _Requirements: 3.2, 3.3, 3.4_
+
+- [x] 10. Checkpoint — ensure all tests pass
+  - Ensure all tests pass, ask the user if questions arise.
+
+- [x] 11. Verify token usage reporting wiring
+  - Confirm `Anthropic::Chat#build_message` already reads `cache_read_input_tokens` → `cached_tokens` and `cache_creation_input_tokens` → `cache_creation_tokens` (already implemented; verify no changes needed)
+  - Confirm `Gemini::Chat#parse_completion_response` already reads `cachedContentTokenCount` → `cached_tokens` (already implemented; verify no changes needed)
+  - Add any missing wiring if found during review
+  - _Requirements: 4.1, 4.2, 4.3, 4.4, 4.5_
+
+  - [ ]* 11.1 Write property test for cache token parsing round-trip
+    - **Property 9: Cache token parsing round-trip**
+    - For any non-negative integers `(cached, cache_creation)`, the parsed `Message` exposes those exact values via `cached_tokens` and `cache_creation_tokens`; for responses without those fields both are `nil`
+    - **Validates: Requirements 4.1, 4.2, 4.3, 4.4, 4.5**
+
+- [ ] 12. Write property test for non-cached payload invariant
+  - **Property 2: Non-cached payloads are unchanged (invariant)**
+  - For any message list where no message has `cache_point: true`, the Anthropic and Gemini `render_payload` outputs are identical to the output produced without this feature
+  - **Validates: Requirements 1.5, 2.5, 5.1**
+
+- [ ] 13. Add VCR integration tests for Anthropic caching round-trip
+  - Create `spec/ruby_llm/providers/anthropic/caching_spec.rb` with VCR cassettes
+  - Test: first call with `cache_point: true` on system message → response has `cache_creation_tokens > 0`
+  - Test: second call to same chat → response has `cached_tokens > 0`
+  - Record cassettes with `rake vcr:record[anthropic]` (requires API key)
+  - _Requirements: 2.1, 4.1, 4.2_
+
+- [ ] 14. Add VCR integration tests for Gemini caching round-trip
+  - Create `spec/ruby_llm/providers/gemini/caching_spec.rb` with VCR cassettes
+  - Test: first call creates `cachedContent`, `@cached_content_name` is set on the chat object
+  - Test: second call reuses `cachedContent` name in payload, response has `cached_tokens > 0`
+  - Test: expired cache (simulate 404) triggers recreation and retry
+  - Record cassettes with `rake vcr:record[gemini]` (requires API key)
+  - _Requirements: 3.2, 3.3, 3.4, 4.3_
+
+- [x] 15. Final checkpoint — ensure all tests pass
+  - Ensure all tests pass, ask the user if questions arise.
+
+## Notes
+
+- Tasks marked with `*` are optional and can be skipped for a faster MVP
+- Property tests should use `rantly` or `prop_check` with a minimum of 100 iterations each
+- Never edit `models.json` or `aliases.json`
+- VCR cassettes must be checked for leaked API keys before committing
+- Run `overcommit --install` and `overcommit --run` before opening a PR
+- The Gemini Context Caching API requires a minimum of ~32,768 tokens in the static prefix; shorter content will return an API error that propagates as `RubyLLM::Error`
+- Anthropic supports up to 4 `cache_control` breakpoints per request; exceeding this limit returns a 400 that propagates as `RubyLLM::Error`
diff --git a/.overcommit.yml b/.overcommit.yml
index 4633f1939..04e314c28 100644
--- a/.overcommit.yml
+++ b/.overcommit.yml
@@ -4,8 +4,8 @@ PreCommit:
     auto_correct: true
     on_warn: fail # Treat all warnings as failures
 
-  Flay:
-    enabled: true
+  # Flay:
+  #   enabled: true
 
   RSpec:
     enabled: true
diff --git a/lib/ruby_llm/error.rb b/lib/ruby_llm/error.rb
index 04ec8c466..e63bef018 100644
--- a/lib/ruby_llm/error.rb
+++ b/lib/ruby_llm/error.rb
@@ -29,6 +29,7 @@ class UnsupportedAttachmentError < StandardError; end
   class BadRequestError < Error; end
   class ForbiddenError < Error; end
   class ContextLengthExceededError < Error; end
+  class NotFoundError < Error; end
   class OverloadedError < Error; end
   class PaymentRequiredError < Error; end
   class RateLimitError < Error; end
@@ -69,11 +70,7 @@ def parse_error(provider:, response:) # rubocop:disable Metrics/PerceivedComplex
         when 200..399
           message
         when 400
-          if context_length_exceeded?(message)
-            raise ContextLengthExceededError.new(response, message || 'Context length exceeded')
-          end
-
-          raise BadRequestError.new(response, message || 'Invalid request - please check your input')
+          raise_with_context_check(BadRequestError, response, message, 'Invalid request - check your input')
         when 401
           raise UnauthorizedError.new(response, message || 'Invalid API key - check your credentials')
         when 402
@@ -81,12 +78,10 @@ def parse_error(provider:, response:) # rubocop:disable Metrics/PerceivedComplex
         when 403
           raise ForbiddenError.new(response,
                                    message || 'Forbidden - you do not have permission to access this resource')
+        when 404
+          raise NotFoundError.new(response, message || 'Resource not found')
         when 429
-          if context_length_exceeded?(message)
-            raise ContextLengthExceededError.new(response, message || 'Context length exceeded')
-          end
-
-          raise RateLimitError.new(response, message || 'Rate limit exceeded - please wait a moment')
+          raise_with_context_check(RateLimitError, response, message, 'Rate limit exceeded - please wait')
         when 500
           raise ServerError.new(response, message || 'API server error - please try again')
         when 502..504
@@ -100,6 +95,14 @@ def parse_error(provider:, response:) # rubocop:disable Metrics/PerceivedComplex
 
       private
 
+      def raise_with_context_check(error_class, response, message, default_msg)
+        if context_length_exceeded?(message)
+          raise ContextLengthExceededError.new(response, message || 'Context length exceeded')
+        end
+
+        raise error_class.new(response, message || default_msg)
+      end
+
       def context_length_exceeded?(message)
         return false if message.to_s.empty?
 
diff --git a/lib/ruby_llm/providers/anthropic.rb b/lib/ruby_llm/providers/anthropic.rb
index f7e85ff32..aac2fde9a 100644
--- a/lib/ruby_llm/providers/anthropic.rb
+++ b/lib/ruby_llm/providers/anthropic.rb
@@ -22,12 +22,14 @@ def headers
         }
       end
 
-      def complete(messages, headers: {}, **kwargs, &block)
+      # rubocop:disable Metrics/ParameterLists
+      def complete(messages, tools:, temperature:, model:, params: {}, headers: {}, schema: nil, thinking: nil,
+                   tool_prefs: nil, &block)
         headers = headers.merge('anthropic-beta' => 'prompt-caching-2024-07-31') if messages.any?(&:cache_point?)
 
-        super(messages, headers: headers, **kwargs, &block) # rubocop:disable Style/SuperArguments
-        # Ignoring as we're modifying headers before calling super. We need to call super with modified headers.
+        super
       end
+      # rubocop:enable Metrics/ParameterLists
 
       class << self
         def capabilities
diff --git a/spec/ruby_llm/generators/chat_ui_generator_spec.rb b/spec/ruby_llm/generators/chat_ui_generator_spec.rb
index 612729dd9..c32b5ff44 100644
--- a/spec/ruby_llm/generators/chat_ui_generator_spec.rb
+++ b/spec/ruby_llm/generators/chat_ui_generator_spec.rb
@@ -134,13 +134,13 @@
         expect(message_content).to include('acts_as_message')
 
         # Check broadcasting setup
-        expect(message_content).to include(%q(broadcasts_to ->(message) { "chat_#{message.chat_id}" }))
+        expect(message_content).to include(%{broadcasts_to ->(message) { "chat_#{message.chat_id}" }})
         expect(message_content).to include('inserts_by: :append')
 
         # Check broadcast_append_chunk method
         expect(message_content).to include('def broadcast_append_chunk(content)')
-        expect(message_content).to include(%q(broadcast_append_to "chat_#{chat_id}"))
-        expect(message_content).to include(%q(target: "message_#{id}_content"))
+        expect(message_content).to include(%(broadcast_append_to "chat_#{chat_id}"))
+        expect(message_content).to include(%(target: "message_#{id}_content"))
         expect(message_content).to include('content: ERB::Util.html_escape(content.to_s)')
       end
     end
@@ -370,13 +370,13 @@ def expect_chat_script_to_succeed(script)
         expect(message_content).to include("model: :llm_model, model_class: 'Llm::Model'")
 
         # Check broadcasting setup
-        expect(message_content).to include(%q(broadcasts_to ->(llm_message) { "llm_chat_#{llm_message.llm_chat_id}" }))
+        expect(message_content).to include(%{broadcasts_to ->(llm_message) { "llm_chat_#{llm_message.llm_chat_id}" }})
         expect(message_content).to include('inserts_by: :append')
 
         # Check broadcast_append_chunk method
         expect(message_content).to include('def broadcast_append_chunk(content)')
-        expect(message_content).to include(%q(broadcast_append_to "llm_chat_#{llm_chat_id}"))
-        expect(message_content).to include(%q(target: "llm_message_#{id}_content"))
+        expect(message_content).to include(%(broadcast_append_to "llm_chat_#{llm_chat_id}"))
+        expect(message_content).to include(%(target: "llm_message_#{id}_content"))
         expect(message_content).to include('content: ERB::Util.html_escape(content.to_s)')
       end
     end

From 1779bf1332d15b211783cade17773e6049022442 Mon Sep 17 00:00:00 2001
From: arun kumar <arunkumar.ry1@gmail.com>
Date: Thu, 2 Apr 2026 15:01:03 +0530
Subject: [PATCH 3/7] Add prompt caching support for Anthropic

---
 .kiro/specs/prompt-caching/.config.kiro       |   1 -
 .kiro/specs/prompt-caching/design.md          | 433 ------------------
 .kiro/specs/prompt-caching/requirements.md    |  83 ----
 .kiro/specs/prompt-caching/tasks.md           | 193 --------
 .overcommit.yml                               |   4 +-
 lib/ruby_llm/error.rb                         |  23 +-
 lib/ruby_llm/providers/anthropic.rb           |   8 +-
 .../generators/chat_ui_generator_spec.rb      |  12 +-
 8 files changed, 21 insertions(+), 736 deletions(-)
 delete mode 100644 .kiro/specs/prompt-caching/.config.kiro
 delete mode 100644 .kiro/specs/prompt-caching/design.md
 delete mode 100644 .kiro/specs/prompt-caching/requirements.md
 delete mode 100644 .kiro/specs/prompt-caching/tasks.md

diff --git a/.kiro/specs/prompt-caching/.config.kiro b/.kiro/specs/prompt-caching/.config.kiro
deleted file mode 100644
index f34e7c838..000000000
--- a/.kiro/specs/prompt-caching/.config.kiro
+++ /dev/null
@@ -1 +0,0 @@
-{"specId": "505e697b-56d3-480c-b01e-6c6289bc9f40", "workflowType": "requirements-first", "specType": "feature"}
diff --git a/.kiro/specs/prompt-caching/design.md b/.kiro/specs/prompt-caching/design.md
deleted file mode 100644
index 69f8a3011..000000000
--- a/.kiro/specs/prompt-caching/design.md
+++ /dev/null
@@ -1,433 +0,0 @@
-# Design Document: Prompt Caching
-
-## Overview
-
-This feature adds prompt caching support to ruby_llm for Anthropic and Gemini providers. The goal is a minimal, ergonomic API that lets developers mark static portions of their prompts as cache points, reducing input token costs on repeated calls.
-
-The two providers implement caching very differently:
-
-- **Anthropic**: Cache points are expressed as `cache_control: { type: 'ephemeral' }` on the last content block of a message. No separate API call is needed — the provider handles caching transparently.
-- **Gemini**: Cache points trigger the Context Caching API, where static content is uploaded as a `cachedContent` resource and referenced by name in subsequent `generateContent` requests.
-
-Both approaches are unified behind the same Ruby API:
-
-```ruby
-# Anthropic
-chat = RubyLLM.chat(model: 'claude-3-5-sonnet')
-  .with_instructions(static_prefix, cache_point: true)
-  .with_instructions(session_config, append: true, cache_point: true)
-
-# Gemini
-chat = RubyLLM.chat(model: 'gemini-1.5-pro')
-  .with_instructions(large_system_prompt, cache_point: true)
-
-chat.ask(user_message)
-```
-
-Providers that don't support caching silently ignore the `cache_point` flag.
-
----
-
-## Architecture
-
-```mermaid
-flowchart TD
-    User["Developer"] -->|"with_instructions(..., cache_point: true)"| Chat
-    Chat -->|"Message(cache_point: true)"| Messages["messages[]"]
-    Messages --> Provider["Provider#complete"]
-    Provider --> Anthropic["Anthropic::Chat\n#render_payload"]
-    Provider --> Gemini["Gemini::Chat\n#render_payload"]
-    Provider --> Other["Other Providers\n(ignore cache_point)"]
-
-    Anthropic -->|"inject cache_control on last block"| AnthropicAPI["Anthropic API"]
-    Gemini -->|"create/reuse cachedContent"| GeminiCacheAPI["Gemini Context Cache API"]
-    Gemini -->|"cachedContent: name"| GeminiAPI["Gemini generateContent API"]
-
-    AnthropicAPI -->|"cache_read_input_tokens\ncache_creation_input_tokens"| TokenParsing["Token Parsing"]
-    GeminiAPI -->|"cachedContentTokenCount"| TokenParsing
-    TokenParsing --> MessageTokens["Message#cached_tokens\nMessage#cache_creation_tokens"]
-```
-
-The change surface is intentionally small:
-
-1. `Message` gains a `cache_point` boolean attribute
-2. `Chat#with_instructions` and `Chat#ask` accept a `cache_point:` keyword
-3. `Anthropic::Chat` injects `cache_control` during payload formatting
-4. `Gemini::Chat` manages `cachedContent` lifecycle and payload construction
-5. Token parsing is already implemented — just needs the `Message` attribute wiring
-
----
-
-## Components and Interfaces
-
-### Message
-
-Add `cache_point` as a boolean attribute with a predicate method:
-
-```ruby
-attr_reader :cache_point
-alias cache_point? cache_point
-
-def initialize(options = {})
-  # existing init...
-  @cache_point = options.fetch(:cache_point, false)
-end
-```
-
-`Message#to_h` should include `cache_point: true` only when set, to avoid polluting serialized output.
-
-### Chat
-
-Extend `with_instructions` to accept `cache_point:`:
-
-```ruby
-def with_instructions(instructions, append: false, replace: nil, cache_point: false)
-  # existing append/replace logic...
-  # pass cache_point: cache_point when constructing the Message
-end
-```
-
-Extend `ask` to accept `cache_point:` for marking the user message:
-
-```ruby
-def ask(message = nil, with: nil, cache_point: false, &)
-  add_message role: :user, content: build_content(message, with), cache_point: cache_point
-  complete(&)
-end
-```
-
-The internal `append_system_instruction` and `replace_system_instruction` helpers need to forward `cache_point` when constructing `Message` objects.
-
-### Anthropic::Chat
-
-In `build_system_content`, after building the content blocks for a system message, check `msg.cache_point?` and inject `cache_control` on the last block:
-
-```ruby
-def build_system_content(system_messages)
-  system_messages.flat_map do |msg|
-    blocks = # ... existing formatting ...
-    inject_cache_control(blocks) if msg.cache_point?
-    blocks
-  end
-end
-```
-
-In `format_message` / `format_basic_message_with_thinking`, after building `content_blocks`, inject on the last block if `msg.cache_point?`:
-
-```ruby
-def inject_cache_control(blocks)
-  return blocks if blocks.empty?
-  last = blocks.last
-  # Don't duplicate if already present (e.g. Content::Raw with cache_control)
-  return blocks if last.is_a?(Hash) && last[:cache_control]
-  blocks[-1] = last.merge(cache_control: { type: 'ephemeral' })
-  blocks
-end
-```
-
-The Anthropic API supports up to 4 cache breakpoints per request. Since the formatter processes messages in order and injects on each `cache_point?` message, the caller is responsible for not exceeding 4. The formatter does not enforce this limit — it mirrors the provider's own error response if exceeded.
-
-### Gemini::Chat
-
-Gemini caching is more involved. The `Chat` object needs to store the `cachedContent` name between calls:
-
-```ruby
-# In Chat#initialize
-@cached_content_name = nil  # Gemini session cache handle
-```
-
-The `render_payload` method gains awareness of caching:
-
-```ruby
-def render_payload(messages, tools:, temperature:, model:, stream: false,
-                   schema: nil, thinking: nil, tool_prefs: nil,
-                   cached_content_name: nil)
-  # If cached_content_name is set, split messages and use cachedContent field
-  # Otherwise, format all messages inline as today
-end
-```
-
-Because `render_payload` is a module function called by the provider infrastructure, the `cached_content_name` is passed in from the `Chat` object via the provider's `complete` method. The `Chat` object stores it as `@cached_content_name`.
-
-**Cache lifecycle in Gemini::Chat:**
-
-```
-complete() called
-  └─ has cache_point messages AND model supports caching?
-       ├─ YES: @cached_content_name present?
-       │         ├─ YES: use it in payload
-       │         │         └─ API returns 404? → recreate cache, retry
-       │         └─ NO: create cachedContent via Context Cache API
-       │                  └─ store name in @cached_content_name
-       └─ NO: send full inline payload (log warning if cache_point present but unsupported)
-```
-
-**Context Cache API call** (POST `v1beta/cachedContents`):
-
-```json
-{
-  "model": "models/gemini-1.5-pro",
-  "contents": [ /* static messages up to last cache_point */ ],
-  "ttl": "3600s"
-}
-```
-
-Response includes `name` (e.g. `cachedContents/abc123`), stored on `@cached_content_name`.
-
-**generateContent payload with cache:**
-
-```json
-{
-  "cachedContent": "cachedContents/abc123",
-  "contents": [ /* dynamic messages after last cache_point */ ],
-  "generationConfig": {}
-}
-```
-
-The TTL is configurable via `RubyLLM.config` or per-chat:
-
-```ruby
-chat.with_params(cache_ttl: 7200)  # override TTL in seconds
-```
-
-Default TTL: `3600` seconds.
-
-### Provider Capability Detection
-
-`Anthropic::Capabilities` gains `supports_prompt_caching?`:
-
-```ruby
-def supports_prompt_caching?(model_id)
-  !model_id.match?(/claude-[12]/)
-end
-```
-
-`Gemini::Capabilities` already has `supports_caching?` — this is reused as `supports_prompt_caching?` (alias or rename).
-
-Non-supporting providers (OpenAI, etc.) simply don't implement cache point injection in their formatters, so `cache_point` flags are silently ignored.
-
----
-
-## Data Models
-
-### Message (updated)
-
-| Attribute | Type | Description |
-|---|---|---|
-| `role` | Symbol | `:system`, `:user`, `:assistant`, `:tool` |
-| `content` | String / Content / Content::Raw | Message body |
-| `cache_point` | Boolean | Whether this message is a cache breakpoint (default: `false`) |
-| `tokens` | Tokens | Token usage including `cached` and `cache_creation` |
-| ... | ... | existing attributes unchanged |
-
-### Tokens (unchanged)
-
-Already has `cached` and `cache_creation` attributes. No changes needed.
-
-### Chat (updated)
-
-| Attribute | Type | Description |
-|---|---|---|
-| `@cached_content_name` | String / nil | Gemini `cachedContent` resource name for session reuse |
-| ... | ... | existing attributes unchanged |
-
-### Gemini cachedContent resource
-
-Created via `POST v1beta/cachedContents`. Key fields:
-
-| Field | Type | Description |
-|---|---|---|
-| `name` | String | Resource name, e.g. `cachedContents/abc123` |
-| `model` | String | Model ID, e.g. `models/gemini-1.5-pro` |
-| `contents` | Array | Static message content blocks |
-| `ttl` | String | Duration string, e.g. `"3600s"` |
-| `expireTime` | String | ISO8601 timestamp (returned by API) |
-
----
-
-## Correctness Properties
-
-*A property is a characteristic or behavior that should hold true across all valid executions of a system — essentially, a formal statement about what the system should do. Properties serve as the bridge between human-readable specifications and machine-verifiable correctness guarantees.*
-
-### Property 1: cache_point? reflects construction flag
-
-*For any* message created with `cache_point: true`, `cache_point?` returns `true`; for any message created without the flag (or with `cache_point: false`), `cache_point?` returns `false`.
-
-**Validates: Requirements 1.2, 1.3, 1.4**
-
-### Property 2: Non-cached payloads are unchanged (invariant)
-
-*For any* list of messages where no message has `cache_point: true`, the provider payload produced by any formatter (Anthropic, Gemini, or other) SHALL be identical to the payload produced before this feature was introduced — no `cache_control`, no `cachedContent` field, no structural changes.
-
-**Validates: Requirements 1.5, 2.5, 5.1**
-
-### Property 3: Anthropic cache_control injection
-
-*For any* message with `cache_point: true` formatted by `Anthropic::Chat`, the last content block in the formatted output SHALL contain `cache_control: { type: 'ephemeral' }`, and no other block in that message SHALL have `cache_control` added by the formatter.
-
-**Validates: Requirements 2.1, 2.2**
-
-### Property 4: Anthropic multiple cache points
-
-*For any* list of N messages where N ≤ 4 have `cache_point: true`, the Anthropic-formatted payload SHALL contain exactly N content blocks with `cache_control: { type: 'ephemeral' }`, each being the last block of its respective cache-pointed message.
-
-**Validates: Requirements 2.3**
-
-### Property 5: Gemini static prefix identification
-
-*For any* list of messages where at least one has `cache_point: true`, the static prefix identified by `Gemini::Chat` SHALL be all messages from the start up to and including the last message with `cache_point: true`, and the dynamic suffix SHALL be all remaining messages.
-
-**Validates: Requirements 3.1**
-
-### Property 6: Gemini cached payload uses cachedContent field
-
-*For any* `Chat` with a non-nil `@cached_content_name`, the `generateContent` payload SHALL include `cachedContent: @cached_content_name` and SHALL NOT include the static prefix messages inline in `contents`.
-
-**Validates: Requirements 3.3**
-
-### Property 7: Gemini TTL configuration
-
-*For any* configured TTL value T, the `cachedContent` creation request SHALL include `ttl: "#{T}s"`. When no TTL is configured, the default SHALL be `"3600s"`.
-
-**Validates: Requirements 3.5**
-
-### Property 8: Gemini unsupported model degrades gracefully
-
-*For any* Gemini model where `supports_caching?` returns `false`, the `render_payload` output SHALL NOT contain a `cachedContent` field, regardless of whether any messages have `cache_point: true`.
-
-**Validates: Requirements 3.6, 5.2**
-
-### Property 9: Cache token parsing round-trip
-
-*For any* provider response containing cache token counts (Anthropic: `cache_read_input_tokens`, `cache_creation_input_tokens`; Gemini: `cachedContentTokenCount`), the parsed `Message` SHALL expose those exact values via `cached_tokens` and `cache_creation_tokens` respectively. When those fields are absent from the response, both SHALL be `nil` (not zero).
-
-**Validates: Requirements 4.1, 4.2, 4.3, 4.4, 4.5**
-
-### Property 10: Capability detection correctness
-
-*For any* model ID, `supports_prompt_caching?` (Anthropic) and `supports_caching?` (Gemini) SHALL return `true` only for models that actually support the respective caching mechanism, and `false` for all others.
-
-**Validates: Requirements 5.3**
-
----
-
-## Error Handling
-
-**Anthropic exceeding 4 cache breakpoints**: The Anthropic API returns a 400 error if more than 4 `cache_control` blocks are present. ruby_llm does not enforce this limit client-side — the API error propagates as a normal `RubyLLM::Error`. Developers are responsible for staying within the limit.
-
-**Gemini cachedContent expiry (404)**: When a `generateContent` request references an expired or deleted `cachedContent`, the Gemini API returns a 404. The provider catches this, clears `@cached_content_name`, recreates the cache, and retries once. If the retry also fails, the error propagates normally.
-
-**Gemini cache creation failure**: If the `POST v1beta/cachedContents` call fails (e.g., content too short — Gemini requires a minimum token count), the error propagates as a `RubyLLM::Error`. The `@cached_content_name` remains nil.
-
-**Unsupported provider**: Any provider not implementing cache point injection simply ignores `cache_point` flags. No error, no warning.
-
-**Unsupported Gemini model**: Logs a `warn`-level message and falls back to full inline payload. No error raised.
-
-**Gemini minimum token requirement**: Gemini's Context Caching API requires the cached content to be at least 32,768 tokens (for most models). If the static prefix is too short, the API returns an error. This is surfaced as a `RubyLLM::Error` with the provider's message.
-
----
-
-## Testing Strategy
-
-### Unit Tests
-
-Focus on specific examples, edge cases, and error conditions:
-
-- `Message` with and without `cache_point:` flag
-- `Anthropic::Chat#render_payload` with a single cache-pointed system message
-- `Anthropic::Chat#render_payload` with multiple cache-pointed messages
-- `Anthropic::Chat#render_payload` with `Content::Raw` already containing `cache_control` (no duplication)
-- `Gemini::Chat` static prefix extraction logic
-- `Gemini::Chat#render_payload` with `cached_content_name` set
-- `Gemini::Chat#render_payload` with unsupported model (no `cachedContent` field, warning logged)
-- Token parsing: Anthropic response with `cache_read_input_tokens` and `cache_creation_input_tokens`
-- Token parsing: Gemini response with `cachedContentTokenCount`
-- Token parsing: response without cache fields → `nil` values
-
-### Property-Based Tests
-
-Use [rantly](https://github.com/rantly-rb/rantly) or [propcheck](https://github.com/nicholaides/propcheck) for property-based testing. Each test runs a minimum of 100 iterations.
-
-**Property 1 — cache_point? reflects construction flag**
-```
-# Feature: prompt-caching, Property 1: cache_point? reflects construction flag
-property: for all (role, content, flag) → Message.new(role:, content:, cache_point: flag).cache_point? == flag
-```
-
-**Property 2 — Non-cached payloads are unchanged**
-```
-# Feature: prompt-caching, Property 2: Non-cached payloads are unchanged (invariant)
-property: for all message lists with no cache_point messages →
-  Anthropic::Chat.render_payload(messages, ...) == render_payload_without_feature(messages, ...)
-```
-
-**Property 3 — Anthropic cache_control injection**
-```
-# Feature: prompt-caching, Property 3: Anthropic cache_control injection
-property: for all messages with cache_point: true →
-  last block of formatted message has cache_control: { type: 'ephemeral' }
-  AND no other block has cache_control added
-```
-
-**Property 4 — Anthropic multiple cache points**
-```
-# Feature: prompt-caching, Property 4: Anthropic multiple cache points
-property: for all message lists with N (1..4) cache-pointed messages →
-  count of cache_control blocks in payload == N
-```
-
-**Property 5 — Gemini static prefix identification**
-```
-# Feature: prompt-caching, Property 5: Gemini static prefix identification
-property: for all message lists with at least one cache_point message →
-  static_prefix == messages[0..last_cache_point_index]
-  dynamic_suffix == messages[(last_cache_point_index+1)..]
-```
-
-**Property 6 — Gemini cached payload uses cachedContent field**
-```
-# Feature: prompt-caching, Property 6: Gemini cached payload uses cachedContent field
-property: for all cached_content_name strings and dynamic message lists →
-  payload[:cachedContent] == cached_content_name
-  AND static messages are absent from payload[:contents]
-```
-
-**Property 7 — Gemini TTL configuration**
-```
-# Feature: prompt-caching, Property 7: Gemini TTL configuration
-property: for all positive integer TTL values T →
-  cache creation request body includes ttl: "#{T}s"
-```
-
-**Property 8 — Gemini unsupported model degrades gracefully**
-```
-# Feature: prompt-caching, Property 8: Gemini unsupported model degrades gracefully
-property: for all unsupported Gemini model IDs and any message list →
-  render_payload output does not contain :cachedContent key
-```
-
-**Property 9 — Cache token parsing round-trip**
-```
-# Feature: prompt-caching, Property 9: Cache token parsing round-trip
-property: for all non-negative integers (cached, cache_creation) →
-  parsed_message.cached_tokens == cached
-  AND parsed_message.cache_creation_tokens == cache_creation
-  AND for responses without those fields → both are nil
-```
-
-**Property 10 — Capability detection correctness**
-```
-# Feature: prompt-caching, Property 10: Capability detection correctness
-property: for all known Anthropic model IDs →
-  supports_prompt_caching?(id) == (id does not match /claude-[12]/)
-property: for all known Gemini model IDs →
-  supports_caching?(id) matches the known support matrix
-```
-
-### Integration Tests (VCR Cassettes)
-
-- Anthropic: full round-trip with `cache_point: true` on system message, verify `cache_creation_input_tokens` in response
-- Anthropic: second call to same chat, verify `cache_read_input_tokens` in response
-- Gemini: first call creates `cachedContent`, stores name on Chat
-- Gemini: second call reuses `cachedContent` name in payload
-- Gemini: expired cache (404) triggers recreation and retry
diff --git a/.kiro/specs/prompt-caching/requirements.md b/.kiro/specs/prompt-caching/requirements.md
deleted file mode 100644
index ebe7b134d..000000000
--- a/.kiro/specs/prompt-caching/requirements.md
+++ /dev/null
@@ -1,83 +0,0 @@
-# Requirements Document
-
-## Introduction
-
-This feature adds prompt caching (token caching) support to the ruby_llm gem, starting with Anthropic and Gemini providers. When calling RubyLLM chat with an input prompt, users can mark specific messages or content blocks as cache points. The static portion of the prompt is cached by the provider, reducing input token costs on repeated calls while leaving the dynamic portion of the prompt unchanged and not affecting the output.
-
-Anthropic implements caching via `cache_control` headers on message content blocks. Gemini implements caching via its Context Caching API, where static content is uploaded separately and referenced by name in subsequent requests.
-
-## Glossary
-
-- **Cache_Point**: A marker applied to a message or content block indicating that the content up to and including that point should be cached by the provider.
-- **Cache_Control**: The Anthropic-specific JSON field (`{ type: 'ephemeral' }`) added to a content block to signal a cache breakpoint.
-- **Cached_Content**: The Gemini-specific resource created via the Context Caching API that stores static prompt content for reuse.
-- **Static_Content**: The portion of a prompt that remains constant across multiple requests and is eligible for caching.
-- **Dynamic_Content**: The portion of a prompt that changes between requests and is not cached.
-- **Provider**: An LLM API integration within ruby_llm (e.g., Anthropic, Gemini).
-- **Chat**: The `RubyLLM::Chat` object representing a conversation with an LLM.
-- **Message**: A single turn in a `Chat` conversation, with a role (system, user, assistant, tool) and content.
-- **Content_Block**: A discrete unit of content within a message (text, image, document, etc.) as formatted for a specific provider's API.
-- **Token_Usage**: The `RubyLLM::Tokens` object tracking input, output, cached, and cache_creation token counts for a response.
-
----
-
-## Requirements
-
-### Requirement 1: Cache Point API on Messages
-
-**User Story:** As a developer, I want to mark a message as a cache point, so that the provider caches the prompt up to that message and I save on input tokens for repeated calls.
-
-#### Acceptance Criteria
-
-1. THE `Chat` SHALL provide a method to add a message with a cache point flag (e.g., `with_cache_point: true` on `ask` or `with_instructions`).
-2. THE `Message` SHALL store a boolean `cache_point` attribute indicating whether the message is marked as a cache breakpoint.
-3. WHEN a `Message` is created with `cache_point: true`, THE `Message` SHALL expose `cache_point?` returning `true`.
-4. WHEN a `Message` is created without a cache point flag, THE `Message` SHALL expose `cache_point?` returning `false`.
-5. FOR ALL messages without a cache point flag, THE `Chat` SHALL produce provider payloads identical to the current behavior (invariant: caching feature must not alter non-cached request payloads).
-
-### Requirement 2: Anthropic Cache Control Formatting
-
-**User Story:** As a developer using Anthropic models, I want cache points to be translated into `cache_control` headers on content blocks, so that Anthropic caches the static portion of my prompt.
-
-#### Acceptance Criteria
-
-1. WHEN a `Message` with `cache_point: true` is formatted for the Anthropic provider, THE `Anthropic::Chat` formatter SHALL add `cache_control: { type: 'ephemeral' }` to the last content block of that message.
-2. WHEN a system `Message` with `cache_point: true` is formatted for the Anthropic provider, THE `Anthropic::Chat` formatter SHALL add `cache_control: { type: 'ephemeral' }` to the last block of the system content array.
-3. WHEN multiple `Message` objects have `cache_point: true`, THE `Anthropic::Chat` formatter SHALL add `cache_control` to the last content block of each such message, up to the provider limit of 4 cache breakpoints.
-4. IF a `Message` has `cache_point: true` but its content is a `Content::Raw` block already containing `cache_control`, THEN THE `Anthropic::Chat` formatter SHALL preserve the existing `cache_control` without duplication.
-5. WHEN a `Message` does not have `cache_point: true`, THE `Anthropic::Chat` formatter SHALL NOT add `cache_control` to any of its content blocks.
-
-### Requirement 3: Gemini Context Caching Integration
-
-**User Story:** As a developer using Gemini models, I want cache points to use Gemini's Context Caching API, so that large static context is uploaded once and reused across requests.
-
-#### Acceptance Criteria
-
-1. WHEN a `Chat` request is made with at least one `Message` marked `cache_point: true` and the model supports context caching, THE `Gemini::Chat` formatter SHALL identify the contiguous static prefix of messages up to and including the last cache-pointed message.
-2. WHEN static content has not yet been cached for the current session, THE `Gemini` provider SHALL create a `cachedContent` resource via the Gemini Context Caching API containing the static messages.
-3. WHEN a `cachedContent` resource exists for the current session, THE `Gemini` provider SHALL reference it by name in the `generateContent` request payload using the `cachedContent` field instead of re-sending the static messages inline.
-4. WHEN the `cachedContent` resource has expired or is invalid, THE `Gemini` provider SHALL recreate the cache and retry the request.
-5. THE `Gemini` provider SHALL set a configurable TTL (time-to-live) on created `cachedContent` resources, defaulting to 3600 seconds (1 hour).
-6. WHEN a model does not support Gemini context caching, THE `Gemini` provider SHALL send the full message list without a `cachedContent` reference and SHALL log a warning.
-
-### Requirement 4: Cache Token Usage Reporting
-
-**User Story:** As a developer, I want to see how many tokens were served from cache, so that I can verify caching is working and understand my cost savings.
-
-#### Acceptance Criteria
-
-1. WHEN an Anthropic response includes `cache_read_input_tokens` in its usage data, THE `Anthropic::Chat` parser SHALL populate `Message#cached_tokens` with that value.
-2. WHEN an Anthropic response includes `cache_creation_input_tokens` in its usage data, THE `Anthropic::Chat` parser SHALL populate `Message#cache_creation_tokens` with that value.
-3. WHEN a Gemini response includes `cachedContentTokenCount` in its `usageMetadata`, THE `Gemini::Chat` parser SHALL populate `Message#cached_tokens` with that value.
-4. THE `Token_Usage` object SHALL expose `cached` and `cache_creation` attributes accessible via `Message#cached_tokens` and `Message#cache_creation_tokens`.
-5. WHEN no cache tokens are present in the response, THE `Token_Usage` object SHALL return `nil` for `cached` and `cache_creation` (not zero), preserving existing behavior.
-
-### Requirement 5: Provider Capability Detection
-
-**User Story:** As a developer, I want the gem to handle cache points gracefully on providers or models that do not support caching, so that my code does not break when switching providers.
-
-#### Acceptance Criteria
-
-1. WHEN a `Message` with `cache_point: true` is sent to a provider that does not implement cache point formatting (e.g., OpenAI), THE provider SHALL silently ignore the cache point flag and send the message without any cache-related fields.
-2. WHEN a Gemini model does not support context caching, THE `Gemini` provider SHALL log a warning at the `warn` level indicating the model does not support caching and SHALL proceed with the full inline payload.
-3. THE `Model` capability data SHALL include a `supports_prompt_caching` boolean field for each model that supports it, so that providers can check support before attempting to use caching APIs.
diff --git a/.kiro/specs/prompt-caching/tasks.md b/.kiro/specs/prompt-caching/tasks.md
deleted file mode 100644
index d01d38f8a..000000000
--- a/.kiro/specs/prompt-caching/tasks.md
+++ /dev/null
@@ -1,193 +0,0 @@
-# Implementation Plan: Prompt Caching
-
-## Overview
-
-Add prompt caching support to ruby_llm for Anthropic and Gemini providers. The implementation
-touches a small surface area: `Message` gains a `cache_point` boolean, `Chat` forwards it through
-`with_instructions` and `ask`, `Anthropic::Chat` injects `cache_control` blocks, and `Gemini::Chat`
-manages the Context Caching API lifecycle. Token usage reporting already works — it just needs the
-`Message` attribute wiring confirmed.
-
-## Tasks
-
-- [x] 1. Add `cache_point` attribute to `Message`
-  - Add `attr_reader :cache_point` and `alias cache_point? cache_point` to `lib/ruby_llm/message.rb`
-  - Initialize `@cache_point = options.fetch(:cache_point, false)` in `Message#initialize`
-  - Include `cache_point: true` in `Message#to_h` only when `@cache_point` is truthy (use `.compact`)
-  - _Requirements: 1.2, 1.3, 1.4_
-
-  - [ ]* 1.1 Write property test for `cache_point?` reflects construction flag
-    - **Property 1: cache_point? reflects construction flag**
-    - For all `(role, content, flag)` combinations, `Message.new(role:, content:, cache_point: flag).cache_point?` must equal `flag`
-    - Also verify `to_h` includes `cache_point: true` only when set
-    - **Validates: Requirements 1.2, 1.3, 1.4**
-
-  - [ ]* 1.2 Write unit tests for `Message` cache_point attribute
-    - Test `cache_point?` returns `false` by default
-    - Test `cache_point?` returns `true` when constructed with `cache_point: true`
-    - Test `to_h` omits `cache_point` key when false, includes it when true
-    - _Requirements: 1.2, 1.3, 1.4_
-
-- [x] 2. Extend `Chat` to accept and forward `cache_point:`
-  - Add `cache_point: false` keyword to `Chat#with_instructions` signature in `lib/ruby_llm/chat.rb`
-  - Pass `cache_point:` through to `append_system_instruction` and `replace_system_instruction`
-  - Update both private helpers to accept `cache_point:` and pass it when constructing `Message.new`
-  - Add `cache_point: false` keyword to `Chat#ask` and pass it to `add_message`
-  - Add `@cached_content_name = nil` instance variable in `Chat#initialize` (Gemini session handle)
-  - _Requirements: 1.1, 3.3_
-
-  - [ ]* 2.1 Write unit tests for `Chat` cache_point forwarding
-    - Test that `with_instructions(..., cache_point: true)` produces a system `Message` with `cache_point? == true`
-    - Test that `ask(..., cache_point: true)` produces a user `Message` with `cache_point? == true`
-    - Test that omitting `cache_point:` leaves messages with `cache_point? == false`
-    - _Requirements: 1.1, 1.5_
-
-- [x] 3. Checkpoint — ensure all tests pass
-  - Ensure all tests pass, ask the user if questions arise.
-
-- [x] 4. Implement Anthropic `cache_control` injection
-  - Add private `inject_cache_control(blocks)` helper to `Anthropic::Chat` in `lib/ruby_llm/providers/anthropic/chat.rb`
-    - Returns `blocks` unchanged if empty
-    - Skips injection if `blocks.last` already has a `:cache_control` key (no duplication for `Content::Raw`)
-    - Otherwise merges `cache_control: { type: 'ephemeral' }` onto `blocks.last`
-  - Call `inject_cache_control(blocks)` in `build_system_content` when `msg.cache_point?`
-  - Call `inject_cache_control(content_blocks)` in `format_basic_message_with_thinking` when `msg.cache_point?`
-  - _Requirements: 2.1, 2.2, 2.3, 2.4, 2.5_
-
-  - [ ]* 4.1 Write property test for Anthropic `cache_control` injection (single message)
-    - **Property 3: Anthropic cache_control injection**
-    - For any message with `cache_point: true`, the last content block in the formatted output has `cache_control: { type: 'ephemeral' }` and no other block has `cache_control` added
-    - **Validates: Requirements 2.1, 2.2**
-
-  - [ ]* 4.2 Write property test for Anthropic multiple cache points
-    - **Property 4: Anthropic multiple cache points**
-    - For any list of N messages (1 ≤ N ≤ 4) with `cache_point: true`, the payload contains exactly N blocks with `cache_control: { type: 'ephemeral' }`
-    - **Validates: Requirements 2.3**
-
-  - [ ]* 4.3 Write unit tests for Anthropic `cache_control` injection
-    - Test single system message with `cache_point: true` → last block has `cache_control`
-    - Test single user message with `cache_point: true` → last block has `cache_control`
-    - Test `Content::Raw` block already containing `cache_control` is not duplicated
-    - Test message without `cache_point: true` → no `cache_control` added anywhere
-    - _Requirements: 2.1, 2.2, 2.4, 2.5_
-
-- [x] 5. Add `supports_prompt_caching?` to `Anthropic::Capabilities`
-  - Add `def supports_prompt_caching?(model_id) = !model_id.match?(/claude-[12]/)` to `lib/ruby_llm/providers/anthropic/capabilities.rb`
-  - _Requirements: 5.3_
-
-  - [ ]* 5.1 Write property test for Anthropic capability detection
-    - **Property 10 (Anthropic): Capability detection correctness**
-    - For all known Anthropic model IDs, `supports_prompt_caching?` returns `true` only for claude-3+ models
-    - **Validates: Requirements 5.3**
-
-- [x] 6. Checkpoint — ensure all tests pass
-  - Ensure all tests pass, ask the user if questions arise.
-
-- [x] 7. Implement Gemini static prefix extraction
-  - Add private `split_messages_at_cache_point(messages)` helper to `Gemini::Chat` in `lib/ruby_llm/providers/gemini/chat.rb`
-    - Returns `[static_prefix, dynamic_suffix]` where `static_prefix` is all messages up to and including the last `cache_point?` message
-    - Returns `[[], messages]` when no message has `cache_point?`
-  - _Requirements: 3.1_
-
-  - [ ]* 7.1 Write property test for Gemini static prefix identification
-    - **Property 5: Gemini static prefix identification**
-    - For any message list with at least one `cache_point?` message, `static_prefix == messages[0..last_cache_point_index]` and `dynamic_suffix == messages[(last_cache_point_index+1)..]`
-    - **Validates: Requirements 3.1**
-
-  - [ ]* 7.2 Write unit tests for Gemini prefix extraction
-    - Test list with one cache-pointed message at the end → full list is static, empty dynamic
-    - Test list with cache-pointed message in the middle → correct split
-    - Test list with no cache-pointed messages → empty static, full dynamic
-    - _Requirements: 3.1_
-
-- [x] 8. Implement Gemini `create_cached_content` and `render_payload` caching support
-  - Add private `create_cached_content(static_messages, model, ttl)` method to `Gemini::Chat`
-    - POSTs to `v1beta/cachedContents` with `{ model: "models/#{model.id}", contents: format_messages(static_messages), ttl: "#{ttl}s" }`
-    - Returns the `name` field from the response (e.g. `"cachedContents/abc123"`)
-  - Modify `render_payload` to accept `cached_content_name:` keyword (default `nil`)
-    - When `cached_content_name` is present: set `payload[:cachedContent] = cached_content_name` and use only the dynamic suffix in `contents`
-    - When absent: format all messages inline as today
-  - Add warning log in `render_payload` when any message has `cache_point?` but the model does not support caching (`Capabilities.supports_caching?(model.id)` is false)
-  - _Requirements: 3.2, 3.3, 3.5, 3.6_
-
-  - [ ]* 8.1 Write property test for Gemini cached payload structure
-    - **Property 6: Gemini cached payload uses cachedContent field**
-    - For any non-nil `cached_content_name` and dynamic message list, `payload[:cachedContent] == cached_content_name` and static messages are absent from `payload[:contents]`
-    - **Validates: Requirements 3.3**
-
-  - [ ]* 8.2 Write property test for Gemini TTL configuration
-    - **Property 7: Gemini TTL configuration**
-    - For any positive integer TTL value T, the cache creation request body includes `ttl: "#{T}s"`; when no TTL is configured the default is `"3600s"`
-    - **Validates: Requirements 3.5**
-
-  - [ ]* 8.3 Write property test for Gemini unsupported model degrades gracefully
-    - **Property 8: Gemini unsupported model degrades gracefully**
-    - For all unsupported Gemini model IDs and any message list, `render_payload` output does not contain `:cachedContent`
-    - **Validates: Requirements 3.6, 5.2**
-
-  - [ ]* 8.4 Write unit tests for Gemini caching payload
-    - Test `render_payload` with `cached_content_name:` set → payload has `:cachedContent`, static messages absent from `:contents`
-    - Test `render_payload` without `cached_content_name:` → no `:cachedContent` key, all messages inline
-    - Test unsupported model with `cache_point?` messages → no `:cachedContent`, warning logged
-    - _Requirements: 3.3, 3.5, 3.6_
-
-- [x] 9. Implement Gemini cache lifecycle in provider `complete` flow
-  - Override or extend the provider `complete` method (or add a pre-complete hook) in `Gemini::Chat` to:
-    - Check if any message has `cache_point?` and `Capabilities.supports_caching?(model.id)`
-    - If yes and `@cached_content_name` is nil: call `create_cached_content` with the static prefix and TTL from `params[:cache_ttl] || 3600`, store result in `@cached_content_name`
-    - Pass `cached_content_name: @cached_content_name` to `render_payload`
-    - On 404 response: clear `@cached_content_name`, recreate cache, retry once; propagate error on second failure
-  - _Requirements: 3.2, 3.3, 3.4, 3.5_
-
-  - [ ]* 9.1 Write unit tests for Gemini cache lifecycle
-    - Test first call creates `cachedContent` and stores name on chat
-    - Test second call reuses stored name without re-creating
-    - Test 404 response clears name, recreates, and retries
-    - _Requirements: 3.2, 3.3, 3.4_
-
-- [x] 10. Checkpoint — ensure all tests pass
-  - Ensure all tests pass, ask the user if questions arise.
-
-- [x] 11. Verify token usage reporting wiring
-  - Confirm `Anthropic::Chat#build_message` already reads `cache_read_input_tokens` → `cached_tokens` and `cache_creation_input_tokens` → `cache_creation_tokens` (already implemented; verify no changes needed)
-  - Confirm `Gemini::Chat#parse_completion_response` already reads `cachedContentTokenCount` → `cached_tokens` (already implemented; verify no changes needed)
-  - Add any missing wiring if found during review
-  - _Requirements: 4.1, 4.2, 4.3, 4.4, 4.5_
-
-  - [ ]* 11.1 Write property test for cache token parsing round-trip
-    - **Property 9: Cache token parsing round-trip**
-    - For any non-negative integers `(cached, cache_creation)`, the parsed `Message` exposes those exact values via `cached_tokens` and `cache_creation_tokens`; for responses without those fields both are `nil`
-    - **Validates: Requirements 4.1, 4.2, 4.3, 4.4, 4.5**
-
-- [ ] 12. Write property test for non-cached payload invariant
-  - **Property 2: Non-cached payloads are unchanged (invariant)**
-  - For any message list where no message has `cache_point: true`, the Anthropic and Gemini `render_payload` outputs are identical to the output produced without this feature
-  - **Validates: Requirements 1.5, 2.5, 5.1**
-
-- [ ] 13. Add VCR integration tests for Anthropic caching round-trip
-  - Create `spec/ruby_llm/providers/anthropic/caching_spec.rb` with VCR cassettes
-  - Test: first call with `cache_point: true` on system message → response has `cache_creation_tokens > 0`
-  - Test: second call to same chat → response has `cached_tokens > 0`
-  - Record cassettes with `rake vcr:record[anthropic]` (requires API key)
-  - _Requirements: 2.1, 4.1, 4.2_
-
-- [ ] 14. Add VCR integration tests for Gemini caching round-trip
-  - Create `spec/ruby_llm/providers/gemini/caching_spec.rb` with VCR cassettes
-  - Test: first call creates `cachedContent`, `@cached_content_name` is set on the chat object
-  - Test: second call reuses `cachedContent` name in payload, response has `cached_tokens > 0`
-  - Test: expired cache (simulate 404) triggers recreation and retry
-  - Record cassettes with `rake vcr:record[gemini]` (requires API key)
-  - _Requirements: 3.2, 3.3, 3.4, 4.3_
-
-- [x] 15. Final checkpoint — ensure all tests pass
-  - Ensure all tests pass, ask the user if questions arise.
-
-## Notes
-
-- Tasks marked with `*` are optional and can be skipped for a faster MVP
-- Property tests should use `rantly` or `propcheck` with a minimum of 100 iterations each
-- Never edit `models.json` or `aliases.json`
-- VCR cassettes must be checked for leaked API keys before committing
-- Run `overcommit --install` and `overcommit --run` before opening a PR
-- The Gemini Context Caching API requires a minimum of ~32,768 tokens in the static prefix; shorter content will return an API error that propagates as `RubyLLM::Error`
-- Anthropic supports up to 4 `cache_control` breakpoints per request; exceeding this limit returns a 400 that propagates as `RubyLLM::Error`
diff --git a/.overcommit.yml b/.overcommit.yml
index 04e314c28..4633f1939 100644
--- a/.overcommit.yml
+++ b/.overcommit.yml
@@ -4,8 +4,8 @@ PreCommit:
     auto_correct: true
     on_warn: fail # Treat all warnings as failures
 
-  # Flay:
-  #   enabled: true
+  Flay:
+    enabled: true
 
   RSpec:
     enabled: true
diff --git a/lib/ruby_llm/error.rb b/lib/ruby_llm/error.rb
index e63bef018..04ec8c466 100644
--- a/lib/ruby_llm/error.rb
+++ b/lib/ruby_llm/error.rb
@@ -29,7 +29,6 @@ class UnsupportedAttachmentError < StandardError; end
   class BadRequestError < Error; end
   class ForbiddenError < Error; end
   class ContextLengthExceededError < Error; end
-  class NotFoundError < Error; end
   class OverloadedError < Error; end
   class PaymentRequiredError < Error; end
   class RateLimitError < Error; end
@@ -70,7 +69,11 @@ def parse_error(provider:, response:) # rubocop:disable Metrics/PerceivedComplex
         when 200..399
           message
         when 400
-          raise_with_context_check(BadRequestError, response, message, 'Invalid request - check your input')
+          if context_length_exceeded?(message)
+            raise ContextLengthExceededError.new(response, message || 'Context length exceeded')
+          end
+
+          raise BadRequestError.new(response, message || 'Invalid request - please check your input')
         when 401
           raise UnauthorizedError.new(response, message || 'Invalid API key - check your credentials')
         when 402
@@ -78,10 +81,12 @@ def parse_error(provider:, response:) # rubocop:disable Metrics/PerceivedComplex
         when 403
           raise ForbiddenError.new(response,
                                    message || 'Forbidden - you do not have permission to access this resource')
-        when 404
-          raise NotFoundError.new(response, message || 'Resource not found')
         when 429
-          raise_with_context_check(RateLimitError, response, message, 'Rate limit exceeded - please wait')
+          if context_length_exceeded?(message)
+            raise ContextLengthExceededError.new(response, message || 'Context length exceeded')
+          end
+
+          raise RateLimitError.new(response, message || 'Rate limit exceeded - please wait a moment')
         when 500
           raise ServerError.new(response, message || 'API server error - please try again')
         when 502..504
@@ -95,14 +100,6 @@ def parse_error(provider:, response:) # rubocop:disable Metrics/PerceivedComplex
 
       private
 
-      def raise_with_context_check(error_class, response, message, default_msg)
-        if context_length_exceeded?(message)
-          raise ContextLengthExceededError.new(response, message || 'Context length exceeded')
-        end
-
-        raise error_class.new(response, message || default_msg)
-      end
-
       def context_length_exceeded?(message)
         return false if message.to_s.empty?
 
diff --git a/lib/ruby_llm/providers/anthropic.rb b/lib/ruby_llm/providers/anthropic.rb
index aac2fde9a..f7e85ff32 100644
--- a/lib/ruby_llm/providers/anthropic.rb
+++ b/lib/ruby_llm/providers/anthropic.rb
@@ -22,14 +22,12 @@ def headers
         }
       end
 
-      # rubocop:disable Metrics/ParameterLists
-      def complete(messages, tools:, temperature:, model:, params: {}, headers: {}, schema: nil, thinking: nil,
-                   tool_prefs: nil, &block)
+      def complete(messages, headers: {}, **kwargs, &block)
         headers = headers.merge('anthropic-beta' => 'prompt-caching-2024-07-31') if messages.any?(&:cache_point?)
 
-        super
+        super(messages, headers: headers, **kwargs, &block) # rubocop:disable Style/SuperArguments
+        # Ignoring as we're modifying headers before calling super. We need to call super with modified headers.
       end
-      # rubocop:enable Metrics/ParameterLists
 
       class << self
         def capabilities
diff --git a/spec/ruby_llm/generators/chat_ui_generator_spec.rb b/spec/ruby_llm/generators/chat_ui_generator_spec.rb
index c32b5ff44..612729dd9 100644
--- a/spec/ruby_llm/generators/chat_ui_generator_spec.rb
+++ b/spec/ruby_llm/generators/chat_ui_generator_spec.rb
@@ -134,13 +134,13 @@
         expect(message_content).to include('acts_as_message')
 
         # Check broadcasting setup
-        expect(message_content).to include(%{broadcasts_to ->(message) { "chat_#{message.chat_id}" }})
+        expect(message_content).to include(%q(broadcasts_to ->(message) { "chat_#{message.chat_id}" }))
         expect(message_content).to include('inserts_by: :append')
 
         # Check broadcast_append_chunk method
         expect(message_content).to include('def broadcast_append_chunk(content)')
-        expect(message_content).to include(%(broadcast_append_to "chat_#{chat_id}"))
-        expect(message_content).to include(%(target: "message_#{id}_content"))
+        expect(message_content).to include(%q(broadcast_append_to "chat_#{chat_id}"))
+        expect(message_content).to include(%q(target: "message_#{id}_content"))
         expect(message_content).to include('content: ERB::Util.html_escape(content.to_s)')
       end
     end
@@ -370,13 +370,13 @@ def expect_chat_script_to_succeed(script)
         expect(message_content).to include("model: :llm_model, model_class: 'Llm::Model'")
 
         # Check broadcasting setup
-        expect(message_content).to include(%{broadcasts_to ->(llm_message) { "llm_chat_#{llm_message.llm_chat_id}" }})
+        expect(message_content).to include(%q(broadcasts_to ->(llm_message) { "llm_chat_#{llm_message.llm_chat_id}" }))
         expect(message_content).to include('inserts_by: :append')
 
         # Check broadcast_append_chunk method
         expect(message_content).to include('def broadcast_append_chunk(content)')
-        expect(message_content).to include(%(broadcast_append_to "llm_chat_#{llm_chat_id}"))
-        expect(message_content).to include(%(target: "llm_message_#{id}_content"))
+        expect(message_content).to include(%q(broadcast_append_to "llm_chat_#{llm_chat_id}"))
+        expect(message_content).to include(%q(target: "llm_message_#{id}_content"))
         expect(message_content).to include('content: ERB::Util.html_escape(content.to_s)')
       end
     end

From df8353ffb258d0f8068a62224d682f9639d1790c Mon Sep 17 00:00:00 2001
From: arun kumar <arunkumar.ry1@gmail.com>
Date: Thu, 2 Apr 2026 19:35:22 +0530
Subject: [PATCH 4/7] revert anthropic capabilities changes

---
 .overcommit.yml                               |   6 +
 .../providers/anthropic/capabilities.rb       | 109 -----
 .../generators/chat_ui_generator_spec.rb      | 457 ++++++------------
 3 files changed, 159 insertions(+), 413 deletions(-)

diff --git a/.overcommit.yml b/.overcommit.yml
index 4633f1939..bb45ad9e9 100644
--- a/.overcommit.yml
+++ b/.overcommit.yml
@@ -6,6 +6,12 @@ PreCommit:
 
   Flay:
     enabled: true
+    include:
+     - 'lib/ruby_llm/**/*.rb'
+    exclude:
+     - 'lib/ruby_llm/providers/**/*.rb'
+     - 'lib/ruby_llm/active_record/acts_as_legacy.rb'
+    mass_threshold: 70
 
   RSpec:
     enabled: true
diff --git a/lib/ruby_llm/providers/anthropic/capabilities.rb b/lib/ruby_llm/providers/anthropic/capabilities.rb
index 27f6c850b..730f12301 100644
--- a/lib/ruby_llm/providers/anthropic/capabilities.rb
+++ b/lib/ruby_llm/providers/anthropic/capabilities.rb
@@ -14,115 +14,6 @@ def supports_tool_choice?(_model_id)
         def supports_tool_parallel_control?(_model_id)
           true
         end
-
-        def supports_json_mode?(model_id)
-          !model_id.match?(/claude-[12]/)
-        end
-
-        def supports_structured_output?(model_id)
-          match = model_id.match(/claude-(?:sonnet|opus|haiku)-(\d+)-(\d+)/)
-          return false unless match
-
-          major = match[1].to_i
-          minor = match[2].to_i
-          major > 4 || (major == 4 && minor >= 5)
-        end
-
-        def supports_extended_thinking?(model_id)
-          model_id.match?(/claude-3-7-sonnet/)
-        end
-
-        def supports_prompt_caching?(model_id)
-          !model_id.match?(/claude-[12]/)
-        end
-
-        def model_family(model_id)
-          case model_id
-          when /claude-3-7-sonnet/  then 'claude-3-7-sonnet'
-          when /claude-3-5-sonnet/  then 'claude-3-5-sonnet'
-          when /claude-3-5-haiku/   then 'claude-3-5-haiku'
-          when /claude-3-opus/      then 'claude-3-opus'
-          when /claude-3-sonnet/    then 'claude-3-sonnet'
-          when /claude-3-haiku/     then 'claude-3-haiku'
-          else 'claude-2'
-          end
-        end
-
-        def model_type(_)
-          'chat'
-        end
-
-        PRICES = {
-          'claude-3-7-sonnet': { input: 3.0, output: 15.0 },
-          'claude-3-5-sonnet': { input: 3.0, output: 15.0 },
-          'claude-3-5-haiku': { input: 0.80, output: 4.0 },
-          'claude-3-opus': { input: 15.0, output: 75.0 },
-          'claude-3-haiku': { input: 0.25, output: 1.25 },
-          'claude-2': { input: 3.0, output: 15.0 }
-        }.freeze
-
-        def default_input_price
-          3.0
-        end
-
-        def default_output_price
-          15.0
-        end
-
-        def modalities_for(model_id)
-          modalities = {
-            input: ['text'],
-            output: ['text']
-          }
-
-          unless model_id.match?(/claude-[12]/)
-            modalities[:input] << 'image'
-            modalities[:input] << 'pdf'
-          end
-
-          modalities
-        end
-
-        def capabilities_for(model_id)
-          capabilities = ['streaming']
-
-          unless model_id.match?(/claude-[12]/)
-            capabilities << 'function_calling'
-            capabilities << 'batch'
-          end
-
-          capabilities << 'structured_output' if supports_structured_output?(model_id)
-          capabilities << 'reasoning' if model_id.match?(/claude-3-7-sonnet|claude-(?:sonnet|opus|haiku)-4/)
-          capabilities << 'citations' if model_id.match?(/claude-3\.5|claude-3-7/)
-          capabilities
-        end
-
-        def pricing_for(model_id)
-          family = model_family(model_id)
-          prices = PRICES.fetch(family.to_sym, { input: default_input_price, output: default_output_price })
-
-          standard_pricing = {
-            input_per_million: prices[:input],
-            output_per_million: prices[:output]
-          }
-
-          batch_pricing = {
-            input_per_million: prices[:input] * 0.5,
-            output_per_million: prices[:output] * 0.5
-          }
-
-          if model_id.match?(/claude-3-7/)
-            standard_pricing[:reasoning_output_per_million] = prices[:output] * 2.5
-            batch_pricing[:reasoning_output_per_million] = prices[:output] * 1.25
-          end
-
-          {
-            text_tokens: {
-              standard: standard_pricing,
-              batch: batch_pricing
-            }
-          }
-        end
       end
     end
   end
diff --git a/spec/ruby_llm/generators/chat_ui_generator_spec.rb b/spec/ruby_llm/generators/chat_ui_generator_spec.rb
index 612729dd9..02e7e0f6f 100644
--- a/spec/ruby_llm/generators/chat_ui_generator_spec.rb
+++ b/spec/ruby_llm/generators/chat_ui_generator_spec.rb
@@ -11,194 +11,17 @@
   let(:rails_root) { Rails.root }
   let(:template_path) { File.expand_path('../../fixtures/templates', __dir__) }
 
-  describe 'with default model names' do
-    let(:app_name) { 'test_app_default' }
-    let(:app_path) { File.join(Dir.tmpdir, app_name) }
-
-    before(:all) do # rubocop:disable RSpec/BeforeAfterAll
-      template_path = File.expand_path('../../fixtures/templates', __dir__)
-      GeneratorTestHelpers.cleanup_test_app(File.join(Dir.tmpdir, 'test_app_default'))
-      GeneratorTestHelpers.create_test_app('test_app_default',
-                                           template: 'default_models_template.rb',
-                                           template_path: template_path)
-    end
-
-    after(:all) do # rubocop:disable RSpec/BeforeAfterAll
-      GeneratorTestHelpers.cleanup_test_app(File.join(Dir.tmpdir, 'test_app_default'))
-    end
-
-    it 'creates controller files with default names' do
-      within_test_app(app_path) do
-        expect(File.exist?('app/controllers/chats_controller.rb')).to be true
-        expect(File.exist?('app/controllers/messages_controller.rb')).to be true
-        expect(File.exist?('app/controllers/models_controller.rb')).to be true
-        expect(File.exist?('app/helpers/messages_helper.rb')).to be true
-
-        messages_helper = File.read('app/helpers/messages_helper.rb')
-        expect(messages_helper).to include('def default_model_display_name')
-        expect(messages_helper).not_to include('def llm_model_label(model)')
-        expect(messages_helper).to include('RubyLLM.models.find(RubyLLM.config.default_model).label')
-        expect(messages_helper).to include('def tool_result_partial(message)')
-        expect(messages_helper).to include('def tool_call_partial(tool_call)')
-        expect(messages_helper).not_to include('def model_display_name(model)')
-        expect(messages_helper).not_to include('def provider_display_name(model_or_provider)')
-        expect(messages_helper).not_to include('def parse_tool_payload(content)')
-        expect(messages_helper).not_to include('def llm_model_info(model)')
-      end
-    end
-
-    it 'creates view files with default paths' do
-      within_test_app(app_path) do
-        # Chat views
-        expect(File.exist?('app/views/chats/index.html.erb')).to be true
-        expect(File.exist?('app/views/chats/new.html.erb')).to be true
-        expect(File.exist?('app/views/chats/show.html.erb')).to be true
-        expect(File.exist?('app/views/chats/_chat.html.erb')).to be true
-        expect(File.exist?('app/views/chats/_form.html.erb')).to be true
-
-        # Message views
-        expect(File.exist?('app/views/messages/_assistant.html.erb')).to be true
-        expect(File.exist?('app/views/messages/_user.html.erb')).to be true
-        expect(File.exist?('app/views/messages/_system.html.erb')).to be true
-        expect(File.exist?('app/views/messages/_tool.html.erb')).to be true
-        expect(File.exist?('app/views/messages/_error.html.erb')).to be true
-        expect(File.exist?('app/views/messages/_content.html.erb')).to be true
-        expect(File.exist?('app/views/messages/_tool_calls.html.erb')).to be true
-        expect(File.exist?('app/views/messages/tool_calls/_default.html.erb')).to be true
-        expect(File.exist?('app/views/messages/tool_results/_default.html.erb')).to be true
-        expect(File.exist?('app/views/messages/create.turbo_stream.erb')).to be true
-        expect(File.exist?('app/views/messages/_form.html.erb')).to be true
-
-        user_partial = File.read('app/views/messages/_user.html.erb')
-        expect(user_partial).to include('user.content')
-        expect(user_partial).to include('local_assigns[:message]')
-        assistant_partial = File.read('app/views/messages/_assistant.html.erb')
-        expect(assistant_partial).to include('assistant.content')
-        expect(assistant_partial).to include('local_assigns[:message]')
-        system_partial = File.read('app/views/messages/_system.html.erb')
-        expect(system_partial).to include('system.content')
-        expect(system_partial).to include('local_assigns[:message]')
-        tool_partial = File.read('app/views/messages/_tool.html.erb')
-        expect(tool_partial).to include('render tool_result_partial(tool), tool: tool')
-        tool_calls_partial = File.read('app/views/messages/_tool_calls.html.erb')
-        expect(tool_calls_partial).to include('tool_calls: tool_calls, tool_call: tool_call')
-        expect(tool_calls_partial).to include('local_assigns[:message]')
-        tool_results_default = File.read('app/views/messages/tool_results/_default.html.erb')
-        expect(tool_results_default).to include('tool.tool_error_message')
-        chat_form = File.read('app/views/chats/_form.html.erb')
-        expect(chat_form).to include('@chat_models.map')
-        expect(chat_form).to include('[model.label, model.id]')
-        expect(chat_form).to include('default_model_display_name')
-        create_stream = File.read('app/views/messages/create.turbo_stream.erb')
-        expect(create_stream).to include('turbo_stream.replace "new_message"')
-        expect(create_stream).to include('render "messages/form"')
-
-        # Model views
-        expect(File.exist?('app/views/models/index.html.erb')).to be true
-        expect(File.exist?('app/views/models/show.html.erb')).to be true
-        expect(File.exist?('app/views/models/_model.html.erb')).to be true
-        models_index = File.read('app/views/models/index.html.erb')
-        expect(models_index).to include('@models.each do |model_info|')
-        expect(models_index).to include('render "models/model",')
-      end
-    end
-
-    it 'uses scaffold-style inline styles by default' do
-      within_test_app(app_path) do
-        index_view = File.read('app/views/chats/index.html.erb')
-        expect(index_view).to include('<p style="color: green">')
-        expect(index_view).not_to include('text-green-700')
-      end
-    end
-
-    it 'creates job file with default name' do
-      within_test_app(app_path) do
-        expect(File.exist?('app/jobs/chat_response_job.rb')).to be true
-      end
-    end
-
-    it 'adds routes for default controllers' do
-      within_test_app(app_path) do
-        routes_content = File.read('config/routes.rb')
-        expect(routes_content).to include('resources :chats')
-        expect(routes_content).to include('resources :messages, only: [ :create ]')
-        expect(routes_content).to include('resources :models, only: [ :index, :show ]')
-      end
-    end
-
-    it 'adds broadcasting to message model' do
-      within_test_app(app_path) do
-        message_content = File.read('app/models/message.rb')
-
-        # Check the acts_as_message declaration
-        expect(message_content).to include('acts_as_message')
-
-        # Check broadcasting setup
-        expect(message_content).to include(%q(broadcasts_to ->(message) { "chat_#{message.chat_id}" }))
-        expect(message_content).to include('inserts_by: :append')
-
-        # Check broadcast_append_chunk method
-        expect(message_content).to include('def broadcast_append_chunk(content)')
-        expect(message_content).to include(%q(broadcast_append_to "chat_#{chat_id}"))
-        expect(message_content).to include(%q(target: "message_#{id}_content"))
-        expect(message_content).to include('content: ERB::Util.html_escape(content.to_s)')
-      end
-    end
-
-    it 'controllers reference correct model classes' do
-      within_test_app(app_path) do
-        chats_controller = File.read('app/controllers/chats_controller.rb')
-        expect(chats_controller).to include('class ChatsController')
-        expect(chats_controller).to include('Chat.find')
-        expect(chats_controller).to include('@chat = Chat.new')
-        expect(chats_controller).to include('@chat_models = available_chat_models')
-        expect(chats_controller).to include('prompt = params.dig(:chat, :prompt)')
-        expect(chats_controller).to include('if prompt.present?')
-        expect(chats_controller).to include('@chat = Chat.create!(model: params.dig(:chat, :model).presence)')
-        expect(chats_controller).not_to include('def model')
-        expect(chats_controller).not_to include('def prompt')
-
-        messages_controller = File.read('app/controllers/messages_controller.rb')
-        expect(messages_controller).to include('class MessagesController')
-        expect(messages_controller).to include('@chat = Chat.find(params[:chat_id])')
-        expect(messages_controller).to include('content = params.dig(:message, :content)')
-        expect(messages_controller).to include('if content.present?')
-        expect(messages_controller).to include('ChatResponseJob.perform_later')
-        expect(messages_controller).to include('format.turbo_stream')
-        expect(messages_controller).not_to include('def content')
-
-        models_controller = File.read('app/controllers/models_controller.rb')
-        expect(models_controller).to include('class ModelsController')
-        expect(models_controller).to include('@models = available_chat_models')
-
-        application_controller = File.read('app/controllers/application_controller.rb')
-        expect(application_controller).to include('def available_chat_models')
-        expect(application_controller).to include('sort_by { |model| [ model.provider.to_s, model.name.to_s ] }')
-      end
-    end
-
-    it 'job references correct model classes' do
-      within_test_app(app_path) do
-        job_content = File.read('app/jobs/chat_response_job.rb')
-        expect(job_content).to include('class ChatResponseJob')
-        expect(job_content).to include('chat = Chat.find(chat_id)')
-        expect(job_content).to include('chat.ask(content)')
-        expect(job_content).to include('message = chat.messages.last')
-      end
-    end
-
-    it 'chat functionality works correctly' do
-      within_test_app(app_path) do
-        test_script = <<~RUBY
-          ActiveJob::Base.queue_adapter = :inline
-          chat = Chat.create!
-          message = chat.messages.create!(role: :user, content: 'Test')
-          exit(message.chat_id == chat.id ? 0 : 1)
-        RUBY
-        success, output = run_rails_runner(test_script)
-        expect(success).to be(true), output
-      end
-    end
+  def expect_messages_helper_content(path)
+    messages_helper = File.read(path)
+    expect(messages_helper).to include('def default_model_display_name')
+    expect(messages_helper).not_to include('def llm_model_label(model)')
+    expect(messages_helper).to include('RubyLLM.models.find(RubyLLM.config.default_model).label')
+    expect(messages_helper).to include('def tool_result_partial(message)')
+    expect(messages_helper).to include('def tool_call_partial(tool_call)')
+    expect(messages_helper).not_to include('def model_display_name(model)')
+    expect(messages_helper).not_to include('def provider_display_name(model_or_provider)')
+    expect(messages_helper).not_to include('def parse_tool_payload(content)')
+    expect(messages_helper).not_to include('def llm_model_info(model)')
   end
 
   def expect_generated_view_set(
@@ -310,122 +133,148 @@ def expect_chat_script_to_succeed(script)
     expect(success).to be(true), output
   end
 
-        user_partial = File.read('app/views/llm/messages/_user.html.erb')
-        expect(user_partial).to include('user.content')
-        expect(user_partial).to include('local_assigns[:message]')
-        assistant_partial = File.read('app/views/llm/messages/_assistant.html.erb')
-        expect(assistant_partial).to include('assistant.content')
-        expect(assistant_partial).to include('local_assigns[:message]')
-        system_partial = File.read('app/views/llm/messages/_system.html.erb')
-        expect(system_partial).to include('system.content')
-        expect(system_partial).to include('local_assigns[:message]')
-        tool_partial = File.read('app/views/llm/messages/_tool.html.erb')
-        expect(tool_partial).to include('render tool_result_partial(tool), tool: tool')
-        tool_calls_partial = File.read('app/views/llm/messages/_tool_calls.html.erb')
-        expect(tool_calls_partial).to include('tool_calls: tool_calls, tool_call: tool_call')
-        expect(tool_calls_partial).to include('local_assigns[:message]')
-        tool_results_default = File.read('app/views/llm/messages/tool_results/_default.html.erb')
-        expect(tool_results_default).to include('tool.tool_error_message')
-        chat_form = File.read('app/views/llm/chats/_form.html.erb')
-        expect(chat_form).to include('@chat_models.map')
-        expect(chat_form).to include('[model.label, model.id]')
-        expect(chat_form).to include('default_model_display_name')
-        create_stream = File.read('app/views/llm/messages/create.turbo_stream.erb')
-        expect(create_stream).to include('turbo_stream.replace "new_llm_message"')
-        expect(create_stream).to include('render "llm/messages/form"')
-
-        # Model views
-        expect(File.exist?('app/views/llm/models/index.html.erb')).to be true
-        expect(File.exist?('app/views/llm/models/show.html.erb')).to be true
-        expect(File.exist?('app/views/llm/models/_model.html.erb')).to be true
-        models_index = File.read('app/views/llm/models/index.html.erb')
-        expect(models_index).to include('@llm_models.each do |model_info|')
-        expect(models_index).to include('render "llm/models/model",')
-      end
-    end
-
-    it 'creates job file with namespaced name' do
-      within_test_app(app_path) do
-        expect(File.exist?('app/jobs/llm_chat_response_job.rb')).to be true
-      end
-    end
-
-    it 'adds routes for namespaced controllers' do
-      within_test_app(app_path) do
-        routes_content = File.read('config/routes.rb')
-        expect(routes_content).to include('namespace :llm')
-        expect(routes_content).to include('resources :chats')
-        expect(routes_content).to include('resources :messages, only: [ :create ]')
-        expect(routes_content).to include('resources :models, only: [ :index, :show ]')
-      end
-    end
-
-    it 'adds broadcasting to namespaced message model' do
-      within_test_app(app_path) do
-        message_content = File.read('app/models/llm/message.rb')
-
-        # Check the acts_as_message declaration
-        expect(message_content).to include("acts_as_message chat: :llm_chat, chat_class: 'Llm::Chat'")
-        expect(message_content).to include("tool_calls: :llm_tool_calls, tool_call_class: 'Llm::ToolCall'")
-        expect(message_content).to include("model: :llm_model, model_class: 'Llm::Model'")
-
-        # Check broadcasting setup
-        expect(message_content).to include(%q(broadcasts_to ->(llm_message) { "llm_chat_#{llm_message.llm_chat_id}" }))
-        expect(message_content).to include('inserts_by: :append')
-
-        # Check broadcast_append_chunk method
-        expect(message_content).to include('def broadcast_append_chunk(content)')
-        expect(message_content).to include(%q(broadcast_append_to "llm_chat_#{llm_chat_id}"))
-        expect(message_content).to include(%q(target: "llm_message_#{id}_content"))
-        expect(message_content).to include('content: ERB::Util.html_escape(content.to_s)')
-      end
-    end
-
-    it 'controllers reference correct namespaced model classes' do
-      within_test_app(app_path) do
-        chats_controller = File.read('app/controllers/llm/chats_controller.rb')
-        expect(chats_controller).to include('class Llm::ChatsController')
-        expect(chats_controller).to include('Llm::Chat.find')
-        expect(chats_controller).to include('@llm_chat = Llm::Chat.new')
-        expect(chats_controller).to include('@chat_models = available_chat_models')
-        expect(chats_controller).to include('prompt = params.dig(:llm_chat, :prompt)')
-        expect(chats_controller).to include('if prompt.present?')
-        expect(chats_controller).to include('@llm_chat = Llm::Chat.create!(model:')
-        expect(chats_controller).to include('params.dig(:llm_chat, :model).presence)')
-        expect(chats_controller).not_to include('def model')
-        expect(chats_controller).not_to include('def prompt')
-
-        messages_controller = File.read('app/controllers/llm/messages_controller.rb')
-        expect(messages_controller).to include('class Llm::MessagesController')
-        expect(messages_controller).to include('@llm_chat = Llm::Chat.find(params[:chat_id])')
-        expect(messages_controller).to include('content = params.dig(:llm_message, :content)')
-        expect(messages_controller).to include('if content.present?')
-        expect(messages_controller).to include('LlmChatResponseJob.perform_later')
-        expect(messages_controller).to include('format.turbo_stream')
-        expect(messages_controller).not_to include('def content')
-
-        models_controller = File.read('app/controllers/llm/models_controller.rb')
-        expect(models_controller).to include('class Llm::ModelsController')
-        expect(models_controller).to include('@llm_models = available_chat_models')
-
-        application_controller = File.read('app/controllers/application_controller.rb')
-        expect(application_controller).to include('def available_chat_models')
-        expect(application_controller).to include('sort_by { |model| [ model.provider.to_s, model.name.to_s ] }')
-      end
-    end
-
-    it 'job references correct namespaced model classes' do
-      within_test_app(app_path) do
-        job_content = File.read('app/jobs/llm_chat_response_job.rb')
-        expect(job_content).to include('class LlmChatResponseJob')
-        expect(job_content).to include('llm_chat = Llm::Chat.find(llm_chat_id)')
-        expect(job_content).to include('llm_chat.ask(content)')
-        expect(job_content).to include('llm_message = llm_chat.llm_messages.last')
-      end
-    end
-
-    it 'views use correct partial paths' do
-      within_test_app(app_path) do
+  {
+    'with default model names' => {
+      app_name: 'test_app_default',
+      template_name: 'default_models_template.rb',
+      controller_example: 'creates controller files with default names',
+      controller_paths: %w[
+        app/controllers/chats_controller.rb
+        app/controllers/messages_controller.rb
+        app/controllers/models_controller.rb
+      ],
+      helper_path: 'app/helpers/messages_helper.rb',
+      view_example: 'creates view files with default paths',
+      view_options: {
+        base_path: 'app/views',
+        chats_target: 'new_message',
+        form_partial_path: 'messages/form',
+        model_index_collection: 'models',
+        model_partial_path: 'models/model'
+      },
+      job_file_example: 'creates job file with default name',
+      job_file_path: 'app/jobs/chat_response_job.rb',
+      routes_example: 'adds routes for default controllers',
+      namespaced_routes: false,
+      broadcasting_example: 'adds broadcasting to message model',
+      broadcasting_options: {
+        path: 'app/models/message.rb',
+        acts_as_message_lines: ['acts_as_message'],
+        broadcasts_to_line: "broadcasts_to ->(message) { \"chat_\#{message.chat_id}\" }",
+        broadcast_target_line: "broadcast_append_to \"chat_\#{chat_id}\"",
+        content_target_line: "target: \"message_\#{id}_content\""
+      },
+      controllers_example: 'controllers reference correct model classes',
+      chats_controller_path: 'app/controllers/chats_controller.rb',
+      chats_controller_expectations: [
+        'class ChatsController',
+        'Chat.find',
+        '@chat = Chat.new',
+        '@chat_models = available_chat_models',
+        'prompt = params.dig(:chat, :prompt)',
+        'if prompt.present?',
+        '@chat = Chat.create!(model: params.dig(:chat, :model).presence)'
+      ],
+      messages_controller_path: 'app/controllers/messages_controller.rb',
+      messages_controller_expectations: [
+        'class MessagesController',
+        '@chat = Chat.find(params[:chat_id])',
+        'content = params.dig(:message, :content)',
+        'if content.present?',
+        'ChatResponseJob.perform_later',
+        'format.turbo_stream'
+      ],
+      models_controller_path: 'app/controllers/models_controller.rb',
+      models_controller_expectations: [
+        'class ModelsController',
+        '@models = available_chat_models'
+      ],
+      job_example: 'job references correct model classes',
+      job_options: {
+        path: 'app/jobs/chat_response_job.rb',
+        class_name: 'class ChatResponseJob',
+        lookup_line: 'chat = Chat.find(chat_id)',
+        ask_line: 'chat.ask(content)',
+        last_message_line: 'message = chat.messages.last'
+      },
+      functionality_example: 'chat functionality works correctly',
+      functionality_script: <<~RUBY
+        ActiveJob::Base.queue_adapter = :inline
+        chat = Chat.create!
+        message = chat.messages.create!(role: :user, content: 'Test')
+        exit(message.chat_id == chat.id ? 0 : 1)
+      RUBY
+    },
+    'with namespaced model names' => {
+      app_name: 'test_app_namespaced',
+      template_name: 'namespaced_models_template.rb',
+      controller_example: 'creates controller files with namespaced paths',
+      controller_paths: %w[
+        app/controllers/llm/chats_controller.rb
+        app/controllers/llm/messages_controller.rb
+        app/controllers/llm/models_controller.rb
+      ],
+      helper_path: 'app/helpers/llm/messages_helper.rb',
+      view_example: 'creates view files with namespaced paths',
+      view_options: {
+        base_path: 'app/views/llm',
+        chats_target: 'new_llm_message',
+        form_partial_path: 'llm/messages/form',
+        model_index_collection: 'llm_models',
+        model_partial_path: 'llm/models/model'
+      },
+      job_file_example: 'creates job file with namespaced name',
+      job_file_path: 'app/jobs/llm_chat_response_job.rb',
+      routes_example: 'adds routes for namespaced controllers',
+      namespaced_routes: true,
+      broadcasting_example: 'adds broadcasting to namespaced message model',
+      broadcasting_options: {
+        path: 'app/models/llm/message.rb',
+        acts_as_message_lines: [
+          "acts_as_message chat: :llm_chat, chat_class: 'Llm::Chat'",
+          "tool_calls: :llm_tool_calls, tool_call_class: 'Llm::ToolCall'",
+          "model: :llm_model, model_class: 'Llm::Model'"
+        ],
+        broadcasts_to_line: "broadcasts_to ->(llm_message) { \"llm_chat_\#{llm_message.llm_chat_id}\" }",
+        broadcast_target_line: "broadcast_append_to \"llm_chat_\#{llm_chat_id}\"",
+        content_target_line: "target: \"llm_message_\#{id}_content\""
+      },
+      controllers_example: 'controllers reference correct namespaced model classes',
+      chats_controller_path: 'app/controllers/llm/chats_controller.rb',
+      chats_controller_expectations: [
+        'class Llm::ChatsController',
+        'Llm::Chat.find',
+        '@llm_chat = Llm::Chat.new',
+        '@chat_models = available_chat_models',
+        'prompt = params.dig(:llm_chat, :prompt)',
+        'if prompt.present?',
+        '@llm_chat = Llm::Chat.create!(model:',
+        'params.dig(:llm_chat, :model).presence)'
+      ],
+      messages_controller_path: 'app/controllers/llm/messages_controller.rb',
+      messages_controller_expectations: [
+        'class Llm::MessagesController',
+        '@llm_chat = Llm::Chat.find(params[:chat_id])',
+        'content = params.dig(:llm_message, :content)',
+        'if content.present?',
+        'LlmChatResponseJob.perform_later',
+        'format.turbo_stream'
+      ],
+      models_controller_path: 'app/controllers/llm/models_controller.rb',
+      models_controller_expectations: [
+        'class Llm::ModelsController',
+        '@llm_models = available_chat_models'
+      ],
+      job_example: 'job references correct namespaced model classes',
+      job_options: {
+        path: 'app/jobs/llm_chat_response_job.rb',
+        class_name: 'class LlmChatResponseJob',
+        lookup_line: 'llm_chat = Llm::Chat.find(llm_chat_id)',
+        ask_line: 'llm_chat.ask(content)',
+        last_message_line: 'llm_message = llm_chat.llm_messages.last'
+      },
+      extra_view_example: 'views use correct partial paths',
+      extra_view_assertions: lambda do
         show_view = File.read('app/views/llm/chats/show.html.erb')
         expect(show_view).to include('render')
         expect(show_view).to include('render "llm/messages/form"')

From bf8fa446eccc0e9fbbc8dcda6ba15f6c25be515f Mon Sep 17 00:00:00 2001
From: arun kumar <arunkumar.ry1@gmail.com>
Date: Tue, 7 Apr 2026 16:05:40 +0530
Subject: [PATCH 5/7] Add prompt caching support for Bedrock provider

---
 lib/ruby_llm/providers/bedrock/chat.rb        | 32 +++++---
 .../bedrock/chat_cache_point_spec.rb          | 77 +++++++++++++++++++
 2 files changed, 97 insertions(+), 12 deletions(-)
 create mode 100644 spec/ruby_llm/providers/bedrock/chat_cache_point_spec.rb

diff --git a/lib/ruby_llm/providers/bedrock/chat.rb b/lib/ruby_llm/providers/bedrock/chat.rb
index c39fa2942..a2dbfa415 100644
--- a/lib/ruby_llm/providers/bedrock/chat.rb
+++ b/lib/ruby_llm/providers/bedrock/chat.rb
@@ -114,21 +114,26 @@ def render_message_content(msg)
           text_and_media_blocks = Media.render_content(msg.content, used_document_names: @used_document_names)
           blocks.concat(text_and_media_blocks) if text_and_media_blocks
 
-          if msg.tool_call?
-            msg.tool_calls.each_value do |tool_call|
-              blocks << {
-                toolUse: {
-                  toolUseId: tool_call.id,
-                  name: tool_call.name,
-                  input: tool_call.arguments
-                }
-              }
-            end
-          end
+          append_tool_use_blocks(blocks, msg)
+          blocks << { cachePoint: { type: 'default' } } if msg.cache_point?
 
           blocks
         end
 
+        def append_tool_use_blocks(blocks, msg)
+          return unless msg.tool_call?
+
+          msg.tool_calls.each_value do |tool_call|
+            blocks << {
+              toolUse: {
+                toolUseId: tool_call.id,
+                name: tool_call.name,
+                input: tool_call.arguments
+              }
+            }
+          end
+        end
+
         def render_raw_content(content)
           value = content.value
           value.is_a?(Array) ? value : [value]
@@ -200,7 +205,10 @@ def render_role(role)
         end
 
         def render_system(messages)
-          messages.flat_map { |msg| Media.render_content(msg.content, used_document_names: @used_document_names) }
+          messages.flat_map do |msg|
+            blocks = Media.render_content(msg.content, used_document_names: @used_document_names)
+            msg.cache_point? ? blocks + [{ cachePoint: { type: 'default' } }] : blocks
+          end
         end
 
         def render_inference_config(_model, temperature)
diff --git a/spec/ruby_llm/providers/bedrock/chat_cache_point_spec.rb b/spec/ruby_llm/providers/bedrock/chat_cache_point_spec.rb
new file mode 100644
index 000000000..7f562657f
--- /dev/null
+++ b/spec/ruby_llm/providers/bedrock/chat_cache_point_spec.rb
@@ -0,0 +1,77 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+RSpec.describe RubyLLM::Providers::Bedrock::Chat do
+  let(:model) do
+    instance_double(RubyLLM::Model::Info,
+                    id: 'anthropic.claude-haiku-4-5-20251001-v1:0',
+                    max_tokens: nil,
+                    metadata: {})
+  end
+
+  let(:base_args) do
+    { tools: {}, temperature: nil, model: model, stream: false }
+  end
+
+  def render(messages)
+    described_class.render_payload(messages, **base_args)
+  end
+
+  def msg(role, content, cache_point: false)
+    RubyLLM::Message.new(role: role, content: content, cache_point: cache_point)
+  end
+
+  describe 'cache_point injection' do
+    context 'with a system message where cache_point is true' do
+      it 'appends a cachePoint block to the system content' do
+        payload = render([msg(:system, 'You are helpful.', cache_point: true),
+                          msg(:user, 'Hi')])
+
+        last_block = payload[:system].last
+        expect(last_block).to eq(cachePoint: { type: 'default' })
+      end
+
+      it 'does not append cachePoint when cache_point is false' do
+        payload = render([msg(:system, 'You are helpful.'), msg(:user, 'Hi')])
+
+        expect(payload[:system]).not_to include(cachePoint: { type: 'default' })
+      end
+    end
+
+    context 'with a user message where cache_point is true' do
+      it 'appends a cachePoint block to the message content' do
+        payload = render([msg(:user, 'Tell me a story.', cache_point: true)])
+
+        last_block = payload[:messages].first[:content].last
+        expect(last_block).to eq(cachePoint: { type: 'default' })
+      end
+
+      it 'does not append cachePoint when cache_point is false' do
+        payload = render([msg(:user, 'Tell me a story.')])
+
+        content = payload[:messages].first[:content]
+        expect(content).not_to include(cachePoint: { type: 'default' })
+      end
+    end
+
+    context 'when multiple messages have cache_point: true' do
+      it 'appends cachePoint to each cache-pointed message' do
+        payload = render([
+                           msg(:system, 'System prompt', cache_point: true),
+                           msg(:user, 'User context', cache_point: true),
+                           msg(:user, 'Dynamic question')
+                         ])
+
+        cache_point_block = { cachePoint: { type: 'default' } }
+        user_messages = payload[:messages]
+
+        # Assert on the rendered blocks themselves instead of precomputed booleans,
+        # so a failure shows the offending block rather than a bare true/false mismatch.
+        expect(payload[:system].last).to eq(cache_point_block)
+        expect(user_messages.first[:content].last).to eq(cache_point_block)
+        expect(user_messages.last[:content].last).not_to eq(cache_point_block)
+      end
+    end
+  end
+end

From fdf241480746925d1ffb2feffd34205e697d7e9e Mon Sep 17 00:00:00 2001
From: Arun Kumar <arunkumar@Aruns-MacBook-Pro.local>
Date: Tue, 5 May 2026 12:36:26 +0530
Subject: [PATCH 6/7] Add cache_control injection for Anthropic models via
 OpenRouter

---
 lib/ruby_llm/providers/openrouter/chat.rb     | 18 +++-
 .../open_router/chat_cache_control_spec.rb    | 89 +++++++++++++++++++
 2 files changed, 106 insertions(+), 1 deletion(-)
 create mode 100644 spec/ruby_llm/providers/open_router/chat_cache_control_spec.rb

diff --git a/lib/ruby_llm/providers/openrouter/chat.rb b/lib/ruby_llm/providers/openrouter/chat.rb
index 0c3622bdf..f0c917214 100644
--- a/lib/ruby_llm/providers/openrouter/chat.rb
+++ b/lib/ruby_llm/providers/openrouter/chat.rb
@@ -82,15 +82,31 @@ def parse_completion_response(response)
 
         def format_messages(messages)
           messages.map do |msg|
+            content = OpenAI::Media.format_content(msg.content)
+            content = inject_cache_control(content) if msg.cache_point?
+
             {
               role: format_role(msg.role),
-              content: OpenAI::Media.format_content(msg.content),
+              content: content,
               tool_calls: OpenAI::Tools.format_tool_calls(msg.tool_calls),
               tool_call_id: msg.tool_call_id
             }.compact.merge(format_thinking(msg))
           end
         end
 
+        def inject_cache_control(content)
+          # Tag the last block with Anthropic cache_control; other providers are expected to ignore it.
+          # Nil/empty content is returned untouched; a plain string is wrapped in a text block first.
+          return content if content.nil? || (content.respond_to?(:empty?) && content.empty?)
+
+          blocks = content.is_a?(Array) ? content.dup : [{ type: 'text', text: content }]
+          last = blocks.last
+          return blocks if !last.is_a?(Hash) || last[:cache_control]
+
+          blocks[-1] = last.merge(cache_control: { type: 'ephemeral' })
+          blocks
+        end
+
         def format_role(role)
           case role
           when :system
diff --git a/spec/ruby_llm/providers/open_router/chat_cache_control_spec.rb b/spec/ruby_llm/providers/open_router/chat_cache_control_spec.rb
new file mode 100644
index 000000000..146128f4c
--- /dev/null
+++ b/spec/ruby_llm/providers/open_router/chat_cache_control_spec.rb
@@ -0,0 +1,89 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+RSpec.describe RubyLLM::Providers::OpenRouter::Chat do
+  describe '#inject_cache_control' do
+    context 'with an array of content blocks' do
+      it 'adds cache_control to the last block' do
+        blocks = [{ type: 'text', text: 'hello' }]
+        result = described_class.inject_cache_control(blocks)
+
+        expect(result.last[:cache_control]).to eq(type: 'ephemeral')
+      end
+
+      it 'does not modify earlier blocks' do
+        blocks = [{ type: 'text', text: 'first' }, { type: 'text', text: 'last' }]
+        result = described_class.inject_cache_control(blocks)
+
+        expect(result.first).not_to have_key(:cache_control)
+        expect(result.last[:cache_control]).to eq(type: 'ephemeral')
+      end
+
+      it 'does not duplicate cache_control if already present' do
+        blocks = [{ type: 'text', text: 'hello', cache_control: { type: 'ephemeral' } }]
+        result = described_class.inject_cache_control(blocks)
+
+        blocks_with_cache = result.select { |b| b[:cache_control] }
+        expect(blocks_with_cache.length).to eq(1)
+      end
+
+      it 'returns the array unchanged when empty' do
+        expect(described_class.inject_cache_control([])).to eq([])
+      end
+    end
+
+    context 'with a plain string' do
+      it 'wraps the string in a text block and adds cache_control' do
+        result = described_class.inject_cache_control('Tell me a story.')
+
+        expect(result).to be_an(Array)
+        expect(result.last).to eq(type: 'text', text: 'Tell me a story.', cache_control: { type: 'ephemeral' })
+      end
+    end
+  end
+
+  describe '#format_messages' do
+    context 'when a message has cache_point: true' do
+      it 'injects cache_control into the last content block' do
+        msg = RubyLLM::Message.new(role: :user, content: 'Tell me a story.', cache_point: true)
+        result = described_class.format_messages([msg])
+
+        last_block = result.first[:content].last
+        expect(last_block[:cache_control]).to eq(type: 'ephemeral')
+      end
+
+      it 'works for assistant messages too' do
+        msg = RubyLLM::Message.new(role: :assistant, content: 'I can help.', cache_point: true)
+        result = described_class.format_messages([msg])
+
+        last_block = result.first[:content].last
+        expect(last_block[:cache_control]).to eq(type: 'ephemeral')
+      end
+    end
+
+    context 'when a message has cache_point: false (default)' do
+      it 'does not add cache_control to any content block' do
+        msg = RubyLLM::Message.new(role: :user, content: 'Hello')
+        result = described_class.format_messages([msg])
+
+        content = result.first[:content]
+        content.each { |block| expect(block).not_to have_key(:cache_control) } if content.is_a?(Array)
+      end
+    end
+
+    context 'with multiple messages, only one cache_point' do
+      it 'only injects cache_control on the marked message' do
+        msg1 = RubyLLM::Message.new(role: :user, content: 'First message')
+        msg2 = RubyLLM::Message.new(role: :user, content: 'Second message', cache_point: true)
+        result = described_class.format_messages([msg1, msg2])
+
+        content1 = result[0][:content]
+        content2 = result[1][:content]
+
+        content1.each { |block| expect(block).not_to have_key(:cache_control) } if content1.is_a?(Array)
+        expect(content2.last[:cache_control]).to eq(type: 'ephemeral')
+      end
+    end
+  end
+end

From b3023cbf44673e71fa9b0a053d81203d4198aba4 Mon Sep 17 00:00:00 2001
From: Arun Kumar <arunkumar@Aruns-MacBook-Pro.local>
Date: Thu, 21 May 2026 21:17:22 +0530
Subject: [PATCH 7/7] resolve rubocop offenses

---
 lib/ruby_llm/providers/openrouter/chat.rb | 2 +-
 spec/ruby_llm/message_spec.rb             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/ruby_llm/providers/openrouter/chat.rb b/lib/ruby_llm/providers/openrouter/chat.rb
index 2a1a40bbb..43d35dd65 100644
--- a/lib/ruby_llm/providers/openrouter/chat.rb
+++ b/lib/ruby_llm/providers/openrouter/chat.rb
@@ -130,7 +130,7 @@ def inject_cache_control(content)
           blocks[-1] = last.merge(cache_control: { type: 'ephemeral' })
           blocks
         end
-        
+
         def format_content(content)
           OpenAI::Media.format_content(content)
         end
diff --git a/spec/ruby_llm/message_spec.rb b/spec/ruby_llm/message_spec.rb
index 7463f7a1b..a1d929f25 100644
--- a/spec/ruby_llm/message_spec.rb
+++ b/spec/ruby_llm/message_spec.rb
@@ -18,7 +18,7 @@
       }
     )
   end
-  
+
   describe '#cache_point?' do
     it 'returns false by default' do
       message = described_class.new(role: :user, content: 'hello')