From 60b23cf1145c728811cdb1f94d93f81d29521adc Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 1 Mar 2026 12:57:41 +0000
Subject: [PATCH 1/3] Initial plan


From 2090b397502848305cb0d3fd404a7126c070918f Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 1 Mar 2026 13:03:01 +0000
Subject: [PATCH 2/3] Add Cloudflare Workers AI fallback when Gemini is rate
 limited

Co-authored-by: harshithpabbati <43822585+harshithpabbati@users.noreply.github.com>
---
 README.md                       |   2 +
 app/api/webhooks/reply/route.ts | 136 ++++++++++++++++++++++++++------
 env.example                     |   2 +
 3 files changed, 114 insertions(+), 26 deletions(-)
diff --git a/README.md b/README.md
index a0e12c3..20c7563 100644
--- a/README.md
+++ b/README.md
@@ -141,3 +141,5 @@ GRANT ALL ON TABLE "public"."reply_edit" TO "service_role";
 | `NEXT_PUBLIC_BASE_URL` | Base URL of your deployment (e.g. `https://answerify.dev`) |
 | `RESEND_API_KEY` | Resend API key for sending emails |
 | `GEMINI_API_KEY` | Google Gemini API key for embeddings (`gemini-embedding-001`) and completions (`gemini-3-flash-preview`) |
+| `CLOUDFLARE_ACCOUNT_ID` | *(Optional)* Cloudflare account ID – used as a fallback AI provider when Gemini is unavailable (e.g. rate limited) |
+| `CLOUDFLARE_API_TOKEN` | *(Optional)* Cloudflare API token with Workers AI permission – required alongside `CLOUDFLARE_ACCOUNT_ID` for the fallback to activate |
diff --git a/app/api/webhooks/reply/route.ts b/app/api/webhooks/reply/route.ts
index 7ef01ab..a2c11c7 100644
--- a/app/api/webhooks/reply/route.ts
+++ b/app/api/webhooks/reply/route.ts
@@ -11,6 +11,50 @@ function getGenAIClient() {
   return new GoogleGenAI({ apiKey: process.env.GEMINI_API_KEY! });
 }
 
+const CLOUDFLARE_MODEL = '@cf/meta/llama-3.1-8b-instruct';
+
+/**
+ * Call Cloudflare Workers AI via the REST API.
+ * Used as a fallback when Gemini is unavailable (e.g. rate limited).
+ */
+async function runCloudflareAgent(systemPrompt: string, userPrompt: string): Promise<string> {
+  const accountId = process.env.CLOUDFLARE_ACCOUNT_ID;
+  const apiToken = process.env.CLOUDFLARE_API_TOKEN;
+
+  if (!accountId || !apiToken) {
+    throw new Error('Cloudflare credentials not configured');
+  }
+
+  const response = await fetch(
+    `https://api.cloudflare.com/client/v4/accounts/${accountId}/ai/run/${CLOUDFLARE_MODEL}`,
+    {
+      method: 'POST',
+      headers: {
+        Authorization: `Bearer ${apiToken}`,
+        'Content-Type': 'application/json',
+      },
+      body: JSON.stringify({
+        messages: [
+          { role: 'system', content: systemPrompt },
+          { role: 'user', content: userPrompt },
+        ],
+        max_tokens: 1024,
+      }),
+    },
+  );
+
+  if (!response.ok) {
+    throw new Error(`Cloudflare AI request failed: ${response.status}`);
+  }
+
+  const data = await response.json();
+  const text = (data as { result?: { response?: string } }).result?.response ?? '';
+  if (!text) {
+    console.warn('Cloudflare AI returned an empty response');
+  }
+  return text;
+}
+
 /**
  * Derive a 0–1 confidence score from Gemini grounding metadata.
  * Falls back to URL_CONTEXT_FALLBACK_CONFIDENCE when the URL context tool
@@ -99,7 +143,7 @@ async function runResearchAgent(
   subject: string,
   question: string,
   urlList: string,
-): Promise<{ findings: string; candidates: any[] | undefined }> {
+): Promise<{ findings: string; candidates: any[] | undefined; usedFallback?: boolean }> {
   const researchPrompt = codeBlock`
     You are a research assistant for a customer support team.
     Your job is to find and extract the most relevant information from the provided URLs
@@ -113,18 +157,43 @@ async function runResearchAgent(
     - If no relevant information can be found, respond with only: NO_INFORMATION
   `;
 
-  const result = await ai.models.generateContent({
-    model: 'gemini-2.5-flash',
-    contents: `Subject: ${subject}\nCustomer question:\n${question}\n\nURLs to search:\n${urlList}`,
-    config: {
-      systemInstruction: researchPrompt,
-      maxOutputTokens: 1024,
-      temperature: 0.3,
-      tools: [{ urlContext: {} }],
-    },
-  });
+  try {
+    const result = await ai.models.generateContent({
+      model: 'gemini-2.5-flash',
+      contents: `Subject: ${subject}\nCustomer question:\n${question}\n\nURLs to search:\n${urlList}`,
+      config: {
+        systemInstruction: researchPrompt,
+        maxOutputTokens: 1024,
+        temperature: 0.3,
+        tools: [{ urlContext: {} }],
+      },
+    });
+
+    return { findings: result.text ?? '', candidates: result.candidates };
+  } catch (err) {
+    console.warn('Gemini research agent failed, falling back to Cloudflare AI:', err);
+
+    const cloudflareResearchPrompt = codeBlock`
+      You are a research assistant for a customer support team.
+      Your job is to extract the most relevant information to answer a customer's question.
+
+      - Extract only information that is directly relevant to the question
+      - Organise the findings as concise bullet points or short paragraphs
+      - Include specific details: steps, values, settings, or policies that apply
+      - Do not write the final reply – only gather and present the raw facts
+      - If you cannot find relevant information, respond with only: NO_INFORMATION
+    `;
+
+    // Note: unlike Gemini's urlContext tool, Cloudflare AI cannot fetch URL
+    // content. The URLs are listed as context so the model can reference them
+    // in its answer, but the response is based on the model's training data.
+    const findings = await runCloudflareAgent(
+      cloudflareResearchPrompt,
+      `Subject: ${subject}\nCustomer question:\n${question}\n\nKnowledge base sources:\n${urlList}`,
+    );
 
-  return { findings: result.text ?? '', candidates: result.candidates };
+    return { findings, candidates: undefined, usedFallback: true };
+  }
 }
 
 /**
@@ -176,17 +245,24 @@ async function runWritingAgent(
     - Do not output anything outside of the HTML response
   `;
 
-  const result = await ai.models.generateContent({
-    model: 'gemini-2.5-flash',
-    contents: `Subject: ${subject}\nCustomer question:\n${question}\n\nResearch findings:\n${findings}`,
-    config: {
-      systemInstruction: writingPrompt,
-      maxOutputTokens: 1024,
-      temperature: 0.7,
-    },
-  });
-
-  return result.text ?? '';
+  const userContent = `Subject: ${subject}\nCustomer question:\n${question}\n\nResearch findings:\n${findings}`;
+
+  try {
+    const result = await ai.models.generateContent({
+      model: 'gemini-2.5-flash',
+      contents: userContent,
+      config: {
+        systemInstruction: writingPrompt,
+        maxOutputTokens: 1024,
+        temperature: 0.7,
+      },
+    });
+
+    return result.text ?? '';
+  } catch (err) {
+    console.warn('Gemini writing agent failed, falling back to Cloudflare AI:', err);
+    return runCloudflareAgent(writingPrompt, userContent);
+  }
 }
 
 export async function POST(request: Request) {
@@ -267,15 +343,23 @@ export async function POST(request: Request) {
   // Fetch and synthesise relevant information from the datasource URLs.
   // Confidence is derived from the grounding metadata of this step because it
   // is the step that actually reads from the knowledge-base URLs.
-  const { findings, candidates: researchCandidates } = await runResearchAgent(
+  const {
+    findings,
+    candidates: researchCandidates,
+    usedFallback,
+  } = await runResearchAgent(
     ai,
     thread?.subject ?? '',
     record.cleaned_body,
     urlList,
   );
 
-  const confidence = computeConfidence(researchCandidates, datasources.length);
-  const citations = extractCitations(researchCandidates);
+  // When using the Cloudflare fallback, grounding metadata is unavailable so
+  // we use a fixed conservative confidence score to keep the reply as a draft.
+  const confidence = usedFallback
+    ? 0.5
+    : computeConfidence(researchCandidates, datasources.length);
+  const citations = usedFallback ? [] : extractCitations(researchCandidates);
 
   if (!findings || findings.trim() === 'NO_INFORMATION') {
     // Generate clarifying question draft instead of erroring out
diff --git a/env.example b/env.example
index 179e159..24afaac 100644
--- a/env.example
+++ b/env.example
@@ -4,3 +4,5 @@ SUPABASE_SERVICE_KEY=
 NEXT_PUBLIC_BASE_URL=http://localhost:3000
 RESEND_API_KEY=
 GEMINI_API_KEY=
+CLOUDFLARE_ACCOUNT_ID=
+CLOUDFLARE_API_TOKEN=

From 2e37bd35b3e9c458ae6d281ac146f942369c9f7b Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 1 Mar 2026 15:34:17 +0000
Subject: [PATCH 3/3] Fetch URL content server-side so Cloudflare fallback
 reads actual knowledge base pages

Co-authored-by: harshithpabbati <43822585+harshithpabbati@users.noreply.github.com>
---
 README.md                       |  2 +
 app/api/webhooks/reply/route.ts | 67 +++++++++++++++++++++++++++++++--
 2 files changed, 65 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 20c7563..3d5a143 100644
--- a/README.md
+++ b/README.md
@@ -143,3 +143,5 @@ GRANT ALL ON TABLE "public"."reply_edit" TO "service_role";
 | `GEMINI_API_KEY` | Google Gemini API key for embeddings (`gemini-embedding-001`) and completions (`gemini-3-flash-preview`) |
 | `CLOUDFLARE_ACCOUNT_ID` | *(Optional)* Cloudflare account ID – used as a fallback AI provider when Gemini is unavailable (e.g. rate limited) |
 | `CLOUDFLARE_API_TOKEN` | *(Optional)* Cloudflare API token with Workers AI permission – required alongside `CLOUDFLARE_ACCOUNT_ID` for the fallback to activate |
+
+> **How the Cloudflare fallback handles URLs:** when Gemini is unavailable the server fetches up to 5 knowledge-base URLs itself, strips the HTML to plain text (truncated to 4,000 characters per page), and injects the content directly into the Cloudflare AI prompt. This lets the fallback model answer from real knowledge-base content without needing a native URL tool like Gemini's URL context — no special model capability required. If none of the pages can be fetched, only the list of URLs is passed to the model.
diff --git a/app/api/webhooks/reply/route.ts b/app/api/webhooks/reply/route.ts
index a2c11c7..b9ec97d 100644
--- a/app/api/webhooks/reply/route.ts
+++ b/app/api/webhooks/reply/route.ts
@@ -1,5 +1,6 @@
 import { GoogleGenAI } from '@google/genai';
 import { codeBlock } from 'common-tags';
+import { JSDOM } from 'jsdom';
 import { Resend } from 'resend';
 
 import { cleanBody } from '@/lib/cleanBody';
@@ -13,6 +14,51 @@ function getGenAIClient() {
 
 const CLOUDFLARE_MODEL = '@cf/meta/llama-3.1-8b-instruct';
 
+// Maximum number of URLs to fetch when running the Cloudflare fallback.
+// Kept intentionally small because we download page content ourselves.
+const MAX_FALLBACK_URLS = 5;
+
+// Maximum plain-text characters to include per fetched URL in the prompt.
+const FALLBACK_URL_CONTENT_LENGTH = 4000;
+
+// Timeout in milliseconds for each URL fetch in the fallback path.
+const URL_FETCH_TIMEOUT_MS = 5000;
+
+/**
+ * Fetch the plain-text content of a URL for use as AI context.
+ *
+ * Any LLM can "read" a knowledge-base URL when the page text is fetched
+ * server-side and injected directly into the prompt, so this approach works
+ * for any fallback model without relying on Gemini's native urlContext tool.
+ *
+ * Returns null when the fetch fails or times out, the response is not OK, or the page has no usable text.
+ */
+async function fetchUrlContent(url: string): Promise<{ url: string; text: string } | null> {
+  try {
+    const response = await fetch(url, {
+      headers: { 'User-Agent': 'Answerify/1.0 (+https://answerify.dev)' },
+      signal: AbortSignal.timeout(URL_FETCH_TIMEOUT_MS),
+    });
+
+    if (!response.ok) return null;
+
+    const html = await response.text();
+
+    // Use JSDOM to safely parse and extract plain text — more robust than
+    // regex stripping, which can leave residual tag fragments.
+    const dom = new JSDOM(html);
+    dom.window.document.querySelectorAll('script, style').forEach((el) => el.remove());
+    const text = (dom.window.document.body?.textContent ?? '')
+      .replace(/\s+/g, ' ')
+      .trim()
+      .slice(0, FALLBACK_URL_CONTENT_LENGTH);
+
+    return text ? { url, text } : null;
+  } catch {
+    return null;
+  }
+}
+
 /**
  * Call Cloudflare Workers AI via the REST API.
  * Used as a fallback when Gemini is unavailable (e.g. rate limited).
@@ -184,12 +230,25 @@ async function runResearchAgent(
       - If you cannot find relevant information, respond with only: NO_INFORMATION
     `;
 
-    // Note: unlike Gemini's urlContext tool, Cloudflare AI cannot fetch URL
-    // content. The URLs are listed as context so the model can reference them
-    // in its answer, but the response is based on the model's training data.
+    // Fetch the actual content of the knowledge-base URLs so the model has
+    // real page text to work with. Any LLM can support URL-based knowledge
+    // bases this way — no native URL tool required.
+    const urls = urlList.split('\n').filter(Boolean).slice(0, MAX_FALLBACK_URLS);
+    const settledPages = await Promise.allSettled(urls.map(fetchUrlContent));
+    const fetchedPages = settledPages
+      .filter((r): r is PromiseFulfilledResult<{ url: string; text: string }> =>
+        r.status === 'fulfilled' && r.value !== null,
+      )
+      .map((r) => r.value);
+
+    const urlContext =
+      fetchedPages.length > 0
+        ? fetchedPages.map((p) => `[${p.url}]\n${p.text}`).join('\n\n---\n\n')
+        : urlList;
+
     const findings = await runCloudflareAgent(
       cloudflareResearchPrompt,
-      `Subject: ${subject}\nCustomer question:\n${question}\n\nKnowledge base sources:\n${urlList}`,
+      `Subject: ${subject}\nCustomer question:\n${question}\n\nKnowledge base content:\n${urlContext}`,
     );
 
     return { findings, candidates: undefined, usedFallback: true };