Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions .github/workflows/scrape.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
name: Scrape Pricing Data

on:
  schedule:
    - cron: "0 6 * * *" # Daily at 06:00 UTC
  workflow_dispatch: # Allow manual runs

jobs:
  scrape:
    runs-on: ubuntu-latest

    # Needed so the job can push the refreshed data files back to the repo.
    permissions:
      contents: write

    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-node@v4
        with:
          node-version: "22"
          cache: "npm"

      - name: Install dependencies
        run: npm ci

      - name: Run scrapers
        run: npm run init

      - name: Check for changes
        id: changes
        # public/data.json is listed in .gitignore, so `git diff` on the working
        # tree never reports it and a plain `git add` refuses it. Force-stage
        # both files, then inspect the index: this detects changes to tracked,
        # untracked, and ignored files alike.
        run: |
          git add -f public/data.json src/forex.json
          git diff --cached --quiet || echo "changed=true" >> "$GITHUB_OUTPUT"

      - name: Commit updated data
        if: steps.changes.outputs.changed == 'true'
        # Files were already staged (force-added) in the previous step.
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git commit -m "chore: refresh pricing data [$(date -u '+%Y-%m-%d')]"
          git push
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,5 @@ pnpm-debug.log*

# Ignore the data JSON
public/data.json
src/forex.json
# src/forex.json is intentionally committed — small, stable, needed at SSR build time
# It is refreshed automatically on every `npm run init` run
11 changes: 11 additions & 0 deletions scraper/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,9 @@ const MODEL_REASONING_PREFIXES = {
"minimax-m2": false,
// GLM
"glm-": true,
// Microsoft
"phi-4": false,
phi: false,
} as const;

export function isReasoningModel(modelId: string): boolean {
Expand Down Expand Up @@ -320,6 +323,11 @@ export function isSelfHostableModel(modelId: string, provider: string): boolean
return true;
}

if (provider === "Microsoft") {
// Microsoft Phi models are open-source and self-hostable
return true;
}

throw new Error(
`Unknown self-hostable status for model ID: ${modelId} with provider: ${provider}. Please update isSelfHostableModel in scraper/constants.ts.`
);
Expand Down Expand Up @@ -368,6 +376,9 @@ const TRANSFORMERS_TOKENIZER_PATHS: Record<string, string> = {
gemma: "google/gemma-2-9b-it",
// IBM Granite
granite: "ibm-granite/granite-3.0-8b-instruct",
// Microsoft Phi
"phi-4": "microsoft/Phi-4",
phi: "microsoft/Phi-4",
};

export function getTokenizerForModel(modelId: string, provider: string): Tokenizers | undefined {
Expand Down
106 changes: 103 additions & 3 deletions scraper/runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,108 @@ import scrapeDeepseekData from "./scrapers/deepseek";
import scrapeForexData from "./scrapers/forex";
import scrapeAwsImageData from "./scrapers/aws-image";
import scrapeOpenaiImageData from "./scrapers/openai-image";
import { writeFileSync } from "fs";
import scrapeGcpImageData from "./scrapers/gcp-image";
import scrapeAzureData from "./scrapers/azure";
import { writeFileSync, readFileSync, existsSync } from "fs";
import { dirname, join } from "path";
import { fileURLToPath } from "url";

const PRICE_CHANGE_THRESHOLD = 0.05; // 5% change triggers a warning

/**
 * Compares freshly scraped pricing (`newData`) against the previous run
 * (`oldData`) and logs a console warning for every per-region price that moved
 * by more than PRICE_CHANGE_THRESHOLD. Covers both text models
 * (input/output token prices) and image models (per-image tier prices).
 * Models, vendors, regions, or tiers missing from either side are skipped.
 * Logs "no significant changes" when nothing crossed the threshold.
 */
function detectPriceChanges(oldData: DataFormat, newData: DataFormat): void {
  let changesFound = false;

  // Relative change, guarded against division by zero: 0 → 0 is "no change"
  // and 0 → non-zero is treated as an always-significant (infinite) change.
  // The raw (new-old)/old formula produced NaN / Infinity noise for free tiers.
  const relChange = (oldVal: number, newVal: number): number => {
    if (oldVal === 0) return newVal === 0 ? 0 : Infinity;
    return Math.abs((newVal - oldVal) / oldVal);
  };

  // Print the section header exactly once, before the first warning.
  const warnHeader = (): void => {
    if (!changesFound) {
      console.warn("\n⚠️ Price changes detected:");
      changesFound = true;
    }
  };

  // Text models: regionPricing maps region -> [inputPrice, outputPrice]
  // (prices are per token; multiplied by 1e6 for per-1M display).
  for (const [modelId, newModel] of Object.entries(newData.models)) {
    const oldModel = oldData.models?.[modelId];
    if (!oldModel) continue;

    for (const newVendor of newModel.vendors) {
      const oldVendor = oldModel.vendors.find((v) => v.vendorRef === newVendor.vendorRef);
      if (!oldVendor) continue;

      for (const [region, [newInput, newOutput]] of Object.entries(newVendor.regionPricing)) {
        const oldPricing = oldVendor.regionPricing[region];
        if (!oldPricing) continue;
        const [oldInput, oldOutput] = oldPricing;

        const inputChange = relChange(oldInput, newInput);
        const outputChange = relChange(oldOutput, newOutput);

        if (inputChange > PRICE_CHANGE_THRESHOLD) {
          warnHeader();
          console.warn(
            `  ${modelId} @ ${newVendor.vendorRef}/${region}: input $${(oldInput * 1e6).toFixed(4)} → $${(newInput * 1e6).toFixed(4)} per 1M tokens (${(inputChange * 100).toFixed(1)}% change)`
          );
        }
        if (outputChange > PRICE_CHANGE_THRESHOLD) {
          warnHeader();
          console.warn(
            `  ${modelId} @ ${newVendor.vendorRef}/${region}: output $${(oldOutput * 1e6).toFixed(4)} → $${(newOutput * 1e6).toFixed(4)} per 1M tokens (${(outputChange * 100).toFixed(1)}% change)`
          );
        }
      }
    }
  }

  // Image models: regionPricing maps region -> tier list, matched by resolution.
  for (const [modelId, newModel] of Object.entries(newData.imageModels ?? {})) {
    const oldModel = oldData.imageModels?.[modelId];
    if (!oldModel) continue;

    for (const newVendor of newModel.vendors) {
      const oldVendor = oldModel.vendors.find((v) => v.vendorRef === newVendor.vendorRef);
      if (!oldVendor) continue;

      for (const [region, newTiers] of Object.entries(newVendor.regionPricing)) {
        const oldTiers = oldVendor.regionPricing[region];
        if (!oldTiers) continue;

        for (const newTier of newTiers) {
          const oldTier = oldTiers.find((t) => t.resolution === newTier.resolution);
          if (!oldTier) continue;

          const change = relChange(oldTier.pricePerImage, newTier.pricePerImage);
          if (change > PRICE_CHANGE_THRESHOLD) {
            warnHeader();
            console.warn(
              `  ${modelId} @ ${newVendor.vendorRef}/${region} (${newTier.resolution}): $${oldTier.pricePerImage} → $${newTier.pricePerImage} per image (${(change * 100).toFixed(1)}% change)`
            );
          }
        }
      }
    }
  }

  if (!changesFound) {
    console.log("✓ No significant price changes detected.");
  }
}

async function main() {
const selfPath = dirname(fileURLToPath(import.meta.url));
const dataJsonPath = join(selfPath, "..", "public", "data.json");

// Load existing data for diff comparison
let previousData: DataFormat | null = null;
if (existsSync(dataJsonPath)) {
try {
previousData = JSON.parse(readFileSync(dataJsonPath, "utf-8")) as DataFormat;
} catch {
// Ignore parse errors — first run or corrupted file
}
}

// Invoke all scrapers to build the data format
const fmt: DataFormat = {
scrapedAt: new Date().toISOString(),
vendors: {},
models: {},
imageModels: {},
Expand All @@ -31,14 +126,19 @@ async function main() {
scrapeOpenaiData(fmt),
scrapeDeepseekData(fmt),
scrapeForexData(),
scrapeAzureData(fmt),
// Image generation scrapers
scrapeAwsImageData(fmt),
scrapeOpenaiImageData(fmt),
scrapeGcpImageData(fmt),
]);

// Detect price changes vs previous run
if (previousData) {
detectPriceChanges(previousData, fmt);
}

// Output the data as JSON
const selfPath = dirname(fileURLToPath(import.meta.url));
const dataJsonPath = join(selfPath, "..", "public", "data.json");
writeFileSync(dataJsonPath, JSON.stringify(fmt, null, 4), "utf-8");
console.log(`Wrote data to ${dataJsonPath}`);
// Note: tiktoken BPE files are fetched at build time via src/pages/tiktoken/[encoding].tiktoken.ts
Expand Down
2 changes: 1 addition & 1 deletion scraper/scrapers/aws-image.ts
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ export default async function scrapeAwsImageData(fmt: DataFormat) {
}

for (const [_modelId, model] of Object.entries(AWS_IMAGE_MODELS)) {
await addImageModelToFormat(fmt, "aws", "us-east-1", model);
await addImageModelToFormat(fmt, "aws", "us-east-1", model, "hardcoded", "2026-03-20");
}

console.log(
Expand Down
1 change: 1 addition & 0 deletions scraper/scrapers/aws.ts
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ async function processPriceDimension(
latencyMs: perfMetrics?.latencyMs ?? 0,
tokensPerSecond: perfMetrics?.tokensPerSecond ?? 0,
lowCapacity: false,
priceSource: "scraped",
};
modelEntry.vendors.push(vendor);
}
Expand Down
150 changes: 150 additions & 0 deletions scraper/scrapers/azure.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
import type { DataFormat } from "@/src/dataFormat";
import { addModelToFormat, type ModelDefinition } from "../shared";
import {
getModelsForProvider,
getCachedInputCost,
cleanModelName,
type LiteLLMModel,
} from "../litellm";

// Reuse the same display name overrides as the OpenAI scraper for GPT/o-series models
const OPENAI_MODEL_NAME_OVERRIDES: Record<string, string> = {
"gpt-4o": "GPT-4o",
"gpt-4o-mini": "GPT-4o Mini",
"gpt-4-turbo": "GPT-4 Turbo",
"gpt-4": "GPT-4",
"gpt-3.5-turbo": "GPT-3.5 Turbo",
o1: "GPT-o1",
"o1-mini": "GPT-o1 Mini",
o3: "GPT-o3",
"o3-mini": "GPT-o3 Mini",
"o4-mini": "GPT-o4 Mini",
"gpt-4.1": "GPT-4.1",
"gpt-4.1-mini": "GPT-4.1 Mini",
"gpt-4.1-nano": "GPT-4.1 Nano",
"gpt-5": "GPT-5",
};

const MICROSOFT_MODEL_NAME_OVERRIDES: Record<string, string> = {
"phi-4": "Phi-4",
"phi-4-mini": "Phi-4 Mini",
"phi-4-mini-instruct": "Phi-4 Mini",
};

// Prefixes of models to include from Azure
const INCLUDED_MODEL_PREFIXES = [
  "gpt-4o",
  "gpt-4-turbo",
  "gpt-4.1",
  "gpt-4",
  "gpt-5",
  "o1",
  "o3",
  "o4",
  "phi-4",
];

/**
 * Decides whether an Azure model (base ID, "azure/" prefix stripped) should be
 * scraped. Audio/realtime/embedding variants and preview releases are skipped;
 * everything else is included iff it starts with a tracked prefix.
 */
function shouldIncludeModel(baseId: string): boolean {
  const excludedFragments = ["audio", "realtime", "embedding", "preview"];
  if (excludedFragments.some((fragment) => baseId.includes(fragment))) {
    return false;
  }
  return INCLUDED_MODEL_PREFIXES.some((prefix) => baseId.startsWith(prefix));
}

/**
 * Maps a base model ID to its publisher: Phi models belong to Microsoft,
 * every other model scraped from Azure here is OpenAI's.
 */
function getProvider(baseId: string): "OpenAI" | "Microsoft" {
  return baseId.startsWith("phi-") ? "Microsoft" : "OpenAI";
}

function getModelName(baseId: string): string | null {
const provider = getProvider(baseId);

if (provider === "Microsoft") {
// Exact match first
if (MICROSOFT_MODEL_NAME_OVERRIDES[baseId]) {
return MICROSOFT_MODEL_NAME_OVERRIDES[baseId];
}
// Prefix match
for (const [key, name] of Object.entries(MICROSOFT_MODEL_NAME_OVERRIDES)) {
if (baseId.startsWith(key)) return name;
}
return cleanModelName(baseId, "azure");
}

// OpenAI-hosted models on Azure
if (OPENAI_MODEL_NAME_OVERRIDES[baseId]) {
return OPENAI_MODEL_NAME_OVERRIDES[baseId];
}
for (const [key, name] of Object.entries(OPENAI_MODEL_NAME_OVERRIDES)) {
if (baseId.startsWith(key)) return name;
}
return cleanModelName(baseId, "azure");
}

/**
 * Converts a LiteLLM catalogue entry into our internal ModelDefinition.
 * Returns null for entries that cannot be priced or named.
 *
 * NOTE(review): the truthiness check also drops models whose token cost is
 * exactly 0 (free tiers) — presumably intentional for a pricing site; confirm.
 */
function litellmModelToDefinition(modelId: string, model: LiteLLMModel): ModelDefinition | null {
  const hasPricing = model.input_cost_per_token && model.output_cost_per_token;
  if (!hasPricing) {
    return null;
  }

  // Strip the azure/ prefix to get the base model ID
  const baseId = modelId.replace(/^azure\//, "");

  const name = getModelName(baseId);
  if (!name) return null;

  const definition: ModelDefinition = {
    name,
    provider: getProvider(baseId),
    pricing: {
      input: model.input_cost_per_token,
      output: model.output_cost_per_token,
      cachedInput: getCachedInputCost(model),
    },
    maxInputTokens: model.max_input_tokens,
    maxOutputTokens: model.max_output_tokens ?? model.max_tokens,
  };
  return definition;
}

/**
 * Scrapes Azure AI chat-model pricing from the LiteLLM catalogue into `fmt`,
 * then registers the "azure" vendor metadata (region display names, US/EU
 * region lists, learn-more URL). Models are deduplicated by display name so
 * ID aliases (e.g. two Phi-4 Mini variants) are only recorded once.
 */
export default async function scrapeAzureData(fmt: DataFormat) {
  const catalogue = await getModelsForProvider("azure", "chat");
  const seenNames = new Set<string>();

  for (const [fullId, entry] of catalogue) {
    // Strip the azure/ prefix to get the base model ID
    const baseId = fullId.replace(/^azure\//, "");
    if (!shouldIncludeModel(baseId)) continue;

    const definition = litellmModelToDefinition(fullId, entry);
    if (!definition) continue;

    // Deduplicate by model name
    if (seenNames.has(definition.name)) continue;
    seenNames.add(definition.name);

    // Sequential on purpose: addModelToFormat mutates the shared fmt object.
    await addModelToFormat(fmt, "azure", "eastus", definition);
  }

  fmt.vendors["azure"] = {
    cleanName: "Azure AI",
    learnMoreUrl: "https://azure.microsoft.com/en-us/products/ai-services/openai-service",
    euOrUKRegions: ["westeurope", "northeurope", "uksouth", "swedencentral"],
    usaRegions: ["eastus", "eastus2", "westus", "westus3", "northcentralus", "southcentralus"],
    regionCleanNames: {
      "": {
        eastus: "East US (Virginia)",
        eastus2: "East US 2 (Virginia)",
        westus: "West US (California)",
        westus3: "West US 3 (Arizona)",
        northcentralus: "North Central US (Illinois)",
        southcentralus: "South Central US (Texas)",
        westeurope: "West Europe (Netherlands)",
        northeurope: "North Europe (Ireland)",
        uksouth: "UK South (London)",
        swedencentral: "Sweden Central",
        australiaeast: "Australia East (New South Wales)",
        japaneast: "Japan East (Tokyo)",
      },
    },
  };

  console.log(`Finished scraping Azure AI data (${seenNames.size} models from LiteLLM)`);
}
Loading