Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions .github/workflows/scrape.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
name: Scrape Pricing Data

on:
  schedule:
    - cron: "0 6 * * *" # Daily at 06:00 UTC
  workflow_dispatch: # Allow manual runs

jobs:
  scrape:
    runs-on: ubuntu-latest

    # Needed so the job can push the refreshed data files back to the repo.
    permissions:
      contents: write

    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-node@v4
        with:
          node-version: "22"
          cache: "npm"

      - name: Install dependencies
        run: npm ci

      - name: Run scrapers
        run: npm run init

      - name: Check for changes
        id: changes
        # public/data.json is listed in .gitignore, so `git diff` on the working
        # tree never reports it and a plain `git add` refuses it. Force-stage
        # both files, then inspect the index: this detects changes to tracked,
        # untracked, and ignored files alike.
        run: |
          git add -f public/data.json src/forex.json
          git diff --cached --quiet || echo "changed=true" >> "$GITHUB_OUTPUT"

      - name: Commit updated data
        if: steps.changes.outputs.changed == 'true'
        # Files were already staged (force-added) in the previous step.
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git commit -m "chore: refresh pricing data [$(date -u '+%Y-%m-%d')]"
          git push
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,5 @@ pnpm-debug.log*

# Ignore the data JSON
public/data.json
src/forex.json
# src/forex.json is intentionally committed — small, stable, needed at SSR build time
# It is refreshed automatically on every `npm run init` run
11 changes: 11 additions & 0 deletions scraper/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,9 @@ const MODEL_REASONING_PREFIXES = {
"minimax-m2": false,
// GLM
"glm-": true,
// Microsoft
"phi-4": false,
phi: false,
} as const;

export function isReasoningModel(modelId: string): boolean {
Expand Down Expand Up @@ -320,6 +323,11 @@ export function isSelfHostableModel(modelId: string, provider: string): boolean
return true;
}

if (provider === "Microsoft") {
// Microsoft Phi models are open-source and self-hostable
return true;
}

throw new Error(
`Unknown self-hostable status for model ID: ${modelId} with provider: ${provider}. Please update isSelfHostableModel in scraper/constants.ts.`
);
Expand Down Expand Up @@ -368,6 +376,9 @@ const TRANSFORMERS_TOKENIZER_PATHS: Record<string, string> = {
gemma: "google/gemma-2-9b-it",
// IBM Granite
granite: "ibm-granite/granite-3.0-8b-instruct",
// Microsoft Phi
"phi-4": "microsoft/Phi-4",
phi: "microsoft/Phi-4",
};

export function getTokenizerForModel(modelId: string, provider: string): Tokenizers | undefined {
Expand Down
106 changes: 103 additions & 3 deletions scraper/runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,108 @@ import scrapeDeepseekData from "./scrapers/deepseek";
import scrapeForexData from "./scrapers/forex";
import scrapeAwsImageData from "./scrapers/aws-image";
import scrapeOpenaiImageData from "./scrapers/openai-image";
import { writeFileSync } from "fs";
import scrapeGcpImageData from "./scrapers/gcp-image";
import scrapeAzureData from "./scrapers/azure";
import { writeFileSync, readFileSync, existsSync } from "fs";
import { dirname, join } from "path";
import { fileURLToPath } from "url";

const PRICE_CHANGE_THRESHOLD = 0.05; // 5% change triggers a warning

/**
 * Compares freshly scraped pricing (`newData`) against the previous run
 * (`oldData`) and logs a console warning for every per-region price that moved
 * by more than PRICE_CHANGE_THRESHOLD. Covers both text models
 * (input/output token prices) and image models (per-image tier prices).
 * Models, vendors, regions, or tiers missing from either side are skipped.
 * Logs "no significant changes" when nothing crossed the threshold.
 */
function detectPriceChanges(oldData: DataFormat, newData: DataFormat): void {
  let changesFound = false;

  // Relative change, guarded against division by zero: 0 → 0 is "no change"
  // and 0 → non-zero is treated as an always-significant (infinite) change.
  // The raw (new-old)/old formula produced NaN / Infinity noise for free tiers.
  const relChange = (oldVal: number, newVal: number): number => {
    if (oldVal === 0) return newVal === 0 ? 0 : Infinity;
    return Math.abs((newVal - oldVal) / oldVal);
  };

  // Print the section header exactly once, before the first warning.
  const warnHeader = (): void => {
    if (!changesFound) {
      console.warn("\n⚠️ Price changes detected:");
      changesFound = true;
    }
  };

  // Text models: regionPricing maps region -> [inputPrice, outputPrice]
  // (prices are per token; multiplied by 1e6 for per-1M display).
  for (const [modelId, newModel] of Object.entries(newData.models)) {
    const oldModel = oldData.models?.[modelId];
    if (!oldModel) continue;

    for (const newVendor of newModel.vendors) {
      const oldVendor = oldModel.vendors.find((v) => v.vendorRef === newVendor.vendorRef);
      if (!oldVendor) continue;

      for (const [region, [newInput, newOutput]] of Object.entries(newVendor.regionPricing)) {
        const oldPricing = oldVendor.regionPricing[region];
        if (!oldPricing) continue;
        const [oldInput, oldOutput] = oldPricing;

        const inputChange = relChange(oldInput, newInput);
        const outputChange = relChange(oldOutput, newOutput);

        if (inputChange > PRICE_CHANGE_THRESHOLD) {
          warnHeader();
          console.warn(
            `  ${modelId} @ ${newVendor.vendorRef}/${region}: input $${(oldInput * 1e6).toFixed(4)} → $${(newInput * 1e6).toFixed(4)} per 1M tokens (${(inputChange * 100).toFixed(1)}% change)`
          );
        }
        if (outputChange > PRICE_CHANGE_THRESHOLD) {
          warnHeader();
          console.warn(
            `  ${modelId} @ ${newVendor.vendorRef}/${region}: output $${(oldOutput * 1e6).toFixed(4)} → $${(newOutput * 1e6).toFixed(4)} per 1M tokens (${(outputChange * 100).toFixed(1)}% change)`
          );
        }
      }
    }
  }

  // Image models: regionPricing maps region -> tier list, matched by resolution.
  for (const [modelId, newModel] of Object.entries(newData.imageModels ?? {})) {
    const oldModel = oldData.imageModels?.[modelId];
    if (!oldModel) continue;

    for (const newVendor of newModel.vendors) {
      const oldVendor = oldModel.vendors.find((v) => v.vendorRef === newVendor.vendorRef);
      if (!oldVendor) continue;

      for (const [region, newTiers] of Object.entries(newVendor.regionPricing)) {
        const oldTiers = oldVendor.regionPricing[region];
        if (!oldTiers) continue;

        for (const newTier of newTiers) {
          const oldTier = oldTiers.find((t) => t.resolution === newTier.resolution);
          if (!oldTier) continue;

          const change = relChange(oldTier.pricePerImage, newTier.pricePerImage);
          if (change > PRICE_CHANGE_THRESHOLD) {
            warnHeader();
            console.warn(
              `  ${modelId} @ ${newVendor.vendorRef}/${region} (${newTier.resolution}): $${oldTier.pricePerImage} → $${newTier.pricePerImage} per image (${(change * 100).toFixed(1)}% change)`
            );
          }
        }
      }
    }
  }

  if (!changesFound) {
    console.log("✓ No significant price changes detected.");
  }
}

async function main() {
const selfPath = dirname(fileURLToPath(import.meta.url));
const dataJsonPath = join(selfPath, "..", "public", "data.json");

// Load existing data for diff comparison
let previousData: DataFormat | null = null;
if (existsSync(dataJsonPath)) {
try {
previousData = JSON.parse(readFileSync(dataJsonPath, "utf-8")) as DataFormat;
} catch {
// Ignore parse errors — first run or corrupted file
}
}

// Invoke all scrapers to build the data format
const fmt: DataFormat = {
scrapedAt: new Date().toISOString(),
vendors: {},
models: {},
imageModels: {},
Expand All @@ -31,14 +126,19 @@ async function main() {
scrapeOpenaiData(fmt),
scrapeDeepseekData(fmt),
scrapeForexData(),
scrapeAzureData(fmt),
// Image generation scrapers
scrapeAwsImageData(fmt),
scrapeOpenaiImageData(fmt),
scrapeGcpImageData(fmt),
]);

// Detect price changes vs previous run
if (previousData) {
detectPriceChanges(previousData, fmt);
}

// Output the data as JSON
const selfPath = dirname(fileURLToPath(import.meta.url));
const dataJsonPath = join(selfPath, "..", "public", "data.json");
writeFileSync(dataJsonPath, JSON.stringify(fmt, null, 4), "utf-8");
console.log(`Wrote data to ${dataJsonPath}`);
// Note: tiktoken BPE files are fetched at build time via src/pages/tiktoken/[encoding].tiktoken.ts
Expand Down
2 changes: 1 addition & 1 deletion scraper/scrapers/aws-image.ts
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ export default async function scrapeAwsImageData(fmt: DataFormat) {
}

for (const [_modelId, model] of Object.entries(AWS_IMAGE_MODELS)) {
await addImageModelToFormat(fmt, "aws", "us-east-1", model);
await addImageModelToFormat(fmt, "aws", "us-east-1", model, "hardcoded", "2026-03-20");
}

console.log(
Expand Down
1 change: 1 addition & 0 deletions scraper/scrapers/aws.ts
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ async function processPriceDimension(
latencyMs: perfMetrics?.latencyMs ?? 0,
tokensPerSecond: perfMetrics?.tokensPerSecond ?? 0,
lowCapacity: false,
priceSource: "scraped",
};
modelEntry.vendors.push(vendor);
}
Expand Down
150 changes: 150 additions & 0 deletions scraper/scrapers/azure.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
import type { DataFormat } from "@/src/dataFormat";
import { addModelToFormat, type ModelDefinition } from "../shared";
import {
getModelsForProvider,
getCachedInputCost,
cleanModelName,
type LiteLLMModel,
} from "../litellm";

// Reuse the same display name overrides as the OpenAI scraper for GPT/o-series models
const OPENAI_MODEL_NAME_OVERRIDES: Record<string, string> = {
"gpt-4o": "GPT-4o",
"gpt-4o-mini": "GPT-4o Mini",
"gpt-4-turbo": "GPT-4 Turbo",
"gpt-4": "GPT-4",
"gpt-3.5-turbo": "GPT-3.5 Turbo",
o1: "GPT-o1",
"o1-mini": "GPT-o1 Mini",
o3: "GPT-o3",
"o3-mini": "GPT-o3 Mini",
"o4-mini": "GPT-o4 Mini",
"gpt-4.1": "GPT-4.1",
"gpt-4.1-mini": "GPT-4.1 Mini",
"gpt-4.1-nano": "GPT-4.1 Nano",
"gpt-5": "GPT-5",
};

const MICROSOFT_MODEL_NAME_OVERRIDES: Record<string, string> = {
"phi-4": "Phi-4",
"phi-4-mini": "Phi-4 Mini",
"phi-4-mini-instruct": "Phi-4 Mini",
};

// Prefixes of models to include from Azure
const INCLUDED_MODEL_PREFIXES = [
  "gpt-4o",
  "gpt-4-turbo",
  "gpt-4.1",
  "gpt-4",
  "gpt-5",
  "o1",
  "o3",
  "o4",
  "phi-4",
];

/**
 * Decides whether an Azure model (base ID, "azure/" prefix stripped) should be
 * scraped. Audio/realtime/embedding variants and preview releases are skipped;
 * everything else is included iff it starts with a tracked prefix.
 */
function shouldIncludeModel(baseId: string): boolean {
  const excludedFragments = ["audio", "realtime", "embedding", "preview"];
  if (excludedFragments.some((fragment) => baseId.includes(fragment))) {
    return false;
  }
  return INCLUDED_MODEL_PREFIXES.some((prefix) => baseId.startsWith(prefix));
}

/**
 * Maps a base model ID to its publisher: Phi models belong to Microsoft,
 * every other model scraped from Azure here is OpenAI's.
 */
function getProvider(baseId: string): "OpenAI" | "Microsoft" {
  return baseId.startsWith("phi-") ? "Microsoft" : "OpenAI";
}

function getModelName(baseId: string): string | null {
const provider = getProvider(baseId);

if (provider === "Microsoft") {
// Exact match first
if (MICROSOFT_MODEL_NAME_OVERRIDES[baseId]) {
return MICROSOFT_MODEL_NAME_OVERRIDES[baseId];
}
// Prefix match
for (const [key, name] of Object.entries(MICROSOFT_MODEL_NAME_OVERRIDES)) {
if (baseId.startsWith(key)) return name;
}
return cleanModelName(baseId, "azure");
}

// OpenAI-hosted models on Azure
if (OPENAI_MODEL_NAME_OVERRIDES[baseId]) {
return OPENAI_MODEL_NAME_OVERRIDES[baseId];
}
for (const [key, name] of Object.entries(OPENAI_MODEL_NAME_OVERRIDES)) {
if (baseId.startsWith(key)) return name;
}
return cleanModelName(baseId, "azure");
}

/**
 * Converts a LiteLLM catalogue entry into our internal ModelDefinition.
 * Returns null for entries that cannot be priced or named.
 *
 * NOTE(review): the truthiness check also drops models whose token cost is
 * exactly 0 (free tiers) — presumably intentional for a pricing site; confirm.
 */
function litellmModelToDefinition(modelId: string, model: LiteLLMModel): ModelDefinition | null {
  const hasPricing = model.input_cost_per_token && model.output_cost_per_token;
  if (!hasPricing) {
    return null;
  }

  // Strip the azure/ prefix to get the base model ID
  const baseId = modelId.replace(/^azure\//, "");

  const name = getModelName(baseId);
  if (!name) return null;

  const definition: ModelDefinition = {
    name,
    provider: getProvider(baseId),
    pricing: {
      input: model.input_cost_per_token,
      output: model.output_cost_per_token,
      cachedInput: getCachedInputCost(model),
    },
    maxInputTokens: model.max_input_tokens,
    maxOutputTokens: model.max_output_tokens ?? model.max_tokens,
  };
  return definition;
}

/**
 * Scrapes Azure AI chat-model pricing from the LiteLLM catalogue into `fmt`,
 * then registers the "azure" vendor metadata (region display names, US/EU
 * region lists, learn-more URL). Models are deduplicated by display name so
 * ID aliases (e.g. two Phi-4 Mini variants) are only recorded once.
 */
export default async function scrapeAzureData(fmt: DataFormat) {
  const catalogue = await getModelsForProvider("azure", "chat");
  const seenNames = new Set<string>();

  for (const [fullId, entry] of catalogue) {
    // Strip the azure/ prefix to get the base model ID
    const baseId = fullId.replace(/^azure\//, "");
    if (!shouldIncludeModel(baseId)) continue;

    const definition = litellmModelToDefinition(fullId, entry);
    if (!definition) continue;

    // Deduplicate by model name
    if (seenNames.has(definition.name)) continue;
    seenNames.add(definition.name);

    // Sequential on purpose: addModelToFormat mutates the shared fmt object.
    await addModelToFormat(fmt, "azure", "eastus", definition);
  }

  fmt.vendors["azure"] = {
    cleanName: "Azure AI",
    learnMoreUrl: "https://azure.microsoft.com/en-us/products/ai-services/openai-service",
    euOrUKRegions: ["westeurope", "northeurope", "uksouth", "swedencentral"],
    usaRegions: ["eastus", "eastus2", "westus", "westus3", "northcentralus", "southcentralus"],
    regionCleanNames: {
      "": {
        eastus: "East US (Virginia)",
        eastus2: "East US 2 (Virginia)",
        westus: "West US (California)",
        westus3: "West US 3 (Arizona)",
        northcentralus: "North Central US (Illinois)",
        southcentralus: "South Central US (Texas)",
        westeurope: "West Europe (Netherlands)",
        northeurope: "North Europe (Ireland)",
        uksouth: "UK South (London)",
        swedencentral: "Sweden Central",
        australiaeast: "Australia East (New South Wales)",
        japaneast: "Japan East (Tokyo)",
      },
    },
  };

  console.log(`Finished scraping Azure AI data (${seenNames.size} models from LiteLLM)`);
}
Loading