From 58721863c55d72a881b4bab89f0ca58e35e7f020 Mon Sep 17 00:00:00 2001 From: Jared Smith Date: Sat, 30 May 2026 16:02:27 -0700 Subject: [PATCH] updating generation script --- scripts/generate-datasets.mjs | 284 ++++------------------------------ 1 file changed, 27 insertions(+), 257 deletions(-) diff --git a/scripts/generate-datasets.mjs b/scripts/generate-datasets.mjs index b2eb48b..083022f 100644 --- a/scripts/generate-datasets.mjs +++ b/scripts/generate-datasets.mjs @@ -4,280 +4,50 @@ import { fileURLToPath } from 'url'; const __dirname = path.dirname(fileURLToPath(import.meta.url)); const projectRoot = path.resolve(__dirname, '..'); -const agmlSource = process.env.AGML_SOURCE || path.resolve(projectRoot, '../.external/AgML'); -const agmlHubSource = process.env.AGML_HUB || path.resolve(projectRoot, '../.external/AgML-Hub'); -const assetsDir = path.join(agmlSource, 'agml', '_assets'); -const datasetsDir = path.join(agmlSource, 'docs', 'datasets'); -const sourceCitationsPath = path.join(assetsDir, 'source_citations.json'); -const outPath = path.join(projectRoot, 'static', 'data', 'datasets.json'); -const hfDatasetsPath = path.join(agmlHubSource, 'frontend', 'public', 'hf_datasets.json'); -const hfOutPath = path.join(projectRoot, 'static', 'data', 'hf_datasets.json'); +const staticDataDir = path.join(projectRoot, 'static', 'data'); +const datasetsPath = path.join(staticDataDir, 'datasets.json'); +const hfDatasetsPath = path.join(staticDataDir, 'hf_datasets.json'); -const SKIP_NAMES = new Set(['iNatAg', 'iNatAg-mini']); - -const KEY_MAP = { - 'Machine Learning Task': 'machine_learning_task', - 'Agricultural Task': 'agricultural_task', - 'Location': 'location', - 'Sensor Modality': 'sensor_modality', - 'Real or Synthetic': 'real_or_synthetic', - 'Platform': 'platform', - 'Input Data Format': 'input_data_format', - 'Annotation Format': 'annotation_format', - 'Number of Images': 'num_images', - 'Documentation': 'documentation', - 'Classes': 'classes', - 'Stats/Mean': 'stats_mean', - 'Stats/Standard Deviation': 'stats_std', -}; - -function stripBackticks(value) { - return String(value).trim().replace(/^`|`$/g, ''); -} - -function parseTableValue(rawKey, value) { - const v = String(value).trim(); - if (rawKey === 'Documentation' && (v === '' || v.toLowerCase() === 'none')) return null; - if (rawKey === 'Number of Images') { - const n = parseInt(v.replace(/,/g, ''), 10); - return Number.isFinite(n) ? n : null; - } - if (rawKey === 'Stats/Mean' || rawKey === 'Stats/Standard Deviation') { - const match = v.match(/\[([^\]]+)\]/); - if (!match) return null; - const nums = match[1].split(',').map((x) => parseFloat(x.trim())); - return nums.every(Number.isFinite) ? nums : null; - } - return v || null; -} - -function extractMetadataTable(content) { - const meta = {}; - const lines = content.split(/\r?\n/); - let inTable = false; - for (let i = 0; i < lines.length; i += 1) { - const line = lines[i]; - if (line.startsWith('## Dataset Metadata')) { - inTable = true; - continue; - } - if (inTable && line.startsWith('## ')) break; - if (!inTable || !line.startsWith('|')) continue; - const cells = line.split('|').map((c) => c.trim()).filter(Boolean); - if (cells.length < 2) continue; - const key = cells[0].replace(/\*\*/g, '').trim(); - const mapped = KEY_MAP[key]; - if (mapped) { - meta[mapped] = parseTableValue(key, cells[1]); - } - } - return meta; +function readJson(filePath) { + if (!fs.existsSync(filePath)) return null; + return JSON.parse(fs.readFileSync(filePath, 'utf8')); } -function extractNameFromContent(content, fallback) { - const match = content.match(/^#\s*`([^`]+)`/m) || content.match(/^#\s+(.+)$/m); - return match ? stripBackticks(match[1]) : fallback; -} - -function extractExamplesImageUrl(content) { - const match = content.match(/!\[[^\]]*\]\s*\(\s*([^\)\s]+)\s*\)/); - return match ? match[1].trim() : null; -} - -function toRawImageUrl(url) { - if (!url || typeof url !== 'string') return null; - const trimmed = url.trim(); - const blobMatch = trimmed.match(/^https:\/\/github\.com\/([^/]+)\/([^/]+)\/blob\/([^/]+)\/(.+)$/); - if (blobMatch) { - const [, org, repo, branch, assetPath] = blobMatch; - return `https://raw.githubusercontent.com/${org}/${repo}/${branch}/${assetPath}`; - } - if (trimmed.startsWith('../') || trimmed.startsWith('./')) { - const assetPath = trimmed.replace(/^\.\.?\//, ''); - return `https://raw.githubusercontent.com/Project-AgML/AgML/main/docs/${assetPath}`; - } - return trimmed.startsWith('http') ? trimmed : null; -} - -function parseDatasetFile(filePath) { - const raw = fs.readFileSync(filePath, 'utf8'); - const base = path.basename(filePath, '.md'); - if (SKIP_NAMES.has(base)) return null; - const name = extractNameFromContent(raw, base); - const meta = extractMetadataTable(raw); - const examples_image_url = toRawImageUrl(extractExamplesImageUrl(raw)); +function normalizeDataset(entry) { + if (!entry || typeof entry !== 'object' || Array.isArray(entry)) return null; return { - name, - machine_learning_task: meta.machine_learning_task ?? null, - agricultural_task: meta.agricultural_task ?? null, - location: meta.location ?? null, - sensor_modality: meta.sensor_modality ?? null, - real_or_synthetic: meta.real_or_synthetic ?? null, - platform: meta.platform ?? null, - input_data_format: meta.input_data_format ?? null, - annotation_format: meta.annotation_format ?? null, - num_images: meta.num_images ?? null, - documentation: meta.documentation ?? null, - classes: meta.classes ?? null, - stats_mean: meta.stats_mean ?? null, - stats_std: meta.stats_std ?? null, - examples_image_url: examples_image_url ?? null, - source: 'agml', + ...entry, + source: typeof entry.source === 'string' && entry.source.trim() ? entry.source : 'agml', }; } -function loadSourceCitations() { - if (!fs.existsSync(sourceCitationsPath)) return {}; - return JSON.parse(fs.readFileSync(sourceCitationsPath, 'utf8')); +function normalizeManifest(json) { + const records = Array.isArray(json) ? json : json && typeof json === 'object' ? Object.values(json) : []; + return records.map(normalizeDataset).filter((entry) => entry != null); } -function loadInatSourceCitations() { - const merged = {}; - for (const file of ['iNatAg_source_citations.json', 'iNatAg-mini_source_citations.json']) { - const filePath = path.join(assetsDir, file); - if (!fs.existsSync(filePath)) continue; - const data = JSON.parse(fs.readFileSync(filePath, 'utf8')); - Object.assign(merged, data); - } - return merged; -} - -function inatEntryToDataset(name, raw, parentName) { - const loc = raw.location; - const locationStr = loc && typeof loc === 'object' - ? [loc.continent, loc.country].filter(Boolean).join(', ') - : null; - const inputFmt = raw.input_data_format; - const inputDataFormat = Array.isArray(inputFmt) ? (inputFmt[0] ?? null) : inputFmt; - const classesObj = raw.classes; - const classesStr = classesObj && typeof classesObj === 'object' - ? Object.entries(classesObj).map(([k, v]) => `${k}: ${v}`).join('; ') - : null; - const n = raw.n_images; - const numImages = typeof n === 'number' ? n : (typeof n === 'string' ? parseInt(n, 10) : null); - return { - name, - machine_learning_task: raw.ml_task ?? null, - agricultural_task: raw.ag_task ?? null, - location: locationStr || null, - sensor_modality: raw.sensor_modality ?? null, - real_or_synthetic: raw.real_synthetic ?? null, - platform: raw.platform ?? null, - input_data_format: inputDataFormat ?? null, - annotation_format: raw.annotation_format ?? null, - num_images: Number.isFinite(numImages) ? numImages : null, - documentation: raw.docs_url ?? null, - classes: classesStr ?? null, - stats_mean: raw.stats?.mean ?? null, - stats_std: raw.stats?.std ?? null, - examples_image_url: null, - parent_dataset: parentName, - source: 'agml', - }; -} - -function loadInatDatasets() { - const list = []; - const inatCitations = loadInatSourceCitations(); - const sampleCitation = inatCitations['iNatAg/ailanthus_altissima'] ?? null; - - for (const [file, parentName] of [ - ['iNatAg_public_datasources.json', 'iNatAg'], - ['iNatAg-mini_public_datasources.json', 'iNatAg-mini'], - ]) { - const filePath = path.join(assetsDir, file); - if (!fs.existsSync(filePath)) continue; - const data = JSON.parse(fs.readFileSync(filePath, 'utf8')); - if (typeof data !== 'object') continue; - - let totalImages = 0; - const platforms = new Set(); - let sampleEntry = null; - - for (const [name, raw] of Object.entries(data)) { - list.push(inatEntryToDataset(name, raw, parentName)); - const n = raw.n_images; - totalImages += typeof n === 'number' ? n : (typeof n === 'string' ? parseInt(n, 10) || 0 : 0); - if (raw.platform) platforms.add(raw.platform); - if (!sampleEntry) sampleEntry = raw; - } - - const mdPath = path.join(datasetsDir, `${parentName}.md`); - let examplesImageUrl = null; - if (fs.existsSync(mdPath)) { - const rawMd = fs.readFileSync(mdPath, 'utf8'); - const match = rawMd.match(/!\[[^\]]*\]\s*\(\s*([^\)\s]+)\s*\)/); - if (match) examplesImageUrl = toRawImageUrl(match[1]); - } - - list.unshift({ - name: parentName, - machine_learning_task: sampleEntry?.ml_task ?? 'image_classification', - agricultural_task: sampleEntry?.ag_task ?? 'image_classification', - location: 'worldwide', - sensor_modality: sampleEntry?.sensor_modality ?? 'rgb', - real_or_synthetic: sampleEntry?.real_synthetic ?? 'real', - platform: Array.from(platforms).join(', ') || (sampleEntry?.platform ?? null), - input_data_format: null, - annotation_format: sampleEntry?.annotation_format ?? 'directory_names', - num_images: totalImages || null, - documentation: sampleEntry?.docs_url ?? 'https://www.inaturalist.org/', - classes: null, - stats_mean: null, - stats_std: null, - examples_image_url: examplesImageUrl, - license: sampleCitation?.license ?? null, - citation: sampleCitation?.citation ?? null, - parent_dataset: null, - source: 'agml', - }); - } - - return list; +function writeJson(filePath, value) { + fs.mkdirSync(path.dirname(filePath), { recursive: true }); + fs.writeFileSync(filePath, JSON.stringify(value, null, 2), 'utf8'); } async function generateDatasets() { - let list = []; - if (fs.existsSync(datasetsDir)) { - const files = fs.readdirSync(datasetsDir).filter((f) => f.endsWith('.md') || f.endsWith('.mdx')); - for (const file of files) { - const entry = parseDatasetFile(path.join(datasetsDir, file)); - if (entry) list.push(entry); - } - } - - list = list.concat(loadInatDatasets()); - list.sort((a, b) => a.name.localeCompare(b.name)); - - const citationsByKey = loadSourceCitations(); - Object.assign(citationsByKey, loadInatSourceCitations()); - for (const entry of list) { - const key = entry.name in citationsByKey ? entry.name : entry.name.replace(/-/g, '_'); - const info = citationsByKey[key]; - if (info) { - entry.license = info.license ?? null; - entry.citation = info.citation ?? null; - } else if (!entry.license && !entry.citation) { - entry.license = null; - entry.citation = null; - } - } + const datasets = normalizeManifest(readJson(datasetsPath)); + const hfDatasetsRaw = readJson(hfDatasetsPath); + const hfDatasets = Array.isArray(hfDatasetsRaw) + ? hfDatasetsRaw.map((entry) => ({ ...entry, source: 'huggingface' })) + : hfDatasetsRaw; - fs.mkdirSync(path.dirname(outPath), { recursive: true }); - fs.writeFileSync(outPath, JSON.stringify(list, null, 2), 'utf8'); - console.log('Wrote', outPath, '—', list.length, 'datasets'); + writeJson(datasetsPath, datasets); + console.log('Wrote', datasetsPath, '—', datasets.length, 'datasets'); - if (fs.existsSync(hfDatasetsPath)) { - const hfDatasets = JSON.parse(fs.readFileSync(hfDatasetsPath, 'utf8')); - const hfWithSource = Array.isArray(hfDatasets) - ? hfDatasets.map((entry) => ({ ...entry, source: 'huggingface' })) - : hfDatasets; - fs.writeFileSync(hfOutPath, JSON.stringify(hfWithSource, null, 2), 'utf8'); - console.log('Copied', hfOutPath); + if (hfDatasetsRaw != null) { + writeJson(hfDatasetsPath, hfDatasets); + console.log('Wrote', hfDatasetsPath); } } generateDatasets().catch((err) => { console.error(err); process.exit(1); -}); +}); \ No newline at end of file