diff --git a/packages/ai/src/prompts/extract-labs.ts b/packages/ai/src/prompts/extract-labs.ts index 3e4541d..920491e 100644 --- a/packages/ai/src/prompts/extract-labs.ts +++ b/packages/ai/src/prompts/extract-labs.ts @@ -1,44 +1,41 @@ -export const extractLabsPrompt = `You are a medical lab report parser. Given the text of a lab report, extract all test results as structured data. +export const extractLabsPrompt = `You are a medical lab report parser. Extract ALL test results from the given lab report text as structured JSON. -IMPORTANT: Some documents contain results from MULTIPLE dates (e.g., "Result Trends" or longitudinal reports with columns for different dates). In these cases, emit one result object per analyte per date. Each result MUST include the correct "observedAt" date for that specific value, NOT just a single collection date. +CRITICAL RULES: +1. Extract EVERY SINGLE test result — do NOT skip any. Count them. If the document has 40 results, output 40 results. +2. Output analyte names in STANDARD ENGLISH regardless of document language. +3. For non-English documents: translate the analyte name. Examples: + - "Glucoză" → "Glucose", "Insulină" → "Insulin", "Trigliceride" → "Triglycerides" + - "Colesterol total" → "Total Cholesterol", "HDL colesterol" → "HDL Cholesterol" + - "TSH (hormon hipofizar...)" → "TSH", "FT4 (tiroxina liberă)" → "Free T4" + - "Hematii" → "RBC", "Leucocite" → "WBC", "Trombocite" → "Platelets" + - "Fier seric" → "Iron", "Zinc seric" → "Zinc", "Cortizol seric" → "Cortisol" + - "Proteina C reactivă" → "CRP", "Homocisteină" → "Homocysteine" + - "Hemoglobină glicozilată / HbA1c" → "HbA1c" + - "Ac. anti tireoperoxidază (TPO)" → "TPO Antibodies" +4. Include CBC components: Hemoglobin, Hematocrit, RBC, WBC, Platelets, MCV, MCH, MCHC, RDW, and ALL differential counts (Neutrophils, Lymphocytes, Monocytes, Eosinophils, Basophils — both absolute and percentage). +5. Include hormones: TSH, Free T4, Free T3, Total T3, Total T4, Insulin, Cortisol, Testosterone, DHEA-S, Estradiol, etc. +6. Include vitamins/minerals: Vitamin D, Vitamin B12, Iron, Ferritin, Zinc, Magnesium, Calcium, Folate, etc. +7. When duplicate units exist for the same analyte (e.g., mg/dL AND mmol/L), extract ONLY the first/primary unit row. +8. For date: use the RECOLTAT/collection date from the header, not antecedent dates. -For the same analyte, you may see multiple rows with different reference ranges — each row corresponds to a different lab or date. Only emit a result where a value is actually present. Skip rows with no value. +For each result extract: +- analyte: Standard English name +- value: Numeric value (null if non-numeric) +- valueText: Value as written +- unit: Unit of measurement +- referenceRangeLow: Lower bound (numeric, null if not applicable) +- referenceRangeHigh: Upper bound (numeric, null if not applicable) +- referenceRangeText: Range as written +- isAbnormal: true if outside range +- observedAt: Collection date (ISO YYYY-MM-DD) -For each test result, extract: -- analyte: The normalized name of the test (e.g., "ALT" not "ALT (SGPT)", "AST" not "AST (SGOT)"). Drop parenthetical synonyms. -- value: The numeric value (or text if non-numeric like "Negative") -- valueText: The value as written in the document -- unit: The unit of measurement (normalized — e.g., "IU/L" not "U/L") -- referenceRangeLow: Lower bound of normal range (if provided, numeric) -- referenceRangeHigh: Upper bound of normal range (if provided, numeric) -- referenceRangeText: Full reference range text as written -- isAbnormal: Whether the result is flagged as abnormal (H, L, High, Low, or outside range) -- observedAt: The date THIS SPECIFIC result was collected/observed, in ISO format (YYYY-MM-DD). This is critical for multi-date documents — use the column date header, not a single report date. - -Respond with a JSON object: +Output JSON: { - "patientName": "", - "collectionDate": "", - "reportDate": "", - "labName": "", - "results": [ - { - "analyte": "Glucose", - "value": 95, - "valueText": "95", - "unit": "mg/dL", - "referenceRangeLow": 70, - "referenceRangeHigh": 100, - "referenceRangeText": "70-100 mg/dL", - "isAbnormal": false, - "observedAt": "2024-06-28" - } - ] + "patientName": "...", + "collectionDate": "YYYY-MM-DD", + "reportDate": "YYYY-MM-DD", + "labName": "...", + "results": [...] } -Rules: -- Extract EVERY test result. Do not skip any. -- If a value is non-numeric (e.g., "Reactive", "Negative"), set value to null and put the text in valueText. -- For multi-date trend reports: emit one result per value cell. A table with 5 date columns and 10 analytes could produce up to 50 results. -- When the same analyte appears on multiple rows (with different reference ranges), use the reference range from that specific row. -- Pay close attention to table structure: values are aligned under date column headers.`; +BEFORE RESPONDING: Scan the entire document and count how many distinct test results exist. Your results array must contain ALL of them. Missing results is a failure.`; diff --git a/packages/database/src/seed/data/metric-definitions.ts b/packages/database/src/seed/data/metric-definitions.ts index 84d1046..36c5a6a 100644 --- a/packages/database/src/seed/data/metric-definitions.ts +++ b/packages/database/src/seed/data/metric-definitions.ts @@ -1043,6 +1043,22 @@ export const metricDefinitionSeeds: MetricDefinitionSeed[] = [ displayPrecision: 1, sortOrder: 111, }, + { + id: "homa_ir", + name: "HOMA-IR", + category: "metabolic", + unit: "", + loincCode: null, + snomedCode: null, + aliases: ["HOMA IR", "HOMA-IR index", "homeostatic model assessment"], + referenceRangeLow: null, + referenceRangeHigh: 2.5, + referenceRangeText: "<2.5 (optimal <1.0)", + description: + "Insulin resistance index calculated from fasting glucose and insulin", + displayPrecision: 2, + sortOrder: 112, + }, { id: "c_peptide", name: "C-Peptide", @@ -1578,7 +1594,12 @@ export const metricDefinitionSeeds: MetricDefinitionSeed[] = [ unit: "K/uL", loincCode: "751-8", snomedCode: null, - aliases: ["ANC", "absolute neutrophil count", "neut abs", "neutrophils absolute"], + aliases: [ + "ANC", + "absolute neutrophil count", + "neut abs", + "neutrophils absolute", + ], referenceRangeLow: 1.8, referenceRangeHigh: 7.7, referenceRangeText: "1.8-7.7 K/uL", @@ -1608,7 +1629,12 @@ export const metricDefinitionSeeds: MetricDefinitionSeed[] = [ unit: "K/uL", loincCode: "731-0", snomedCode: null, - aliases: ["ALC", "absolute lymphocyte count", "lymph abs", "lymphocytes absolute"], + aliases: [ + "ALC", + "absolute lymphocyte count", + "lymph abs", + "lymphocytes absolute", + ], referenceRangeLow: 1.0, referenceRangeHigh: 4.8, referenceRangeText: "1.0-4.8 K/uL", @@ -1638,7 +1664,12 @@ export const metricDefinitionSeeds: MetricDefinitionSeed[] = [ unit: "K/uL", loincCode: "742-7", snomedCode: null, - aliases: ["AMC", "absolute monocyte count", "mono abs", "monocytes absolute"], + aliases: [ + "AMC", + "absolute monocyte count", + "mono abs", + "monocytes absolute", + ], referenceRangeLow: 0.2, referenceRangeHigh: 0.8, referenceRangeText: "0.2-0.8 K/uL", @@ -1668,7 +1699,12 @@ export const metricDefinitionSeeds: MetricDefinitionSeed[] = [ unit: "K/uL", loincCode: "711-2", snomedCode: null, - aliases: ["AEC", "absolute eosinophil count", "eos abs", "eosinophils absolute"], + aliases: [ + "AEC", + "absolute eosinophil count", + "eos abs", + "eosinophils absolute", + ], referenceRangeLow: 0.0, referenceRangeHigh: 0.5, referenceRangeText: "0.0-0.5 K/uL", @@ -1698,7 +1734,12 @@ export const metricDefinitionSeeds: MetricDefinitionSeed[] = [ unit: "K/uL", loincCode: "704-7", snomedCode: null, - aliases: ["ABC", "absolute basophil count", "baso abs", "basophils absolute"], + aliases: [ + "ABC", + "absolute basophil count", + "baso abs", + "basophils absolute", + ], referenceRangeLow: 0.0, referenceRangeHigh: 0.2, referenceRangeText: "0.0-0.2 K/uL", @@ -1730,7 +1771,12 @@ export const metricDefinitionSeeds: MetricDefinitionSeed[] = [ unit: null, loincCode: "1759-0", snomedCode: null, - aliases: ["A/G ratio", "A:G ratio", "albumin to globulin ratio", "AG ratio"], + aliases: [ + "A/G ratio", + "A:G ratio", + "albumin to globulin ratio", + "AG ratio", + ], referenceRangeLow: 1.1, referenceRangeHigh: 2.5, referenceRangeText: "1.1-2.5", @@ -1860,7 +1906,8 @@ export const metricDefinitionSeeds: MetricDefinitionSeed[] = [ referenceRangeLow: 8, referenceRangeHigh: null, referenceRangeText: "> 8% (optimal)", - description: "Omega-3 fatty acid index (EPA + DHA as % of total RBC fatty acids)", + description: + "Omega-3 fatty acid index (EPA + DHA as % of total RBC fatty acids)", displayPrecision: 1, sortOrder: 340, }, @@ -1935,7 +1982,12 @@ export const metricDefinitionSeeds: MetricDefinitionSeed[] = [ unit: "ng/mL", loincCode: "2857-1", snomedCode: null, - aliases: ["PSA", "prostate specific antigen", "total PSA", "prostate-specific antigen"], + aliases: [ + "PSA", + "prostate specific antigen", + "total PSA", + "prostate-specific antigen", + ], referenceRangeLow: 0, referenceRangeHigh: 4.0, referenceRangeText: "< 4.0 ng/mL", @@ -1950,7 +2002,12 @@ export const metricDefinitionSeeds: MetricDefinitionSeed[] = [ unit: "ng/mL", loincCode: "58427-2", snomedCode: null, - aliases: ["anti-Mullerian hormone", "anti-Müllerian hormone", "AMH level", "Mullerian inhibiting substance"], + aliases: [ + "anti-Mullerian hormone", + "anti-Müllerian hormone", + "AMH level", + "Mullerian inhibiting substance", + ], referenceRangeLow: 1.0, referenceRangeHigh: 10.0, referenceRangeText: "1.0-10.0 ng/mL (women, varies by age)", @@ -1999,7 +2056,13 @@ export const metricDefinitionSeeds: MetricDefinitionSeed[] = [ unit: "U/L", loincCode: null, snomedCode: null, - aliases: ["CK", "CPK", "creatine phosphokinase", "creatine kinase total", "total CK"], + aliases: [ + "CK", + "CPK", + "creatine phosphokinase", + "creatine kinase total", + "total CK", + ], referenceRangeLow: 22, referenceRangeHigh: 198, referenceRangeText: "22-198 U/L (men), 22-178 U/L (women)", diff --git a/packages/database/src/seed/data/romanian-lab-supplements.ts b/packages/database/src/seed/data/romanian-lab-supplements.ts new file mode 100644 index 0000000..0ac30e2 --- /dev/null +++ b/packages/database/src/seed/data/romanian-lab-supplements.ts @@ -0,0 +1,274 @@ +/** + * Supplemental seed data for Romanian/European lab format support. + * Applied AFTER base metric-definitions seed. + * Updates aliases, display precision, and adds new metrics. + */ + +/** New metric definitions not in the base seed */ +export const additionalMetricSeeds = [ + { + id: "pdw", + name: "PDW", + category: "hematology", + unit: "%", + loincCode: null, + snomedCode: null, + aliases: ["platelet distribution width"], + referenceRangeLow: 9.0, + referenceRangeHigh: 17.0, + referenceRangeText: "9.0 - 17.0 %", + description: "Platelet distribution width", + displayPrecision: 2, + sortOrder: 510, + }, + { + id: "plateletcrit", + name: "Plateletcrit", + category: "hematology", + unit: "%", + loincCode: null, + snomedCode: null, + aliases: ["PCT", "plateletcrit"], + referenceRangeLow: 0.17, + referenceRangeHigh: 0.35, + referenceRangeText: "0.17 - 0.35 %", + description: "Plateletcrit", + displayPrecision: 3, + sortOrder: 511, + }, + { + id: "p_lcr", + name: "P-LCR", + category: "hematology", + unit: "%", + loincCode: null, + snomedCode: null, + aliases: ["platelet large cell ratio", "P-LCR", "PLCR"], + referenceRangeLow: 13.0, + referenceRangeHigh: 43.0, + referenceRangeText: "13.0 - 43.0 %", + description: "Platelet large cell ratio", + displayPrecision: 2, + sortOrder: 512, + }, + { + id: "rdw_sd", + name: "RDW-SD", + category: "hematology", + unit: "fL", + loincCode: null, + snomedCode: null, + aliases: ["RDW SD", "RDW-SD"], + referenceRangeLow: 35.1, + referenceRangeHigh: 43.9, + referenceRangeText: "35.1 - 43.9 fL", + description: "Red cell distribution width - standard deviation", + displayPrecision: 2, + sortOrder: 513, + }, + { + id: "total_lipids", + name: "Total Lipids", + category: "lipid", + unit: "mg/dL", + loincCode: null, + snomedCode: null, + aliases: ["lipide totale", "total lipids"], + referenceRangeLow: 400, + referenceRangeHigh: 800, + referenceRangeText: "400 - 800 mg/dL", + description: "Total serum lipids", + displayPrecision: 2, + sortOrder: 600, + }, + { + id: "anti_thyroglobulin", + name: "Anti-Thyroglobulin Antibodies", + category: "thyroid", + unit: "IU/mL", + loincCode: null, + snomedCode: null, + aliases: ["anti-tiroglobulina", "Anti-thyroglobulin Antibodies", "ATG", "TgAb"], + referenceRangeLow: null, + referenceRangeHigh: 4.0, + referenceRangeText: "< 4.0 IU/mL", + description: "Anti-thyroglobulin antibodies", + displayPrecision: 2, + sortOrder: 425, + }, + { + id: "vldl_cholesterol", + name: "VLDL Cholesterol", + category: "lipid", + unit: "mg/dL", + loincCode: null, + snomedCode: null, + aliases: ["VLDL", "VLDL-C", "VLDL colesterol"], + referenceRangeLow: 2, + referenceRangeHigh: 38, + referenceRangeText: "2 - 38 mg/dL", + description: "Very low density lipoprotein cholesterol", + displayPrecision: 2, + sortOrder: 205, + }, + { + id: "phosphorus", + name: "Phosphorus", + category: "mineral", + unit: "mg/dL", + loincCode: null, + snomedCode: null, + aliases: ["fosfor", "phosphate", "serum phosphorus"], + referenceRangeLow: 2.5, + referenceRangeHigh: 4.5, + referenceRangeText: "2.5 - 4.5 mg/dL", + description: "Serum phosphorus", + displayPrecision: 2, + sortOrder: 710, + }, + { + id: "mpv", + name: "MPV", + category: "hematology", + unit: "fL", + loincCode: null, + snomedCode: null, + aliases: ["mean platelet volume"], + referenceRangeLow: 9.0, + referenceRangeHigh: 13.0, + referenceRangeText: "9.0 - 13.0 fL", + description: "Mean platelet volume", + displayPrecision: 2, + sortOrder: 509, + }, + { + id: "amylase", + name: "Amylase", + category: "metabolic", + unit: "U/L", + loincCode: null, + snomedCode: null, + aliases: ["alpha-amylase", "amilaza", "Alpha-amylase", "serum amylase"], + referenceRangeLow: 28, + referenceRangeHigh: 100, + referenceRangeText: "28 - 100 U/L", + description: "Serum amylase", + displayPrecision: 2, + sortOrder: 350, + }, + // Immunology / inflammation tests common in Romanian labs + { + id: "aslo", name: "ASLO", category: "inflammation", unit: "IU/mL", + loincCode: null, snomedCode: null, + aliases: ["ASO", "antistreptolysin O", "anti-streptolysin"], + referenceRangeLow: null, referenceRangeHigh: 200, + referenceRangeText: "< 200 IU/mL", + description: "Antistreptolysin O titer", displayPrecision: 2, sortOrder: 810, + }, + { + id: "rheumatoid_factor", name: "Rheumatoid Factor", category: "inflammation", unit: "IU/mL", + loincCode: null, snomedCode: null, + aliases: ["RF", "factor reumatoid", "Rheumatoid Factor"], + referenceRangeLow: null, referenceRangeHigh: 14, + referenceRangeText: "< 14 IU/mL", + description: "Rheumatoid factor", displayPrecision: 2, sortOrder: 811, + }, + { + id: "toxoplasma_igg", name: "Toxoplasma IgG", category: "immunology", unit: "IU/mL", + loincCode: null, snomedCode: null, + aliases: ["Toxoplasma gondii IgG", "Toxoplasma gondii IgG Antibodies"], + referenceRangeLow: null, referenceRangeHigh: null, + referenceRangeText: null, + description: "Toxoplasma gondii IgG antibodies", displayPrecision: 2, sortOrder: 820, + }, + { + id: "toxoplasma_igm", name: "Toxoplasma IgM", category: "immunology", unit: "Index", + loincCode: null, snomedCode: null, + aliases: ["Toxoplasma gondii IgM", "Toxoplasma gondii IgM Antibodies"], + referenceRangeLow: null, referenceRangeHigh: null, + referenceRangeText: null, + description: "Toxoplasma gondii IgM antibodies", displayPrecision: 2, sortOrder: 821, + }, + { + id: "toxocara_igg", name: "Toxocara IgG", category: "immunology", unit: "NTU", + loincCode: null, snomedCode: null, + aliases: ["Toxocara canis IgG", "Toxocara canis IgG Antibodies"], + referenceRangeLow: null, referenceRangeHigh: 9, + referenceRangeText: "< 9 NTU (Negative)", + description: "Toxocara canis IgG antibodies", displayPrecision: 2, sortOrder: 822, + }, + { + id: "hbs_ag", name: "HBs Antigen", category: "immunology", unit: null, + loincCode: null, snomedCode: null, + aliases: ["Ag HBs", "HBsAg", "Ag HBs (screening)", "HBs Ag Qualitative", "Hepatitis B surface antigen"], + referenceRangeLow: null, referenceRangeHigh: null, + referenceRangeText: "Negative", + description: "Hepatitis B surface antigen screening", displayPrecision: 2, sortOrder: 830, + }, + { + id: "anti_hcv", name: "Anti-HCV", category: "immunology", unit: null, + loincCode: null, snomedCode: null, + aliases: ["Anti HCV", "HCV antibodies", "Hepatitis C antibodies"], + referenceRangeLow: null, referenceRangeHigh: null, + referenceRangeText: "Negative", + description: "Hepatitis C virus antibodies", displayPrecision: 2, sortOrder: 831, + }, + { + id: "tibc", name: "TIBC", category: "mineral", unit: "mcg/dL", + loincCode: null, snomedCode: null, + aliases: ["total iron binding capacity", "Capacitatea totala de legare a fierului", "CTLF"], + referenceRangeLow: 225, referenceRangeHigh: 535, + referenceRangeText: "225 - 535 mcg/dL", + description: "Total iron binding capacity", displayPrecision: 2, sortOrder: 712, + }, +]; + +/** + * Alias updates for existing metrics to support Romanian lab naming conventions. + * Key: metric definition id, Value: additional aliases to append. + */ +export const aliasUpdates: Record = { + ast: ["ASAT", "TGO", "TGO (ASAT)", "ASAT (TGO)"], + alt: ["ALAT", "TGP", "TGP (ALAT)", "ALAT (TGP)"], + total_protein: ["Total Serum Proteins", "Serum Proteins", "Proteine totale"], + basophils_pct: ["Basophils (%)", "Baso %", "Basophils%"], + eosinophils_pct: ["Eosinophils (%)", "Eos %", "Eosinophils%"], + lymphocytes_pct: ["Lymphocytes (%)", "Lymph %", "Lymphocytes%"], + monocytes_pct: ["Monocytes (%)", "Mono %", "Monocytes%"], + neutrophils_pct: ["Neutrophils (%)", "Neut %", "Neutrophils%"], + basophils_abs: ["Basophils absolute"], + eosinophils_abs: ["Eosinophils absolute"], + lymphocytes_abs: ["Lymphocytes absolute"], + monocytes_abs: ["Monocytes absolute"], + neutrophils_abs: ["Neutrophils absolute"], +}; + +/** + * Display precision overrides — health data needs more decimals than the default. + */ +export const displayPrecisionOverrides: Record = { + crp: 2, + hs_crp: 2, + tsh: 3, + creatinine: 2, + vitamin_d: 2, + magnesium: 2, + calcium: 2, + uric_acid: 2, + albumin: 2, + total_protein: 2, + hemoglobin: 2, + glucose: 1, + cholesterol_total: 1, + hdl_cholesterol: 1, + ldl_cholesterol: 1, + triglycerides: 1, + iron: 1, + ferritin: 1, + hematocrit: 1, + cortisol: 1, + homocysteine: 2, + insulin: 2, + zinc: 2, + hemoglobin_a1c: 2, +}; diff --git a/packages/ingestion/src/normalizer.integration.test.ts b/packages/ingestion/src/normalizer.integration.test.ts new file mode 100644 index 0000000..4f5f7ff --- /dev/null +++ b/packages/ingestion/src/normalizer.integration.test.ts @@ -0,0 +1,137 @@ +import { describe, it, expect } from 'vitest'; +import { matchMetric, normalizeExtractions } from './normalizer'; +import type { MetricDefinition, UnitConversion, RawExtraction } from './types'; + +/** + * Integration tests using realistic data from Romanian lab PDFs. + * These simulate what the AI parser outputs and verify the normalizer + * handles Romanian lab formats correctly. + */ + +// Subset of real metric definitions matching our seed data +const metrics: MetricDefinition[] = [ + { id: 'glucose', name: 'Glucose', category: 'metabolic', unit: 'mg/dL', aliases: ['blood sugar', 'FBG', 'fasting glucose', 'GLU'], referenceRangeLow: 74, referenceRangeHigh: 106 }, + { id: 'hemoglobin_a1c', name: 'Hemoglobin A1c', category: 'metabolic', unit: '%', aliases: ['HbA1c', 'A1c', 'glycated hemoglobin'], referenceRangeLow: 4.0, referenceRangeHigh: 6.0 }, + { id: 'insulin', name: 'Insulin', category: 'metabolic', unit: 'uIU/mL', aliases: ['fasting insulin'], referenceRangeLow: 3.0, referenceRangeHigh: 25.0 }, + { id: 'cholesterol_total', name: 'Total Cholesterol', category: 'lipid', unit: 'mg/dL', aliases: [], referenceRangeLow: null, referenceRangeHigh: 200 }, + { id: 'hdl_cholesterol', name: 'HDL Cholesterol', category: 'lipid', unit: 'mg/dL', aliases: ['HDL'], referenceRangeLow: 60, referenceRangeHigh: null }, + { id: 'ldl_cholesterol', name: 'LDL Cholesterol', category: 'lipid', unit: 'mg/dL', aliases: ['LDL'], referenceRangeLow: null, referenceRangeHigh: 100 }, + { id: 'triglycerides', name: 'Triglycerides', category: 'lipid', unit: 'mg/dL', aliases: ['TG'], referenceRangeLow: null, referenceRangeHigh: 150 }, + { id: 'crp', name: 'CRP', category: 'inflammation', unit: 'mg/L', aliases: ['C-reactive protein'], referenceRangeLow: null, referenceRangeHigh: 5.0 }, + { id: 'ast', name: 'AST', category: 'liver', unit: 'U/L', aliases: ['SGOT', 'ASAT', 'TGO', 'TGO (ASAT)'], referenceRangeLow: 15, referenceRangeHigh: 40 }, + { id: 'alt', name: 'ALT', category: 'liver', unit: 'U/L', aliases: ['SGPT', 'ALAT', 'TGP', 'TGP (ALAT)'], referenceRangeLow: 10, referenceRangeHigh: 40 }, + { id: 'tsh', name: 'TSH', category: 'thyroid', unit: 'mIU/L', aliases: ['thyroid stimulating hormone'], referenceRangeLow: 0.55, referenceRangeHigh: 4.78 }, + { id: 'vitamin_d', name: 'Vitamin D', category: 'vitamin', unit: 'ng/mL', aliases: ['25-OH vitamin D', 'calcidiol', '25-Hydroxyvitamin D'], referenceRangeLow: 30, referenceRangeHigh: 100 }, + { id: 'wbc', name: 'White Blood Cell Count', category: 'hematology', unit: 'K/uL', aliases: ['WBC', 'leukocytes'], referenceRangeLow: 3.9, referenceRangeHigh: 10.2 }, + { id: 'rbc', name: 'Red Blood Cell Count', category: 'hematology', unit: 'M/uL', aliases: ['RBC', 'erythrocytes'], referenceRangeLow: 4.3, referenceRangeHigh: 5.75 }, + { id: 'platelets', name: 'Platelet Count', category: 'hematology', unit: 'K/uL', aliases: ['PLT', 'thrombocytes', 'Platelets'], referenceRangeLow: 150, referenceRangeHigh: 370 }, + { id: 'neutrophils_pct', name: 'Neutrophils %', category: 'hematology', unit: '%', aliases: ['Neutrophils (%)', 'Neut %', 'Neutrophils%'], referenceRangeLow: 42, referenceRangeHigh: 77 }, + { id: 'neutrophils_abs', name: 'Neutrophils (Absolute)', category: 'hematology', unit: 'K/uL', aliases: ['Neutrophils absolute', 'Neutrophils abs'], referenceRangeLow: 1.5, referenceRangeHigh: 7.7 }, +]; + +const conversions: UnitConversion[] = [ + { fromUnit: 'mg/dL', toUnit: 'mg/L', metricCode: 'crp', multiplier: 10, offset: 0 }, + { fromUnit: '/mm³', toUnit: 'K/uL', metricCode: 'wbc', multiplier: 0.001, offset: 0 }, + { fromUnit: '/mm³', toUnit: 'M/uL', metricCode: 'rbc', multiplier: 0.000001, offset: 0 }, + { fromUnit: '/mm³', toUnit: 'K/uL', metricCode: 'platelets', multiplier: 0.001, offset: 0 }, + { fromUnit: '/mm³', toUnit: 'K/uL', metricCode: 'neutrophils_abs', multiplier: 0.001, offset: 0 }, + { fromUnit: 'mU/L', toUnit: 'uIU/mL', metricCode: 'insulin', multiplier: 1, offset: 0 }, + { fromUnit: 'μUI/mL', toUnit: 'mIU/L', metricCode: null, multiplier: 1, offset: 0 }, + { fromUnit: 'μg/dL', toUnit: 'mcg/dL', metricCode: null, multiplier: 1, offset: 0 }, + { fromUnit: 'mg/dl', toUnit: 'mg/dL', metricCode: null, multiplier: 1, offset: 0 }, + { fromUnit: '10^3/ul', toUnit: 'K/uL', metricCode: null, multiplier: 1, offset: 0 }, + { fromUnit: '10^6/uL', toUnit: 'M/uL', metricCode: null, multiplier: 1, offset: 0 }, +]; + +describe('Bioclinica PDF (Dec 2025) - realistic AI output', () => { + // Simulates what Gemini Flash extracts from a Bioclinica Romanian PDF + const bioclinicaExtractions: RawExtraction[] = [ + { analyte: 'CRP', value: 0.05, valueText: '< 0.050', unit: 'mg/dL', referenceRangeLow: null, referenceRangeHigh: 0.33, referenceRangeText: '≤ 0.330 mg/dL', isAbnormal: false, observedAt: '2025-12-09' }, + { analyte: 'Hemoglobin', value: 16.2, valueText: '16.2', unit: 'g/dL', referenceRangeLow: 13.5, referenceRangeHigh: 17.2, referenceRangeText: null, isAbnormal: false, observedAt: '2025-12-09' }, + { analyte: 'HbA1c', value: 5.2, valueText: '5.2', unit: '%', referenceRangeLow: 4.0, referenceRangeHigh: 6.0, referenceRangeText: null, isAbnormal: false, observedAt: '2025-12-09' }, + { analyte: 'Glucose', value: 81, valueText: '81', unit: 'mg/dL', referenceRangeLow: 74, referenceRangeHigh: 106, referenceRangeText: null, isAbnormal: false, observedAt: '2025-12-09' }, + { analyte: 'Insulin', value: 4.2, valueText: '4.2', unit: 'mU/L', referenceRangeLow: 3.0, referenceRangeHigh: 25.0, referenceRangeText: null, isAbnormal: false, observedAt: '2025-12-09' }, + { analyte: 'Total Cholesterol', value: 200, valueText: '200', unit: 'mg/dL', referenceRangeLow: null, referenceRangeHigh: 200, referenceRangeText: null, isAbnormal: false, observedAt: '2025-12-09' }, + { analyte: 'HDL Cholesterol', value: 44, valueText: '44', unit: 'mg/dL', referenceRangeLow: 60, referenceRangeHigh: null, referenceRangeText: null, isAbnormal: true, observedAt: '2025-12-09' }, + { analyte: 'TSH', value: 1.824, valueText: '1.824', unit: 'μUI/mL', referenceRangeLow: 0.55, referenceRangeHigh: 4.78, referenceRangeText: null, isAbnormal: false, observedAt: '2025-12-09' }, + { analyte: 'WBC', value: 5830, valueText: '5830', unit: '/mm³', referenceRangeLow: 3900, referenceRangeHigh: 10200, referenceRangeText: null, isAbnormal: false, observedAt: '2025-12-09' }, + { analyte: 'RBC', value: 5600000, valueText: '5600000', unit: '/mm³', referenceRangeLow: 4300000, referenceRangeHigh: 5750000, referenceRangeText: null, isAbnormal: false, observedAt: '2025-12-09' }, + { analyte: 'Platelets', value: 235000, valueText: '235000', unit: '/mm³', referenceRangeLow: 150000, referenceRangeHigh: 370000, referenceRangeText: null, isAbnormal: false, observedAt: '2025-12-09' }, + { analyte: 'Neutrophils', value: 55.23, valueText: '55.23', unit: '%', referenceRangeLow: 42, referenceRangeHigh: 77, referenceRangeText: null, isAbnormal: false, observedAt: '2025-12-09' }, + { analyte: 'Neutrophils', value: 3220, valueText: '3220', unit: '/mm³', referenceRangeLow: 1500, referenceRangeHigh: 7700, referenceRangeText: null, isAbnormal: false, observedAt: '2025-12-09' }, + ]; + + it('normalizes all Bioclinica extractions', () => { + const result = normalizeExtractions(bioclinicaExtractions, metrics, conversions); + expect(result.normalized.length).toBeGreaterThanOrEqual(11); + expect(result.flagged.length).toBeLessThanOrEqual(2); + }); + + it('converts CRP from mg/dL to mg/L', () => { + const result = normalizeExtractions(bioclinicaExtractions, metrics, conversions); + const crp = result.normalized.find(o => o.metricCode === 'crp'); + expect(crp).toBeDefined(); + expect(crp!.valueNumeric).toBeCloseTo(0.5); + expect(crp!.unit).toBe('mg/L'); + }); + + it('converts WBC from /mm³ to K/uL', () => { + const result = normalizeExtractions(bioclinicaExtractions, metrics, conversions); + const wbc = result.normalized.find(o => o.metricCode === 'wbc'); + expect(wbc).toBeDefined(); + expect(wbc!.valueNumeric).toBeCloseTo(5.83); + expect(wbc!.unit).toBe('K/uL'); + }); + + it('converts RBC from /mm³ to M/uL', () => { + const result = normalizeExtractions(bioclinicaExtractions, metrics, conversions); + const rbc = result.normalized.find(o => o.metricCode === 'rbc'); + expect(rbc).toBeDefined(); + expect(rbc!.valueNumeric).toBeCloseTo(5.6); + }); + + it('converts TSH from μUI/mL to mIU/L', () => { + const result = normalizeExtractions(bioclinicaExtractions, metrics, conversions); + const tsh = result.normalized.find(o => o.metricCode === 'tsh'); + expect(tsh).toBeDefined(); + expect(tsh!.valueNumeric).toBeCloseTo(1.824); + }); + + it('converts Insulin from mU/L to uIU/mL', () => { + const result = normalizeExtractions(bioclinicaExtractions, metrics, conversions); + const ins = result.normalized.find(o => o.metricCode === 'insulin'); + expect(ins).toBeDefined(); + expect(ins!.valueNumeric).toBeCloseTo(4.2); + }); + + it('disambiguates Neutrophils % vs absolute by unit', () => { + const result = normalizeExtractions(bioclinicaExtractions, metrics, conversions); + const neutPct = result.normalized.find(o => o.metricCode === 'neutrophils_pct'); + const neutAbs = result.normalized.find(o => o.metricCode === 'neutrophils_abs'); + expect(neutPct).toBeDefined(); + expect(neutPct!.valueNumeric).toBeCloseTo(55.23); + expect(neutAbs).toBeDefined(); + expect(neutAbs!.valueNumeric).toBeCloseTo(3.22); + }); + + it('flags HDL as abnormal (44 < 60)', () => { + const result = normalizeExtractions(bioclinicaExtractions, metrics, conversions); + const hdl = result.normalized.find(o => o.metricCode === 'hdl_cholesterol'); + expect(hdl).toBeDefined(); + expect(hdl!.isAbnormal).toBe(true); + }); +}); + +describe('Medisim PDF format - case-insensitive units', () => { + const medisimExtractions: RawExtraction[] = [ + { analyte: 'Glucose', value: 95.4, valueText: '95.40', unit: 'mg/dl', referenceRangeLow: 70, referenceRangeHigh: 115, referenceRangeText: null, isAbnormal: false, observedAt: '2024-05-13' }, + { analyte: 'Total Cholesterol', value: 201.97, valueText: '201.97', unit: 'mg/dl', referenceRangeLow: null, referenceRangeHigh: 201.1, referenceRangeText: null, isAbnormal: true, observedAt: '2024-05-13' }, + ]; + + it('handles mg/dl → mg/dL case conversion', () => { + const result = normalizeExtractions(medisimExtractions, metrics, conversions); + expect(result.normalized).toHaveLength(2); + const glucose = result.normalized.find(o => o.metricCode === 'glucose'); + expect(glucose!.valueNumeric).toBeCloseTo(95.4); + }); +}); diff --git a/packages/ingestion/src/normalizer.test.ts b/packages/ingestion/src/normalizer.test.ts new file mode 100644 index 0000000..071e3c1 --- /dev/null +++ b/packages/ingestion/src/normalizer.test.ts @@ -0,0 +1,210 @@ +import { describe, it, expect } from 'vitest'; +import { matchMetric, convertUnit, normalizeExtractions } from './normalizer'; +import type { MetricDefinition, UnitConversion, RawExtraction } from './types'; + +const testMetrics: MetricDefinition[] = [ + { id: 'glucose', name: 'Glucose', category: 'metabolic', unit: 'mg/dL', aliases: ['blood sugar', 'FBG', 'fasting glucose'], referenceRangeLow: 74, referenceRangeHigh: 106 }, + { id: 'hemoglobin', name: 'Hemoglobin', category: 'hematology', unit: 'g/dL', aliases: ['Hgb', 'Hb'], referenceRangeLow: 13.5, referenceRangeHigh: 17.2 }, + { id: 'ast', name: 'AST', category: 'liver', unit: 'U/L', aliases: ['SGOT', 'ASAT', 'TGO', 'TGO (ASAT)'], referenceRangeLow: 15, referenceRangeHigh: 40 }, + { id: 'alt', name: 'ALT', category: 'liver', unit: 'U/L', aliases: ['SGPT', 'ALAT', 'TGP', 'TGP (ALAT)'], referenceRangeLow: 10, referenceRangeHigh: 40 }, + { id: 'neutrophils_pct', name: 'Neutrophils %', category: 'hematology', unit: '%', aliases: ['Neutrophils (%)', 'Neut %'], referenceRangeLow: 42, referenceRangeHigh: 77 }, + { id: 'neutrophils_abs', name: 'Neutrophils (Absolute)', category: 'hematology', unit: 'K/uL', aliases: ['Neutrophils absolute'], referenceRangeLow: 1.5, referenceRangeHigh: 7.7 }, + { id: 'basophils_pct', name: 'Basophils %', category: 'hematology', unit: '%', aliases: ['Basophils (%)', 'Baso %'], referenceRangeLow: null, referenceRangeHigh: 1.75 }, + { id: 'basophils_abs', name: 'Basophils (Absolute)', category: 'hematology', unit: 'K/uL', aliases: ['Basophils absolute'], referenceRangeLow: null, referenceRangeHigh: 0.2 }, + { id: 'crp', name: 'CRP', category: 'inflammation', unit: 'mg/L', aliases: ['C-reactive protein'], referenceRangeLow: null, referenceRangeHigh: 5.0 }, + { id: 'total_protein', name: 'Total Protein', category: 'metabolic', unit: 'g/dL', aliases: ['TP', 'Total Serum Proteins', 'Proteine totale'], referenceRangeLow: 5.7, referenceRangeHigh: 8.2 }, + { id: 'insulin', name: 'Insulin', category: 'metabolic', unit: 'uIU/mL', aliases: ['fasting insulin'], referenceRangeLow: 3.0, referenceRangeHigh: 25.0 }, + { id: 'wbc', name: 'White Blood Cell Count', category: 'hematology', unit: 'K/uL', aliases: ['WBC', 'leukocytes'], referenceRangeLow: 3.9, referenceRangeHigh: 10.2 }, +]; + +const testConversions: UnitConversion[] = [ + { fromUnit: 'mg/dL', toUnit: 'mg/L', metricCode: 'crp', multiplier: 10, offset: 0 }, + { fromUnit: '/mm³', toUnit: 'K/uL', metricCode: 'wbc', multiplier: 0.001, offset: 0 }, + { fromUnit: 'mU/L', toUnit: 'uIU/mL', metricCode: 'insulin', multiplier: 1, offset: 0 }, + { fromUnit: 'mg/dl', toUnit: 'mg/dL', metricCode: null, multiplier: 1, offset: 0 }, +]; + +describe('matchMetric', () => { + it('matches by exact id', () => { + expect(matchMetric('glucose', testMetrics)?.id).toBe('glucose'); + }); + + it('matches by exact name', () => { + expect(matchMetric('Glucose', testMetrics)?.id).toBe('glucose'); + }); + + it('matches by alias', () => { + expect(matchMetric('TGO', testMetrics)?.id).toBe('ast'); + expect(matchMetric('TGP (ALAT)', testMetrics)?.id).toBe('alt'); + expect(matchMetric('ASAT', testMetrics)?.id).toBe('ast'); + }); + + it('matches by partial name', () => { + expect(matchMetric('Total Serum Proteins', testMetrics)?.id).toBe('total_protein'); + }); + + it('matches case-insensitively', () => { + expect(matchMetric('GLUCOSE', testMetrics)?.id).toBe('glucose'); + expect(matchMetric('hemoglobin', testMetrics)?.id).toBe('hemoglobin'); + }); + + it('returns null for unknown analytes', () => { + expect(matchMetric('Unobtanium', testMetrics)).toBeNull(); + }); + + it('disambiguates differentials by unit (% → _pct)', () => { + const result = matchMetric('Neutrophils', testMetrics, '%'); + expect(result?.id).toBe('neutrophils_pct'); + }); + + it('disambiguates differentials by unit (absolute → _abs)', () => { + const result = matchMetric('Neutrophils', testMetrics, '/mm³'); + expect(result?.id).toBe('neutrophils_abs'); + }); + + it('disambiguates Basophils by unit', () => { + expect(matchMetric('Basophils', testMetrics, '%')?.id).toBe('basophils_pct'); + expect(matchMetric('Basophils', testMetrics, 'K/uL')?.id).toBe('basophils_abs'); + }); +}); + +describe('convertUnit', () => { + it('returns value as-is when units match', () => { + expect(convertUnit(100, 'mg/dL', 'mg/dL', testConversions)).toBe(100); + }); + + it('returns value for case-insensitive unit match', () => { + expect(convertUnit(100, 'mg/dl', 'mg/dL', testConversions)).toBe(100); + }); + + it('converts with multiplier', () => { + expect(convertUnit(0.05, 'mg/dL', 'mg/L', testConversions, 'crp')).toBeCloseTo(0.5); + }); + + it('converts hematology units', () => { + expect(convertUnit(5830, '/mm³', 'K/uL', testConversions, 'wbc')).toBeCloseTo(5.83); + }); + + it('converts insulin units (1:1)', () => { + expect(convertUnit(4.2, 'mU/L', 'uIU/mL', testConversions, 'insulin')).toBeCloseTo(4.2); + }); + + it('returns null for unknown conversion', () => { + expect(convertUnit(100, 'foo/bar', 'baz/qux', testConversions)).toBeNull(); + }); +}); + +describe('normalizeExtractions', () => { + it('normalizes a simple extraction', () => { + const extractions: RawExtraction[] = [{ + analyte: 'Glucose', + value: 81, + valueText: '81', + unit: 'mg/dL', + referenceRangeLow: 74, + referenceRangeHigh: 106, + referenceRangeText: '74 - 106 mg/dL', + isAbnormal: false, + observedAt: '2025-12-09', + }]; + + const result = normalizeExtractions(extractions, testMetrics, testConversions); + expect(result.normalized).toHaveLength(1); + expect(result.flagged).toHaveLength(0); + expect(result.normalized[0]!.metricCode).toBe('glucose'); + expect(result.normalized[0]!.valueNumeric).toBe(81); + }); + + it('flags unmatched analytes', () => { + const extractions: RawExtraction[] = [{ + analyte: 'Unobtanium', + value: 42, + valueText: '42', + unit: 'mg/dL', + referenceRangeLow: null, + referenceRangeHigh: null, + referenceRangeText: null, + isAbnormal: null, + observedAt: '2025-12-09', + }]; + + const result = normalizeExtractions(extractions, testMetrics, testConversions); + expect(result.normalized).toHaveLength(0); + expect(result.flagged).toHaveLength(1); + expect(result.flagged[0]!.reason).toBe('unmatched_metric'); + }); + + it('converts units during normalization', () => { + const extractions: RawExtraction[] = [{ + analyte: 'CRP', + value: 0.05, + valueText: '0.05', + unit: 'mg/dL', + referenceRangeLow: null, + referenceRangeHigh: 0.33, + referenceRangeText: '< 0.33 mg/dL', + isAbnormal: false, + observedAt: '2025-12-09', + }]; + + const result = normalizeExtractions(extractions, testMetrics, testConversions); + expect(result.normalized).toHaveLength(1); + expect(result.normalized[0]!.metricCode).toBe('crp'); + expect(result.normalized[0]!.valueNumeric).toBeCloseTo(0.5); // 0.05 * 10 + expect(result.normalized[0]!.unit).toBe('mg/L'); + }); + + it('flags when unit conversion not found', () => { + const extractions: RawExtraction[] = [{ + analyte: 'Insulin', + value: 4.2, + valueText: '4.2', + unit: 'unknown_unit', + referenceRangeLow: 3.0, + referenceRangeHigh: 25.0, + referenceRangeText: null, + isAbnormal: false, + observedAt: '2025-12-09', + }]; + + const result = normalizeExtractions(extractions, testMetrics, testConversions); + expect(result.normalized).toHaveLength(0); + expect(result.flagged).toHaveLength(1); + expect(result.flagged[0]!.reason).toBe('ambiguous_unit'); + }); + + it('matches Romanian aliases (TGO → AST)', () => { + const extractions: RawExtraction[] = [{ + analyte: 'TGO', + value: 20, + valueText: '20', + unit: 'U/L', + referenceRangeLow: 15, + referenceRangeHigh: 40, + referenceRangeText: '15 - 40 U/L', + isAbnormal: false, + observedAt: '2025-12-09', + }]; + + const result = normalizeExtractions(extractions, testMetrics, testConversions); + expect(result.normalized).toHaveLength(1); + expect(result.normalized[0]!.metricCode).toBe('ast'); + }); + + it('detects abnormal values', () => { + const extractions: RawExtraction[] = [{ + analyte: 'Glucose', + value: 130, + valueText: '130', + unit: 'mg/dL', + referenceRangeLow: 74, + referenceRangeHigh: 106, + referenceRangeText: null, + isAbnormal: null, + observedAt: '2025-12-09', + }]; + + const result = normalizeExtractions(extractions, testMetrics, testConversions); + expect(result.normalized[0]!.isAbnormal).toBe(true); + }); +}); diff --git a/packages/ingestion/src/normalizer.ts b/packages/ingestion/src/normalizer.ts index 2a34445..e65aa0d 100644 --- a/packages/ingestion/src/normalizer.ts +++ b/packages/ingestion/src/normalizer.ts @@ -1,5 +1,10 @@ -import type { RawExtraction, NormalizedObservation, FlaggedExtraction, NormalizationResult } from './types'; -import { CONFIDENCE_THRESHOLD } from '@openvitals/common'; +import type { + RawExtraction, + NormalizedObservation, + FlaggedExtraction, + NormalizationResult, +} from "./types"; +import { CONFIDENCE_THRESHOLD } from "@openvitals/common"; export interface DemographicRange { sex: string | null; @@ -35,31 +40,60 @@ export interface UnitConversion { export function matchMetric( analyte: string, - metricDefinitions: MetricDefinition[] + metricDefinitions: MetricDefinition[], + unit?: string | null, ): MetricDefinition | null { const lower = analyte.toLowerCase().trim(); // Exact match on id - const exactId = metricDefinitions.find(m => m.id === lower); + const exactId = metricDefinitions.find((m) => m.id === lower); if (exactId) return exactId; // Exact match on name - const exactName = metricDefinitions.find(m => m.name.toLowerCase() === lower); + const exactName = metricDefinitions.find( + (m) => m.name.toLowerCase() === lower, + ); if (exactName) return exactName; // Alias match - const aliasMatch = metricDefinitions.find(m => - m.aliases.some(a => a.toLowerCase() === lower) + const aliasMatch = metricDefinitions.find((m) => + m.aliases.some((a) => a.toLowerCase() === lower), ); if (aliasMatch) return aliasMatch; // Partial match (analyte contains metric name or vice versa) - const partialMatch = metricDefinitions.find(m => - lower.includes(m.name.toLowerCase()) || m.name.toLowerCase().includes(lower) + // When multiple partial matches exist, use unit to disambiguate + const partialMatches = metricDefinitions.filter( + (m) => + lower.includes(m.name.toLowerCase()) || + m.name.toLowerCase().includes(lower), ); - if (partialMatch) return partialMatch; - return null; + if (partialMatches.length === 1) return partialMatches[0]!; + + if (partialMatches.length > 1 && unit) { + const unitLower = unit.toLowerCase(); + const isPercentage = unitLower === "%"; + const isAbsolute = !isPercentage; + + // For differentials: pick _pct for %, _abs for count units + const unitMatch = partialMatches.find((m) => { + if (isPercentage && (m.id.endsWith("_pct") || m.unit === "%")) + return true; + if (isAbsolute && (m.id.endsWith("_abs") || (m.unit && m.unit !== "%"))) + return true; + return false; + }); + if (unitMatch) return unitMatch; + + // If no unit-based match, try matching by unit compatibility + const sameUnit = partialMatches.find( + (m) => m.unit && m.unit.toLowerCase() === unitLower, + ); + if (sameUnit) return sameUnit; + } + + return partialMatches[0] ?? null; } export function convertUnit( @@ -67,23 +101,25 @@ export function convertUnit( fromUnit: string, toUnit: string, conversions: UnitConversion[], - metricCode?: string + metricCode?: string, ): number | null { if (fromUnit.toLowerCase() === toUnit.toLowerCase()) return value; // Try metric-specific conversion first - const specific = conversions.find(c => - c.fromUnit.toLowerCase() === fromUnit.toLowerCase() && - c.toUnit.toLowerCase() === toUnit.toLowerCase() && - c.metricCode === metricCode + const specific = conversions.find( + (c) => + c.fromUnit.toLowerCase() === fromUnit.toLowerCase() && + c.toUnit.toLowerCase() === toUnit.toLowerCase() && + c.metricCode === metricCode, ); if (specific) return value * specific.multiplier + specific.offset; // Try global conversion - const global = conversions.find(c => - c.fromUnit.toLowerCase() === fromUnit.toLowerCase() && - c.toUnit.toLowerCase() === toUnit.toLowerCase() && - c.metricCode === null + const global = conversions.find( + (c) => + c.fromUnit.toLowerCase() === fromUnit.toLowerCase() && + c.toUnit.toLowerCase() === toUnit.toLowerCase() && + c.metricCode === null, ); if (global) return value * global.multiplier + global.offset; @@ -96,7 +132,7 @@ export function convertUnit( */ function findBestDemographicRange( ranges: DemographicRange[], - demographics: UserDemographics + demographics: UserDemographics, ): DemographicRange | null { let bestRange: DemographicRange | null = null; let bestScore = -1; @@ -104,12 +140,19 @@ function findBestDemographicRange( for (const range of ranges) { // Check age bounds if (demographics.ageInYears !== null) { - if (range.ageMin !== null && demographics.ageInYears < range.ageMin) continue; - if (range.ageMax !== null && demographics.ageInYears > range.ageMax) continue; + if (range.ageMin !== null && demographics.ageInYears < range.ageMin) + continue; + if (range.ageMax !== null && demographics.ageInYears > range.ageMax) + continue; } // Check sex match - if (range.sex !== null && demographics.sex !== null && range.sex !== demographics.sex) continue; + if ( + range.sex !== null && + demographics.sex !== null && + range.sex !== demographics.sex + ) + continue; // Score: sex-specific = +2, narrow age band = +1 let score = 0; @@ -129,7 +172,7 @@ function findBestDemographicRange( /** * Resolve reference range with priority: - * 1. Per-observation range from the extraction + * 1. Per-observation range from the extraction (what the lab printed) * 2. Demographic-matched range from reference_ranges table * 3. Metric definition fallback */ @@ -139,7 +182,10 @@ export function resolveReferenceRange( demographics?: UserDemographics | null, ): { low: number | null; high: number | null } { // Priority 1: per-observation range - if (extraction.referenceRangeLow !== null || extraction.referenceRangeHigh !== null) { + if ( + extraction.referenceRangeLow !== null || + extraction.referenceRangeHigh !== null + ) { return { low: extraction.referenceRangeLow, high: extraction.referenceRangeHigh, @@ -147,8 +193,15 @@ export function resolveReferenceRange( } // Priority 2: demographic match - if (demographics && metric.demographicRanges && metric.demographicRanges.length > 0) { - const match = findBestDemographicRange(metric.demographicRanges, demographics); + if ( + demographics && + metric.demographicRanges && + metric.demographicRanges.length > 0 + ) { + const match = findBestDemographicRange( + metric.demographicRanges, + demographics, + ); if (match) { return { low: match.rangeLow, high: match.rangeHigh }; } @@ -161,6 +214,61 @@ export function resolveReferenceRange( }; } +/** + * Resolve canonical reference range for abnormality calculation. + * Reversed priority: canonical sources first, per-PDF as last resort. + * This ensures consistent NORMAL/ABNORMAL status across labs. + */ +export function resolveCanonicalRange( + extraction: RawExtraction, + metric: MetricDefinition, + demographics?: UserDemographics | null, +): { low: number | null; high: number | null } { + // Priority 1: metric definition (the canonical standard) + if (metric.referenceRangeLow !== null || metric.referenceRangeHigh !== null) { + return { + low: metric.referenceRangeLow, + high: metric.referenceRangeHigh, + }; + } + + // Priority 2: demographic match + if ( + demographics && + metric.demographicRanges && + metric.demographicRanges.length > 0 + ) { + const match = findBestDemographicRange( + metric.demographicRanges, + demographics, + ); + if (match) { + return { low: match.rangeLow, high: match.rangeHigh }; + } + } + + // Priority 3: fall back to what the lab printed + return { + low: extraction.referenceRangeLow, + high: extraction.referenceRangeHigh, + }; +} + +/** + * Canonical metric code map: some metric_definitions have duplicate entries + * (e.g., 'hemoglobin_a1c' and 'hba1c'). This ensures we always store + * observations with a single canonical code regardless of which definition matched. + */ +const CANONICAL_CODE_MAP: Record = { + hemoglobin_a1c: "hba1c", + cholesterol_total: "total_cholesterol", + apob: "apolipoprotein_b", + hs_crp: "crp", + c_reactive_protein: "crp", + "25_hydroxyvitamin_d": "vitamin_d", + vitamin_d_25_hydroxyvitamin_d: "vitamin_d", +}; + export function normalizeExtractions( extractions: RawExtraction[], metricDefinitions: MetricDefinition[], @@ -172,57 +280,86 @@ export function normalizeExtractions( const flagged: FlaggedExtraction[] = []; for (const extraction of extractions) { - const metric = matchMetric(extraction.analyte, metricDefinitions); + const metric = matchMetric( + extraction.analyte, + metricDefinitions, + extraction.unit, + ); if (!metric) { flagged.push({ extraction, - reason: 'unmatched_metric', + reason: "unmatched_metric", details: `No metric definition found for analyte: ${extraction.analyte}`, }); continue; } let finalValue = extraction.value; - let finalUnit = extraction.unit ?? metric.unit ?? ''; + let finalUnit = extraction.unit ?? metric.unit ?? ""; // Unit conversion if needed - if (extraction.unit && metric.unit && extraction.unit.toLowerCase() !== metric.unit.toLowerCase()) { - const converted = convertUnit( - extraction.value ?? 0, - extraction.unit, - metric.unit, - unitConversions, - metric.id - ); + if ( + extraction.unit && + metric.unit && + extraction.unit.toLowerCase() !== metric.unit.toLowerCase() + ) { + const converted = + extraction.value !== null + ? convertUnit( + extraction.value, + extraction.unit, + metric.unit, + unitConversions, + metric.id, + ) + : null; if (converted !== null) { finalValue = converted; finalUnit = metric.unit; + } else if (extraction.value === null) { + // Qualitative result (e.g., "< 0.050") — keep null value, adopt target unit + finalUnit = metric.unit; } else { flagged.push({ extraction, - reason: 'ambiguous_unit', + reason: "ambiguous_unit", details: `Cannot convert ${extraction.unit} to ${metric.unit} for ${metric.id}`, }); continue; } } - // Determine abnormality using demographic-aware ranges - const { low: refLow, high: refHigh } = resolveReferenceRange(extraction, metric, demographics); - const isAbnormal = extraction.isAbnormal ?? - (finalValue !== null && refLow !== null && refHigh !== null - ? finalValue < refLow || finalValue > refHigh + // Store lab-reported range on observation (historical record) + const { low: labLow, high: labHigh } = resolveReferenceRange( + extraction, + metric, + demographics, + ); + // Use canonical range for abnormality (consistent across labs) + const { low: canonLow, high: canonHigh } = resolveCanonicalRange( + extraction, + metric, + demographics, + ); + const isAbnormal = + extraction.isAbnormal ?? + (finalValue !== null && (canonLow !== null || canonHigh !== null) + ? (canonLow !== null && finalValue < canonLow) || + (canonHigh !== null && finalValue > canonHigh) : null); + // Resolve to canonical code (e.g., hemoglobin_a1c → hba1c) + const canonicalCode = CANONICAL_CODE_MAP[metric.id] ?? metric.id; + const obs: NormalizedObservation = { - metricCode: metric.id, + metricCode: canonicalCode, category: metric.category as any, valueNumeric: finalValue, valueText: extraction.valueText, unit: finalUnit, - referenceRangeLow: refLow, - referenceRangeHigh: refHigh, + referenceRangeLow: labLow, + referenceRangeHigh: labHigh, referenceRangeText: extraction.referenceRangeText, isAbnormal, observedAt: new Date(extraction.observedAt), @@ -232,7 +369,7 @@ export function normalizeExtractions( if (baseConfidence < CONFIDENCE_THRESHOLD) { flagged.push({ extraction, - reason: 'low_confidence', + reason: "low_confidence", details: `Confidence ${baseConfidence} below threshold ${CONFIDENCE_THRESHOLD}`, }); } diff --git a/packages/ingestion/src/types.ts b/packages/ingestion/src/types.ts index e55d1f2..00df51f 100644 --- a/packages/ingestion/src/types.ts +++ b/packages/ingestion/src/types.ts @@ -1,4 +1,12 @@ -import type { DataCategory, DocumentType } from '@openvitals/common'; +import type { DataCategory, DocumentType } from "@openvitals/common"; + +// Re-export normalizer types so tests can import from './types' +export type { + MetricDefinition, + UnitConversion, + UserDemographics, + DemographicRange, +} from "./normalizer"; export interface RawExtraction { analyte: string; @@ -38,7 +46,11 @@ export interface NormalizedObservation { export interface FlaggedExtraction { extraction: RawExtraction; - reason: 'low_confidence' | 'unmatched_metric' | 'ambiguous_unit' | 'duplicate_candidate'; + reason: + | "low_confidence" + | "unmatched_metric" + | "ambiguous_unit" + | "duplicate_candidate"; details: string; } diff --git a/services/ingestion-worker/src/lib/ai-provider.ts b/services/ingestion-worker/src/lib/ai-provider.ts new file mode 100644 index 0000000..738d025 --- /dev/null +++ b/services/ingestion-worker/src/lib/ai-provider.ts @@ -0,0 +1,56 @@ +/** + * Shared AI model provider for the ingestion worker. + * + * Supports two backends via the AI_PROVIDER env var: + * - "gateway" (default) — uses @ai-sdk/gateway (Vercel AI Gateway) + * - "openrouter" — uses @openrouter/ai-sdk-provider + * + * Both work with the Vercel AI SDK's `generateText()`. + * + * Usage: + * import { getModel, getModelId } from '../lib/ai-provider'; + * const { text } = await generateText({ model: getModel(), ... }); + */ + +import type { LanguageModel } from "ai"; + +const AI_PROVIDER = process.env.AI_PROVIDER ?? "gateway"; +const DEFAULT_MODEL = "anthropic/claude-sonnet-4-20250514"; + +export function getModelId(): string { + return process.env.AI_DEFAULT_MODEL ?? DEFAULT_MODEL; +} + +export function getModel(modelId?: string): LanguageModel { + const id = modelId ?? getModelId(); + + if (AI_PROVIDER === "openrouter") { + // Lazy import to avoid requiring the package when using gateway + // eslint-disable-next-line @typescript-eslint/no-require-imports + const { createOpenRouter } = require("@openrouter/ai-sdk-provider"); + const openrouter = createOpenRouter({ + apiKey: process.env.OPENROUTER_API_KEY, + }); + return openrouter(id); + } + + // Default: @ai-sdk/gateway + // eslint-disable-next-line @typescript-eslint/no-require-imports + const { gateway } = require("@ai-sdk/gateway"); + return gateway(id); +} + +/** + * Get OpenRouter-compatible headers for raw fetch calls. + * Falls back to gateway-compatible headers if using gateway provider. + */ +export function getOpenRouterHeaders(): Record { + return { + "Content-Type": "application/json", + Authorization: `Bearer ${process.env.OPENROUTER_API_KEY ?? process.env.AI_GATEWAY_API_KEY ?? ""}`, + }; +} + +export function getOpenRouterBaseUrl(): string { + return process.env.OPENROUTER_BASE_URL ?? "https://openrouter.ai/api/v1"; +} diff --git a/services/ingestion-worker/src/parsers/lab-pdf.ts b/services/ingestion-worker/src/parsers/lab-pdf.ts index d2f5796..22d7bc5 100644 --- a/services/ingestion-worker/src/parsers/lab-pdf.ts +++ b/services/ingestion-worker/src/parsers/lab-pdf.ts @@ -1,25 +1,34 @@ -import { generateText } from 'ai'; -import { gateway } from '@ai-sdk/gateway'; -import { getDb } from '@openvitals/database/client'; -import { sourceArtifacts } from '@openvitals/database'; -import { eq } from 'drizzle-orm'; -import { createBlobStorage } from '@openvitals/blob-storage'; -import { extractLabsPrompt } from '@openvitals/ai'; -import type { WorkflowContext } from '../workflow'; -import type { ParseResult, RawExtraction } from '@openvitals/ingestion'; +import { generateText } from "ai"; +import { getDb } from "@openvitals/database/client"; +import { sourceArtifacts } from "@openvitals/database"; +import { eq } from "drizzle-orm"; +import { createBlobStorage } from "@openvitals/blob-storage"; +import { extractLabsPrompt } from "@openvitals/ai"; +import type { WorkflowContext } from "../workflow"; +import type { ParseResult, RawExtraction } from "@openvitals/ingestion"; +import { + getModel, + getModelId, + getOpenRouterHeaders, + getOpenRouterBaseUrl, +} from "../lib/ai-provider"; + +const OCR_MODEL = process.env.AI_OCR_MODEL ?? "google/gemini-2.5-flash"; +const MIN_TEXT_LENGTH = 50; // Below this, assume scanned/image PDF export async function parseLabPdf(ctx: WorkflowContext): Promise { const db = getDb(); // Get artifact with extracted text - const [artifact] = await db.select() + const [artifact] = await db + .select() .from(sourceArtifacts) .where(eq(sourceArtifacts.id, ctx.artifactId)) .limit(1); if (!artifact) throw new Error(`Artifact ${ctx.artifactId} not found`); - let textContent = artifact.rawTextExtracted ?? ''; + let textContent = artifact.rawTextExtracted ?? ""; // If no extracted text yet, download and extract if (!textContent) { @@ -34,47 +43,148 @@ export async function parseLabPdf(ctx: WorkflowContext): Promise { } const buffer = Buffer.concat(chunks); - if (artifact.mimeType === 'application/pdf') { - const { extractTextFromPdf } = await import('../lib/pdf'); + if (artifact.mimeType === "application/pdf") { + const { extractTextFromPdf } = await import("../lib/pdf"); textContent = await extractTextFromPdf(buffer); } else { - textContent = buffer.toString('utf-8'); + textContent = buffer.toString("utf-8"); } } - console.log(`[lab-pdf] Extracted ${textContent.length} chars from artifact=${ctx.artifactId}`); + console.log( + `[lab-pdf] Extracted ${textContent.length} chars from artifact=${ctx.artifactId}`, + ); - // Send to AI for structured extraction - const modelId = process.env.AI_DEFAULT_MODEL ?? 'anthropic/claude-sonnet-4-20250514'; - const { text } = await generateText({ - model: gateway(modelId), - system: extractLabsPrompt, - prompt: textContent.slice(0, 30000), - }); + let text: string; + + if (textContent.trim().length >= MIN_TEXT_LENGTH) { + // Digital PDF - use text extraction + AI parsing + const modelId = getModelId(); + const result = await generateText({ + model: getModel(modelId), + system: extractLabsPrompt, + prompt: textContent.slice(0, 30000), + }); + text = result.text; + } else { + // Scanned/image PDF - use vision model with rendered pages + console.log( + `[lab-pdf] Text too short (${textContent.trim().length} chars), using OCR via ${OCR_MODEL}`, + ); + + const storage = createBlobStorage(); + const blob = await storage.download(artifact.blobPath); + const chunks: Uint8Array[] = []; + const reader = blob.data.getReader(); + while (true) { + const { done, value } = await reader.read(); + if (done) break; + if (value) chunks.push(value); + } + const pdfBuffer = Buffer.concat(chunks); + + // Send PDF as base64 directly to vision model via OpenRouter raw API + // Gemini Flash supports PDF files directly — no page rendering needed + const pdfBase64 = pdfBuffer.toString("base64"); + console.log( + `[lab-pdf] Sending ${(pdfBuffer.length / 1024).toFixed(0)}KB PDF to ${OCR_MODEL} for OCR`, + ); + + const ocrResponse = await fetch( + `${getOpenRouterBaseUrl()}/chat/completions`, + { + method: "POST", + headers: getOpenRouterHeaders(), + body: JSON.stringify({ + model: OCR_MODEL, + messages: [ + { + role: "user", + content: [ + { + type: "image_url", + image_url: { + url: `data:application/pdf;base64,${pdfBase64}`, + }, + }, + { + type: "text", + text: + "Extract all lab test results from this scanned lab report. " + + extractLabsPrompt, + }, + ], + }, + ], + temperature: 0, + }), + }, + ); + + const ocrData = await ocrResponse.json(); + if (ocrData.error) { + console.error("[lab-pdf] OCR API error:", ocrData.error); + text = "{}"; + } else { + text = ocrData.choices[0].message.content; + console.log(`[lab-pdf] OCR response: ${text.length} chars`); + } + } let parsed: any; try { - const jsonStr = text.replace(/^```(?:json)?\s*\n?/m, '').replace(/\n?```\s*$/m, '').trim(); + const jsonStr = text + .replace(/^```(?:json)?\s*\n?/m, "") + .replace(/\n?```\s*$/m, "") + .trim(); parsed = JSON.parse(jsonStr); + const analytes = (parsed.results ?? []).map((r: any) => r.analyte); + console.log(`[lab-pdf] AI extracted ${analytes.length} results:`); + for (const r of parsed.results ?? []) { + console.log( + ` - ${r.analyte}: ${r.value} ${r.unit} (range: ${r.referenceRangeLow}-${r.referenceRangeHigh})`, + ); + } } catch { - console.error('[lab-pdf] Failed to parse AI response:', text.slice(0, 300)); - return { extractions: [], rawMetadata: { parser: 'lab-pdf', version: '0.1.0', error: 'parse_failed' } }; + console.error("[lab-pdf] Failed to parse AI response:", text.slice(0, 500)); + return { + extractions: [], + rawMetadata: { + parser: "lab-pdf", + version: "0.1.0", + error: "parse_failed", + }, + }; } - const fallbackDate = parsed.collectionDate ?? new Date().toISOString().split('T')[0]; - - const extractions: RawExtraction[] = (parsed.results ?? []).map((r: any) => ({ - analyte: r.analyte ?? '', - value: typeof r.value === 'number' ? r.value : null, - valueText: r.valueText ?? (r.value != null ? String(r.value) : null), - unit: r.unit ?? null, - referenceRangeLow: typeof r.referenceRangeLow === 'number' ? r.referenceRangeLow : null, - referenceRangeHigh: typeof r.referenceRangeHigh === 'number' ? r.referenceRangeHigh : null, - referenceRangeText: r.referenceRangeText ?? null, - isAbnormal: typeof r.isAbnormal === 'boolean' ? r.isAbnormal : null, - observedAt: r.observedAt ?? fallbackDate, - category: 'lab_result' as const, - })); + const fallbackDate = + parsed.collectionDate ?? new Date().toISOString().split("T")[0]; + + const extractions: RawExtraction[] = (parsed.results ?? []).map((r: any) => { + // Handle "< X" or "> X" values - strip comparator and use the number + let numValue = typeof r.value === "number" ? r.value : null; + const rawText = r.valueText ?? (r.value != null ? String(r.value) : null); + if (numValue === null && rawText) { + const ltMatch = rawText.match(/^[<>≤≥]\s*([\d.,]+)$/); + if (ltMatch) { + numValue = parseFloat(ltMatch[1]!.replace(",", ".")); + } + } + return { + analyte: r.analyte ?? "", + value: numValue, + valueText: rawText, + unit: r.unit ?? null, + referenceRangeLow: + typeof r.referenceRangeLow === "number" ? r.referenceRangeLow : null, + referenceRangeHigh: + typeof r.referenceRangeHigh === "number" ? r.referenceRangeHigh : null, + referenceRangeText: r.referenceRangeText ?? null, + isAbnormal: typeof r.isAbnormal === "boolean" ? r.isAbnormal : null, + observedAt: r.observedAt ?? fallbackDate, + category: "lab_result" as const, + }; + }); return { extractions, @@ -82,6 +192,6 @@ export async function parseLabPdf(ctx: WorkflowContext): Promise { collectionDate: parsed.collectionDate, reportDate: parsed.reportDate, labName: parsed.labName, - rawMetadata: { parser: 'lab-pdf', version: '0.1.0' }, + rawMetadata: { parser: "lab-pdf", version: "0.1.0" }, }; } diff --git a/services/ingestion-worker/src/steps/auto-identify.ts b/services/ingestion-worker/src/steps/auto-identify.ts new file mode 100644 index 0000000..65ff5f0 --- /dev/null +++ b/services/ingestion-worker/src/steps/auto-identify.ts @@ -0,0 +1,233 @@ +import { generateText } from "ai"; +import { getDb } from "@openvitals/database/client"; +import { sql } from "drizzle-orm"; +import { getModel } from "../lib/ai-provider"; +import { metricDefinitions } from "@openvitals/database"; +import type { WorkflowContext } from "../workflow"; +import type { + NormalizationResult, + FlaggedExtraction, + RawExtraction, + NormalizedObservation, +} from "@openvitals/ingestion"; +import { normalizeExtractions } from "@openvitals/ingestion"; +import type { + MetricDefinition, + UnitConversion, + UserDemographics, +} from "@openvitals/ingestion"; + +const IDENTIFY_MODEL = process.env.AI_OCR_MODEL ?? "google/gemini-2.5-flash"; + +interface IdentifiedBiomarker { + analyte: string; + id: string | null; + standardName: string; + loincCode: string | null; + unit: string | null; + rangeLow: number | null; + rangeHigh: number | null; + category: string; +} + +/** + * Auto-identify unmatched biomarkers using LLM. + * Sends unmatched analytes to Gemini Flash which identifies them + * with proper names, LOINC codes, units, and reference ranges. + * Auto-creates metric definitions and re-normalizes. + */ +export async function autoIdentify( + ctx: WorkflowContext, + normResult: NormalizationResult, + metricDefs: MetricDefinition[], + unitConversions: UnitConversion[], + demographics?: UserDemographics | null, +): Promise { + const unmatched = normResult.flagged.filter( + (f) => f.reason === "unmatched_metric", + ); + + if (unmatched.length === 0) { + console.log("[auto-identify] No unmatched items to identify"); + return normResult; + } + + console.log( + `[auto-identify] Sending ${unmatched.length} unmatched items to ${IDENTIFY_MODEL}`, + ); + + // Build the prompt with analyte details + const analyteList = unmatched + .map( + (f) => + `- "${f.extraction.analyte}" (${f.extraction.value ?? f.extraction.valueText ?? "?"} ${f.extraction.unit ?? ""})`, + ) + .join("\n"); + + const prompt = `You are a medical laboratory expert. For each unmatched lab test analyte below, identify it and provide structured information. + +For each analyte, return: +- analyte: the original name (as given) +- id: kebab_case identifier using underscores (e.g., "aslo", "rheumatoid_factor", "toxoplasma_igg"). Set to null if you cannot identify it. +- standardName: Standard English medical name +- loincCode: LOINC code if you know it (null otherwise) +- unit: Standard unit of measurement (null for qualitative tests) +- rangeLow: Lower bound of normal reference range for adults (null if not applicable) +- rangeHigh: Upper bound of normal reference range for adults (null if not applicable) +- category: One of: hematology, metabolic, lipid, thyroid, hormone, vitamin, mineral, inflammation, immunology, liver, renal, cardiac, urine + +Respond ONLY with a JSON array. No markdown fences. + +Analytes: +${analyteList}`; + + try { + const result = await generateText({ + model: getModel(IDENTIFY_MODEL), + prompt, + temperature: 0, + }); + + const aiText = result.text; + const jsonStr = aiText + .replace(/^```(?:json)?\s*\n?/m, "") + .replace(/\n?```\s*$/m, "") + .trim(); + const identified: IdentifiedBiomarker[] = JSON.parse(jsonStr); + + console.log( + `[auto-identify] LLM identified ${identified.filter((i) => i.id).length}/${identified.length} items`, + ); + + // Create new metric definitions for identified items + const db = getDb(); + const newMetricDefs: MetricDefinition[] = []; + const resolvedExtractions: RawExtraction[] = []; + const remainingFlagged: FlaggedExtraction[] = []; + + // Keep non-unmatched flags as-is + const otherFlagged = normResult.flagged.filter( + (f) => f.reason !== "unmatched_metric", + ); + + for (const flagged of unmatched) { + const match = identified.find( + (i) => i.analyte === flagged.extraction.analyte && i.id !== null, + ); + + if (!match || !match.id) { + remainingFlagged.push(flagged); + continue; + } + + // Validate LLM-returned id format to prevent injection/overwrites + if (!/^[a-z][a-z0-9_]{0,63}$/.test(match.id)) { + console.warn(`[auto-identify] Skipping unsafe id: ${match.id}`); + remainingFlagged.push(flagged); + continue; + } + + // Check if metric already exists + const existing = metricDefs.find( + (m) => + m.id === match.id || + m.name.toLowerCase() === match.standardName.toLowerCase(), + ); + + if (!existing) { + // Create new metric definition + try { + await db + .insert(metricDefinitions) + .values({ + id: match.id, + name: match.standardName, + category: match.category, + unit: match.unit, + loincCode: match.loincCode, + aliases: [flagged.extraction.analyte], + referenceRangeLow: match.rangeLow, + referenceRangeHigh: match.rangeHigh, + referenceRangeText: + match.rangeLow != null || match.rangeHigh != null + ? `${match.rangeLow ?? "?"} - ${match.rangeHigh ?? "?"} ${match.unit ?? ""}` + : null, + description: match.standardName, + displayPrecision: 2, + sortOrder: 900, + }) + .onConflictDoNothing(); + + console.log( + `[auto-identify] Created metric: ${match.id} (${match.standardName})`, + ); + + newMetricDefs.push({ + id: match.id, + name: match.standardName, + category: match.category, + unit: match.unit, + aliases: [flagged.extraction.analyte], + referenceRangeLow: match.rangeLow, + referenceRangeHigh: match.rangeHigh, + }); + } catch (err) { + console.error( + `[auto-identify] Failed to create metric ${match.id}:`, + err, + ); + remainingFlagged.push(flagged); + continue; + } + } else { + // Add alias to existing metric if not present + const existingAliases = existing.aliases ?? []; + if ( + !existingAliases.some( + (a) => a.toLowerCase() === flagged.extraction.analyte.toLowerCase(), + ) + ) { + await db.execute( + sql`UPDATE metric_definitions SET aliases = aliases::jsonb || ${JSON.stringify([flagged.extraction.analyte])}::jsonb WHERE id = ${existing.id}`, + ); + } + newMetricDefs.push(existing); + } + + resolvedExtractions.push(flagged.extraction); + } + + // Re-normalize the resolved extractions with the expanded metric definitions + if (resolvedExtractions.length > 0) { + const allMetrics = [...metricDefs, ...newMetricDefs]; + const reNormResult = normalizeExtractions( + resolvedExtractions, + allMetrics, + unitConversions, + 0.85, + demographics, + ); + + console.log( + `[auto-identify] Re-normalized: ${reNormResult.normalized.length} succeeded, ${reNormResult.flagged.length} still flagged`, + ); + + return { + normalized: [...normResult.normalized, ...reNormResult.normalized], + flagged: [ + ...otherFlagged, + ...remainingFlagged, + ...reNormResult.flagged, + ], + }; + } + + return { + normalized: normResult.normalized, + flagged: [...otherFlagged, ...remainingFlagged], + }; + } catch (err) { + console.error("[auto-identify] Failed:", err); + return normResult; + } +} diff --git a/services/ingestion-worker/src/steps/classify.ts b/services/ingestion-worker/src/steps/classify.ts index 4d8b4e0..851a20f 100644 --- a/services/ingestion-worker/src/steps/classify.ts +++ b/services/ingestion-worker/src/steps/classify.ts @@ -1,5 +1,4 @@ import { generateText } from "ai"; -import { gateway } from "@ai-sdk/gateway"; import { getDb } from "@openvitals/database/client"; import { importJobs, sourceArtifacts } from "@openvitals/database"; import { eq } from "drizzle-orm"; @@ -7,6 +6,7 @@ import { classifyDocumentPrompt } from "@openvitals/ai"; import { createBlobStorage } from "@openvitals/blob-storage"; import type { WorkflowContext } from "../workflow"; import type { ClassificationResult } from "@openvitals/ingestion"; +import { getModel, getModelId } from "../lib/ai-provider"; export async function classify( ctx: WorkflowContext, @@ -108,11 +108,43 @@ export async function classify( .set({ rawTextExtracted: textContent.slice(0, 50000) }) .where(eq(sourceArtifacts.id, ctx.artifactId)); + // If text extraction yielded almost nothing for a PDF, it's likely scanned. + // Auto-classify as lab_report (most common document type for blood work PDFs) + if ( + artifact.mimeType === "application/pdf" && + textContent.trim().length < 50 + ) { + console.log( + `[classify] Scanned PDF detected (${textContent.trim().length} chars), auto-classifying as lab_report`, + ); + const scannedResult: ClassificationResult = { + documentType: "lab_report", + confidence: 0.8, + reasoning: + "Scanned PDF with minimal extractable text - assuming lab report for OCR processing", + }; + await db + .update(importJobs) + .set({ + classifiedType: scannedResult.documentType, + classificationConfidence: scannedResult.confidence, + classifyCompletedAt: new Date(), + }) + .where(eq(importJobs.id, ctx.importJobId)); + await db + .update(sourceArtifacts) + .set({ + classifiedType: scannedResult.documentType, + classificationConfidence: scannedResult.confidence, + }) + .where(eq(sourceArtifacts.id, ctx.artifactId)); + return scannedResult; + } + // Classify with AI - const modelId = - process.env.AI_DEFAULT_MODEL ?? "anthropic/claude-sonnet-4-20250514"; + const modelId = getModelId(); const { text } = await generateText({ - model: gateway(modelId), + model: getModel(modelId), system: classifyDocumentPrompt, prompt: `Document type: ${artifact.mimeType}\nFile name: ${artifact.fileName}\n\nContent:\n${textContent.slice(0, 10000)}`, }); diff --git a/services/ingestion-worker/src/steps/normalize.ts b/services/ingestion-worker/src/steps/normalize.ts index 6aadcac..efb727b 100644 --- a/services/ingestion-worker/src/steps/normalize.ts +++ b/services/ingestion-worker/src/steps/normalize.ts @@ -6,10 +6,17 @@ import type { RawExtraction, NormalizationResult } from '@openvitals/ingestion'; import { normalizeExtractions } from '@openvitals/ingestion'; import type { UserDemographics, DemographicRange } from '@openvitals/ingestion'; +export interface NormalizeOutput { + result: NormalizationResult; + metricDefs: any[]; + unitConversions: any[]; + demographics: any; +} + export async function normalize( ctx: WorkflowContext, extractions: RawExtraction[] -): Promise { +): Promise { const db = getDb(); await db.update(importJobs) @@ -85,5 +92,15 @@ export async function normalize( .set({ normalizeCompletedAt: new Date() }) .where(eq(importJobs.id, ctx.importJobId)); - return normalizeExtractions(extractions, metricDefs, unitConvs, 0.85, demographics); + const result = normalizeExtractions(extractions, metricDefs, unitConvs, 0.85, demographics); + + // Log flagged extractions for debugging + if (result.flagged.length > 0) { + console.log(`[normalize] Flagged ${result.flagged.length} extractions:`); + for (const f of result.flagged) { + console.log(` - "${f.extraction.analyte}" (${f.reason}): ${f.details}`); + } + } + + return { result, metricDefs, unitConversions: unitConvs, demographics }; } diff --git a/services/ingestion-worker/src/workflow.ts b/services/ingestion-worker/src/workflow.ts index 5120790..1c612b9 100644 --- a/services/ingestion-worker/src/workflow.ts +++ b/services/ingestion-worker/src/workflow.ts @@ -1,11 +1,12 @@ -import { getDb } from '@openvitals/database/client'; -import { importJobs } from '@openvitals/database'; -import { eq } from 'drizzle-orm'; -import { emitEvent } from '@openvitals/events'; -import { classify } from './steps/classify'; -import { parse } from './steps/parse'; -import { normalize } from './steps/normalize'; -import { materialize } from './steps/materialize'; +import { getDb } from "@openvitals/database/client"; +import { importJobs } from "@openvitals/database"; +import { eq } from "drizzle-orm"; +import { emitEvent } from "@openvitals/events"; +import { classify } from "./steps/classify"; +import { parse } from "./steps/parse"; +import { normalize } from "./steps/normalize"; +import { autoIdentify } from "./steps/auto-identify"; +import { materialize } from "./steps/materialize"; export interface WorkflowContext { importJobId: string; @@ -20,45 +21,77 @@ export async function processWorkflow(ctx: WorkflowContext): Promise { try { // Step 1: Classify the document const classification = await classify(ctx); - console.log(`[workflow] Classified as ${classification.documentType} (${classification.confidence})`); + console.log( + `[workflow] Classified as ${classification.documentType} (${classification.confidence})`, + ); // If confidence too low, mark for review and stop if (classification.confidence < 0.7) { console.log(`[workflow] Low confidence, marking for review`); - await db.update(importJobs) - .set({ status: 'review_needed', needsReview: true }) + await db + .update(importJobs) + .set({ status: "review_needed", needsReview: true }) .where(eq(importJobs.id, ctx.importJobId)); return; } // Step 2: Parse the document const parseResult = await parse(ctx, classification.documentType); - console.log(`[workflow] Extracted ${parseResult.extractions.length} results`); + console.log( + `[workflow] Extracted ${parseResult.extractions.length} results`, + ); if (parseResult.extractions.length === 0) { - await db.update(importJobs) - .set({ status: 'completed', extractionCount: 0, completedAt: new Date() }) + await db + .update(importJobs) + .set({ + status: "completed", + extractionCount: 0, + completedAt: new Date(), + }) .where(eq(importJobs.id, ctx.importJobId)); return; } // Step 3: Normalize extractions - const normalization = await normalize(ctx, parseResult.extractions); - console.log(`[workflow] Normalized ${normalization.normalized.length}, flagged ${normalization.flagged.length}`); + const { + result: normalization, + metricDefs, + unitConversions, + demographics, + } = await normalize(ctx, parseResult.extractions); + console.log( + `[workflow] Normalized ${normalization.normalized.length}, flagged ${normalization.flagged.length}`, + ); + + // Step 3.5: Auto-identify unmatched items via LLM + const finalNormalization = await autoIdentify( + ctx, + normalization, + metricDefs, + unitConversions, + demographics, + ); + console.log( + `[workflow] After auto-identify: ${finalNormalization.normalized.length} normalized, ${finalNormalization.flagged.length} flagged`, + ); // Step 4: Materialize to database - await materialize(ctx, normalization); - console.log(`[workflow] Materialized. Job complete.`); + await materialize(ctx, finalNormalization); + console.log(`[workflow] Materialized.`); + + console.log(`[workflow] Job complete.`); } catch (error) { const message = error instanceof Error ? error.message : String(error); console.error(`[workflow] Failed for job=${ctx.importJobId}:`, message); - await db.update(importJobs) - .set({ status: 'failed', errorMessage: message }) + await db + .update(importJobs) + .set({ status: "failed", errorMessage: message }) .where(eq(importJobs.id, ctx.importJobId)); emitEvent({ - type: 'import.failed', + type: "import.failed", payload: { importJobId: ctx.importJobId, error: message }, userId: ctx.userId, timestamp: new Date(),