LDFLK · yasandu0505 · Mar 17, 2026 · Mar 18, 2026 · Mar 19, 2026 · Mar 19, 2026
diff --git a/.github/workflows/dataset-validator[tabular].yml b/.github/workflows/dataset-validator[tabular].yml
@@ -0,0 +1,44 @@
+name: Run Tabular Validator
+
+on:
+  pull_request:
+    branches:
+      - main
+    paths:
+      - 'data/statistics/**'  # only pull requests that touch the statistics datasets trigger this workflow
+  workflow_dispatch:  # manual trigger; "mode" is passed to main.py as the validator name
+    inputs:
+      mode:
+        description: "Validation mode"
+        required: false
+        default: "tabular"
+
+jobs:
+  validate:
+    name: Validate Statistics Data [Tabular]
+    runs-on: ubuntu-latest
+
+    env:
+      DATASET_PATH: ${{ vars.DATASET_DIR_PATH_VAR || '../../data/statistics' }}  # the fallback is relative to scripts/validator, the working directory of the "Run validator" step
+      MODE: ${{ github.event.inputs.mode || 'tabular' }}  # no dispatch input exists on pull_request runs, so they fall back to 'tabular'
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Install dependencies
+        run: pip install -r scripts/validator/requirements.txt  # path is relative to the repository root
+
+      - name: Run validator
+        working-directory: scripts/validator  # main.py and a relative DATASET_PATH are resolved from this directory
+        run: |
+          echo "Running with:"
+          echo "Dataset Directory: $DATASET_PATH"
+          echo "Validation Mode: $MODE"
+
+          python main.py "$DATASET_PATH" "$MODE"
diff --git a/.gitignore b/.gitignore
@@ -15,4 +15,7 @@ ingestion/.env
 # Generated zip files (not used by website)
 website/static/downloads/archive_Data.zip
 website/static/downloads/sources_Data.zip
-website/static/downloads/statistics_Data.zip
+website/static/downloads/statistics_Data.zip
+
+# Virtual environment
+venv/
diff --git a/scripts/validator/README.md b/scripts/validator/README.md
@@ -0,0 +1,75 @@
+# Data Validator Program
+
+A command-line tool for validating datasets using configurable validator types.
+
+---
+
+## Getting Started
+
+### Prerequisites
+
+- Python 3 (the CI workflow runs the validator on Python 3.12)
+- `pip` and `venv`
+
+### Local Setup
+
+```bash
+# Navigate to the project directory
+cd scripts/validator
+
+# Create a virtual environment
+python3 -m venv venv
+
+# Activate the virtual environment
+source venv/bin/activate
+
+# Install dependencies
+pip install -r requirements.txt
+```
+
+### Running the Program
+
+```bash
+python main.py <path-to-dataset-directory> <validator-type>
+```
+
+**Example:**
+
+```bash
+python main.py ../../data/statistics tabular
+```
+
+---
+
+## Supported Validators
+
+| Validator | Description            | Status       |
+|-----------|------------------------|--------------|
+| `tabular` | Validates tabular data | ✅ Available |
+
+> **Note:** Additional validators are currently under development.
+
+---
+
+## Project Structure
+
+```
+scripts/validator/
+│
+├── main.py               # Entry point
+├── requirements.txt      # Python dependencies
+├── README.md             # Project documentation
+│
+├── core/
+│   ├── baseRunner.py     # Base runner logic
+│   └── baseValidator.py  # Base validator interface
+│
+├── models/
+│   └── tabularSchema.json  # Schema definition for tabular validation
+│
+├── utils/
+│   └── utils.py          # Shared utility functions
+│
+└── validators/
+    └── tabular.py        # Tabular validator implementation
+```
diff --git a/scripts/validator/core/baseRunner.py b/scripts/validator/core/baseRunner.py
@@ -0,0 +1,59 @@
+"""Runner that applies one validator to every data.json under a directory."""
+
+import sys
+from pathlib import Path
+
+from utils.utils import Utils
+
+
+def _print_issues(label, issues):
+    """Print the summary line for ``label`` and one line per issue."""
+    print(f" - {len(issues)} {label} found")
+    for issue in issues:
+        print(Utils.format_issue(issue))
+
+
+def run_validation(file_path, validator):
+    """Validate every ``data.json`` under ``file_path``, then exit the process.
+
+    Args:
+        file_path: Directory searched recursively for ``data.json`` files.
+        validator: Validator class. It is instantiated without arguments and
+            its ``validate(path)`` must return an ``(errors, warnings)`` pair.
+
+    Exits with status 1 when ``file_path`` is not a directory or when any
+    error was reported; warnings alone still exit with status 0.
+    """
+    root = Path(file_path)
+    # A mistyped path must fail the run: globbing a directory that does not
+    # exist finds nothing and would end in the "no files" exit with status 0.
+    if not root.is_dir():
+        print(f"[ERROR] Dataset directory not found: {file_path}")
+        sys.exit(1)
+
+    # Sorted so the report order does not depend on directory listing order.
+    paths = sorted(root.rglob("data.json"))
+
+    if not paths:
+        print("[INFO] No data.json files found")
+        sys.exit(0)
+
+    checker = validator()
+    all_errors = []
+    all_warnings = []
+
+    for path in paths:
+        errors, warnings = checker.validate(path)
+        all_errors.extend(errors)
+        all_warnings.extend(warnings)
+
+    if all_errors:
+        _print_issues("errors", all_errors)
+
+    if all_warnings:
+        _print_issues("warnings", all_warnings)
+
+    if not all_errors and not all_warnings:
+        print("All data valid ✅")
+
+    sys.exit(1 if all_errors else 0)
diff --git a/scripts/validator/core/baseValidator.py b/scripts/validator/core/baseValidator.py
@@ -0,0 +1,10 @@
+class BaseValidator:
+    """Common interface of the dataset validators."""
+
+    def validate(self, file_path):
+        """Check the file at ``file_path``.
+
+        Subclasses must override this method; the base version always raises
+        ``NotImplementedError``.
+        """
+        raise NotImplementedError()
diff --git a/scripts/validator/main.py b/scripts/validator/main.py
@@ -0,0 +1,29 @@
+"""Command-line entry point for the dataset validators."""
+
+import sys
+
+from core.baseRunner import run_validation
+from validators.tabular import TabularValidator
+
+
+def main(file_path, validator):
+    """Run the validator named ``validator`` on the directory ``file_path``.
+
+    An unknown name prints "Invalid validator" and exits with status 1.
+    """
+    # Name given on the command line -> validator class.
+    registry = {"tabular": TabularValidator}
+
+    if validator not in registry:
+        print("Invalid validator")
+        sys.exit(1)
+
+    run_validation(file_path, registry[validator])
+
+
+if __name__ == "__main__":
+    arguments = sys.argv
+    if len(arguments) < 3:
+        print("Usage: python main.py <directory> <validator>")
+        sys.exit(1)
+    main(arguments[1], arguments[2])
diff --git a/scripts/validator/models/tabularSchema.json b/scripts/validator/models/tabularSchema.json
@@ -0,0 +1,22 @@
+{
+  "type": "object",
+  "required": ["columns", "rows"],
+  "properties": {
+    "columns": {
+      "type": "array",
+      "items": {
+        "type": "string"
+      },
+      "minItems": 1
+    },
+    "rows": {
+      "type": "array",
+      "items": {
+        "type": "array",
+        "items": {},
+        "minItems": 1
+      }
+    }
+  },
+  "additionalProperties": false
+}
diff --git a/scripts/validator/requirements.txt b/scripts/validator/requirements.txt
@@ -0,0 +1,2 @@
+jsonschema>=4.25,<5
+
diff --git a/scripts/validator/utils/utils.py b/scripts/validator/utils/utils.py
@@ -0,0 +1,49 @@
+class Utils:
+    """Stateless helpers shared by the runner and the validators."""
+
+    INT32_MIN = -2_147_483_648
+    INT32_MAX = 2_147_483_647
+
+    @staticmethod
+    def format_issue(issue):
+        """Render one validation issue as a single printable line.
+
+        Args:
+            issue: A dict with ``type``, ``file`` and ``message`` keys and
+                optional ``row`` and ``column`` (one name or a list of names),
+                or a string that is already formatted.
+
+        Returns:
+            ``[TYPE] file: <location> message``; the location part is left
+            out when the issue carries neither a row nor a column.
+        """
+        # A pre-formatted string has no keys to read, so it is returned as is.
+        if isinstance(issue, str):
+            return issue
+
+        location_parts = []
+
+        row = issue.get("row")
+        if row is not None:
+            location_parts.append(f"Row {row}")
+
+        column = issue.get("column")
+        if column:
+            # handle list of columns OR single column
+            if isinstance(column, list):
+                names = ", ".join(str(name) for name in column)
+                location_parts.append(f"Columns [{names}]")
+            else:
+                location_parts.append(f"Column '{column}'")
+
+        # Joining the parts avoids a leading ", " when only a column is set.
+        location = ", ".join(location_parts)
+        header = f"[{issue['type'].upper()}] {issue['file']}:"
+        if location:
+            return f"{header} {location} {issue['message']}"
+        return f"{header} {issue['message']}"
+
+    @staticmethod
+    def fits_in_int32(value: int) -> bool:
+        """Return True when ``value`` is within the signed 32-bit range."""
+        return Utils.INT32_MIN <= value <= Utils.INT32_MAX
diff --git a/scripts/validator/validators/tabular.py b/scripts/validator/validators/tabular.py
@@ -0,0 +1,169 @@
+"""Validator for tabular ``data.json`` files made of ``columns`` and ``rows``."""
+
+import json
+from collections import Counter
+from pathlib import Path
+
+from jsonschema import validate, ValidationError
+
+from core.baseValidator import BaseValidator
+from utils.utils import Utils
+
+
+class TabularValidator(BaseValidator):
+    """Check one tabular JSON file against the schema and the row rules.
+
+    Every reported issue is a dict with the keys ``type`` ("error" or
+    "warning"), ``file``, ``row``, ``column`` and ``message``.
+    """
+
+    def __init__(self):
+        """Load ``models/tabularSchema.json`` from the validator package."""
+        schema_path = Path(__file__).parent.parent / "models" / "tabularSchema.json"
+        with open(schema_path, encoding="utf-8") as f:
+            self.schema = json.load(f)
+
+    @staticmethod
+    def _issue(kind, file_path, message, row=None, column=None):
+        """Build one issue dict; ``row`` and ``column`` stay None if unknown."""
+        return {
+            "type": kind,
+            "file": file_path,
+            "row": row,
+            "column": column,
+            "message": message,
+        }
+
+    @staticmethod
+    def _matches_type(value, expected_type):
+        """Tell whether ``value`` fits a column whose first row has ``expected_type``.
+
+        bool is a subclass of int in Python, so it is handled explicitly:
+        a JSON ``true``/``false`` must not pass as a number.
+        """
+        if expected_type is bool:
+            return isinstance(value, bool)
+        if isinstance(value, bool):
+            return False
+        if expected_type is float:
+            return isinstance(value, (float, int))
+        return isinstance(value, expected_type)
+
+    def _check_duplicate_columns(self, file_path, columns):
+        """Return one error naming every column that occurs more than once."""
+        duplicates = [col for col, count in Counter(columns).items() if count > 1]
+        if not duplicates:
+            return []
+        message = f"Duplicate column names found: {', '.join(duplicates)}"
+        return [self._issue("error", file_path, message, column=duplicates)]
+
+    def _check_row_column_mismatch(self, file_path, i, row, num_cols):
+        """Return an error when row ``i`` does not hold ``num_cols`` values."""
+        if len(row) == num_cols:
+            return []
+        # "only" is wrong for a row that is too long, so it is added solely
+        # when values are missing.
+        qualifier = "only " if len(row) < num_cols else ""
+        message = f"has {qualifier}{len(row)} value(s), expected {num_cols} value(s)"
+        return [self._issue("error", file_path, message, row=i)]
+
+    def _check_data_types(self, file_path, i, row, first_row, columns):
+        """Return an error for each cell whose type differs from the first row.
+
+        The first row sets the expected type of each column; a float column
+        also accepts whole numbers. Null cells are skipped because
+        ``_check_empty_values`` already reports them, and a null in the first
+        row gives no type to compare against.
+        """
+        errors = []
+        # zip() stops at the shortest input, so a row of the wrong length
+        # (reported by _check_row_column_mismatch) cannot raise IndexError.
+        for column, value, reference in zip(columns, row, first_row):
+            if value is None or reference is None:
+                continue
+            expected_type = type(reference)
+            if self._matches_type(value, expected_type):
+                continue
+            if expected_type is float:
+                expected_msg = "float or whole number"
+            else:
+                expected_msg = expected_type.__name__
+            message = f"has {value} ({type(value).__name__}), expected {expected_msg}"
+            errors.append(self._issue("error", file_path, message, row=i, column=column))
+        return errors
+
+    def _check_empty_values(self, file_path, i, row, columns):
+        """Return a warning for each null or whitespace-only cell of row ``i``."""
+        return [
+            self._issue("warning", file_path, "has empty value", row=i, column=column)
+            for column, value in zip(columns, row)
+            if value is None or str(value).strip() == ""
+        ]
+
+    def _check_value_overflow(self, file_path, i, row, columns):
+        """Return a warning for each integer outside the signed 32-bit range."""
+        warnings = []
+        for column, value in zip(columns, row):
+            # bool is an int subclass; true/false is not a number to range-check.
+            if isinstance(value, bool) or not isinstance(value, int):
+                continue
+            if Utils.fits_in_int32(value):
+                continue
+            message = (
+                f"has {value} ({type(value).__name__}), this is a big integer in postgres , "
+                "consider when inserting into the database (postgres has 32 bit integer limit)"
+            )
+            warnings.append(self._issue("warning", file_path, message, row=i, column=column))
+        return warnings
+
+    def validate(self, file_path):
+        """Validate one dataset file.
+
+        Args:
+            file_path: Path of the JSON file to check.
+
+        Returns:
+            An ``(errors, warnings)`` pair of issue-dict lists. A file that
+            cannot be read or parsed, or that violates the schema, yields a
+            single error and the remaining checks are skipped.
+        """
+        # Load JSON
+        try:
+            with open(file_path, encoding="utf-8") as f:
+                data = json.load(f)
+        except (json.JSONDecodeError, UnicodeDecodeError, OSError) as e:
+            # Issues are dicts, not strings: they are consumed key by key
+            # when the report is printed.
+            return [self._issue("error", file_path, f"Invalid JSON ({e})")], []
+
+        # 1. Schema validation
+        try:
+            validate(instance=data, schema=self.schema)
+        except ValidationError as e:
+            return [self._issue("error", file_path, f"Schema error → {e.message}")], []
+
+        # 2. Custom validation
+        columns = data.get("columns", [])
+        rows = data.get("rows", [])
+
+        if not rows or not columns:
+            if not rows and not columns:
+                message = "No rows or columns found"
+            elif not rows:
+                message = "No rows found"
+            else:
+                message = "No columns found"
+            return [self._issue("error", file_path, message)], []
+
+        errors = self._check_duplicate_columns(file_path, columns)
+        warnings = []
+        num_cols = len(columns)
+        first_row = rows[0]
+
+        for i, row in enumerate(rows):
+            errors.extend(self._check_row_column_mismatch(file_path, i, row, num_cols))
+            warnings.extend(self._check_empty_values(file_path, i, row, columns))
+            warnings.extend(self._check_value_overflow(file_path, i, row, columns))
+            errors.extend(self._check_data_types(file_path, i, row, first_row, columns))
+
+        return errors, warnings