Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions .github/workflows/dataset-validator[tabular].yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# CI workflow: validates the tabular dataset under data/statistics.
# Triggers on PRs to main that touch the dataset, or manually via
# workflow_dispatch (which lets the caller pick a validation mode).
name: Run Tabular Validator

on:
  pull_request:
    branches:
      - main
    paths:
      # Only run when dataset files actually change.
      - 'data/statistics/**'
  workflow_dispatch:
    inputs:
      mode:
        description: "Validation mode"
        required: false
        default: "tabular"

jobs:
  validate:
    name: Validate Statistics Data [Tabular]
    runs-on: ubuntu-latest

    env:
      # Repository variable overrides the default path; the default is
      # relative to the validator's working directory (scripts/validator).
      DATASET_PATH: ${{ vars.DATASET_DIR_PATH_VAR || '../../data/statistics' }}
      # Manual runs may choose a mode; PR-triggered runs default to tabular.
      MODE: ${{ github.event.inputs.mode || 'tabular' }}

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: pip install -r scripts/validator/requirements.txt

      - name: Run validator
        working-directory: scripts/validator
        run: |
          echo "Running with:"
          echo "Dataset Directory: $DATASET_PATH"
          echo "Validation Mode: $MODE"

          python main.py "$DATASET_PATH" "$MODE"
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,7 @@ ingestion/.env
# Generated zip files (not used by website)
website/static/downloads/archive_Data.zip
website/static/downloads/sources_Data.zip
website/static/downloads/statistics_Data.zip
website/static/downloads/statistics_Data.zip

# Virtual environment
venv/
75 changes: 75 additions & 0 deletions scripts/validator/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Data Validator Program

A command-line tool for validating datasets using configurable validator types.

---

## Getting Started

### Prerequisites

- Python 3.x
- `pip` and `venv`

### Local Setup

```bash
# Navigate to the project directory
cd scripts/validator

# Create a virtual environment
python3 -m venv venv

# Activate the virtual environment
source venv/bin/activate

# Install dependencies
pip install -r requirements.txt
```

### Running the Program

```bash
python main.py <path-to-dataset-directory> <validator-type>
```

**Example:**

```bash
python main.py ../../data/statistics tabular
```

---

## Supported Validators

| Validator | Description | Status |
|-----------|--------------------------|---------|
| `tabular` | Validates tabular data | ✅ Available |

> **Note:** Additional validators are currently under development.

---

## Project Structure

```
scripts/validator/
├── main.py # Entry point
├── requirements.txt # Python dependencies
├── README.md # Project documentation
├── core/
│ ├── baseRunner.py # Base runner logic
│ └── baseValidator.py # Base validator interface
├── models/
│ └── tabularSchema.json # Schema definition for tabular validation
├── utils/
│ └── utils.py # Shared utility functions
└── validators/
└── tabular.py # Tabular validator implementation
```
34 changes: 34 additions & 0 deletions scripts/validator/core/baseRunner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import sys
from pathlib import Path
from utils.utils import Utils

def run_validation(file_path, validator):
    """Validate every ``data.json`` found under *file_path* and exit.

    Walks *file_path* recursively, runs *validator* on each ``data.json``,
    prints every collected error and warning, then terminates the process:
    exit code 1 when any errors were found, 0 otherwise (warnings alone
    still pass, so CI only fails on errors).

    Parameters
    ----------
    file_path : str | Path
        Root directory to search recursively for ``data.json`` files.
    validator : type
        Validator class (e.g. ``TabularValidator``). It is instantiated
        once and reused for every file.
    """
    # Instantiate once (previously this rebound the `validator` parameter,
    # shadowing the class) so per-file runs reuse any loaded state such as
    # a parsed JSON schema.
    validator_instance = validator()
    all_errors = []
    all_warnings = []

    paths = list(Path(file_path).rglob("data.json"))

    if not paths:
        # Nothing to validate is not a failure.
        print("[INFO] No data.json files found")
        sys.exit(0)

    for path in paths:
        errors, warnings = validator_instance.validate(path)
        all_errors.extend(errors)
        all_warnings.extend(warnings)

    if all_errors:
        print(f" - {len(all_errors)} errors found")
        for error in all_errors:
            print(Utils.format_issue(error))

    if all_warnings:
        print(f" - {len(all_warnings)} warnings found")
        for warning in all_warnings:
            print(Utils.format_issue(warning))

    if not all_errors and not all_warnings:
        print("All data valid ✅")

    # Non-zero exit makes CI fail on errors; warnings alone exit 0.
    sys.exit(1 if all_errors else 0)
3 changes: 3 additions & 0 deletions scripts/validator/core/baseValidator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
class BaseValidator:
    """Abstract interface that every concrete validator implements.

    Subclasses override :meth:`validate` and return a pair of lists,
    ``(errors, warnings)``, describing issues found in one data file.
    """

    def validate(self, file_path):
        """Validate *file_path*; must be overridden by subclasses."""
        raise NotImplementedError
16 changes: 16 additions & 0 deletions scripts/validator/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from core.baseRunner import run_validation
from validators.tabular import TabularValidator
import sys

def main(file_path, validator):
    """Run the validator named by *validator* against *file_path*.

    Only the ``tabular`` validator exists today; any other name prints
    an error and exits with status 1.
    """
    # Guard clause: reject unknown validator names up front.
    if validator != "tabular":
        print("Invalid validator")
        sys.exit(1)
    run_validation(file_path, TabularValidator)

if __name__ == "__main__":
    # CLI entry point: python main.py <directory> <validator>
    if len(sys.argv) < 3:
        print("Usage: python main.py <directory> <validator>")
        sys.exit(1)
    _, dataset_dir, validator_name = sys.argv[:3]
    main(dataset_dir, validator_name)
22 changes: 22 additions & 0 deletions scripts/validator/models/tabularSchema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"type": "object",
"required": ["columns", "rows"],
"properties": {
"columns": {
"type": "array",
"items": {
"type": "string"
},
"minItems": 1
},
"rows": {
"type": "array",
"items": {
"type": "array",
"items": {},
"minItems": 1
}
}
},
"additionalProperties": false
}
2 changes: 2 additions & 0 deletions scripts/validator/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
jsonschema>=4.25,<5

23 changes: 23 additions & 0 deletions scripts/validator/utils/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
class Utils:
    """Shared helpers for formatting validator output."""

    @staticmethod
    def format_issue(issue):
        """Render a single issue as a one-line report string.

        *issue* is normally a dict with keys ``type``, ``file``, ``row``,
        ``column`` and ``message``. Validators may also emit pre-formatted
        strings (e.g. the tabular validator's JSON/schema failure paths);
        those are returned unchanged — previously they crashed this
        function with an AttributeError on ``.get``.
        """
        if isinstance(issue, str):
            # Already formatted upstream; pass straight through.
            return issue

        # Build the location from whichever parts are present, joining
        # with ", " — the old += approach produced a stray leading comma
        # when a column was given without a row.
        parts = []

        if issue.get("row") is not None:
            parts.append(f"Row {issue['row']}")

        if issue.get("column"):
            # handle list of columns OR single column
            if isinstance(issue["column"], list):
                cols = ", ".join(issue["column"])
                parts.append(f"Columns [{cols}]")
            else:
                parts.append(f"Column '{issue['column']}'")

        location = ", ".join(parts)

        return f"[{issue['type'].upper()}] {issue['file']}: {location} {issue['message']}"

    @staticmethod
    def fits_in_int32(value: int) -> bool:
        """Return True when *value* fits in a signed 32-bit integer
        (the range of a PostgreSQL ``integer`` column)."""
        INT32_MIN = -2_147_483_648
        INT32_MAX = 2_147_483_647
        return INT32_MIN <= value <= INT32_MAX
139 changes: 139 additions & 0 deletions scripts/validator/validators/tabular.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
from core.baseValidator import BaseValidator
import json
from jsonschema import validate, ValidationError
from collections import Counter
from utils.utils import Utils
from pathlib import Path

class TabularValidator(BaseValidator):
    """Validator for tabular ``data.json`` files.

    Document shape (enforced by ``models/tabularSchema.json``): an object
    with ``columns`` (non-empty list of strings) and ``rows`` (list of
    non-empty lists). Beyond the schema, this reports duplicate column
    names, rows whose width differs from the header, and per-column type
    mismatches (the first row defines each column's type) as errors, plus
    empty values and integers that overflow PostgreSQL's 32-bit integer
    type as warnings. ``validate`` returns ``(errors, warnings)`` — two
    lists of issue dicts in the shape ``Utils.format_issue`` expects.
    """

    def __init__(self):
        # The schema ships in ../models relative to this module, so the
        # validator works regardless of the current working directory.
        with open(Path(__file__).parent / "../models/tabularSchema.json") as f:
            self.schema = json.load(f)

    def _check_duplicate_columns(self, file_path, columns):
        """Return one error listing every column name used more than once."""
        column_counts = Counter(columns)
        duplicates = [col for col, count in column_counts.items() if count > 1]
        if duplicates:
            return [
                {
                    "type": "error",
                    "file": file_path,
                    "row": None,
                    "column": duplicates,
                    "message": f"Duplicate column names found: {', '.join(duplicates)}",
                }
            ]
        return []

    def _check_row_column_mismatch(self, file_path, i, row, num_cols):
        """Return an error when row *i* is wider or narrower than the header."""
        if len(row) != num_cols:
            return [{
                "type": "error",
                "file": file_path,
                "row": i,
                "column": None,
                # The row can have too MANY values as well as too few, so
                # the message no longer claims it has "only" N values.
                "message": f"has {len(row)} value(s), expected {num_cols} value(s)",
            }]
        return []

    def _check_data_types(self, file_path, i, row, first_row, columns):
        """Flag values whose type differs from the first row's value in
        the same column."""
        errors = []
        # Clamp to the shortest structure: over-long rows are already
        # reported by _check_row_column_mismatch, and indexing past
        # first_row/columns here used to raise IndexError.
        bound = min(len(row), len(first_row), len(columns))
        for j in range(bound):
            value = row[j]
            reference = first_row[j]
            if reference is None:
                # A null in the first row gives no type to compare against;
                # _check_empty_values still warns about nulls.
                continue
            expected_type = type(reference)
            if expected_type is float:
                # Whole numbers are acceptable in float columns.
                allowed_types = (float, int)
                expected_msg = "float or whole number"
            else:
                # Real one-element tuple; the original `(expected_type)`
                # was just the bare class, which isinstance happens to accept.
                allowed_types = (expected_type,)
                expected_msg = expected_type.__name__

            # NOTE(review): bool is a subclass of int, so booleans pass in
            # int columns — confirm whether that is intended.
            if not isinstance(value, allowed_types):
                errors.append({
                    "type": "error",
                    "file": file_path,
                    "row": i,
                    "column": columns[j],
                    "message": f"has {value} ({type(value).__name__}), expected {expected_msg}",
                })
        return errors

    def _check_empty_values(self, file_path, i, row, columns):
        """Warn for values that are None or blank/whitespace-only strings."""
        warnings = []
        # Clamp so over-long rows cannot index past the header.
        bound = min(len(row), len(columns))
        for j in range(bound):
            value = row[j]
            str_value = str(value).strip() if value is not None else ""
            if str_value == "":
                warnings.append({
                    "type": "warning",
                    "file": file_path,
                    "row": i,
                    "column": columns[j],
                    "message": "has empty value",
                })
        return warnings

    def _check_value_overflow(self, file_path, i, row, columns):
        """Warn for integers outside the signed 32-bit range (PostgreSQL
        ``integer`` columns cannot store them)."""
        warnings = []
        # Clamp so over-long rows cannot index past the header.
        bound = min(len(row), len(columns))
        for j in range(bound):
            value = row[j]
            if isinstance(value, int) and not Utils.fits_in_int32(value):
                warnings.append({
                    "type": "warning",
                    "file": file_path,
                    "row": i,
                    "column": columns[j],
                    "message": f"has {value} ({type(value).__name__}), this is a big integer in postgres , consider when inserting into the database (postgres has 32 bit integer limit)",
                })
        return warnings

    def validate(self, file_path):
        """Validate one data.json file.

        Returns
        -------
        tuple[list, list]
            ``(errors, warnings)`` — issue dicts for ``Utils.format_issue``.
        """
        errors = []
        warnings = []

        # Load JSON. Failures are returned as issue dicts (not bare
        # strings) so the formatter renders them like every other issue.
        try:
            with open(file_path) as f:
                data = json.load(f)
        except (json.JSONDecodeError, FileNotFoundError) as e:
            return [{
                "type": "error",
                "file": file_path,
                "row": None,
                "column": None,
                "message": f"Invalid JSON ({e})",
            }], []

        # 1. Schema validation — stop early; later checks assume the shape.
        try:
            validate(instance=data, schema=self.schema)
        except ValidationError as e:
            return [{
                "type": "error",
                "file": file_path,
                "row": None,
                "column": None,
                "message": f"Schema error → {e.message}",
            }], []

        # 2. Custom validation
        columns = data.get("columns", [])
        rows = data.get("rows", [])
        num_cols = len(columns)

        if not rows or not columns:
            # Report missing rows first when both are absent. Location
            # fields stay None — the old code put the whole rows/columns
            # lists there, producing garbled messages.
            message = "No rows found" if not rows else "No columns found"
            errors.append({
                "type": "error",
                "file": file_path,
                "row": None,
                "column": None,
                "message": message,
            })
            return errors, warnings

        errors.extend(self._check_duplicate_columns(file_path, columns))

        # The first row defines each column's expected type.
        first_row = rows[0]
        for i, row in enumerate(rows):
            errors.extend(self._check_row_column_mismatch(file_path, i, row, num_cols))
            warnings.extend(self._check_empty_values(file_path, i, row, columns))
            warnings.extend(self._check_value_overflow(file_path, i, row, columns))
            errors.extend(self._check_data_types(file_path, i, row, first_row, columns))

        return errors, warnings


Loading