diff --git a/.github/workflows/dataset-validator[tabular].yml b/.github/workflows/dataset-validator[tabular].yml new file mode 100644 index 0000000..9db45a2 --- /dev/null +++ b/.github/workflows/dataset-validator[tabular].yml @@ -0,0 +1,44 @@ +name: Run Tabular Validator + +on: + pull_request: + branches: + - main + paths: + - 'data/statistics/**' + workflow_dispatch: + inputs: + mode: + description: "Validation mode" + required: false + default: "tabular" + +jobs: + validate: + name: Validate Statistics Data [Tabular] + runs-on: ubuntu-latest + + env: + DATASET_PATH: ${{ vars.DATASET_DIR_PATH_VAR || '../../data/statistics' }} + MODE: ${{ github.event.inputs.mode || 'tabular' }} + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Install dependencies + run: pip install -r scripts/validator/requirements.txt + + - name: Run validator + working-directory: scripts/validator + run: | + echo "Running with:" + echo "Dataset Directory: $DATASET_PATH" + echo "Validation Mode: $MODE" + + python main.py "$DATASET_PATH" "$MODE" \ No newline at end of file diff --git a/.gitignore b/.gitignore index 3deeccb..3bef95e 100644 --- a/.gitignore +++ b/.gitignore @@ -15,4 +15,7 @@ ingestion/.env # Generated zip files (not used by website) website/static/downloads/archive_Data.zip website/static/downloads/sources_Data.zip -website/static/downloads/statistics_Data.zip \ No newline at end of file +website/static/downloads/statistics_Data.zip + +# Virtual environment +venv/ \ No newline at end of file diff --git a/scripts/validator/README.md b/scripts/validator/README.md new file mode 100644 index 0000000..f58ab10 --- /dev/null +++ b/scripts/validator/README.md @@ -0,0 +1,75 @@ +# Data Validator Program + +A command-line tool for validating datasets using configurable validator types. 
+
+---
+
+## Getting Started
+
+### Prerequisites
+
+- Python 3.x
+- `pip` and `venv`
+
+### Local Setup
+
+```bash
+# Navigate to the project directory
+cd scripts/validator
+
+# Create a virtual environment
+python3 -m venv venv
+
+# Activate the virtual environment
+source venv/bin/activate
+
+# Install dependencies
+pip install -r requirements.txt
+```
+
+### Running the Program
+
+```bash
+python main.py <dataset_path> <validator>
+```
+
+**Example:**
+
+```bash
+python main.py ../../data/statistics tabular
+```
+
+---
+
+## Supported Validators
+
+| Validator | Description | Status |
+|-----------|--------------------------|---------|
+| `tabular` | Validates tabular data | ✅ Available |
+
+> **Note:** Additional validators are currently under development.
+
+---
+
+## Project Structure
+
+```
+scripts/validator/
+│
+├── main.py              # Entry point
+├── requirements.txt     # Python dependencies
+├── README.md            # Project documentation
+│
+├── core/
+│   ├── baseRunner.py    # Base runner logic
+│   └── baseValidator.py # Base validator interface
+│
+├── models/
+│   └── tabularSchema.json # Schema definition for tabular validation
+│
+├── utils/
+│   └── utils.py         # Shared utility functions
+│
+└── validators/
+    └── tabular.py       # Tabular validator implementation
+```
\ No newline at end of file
diff --git a/scripts/validator/core/baseRunner.py b/scripts/validator/core/baseRunner.py
new file mode 100644
index 0000000..06db53c
--- /dev/null
+++ b/scripts/validator/core/baseRunner.py
@@ -0,0 +1,34 @@
+import sys
+from pathlib import Path
+from utils.utils import Utils
+
+def run_validation(file_path, validator):
+    validator = validator()
+    all_errors = []
+    all_warnings = []
+
+    paths = list(Path(file_path).rglob("data.json"))
+
+    if not paths:
+        print("[INFO] No data.json files found")
+        sys.exit(0)
+
+    for path in paths:
+        errors, warnings = validator.validate(path)
+        all_errors.extend(errors)
+        all_warnings.extend(warnings)
+
+    if all_errors:
+        print(f" - {len(all_errors)} errors found")
+        for error in 
all_errors:
+            print(Utils.format_issue(error))
+
+    if all_warnings:
+        print(f" - {len(all_warnings)} warnings found")
+        for warning in all_warnings:
+            print(Utils.format_issue(warning))
+
+    if not all_errors and not all_warnings:
+        print("All data valid ✅")
+
+    sys.exit(1 if all_errors else 0)
diff --git a/scripts/validator/core/baseValidator.py b/scripts/validator/core/baseValidator.py
new file mode 100644
index 0000000..998523f
--- /dev/null
+++ b/scripts/validator/core/baseValidator.py
@@ -0,0 +1,3 @@
+class BaseValidator:
+    def validate(self, file_path):
+        raise NotImplementedError
\ No newline at end of file
diff --git a/scripts/validator/main.py b/scripts/validator/main.py
new file mode 100644
index 0000000..a9a303d
--- /dev/null
+++ b/scripts/validator/main.py
@@ -0,0 +1,16 @@
+from core.baseRunner import run_validation
+from validators.tabular import TabularValidator
+import sys
+
+def main(file_path, validator):
+    if validator == "tabular":  # dispatch on the validator name given on the CLI
+        run_validation(file_path, TabularValidator)
+    else:
+        print(f"Invalid validator: {validator} (supported: tabular)")
+        sys.exit(1)
+
+if __name__ == "__main__":
+    if len(sys.argv) < 3:
+        print("Usage: python main.py <dataset_path> <validator>")
+        sys.exit(1)
+    main(sys.argv[1], sys.argv[2])
\ No newline at end of file
diff --git a/scripts/validator/models/tabularSchema.json b/scripts/validator/models/tabularSchema.json
new file mode 100644
index 0000000..d3bdb1e
--- /dev/null
+++ b/scripts/validator/models/tabularSchema.json
@@ -0,0 +1,22 @@
+{
+    "type": "object",
+    "required": ["columns", "rows"],
+    "properties": {
+        "columns": {
+            "type": "array",
+            "items": {
+                "type": "string"
+            },
+            "minItems": 1
+        },
+        "rows": {
+            "type": "array",
+            "items": {
+                "type": "array",
+                "items": {},
+                "minItems": 1
+            }
+        }
+    },
+    "additionalProperties": false
+}
\ No newline at end of file
diff --git a/scripts/validator/requirements.txt b/scripts/validator/requirements.txt
new file mode 100644
index 0000000..d07debb
--- /dev/null
+++ b/scripts/validator/requirements.txt
@@ -0,0 +1,2 @@
+jsonschema>=4.25,<5
+
diff --git a/scripts/validator/utils/utils.py b/scripts/validator/utils/utils.py
new file mode 100644
index 0000000..bdcac55
--- /dev/null
+++ b/scripts/validator/utils/utils.py
@@ -0,0 +1,23 @@
+class Utils:
+    @staticmethod
+    def format_issue(issue):
+        parts = []  # location fragments; joined later so separators never dangle
+
+        if issue.get("row") is not None:
+            parts.append(f"Row {issue['row']}")
+
+        if issue.get("column"):
+            # handle list of columns OR single column
+            if isinstance(issue["column"], list):
+                cols = ", ".join(issue["column"])
+                parts.append(f"Columns [{cols}]")
+            else:
+                parts.append(f"Column '{issue['column']}'")
+        # trailing space only when a location exists, so no double space otherwise
+        location = (", ".join(parts) + " ") if parts else ""
+        return f"[{issue['type'].upper()}] {issue['file']}: {location}{issue['message']}"
+
+    @staticmethod
+    def fits_in_int32(value: int) -> bool:
+        INT32_MIN = -2_147_483_648
+        INT32_MAX = 2_147_483_647
+        return INT32_MIN <= value <= INT32_MAX
\ No newline at end of file
diff --git a/scripts/validator/validators/tabular.py b/scripts/validator/validators/tabular.py
new file mode 100644
index 0000000..dd40071
--- /dev/null
+++ b/scripts/validator/validators/tabular.py
@@ -0,0 +1,139 @@
+from core.baseValidator import BaseValidator
+import json
+from jsonschema import validate, ValidationError
+from collections import Counter
+from utils.utils import Utils
+from pathlib import Path
+
+class TabularValidator(BaseValidator):
+    def __init__(self):
+        with open(Path(__file__).parent / "../models/tabularSchema.json") as f:
+            self.schema = json.load(f)
+
+    def _check_duplicate_columns(self, file_path, columns):
+        if len(columns) != len(set(columns)):
+            column_counts = Counter(columns)
+            duplicates = [col for col, count in column_counts.items() if count > 1]
+            if duplicates:
+                return [
+                    {
+                        "type": "error",
+                        "file": file_path,
+                        "row": None,
+                        "column": duplicates,
+                        "message": f"Duplicate column names found: {', '.join(duplicates)}",
+                    }
+                ]
+        return []
+
+    def _check_row_column_mismatch(self, file_path, i, row, num_cols):
+        if len(row) != num_cols:
+            return [{
+                "type": "error",
+
"file": file_path,
+                "row": i,
+                "column": None,
+                "message": f"has only {len(row)} value(s), expected {num_cols} value(s)",
+            }]
+        return []
+
+    def _check_data_types(self, file_path, i, row, first_row, columns):
+        errors = []
+        for j, value in enumerate(row[:min(len(first_row), len(columns))]):  # clamp j: over-long rows are reported by _check_row_column_mismatch
+            expected_type = type(first_row[j])
+            if expected_type is float:
+                allowed_types = (float, int)
+                expected_msg = "float or whole number"
+            else:
+                allowed_types = (expected_type,)
+                expected_msg = expected_type.__name__
+
+            if not isinstance(value, allowed_types):
+                errors.append({
+                    "type": "error",
+                    "file": file_path,
+                    "row": i,
+                    "column": columns[j],
+                    "message": f"has {value} ({type(value).__name__}), expected {expected_msg}",
+                })
+        return errors
+
+    def _check_empty_values(self, file_path, i, row, columns):
+        warnings = []
+        for j, value in enumerate(row[:len(columns)]):  # clamp j so columns[j] cannot overflow
+            str_value = str(value).strip() if value is not None else ""
+            if str_value == "":
+                warnings.append({
+                    "type": "warning",
+                    "file": file_path,
+                    "row": i,
+                    "column": columns[j],
+                    "message": "has empty value",
+                })
+        return warnings
+
+    def _check_value_overflow(self, file_path, i, row, columns):
+        warnings = []
+        for j, value in enumerate(row[:len(columns)]):  # clamp j so columns[j] cannot overflow
+            if isinstance(value, int):
+                if not Utils.fits_in_int32(value):
+                    warnings.append({
+                        "type": "warning",
+                        "file": file_path,
+                        "row": i,
+                        "column": columns[j],
+                        "message": f"has {value} ({type(value).__name__}), this is a big integer in postgres , consider when inserting into the database (postgres has 32 bit integer limit)",
+                    })
+        return warnings
+
+    def validate(self, file_path):
+        errors = []
+        warnings = []
+
+        # Load JSON; issues are dicts so Utils.format_issue can render them
+        try:
+            with open(file_path) as f:
+                data = json.load(f)
+        except (json.JSONDecodeError, FileNotFoundError) as e:
+            return [{"type": "error", "file": file_path, "row": None, "column": None, "message": f"Invalid JSON ({e})"}], []
+
+        # 1. Schema validation
+        try:
+            validate(instance=data, schema=self.schema)
+        except ValidationError as e:
+            return [{"type": "error", "file": file_path, "row": None, "column": None, "message": f"Schema error → {e.message}"}], []
+
+        # 2. Custom validation
+        columns = data.get("columns", [])
+        rows = data.get("rows", [])
+        num_cols = len(columns)
+
+        if not rows or not columns:
+            if not rows and not columns:
+                message = "No rows or columns found"
+            elif not rows:
+                message = "No rows found"
+            else:
+                message = "No columns found"
+
+            errors.extend([{
+                "type": "error",
+                "file": file_path,
+                "row": None,
+                "column": columns if columns else None,
+                "message": message
+            }
+            ])
+            return errors, warnings
+
+        errors.extend(self._check_duplicate_columns(file_path, columns))
+
+        for i, row in enumerate(rows):
+            errors.extend(self._check_row_column_mismatch(file_path, i, row, num_cols))
+            warnings.extend(self._check_empty_values(file_path, i, row, columns))
+            warnings.extend(self._check_value_overflow(file_path, i, row, columns))
+            errors.extend(self._check_data_types(file_path, i, row, rows[0], columns))
+
+        return errors, warnings
+