From 24a5d714356e0cd55d99ecfac6c542c43e494801 Mon Sep 17 00:00:00 2001 From: Yasandu Imanjith <89386702+yasandu0505@users.noreply.github.com> Date: Tue, 17 Mar 2026 12:47:59 +0530 Subject: [PATCH 01/10] feat : data validation script --- .gitignore | 5 +- scripts/validator/requirements.txt | 5 ++ scripts/validator/schema.json | 22 +++++++ scripts/validator/validator.py | 98 +++++++++++++++++++++++++++++ 4 files changed, 129 insertions(+), 1 deletion(-) create mode 100644 scripts/validator/requirements.txt create mode 100644 scripts/validator/schema.json create mode 100644 scripts/validator/validator.py diff --git a/.gitignore b/.gitignore index 3deeccb3..3bef95e2 100644 --- a/.gitignore +++ b/.gitignore @@ -15,4 +15,7 @@ ingestion/.env # Generated zip files (not used by website) website/static/downloads/archive_Data.zip website/static/downloads/sources_Data.zip -website/static/downloads/statistics_Data.zip \ No newline at end of file +website/static/downloads/statistics_Data.zip + +# Virtual environment +venv/ \ No newline at end of file diff --git a/scripts/validator/requirements.txt b/scripts/validator/requirements.txt new file mode 100644 index 00000000..79be454f --- /dev/null +++ b/scripts/validator/requirements.txt @@ -0,0 +1,5 @@ +jsonschema>=4.25,<5 + +# python -m venv .venv +# source .venv/bin/activate +# pip install -r requirements.txt diff --git a/scripts/validator/schema.json b/scripts/validator/schema.json new file mode 100644 index 00000000..d3bdb1e3 --- /dev/null +++ b/scripts/validator/schema.json @@ -0,0 +1,22 @@ +{ + "type": "object", + "required": ["columns", "rows"], + "properties": { + "columns": { + "type": "array", + "items": { + "type": "string" + }, + "minItems": 1 + }, + "rows": { + "type": "array", + "items": { + "type": "array", + "items": {}, + "minItems": 1 + } + } + }, + "additionalProperties": false +} \ No newline at end of file diff --git a/scripts/validator/validator.py b/scripts/validator/validator.py new file mode 100644 index 
00000000..ec9224ac --- /dev/null +++ b/scripts/validator/validator.py @@ -0,0 +1,98 @@ +import sys +import json +from pathlib import Path +from jsonschema import validate, ValidationError + +# Load schema once +with open("./schema.json") as f: + SCHEMA = json.load(f) + +def validate_file(file_path): + errors = [] + + # Load JSON + try: + with open(file_path) as f: + data = json.load(f) + except Exception as e: + return [f"[ERROR] {file_path}: Invalid JSON ({e})"] + + # 1. Schema validation + try: + validate(instance=data, schema=SCHEMA) + except ValidationError as e: + return [f"[ERROR] {file_path}: Schema error → {e.message}"] + + # 2. Custom validation -------------------------------------------------------------------------------------------------- + columns = data["columns"] + rows = data["rows"] + + num_cols = len(columns) + + # Check duplicate columns + if len(columns) != len(set(columns)): + errors.append(f"[ERROR] {file_path}: Duplicate column names found") + + # Check rows and columns mismatches + for i, row in enumerate(rows): + if len(row) != num_cols: + errors.append( + f"[ERROR] {file_path}: Row {i} has {len(row)} value(s), expected {num_cols} value(s)" + ) + + # Check data types + # if the column's first value starts from a sepcific data type, all the values down to the end on that column should be of the same data type + for i, row in enumerate(rows): + for j, value in enumerate(row): + expected_type = type(rows[0][j]) + if not isinstance(value, expected_type): + errors.append( + f"[ERROR] {file_path}: Row {i}, Column '{columns[j]}' has {value} ({type(value).__name__}), expected {expected_type.__name__}" + ) + + # Check for empty values (missing values) + for i, row in enumerate(rows): + for j, value in enumerate(row): + if value is None or value == "": + errors.append( + f"[WARNING] {file_path}: Row {i}, Column '{columns[j]}' has empty value" + ) + + # Floats should be in strings as a temporary mitigation + for i, row in enumerate(rows): + for j, 
value in enumerate(row): + if isinstance(value, float): + errors.append( + f"[WARNING] {file_path}: Row {i}, Column '{columns[j]}' has float value {value}. Convert to string. as a temporary mitigation" + ) + + return errors + + +def main(directory): + all_errors = [] + + paths = list(Path(directory).rglob("data.json")) + + if not paths: + print("[INFO] No data.json files found") + sys.exit(0) + + for path in paths: + all_errors.extend(validate_file(path)) + + if all_errors: + print("\n".join(all_errors)) + sys.exit(1) + else: + print("All data valid ✅") + sys.exit(0) + + +if __name__ == "__main__": + # python validator.py {data/statistics} + if len(sys.argv) < 2: + print("Usage: python validate.py ") + sys.exit(1) + + main(sys.argv[1]) \ No newline at end of file From 306b61eb80d18103db17da35d4cdd75cd617328e Mon Sep 17 00:00:00 2001 From: Yasandu Imanjith <89386702+yasandu0505@users.noreply.github.com> Date: Wed, 18 Mar 2026 12:20:29 +0530 Subject: [PATCH 02/10] feat : package the script --- scripts/validator/core/baseRunner.py | 5 + scripts/validator/core/baseValidator.py | 3 + scripts/validator/main.py | 7 ++ .../tabularSchema.json} | 0 scripts/validator/validator.py | 98 ------------------- scripts/validator/validators/tabular.py | 60 ++++++++++++ 6 files changed, 75 insertions(+), 98 deletions(-) create mode 100644 scripts/validator/core/baseRunner.py create mode 100644 scripts/validator/core/baseValidator.py create mode 100644 scripts/validator/main.py rename scripts/validator/{schema.json => models/tabularSchema.json} (100%) delete mode 100644 scripts/validator/validator.py create mode 100644 scripts/validator/validators/tabular.py diff --git a/scripts/validator/core/baseRunner.py b/scripts/validator/core/baseRunner.py new file mode 100644 index 00000000..4f343723 --- /dev/null +++ b/scripts/validator/core/baseRunner.py @@ -0,0 +1,5 @@ +from validator.validators.tabular import TabularValidator + +def run_validation(file_path): + validator = TabularValidator() 
+ return validator.validate(file_path) \ No newline at end of file diff --git a/scripts/validator/core/baseValidator.py b/scripts/validator/core/baseValidator.py new file mode 100644 index 00000000..c7d83a02 --- /dev/null +++ b/scripts/validator/core/baseValidator.py @@ -0,0 +1,3 @@ +class BaseValidator: + def validate(self, data, file_path): + raise NotImplementedError \ No newline at end of file diff --git a/scripts/validator/main.py b/scripts/validator/main.py new file mode 100644 index 00000000..0e4e96a0 --- /dev/null +++ b/scripts/validator/main.py @@ -0,0 +1,7 @@ +from validator.core.runner import run_validation + +def main(): + run_validation("/Users/yasandu/Documents/datasets/data/statistics") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/validator/schema.json b/scripts/validator/models/tabularSchema.json similarity index 100% rename from scripts/validator/schema.json rename to scripts/validator/models/tabularSchema.json diff --git a/scripts/validator/validator.py b/scripts/validator/validator.py deleted file mode 100644 index ec9224ac..00000000 --- a/scripts/validator/validator.py +++ /dev/null @@ -1,98 +0,0 @@ -import sys -import json -from pathlib import Path -from jsonschema import validate, ValidationError - -# Load schema once -with open("./schema.json") as f: - SCHEMA = json.load(f) - -def validate_file(file_path): - errors = [] - - # Load JSON - try: - with open(file_path) as f: - data = json.load(f) - except Exception as e: - return [f"[ERROR] {file_path}: Invalid JSON ({e})"] - - # 1. Schema validation - try: - validate(instance=data, schema=SCHEMA) - except ValidationError as e: - return [f"[ERROR] {file_path}: Schema error → {e.message}"] - - # 2. 
Custom validation -------------------------------------------------------------------------------------------------- - columns = data["columns"] - rows = data["rows"] - - num_cols = len(columns) - - # Check duplicate columns - if len(columns) != len(set(columns)): - errors.append(f"[ERROR] {file_path}: Duplicate column names found") - - # Check rows and columns mismatches - for i, row in enumerate(rows): - if len(row) != num_cols: - errors.append( - f"[ERROR] {file_path}: Row {i} has {len(row)} value(s), expected {num_cols} value(s)" - ) - - # Check data types - # if the column's first value starts from a sepcific data type, all the values down to the end on that column should be of the same data type - for i, row in enumerate(rows): - for j, value in enumerate(row): - expected_type = type(rows[0][j]) - if not isinstance(value, expected_type): - errors.append( - f"[ERROR] {file_path}: Row {i}, Column '{columns[j]}' has {value} ({type(value).__name__}), expected {expected_type.__name__}" - ) - - # Check for empty values (missing values) - for i, row in enumerate(rows): - for j, value in enumerate(row): - if value is None or value == "": - errors.append( - f"[WARNING] {file_path}: Row {i}, Column '{columns[j]}' has empty value" - ) - - # Floats should be in strings as a temporary mitigation - for i, row in enumerate(rows): - for j, value in enumerate(row): - if isinstance(value, float): - errors.append( - f"[WARNING] {file_path}: Row {i}, Column '{columns[j]}' has float value {value}. Convert to string. 
as a temporary mitigation" - ) - - return errors - - -def main(directory): - all_errors = [] - - paths = list(Path(directory).rglob("data.json")) - - if not paths: - print("[INFO] No data.json files found") - sys.exit(0) - - for path in paths: - all_errors.extend(validate_file(path)) - - if all_errors: - print("\n".join(all_errors)) - sys.exit(1) - else: - print("All data valid ✅") - sys.exit(0) - - -if __name__ == "__main__": - # python validator.py {data/statistics} - if len(sys.argv) < 2: - print("Usage: python validate.py ") - sys.exit(1) - - main(sys.argv[1]) \ No newline at end of file diff --git a/scripts/validator/validators/tabular.py b/scripts/validator/validators/tabular.py new file mode 100644 index 00000000..c73f546a --- /dev/null +++ b/scripts/validator/validators/tabular.py @@ -0,0 +1,60 @@ +from validator.core.baseValidator import BaseValidator +import json +from jsonschema import validate, ValidationError + +class TabularValidator(BaseValidator): + def __init__(self): + with open("./models/tabularSchema.json") as f: + self.schema = json.load(f) + + def validate(self, file_path): + errors = [] + + # Load JSON + try: + with open(file_path) as f: + data = json.load(f) + except Exception as e: + return [f"[ERROR] {file_path}: Invalid JSON ({e})"] + + # 1. Schema validation + try: + validate(instance=data, schema=self.schema) + except ValidationError as e: + return [f"[ERROR] {file_path}: Schema error → {e.message}"] + + # 2. 
Custom validation + columns = data["columns"] + rows = data["rows"] + + num_cols = len(columns) + + # Check duplicate columns + if len(columns) != len(set(columns)): + errors.append(f"[ERROR] {file_path}: Duplicate column names found") + + # Check rows and columns mismatches + for i, row in enumerate(rows): + if len(row) != num_cols: + errors.append( + f"[ERROR] {file_path}: Row {i} has {len(row)} value(s), expected {num_cols} value(s)" + ) + + # Check data types + # if the column's first value starts from a sepcific data type, all the values down to the end on that column should be of the same data type + for i, row in enumerate(rows): + for j, value in enumerate(row): + expected_type = type(rows[0][j]) + if not isinstance(value, expected_type): + errors.append( + f"[ERROR] {file_path}: Row {i}, Column '{columns[j]}' has {value} ({type(value).__name__}), expected {expected_type.__name__}" + ) + + # Check for empty values (missing values) + for i, row in enumerate(rows): + for j, value in enumerate(row): + if value is None or value == "": + errors.append( + f"[WARNING] {file_path}: Row {i}, Column '{columns[j]}' has empty value" + ) + return errors From 0e315422ee6317c725f2babb0a98af6d1cf94867 Mon Sep 17 00:00:00 2001 From: Yasandu Imanjith <89386702+yasandu0505@users.noreply.github.com> Date: Thu, 19 Mar 2026 15:09:53 +0530 Subject: [PATCH 03/10] feat : final refacotring and packaging the script --- scripts/validator/core/baseRunner.py | 37 ++++++++- scripts/validator/core/baseValidator.py | 2 +- scripts/validator/main.py | 17 +++- scripts/validator/utils/utils.py | 23 ++++++ scripts/validator/validators/tabular.py | 103 ++++++++++++++++++++---- 5 files changed, 158 insertions(+), 24 deletions(-) create mode 100644 scripts/validator/utils/utils.py diff --git a/scripts/validator/core/baseRunner.py b/scripts/validator/core/baseRunner.py index 4f343723..06db53c6 100644 --- a/scripts/validator/core/baseRunner.py +++ b/scripts/validator/core/baseRunner.py @@ -1,5 +1,34 
@@ -from validator.validators.tabular import TabularValidator +import sys +from pathlib import Path +from utils.utils import Utils -def run_validation(file_path): - validator = TabularValidator() - return validator.validate(file_path) \ No newline at end of file +def run_validation(file_path, validator): + validator = validator() + all_errors = [] + all_warnings = [] + + paths = list(Path(file_path).rglob("data.json")) + + if not paths: + print("[INFO] No data.json files found") + sys.exit(0) + + for path in paths: + errors, warnings = validator.validate(path) + all_errors.extend(errors) + all_warnings.extend(warnings) + + if all_errors: + print(f" - {len(all_errors)} errors found") + for error in all_errors: + print(Utils.format_issue(error)) + + if all_warnings: + print(f" - {len(all_warnings)} warnings found") + for warning in all_warnings: + print(Utils.format_issue(warning)) + + if not all_errors and not all_warnings: + print("All data valid ✅") + + sys.exit(1 if all_errors else 0) diff --git a/scripts/validator/core/baseValidator.py b/scripts/validator/core/baseValidator.py index c7d83a02..998523fb 100644 --- a/scripts/validator/core/baseValidator.py +++ b/scripts/validator/core/baseValidator.py @@ -1,3 +1,3 @@ class BaseValidator: - def validate(self, data, file_path): + def validate(self, file_path): raise NotImplementedError \ No newline at end of file diff --git a/scripts/validator/main.py b/scripts/validator/main.py index 0e4e96a0..a9a303d9 100644 --- a/scripts/validator/main.py +++ b/scripts/validator/main.py @@ -1,7 +1,16 @@ -from validator.core.runner import run_validation +from core.baseRunner import run_validation +from validators.tabular import TabularValidator +import sys -def main(): - run_validation("/Users/yasandu/Documents/datasets/data/statistics") +def main(file_path, validator): + if validator == "tabular": + run_validation(file_path, TabularValidator) + else: + print("Invalid validator") + sys.exit(1) if __name__ == "__main__": - main() \ 
No newline at end of file + if len(sys.argv) < 3: + print("Usage: python main.py ") + sys.exit(1) + main(sys.argv[1], sys.argv[2]) \ No newline at end of file diff --git a/scripts/validator/utils/utils.py b/scripts/validator/utils/utils.py new file mode 100644 index 00000000..bdcac551 --- /dev/null +++ b/scripts/validator/utils/utils.py @@ -0,0 +1,23 @@ +class Utils: + @staticmethod + def format_issue(issue): + location = "" + + if issue.get("row") is not None: + location += f"Row {issue['row']}" + + if issue.get("column"): + # handle list of columns OR single column + if isinstance(issue["column"], list): + cols = ", ".join(issue["column"]) + location += f", Columns [{cols}]" + else: + location += f", Column '{issue['column']}'" + + return f"[{issue['type'].upper()}] {issue['file']}: {location} {issue['message']}" + + @staticmethod + def fits_in_int32(value: int) -> bool: + INT32_MIN = -2_147_483_648 + INT32_MAX = 2_147_483_647 + return INT32_MIN <= value <= INT32_MAX \ No newline at end of file diff --git a/scripts/validator/validators/tabular.py b/scripts/validator/validators/tabular.py index c73f546a..36a8400c 100644 --- a/scripts/validator/validators/tabular.py +++ b/scripts/validator/validators/tabular.py @@ -1,27 +1,30 @@ -from validator.core.baseValidator import BaseValidator +from core.baseValidator import BaseValidator import json from jsonschema import validate, ValidationError +from collections import Counter +from utils.utils import Utils class TabularValidator(BaseValidator): def __init__(self): with open("./models/tabularSchema.json") as f: self.schema = json.load(f) - + def validate(self, file_path): errors = [] + warnings = [] # Load JSON try: with open(file_path) as f: data = json.load(f) except Exception as e: - return [f"[ERROR] {file_path}: Invalid JSON ({e})"] + return [f"[ERROR] {file_path}: Invalid JSON ({e})"], [] # 1. 
Schema validation try: validate(instance=data, schema=self.schema) except ValidationError as e: - return [f"[ERROR] {file_path}: Schema error → {e.message}"] + return [f"[ERROR] {file_path}: Schema error → {e.message}"], [] # 2. Custom validation columns = data["columns"] @@ -31,30 +34,100 @@ def validate(self, file_path): # Check duplicate columns if len(columns) != len(set(columns)): - errors.append(f"[ERROR] {file_path}: Duplicate column names found") + column_counts = Counter(columns) + duplicates = [col for col, count in column_counts.items() if count > 1] + if duplicates: + errors.append( + { + "type": "error", + "file": file_path, + "row": None, + "column": duplicates, + "message": f"Duplicate column names found: {', '.join(duplicates)}", + } + ) # Check rows and columns mismatches for i, row in enumerate(rows): if len(row) != num_cols: errors.append( - f"[ERROR] {file_path}: Row {i} has {len(row)} value(s), expected {num_cols} value(s)" + { + "type": "error", + "file": file_path, + "row": i, + "column": None, + "message": f"has {len(row)} value(s), expected {num_cols} value(s)", + } ) - + # Check data types # if the column's first value starts from a sepcific data type, all the values down to the end on that column should be of the same data type for i, row in enumerate(rows): for j, value in enumerate(row): expected_type = type(rows[0][j]) - if not isinstance(value, expected_type): - errors.append( - f"[ERROR] {file_path}: Row {i}, Column '{columns[j]}' has {value} ({type(value).__name__}), expected {expected_type.__name__}" - ) - + if expected_type is str: + if not isinstance(value, str): + errors.append( + { + "type": "error", + "file": file_path, + "row": i, + "column": columns[j], + "message": f"has {value} ({type(value).__name__}), expected {expected_type.__name__}", + } + ) + elif expected_type is int: + if not isinstance(value, int): + errors.append( + { + "type": "error", + "file": file_path, + "row": i, + "column": columns[j], + "message": f"has 
{value} ({type(value).__name__}), expected {expected_type.__name__}", + } + ) + elif expected_type is float: + if not isinstance(value, (float, int)): + errors.append( + { + "type": "error", + "file": file_path, + "row": i, + "column": columns[j], + "message": f"has {value} ({type(value).__name__}), expected {expected_type.__name__}", + } + ) + # Check for empty values (missing values) for i, row in enumerate(rows): for j, value in enumerate(row): if value is None or value == "": - errors.append( - f"[WARNING] {file_path}: Row {i}, Column '{columns[j]}' has empty value" + warnings.append( + { + "type": "warning", + "file": file_path, + "row": i, + "column": columns[j], + "message": "has empty value", + } ) - return errors + + # Check for value overflow (temporary fix for opengin system) + for i, row in enumerate(rows): + for j, value in enumerate(row): + if isinstance(value, int): + if not Utils.fits_in_int32(value): + warnings.append( + { + "type": "warning", + "file": file_path, + "row": i, + "column": columns[j], + "message": f"has {value} ({type(value).__name__}), this is a big integer in postgres , consider when inserting into the database (postgres has 32 bit integer limit)", + } + ) + + return errors, warnings + + From 34415470bac370e9fc711039fca92902f7f971ee Mon Sep 17 00:00:00 2001 From: Yasandu Imanjith <89386702+yasandu0505@users.noreply.github.com> Date: Thu, 19 Mar 2026 15:23:59 +0530 Subject: [PATCH 04/10] fix : fixing review comments --- scripts/validator/validators/tabular.py | 83 ++++++++++++++++--------- 1 file changed, 52 insertions(+), 31 deletions(-) diff --git a/scripts/validator/validators/tabular.py b/scripts/validator/validators/tabular.py index 36a8400c..63fe78b1 100644 --- a/scripts/validator/validators/tabular.py +++ b/scripts/validator/validators/tabular.py @@ -3,36 +3,15 @@ from jsonschema import validate, ValidationError from collections import Counter from utils.utils import Utils +from pathlib import Path class 
TabularValidator(BaseValidator): def __init__(self): - with open("./models/tabularSchema.json") as f: + with open(Path(__file__).parent / "../models/tabularSchema.json") as f: self.schema = json.load(f) - def validate(self, file_path): + def _check_duplicate_columns(self, file_path, columns): errors = [] - warnings = [] - - # Load JSON - try: - with open(file_path) as f: - data = json.load(f) - except Exception as e: - return [f"[ERROR] {file_path}: Invalid JSON ({e})"], [] - - # 1. Schema validation - try: - validate(instance=data, schema=self.schema) - except ValidationError as e: - return [f"[ERROR] {file_path}: Schema error → {e.message}"], [] - - # 2. Custom validation - columns = data["columns"] - rows = data["rows"] - - num_cols = len(columns) - - # Check duplicate columns if len(columns) != len(set(columns)): column_counts = Counter(columns) duplicates = [col for col, count in column_counts.items() if count > 1] @@ -45,9 +24,11 @@ def validate(self, file_path): "column": duplicates, "message": f"Duplicate column names found: {', '.join(duplicates)}", } - ) + ) + return errors - # Check rows and columns mismatches + def _check_row_column_mismatches(self, file_path, rows, num_cols): + errors = [] for i, row in enumerate(rows): if len(row) != num_cols: errors.append( @@ -59,9 +40,13 @@ def validate(self, file_path): "message": f"has {len(row)} value(s), expected {num_cols} value(s)", } ) + return errors - # Check data types - # if the column's first value starts from a sepcific data type, all the values down to the end on that column should be of the same data type + def _check_data_types(self, file_path, rows, columns): + errors = [] + if not rows: + return errors + for i, row in enumerate(rows): for j, value in enumerate(row): expected_type = type(rows[0][j]) @@ -98,8 +83,10 @@ def validate(self, file_path): "message": f"has {value} ({type(value).__name__}), expected {expected_type.__name__}", } ) + return errors - # Check for empty values (missing values) + 
def _check_empty_values(self, file_path, rows, columns): + warnings = [] for i, row in enumerate(rows): for j, value in enumerate(row): if value is None or value == "": @@ -112,8 +99,10 @@ def validate(self, file_path): "message": "has empty value", } ) - - # Check for value overflow (temporary fix for opengin system) + return warnings + + def _check_value_overflow(self, file_path, rows, columns): + warnings = [] for i, row in enumerate(rows): for j, value in enumerate(row): if isinstance(value, int): @@ -127,6 +116,38 @@ def validate(self, file_path): "message": f"has {value} ({type(value).__name__}), this is a big integer in postgres , consider when inserting into the database (postgres has 32 bit integer limit)", } ) + return warnings + + def validate(self, file_path): + errors = [] + warnings = [] + + # Load JSON + try: + with open(file_path) as f: + data = json.load(f) + except (json.JSONDecodeError, FileNotFoundError) as e: + return [f"[ERROR] {file_path}: Invalid JSON ({e})"], [] + + # 1. Schema validation + try: + validate(instance=data, schema=self.schema) + except ValidationError as e: + return [f"[ERROR] {file_path}: Schema error → {e.message}"], [] + + # 2. 
Custom validation + columns = data.get("columns", []) + rows = data.get("rows", []) + num_cols = len(columns) + + # errors -------- + errors.extend(self._check_duplicate_columns(file_path, columns)) + errors.extend(self._check_row_column_mismatches(file_path, rows, num_cols)) + errors.extend(self._check_data_types(file_path, rows, columns)) + + # warnings -------- + warnings.extend(self._check_empty_values(file_path, rows, columns)) + warnings.extend(self._check_value_overflow(file_path, rows, columns)) return errors, warnings From 9daa07cab00b14b900768bf4efbdadfddbf93c5b Mon Sep 17 00:00:00 2001 From: Yasandu Imanjith <89386702+yasandu0505@users.noreply.github.com> Date: Thu, 19 Mar 2026 15:44:57 +0530 Subject: [PATCH 05/10] test : test workflow to validate datasets --- .../workflows/dataset-validator[tabular].yml | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 .github/workflows/dataset-validator[tabular].yml diff --git a/.github/workflows/dataset-validator[tabular].yml b/.github/workflows/dataset-validator[tabular].yml new file mode 100644 index 00000000..05b11ba9 --- /dev/null +++ b/.github/workflows/dataset-validator[tabular].yml @@ -0,0 +1,30 @@ +name: Run Tabular Validator + +on: + pull_request: + branches: + - workflow/validating-datasets + paths: + - 'data/statistics/**' + +jobs: + validate: + name: Validate Statistics Data + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Install dependencies + working-directory: scripts/validator + run: pip install -r requirements.txt + + - name: Run validator + working-directory: scripts/validator + run: python main.py ../../data/statistics tabular \ No newline at end of file From 9f917f7fab73a23ad5b2ed2e7d1751e3780db0a4 Mon Sep 17 00:00:00 2001 From: Yasandu Imanjith <89386702+yasandu0505@users.noreply.github.com> Date: Fri, 20 Mar 2026 11:36:51 
+0530 Subject: [PATCH 06/10] feat : adding the ci-pipeline for data validation [tabular data] --- .../workflows/dataset-validator[tabular].yml | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/.github/workflows/dataset-validator[tabular].yml b/.github/workflows/dataset-validator[tabular].yml index 05b11ba9..9db45a2f 100644 --- a/.github/workflows/dataset-validator[tabular].yml +++ b/.github/workflows/dataset-validator[tabular].yml @@ -3,15 +3,25 @@ name: Run Tabular Validator on: pull_request: branches: - - workflow/validating-datasets + - main paths: - 'data/statistics/**' + workflow_dispatch: + inputs: + mode: + description: "Validation mode" + required: false + default: "tabular" jobs: validate: - name: Validate Statistics Data + name: Validate Statistics Data [Tabular] runs-on: ubuntu-latest + env: + DATASET_PATH: ${{ vars.DATASET_DIR_PATH_VAR || '../../data/statistics' }} + MODE: ${{ github.event.inputs.mode || 'tabular' }} + steps: - name: Checkout repository uses: actions/checkout@v4 @@ -20,11 +30,15 @@ jobs: uses: actions/setup-python@v5 with: python-version: '3.12' - + - name: Install dependencies - working-directory: scripts/validator - run: pip install -r requirements.txt + run: pip install -r scripts/validator/requirements.txt - name: Run validator working-directory: scripts/validator - run: python main.py ../../data/statistics tabular \ No newline at end of file + run: | + echo "Running with:" + echo "Dataset Directory: $DATASET_PATH" + echo "Validation Mode: $MODE" + + python main.py "$DATASET_PATH" "$MODE" \ No newline at end of file From e6400e33ea0ac0e219941ad43ef8d303ed4988a9 Mon Sep 17 00:00:00 2001 From: Yasandu Imanjith <89386702+yasandu0505@users.noreply.github.com> Date: Mon, 23 Mar 2026 13:48:06 +0530 Subject: [PATCH 07/10] fix: filename error fixing --- scripts/validator/{requirements.txt => requirements.txt} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename scripts/validator/{requirements.txt 
=> requirements.txt} (100%) diff --git a/scripts/validator/requirements.txt b/scripts/validator/requirements.txt similarity index 100% rename from scripts/validator/requirements.txt rename to scripts/validator/requirements.txt From ee6c34adc08cac9bf42c24c344243eeabfc8e565 Mon Sep 17 00:00:00 2001 From: Yasandu Imanjith <89386702+yasandu0505@users.noreply.github.com> Date: Mon, 23 Mar 2026 15:22:18 +0530 Subject: [PATCH 08/10] review: resolving review comments by chanuka and sehansi --- scripts/validator/requirements.txt | 3 - scripts/validator/validators/tabular.py | 153 ++++++++++++------------ 2 files changed, 78 insertions(+), 78 deletions(-) diff --git a/scripts/validator/requirements.txt b/scripts/validator/requirements.txt index 79be454f..d07debbe 100644 --- a/scripts/validator/requirements.txt +++ b/scripts/validator/requirements.txt @@ -1,5 +1,2 @@ jsonschema>=4.25,<5 -# python -m venv .venv -# source .venv/bin/activate -# pip install -r requirements.txt diff --git a/scripts/validator/validators/tabular.py b/scripts/validator/validators/tabular.py index 63fe78b1..365de33a 100644 --- a/scripts/validator/validators/tabular.py +++ b/scripts/validator/validators/tabular.py @@ -11,12 +11,11 @@ def __init__(self): self.schema = json.load(f) def _check_duplicate_columns(self, file_path, columns): - errors = [] if len(columns) != len(set(columns)): column_counts = Counter(columns) duplicates = [col for col, count in column_counts.items() if count > 1] if duplicates: - errors.append( + return [ { "type": "error", "file": file_path, @@ -24,57 +23,27 @@ def _check_duplicate_columns(self, file_path, columns): "column": duplicates, "message": f"Duplicate column names found: {', '.join(duplicates)}", } - ) - return errors + ] + return [] - def _check_row_column_mismatches(self, file_path, rows, num_cols): - errors = [] - for i, row in enumerate(rows): - if len(row) != num_cols: - errors.append( - { - "type": "error", - "file": file_path, - "row": i, - "column": None, - 
"message": f"has {len(row)} value(s), expected {num_cols} value(s)", - } - ) - return errors + def _check_row_column_mismatch(self, file_path, i, row, num_cols): + if len(row) != num_cols: + return [{ + "type": "error", + "file": file_path, + "row": i, + "column": None, + "message": f"has only {len(row)} value(s), expected {num_cols} value(s)", + }] + return [] - def _check_data_types(self, file_path, rows, columns): + def _check_data_types(self, file_path, i, row, first_row, columns): errors = [] - if not rows: - return errors - - for i, row in enumerate(rows): - for j, value in enumerate(row): - expected_type = type(rows[0][j]) - if expected_type is str: - if not isinstance(value, str): - errors.append( - { - "type": "error", - "file": file_path, - "row": i, - "column": columns[j], - "message": f"has {value} ({type(value).__name__}), expected {expected_type.__name__}", - } - ) - elif expected_type is int: - if not isinstance(value, int): - errors.append( - { - "type": "error", - "file": file_path, - "row": i, - "column": columns[j], - "message": f"has {value} ({type(value).__name__}), expected {expected_type.__name__}", - } - ) - elif expected_type is float: - if not isinstance(value, (float, int)): - errors.append( + for j, value in enumerate(row): + expected_type = type(first_row[j]) + if expected_type is str: + if not isinstance(value, expected_type): + errors.append( { "type": "error", "file": file_path, @@ -82,40 +51,57 @@ def _check_data_types(self, file_path, rows, columns): "column": columns[j], "message": f"has {value} ({type(value).__name__}), expected {expected_type.__name__}", } - ) + ) + elif expected_type is int: + if not isinstance(value, expected_type): + errors.append( + { + "type": "error", + "file": file_path, + "row": i, + "column": columns[j], + "message": f"has {value} ({type(value).__name__}), expected {expected_type.__name__}", + } + ) + elif expected_type is float: + if not isinstance(value, (expected_type, int)): + errors.append( + { + 
"type": "error", + "file": file_path, + "row": i, + "column": columns[j], + "message": f"has {value} ({type(value).__name__}), expected {expected_type.__name__} or whole number", + } + ) return errors - def _check_empty_values(self, file_path, rows, columns): + def _check_empty_values(self, file_path, i, row, columns): warnings = [] - for i, row in enumerate(rows): - for j, value in enumerate(row): - if value is None or value == "": - warnings.append( - { + for j, value in enumerate(row): + str_value = str(value).strip() if value is not None else "" + if str_value == "": + warnings.append({ "type": "warning", "file": file_path, "row": i, "column": columns[j], "message": "has empty value", - } - ) + }) return warnings - def _check_value_overflow(self, file_path, rows, columns): + def _check_value_overflow(self, file_path, i, row, columns): warnings = [] - for i, row in enumerate(rows): - for j, value in enumerate(row): - if isinstance(value, int): - if not Utils.fits_in_int32(value): - warnings.append( - { + for j, value in enumerate(row): + if isinstance(value, int): + if not Utils.fits_in_int32(value): + warnings.append({ "type": "warning", "file": file_path, "row": i, "column": columns[j], "message": f"has {value} ({type(value).__name__}), this is a big integer in postgres , consider when inserting into the database (postgres has 32 bit integer limit)", - } - ) + }) return warnings def validate(self, file_path): @@ -140,14 +126,31 @@ def validate(self, file_path): rows = data.get("rows", []) num_cols = len(columns) - # errors -------- + if not rows or not columns: + if not rows: + message = "No rows found" + elif not columns: + message = "No columns found" + else: + message = "No rows or columns found" + + errors.extend([{ + "type": "error", + "file": file_path, + "row": rows if rows else None, + "column": columns if columns else None, + "message": message + } + ]) + return errors, warnings + errors.extend(self._check_duplicate_columns(file_path, columns)) - 
errors.extend(self._check_row_column_mismatches(file_path, rows, num_cols)) - errors.extend(self._check_data_types(file_path, rows, columns)) - - # warnings -------- - warnings.extend(self._check_empty_values(file_path, rows, columns)) - warnings.extend(self._check_value_overflow(file_path, rows, columns)) + + for i, row in enumerate(rows): + errors.extend(self._check_row_column_mismatch(file_path, i, row, num_cols)) + warnings.extend(self._check_empty_values(file_path, i, row, columns)) + warnings.extend(self._check_value_overflow(file_path, i, row, columns)) + errors.extend(self._check_data_types(file_path, i, row, rows[0], columns)) return errors, warnings From a3ed6ac673b766fc7fb427e7bcc947a3adc2b343 Mon Sep 17 00:00:00 2001 From: Yasandu Imanjith <89386702+yasandu0505@users.noreply.github.com> Date: Mon, 23 Mar 2026 15:55:39 +0530 Subject: [PATCH 09/10] fix : fix a condition checking --- scripts/validator/validators/tabular.py | 48 ++++++++----------------- 1 file changed, 15 insertions(+), 33 deletions(-) diff --git a/scripts/validator/validators/tabular.py b/scripts/validator/validators/tabular.py index 365de33a..dd40071d 100644 --- a/scripts/validator/validators/tabular.py +++ b/scripts/validator/validators/tabular.py @@ -41,39 +41,21 @@ def _check_data_types(self, file_path, i, row, first_row, columns): errors = [] for j, value in enumerate(row): expected_type = type(first_row[j]) - if expected_type is str: - if not isinstance(value, expected_type): - errors.append( - { - "type": "error", - "file": file_path, - "row": i, - "column": columns[j], - "message": f"has {value} ({type(value).__name__}), expected {expected_type.__name__}", - } - ) - elif expected_type is int: - if not isinstance(value, expected_type): - errors.append( - { - "type": "error", - "file": file_path, - "row": i, - "column": columns[j], - "message": f"has {value} ({type(value).__name__}), expected {expected_type.__name__}", - } - ) - elif expected_type is float: - if not 
isinstance(value, (expected_type, int)): - errors.append( - { - "type": "error", - "file": file_path, - "row": i, - "column": columns[j], - "message": f"has {value} ({type(value).__name__}), expected {expected_type.__name__} or whole number", - } - ) + if expected_type is float: + allowed_types = (float, int) + expected_msg = "float or whole number" + else: + allowed_types = (expected_type) + expected_msg = expected_type.__name__ + + if not isinstance(value, allowed_types): + errors.append({ + "type": "error", + "file": file_path, + "row": i, + "column": columns[j], + "message": f"has {value} ({type(value).__name__}), expected {expected_msg}", + }) return errors def _check_empty_values(self, file_path, i, row, columns): From 5ef1cd2b4a4d1ded232c70fd48ebb4f521f9986a Mon Sep 17 00:00:00 2001 From: Yasandu Imanjith <89386702+yasandu0505@users.noreply.github.com> Date: Mon, 23 Mar 2026 16:07:06 +0530 Subject: [PATCH 10/10] feat : adding the readme file --- scripts/validator/README.md | 75 +++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 scripts/validator/README.md diff --git a/scripts/validator/README.md b/scripts/validator/README.md new file mode 100644 index 00000000..f58ab10c --- /dev/null +++ b/scripts/validator/README.md @@ -0,0 +1,75 @@ +# Data Validator Program + +A command-line tool for validating datasets using configurable validator types. 
+ +--- + +## Getting Started + +### Prerequisites + +- Python 3.x +- `pip` and `venv` + +### Local Setup + +```bash +# Navigate to the project directory +cd scripts/validator + +# Create a virtual environment +python3 -m venv venv + +# Activate the virtual environment +source venv/bin/activate + +# Install dependencies +pip install -r requirements.txt +``` + +### Running the Program + +```bash +python main.py <data_directory> <validator_type> +``` + +**Example:** + +```bash +python main.py ../../data/statistics tabular +``` + +--- + +## Supported Validators + +| Validator | Description | Status | +|-----------|--------------------------|---------| +| `tabular` | Validates tabular data | ✅ Available | + +> **Note:** Additional validators are currently under development. + +--- + +## Project Structure + +``` +scripts/validator/ +│ +├── main.py # Entry point +├── requirements.txt # Python dependencies +├── README.md # Project documentation +│ +├── core/ +│ ├── baseRunner.py # Base runner logic +│ └── baseValidator.py # Base validator interface +│ +├── models/ +│ └── tabularSchema.json # Schema definition for tabular validation +│ +├── utils/ +│ └── utils.py # Shared utility functions +│ +└── validators/ + └── tabular.py # Tabular validator implementation +``` \ No newline at end of file