From 24a5d714356e0cd55d99ecfac6c542c43e494801 Mon Sep 17 00:00:00 2001
From: Yasandu Imanjith <89386702+yasandu0505@users.noreply.github.com>
Date: Tue, 17 Mar 2026 12:47:59 +0530
Subject: [PATCH 01/10] feat : data validation script

---
 .gitignore                          |  5 +-
 scripts/validator/requirements.txt  |  5 ++
 scripts/validator/schema.json       | 22 +++++++
 scripts/validator/validator.py      | 98 +++++++++++++++++++++++++++++
 4 files changed, 129 insertions(+), 1 deletion(-)
 create mode 100644 scripts/validator/requirements.txt 
 create mode 100644 scripts/validator/schema.json
 create mode 100644 scripts/validator/validator.py

diff --git a/.gitignore b/.gitignore
index 3deeccb3..3bef95e2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,4 +15,7 @@ ingestion/.env
 # Generated zip files (not used by website)
 website/static/downloads/archive_Data.zip
 website/static/downloads/sources_Data.zip
-website/static/downloads/statistics_Data.zip
\ No newline at end of file
+website/static/downloads/statistics_Data.zip
+
+# Virtual environment
+venv/
\ No newline at end of file
diff --git a/scripts/validator/requirements.txt  b/scripts/validator/requirements.txt 
new file mode 100644
index 00000000..79be454f
--- /dev/null
+++ b/scripts/validator/requirements.txt 	
@@ -0,0 +1,5 @@
+jsonschema>=4.25,<5
+
+# python -m venv .venv
+# source .venv/bin/activate
+# pip install -r requirements.txt
diff --git a/scripts/validator/schema.json b/scripts/validator/schema.json
new file mode 100644
index 00000000..d3bdb1e3
--- /dev/null
+++ b/scripts/validator/schema.json
@@ -0,0 +1,22 @@
+{
+  "type": "object",
+  "required": ["columns", "rows"],
+  "properties": {
+    "columns": {
+      "type": "array",
+      "items": {
+        "type": "string"
+      },
+      "minItems": 1
+    },
+    "rows": {
+      "type": "array",
+      "items": {
+        "type": "array",
+        "items": {},
+        "minItems": 1
+      }
+    }
+  },
+  "additionalProperties": false
+}
\ No newline at end of file
diff --git a/scripts/validator/validator.py b/scripts/validator/validator.py
new file mode 100644
index 00000000..ec9224ac
--- /dev/null
+++ b/scripts/validator/validator.py
@@ -0,0 +1,98 @@
+import sys
+import json
+from pathlib import Path
+from jsonschema import validate, ValidationError
+
+# Load schema once
+with open("./schema.json") as f:
+    SCHEMA = json.load(f)
+
+def validate_file(file_path):
+    errors = []
+
+    # Load JSON
+    try:
+        with open(file_path) as f:
+            data = json.load(f)
+    except Exception as e:
+        return [f"[ERROR] {file_path}: Invalid JSON ({e})"]
+
+    # 1. Schema validation
+    try:
+        validate(instance=data, schema=SCHEMA)
+    except ValidationError as e:
+        return [f"[ERROR] {file_path}: Schema error → {e.message}"]
+
+    # 2. Custom validation --------------------------------------------------------------------------------------------------
+    columns = data["columns"]
+    rows = data["rows"]
+
+    num_cols = len(columns)
+
+    # Check duplicate columns
+    if len(columns) != len(set(columns)):
+        errors.append(f"[ERROR] {file_path}: Duplicate column names found")
+
+    # Check rows and columns mismatches
+    for i, row in enumerate(rows):
+        if len(row) != num_cols:
+            errors.append(
+                f"[ERROR] {file_path}: Row {i} has {len(row)} value(s), expected {num_cols} value(s)"
+            )
+    
+    # Check data types
+    # if the column's first value starts from a sepcific data type, all the values down to the end on that column should be of the same data type
+    for i, row in enumerate(rows):
+        for j, value in enumerate(row):
+            expected_type = type(rows[0][j])
+            if not isinstance(value, expected_type):
+                errors.append(
+                    f"[ERROR] {file_path}: Row {i}, Column '{columns[j]}' has {value} ({type(value).__name__}), expected {expected_type.__name__}"
+                )
+    
+    # Check for empty values (missing values)
+    for i, row in enumerate(rows):
+        for j, value in enumerate(row):
+            if value is None or value == "":
+                errors.append(
+                    f"[WARNING] {file_path}: Row {i}, Column '{columns[j]}' has empty value"
+                )
+    
+    # Floats should be in strings as a temporary mitigation
+    for i, row in enumerate(rows):
+        for j, value in enumerate(row):
+            if isinstance(value, float):
+                errors.append(
+                    f"[WARNING] {file_path}: Row {i}, Column '{columns[j]}' has float value {value}. Convert to string. as a temporary mitigation"
+                )
+    
+    return errors
+
+
+def main(directory):
+    all_errors = []
+
+    paths = list(Path(directory).rglob("data.json"))
+
+    if not paths:
+        print("[INFO] No data.json files found")
+        sys.exit(0)
+
+    for path in paths:
+        all_errors.extend(validate_file(path))
+
+    if all_errors:
+        print("\n".join(all_errors))
+        sys.exit(1)
+    else:
+        print("All data valid ✅")
+        sys.exit(0)
+
+
+if __name__ == "__main__":
+    # python validator.py {data/statistics}
+    if len(sys.argv) < 2:
+        print("Usage: python validate.py <directory>")
+        sys.exit(1)
+
+    main(sys.argv[1])
\ No newline at end of file

From 306b61eb80d18103db17da35d4cdd75cd617328e Mon Sep 17 00:00:00 2001
From: Yasandu Imanjith <89386702+yasandu0505@users.noreply.github.com>
Date: Wed, 18 Mar 2026 12:20:29 +0530
Subject: [PATCH 02/10] feat : package the script

---
 scripts/validator/core/baseRunner.py          |  5 +
 scripts/validator/core/baseValidator.py       |  3 +
 scripts/validator/main.py                     |  7 ++
 .../tabularSchema.json}                       |  0
 scripts/validator/validator.py                | 98 -------------------
 scripts/validator/validators/tabular.py       | 60 ++++++++++++
 6 files changed, 75 insertions(+), 98 deletions(-)
 create mode 100644 scripts/validator/core/baseRunner.py
 create mode 100644 scripts/validator/core/baseValidator.py
 create mode 100644 scripts/validator/main.py
 rename scripts/validator/{schema.json => models/tabularSchema.json} (100%)
 delete mode 100644 scripts/validator/validator.py
 create mode 100644 scripts/validator/validators/tabular.py

diff --git a/scripts/validator/core/baseRunner.py b/scripts/validator/core/baseRunner.py
new file mode 100644
index 00000000..4f343723
--- /dev/null
+++ b/scripts/validator/core/baseRunner.py
@@ -0,0 +1,5 @@
+from validator.validators.tabular import TabularValidator
+
+def run_validation(file_path):
+    validator = TabularValidator()
+    return validator.validate(file_path)
\ No newline at end of file
diff --git a/scripts/validator/core/baseValidator.py b/scripts/validator/core/baseValidator.py
new file mode 100644
index 00000000..c7d83a02
--- /dev/null
+++ b/scripts/validator/core/baseValidator.py
@@ -0,0 +1,3 @@
+class BaseValidator:
+    def validate(self, data, file_path):
+        raise NotImplementedError
\ No newline at end of file
diff --git a/scripts/validator/main.py b/scripts/validator/main.py
new file mode 100644
index 00000000..0e4e96a0
--- /dev/null
+++ b/scripts/validator/main.py
@@ -0,0 +1,7 @@
+from validator.core.runner import run_validation
+
+def main():
+    run_validation("/Users/yasandu/Documents/datasets/data/statistics")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/scripts/validator/schema.json b/scripts/validator/models/tabularSchema.json
similarity index 100%
rename from scripts/validator/schema.json
rename to scripts/validator/models/tabularSchema.json
diff --git a/scripts/validator/validator.py b/scripts/validator/validator.py
deleted file mode 100644
index ec9224ac..00000000
--- a/scripts/validator/validator.py
+++ /dev/null
@@ -1,98 +0,0 @@
-import sys
-import json
-from pathlib import Path
-from jsonschema import validate, ValidationError
-
-# Load schema once
-with open("./schema.json") as f:
-    SCHEMA = json.load(f)
-
-def validate_file(file_path):
-    errors = []
-
-    # Load JSON
-    try:
-        with open(file_path) as f:
-            data = json.load(f)
-    except Exception as e:
-        return [f"[ERROR] {file_path}: Invalid JSON ({e})"]
-
-    # 1. Schema validation
-    try:
-        validate(instance=data, schema=SCHEMA)
-    except ValidationError as e:
-        return [f"[ERROR] {file_path}: Schema error → {e.message}"]
-
-    # 2. Custom validation --------------------------------------------------------------------------------------------------
-    columns = data["columns"]
-    rows = data["rows"]
-
-    num_cols = len(columns)
-
-    # Check duplicate columns
-    if len(columns) != len(set(columns)):
-        errors.append(f"[ERROR] {file_path}: Duplicate column names found")
-
-    # Check rows and columns mismatches
-    for i, row in enumerate(rows):
-        if len(row) != num_cols:
-            errors.append(
-                f"[ERROR] {file_path}: Row {i} has {len(row)} value(s), expected {num_cols} value(s)"
-            )
-    
-    # Check data types
-    # if the column's first value starts from a sepcific data type, all the values down to the end on that column should be of the same data type
-    for i, row in enumerate(rows):
-        for j, value in enumerate(row):
-            expected_type = type(rows[0][j])
-            if not isinstance(value, expected_type):
-                errors.append(
-                    f"[ERROR] {file_path}: Row {i}, Column '{columns[j]}' has {value} ({type(value).__name__}), expected {expected_type.__name__}"
-                )
-    
-    # Check for empty values (missing values)
-    for i, row in enumerate(rows):
-        for j, value in enumerate(row):
-            if value is None or value == "":
-                errors.append(
-                    f"[WARNING] {file_path}: Row {i}, Column '{columns[j]}' has empty value"
-                )
-    
-    # Floats should be in strings as a temporary mitigation
-    for i, row in enumerate(rows):
-        for j, value in enumerate(row):
-            if isinstance(value, float):
-                errors.append(
-                    f"[WARNING] {file_path}: Row {i}, Column '{columns[j]}' has float value {value}. Convert to string. as a temporary mitigation"
-                )
-    
-    return errors
-
-
-def main(directory):
-    all_errors = []
-
-    paths = list(Path(directory).rglob("data.json"))
-
-    if not paths:
-        print("[INFO] No data.json files found")
-        sys.exit(0)
-
-    for path in paths:
-        all_errors.extend(validate_file(path))
-
-    if all_errors:
-        print("\n".join(all_errors))
-        sys.exit(1)
-    else:
-        print("All data valid ✅")
-        sys.exit(0)
-
-
-if __name__ == "__main__":
-    # python validator.py {data/statistics}
-    if len(sys.argv) < 2:
-        print("Usage: python validate.py <directory>")
-        sys.exit(1)
-
-    main(sys.argv[1])
\ No newline at end of file
diff --git a/scripts/validator/validators/tabular.py b/scripts/validator/validators/tabular.py
new file mode 100644
index 00000000..c73f546a
--- /dev/null
+++ b/scripts/validator/validators/tabular.py
@@ -0,0 +1,60 @@
+from validator.core.baseValidator import BaseValidator
+import json
+from jsonschema import validate, ValidationError
+
+class TabularValidator(BaseValidator):
+    def __init__(self):
+        with open("./models/tabularSchema.json") as f:
+            self.schema = json.load(f)
+    
+    def validate(self, file_path):
+        errors = []
+
+        # Load JSON
+        try:
+            with open(file_path) as f:
+                data = json.load(f)
+        except Exception as e:
+            return [f"[ERROR] {file_path}: Invalid JSON ({e})"]
+
+        # 1. Schema validation
+        try:
+            validate(instance=data, schema=self.schema)
+        except ValidationError as e:
+            return [f"[ERROR] {file_path}: Schema error → {e.message}"]
+
+        # 2. Custom validation
+        columns = data["columns"]
+        rows = data["rows"]
+
+        num_cols = len(columns)
+
+        # Check duplicate columns
+        if len(columns) != len(set(columns)):
+            errors.append(f"[ERROR] {file_path}: Duplicate column names found")
+
+        # Check rows and columns mismatches
+        for i, row in enumerate(rows):
+            if len(row) != num_cols:
+                errors.append(
+                    f"[ERROR] {file_path}: Row {i} has {len(row)} value(s), expected {num_cols} value(s)"
+                )
+        
+        # Check data types
+        # if the column's first value starts from a sepcific data type, all the values down to the end on that column should be of the same data type
+        for i, row in enumerate(rows):
+            for j, value in enumerate(row):
+                expected_type = type(rows[0][j])
+                if not isinstance(value, expected_type):
+                    errors.append(
+                        f"[ERROR] {file_path}: Row {i}, Column '{columns[j]}' has {value} ({type(value).__name__}), expected {expected_type.__name__}"
+                    )
+        
+        # Check for empty values (missing values)
+        for i, row in enumerate(rows):
+            for j, value in enumerate(row):
+                if value is None or value == "":
+                    errors.append(
+                        f"[WARNING] {file_path}: Row {i}, Column '{columns[j]}' has empty value"
+                    )
+        return errors

From 0e315422ee6317c725f2babb0a98af6d1cf94867 Mon Sep 17 00:00:00 2001
From: Yasandu Imanjith <89386702+yasandu0505@users.noreply.github.com>
Date: Thu, 19 Mar 2026 15:09:53 +0530
Subject: [PATCH 03/10] feat : final refactoring and packaging the script

---
 scripts/validator/core/baseRunner.py    |  37 ++++++++-
 scripts/validator/core/baseValidator.py |   2 +-
 scripts/validator/main.py               |  17 +++-
 scripts/validator/utils/utils.py        |  23 ++++++
 scripts/validator/validators/tabular.py | 103 ++++++++++++++++++++----
 5 files changed, 158 insertions(+), 24 deletions(-)
 create mode 100644 scripts/validator/utils/utils.py

diff --git a/scripts/validator/core/baseRunner.py b/scripts/validator/core/baseRunner.py
index 4f343723..06db53c6 100644
--- a/scripts/validator/core/baseRunner.py
+++ b/scripts/validator/core/baseRunner.py
@@ -1,5 +1,34 @@
-from validator.validators.tabular import TabularValidator
+import sys
+from pathlib import Path
+from utils.utils import Utils
 
-def run_validation(file_path):
-    validator = TabularValidator()
-    return validator.validate(file_path)
\ No newline at end of file
+def run_validation(file_path, validator):
+    validator = validator()
+    all_errors = []
+    all_warnings = []
+
+    paths = list(Path(file_path).rglob("data.json"))
+
+    if not paths:
+        print("[INFO] No data.json files found")
+        sys.exit(0)
+
+    for path in paths:
+        errors, warnings = validator.validate(path)
+        all_errors.extend(errors)
+        all_warnings.extend(warnings)
+
+    if all_errors:
+        print(f" - {len(all_errors)} errors found")
+        for error in all_errors:
+            print(Utils.format_issue(error))
+
+    if all_warnings:
+        print(f" - {len(all_warnings)} warnings found")
+        for warning in all_warnings:
+            print(Utils.format_issue(warning))
+
+    if not all_errors and not all_warnings:
+        print("All data valid ✅")
+
+    sys.exit(1 if all_errors else 0)
diff --git a/scripts/validator/core/baseValidator.py b/scripts/validator/core/baseValidator.py
index c7d83a02..998523fb 100644
--- a/scripts/validator/core/baseValidator.py
+++ b/scripts/validator/core/baseValidator.py
@@ -1,3 +1,3 @@
 class BaseValidator:
-    def validate(self, data, file_path):
+    def validate(self, file_path):
         raise NotImplementedError
\ No newline at end of file
diff --git a/scripts/validator/main.py b/scripts/validator/main.py
index 0e4e96a0..a9a303d9 100644
--- a/scripts/validator/main.py
+++ b/scripts/validator/main.py
@@ -1,7 +1,16 @@
-from validator.core.runner import run_validation
+from core.baseRunner import run_validation
+from validators.tabular import TabularValidator
+import sys
 
-def main():
-    run_validation("/Users/yasandu/Documents/datasets/data/statistics")
+def main(file_path, validator):
+    if validator == "tabular":
+        run_validation(file_path, TabularValidator)
+    else:
+        print("Invalid validator")
+        sys.exit(1)
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    if len(sys.argv) < 3:
+        print("Usage: python main.py <directory> <validator>")
+        sys.exit(1)
+    main(sys.argv[1], sys.argv[2])
\ No newline at end of file
diff --git a/scripts/validator/utils/utils.py b/scripts/validator/utils/utils.py
new file mode 100644
index 00000000..bdcac551
--- /dev/null
+++ b/scripts/validator/utils/utils.py
@@ -0,0 +1,23 @@
+class Utils:
+    @staticmethod
+    def format_issue(issue):
+        location = ""
+
+        if issue.get("row") is not None:
+            location += f"Row {issue['row']}"
+
+        if issue.get("column"):
+            # handle list of columns OR single column
+            if isinstance(issue["column"], list):
+                cols = ", ".join(issue["column"])
+                location += f", Columns [{cols}]"
+            else:
+                location += f", Column '{issue['column']}'"
+
+        return f"[{issue['type'].upper()}] {issue['file']}: {location} {issue['message']}"
+    
+    @staticmethod
+    def fits_in_int32(value: int) -> bool:
+        INT32_MIN = -2_147_483_648
+        INT32_MAX = 2_147_483_647
+        return INT32_MIN <= value <= INT32_MAX
\ No newline at end of file
diff --git a/scripts/validator/validators/tabular.py b/scripts/validator/validators/tabular.py
index c73f546a..36a8400c 100644
--- a/scripts/validator/validators/tabular.py
+++ b/scripts/validator/validators/tabular.py
@@ -1,27 +1,30 @@
-from validator.core.baseValidator import BaseValidator
+from core.baseValidator import BaseValidator
 import json
 from jsonschema import validate, ValidationError
+from collections import Counter
+from utils.utils import Utils
 
 class TabularValidator(BaseValidator):
     def __init__(self):
         with open("./models/tabularSchema.json") as f:
             self.schema = json.load(f)
-    
+
     def validate(self, file_path):
         errors = []
+        warnings = []
 
         # Load JSON
         try:
             with open(file_path) as f:
                 data = json.load(f)
         except Exception as e:
-            return [f"[ERROR] {file_path}: Invalid JSON ({e})"]
+            return [f"[ERROR] {file_path}: Invalid JSON ({e})"], []
 
         # 1. Schema validation
         try:
             validate(instance=data, schema=self.schema)
         except ValidationError as e:
-            return [f"[ERROR] {file_path}: Schema error → {e.message}"]
+            return [f"[ERROR] {file_path}: Schema error → {e.message}"], []
 
         # 2. Custom validation
         columns = data["columns"]
@@ -31,30 +34,100 @@ def validate(self, file_path):
 
         # Check duplicate columns
         if len(columns) != len(set(columns)):
-            errors.append(f"[ERROR] {file_path}: Duplicate column names found")
+            column_counts = Counter(columns)
+            duplicates = [col for col, count in column_counts.items() if count > 1]
+            if duplicates:
+                errors.append(
+                    {
+                        "type": "error",
+                        "file": file_path,
+                        "row": None,
+                        "column": duplicates,
+                        "message": f"Duplicate column names found: {', '.join(duplicates)}",
+                    }
+            )
 
         # Check rows and columns mismatches
         for i, row in enumerate(rows):
             if len(row) != num_cols:
                 errors.append(
-                    f"[ERROR] {file_path}: Row {i} has {len(row)} value(s), expected {num_cols} value(s)"
+                    {
+                        "type": "error",
+                        "file": file_path,
+                        "row": i,
+                        "column": None,
+                        "message": f"has {len(row)} value(s), expected {num_cols} value(s)",
+                    }
                 )
-        
+
         # Check data types
         # if the column's first value starts from a sepcific data type, all the values down to the end on that column should be of the same data type
         for i, row in enumerate(rows):
             for j, value in enumerate(row):
                 expected_type = type(rows[0][j])
-                if not isinstance(value, expected_type):
-                    errors.append(
-                        f"[ERROR] {file_path}: Row {i}, Column '{columns[j]}' has {value} ({type(value).__name__}), expected {expected_type.__name__}"
-                    )
-        
+                if expected_type is str:    
+                    if not isinstance(value, str):
+                        errors.append(
+                            {
+                                "type": "error",
+                                "file": file_path,
+                                "row": i,
+                                "column": columns[j],
+                                "message": f"has {value} ({type(value).__name__}), expected {expected_type.__name__}",
+                            }
+                        )
+                elif expected_type is int:
+                    if not isinstance(value, int):
+                        errors.append(
+                            {
+                                "type": "error",
+                                "file": file_path,
+                                "row": i,
+                                "column": columns[j],
+                                "message": f"has {value} ({type(value).__name__}), expected {expected_type.__name__}",
+                            }
+                        )
+                elif expected_type is float:
+                    if not isinstance(value, (float, int)):
+                        errors.append(
+                            {
+                                "type": "error",
+                                "file": file_path,
+                                "row": i,
+                                "column": columns[j],
+                                "message": f"has {value} ({type(value).__name__}), expected {expected_type.__name__}",
+                            }
+                        )
+
         # Check for empty values (missing values)
         for i, row in enumerate(rows):
             for j, value in enumerate(row):
                 if value is None or value == "":
-                    errors.append(
-                        f"[WARNING] {file_path}: Row {i}, Column '{columns[j]}' has empty value"
+                    warnings.append(
+                        {
+                            "type": "warning",
+                            "file": file_path,
+                            "row": i,
+                            "column": columns[j],
+                            "message": "has empty value",
+                        }
                     )
-        return errors
+        
+        # Check for value overflow (temporary fix for opengin system)
+        for i, row in enumerate(rows):
+            for j, value in enumerate(row):
+                if isinstance(value, int):
+                    if not Utils.fits_in_int32(value):
+                        warnings.append(
+                            {
+                                "type": "warning",
+                                "file": file_path,
+                                "row": i,
+                                "column": columns[j],
+                                "message": f"has {value} ({type(value).__name__}), this is a big integer in postgres , consider when inserting into the database (postgres has 32 bit integer limit)",
+                            }
+                        )
+
+        return errors, warnings
+   
+

From 34415470bac370e9fc711039fca92902f7f971ee Mon Sep 17 00:00:00 2001
From: Yasandu Imanjith <89386702+yasandu0505@users.noreply.github.com>
Date: Thu, 19 Mar 2026 15:23:59 +0530
Subject: [PATCH 04/10] fix : fixing review comments

---
 scripts/validator/validators/tabular.py | 83 ++++++++++++++++---------
 1 file changed, 52 insertions(+), 31 deletions(-)

diff --git a/scripts/validator/validators/tabular.py b/scripts/validator/validators/tabular.py
index 36a8400c..63fe78b1 100644
--- a/scripts/validator/validators/tabular.py
+++ b/scripts/validator/validators/tabular.py
@@ -3,36 +3,15 @@
 from jsonschema import validate, ValidationError
 from collections import Counter
 from utils.utils import Utils
+from pathlib import Path
 
 class TabularValidator(BaseValidator):
     def __init__(self):
-        with open("./models/tabularSchema.json") as f:
+        with open(Path(__file__).parent / "../models/tabularSchema.json") as f:
             self.schema = json.load(f)
 
-    def validate(self, file_path):
+    def _check_duplicate_columns(self, file_path, columns):
         errors = []
-        warnings = []
-
-        # Load JSON
-        try:
-            with open(file_path) as f:
-                data = json.load(f)
-        except Exception as e:
-            return [f"[ERROR] {file_path}: Invalid JSON ({e})"], []
-
-        # 1. Schema validation
-        try:
-            validate(instance=data, schema=self.schema)
-        except ValidationError as e:
-            return [f"[ERROR] {file_path}: Schema error → {e.message}"], []
-
-        # 2. Custom validation
-        columns = data["columns"]
-        rows = data["rows"]
-
-        num_cols = len(columns)
-
-        # Check duplicate columns
         if len(columns) != len(set(columns)):
             column_counts = Counter(columns)
             duplicates = [col for col, count in column_counts.items() if count > 1]
@@ -45,9 +24,11 @@ def validate(self, file_path):
                         "column": duplicates,
                         "message": f"Duplicate column names found: {', '.join(duplicates)}",
                     }
-            )
+                )
+        return errors
 
-        # Check rows and columns mismatches
+    def _check_row_column_mismatches(self, file_path, rows, num_cols):
+        errors = []
         for i, row in enumerate(rows):
             if len(row) != num_cols:
                 errors.append(
@@ -59,9 +40,13 @@ def validate(self, file_path):
                         "message": f"has {len(row)} value(s), expected {num_cols} value(s)",
                     }
                 )
+        return errors
 
-        # Check data types
-        # if the column's first value starts from a sepcific data type, all the values down to the end on that column should be of the same data type
+    def _check_data_types(self, file_path, rows, columns):
+        errors = []
+        if not rows:
+            return errors
+            
         for i, row in enumerate(rows):
             for j, value in enumerate(row):
                 expected_type = type(rows[0][j])
@@ -98,8 +83,10 @@ def validate(self, file_path):
                                 "message": f"has {value} ({type(value).__name__}), expected {expected_type.__name__}",
                             }
                         )
+        return errors
 
-        # Check for empty values (missing values)
+    def _check_empty_values(self, file_path, rows, columns):
+        warnings = []
         for i, row in enumerate(rows):
             for j, value in enumerate(row):
                 if value is None or value == "":
@@ -112,8 +99,10 @@ def validate(self, file_path):
                             "message": "has empty value",
                         }
                     )
-        
-        # Check for value overflow (temporary fix for opengin system)
+        return warnings
+
+    def _check_value_overflow(self, file_path, rows, columns):
+        warnings = []
         for i, row in enumerate(rows):
             for j, value in enumerate(row):
                 if isinstance(value, int):
@@ -127,6 +116,38 @@ def validate(self, file_path):
                                 "message": f"has {value} ({type(value).__name__}), this is a big integer in postgres , consider when inserting into the database (postgres has 32 bit integer limit)",
                             }
                         )
+        return warnings
+
+    def validate(self, file_path):
+        errors = []
+        warnings = []
+
+        # Load JSON
+        try:
+            with open(file_path) as f:
+                data = json.load(f)
+        except (json.JSONDecodeError, FileNotFoundError) as e:
+            return [f"[ERROR] {file_path}: Invalid JSON ({e})"], []
+
+        # 1. Schema validation
+        try:
+            validate(instance=data, schema=self.schema)
+        except ValidationError as e:
+            return [f"[ERROR] {file_path}: Schema error → {e.message}"], []
+
+        # 2. Custom validation
+        columns = data.get("columns", [])
+        rows = data.get("rows", [])
+        num_cols = len(columns)
+
+        # errors --------
+        errors.extend(self._check_duplicate_columns(file_path, columns))
+        errors.extend(self._check_row_column_mismatches(file_path, rows, num_cols))
+        errors.extend(self._check_data_types(file_path, rows, columns))
+        
+        # warnings --------
+        warnings.extend(self._check_empty_values(file_path, rows, columns))
+        warnings.extend(self._check_value_overflow(file_path, rows, columns))
 
         return errors, warnings
    

From 9daa07cab00b14b900768bf4efbdadfddbf93c5b Mon Sep 17 00:00:00 2001
From: Yasandu Imanjith <89386702+yasandu0505@users.noreply.github.com>
Date: Thu, 19 Mar 2026 15:44:57 +0530
Subject: [PATCH 05/10] test : test workflow to validate datasets

---
 .../workflows/dataset-validator[tabular].yml  | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 .github/workflows/dataset-validator[tabular].yml

diff --git a/.github/workflows/dataset-validator[tabular].yml b/.github/workflows/dataset-validator[tabular].yml
new file mode 100644
index 00000000..05b11ba9
--- /dev/null
+++ b/.github/workflows/dataset-validator[tabular].yml
@@ -0,0 +1,30 @@
+name: Run Tabular Validator
+
+on:
+  pull_request:
+    branches:
+      - workflow/validating-datasets
+    paths:
+      - 'data/statistics/**'
+
+jobs:
+  validate:
+    name: Validate Statistics Data
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Install dependencies
+        working-directory: scripts/validator
+        run: pip install -r requirements.txt
+
+      - name: Run validator
+        working-directory: scripts/validator
+        run: python main.py ../../data/statistics tabular
\ No newline at end of file

From 9f917f7fab73a23ad5b2ed2e7d1751e3780db0a4 Mon Sep 17 00:00:00 2001
From: Yasandu Imanjith <89386702+yasandu0505@users.noreply.github.com>
Date: Fri, 20 Mar 2026 11:36:51 +0530
Subject: [PATCH 06/10] feat : adding the ci-pipeline for data validation
 [tabular data]

---
 .../workflows/dataset-validator[tabular].yml  | 26 ++++++++++++++-----
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/dataset-validator[tabular].yml b/.github/workflows/dataset-validator[tabular].yml
index 05b11ba9..9db45a2f 100644
--- a/.github/workflows/dataset-validator[tabular].yml
+++ b/.github/workflows/dataset-validator[tabular].yml
@@ -3,15 +3,25 @@ name: Run Tabular Validator
 on:
   pull_request:
     branches:
-      - workflow/validating-datasets
+      - main
     paths:
       - 'data/statistics/**'
+  workflow_dispatch:
+    inputs:
+      mode:
+        description: "Validation mode"
+        required: false
+        default: "tabular"
 
 jobs:
   validate:
-    name: Validate Statistics Data
+    name: Validate Statistics Data [Tabular]
     runs-on: ubuntu-latest
 
+    env:
+      DATASET_PATH: ${{ vars.DATASET_DIR_PATH_VAR || '../../data/statistics' }}
+      MODE: ${{ github.event.inputs.mode || 'tabular' }}
+
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
@@ -20,11 +30,15 @@ jobs:
         uses: actions/setup-python@v5
         with:
           python-version: '3.12'
-
+        
       - name: Install dependencies
-        working-directory: scripts/validator
-        run: pip install -r requirements.txt
+        run: pip install -r scripts/validator/requirements.txt
 
       - name: Run validator
         working-directory: scripts/validator
-        run: python main.py ../../data/statistics tabular
\ No newline at end of file
+        run: |
+          echo "Running with:"
+          echo "Dataset Directory: $DATASET_PATH"
+          echo "Validation Mode: $MODE"
+
+          python main.py "$DATASET_PATH" "$MODE"
\ No newline at end of file

From e6400e33ea0ac0e219941ad43ef8d303ed4988a9 Mon Sep 17 00:00:00 2001
From: Yasandu Imanjith <89386702+yasandu0505@users.noreply.github.com>
Date: Mon, 23 Mar 2026 13:48:06 +0530
Subject: [PATCH 07/10] fix: filename error fixing

---
 scripts/validator/{requirements.txt  => requirements.txt} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename scripts/validator/{requirements.txt  => requirements.txt} (100%)

diff --git a/scripts/validator/requirements.txt  b/scripts/validator/requirements.txt
similarity index 100%
rename from scripts/validator/requirements.txt 
rename to scripts/validator/requirements.txt

From ee6c34adc08cac9bf42c24c344243eeabfc8e565 Mon Sep 17 00:00:00 2001
From: Yasandu Imanjith <89386702+yasandu0505@users.noreply.github.com>
Date: Mon, 23 Mar 2026 15:22:18 +0530
Subject: [PATCH 08/10] review: resolving review comments by chanuka and
 sehansi

---
 scripts/validator/requirements.txt      |   3 -
 scripts/validator/validators/tabular.py | 153 ++++++++++++------------
 2 files changed, 78 insertions(+), 78 deletions(-)

diff --git a/scripts/validator/requirements.txt b/scripts/validator/requirements.txt
index 79be454f..d07debbe 100644
--- a/scripts/validator/requirements.txt
+++ b/scripts/validator/requirements.txt
@@ -1,5 +1,2 @@
 jsonschema>=4.25,<5
 
-# python -m venv .venv
-# source .venv/bin/activate
-# pip install -r requirements.txt
diff --git a/scripts/validator/validators/tabular.py b/scripts/validator/validators/tabular.py
index 63fe78b1..365de33a 100644
--- a/scripts/validator/validators/tabular.py
+++ b/scripts/validator/validators/tabular.py
@@ -11,12 +11,11 @@ def __init__(self):
             self.schema = json.load(f)
 
     def _check_duplicate_columns(self, file_path, columns):
-        errors = []
         if len(columns) != len(set(columns)):
             column_counts = Counter(columns)
             duplicates = [col for col, count in column_counts.items() if count > 1]
             if duplicates:
-                errors.append(
+                return [
                     {
                         "type": "error",
                         "file": file_path,
@@ -24,57 +23,27 @@ def _check_duplicate_columns(self, file_path, columns):
                         "column": duplicates,
                         "message": f"Duplicate column names found: {', '.join(duplicates)}",
                     }
-                )
-        return errors
+                ]
+        return []
 
-    def _check_row_column_mismatches(self, file_path, rows, num_cols):
-        errors = []
-        for i, row in enumerate(rows):
-            if len(row) != num_cols:
-                errors.append(
-                    {
-                        "type": "error",
-                        "file": file_path,
-                        "row": i,
-                        "column": None,
-                        "message": f"has {len(row)} value(s), expected {num_cols} value(s)",
-                    }
-                )
-        return errors
+    def _check_row_column_mismatch(self, file_path, i, row, num_cols):
+        """Return a one-item error list when row ``i`` is shorter or longer than ``num_cols``."""
+        if len(row) == num_cols:
+            return []
+        return [{
+            "type": "error",
+            "file": file_path,
+            "row": i, "column": None,  # the whole row is at fault, not a single column
+            "message": f"has {len(row)} value(s), expected {num_cols} value(s)",
+        }]
 
-    def _check_data_types(self, file_path, rows, columns):
+    def _check_data_types(self, file_path, i, row, first_row, columns):
         errors = []
-        if not rows:
-            return errors
-            
-        for i, row in enumerate(rows):
-            for j, value in enumerate(row):
-                expected_type = type(rows[0][j])
-                if expected_type is str:    
-                    if not isinstance(value, str):
-                        errors.append(
-                            {
-                                "type": "error",
-                                "file": file_path,
-                                "row": i,
-                                "column": columns[j],
-                                "message": f"has {value} ({type(value).__name__}), expected {expected_type.__name__}",
-                            }
-                        )
-                elif expected_type is int:
-                    if not isinstance(value, int):
-                        errors.append(
-                            {
-                                "type": "error",
-                                "file": file_path,
-                                "row": i,
-                                "column": columns[j],
-                                "message": f"has {value} ({type(value).__name__}), expected {expected_type.__name__}",
-                            }
-                        )
-                elif expected_type is float:
-                    if not isinstance(value, (float, int)):
-                        errors.append(
+        for j, value in enumerate(row):
+            expected_type = type(first_row[j])
+            if expected_type is str:    
+                if not isinstance(value, expected_type):
+                    errors.append(
                             {
                                 "type": "error",
                                 "file": file_path,
@@ -82,40 +51,57 @@ def _check_data_types(self, file_path, rows, columns):
                                 "column": columns[j],
                                 "message": f"has {value} ({type(value).__name__}), expected {expected_type.__name__}",
                             }
-                        )
+                    )
+            elif expected_type is int:
+                if not isinstance(value, expected_type):
+                    errors.append(
+                        {
+                            "type": "error",
+                            "file": file_path,
+                            "row": i,
+                            "column": columns[j],
+                            "message": f"has {value} ({type(value).__name__}), expected {expected_type.__name__}",
+                        }
+                    )
+            elif expected_type is float:
+                if not isinstance(value, (expected_type, int)):
+                    errors.append(
+                        {
+                            "type": "error",
+                            "file": file_path,
+                            "row": i,
+                            "column": columns[j],
+                            "message": f"has {value} ({type(value).__name__}), expected {expected_type.__name__} or whole number",
+                        }
+                    )
         return errors
 
-    def _check_empty_values(self, file_path, rows, columns):
+    def _check_empty_values(self, file_path, i, row, columns):
         warnings = []
-        for i, row in enumerate(rows):
-            for j, value in enumerate(row):
-                if value is None or value == "":
-                    warnings.append(
-                        {
+        for j, value in enumerate(row):
+            str_value = str(value).strip() if value is not None else ""
+            if str_value == "":
+                warnings.append({
                             "type": "warning",
                             "file": file_path,
                             "row": i,
                             "column": columns[j],
                             "message": "has empty value",
-                        }
-                    )
+                })
         return warnings
 
-    def _check_value_overflow(self, file_path, rows, columns):
+    def _check_value_overflow(self, file_path, i, row, columns):
         warnings = []
-        for i, row in enumerate(rows):
-            for j, value in enumerate(row):
-                if isinstance(value, int):
-                    if not Utils.fits_in_int32(value):
-                        warnings.append(
-                            {
+        for j, value in enumerate(row):
+            if isinstance(value, int):
+                if not Utils.fits_in_int32(value):
+                        warnings.append({
                                 "type": "warning",
                                 "file": file_path,
                                 "row": i,
                                 "column": columns[j],
                                 "message": f"has {value} ({type(value).__name__}), this is a big integer in postgres , consider when inserting into the database (postgres has 32 bit integer limit)",
-                            }
-                        )
+                        })
         return warnings
 
     def validate(self, file_path):
@@ -140,14 +126,31 @@ def validate(self, file_path):
         rows = data.get("rows", [])
         num_cols = len(columns)
 
-        # errors --------
+        # Nothing to check row-by-row: report exactly what is missing and stop early.
+        if not rows or not columns:
+            if not rows and not columns:
+                message = "No rows or columns found"
+            elif not rows:
+                message = "No rows found"
+            else:
+                message = "No columns found"
+
+            errors.append({
+                "type": "error",
+                "file": file_path,
+                "row": rows if rows else None,
+                "column": columns if columns else None,
+                "message": message,
+            })
+            return errors, warnings
+
         errors.extend(self._check_duplicate_columns(file_path, columns))
-        errors.extend(self._check_row_column_mismatches(file_path, rows, num_cols))
-        errors.extend(self._check_data_types(file_path, rows, columns))
-        
-        # warnings --------
-        warnings.extend(self._check_empty_values(file_path, rows, columns))
-        warnings.extend(self._check_value_overflow(file_path, rows, columns))
+
+        for i, row in enumerate(rows):
+            errors.extend(self._check_row_column_mismatch(file_path, i, row, num_cols))
+            warnings.extend(self._check_empty_values(file_path, i, row, columns))
+            warnings.extend(self._check_value_overflow(file_path, i, row, columns))
+            errors.extend(self._check_data_types(file_path, i, row, rows[0], columns))
 
         return errors, warnings
    

From a3ed6ac673b766fc7fb427e7bcc947a3adc2b343 Mon Sep 17 00:00:00 2001
From: Yasandu Imanjith <89386702+yasandu0505@users.noreply.github.com>
Date: Mon, 23 Mar 2026 15:55:39 +0530
Subject: [PATCH 09/10] fix : fix a condition checking

---
 scripts/validator/validators/tabular.py | 48 ++++++++-----------------
 1 file changed, 15 insertions(+), 33 deletions(-)

diff --git a/scripts/validator/validators/tabular.py b/scripts/validator/validators/tabular.py
index 365de33a..dd40071d 100644
--- a/scripts/validator/validators/tabular.py
+++ b/scripts/validator/validators/tabular.py
@@ -41,39 +41,21 @@ def _check_data_types(self, file_path, i, row, first_row, columns):
         errors = []
         for j, value in enumerate(row):
             expected_type = type(first_row[j])
-            if expected_type is str:    
-                if not isinstance(value, expected_type):
-                    errors.append(
-                            {
-                                "type": "error",
-                                "file": file_path,
-                                "row": i,
-                                "column": columns[j],
-                                "message": f"has {value} ({type(value).__name__}), expected {expected_type.__name__}",
-                            }
-                    )
-            elif expected_type is int:
-                if not isinstance(value, expected_type):
-                    errors.append(
-                        {
-                            "type": "error",
-                            "file": file_path,
-                            "row": i,
-                            "column": columns[j],
-                            "message": f"has {value} ({type(value).__name__}), expected {expected_type.__name__}",
-                        }
-                    )
-            elif expected_type is float:
-                if not isinstance(value, (expected_type, int)):
-                    errors.append(
-                        {
-                            "type": "error",
-                            "file": file_path,
-                            "row": i,
-                            "column": columns[j],
-                            "message": f"has {value} ({type(value).__name__}), expected {expected_type.__name__} or whole number",
-                        }
-                    )
+            if expected_type is float:
+                allowed_types = (float, int)
+                expected_msg = "float or whole number"
+            else:
+                allowed_types = (expected_type)
+                expected_msg = expected_type.__name__
+            
+            if not isinstance(value, allowed_types):
+                errors.append({
+                    "type": "error",
+                    "file": file_path,
+                    "row": i,
+                    "column": columns[j],
+                    "message": f"has {value} ({type(value).__name__}), expected {expected_msg}",
+                })
         return errors
 
     def _check_empty_values(self, file_path, i, row, columns):

From 5ef1cd2b4a4d1ded232c70fd48ebb4f521f9986a Mon Sep 17 00:00:00 2001
From: Yasandu Imanjith <89386702+yasandu0505@users.noreply.github.com>
Date: Mon, 23 Mar 2026 16:07:06 +0530
Subject: [PATCH 10/10] feat : adding the readme file

---
 scripts/validator/README.md | 75 +++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 scripts/validator/README.md

diff --git a/scripts/validator/README.md b/scripts/validator/README.md
new file mode 100644
index 00000000..f58ab10c
--- /dev/null
+++ b/scripts/validator/README.md
@@ -0,0 +1,75 @@
+# Data Validator Program
+
+A command-line tool for validating datasets using configurable validator types.
+
+---
+
+## Getting Started
+
+### Prerequisites
+
+- Python 3 (the CI workflow runs 3.12)
+- `pip` and `venv`
+
+### Local Setup
+
+```bash
+# Navigate to the project directory
+cd scripts/validator
+
+# Create a virtual environment
+python3 -m venv venv
+
+# Activate the virtual environment
+source venv/bin/activate
+
+# Install dependencies
+pip install -r requirements.txt
+```
+
+### Running the Program
+
+```bash
+python main.py <path-to-dataset-directory> <validator-type>
+```
+
+**Example:**
+
+```bash
+python main.py ../../data/statistics tabular
+```
+
+---
+
+## Supported Validators
+
+| Validator | Description              | Status  |
+|-----------|--------------------------|---------|
+| `tabular` | Validates tabular data   | ✅ Available |
+
+> **Note:** Additional validators are currently under development.
+
+---
+
+## Project Structure
+
+```
+scripts/validator/
+│
+├── main.py               # Entry point
+├── requirements.txt      # Python dependencies
+├── README.md             # Project documentation
+│
+├── core/
+│   ├── baseRunner.py     # Base runner logic
+│   └── baseValidator.py  # Base validator interface
+│
+├── models/
+│   └── tabularSchema.json  # Schema definition for tabular validation
+│
+├── utils/
+│   └── utils.py          # Shared utility functions
+│
+└── validators/
+    └── tabular.py        # Tabular validator implementation
+```
\ No newline at end of file