-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcleaning_script.sql
More file actions
240 lines (209 loc) · 8.16 KB
/
cleaning_script.sql
File metadata and controls
240 lines (209 loc) · 8.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
/*
Project: PAN Number Validation & Data Quality Analysis
Database: PostgreSQL
Author: Manohar K
Objective:
- Clean raw PAN number data
- Identify missing, duplicate, and malformed PANs
- Validate PANs using structural and business rules
- Produce a data quality summary report
PAN Format:
AAAAA1234A
Key Rules:
- Exactly 10 characters
- Uppercase alphabets only
- No adjacent repeating letters or digits
- No alphabetical or numeric sequences
Notes:
- Assumes raw data is already loaded into pan_numbers_dataset.pan_number
- Database creation is optional and environment-dependent
*/
------------------------------------------------------------
-- Optional: Create database (run once if needed)
------------------------------------------------------------
-- CREATE DATABASE pan_number_validation_project;
------------------------------------------------------------
-- Base table (assumed to exist in production)
------------------------------------------------------------
-- CREATE TABLE pan_numbers_dataset (
-- pan_number TEXT
-- );
------------------------------------------------------------
-- Importing the data
/*
I imported the CSV data into PostgreSQL using DBeaver’s data import functionality, which internally uses
PostgreSQL COPY. After ingestion, the raw values live in `pan_numbers_dataset.pan_number` and may
contain NULLs, leading/trailing spaces, mixed case, duplicates, or malformed values. The sections below
identify missing values, trim/normalize entries, and validate PAN format rules.
*/
-- Cleaning the PAN Numbers
-- 1. Identify and handle the missing data
SELECT pan_number
FROM pan_numbers_dataset
WHERE pan_number IS NULL
-- 2. Check for the Duplicates
SELECT
pan_number,
COUNT(*)
FROM pan_numbers_dataset
GROUP BY pan_number
HAVING COUNT(*) >1
-- 3. Alternatively, use DISTINCT to remove duplicates
SELECT DISTINCT(pan_number)
FROM pan_numbers_dataset
-- 4. Handling leading/trailing spaces
SELECT
pan_number,
LENGTH(pan_number) AS with_spaces,
LENGTH(TRIM(pan_number)) AS without_spaces
FROM pan_numbers_dataset
WHERE LENGTH(pan_number) != LENGTH(TRIM(pan_number))
-- 5. Correct letter case
SELECT
pan_number,
UPPER(pan_number)
FROM pan_numbers_dataset
-- Main query: combined cleaning logic to produce normalized PAN values
/*
This query normalizes PAN values by trimming whitespace, converting to uppercase,
removing duplicates, and filtering out NULL/empty entries. It provides a cleaned
set of PANs ready for format validation.
*/
SELECT
DISTINCT(TRIM(UPPER(pan_number)))
FROM pan_numbers_dataset
WHERE pan_number IS NOT NULL
AND TRIM(pan_number) != ''
-- PAN Validation Rules
-- 1. It is exactly 10 characters long. And the format is as follows: AAAAA1234A
SELECT pan_number
FROM pan_numbers_dataset
WHERE pan_number ~ '^[A-Z]{5}[0-9]{4}[A-Z]$'
-- 2. The first five characters should be alphabetic (uppercase letters)
SELECT pan_number
FROM pan_numbers_dataset
WHERE SUBSTRING(pan_number,1,5) ~ '^[A-Z]{5}'
-- 3. Adjacent characters(alphabets) cannot be the same (like AABCD is invalid; AXBCD is valid)
SELECT pan_number
FROM pan_numbers_dataset
WHERE SUBSTRING(pan_number,1,5) !~ '([A-Z])\1'
-- 4. All five characters cannot form a sequence (like: ABCDE, BCDEF is invalid; ABCDX is valid)
SELECT pan_number
FROM pan_numbers_dataset
WHERE NOT (SELECT
BOOL_AND(ASCII(substring(pan_number,i+1,1)) - ASCII(substring(pan_number,i,1)) = 1)
FROM generate_series(1,4)i)
-- 5. Adjacent characters(digits) cannot be the same (like 1123 is invalid; 1923 is valid)
SELECT pan_number
FROM pan_numbers_dataset
WHERE SUBSTRING(pan_number,6,4) !~ '([0-9])\1'
-- 6. All four digits must not form a strictly ascending sequence (e.g., 1234, 2345)
SELECT pan_number
FROM pan_numbers_dataset
WHERE (SUBSTRING(pan_number,6,4) NOT IN ('0123','1234','2345','3456','4567','5678','6789'))
-- 7. The last character should be alphabetic (uppercase letter)
SELECT pan_number
FROM pan_numbers_dataset
WHERE SUBSTRING(pan_number,10,1) ~ '[A-Z]$'
-- Main Query - PAN Format Validation: A valid PAN number follows the format:
WITH cte_cleaned_pan AS(
SELECT
DISTINCT(TRIM(UPPER(pan_number))) AS pan_number
FROM pan_numbers_dataset
WHERE pan_number IS NOT NULL
AND TRIM(pan_number) != '')
SELECT
*
FROM cte_cleaned_pan
-- It is exactly 10 characters long. And the format is as follows: AAAAA1234A
WHERE pan_number ~ '^[A-Z]{5}[0-9]{4}[A-Z]$'
-- The first five characters should be alphabetic (uppercase letters)
AND SUBSTRING(pan_number,1,5) ~ '^[A-Z]{5}'
-- Adjacent characters(alphabets) cannot be the same (like AABCD is invalid; AXBCD is valid)
AND SUBSTRING(pan_number,1,5) !~ '([A-Z])\1'
-- All five characters cannot form a sequence (like: ABCDE, BCDEF is invalid; ABCDX is valid)
AND NOT(SELECT
BOOL_AND(ASCII(substring(pan_number,i+1,1)) - ASCII(substring(pan_number,i,1)) = 1)
FROM generate_series(1,4)i)
-- Adjacent characters(digits) cannot be the same (like 1123 is invalid; 1923 is valid)
AND SUBSTRING(pan_number,6,4) !~ '([0-9])\1'
-- All four characters cannot form a sequence (like: 1234, 2345
AND (SUBSTRING(pan_number,6,4) NOT IN ('0123','1234','2345','3456','4567','5678','6789'))
-- The last character should be alphabetic (uppercase letter)
AND SUBSTRING(pan_number,10,1) ~ '[A-Z]$'
/*
Create two separate categories
Valid PAN - If the PAN number matches the above format.
Invalid PAN - If the PAN number does not match the correct format, is incomplete, or contains any non-alphanumeric characters.
*/
SELECT
pan_number,
CASE
WHEN pan_number ~ '^[A-Z]{5}[0-9]{4}[A-Z]$'
-- Adjacent characters(alphabets) cannot be the same (like AABCD is invalid; AXBCD is valid)
AND SUBSTRING(pan_number,1,5) !~ '([A-Z])\1'
-- All five characters cannot form a sequence (like: ABCDE, BCDEF is invalid; ABCDX is valid)
AND NOT(SELECT
BOOL_AND(ASCII(substring(pan_number,i+1,1)) - ASCII(substring(pan_number,i,1)) = 1)
FROM generate_series(1,4)i)
-- Adjacent characters(digits) cannot be the same (like 1123 is invalid; 1923 is valid)
AND SUBSTRING(pan_number,6,4) !~ '([0-9])\1'
-- All four characters cannot form a sequence (like: 1234, 2345
AND (SUBSTRING(pan_number,6,4) NOT IN ('0123','1234','2345','3456','4567','5678','6789'))
THEN 'Valid PAN'
ELSE 'Invalid PAN'
END
FROM pan_numbers_dataset;
-- Create a summary report that provides the following
-- Total records processed
-- Total valid PANs
-- Total invalid PANs
-- Total missing or incomplete PANs (if applicable)
WITH cte_raw_count AS (
SELECT COUNT(*) AS total_raw_pan_count
FROM pan_numbers_dataset
)
,cte_missing_pan AS (
SELECT COUNT(*) AS missing_pan_count
FROM pan_numbers_dataset
WHERE pan_number IS NULL
OR TRIM(pan_number) = ''
)
,cte_cleaned_pan AS(
SELECT
DISTINCT(TRIM(UPPER(pan_number))) AS cleaned_pan_number
FROM pan_numbers_dataset
WHERE pan_number IS NOT NULL
AND LENGTH(TRIM(pan_number)) > 0)
,cte_report AS (
SELECT
cleaned_pan_number,
CASE
WHEN cleaned_pan_number ~ '^[A-Z]{5}[0-9]{4}[A-Z]$'
-- Adjacent characters(alphabets) cannot be the same (like AABCD is invalid; AXBCD is valid)
AND SUBSTRING(cleaned_pan_number,1,5) !~ '([A-Z])\1'
-- All five characters cannot form a sequence (like: ABCDE, BCDEF is invalid; ABCDX is valid)
AND NOT(SELECT
BOOL_AND(ASCII(substring(cleaned_pan_number,i+1,1)) - ASCII(substring(cleaned_pan_number,i,1)) = 1)
FROM generate_series(1,4)i)
-- Adjacent characters(digits) cannot be the same (like 1123 is invalid; 1923 is valid)
AND SUBSTRING(cleaned_pan_number,6,4) !~ '([0-9])\1'
-- All four characters cannot form a sequence (like: 1234, 2345
AND (SUBSTRING(cleaned_pan_number,6,4) NOT IN ('0123','1234','2345','3456','4567','5678','6789'))
THEN 'Valid PAN'
ELSE 'Invalid PAN'
END AS Category
FROM cte_cleaned_pan)
SELECT
r.total_raw_pan_count AS total_pan_before_cleaning,
m.missing_pan_count AS missing_pan_count,
COUNT(*) AS pan_count_after_cleaning,
category
FROM cte_report cr
CROSS JOIN cte_raw_count r
CROSS JOIN cte_missing_pan m
GROUP BY
r.total_raw_pan_count,
m.missing_pan_count,
category
ORDER BY category;