-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathLayoffs_Data_Cleaning_SQL.sql
More file actions
380 lines (257 loc) · 8.74 KB
/
Layoffs_Data_Cleaning_SQL.sql
File metadata and controls
380 lines (257 loc) · 8.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
-- Creating a databse
CREATE DATABASE world_layoffs;
-- Entering to the databse
USE world_layoffs;
-- Checkign all the avaialble tables and their contents
SHOW TABLES;
SELECT * FROM layoffs;
-- Data Cleaning Process
-- 1. Removing Duplicats
-- 2. Standerdization (empty spaces, special charecters etc).
-- 3. Dealing with Null/Empty values
-- 4. Dealing with Unwanted Columns/Rows
-- Creating a duplicate table for the modifications
CREATE TABLE layoffs_staging
LIKE layoffs;
-- Copying the table content from the raw table
INSERT layoffs_staging
SELECT * FROM layoffs;
SELECT * FROM layoffs_staging;
-- Using the ROW_Number function to find out the duplicate entires in the table as we do not have the Unique Row ID Column in the table
SELECT *,
ROW_NUMBER() OVER(
PARTITION BY company, industry, total_laid_off, percentage_laid_off, `date`) AS row_num
FROM layoffs_staging;
-- Creating CTE to find the duplicate entries
WITH duplicate_cte AS
(
SELECT *,
ROW_NUMBER() OVER(
PARTITION BY company, industry, total_laid_off, percentage_laid_off, `date`) AS row_num
FROM layoffs_staging
)
SELECT *
from duplicate_cte
WHERE row_num > 1;
-- Checkign the duplicate entries with Company names eg. Oda
SELECT *
FROM layoffs_staging
WHERE company = 'Oda';
-- Found some duplicate and non duplicate entries. Hence, partitioning on all the columns to remove all the non duplicate entries.
WITH duplicate_cte AS
(
SELECT *,
ROW_NUMBER() OVER(
PARTITION BY company, location, industry, total_laid_off, percentage_laid_off, `date`, stage, country, funds_raised_millions) AS row_num
FROM layoffs_staging
)
SELECT *
from duplicate_cte
WHERE row_num > 1;
-- Checkign the duplicate entries with company name again
SELECT *
FROM layoffs_staging
WHERE company = 'Casper';
-- Deleting duplicate rows with the DELETE function
WITH duplicate_cte AS
(
SELECT *,
ROW_NUMBER() OVER(
PARTITION BY company, location, industry, total_laid_off, percentage_laid_off, `date`, stage, country, funds_raised_millions) AS row_num
FROM layoffs_staging
)
DELETE
from duplicate_cte
WHERE row_num > 1;
-- Creating stating2 table due to an error while deleting rows from the CTE
CREATE TABLE `layoffs_staging2` (
`company` text,
`location` text,
`industry` text,
`total_laid_off` int DEFAULT NULL,
`percentage_laid_off` text,
`date` text,
`stage` text,
`country` text,
`funds_raised_millions` int DEFAULT NULL,
`row_num` int
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
-- Checkign the table after its execution sucessfully
SELECT *
FROM layoffs_staging2;
-- Copied the data from staging1 table with the row_num column
INSERT INTO layoffs_staging2
SELECT *,
ROW_NUMBER() OVER(
PARTITION BY company, location, industry, total_laid_off, percentage_laid_off, `date`, stage, country, funds_raised_millions) AS row_num
FROM layoffs_staging;
-- Checkign the table after importing the data
SELECT *
FROM layoffs_staging2;
-- Filtering the duplicate enteries with the help of row_num column
SELECT *
FROM layoffs_staging2
WHERE row_num > 1;
-- Deleting the duplicate rows
DELETE
FROM layoffs_staging2
WHERE row_num > 1;
-- Checkign the table after deleting the duplicate rows
SELECT *
FROM layoffs_staging2
WHERE row_num > 1;
SELECT *
FROM layoffs_staging2;
-- Standerdizing Data : Finding ussuse in the data and fixing it
-- Checkign the first column for issues
SELECT DISTINCT(company)
FROM layoffs_staging2;
-- Found a company name with extra spaces. Hence, checking and removing them with the TRIM function
SELECT company, trim(company)
FROM layoffs_staging2;
-- Now updating the trimed company column to the table.
UPDATE layoffs_staging2
SET company = trim(company);
-- Checkign the table after the first column updation
SELECT *
FROM layoffs_staging2;
-- Checkign the 2nd column Industry
SELECT DISTINCT(industry)
FROM layoffs_staging2;
-- No spaces found but found industry name mentioned differently. Hence, ordering them by Order By function
SELECT DISTINCT(industry)
FROM layoffs_staging2
ORDER BY 1;
-- Checkign all the enteries for Crypto to check if they are same or different
SELECT *
FROM layoffs_staging2
WHERE industry LIKE 'Crypto%';
-- Updating industry Crypto, Crypto Currency, and CryptoCurrency to Crypto as they all are the same but mentioned differently
UPDATE layoffs_staging2
SET industry = 'Crypto'
WHERE industry LIKE 'Crypto%';
-- Checking the industry column again after updating the Crypto to make sure they are corrected
SELECT DISTINCT(industry)
FROM layoffs_staging2
ORDER BY 1;
-- Checkign the next column location
SELECT DISTINCT(location)
FROM layoffs_staging2
ORDER BY 1;
-- Checkign the next column country as everything seems fine with the previous column
SELECT DISTINCT(country)
FROM layoffs_staging2
ORDER BY 1;
-- Removed a fullstop from United State
SELECT *
FROM layoffs_staging2
WHERE country LIKE 'United States%';
-- Checkign the Trim function and comparing them
SELECT DISTINCT country, TRIM(TRAILING '.' FROM Country)
FROM layoffs_staging2
ORDER BY 1;
-- Updating the changes to the table
UPDATE layoffs_staging2
SET country = TRIM(TRAILING '.' FROM Country)
WHERE country LIKE 'United States%';
-- Checking if the changes are updated correctly
SELECT DISTINCT(country)
FROM layoffs_staging2
ORDER BY 1;
-- The date column was kept as text to change later. Now changing string to date and changing the format as well
SELECT `date`,
STR_TO_DATE(`date`, '%m/%d/%Y')
FROM layoffs_staging2;
-- Updating the new formated date column
UPDATE layoffs_staging2
SET `date` = STR_TO_DATE(`date`, '%m/%d/%Y');
-- Checkign if the date column has been upated correctly
SELECT `date`
FROM layoffs_staging2;
-- Updating the date column data type
ALTER TABLE layoffs_staging2
MODIFY COLUMN `date` DATE;
-- We are goign to work on Null and Empty values in the table
-- Checkign the entire table column wise for null and empty values
SELECT *
FROM layoffs_staging2;
SELECT *
FROM layoffs_staging2
WHERE company IS NULL
OR company = '';
-- Checking the second column location for null/empty values as none found in the 1st
SELECT *
FROM layoffs_staging2
WHERE location IS NULL
OR location = '';
-- Checking the 3rd column industry for null/empty values as none found in the previous columns
SELECT *
FROM layoffs_staging2
WHERE industry IS NULL
OR industry = '';
-- Found entries with null and empty values.
-- Checkign all of the companies to see if we could populate values from other rows.
SELECT *
FROM layoffs_staging2
WHERE company = 'Airbnb';
-- Found Airbnb with industry values, will update all other with self join function
SELECT *
FROM layoffs_staging2 t1
JOIN layoffs_staging2 t2
ON t1.company = t2.company
WHERE (t1.industry IS NULL OR t1.industry = '')
AND t2.industry IS NOT NULL;
-- Updating with the industry values from the matching column values
UPDATE layoffs_staging2 t1
JOIN layoffs_staging2 t2
ON t1.company = t2.company
SET t1.industry = t2.industry
WHERE (t1.industry IS NULL OR t1.industry = '')
AND t2.industry IS NOT NULL;
-- Its been updated but 0 row affected. Hence, making some changes like converting all empty values to NULL
UPDATE layoffs_staging2
SET industry = null
WHERE industry = '';
-- Checking if the empty values have been replaced with NULL values
SELECT *
FROM layoffs_staging2
WHERE industry IS NULL;
-- Updating the inductry column after replacing all the empty values with NULL
UPDATE layoffs_staging2 t1
JOIN layoffs_staging2 t2
ON t1.company = t2.company
SET t1.industry = t2.industry
WHERE t1.industry IS NULL
AND t2.industry IS NOT NULL;
-- Checking if all the entries are updated
SELECT company, industry
FROM layoffs_staging2
WHERE industry IS NULL;
-- Checking company Bally's for possible industry null value as all other are updated
SELECT *
FROM layoffs_staging2
WHERE company LIKE 'Bally%';
-- Found only 1 entry with no infomation to populate. Hence, leaving it
-- Checking Total Laid Off and Percentage Laid Off columns for null/empty values
SELECT *
FROM layoffs_staging2
WHERE (total_laid_off IS NULL OR total_laid_off = '')
AND (percentage_laid_off IS NULL OR percentage_laid_off = '');
-- Deleting null/empty values as no way to populate the them
DELETE
FROM layoffs_staging2
WHERE (total_laid_off IS NULL OR total_laid_off = '')
AND (percentage_laid_off IS NULL OR percentage_laid_off = '');
-- Check if there is any null/empty values remains
SELECT *
FROM layoffs_staging2
WHERE (total_laid_off IS NULL OR total_laid_off = '')
AND (percentage_laid_off IS NULL OR percentage_laid_off = '');
SELECT *
FROM layoffs_staging2;
-- Drpping the Row Number columns as almost all the data has been standerdized and remainign null values that has no specific instructions remain
ALTER TABLE layoffs_staging2
DROP COLUMN row_num;
-- Checkign if the Row Number column has been deleted
SELECT *
FROM layoffs_staging2;