-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathquery.sql
More file actions
311 lines (254 loc) · 8.59 KB
/
query.sql
File metadata and controls
311 lines (254 loc) · 8.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
-- project- Global Layoffs: Data cleaning with SQL
-- take a look at the dataset
SELECT *
FROM layoffs;
-- now when we are data cleaning we usually follow a few steps
-- 1. check for duplicates and remove any
-- 2. standardize data and fix errors
-- 3. Look at null values or blank values
-- 4. remove any columns and rows that are not necessary - few ways
-- steps:
-- first thing we want to do is create a staging table.
-- This is the one we will work in and clean the data.
-- We want a table with the raw data intact in case something happens.
-- creating staging table
CREATE TABLE layoffs_staging
LIKE layoffs;
-- inserting values in the staging table
INSERT layoffs_staging
SELECT *
FROM layoffs;
-- querying all row in staging table
SELECT *
FROM layoffs_staging;
-- total number of distinct companies
SELECT count(DISTINCT company)
FROM layoffs_staging_new;
-- check the values in the staging table
SELECT *
FROM layoffs_staging;
-- Remove duplicates
-- check for duplicates:
-- step-1: create row number over partitioned by all columns
-- step-2: query the rows where row number greater than 1
-- Finding duplicates
WITH duplicate_cte AS(
SELECT *, ROW_NUMBER() OVER(PARTITION BY company, location, industry, total_laid_off,
percentage_laid_off, `date`, stage, country, funds_raised_millions) row_num
FROM layoffs_staging
)
SELECT *
FROM duplicate_cte
WHERE row_num>1;
SELECT *
FROM layoffs_staging
WHERE company = 'Hibob';
-- we cannot delete the duplicate using delete statement (like update) in mysql unlike sql server or postgres
-- with cte as(
-- select *, row_number() over ( partition by company, location, industry, total_laid_off,
-- percentage_laid_off, `date`, stage, country, funds_raised_millions) row_num
-- from layoffs_staging)
-- delete
-- from cte
-- where row_num>1
-- creating new working table layoffs_staging_new
CREATE TABLE `layoffs_staging_new` (
`company` text,
`location` text,
`industry` text,
`total_laid_off` int DEFAULT NULL,
`percentage_laid_off` text,
`date` text,
`stage` text,
`country` text,
`funds_raised_millions` int DEFAULT NULL,
`row_num` int
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
-- inserting value in the new table layoffs_staging_new
INSERT layoffs_staging_new
SELECT *, ROW_NUMBER() OVER(PARTITION BY company, location, industry, total_laid_off,
percentage_laid_off, `date`, stage, country, funds_raised_millions) row_num
FROM layoffs_staging;
-- check the duplicates
SELECT *
FROM layoffs_staging_new
WHERE row_num>1;
-- now delete the duplicates
DELETE
FROM layoffs_staging_new
WHERE row_num>1;
-- check after deleting
SELECT *
FROM layoffs_staging_new
WHERE row_num>1;
-- standardizing the data
-- first check if there is any white space before and after the word
-- begin with column company
SELECT DISTINCT company
FROM layoffs_staging_new;
-- we notice there is a space before word
-- use trim and see the difference
SELECT company, trim(company)
FROM layoffs_staging_new;
-- update the table by replacing the column with trimmed column
UPDATE layoffs_staging_new
SET company = trim(company);
-- now see the column industry
SELECT DISTINCT industry
FROM layoffs_staging_new
ORDER BY 1;
-- crypto, crypto-currecny should be labelled same
-- lets examine more
-- compare the count of crypto vs crypto%
SELECT count(*)
FROM layoffs_staging_new
WHERE industry LIKE 'crypto%'
UNION ALL
SELECT count(*)
FROM layoffs_staging_new
WHERE industry = 'crypto';
-- update all crypto% replacing with crypto
UPDATE layoffs_staging_new
SET industry = 'Crypto'
WHERE industry LIKE 'crypto%';
-- now see after updating
SELECT DISTINCT industry
FROM layoffs_staging_new;
-- check column location
SELECT DISTINCT location
FROM layoffs_staging_new
ORDER BY 1;
-- check column country
SELECT DISTINCT country
FROM layoffs_staging_new
ORDER BY 1;
-- findings : United States. vs United States
-- . needs to be removed
SELECT *
FROM layoffs_staging2
WHERE country LIKE 'united%.';
-- there are fours like this
-- removing the period
SELECT DISTINCT country, trim(TRAILING '.' FROM country)
FROM layoffs_staging_new
ORDER BY 1;
-- update the table
UPDATE layoffs_staging_new
SET country = trim(TRAILING '.' FROM country)
WHERE country LIKE 'United States%';
-- see the updated table
SELECT DISTINCT country
FROM layoffs_staging_new
ORDER BY 1;
-- check the data type
-- date datatype is text which needs to be changed to date
-- check column date
SELECT `date`
FROM layoffs_staging_new;
-- change the date format m-d-y to standard format y-m-d
SELECT `date`, str_to_date(`date`, '%m/%d/%Y')
FROM layoffs_staging_new;
-- update date column
UPDATE layoffs_staging_new
SET `date` = str_to_date(`date`, '%m/%d/%Y');
-- fix the datatype of date from text to date
ALTER TABLE layoffs_staging_new
MODIFY COLUMN `date` DATE;
-- Now let's work with null and blank cells
-- checking null in one column -- total_laid_off
SELECT *
FROM layoffs_staging_new
WHERE total_laid_off IS NULL;
-- null in multiple columns: total_laid_off and percentage_laid_off
SELECT *
FROM layoffs_staging_new
WHERE total_laid_off IS NULL AND percentage_laid_off IS NULL;
-- check if there is any null or blank in industry column
SELECT *
FROM layoffs_staging_new
WHERE industry IS NULL
OR industry = '';
SELECT *
FROM layoffs_staging_new
WHERE company ='Airbnb';
-- fill out / populate blanks for industries from other rows with same attributes for other columns
SELECT t1.industry, t2.industry
FROM layoffs_staging_new t1
JOIN layoffs_staging_new t2
ON t1.company = t2.company -- and t1.location = t2.location
WHERE (t1.industry IS NULL OR t1.industry ='') AND (t2.industry IS NOT NULL OR t2.industry != '');
-- populate blanks for industries from other rows with same attributes for other columns
SELECT t1.industry, t2.industry
FROM layoffs_staging_new t1
JOIN layoffs_staging_new t2
ON t1.company = t2.company -- and t1.location = t2.location
WHERE (t1.industry IS NULL OR t1.industry ='') AND (t2.industry != '');
-- update table with populting industry name in blanks
UPDATE layoffs_staging_new t1
JOIN layoffs_staging_new t2
ON t1.company = t2.company
SET t1.industry = t2.industry
WHERE (t1.industry IS NULL OR t1.industry ='') AND (t2.industry != '');
-- update layoffs_staging2
-- set industry = null
-- where industry = '';
-- check after updating table with populating industry
SELECT *
FROM layoffs_staging_new
WHERE industry IS NULL OR industry = '';
-- check how many bally's company
SELECT *
FROM layoffs_staging_new
WHERE company LIKE 'bally%';
-- dropping column or row unnecessary
-- deleting rows where both total_laid_off and percentage_laid_off null
SELECT *
FROM layoffs_staging_new
WHERE total_laid_off IS NULL AND percentage_laid_off IS NULL;
DELETE
FROM layoffs_staging_new
WHERE total_laid_off IS NULL AND percentage_laid_off IS NULL;
-- dropping column row_num
ALTER TABLE layoffs_staging_new
DROP COLUMN row_num;
-- drop table layoffs_staging_new
select count(*)
from layoffs_staging_new
--where company = 'Included Health';
--- check the medians
SELECT
Company, Industry,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY total_laid_off)
OVER (PARTITION BY industry) AS median_total_laid_off,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY percentage_laid_off)
OVER (PARTITION BY industry) AS median_percentage_laid_off,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY funds_raised_millions)
OVER (PARTITION BY industry) AS median_funds_raised_millions
FROM layoffs_staging_new;
-- update table by replacing nulls with medians
WITH industry_medians AS (
SELECT
industry,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY total_laid_off)
OVER (PARTITION BY industry) AS median_total_laid_off,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY percentage_laid_off)
OVER (PARTITION BY industry) AS median_percentage_laid_off,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY funds_raised_millions)
OVER (PARTITION BY industry) AS median_funds_raised_millions
FROM layoffs_staging_new
)
UPDATE lsn
SET
lsn.total_laid_off = COALESCE(lsn.total_laid_off, im.median_total_laid_off),
lsn.percentage_laid_off = COALESCE(lsn.percentage_laid_off, im.median_percentage_laid_off),
lsn.funds_raised_millions = COALESCE(lsn.funds_raised_millions, im.median_funds_raised_millions)
FROM layoffs_staging_new lsn
JOIN industry_medians im
ON lsn.industry = im.industry;
-- check if there is null in the theree columns
SELECT company, industry, total_laid_off, percentage_laid_off, funds_raised_millions
FROM layoffs_staging_new
WHERE total_laid_off IS NULL OR percentage_laid_off IS NULL OR funds_raised_millions IS NULL;
-- see the final table
SELECT *
FROM layoffs_staging_new;