-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbaseball_operations_analytics.sql
More file actions
346 lines (295 loc) · 11.1 KB
/
baseball_operations_analytics.sql
File metadata and controls
346 lines (295 loc) · 11.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
-- ============================================================================
-- BASEBALL OPERATIONS ANALYTICS
-- ============================================================================
-- Analysing team spending efficiency, player career trajectories, and
-- workforce composition patterns using advanced SQL techniques.
--
-- Skills demonstrated: Window functions, CTEs, cumulative calculations,
-- date manipulation, pivoting, and multi-table joins.
--
-- Author: Rachel Berger
-- GitHub: github.com/bergerache
-- ============================================================================
-- ============================================================================
-- TEAM SPENDING ANALYSIS
-- ============================================================================
-- Understanding how teams allocate budgets over time, identifying top spenders,
-- and tracking cumulative investment milestones.
-- ============================================================================
-- Which teams are in the top 20% for average annual spending?
-- (Banking parallel: Identifying high-value customers by transaction volume)
WITH annual_team_spend AS (
SELECT teamID,
yearID,
SUM(salary) AS total_spend
FROM salaries
GROUP BY teamID, yearID
),
team_percentiles AS (
SELECT teamID,
AVG(total_spend) AS avg_annual_spend,
NTILE(5) OVER (ORDER BY AVG(total_spend) DESC) AS spend_quintile
FROM annual_team_spend
GROUP BY teamID
)
SELECT teamID,
ROUND(avg_annual_spend / 1000000, 2) AS avg_spend_millions,
spend_quintile
FROM team_percentiles
WHERE spend_quintile = 1
ORDER BY avg_spend_millions DESC;
-- How has each team's cumulative spending grown over time?
-- (Banking parallel: Running balance calculations, cumulative AUM tracking)
WITH annual_team_spend AS (
SELECT teamID,
yearID,
SUM(salary) AS total_spend
FROM salaries
GROUP BY teamID, yearID
)
SELECT teamID,
yearID,
ROUND(total_spend / 1000000, 2) AS annual_spend_millions,
ROUND(SUM(total_spend) OVER (
PARTITION BY teamID
ORDER BY yearID
) / 1000000, 2) AS cumulative_spend_millions
FROM annual_team_spend
ORDER BY teamID, yearID;
-- When did each team's cumulative spending first exceed $1 billion?
-- (Banking parallel: Time-to-threshold analysis, milestone tracking)
WITH annual_team_spend AS (
SELECT teamID,
yearID,
SUM(salary) AS total_spend
FROM salaries
GROUP BY teamID, yearID
),
cumulative_spend AS (
SELECT teamID,
yearID,
SUM(total_spend) OVER (
PARTITION BY teamID
ORDER BY yearID
) AS running_total
FROM annual_team_spend
),
billion_threshold AS (
SELECT teamID,
yearID,
running_total,
ROW_NUMBER() OVER (
PARTITION BY teamID
ORDER BY yearID
) AS rn
FROM cumulative_spend
WHERE running_total > 1000000000
)
SELECT teamID,
yearID AS first_year_over_1B,
ROUND(running_total / 1000000000, 2) AS cumulative_spend_billions
FROM billion_threshold
WHERE rn = 1
ORDER BY first_year_over_1B;
-- ============================================================================
-- PLAYER CAREER ANALYSIS
-- ============================================================================
-- Examining player career trajectories, tenure patterns, and team loyalty
-- to understand workforce dynamics.
-- ============================================================================
-- What are the career lengths of players, and at what age did they start/end?
-- (Banking parallel: Customer tenure analysis, lifecycle stage identification)
WITH player_dates AS (
SELECT playerID,
nameGiven,
CAST(CONCAT(birthYear, '-', birthMonth, '-', birthDay) AS DATE) AS birth_date,
CAST(debut AS DATE) AS debut_date,
CAST(finalGame AS DATE) AS final_date
FROM players
)
SELECT playerID,
nameGiven,
birth_date,
debut_date,
final_date,
TIMESTAMPDIFF(YEAR, birth_date, debut_date) AS age_at_debut,
TIMESTAMPDIFF(YEAR, birth_date, final_date) AS age_at_retirement,
TIMESTAMPDIFF(YEAR, debut_date, final_date) AS career_length_years
FROM player_dates
WHERE birth_date IS NOT NULL
AND debut_date IS NOT NULL
ORDER BY career_length_years DESC;
-- Which team did each player start and end their career with?
-- (Banking parallel: First product vs current product analysis, customer journey endpoints)
WITH player_teams AS (
SELECT playerID,
teamID,
yearID,
FIRST_VALUE(teamID) OVER (
PARTITION BY playerID
ORDER BY yearID
) AS first_team,
FIRST_VALUE(teamID) OVER (
PARTITION BY playerID
ORDER BY yearID DESC
) AS last_team,
ROW_NUMBER() OVER (
PARTITION BY playerID
ORDER BY yearID
) AS rn
FROM salaries
)
SELECT playerID,
first_team,
last_team,
CASE WHEN first_team = last_team THEN 'Yes' ELSE 'No' END AS career_loyalty
FROM player_teams
WHERE rn = 1
ORDER BY playerID;
-- How many players showed long-term loyalty (same team for 10+ years)?
-- (Banking parallel: Long-term customer retention analysis)
WITH player_teams AS (
SELECT playerID,
FIRST_VALUE(teamID) OVER (
PARTITION BY playerID
ORDER BY yearID
) AS first_team,
FIRST_VALUE(teamID) OVER (
PARTITION BY playerID
ORDER BY yearID DESC
) AS last_team,
ROW_NUMBER() OVER (
PARTITION BY playerID
ORDER BY yearID
) AS rn
FROM salaries
),
player_careers AS (
SELECT playerID,
TIMESTAMPDIFF(YEAR,
CAST(debut AS DATE),
CAST(finalGame AS DATE)
) AS career_length
FROM players
),
loyal_players AS (
SELECT t.playerID,
t.first_team,
t.last_team,
c.career_length
FROM player_teams t
LEFT JOIN player_careers c ON t.playerID = c.playerID
WHERE t.rn = 1
AND t.first_team = t.last_team
AND c.career_length > 10
)
SELECT COUNT(*) AS loyal_long_career_players,
ROUND(COUNT(*) * 100.0 / (SELECT COUNT(DISTINCT playerID) FROM salaries), 1) AS pct_of_all_players
FROM loyal_players;
-- ============================================================================
-- TALENT PIPELINE ANALYSIS
-- ============================================================================
-- Analysing which institutions produce the most professional players
-- and how this has evolved over time.
-- ============================================================================
-- Which schools have produced the most professional players?
-- (Banking parallel: Lead source analysis, channel effectiveness)
SELECT sd.name_full AS school_name,
COUNT(DISTINCT s.playerID) AS players_produced
FROM schools s
INNER JOIN school_details sd ON s.schoolID = sd.schoolID
GROUP BY s.schoolID, sd.name_full
ORDER BY players_produced DESC
LIMIT 10;
-- How has player production by school changed across decades?
-- (Banking parallel: Cohort analysis, trend identification over time)
WITH school_decades AS (
SELECT FLOOR(s.yearID / 10) * 10 AS decade,
sd.name_full AS school_name,
COUNT(DISTINCT s.playerID) AS players_produced
FROM schools s
INNER JOIN school_details sd ON s.schoolID = sd.schoolID
GROUP BY decade, s.schoolID, sd.name_full
),
ranked_schools AS (
SELECT decade,
school_name,
players_produced,
ROW_NUMBER() OVER (
PARTITION BY decade
ORDER BY players_produced DESC
) AS decade_rank
FROM school_decades
)
SELECT decade,
school_name,
players_produced,
decade_rank
FROM ranked_schools
WHERE decade_rank <= 3
ORDER BY decade DESC, decade_rank;
-- ============================================================================
-- WORKFORCE COMPOSITION ANALYSIS
-- ============================================================================
-- Examining player characteristics and how physical attributes have
-- evolved over time.
-- ============================================================================
-- What is the batting hand distribution across teams?
-- (Banking parallel: Customer segmentation by product preference)
WITH player_team_batting AS (
SELECT DISTINCT
s.teamID,
s.playerID,
p.bats
FROM salaries s
LEFT JOIN players p ON s.playerID = p.playerID
)
SELECT teamID,
COUNT(playerID) AS total_players,
ROUND(SUM(CASE WHEN bats = 'R' THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS pct_right,
ROUND(SUM(CASE WHEN bats = 'L' THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS pct_left,
ROUND(SUM(CASE WHEN bats = 'B' THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS pct_both
FROM player_team_batting
WHERE teamID IS NOT NULL
GROUP BY teamID
ORDER BY teamID;
-- How have player physical attributes changed decade over decade?
-- (Banking parallel: Customer demographic shifts, trend analysis)
WITH decade_averages AS (
SELECT FLOOR(YEAR(debut) / 10) * 10 AS decade,
ROUND(AVG(height), 1) AS avg_height,
ROUND(AVG(weight), 1) AS avg_weight,
COUNT(*) AS player_count
FROM players
WHERE debut IS NOT NULL
AND height IS NOT NULL
AND weight IS NOT NULL
GROUP BY FLOOR(YEAR(debut) / 10) * 10
)
SELECT decade,
avg_height,
avg_weight,
player_count,
ROUND(avg_height - LAG(avg_height) OVER (ORDER BY decade), 2) AS height_change,
ROUND(avg_weight - LAG(avg_weight) OVER (ORDER BY decade), 2) AS weight_change
FROM decade_averages
WHERE decade IS NOT NULL
ORDER BY decade;
-- ============================================================================
-- SUMMARY
-- ============================================================================
-- This analysis demonstrates:
-- • Window functions (ROW_NUMBER, NTILE, LAG, FIRST_VALUE)
-- • Common Table Expressions (CTEs) for readable, modular queries
-- • Cumulative/running calculations
-- • Date manipulation and tenure analysis
-- • Pivoting with CASE WHEN
-- • Multi-table joins
--
-- These techniques directly transfer to financial services analytics:
-- • Customer lifetime value calculations
-- • Running balance and cumulative metrics
-- • Cohort and trend analysis
-- • Customer segmentation
-- • Retention and loyalty analysis
-- ============================================================================