diff --git a/R/06-behaviour.R b/R/06-behaviour.R index 5760954..0c5c0b2 100644 --- a/R/06-behaviour.R +++ b/R/06-behaviour.R @@ -8,41 +8,78 @@ # or manually run 00-load-raw-data.R before this script. # Smoke -------------------------------------------------------------------- - # Load smoking data from relevant sweeps smoking_vars <- list( S1 = ns_data[["S1youngperson"]] %>% - select(NSID, smknw14 = W1cignowYP, smk14 = W1cigfreqYP), + select(NSID, smk14_ever = W1cignowYP, smk14_freq = W1cigfreqYP), S2 = ns_data[["S2youngperson"]] %>% - select(NSID, smknw15 = W2cignowYP, smk15 = W2cigfreqYP), + select(NSID, smk15_ever = W2cignowYP, smk15_freq = W2cigfreqYP), S3 = ns_data[["S3youngperson"]] %>% - select(NSID, smknw16 = W3cignowYP, smk16 = W3cigfreqYP), + select(NSID, smk16_ever = W3cignowYP, smk16_freq = W3cigfreqYP), S4 = ns_data[["S4youngperson"]] %>% select(NSID), S8 = ns_data[["S8selfcompletion"]] %>% - select(NSID, smk25 = W8SMOKING), + select(NSID, smk25_ever_freq = W8SMOKING), S9 = ns_data[["S9maininterview"]] %>% - select(NSID, smk32 = W9SMOKING) + select(NSID, smk32_ever_freq = W9SMOKING) ) # Merge all sweeps smoking_all <- reduce(smoking_vars, full_join, by = "NSID") -# Recode smoke ever/frequency -recode_smk14_16 <- function(x) { + +# Merge all sweeps +# smoking_all <- reduce(smoking_vars, full_join, by = "NSID") %>% +# Rename all smknw to smk_ever and smk to smk_freq for readability +# rename_with( +# ~ stringr::str_replace() +# Add '_raw' suffix to all 'smk*' variable names for simpler re-coding & cross-checks +# rename_with( +# ~ stringr::str_c(.x, "_raw"), +# contains("smk") +# ) + +# Note on smoking variables: +## In some sweeps, participants were first asked if they ever smoked [smknw_raw variables]. +## If positive, they were then asked how often they smoke(d). [smk_raw variables] +## This means that if a person did not smoke, they would have frequency as missing ['Not applicable']. 
+ +## Standardise values -------------------------------------------------------------------- + +## The following code will convert missing values and responses to a common coding scheme. + +# Recode if ever smoking for age 14-16 +recode_smk_ever_14_16 <- function(x) { + case_when( + x == 1 ~ 1, # Yes + x == 2 ~ 0, # No + x == -96 ~ -3, + x %in% c(-92, -97) ~ -9, + x == -91 ~ -1, + x == -1 ~ -8, + x == -99 ~ -3, # YP not interviewed + TRUE ~ -3 + ) +} + +# Recode smoking frequency for age 14-16 +recode_smk_freq_14_16 <- function(x) { case_when( - x %in% c(1, 2, -91) ~ 0, # Never + x %in% c(1, 2) ~ 0, # Never x == 3 ~ 1, # used to, don’t at all now x %in% c(4, 5) ~ 2, # smoke cigs occasionally – not every day x == 6 ~ 3, # smoke cigs almost every day - x %in% c(-99, -97, -96) ~ -2, - x == -92 ~ -9, + x == -91 ~ -1, + x == -96 ~ -3, + x %in% c(-92, -97) ~ -9, x == -1 ~ -8, + x == -99 ~ -3, # YP not interviewed TRUE ~ -3 ) } -recode_smk25_32 <- function(x) { +# Derive smoking frequency for age 25 and 32 +recode_smk_25_32_to_freq <- function(x) { case_when( x > 0 ~ x - 1, # Convert 1-4 to 0-3 x == -9 ~ -9, @@ -52,20 +89,8 @@ recode_smk25_32 <- function(x) { ) } -# Recode smoke now -recode_smknw14_16 <- function(x) { - case_when( - x == 1 ~ 1, # Yes - x == 2 ~ 0, # No - x %in% c(-99, -97, -96) ~ -2, - x == -92 ~ -9, - x == -91 ~ -1, - x == -1 ~ -8, - TRUE ~ -3 - ) -} - -recode_smknw25_32 <- function(x) { +# Derive binary smoking status for age 25 and 32 +recode_smk_25_32_to_ever <- function(x) { case_when( x %in% c(0, 1) ~ 0, x %in% c(2, 3) ~ 1, @@ -73,70 +98,151 @@ recode_smknw25_32 <- function(x) { ) } -# Apply recoding -smoking_all <- smoking_all %>% +smoking_std <- smoking_all %>% mutate( - smk14 = recode_smk14_16(smk14), - smk15 = recode_smk14_16(smk15), - smk16 = recode_smk14_16(smk16), - smk25 = recode_smk25_32(smk25), - smk32 = recode_smk25_32(smk32) - ) %>% + # Smoking ever age 14-16 + across( + c(smk14_ever, smk15_ever, smk16_ever), + recode_smk_ever_14_16, + 
.names = "{col}_std" + ), + # Smoking freq age 14-16 + across( + c(smk14_freq, smk15_freq, smk16_freq), + recode_smk_freq_14_16, + .names = "{col}_std" + ), + # Smoking freq age 25-32 (derived from combined ever/freq fields) + smk25_freq_std = recode_smk_25_32_to_freq(smk25_ever_freq), + smk32_freq_std = recode_smk_25_32_to_freq(smk32_ever_freq), + # Smoking ever age 25-32 (binary, derived from recoded freq) + smk25_ever_std = recode_smk_25_32_to_ever(smk25_freq_std), + smk32_ever_std = recode_smk_25_32_to_ever(smk32_freq_std) + ) + +## Derive smoking status variables -------------------------------------------------------------------- + +# Helpers + +# Derive binary current smoking status ages 14-16 ('smknw[age]') +# A person counts as 'not currently smoking' if either of the following conditions are met: +# i) indicated not smoking when asked 'Do you ever smoke cigarettes at all?' [EVER questions] +# ii) indicated they never smoke, they tried smoking only once, or they used to smoke but not anymore [FREQ questions] +derive_smk_now <- function(ever_var, freq_var) { + case_when( + ever_var == 0 ~ 0L, # If EVER: 'Not smoking' -> "No" + freq_var %in% c(0, 1) ~ 0L, # otherwise if FREQ: 'Never smoked'/'Only once' or 'Used to smoke but never now' -> 'No' + ever_var == 1 ~ 1L, # otherwise if EVER: 'Yes' to smoking -> 'Yes' + # Missing values: + freq_var == -9 | ever_var == -9 ~ -9L, # otherwise: if either refused -> 'Refusal' + freq_var == -8 | ever_var == -8 ~ -8L, # otherwise: if either dk/insufficient info -> 'dk/insufficient info' + freq_var == -1 | ever_var == -1 ~ -1L, # otherwise: if either not applicable -> not applicable, + .default = -3L # everything else defaults to 'not interviewed/asked etc.' 
+ ) +} + +derive_smk_detailed <- function(ever_var, freq_var) { + case_when( + freq_var >= 0 ~ as.integer(freq_var), + ever_var == 0 ~ 0, # Those who replied 'not ever smoke' -> 'never' (limitation: they were not asked if they never smoked) + freq_var == -9 | ever_var == -9 ~ -9L, + freq_var == -8 | ever_var == -8 ~ -8L, + freq_var == -1 | ever_var == -1 ~ -1L, + TRUE ~ -3L + ) +} + + +smoking_rec <- smoking_std %>% mutate( - smknw14 = case_when( - smk14 == 0 ~ 0, - smknw14 > 0 ~ recode_smknw14_16(smknw14), - TRUE ~ recode_smknw14_16(smknw14) + # Detailed smoking frequency + smk14 = derive_smk_detailed( + ever_var = smk14_ever_std, + freq_var = smk14_freq_std + ), + smk15 = derive_smk_detailed( + ever_var = smk15_ever_std, + freq_var = smk15_freq_std + ), + smk16 = derive_smk_detailed( + ever_var = smk16_ever_std, + freq_var = smk16_freq_std + ), + smk25 = derive_smk_detailed( + ever_var = smk25_ever_std, + freq_var = smk25_freq_std + ), + smk32 = derive_smk_detailed( + ever_var = smk32_ever_std, + freq_var = smk32_freq_std + ), + + # Binary current smoking status + smknw14 = derive_smk_now( + ever_var = smk14_ever_std, + freq_var = smk14_freq_std + ), + smknw15 = derive_smk_now( + ever_var = smk15_ever_std, + freq_var = smk15_freq_std ), - smknw15 = case_when( - smk15 == 0 ~ 0, - smknw15 > 0 ~ recode_smknw14_16(smknw15), - TRUE ~ recode_smknw14_16(smknw15) + smknw16 = derive_smk_now( + ever_var = smk16_ever_std, + freq_var = smk16_freq_std ), - smknw16 = case_when( - smk16 == 0 ~ 0, - smknw16 > 0 ~ recode_smknw14_16(smknw16), - TRUE ~ recode_smknw14_16(smknw16) + smknw25 = derive_smk_now( + ever_var = smk25_ever_std, + freq_var = smk25_freq_std ), - smknw25 = recode_smknw25_32(smk25), - smknw32 = recode_smknw25_32(smk32) + smknw32 = derive_smk_now( + ever_var = smk32_ever_std, + freq_var = smk32_freq_std + ) ) %>% mutate( across( c(smk14, smk15, smk16, smk25, smk32), - ~ factor( + ~ labelled( .x, - levels = c(0, 1, 2, 3, -1, -2, -3, -8, -9), labels = c( - "Never", 
- "Used to smoke, don’t at all now", - "Smoke occasionally – not every day", - "Smoke almost every day", - "Item not applicable", - "Script error/information lost", - "Not asked at the fieldwork stage/participated/interviewed", - "Don’t know/insufficient information", - "Refusal" + "Never" = 0, + "Used to smoke, don’t at all now" = 1, + "Smoke occasionally – not every day" = 2, + "Smoke almost every day" = 3, + common_missing_labels ) ) ), across( c(smknw14, smknw15, smknw16, smknw25, smknw32), - ~ factor( + ~ labelled( .x, - levels = c(0, 1, -1, -2, -3, -8, -9), labels = c( - "No", - "Yes", - "Item not applicable", - "Script error/information lost", - "Not asked at the fieldwork stage/participated/interviewed", - "Don’t know/insufficient information", - "Refusal" + "No" = 0, + "Yes" = 1, + common_missing_labels ) ) ) - ) %>% + ) + +# Checks +smoking_rec %>% + count(smk14_ever, smk14_freq, smk14, smknw14) + +smoking_rec %>% + count(smk15_ever, smk15_freq, smk15, smknw15) + +smoking_rec %>% + count(smk16_ever, smk16_freq, smk16, smknw16) + +smoking_rec %>% + count(smk25_ever_freq, smk25, smknw25) + +smoking_rec %>% + count(smk32_ever_freq, smk32, smknw32) + +smoking_all <- smoking_rec %>% select( NSID, smknw14, @@ -178,24 +284,47 @@ alc_vars <- list( ) # Merge all alcohol variables by NSID -alc_all <- reduce(alc_vars, full_join, by = "NSID") - -# First Time Had Alcohol -# Code review / MD (2025-12-05): Vectorised rewrite of the original rowwise() code to improve speed. -# This reproduces the original behaviour, including the current treatment of cases -# with all alcohol indicators missing as "never had alcohol" (99). -# Substantive logic (especially the 'never had alcohol' definition) to be revisited -# at a later debugging/clean-up stage. 
-alc_all <- alc_all |> +alc_all <- reduce(alc_vars, full_join, by = "NSID") %>% + # Add '_raw' suffix to all 'audit*' variable names for simpler re-coding & cross-checks + rename_with( + ~ stringr::str_c(.x, "_raw"), + contains("audit") + ) + + +## First time had alcohol -------------------------------------------------------------------- + +# Helpers: Derive 'not drinking' from alcever and audita +# This will be used to derive never drinkers. +never_from_alcever <- function(x) { + dplyr::case_when( + x < 0 ~ NA_real_, # negative codes = missing + x == 2 ~ 1, # "never" + x == 1 ~ 0, # "ever" + .default = NA_real_ + ) +} + +never_from_audita <- function(x) { + dplyr::case_when( + x < 0 ~ NA_real_, # negative codes = missing + x == 1 ~ 1, # "never" + x > 1 ~ 0, # "ever" + .default = NA_real_ + ) +} + +alc_first_age_rec <- alc_all |> mutate( + # Derive age first known drinking ever14 = if_else(alcever_S1 == 1 & alcmon_S1 == 1, 14, NA_real_), ever15 = if_else(alcever_S2 == 1, 15, NA_real_), ever16 = if_else(alcever_S3 == 1, 16, NA_real_), ever17 = if_else(alcever_S4 == 1, 17, NA_real_), ever19 = if_else(alcever_S6 == 1, 19, NA_real_), ever20 = if_else(alcever_S7 == 1, 20, NA_real_), - ever25 = if_else(audita25 > 1, 25, NA_real_), - ever32 = if_else(audita32 > 1, 32, NA_real_), + ever25 = if_else(audita25_raw > 1, 25, NA_real_), + ever32 = if_else(audita32_raw > 1, 32, NA_real_), first_age_raw = pmin( ever14, @@ -208,38 +337,65 @@ alc_all <- alc_all |> ever32, na.rm = TRUE ), - first_age_raw = if_else( - is.infinite(first_age_raw), - NA_real_, - first_age_raw - ), - - # This reproduces `all(..., na.rm = TRUE)` but vectorised - never_alc = rowSums( - cbind( - alcever_S1 == 2, - alcever_S2 == 2, - alcever_S3 == 2, - alcever_S4 == 2, - alcever_S6 == 2, - alcever_S7 == 2, - audita25 == 1, - audita32 == 1 - ) == - FALSE, - na.rm = TRUE - ) == - 0, + # Derive known never drinking + # recode to 1 = "never", 0 = "ever", NA = missing + across( + c(alcever_S1, alcever_S2, 
alcever_S3, alcever_S4, alcever_S6, alcever_S7), + never_from_alcever, + .names = "never_{.col}" + ), + across( + c(audita25_raw, audita32_raw), + never_from_audita, + .names = "never_{.col}" + ), + # Derive never drinkers: + # - 1 = all items observed & all never + # - 0 = at least one drinker + # - NA = no drinkers but some/all missing + never_alc = case_when( + # any 0 -> drinker + if_any(starts_with("never_"), ~ dplyr::coalesce(.x == 0, FALSE)) ~ 0L, + # all observed & 1 + if_all(starts_with("never_"), ~ !is.na(.x) & .x == 1) ~ 1L, + # otherwise NA + .default = NA + ), + # First age -> use the first age when not missing. + # If never drinking, set to 99. + # Anything else is missing. alcfst = case_when( !is.na(first_age_raw) ~ first_age_raw, - never_alc ~ 99, - TRUE ~ -8 + never_alc == 1 ~ 99, + .default = -8 ) ) |> - select(-starts_with("ever"), -first_age_raw, -never_alc) + select(-starts_with(c("ever", "never")), -first_age_raw) + +# Add labels +alc_first_age_rec <- alc_first_age_rec %>% + mutate( + alcfst = labelled( + alcfst, + labels = c( + "Age 14" = 14, + "Age 15" = 15, + "Age 16" = 16, + "Age 17" = 17, + "Age 19" = 19, + "Age 20" = 20, + "Age 25" = 25, + "Age 32" = 32, + "Never had alcohol" = 99, + common_missing_labels + ) + ) + ) -# function - Frequency Recode Across Sweeps +## Alcohol frequency -------------------------------------------------------------------- + +# Helpers recode_freq <- function(x, sweep, ever) { case_when( sweep %in% c("S2", "S3", "S4") ~ case_when( @@ -249,11 +405,11 @@ recode_freq <- function(x, sweep, ever) { x == 5 ~ 1, # once every couple of month x == 6 ~ 0, # less often/not at all ever == 2 ~ 0, # less often/not at all - x %in% c(-99, -97, -96) ~ -2, - x == -92 ~ -9, - x == -1 ~ -1, - x == -91 ~ -1, - TRUE ~ -3 + # Missing values: + x %in% c(-97, -92) | ever %in% c(-97, -92) ~ -9, # refusal if either refused, + x == -1 | ever == -1 ~ -8, # dk/missing info + x == -91 | ever == -91 ~ -1, # not applicable + .default = -3 ), 
sweep %in% c("S6", "S7") ~ case_when( x %in% c(1, 2) ~ 4, @@ -263,17 +419,17 @@ recode_freq <- function(x, sweep, ever) { x %in% c(7, 8) ~ 0, ever == 2 ~ 0, x == -997 ~ -2, - x == -97 ~ -9, - x == -92 ~ -9, - x == -91 ~ -1, - x == -1 ~ -1, - TRUE ~ -3 + # Missing values: + x %in% c(-97, -92) | ever %in% c(-97, -92) ~ -9, # refusal if either refused, + x == -1 | ever == -1 ~ -8, # dk/missing info + x == -91 | ever == -91 ~ -1, # not applicable + .default = -3 ) ) } -# recode frequency Variables -alc_all <- alc_all %>% +# Recode frequency variables +alc_freq_rec <- alc_first_age_rec %>% mutate( alcfreq14 = case_when( alcfreq_S1 == 1 ~ 4, @@ -284,158 +440,147 @@ alc_all <- alc_all %>% alcfreq_S1 == 6 ~ 0, alcever_S1 == 2 ~ 0, alcmon_S1 == 2 ~ 0, - alcfreq_S1 %in% c(-99, -97, -96) ~ -2, - alcfreq_S1 == -92 ~ -9, - alcfreq_S1 == -1 ~ -1, - alcfreq_S1 == -91 ~ -1, - TRUE ~ -3 + # Missing values: + alcfreq_S1 %in% + c(-97, -92) | + alcmon_S1 %in% c(-97, -92) | + alcever_S1 %in% c(-97, -92) ~ -9, # refusal if either refused, + alcfreq_S1 == -1 | alcmon_S1 == -1 | alcever_S1 == -1 ~ -8, # dk/missing info + alcfreq_S1 == -91 | alcmon_S1 == -91 | alcever_S1 == -91 ~ -1, # not applicable + .default = -3 ), alcfreq15 = recode_freq(alcfreq_S2, "S2", alcever_S2), alcfreq16 = recode_freq(alcfreq_S3, "S3", alcever_S3), alcfreq17 = recode_freq(alcfreq_S4, "S4", alcever_S4), alcfreq19 = recode_freq(alcfreq_S6, "S6", alcever_S6), alcfreq20 = recode_freq(alcfreq_S7, "S7", alcever_S7), + ) %>% + # Add labels + mutate( + across( + c(alcfreq14, alcfreq15, alcfreq16, alcfreq17, alcfreq19, alcfreq20), + ~ labelled( + .x, + labels = c( + "Less often/not at all" = 0, + "Once every couple of months" = 1, + "Once to three times a month" = 2, + "Once or twice a week" = 3, + "Most days" = 4, + common_missing_labels + ) + ) + ) ) +# Check +## Cross-tabs for alcfreq +{ + # Build separate cross-tabs for each sweep with alcfreq_S*, alcever_S* first and alcfreq* last + freq_map <- list( + S1 = 
c("alcever_S1", "alcfreq_S1", "alcmon_S1", "alcfreq14"), + S2 = c("alcever_S2", "alcfreq_S2", "alcfreq15"), + S3 = c("alcever_S3", "alcfreq_S3", "alcfreq16"), + S4 = c("alcever_S4", "alcfreq_S4", "alcfreq17"), + S6 = c("alcever_S6", "alcfreq_S6", "alcfreq19"), + S7 = c("alcever_S7", "alcfreq_S7", "alcfreq20") + ) + + alc_freq_crosstabs <- purrr::imap(freq_map, function(cols, sweep) { + alc_freq_rec %>% + dplyr::group_by(dplyr::across(dplyr::all_of(cols))) %>% + dplyr::summarise(n = dplyr::n(), .groups = "drop") + }) +} -alc_all_clean <- alc_all %>% +## AUDIT-C -------------------------------------------------------------------- + +alc_all_clean <- alc_freq_rec %>% mutate( audita25 = case_when( - audita25 > 0 ~ audita25 - 1, - audita25 < 0 ~ audita25, - is.na(audita25) ~ -3, + audita25_raw > 0 ~ audita25_raw - 1, + audita25_raw < 0 ~ audita25_raw, + is.na(audita25_raw) ~ -3, ), audita32 = case_when( - audita32 > 0 ~ audita32 - 1, - audita32 < 0 ~ audita32, - is.na(audita32) ~ -3, + audita32_raw > 0 ~ audita32_raw - 1, + audita32_raw < 0 ~ audita32_raw, + is.na(audita32_raw) ~ -3, ) ) %>% mutate( auditb25 = case_when( audita25 == 0 ~ 0, - audita25 > 0 & auditb25 > 0 ~ auditb25, - auditb25 < 0 ~ auditb25, - is.na(auditb25) ~ -3 + audita25 > 0 & auditb25_raw > 0 ~ auditb25_raw, + auditb25_raw < 0 ~ auditb25_raw, + is.na(auditb25_raw) ~ -3 ), auditb32 = case_when( audita32 == 0 ~ 0, - audita32 > 0 & auditb32 > 0 ~ auditb32, - auditb32 < 0 ~ auditb32, - is.na(auditb32) ~ -3 + audita32 > 0 & auditb32_raw > 0 ~ auditb32_raw, + auditb32_raw < 0 ~ auditb32_raw, + is.na(auditb32_raw) ~ -3 ), auditc25 = case_when( audita25 == 0 ~ 0, - is.na(auditc25) ~ -3, - auditc25 < 0 ~ auditc25, - TRUE ~ auditc25 - 1 + is.na(auditc25_raw) ~ -3, + auditc25_raw < 0 ~ auditc25_raw, + TRUE ~ auditc25_raw - 1 ), auditc32 = case_when( audita32 == 0 ~ 0, - is.na(auditc32) ~ -3, - auditc32 < 0 ~ auditc32, - TRUE ~ auditc32 - 1 + is.na(auditc32_raw) ~ -3, + auditc32_raw < 0 ~ auditc32_raw, + TRUE ~ 
auditc32_raw - 1 ) ) %>% mutate( - alcfst = factor( - alcfst, - levels = c(14, 15, 16, 17, 19, 20, 25, 32, 99, -1, -2, -3, -8, -9), - labels = c( - "Age 14", - "Age 15", - "Age 16", - "Age 17", - "Age 19", - "Age 20", - "Age 25", - "Age 32", - "Never had alcohol", - "Item not applicable", - "Script error/information lost", - "Not asked at the fieldwork stage/participated/interviewed", - "Don’t know/insufficient information", - "Refusal" - ) - ), - across( - c(alcfreq14, alcfreq15, alcfreq16, alcfreq17, alcfreq19, alcfreq20), - ~ factor( - .x, - levels = c(0, 1, 2, 3, 4, -1, -2, -3, -8, -9), - labels = c( - "Less often/not at all", - "Once every couple of months", - "Once to three times a month", - "Once or twice a week", - "Most days", - "Item not applicable", - "Script error/information lost", - "Not asked at the fieldwork stage/participated/interviewed", - "Don’t know/insufficient information", - "Refusal" - ) - ) - ), across( c(audita25, audita32), - ~ factor( + ~ labelled( .x, - levels = c(0, 1, 2, 3, 4, -1, -2, -3, -8, -9), labels = c( - "Never", - "Monthly or less", - "2–4 times a month", - "2–3 times a week", - "4 or more times a week", - "Item not applicable", - "Script error/information lost", - "Not asked at the fieldwork stage/participated/interviewed", - "Don’t know/insufficient information", - "Refusal" + "Never" = 0, + "Monthly or less" = 1, + "2–4 times a month" = 2, + "2–3 times a week" = 3, + "4 or more times a week" = 4, + common_missing_labels ) ) ), across( c(auditb25, auditb32), - ~ factor( + ~ labelled( .x, - levels = c(0, 1, 2, 3, 4, 5, -1, -2, -3, -8, -9), labels = c( - "0", - "1–2 drinks", - "3–4 drinks", - "5–6 drinks", - "7–9 drinks", - "10+", - "Item not applicable", - "Script error/information lost", - "Not asked at the fieldwork stage/participated/interviewed", - "Don’t know/insufficient information", - "Refusal" + "0" = 0, + "1–2 drinks" = 1, + "3–4 drinks" = 2, + "5–6 drinks" = 3, + "7–9 drinks" = 4, + "10+" = 5, + 
common_missing_labels ) ) ), across( c(auditc25, auditc32), - ~ factor( + ~ labelled( .x, - levels = c(0, 1, 2, 3, 4, -1, -2, -3, -8, -9), labels = c( - "Never", - "Less than monthly", - "Monthly", - "Weekly", - "Daily or almost daily", - "Item not applicable", - "Script error/information lost", - "Not asked at the fieldwork stage/participated/interviewed", - "Don’t know/insufficient information", - "Refusal" + "Never" = 0, + "Less than monthly" = 1, + "Monthly" = 2, + "Weekly" = 3, + "Daily or almost daily" = 4, + common_missing_labels ) ) ) - ) %>% + ) + +alc_all_clean <- alc_all_clean %>% select( NSID, alcfst, @@ -454,7 +599,15 @@ alc_all_clean <- alc_all %>% ) # Drug Use -------------------------------------------------------------------- + # Load drug use data from relevant sweeps +## Naming convention: +## canevr - ever used cannabis +## now_cann - currently using cannabis +## yr_cann - used cannabis in past 12 months +## othevr - ever used other drugs +## now_oth - currently using other drugs +## yr_oth - used other drugs in past 12 months drug_vars <- list( S1 = ns_data[["S1youngperson"]] %>% select(NSID, canevr14 = W1canntryYP), @@ -478,32 +631,39 @@ drug_vars <- list( canevr20 = W7DrugYP1YP0a, othevr20 = W7DrugYP1YP0b, now_cann20 = W7DrugOftenYP0a, - now_oth20 = starts_with("W7DrugOftenYP0") + now_oth20 = starts_with("W7DrugOftenYP0") & !any_of("W7DrugOftenYP0a") ), S8 = ns_data[["S8selfcompletion"]] %>% select( NSID, canevr25 = W8DRUGYP10A, - othevr25 = starts_with("W8DRUGYP10"), + othevr25 = starts_with("W8DRUGYP10") & !any_of("W8DRUGYP10A"), yr_cann25 = W8DRUGYP20A, - yr_oth25 = starts_with("W8DRUGYP20"), + yr_oth25 = starts_with("W8DRUGYP20") & !any_of("W8DRUGYP20A"), now_cann25 = W8DRUGOFTEN0A, - now_oth25 = starts_with("W8DRUGOFTEN0") + now_oth25 = starts_with("W8DRUGOFTEN0") & !any_of("W8DRUGOFTEN0A") ), S9 = ns_data[["S9maininterview"]] %>% select( NSID, canevr32 = W9DRUGYP10A, - othevr32 = starts_with("W9DRUGYP1"), + othevr32 = 
starts_with("W9DRUGYP1") & !any_of("W9DRUGYP10A"), yr_cann32 = W9DRUGYP20A, - yr_oth32 = starts_with("W9DRUGYP2"), + yr_oth32 = starts_with("W9DRUGYP2") & !any_of("W9DRUGYP20A"), now_cann32 = W9DRUGOFTEN0A, - now_oth32 = starts_with("W9DRUGOFTEN0") + now_oth32 = starts_with("W9DRUGOFTEN0") & !any_of("W9DRUGOFTEN0A") ) ) # Merge all datasets -drug_all <- reduce(drug_vars, full_join, by = "NSID") +drug_all <- reduce(drug_vars, full_join, by = "NSID") %>% + # Add '_raw' suffix to all variable names for simpler re-coding & cross-checks + rename_with( + .fn = ~ stringr::str_c(.x, "_raw"), + .cols = !contains("NSID") + ) + +## Recode original response options -------------------------------------------------------------------- # functions: Recode function preserving missing values # recode for whether using drug ever for each sweep @@ -544,32 +704,39 @@ recode_drugbin89 <- function(x) { } # Recode cannabis & other drug variables -drug_all <- drug_all %>% +drug_rec <- drug_all %>% mutate( across( c( - canevr14, - canevr15, - canevr16, - canevr17, - canevr19, - canevr20, - now_cann19, - othevr19, - othevr20, - now_oth19 + canevr14_raw, + canevr15_raw, + canevr16_raw, + canevr17_raw, + canevr19_raw, + canevr20_raw, + now_cann19_raw, + othevr19_raw, + othevr20_raw, + now_oth19_raw ), - recode_drugbin1_7 + recode_drugbin1_7, + # Remove '_raw' suffix from new variable names + .names = "{stringr::str_remove(.col, '_raw$')}" + ), + across( + c(now_cann20_raw, starts_with("now_oth20")), + recode_drugoft7, + # Remove '_raw' suffix from new variable names + .names = "{stringr::str_remove(.col, '_raw$')}" ), - across(c(now_cann20, starts_with("now_oth20")), recode_drugoft7), across( c( - canevr25, - canevr32, - yr_cann25, - yr_cann32, - now_cann25, - now_cann32, + canevr25_raw, + canevr32_raw, + yr_cann25_raw, + yr_cann32_raw, + now_cann25_raw, + now_cann32_raw, starts_with("yr_oth25"), starts_with("yr_oth32"), starts_with("othevr25"), @@ -577,65 +744,130 @@ drug_all <- drug_all %>% 
starts_with("now_oth25"), starts_with("now_oth32") ), - recode_drugbin89 + recode_drugbin89, + # Remove '_raw' suffix from new variable names + .names = "{stringr::str_remove(.col, '_raw$')}" ) ) -# Derive oth7–9 and now_oth7–9 from multiple variables -# Code review / MD (2025-12-05): Vectorised rewrite of the original rowwise() code to improve speed. -# This reproduces the original behaviour. -# Substantive logic to be revisited at a later debugging/clean-up stage. -row_max_df <- function(df) { - do.call(pmax, c(df, list(na.rm = TRUE))) +# Check: cross-tabs +drug_pairs <- tibble::tibble( + raw_var = names(drug_rec) |> + stringr::str_subset("_raw$") +) |> + dplyr::mutate( + rec_var = stringr::str_remove(raw_var, "_raw$"), + has_rec = rec_var %in% names(drug_rec) + ) |> + dplyr::filter(has_rec) |> + dplyr::select(raw_var, rec_var) + +drug_crosstabs <- drug_pairs |> + purrr::pmap(function(raw_var, rec_var) { + drug_rec |> + dplyr::count( + raw = .data[[raw_var]], + rec = .data[[rec_var]], + name = "n" + ) |> + dplyr::mutate( + raw_var = raw_var, + rec_var = rec_var, + .before = 1 + ) + }) + +## Derive 'Other' -------------------------------------------------------------------- + +# Helper function: Derive 'Other' drug use within a sweep. +## Used derive a single variable indicating if a person used drugs other than cannabis within a sweep. +## It uses the indicator for whether ANY drug was used (based on separate yes/no indicators for each drug). +## Coded as 1 = 'yes' if any drug was used. +## Else, coded as 0 = 'no' if all drugs were reported as not used (conservative). +## Else, missing values follow a hierarchy. +derive_drug_other_within <- function(cols) { + dplyr::case_when( + ## If any variable reported as yes -> 'yes' (1). + dplyr::if_any({{ cols }}, \(x) dplyr::coalesce(x == 1, FALSE)) ~ 1L, + ## ELSE: If all drugs reported as NOT having used -> 'no' (0). + dplyr::if_all({{ cols }}, \(x) !is.na(x) & x == 0) ~ 0L, + ## ELSE: If any -9 (refusal) -> -9. 
+ dplyr::if_any({{ cols }}, \(x) dplyr::coalesce(x == -9, FALSE)) ~ -9L, + ## ELSE: If any -8 (dk/insufficient info) -> -8. + dplyr::if_any({{ cols }}, \(x) dplyr::coalesce(x == -8, FALSE)) ~ -8L, + ## ELSE: If any -1 (not applicable) -> -1. + dplyr::if_any({{ cols }}, \(x) dplyr::coalesce(x == -1, FALSE)) ~ -1L, + ## ELSE: -3 (not interviewed/asked etc.) + TRUE ~ -3L + ) } -drug_all <- drug_all |> - mutate( - # 25-sweep: use columns 2:9 (8 "other" vars) - othevr25 = row_max_df( - pick(starts_with("othevr25"))[2:9] +drug_rec <- drug_rec |> + dplyr::mutate( + othevr25 = derive_drug_other_within( + starts_with("othevr25") & !ends_with("raw") ), - now_oth20 = row_max_df( - pick(starts_with("now_oth20"))[2:9] + othevr32 = derive_drug_other_within( + starts_with("othevr32") & !ends_with("raw") ), - now_oth25 = row_max_df( - pick(starts_with("now_oth25"))[2:9] + yr_oth25 = derive_drug_other_within( + starts_with("yr_oth25") & !ends_with("raw") ), - yr_oth25 = row_max_df( - pick(starts_with("yr_oth25"))[2:9] + yr_oth32 = derive_drug_other_within( + starts_with("yr_oth32") & !ends_with("raw") ), - - # 32-sweep: use columns 2:10 (9 "other" vars) - othevr32 = row_max_df( - pick(starts_with("othevr32"))[2:10] + now_oth20 = derive_drug_other_within( + starts_with("now_oth20") & !ends_with("raw") ), - now_oth32 = row_max_df( - pick(starts_with("now_oth32"))[2:10] + now_oth25 = derive_drug_other_within( + starts_with("now_oth25") & !ends_with("raw") ), - yr_oth32 = row_max_df( - pick(starts_with("yr_oth32"))[2:10] + now_oth32 = derive_drug_other_within( + starts_with("now_oth32") & !ends_with("raw") ) ) +## Derive EVER used -------------------------------------------------------------------- -# Derive: Ever used -drug_all <- drug_all %>% +# Derive 'ever used' for cannabis and other drugs across sweeps. +# The function takes indicators from each sweep (1 = reported EVER using drug, 0 = reported not EVER using drug) +# and derives a single EVER indicator. 
+# Coded as 1 = 'yes' if drug was EVER used. +# Else, coded as 0 = 'no' if any sweep reported not EVER using drug (liberal). +# Otherwise, missing values follow a hierarchy. +derive_drug_ever_across <- function(cols) { + dplyr::case_when( + ## If any sweep reported as ever used -> 'yes' (1). + dplyr::if_any({{ cols }}, \(x) dplyr::coalesce(x == 1, FALSE)) ~ 1L, + ## ELSE: If any sweep reported as NOT having ever used -> 'no' (0). + dplyr::if_any({{ cols }}, \(x) x == 0) ~ 0L, + ## ELSE: If any -9 (refusal) -> -9. + dplyr::if_any({{ cols }}, \(x) dplyr::coalesce(x == -9, FALSE)) ~ -9L, + ## ELSE: If any -8 (dk/insufficient info) -> -8. + dplyr::if_any({{ cols }}, \(x) dplyr::coalesce(x == -8, FALSE)) ~ -8L, + ## ELSE: If any -1 (not applicable) -> -1. + dplyr::if_any({{ cols }}, \(x) dplyr::coalesce(x == -1, FALSE)) ~ -1L, + ## ELSE: -3 (not interviewed/asked etc.) + TRUE ~ -3L + ) +} + +drug_rec_ever <- drug_rec %>% mutate( - drgcnbevr = pmax( - canevr14, - canevr15, - canevr16, - canevr17, - canevr19, - canevr20, - canevr25, - canevr32, - na.rm = FALSE + drgcnbevr = derive_drug_ever_across( + starts_with("canevr") & !ends_with("raw") ), - drgothevr = pmax(othevr19, othevr20, othevr25, othevr32, na.rm = FALSE) + drgothevr = derive_drug_ever_across(c( + othevr19, + othevr20, + othevr25, + othevr32 + )) ) -# Derive: First time use +## Derive first time use -------------------------------------------------------------------- + +# This variable will record the first known age of using cannabis/other drugs. 
first_wave_age <- c(14, 15, 16, 17, 19, 20, 25, 32) cann_vars <- c( "canevr14", @@ -649,40 +881,38 @@ cann_vars <- c( ) oth_vars <- c("othevr19", "othevr20", "othevr25", "othevr32") -drug_all <- drug_all %>% +drug_rec_first <- drug_rec_ever |> mutate( - # first time reported using cannabis (14-32) drgcnbfst = case_when( - canevr14 == 1 ~ 14, - canevr15 == 1 ~ 15, - canevr16 == 1 ~ 16, - canevr17 == 1 ~ 17, - canevr19 == 1 ~ 19, - canevr20 == 1 ~ 20, - canevr25 == 1 ~ 25, - canevr32 == 1 ~ 32, - all(is.na(canevr14:canevr32)) ~ -3, - rowSums(select(., canevr14:canevr32), na.rm = TRUE) == 0 ~ 99, - TRUE ~ -2 + canevr14 == 1 ~ 14L, + canevr15 == 1 ~ 15L, + canevr16 == 1 ~ 16L, + canevr17 == 1 ~ 17L, + canevr19 == 1 ~ 19L, + canevr20 == 1 ~ 20L, + canevr25 == 1 ~ 25L, + canevr32 == 1 ~ 32L, + + # conservative "never": all included sweeps are exactly 0 + if_all(all_of(cann_vars), ~ .x == 0) ~ 99L, + + .default = -3L ), - # first time reported using other drugs (19-32) drgothfst = case_when( - othevr19 == 1 ~ 19, - othevr20 == 1 ~ 20, - othevr25 == 1 ~ 25, - othevr32 == 1 ~ 32, - all(is.na(c_across(c(othevr19, othevr20, othevr25, othevr32)))) ~ -3, - rowSums( - select(., othevr19, othevr20, othevr25, othevr32), - na.rm = TRUE - ) == - 0 ~ 99, - TRUE ~ -2 + othevr19 == 1 ~ 19L, + othevr20 == 1 ~ 20L, + othevr25 == 1 ~ 25L, + othevr32 == 1 ~ 32L, + + if_all(all_of(oth_vars), ~ .x == 0) ~ 99L, + + .default = -3L ) ) -# Derive: Current use -drug_all <- drug_all %>% +## Derive current use -------------------------------------------------------------------- + +drug_rec_current <- drug_rec_first %>% mutate( drgcnbnw19 = case_when( canevr19 == 0 ~ 0, @@ -723,45 +953,36 @@ drug_all <- drug_all %>% ) ) -# Final selection -drug_final <- drug_all %>% +# Add labels and select variables +drug_all_clean <- drug_rec_current %>% + select(-ends_with("raw")) %>% mutate( across( c(drgcnbevr, drgothevr, starts_with("drgcnbnw"), starts_with("drgothnw")), - ~ factor( + ~ labelled( .x, - levels 
= c(0, 1, -1, -2, -3, -8, -9), labels = c( - "No", - "Yes", - "Item not applicable", - "Script error/information lost", - "Not asked at the fieldwork stage/participated/interviewed", - "Don’t know/insufficient information", - "Refusal" + "No" = 0, + "Yes" = 1, + common_missing_labels ) ) ), across( c(drgcnbfst, drgothfst), - ~ factor( + ~ labelled( .x, - levels = c(14, 15, 16, 17, 19, 20, 25, 32, 99, -1, -2, -3, -8, -9), labels = c( - "Age 14", - "Age 15", - "Age 16", - "Age 17", - "Age 19", - "Age 20", - "Age 25", - "Age 32", - "Never used", - "Item not applicable", - "Script error/information lost", - "Not asked at the fieldwork stage/participated/interviewed", - "Don’t know/insufficient information", - "Refusal" + "Age 14" = 14, + "Age 15" = 15, + "Age 16" = 16, + "Age 17" = 17, + "Age 19" = 19, + "Age 20" = 20, + "Age 25" = 25, + "Age 32" = 32, + "Never used" = 99, + common_missing_labels ) ) ) @@ -796,7 +1017,12 @@ exercise_vars <- list( ) # Merge all datasets -spt_all <- reduce(exercise_vars, full_join, by = "NSID") +spt_all <- reduce(exercise_vars, full_join, by = "NSID") %>% + # Add '_raw' suffix to all variable names for simpler re-coding & cross-checks + rename_with( + .fn = ~ stringr::str_c(.x, "_raw"), + .cols = !contains("NSID") + ) # Recode function recode_exercise <- function(x) { @@ -812,57 +1038,86 @@ recode_exercise <- function(x) { ) } -# Apply recoding -spt_all <- spt_all %>% +# Re-coding +## At sweeps 8-9, the question wording changed, +## asking about the number of days per week doing exercise for 30 mins or more. 
+## For these sweeps, re-coding was done as follows: +## - 5-7 days = "most days" (0) +## - 2-4 days = "more than once a week" (1) +## - 1 day = "once a week" (2) +## - 0 days = "less than once a week/hardly ever/never" (3) +spt_rec <- spt_all %>% mutate( - spt14 = recode_exercise(spt14), - spt15 = recode_exercise(spt15), - spt17 = recode_exercise(spt17), - spt19 = recode_exercise(spt19), - spt20 = recode_exercise(spt20), + spt14 = recode_exercise(spt14_raw), + spt15 = recode_exercise(spt15_raw), + spt17 = recode_exercise(spt17_raw), + spt19 = recode_exercise(spt19_raw), + spt20 = recode_exercise(spt20_raw), spt25 = case_when( # values from 0–7 days - spt25 %in% c(5, 6, 7) ~ 0, - spt25 %in% c(2, 3, 4) ~ 1, - spt25 == 1 ~ 2, - spt25 == 0 ~ 3, - spt25 == -9 ~ -9, - spt25 == -8 ~ -8, - spt25 == -1 ~ -1, - is.na(spt25) ~ -3 + spt25_raw %in% c(5, 6, 7) ~ 0, + spt25_raw %in% c(2, 3, 4) ~ 1, + spt25_raw == 1 ~ 2, + spt25_raw == 0 ~ 3, + spt25_raw == -9 ~ -9, + spt25_raw == -8 ~ -8, + spt25_raw == -1 ~ -1, + is.na(spt25_raw) ~ -3 ), spt32 = case_when( - spt32 %in% c(5, 6, 7) ~ 0, - spt32 %in% c(2, 3, 4) ~ 1, - spt32 == 1 ~ 2, - spt32 == 0 ~ 3, - spt32 == -9 ~ -9, - spt32 == -8 ~ -8, - spt32 == -1 ~ -1, - is.na(spt32) | spt32 == -3 ~ -3 + spt32_raw %in% c(5, 6, 7) ~ 0, + spt32_raw %in% c(2, 3, 4) ~ 1, + spt32_raw == 1 ~ 2, + spt32_raw == 0 ~ 3, + spt32_raw == -9 ~ -9, + spt32_raw == -8 ~ -8, + spt32_raw == -1 ~ -1, + is.na(spt32_raw) | spt32_raw == -3 ~ -3 ) ) %>% - mutate(across( - c(starts_with("spt")), - ~ factor( - .x, - levels = c(0, 1, 2, 3, -1, -2, -3, -8, -9), - labels = c( - "Most days", - "More than once a week", - "Once a week", - "Less than once a week/hardly ever/never", - "Item not applicable", - "Script error/information lost", - "Not asked at the fieldwork stage/participated/interviewed", - "Don’t know/insufficient information", - "Refusal" + mutate( + across( + c(starts_with("spt") & !ends_with("raw")), + ~ labelled( + .x, + labels = c( + "Most days" = 0, + 
"More than once a week" = 1, + "Once a week" = 2, + "Less than once a week/hardly ever/never" = 3, + common_missing_labels + ) ) ) - )) %>% + ) + +# Cross-tabs +spt_rec %>% + count(spt14_raw, spt14) + +spt_rec %>% + count(spt15_raw, spt15) + +spt_rec %>% + count(spt17_raw, spt17) + +spt_rec %>% + count(spt19_raw, spt19) + +spt_rec %>% + count(spt20_raw, spt20) + +spt_rec %>% + count(spt25_raw, spt25) + +spt_rec %>% + count(spt32_raw, spt32) + +spt_all <- spt_rec %>% select(NSID, spt14, spt15, spt17, spt19, spt20, spt25, spt32) -# Absence -------------------------------------------------------------------- +# School absence -------------------------------------------------------------------- + # Load relevant sweep files and select variables absence_vars <- list( S1 = ns_data[["S1youngperson"]] %>% @@ -876,7 +1131,12 @@ absence_vars <- list( ) # Merge the datasets by NSID -absence_all <- reduce(absence_vars, full_join, by = "NSID") +absence_all <- reduce(absence_vars, full_join, by = "NSID") %>% + # Add '_raw' suffix to all variable names for simpler re-coding & cross-checks + rename_with( + .fn = ~ stringr::str_c(.x, "_raw"), + .cols = !contains("NSID") + ) # Recode function for harmonised values recode_absence <- function(x) { @@ -885,39 +1145,45 @@ recode_absence <- function(x) { x == 2 ~ 0, # no x %in% c(-97, -92) ~ -9, x %in% c(-91) ~ -1, - x %in% c(-96, -1) ~ -8, - x %in% c(-998, -997, -995, -99) ~ -2, - is.na(x) ~ -3, - TRUE ~ -3 + x %in% c(-1) ~ -8, + x %in% c(-998, -997, -995) ~ -2, + .default = -3 ) } # Apply recode to each sweep -absence_all <- absence_all %>% +absence_rec <- absence_all %>% mutate( - abs1m14 = recode_absence(abs1m14), - abs1m15 = recode_absence(abs1m15), - abs1m16 = recode_absence(abs1m16) + abs1m14 = recode_absence(abs1m14_raw), + abs1m15 = recode_absence(abs1m15_raw), + abs1m16 = recode_absence(abs1m16_raw) ) %>% mutate(across( - starts_with("abs1m"), - ~ factor( + starts_with("abs1m") & !ends_with("raw"), + ~ labelled( .x, - levels = 
c(0, 1, -1, -2, -3, -8, -9), labels = c( - "No", - "Yes", - "Item not applicable", - "Script error/information lost", - "Not asked at the fieldwork stage/participated/interviewed", - "Don’t know/insufficient information", - "Refusal" + "No" = 0, + "Yes" = 1, + common_missing_labels ) ) - )) %>% - select(NSID, abs1m14, abs1m15, abs1m16) + )) + +absence_rec %>% + count(abs1m14_raw, abs1m14) + +absence_rec %>% + count(abs1m15_raw, abs1m15) + +absence_rec %>% + count(abs1m16_raw, abs1m16) + +absence_all <- absence_rec %>% + select(NSID, starts_with("abs1m") & !ends_with("raw")) # Suspended/Expelled -------------------------------------------------------------------- + # Load suspension and expulsion variables from each sweep suspend_expel_vars <- list( S1 = ns_data[["S1youngperson"]] %>% @@ -931,7 +1197,12 @@ suspend_expel_vars <- list( ) # Merge all datasets by NSID -suspend_expel_all <- reduce(suspend_expel_vars, full_join, by = "NSID") +suspend_expel_all <- reduce(suspend_expel_vars, full_join, by = "NSID") %>% + # Add '_raw' suffix to all variable names for simpler re-coding & cross-checks + rename_with( + .fn = ~ stringr::str_c(.x, "_raw"), + .cols = !contains("NSID") + ) # Recode function recode_school_discipline <- function(x) { @@ -940,7 +1211,7 @@ recode_school_discipline <- function(x) { x == 2 ~ 0, # no x %in% c(-97, -92) ~ -9, x %in% c(-91) ~ -1, - x %in% c(-96, -1) ~ -8, + x %in% c(-1) ~ -8, x %in% c(-99) ~ -3, is.na(x) ~ -3, TRUE ~ -3 @@ -948,33 +1219,51 @@ recode_school_discipline <- function(x) { } # Apply recoding -suspend_expel_all <- suspend_expel_all %>% +suspend_expel_rec <- suspend_expel_all %>% mutate( - susp14 = recode_school_discipline(susp14), - susp15 = recode_school_discipline(susp15), - susp16 = recode_school_discipline(susp16), - susp17 = recode_school_discipline(susp17), - expl14 = recode_school_discipline(expl14), - expl15 = recode_school_discipline(expl15), - expl16 = recode_school_discipline(expl16), - expl17 = 
recode_school_discipline(expl17) + susp14 = recode_school_discipline(susp14_raw), + susp15 = recode_school_discipline(susp15_raw), + susp16 = recode_school_discipline(susp16_raw), + susp17 = recode_school_discipline(susp17_raw), + expl14 = recode_school_discipline(expl14_raw), + expl15 = recode_school_discipline(expl15_raw), + expl16 = recode_school_discipline(expl16_raw), + expl17 = recode_school_discipline(expl17_raw) ) %>% mutate(across( - c(starts_with("abs1m"), starts_with("expl")), - ~ factor( + c( + starts_with("susp") & !ends_with("raw"), + starts_with("expl") & !ends_with("raw") + ), + ~ labelled( .x, - levels = c(0, 1, -1, -2, -3, -8, -9), labels = c( - "No", - "Yes", - "Item not applicable", - "Script error/information lost", - "Not asked at the fieldwork stage/participated/interviewed", - "Don’t know/insufficient information", - "Refusal" + "No" = 0, + "Yes" = 1, + common_missing_labels ) ) - )) %>% + )) + +# Cross-tabs +suspend_expel_rec %>% + count(susp14_raw, susp14) +suspend_expel_rec %>% + count(susp15_raw, susp15) +suspend_expel_rec %>% + count(susp16_raw, susp16) +suspend_expel_rec %>% + count(susp17_raw, susp17) +suspend_expel_rec %>% + count(expl14_raw, expl14) +suspend_expel_rec %>% + count(expl15_raw, expl15) +suspend_expel_rec %>% + count(expl16_raw, expl16) +suspend_expel_rec %>% + count(expl17_raw, expl17) + +suspend_expel_all <- suspend_expel_rec %>% select(NSID, starts_with("susp"), starts_with("expl")) # Truancy -------------------------------------------------------------------- @@ -1001,13 +1290,13 @@ recode_truancy_early <- function(ever, type) { type == 2 ~ 2, # several days at a time type == 3 ~ 3, # particular days or lessons type == 4 ~ 4, # odd day or lesson - type %in% c(-96, -1) ~ -8, - type %in% c(-97, -92) ~ -9, - type %in% c(-99) ~ -3, - type %in% c(-91) ~ -1, - is.na(type) & ever == 1 ~ -2, - is.na(type) & is.na(ever) ~ -3, - TRUE ~ -3 + # If missing, use hierarchy derived from both ever and type: + # Else if either is 
refusal -> refusal (-9) + ever %in% c(-92, -97) | type %in% c(-92, -97) ~ -9, + # Else if either is don't know / insufficient info -> -8 + ever == -1 | type == -1 ~ -8, + # Else is not interviewed/asked etc. -> -3 + .default = -3 ) } @@ -1019,7 +1308,7 @@ recode_truancy_s4 <- function(x) { x == 2 ~ 2, x == 3 ~ 3, x == 4 ~ 4, - x %in% c(-96, -1) ~ -8, + x %in% c(-1) ~ -8, x %in% c(-97, -92) ~ -9, x == -99 ~ -3, x == -91 ~ -1, @@ -1029,7 +1318,7 @@ recode_truancy_s4 <- function(x) { } # Apply recoding -truancy_all <- truancy_all %>% +truancy_rec <- truancy_all %>% mutate( trua14 = recode_truancy_early(trua14_ever, trua14_type), trua15 = recode_truancy_early(trua15_ever, trua15_type), @@ -1037,27 +1326,38 @@ truancy_all <- truancy_all %>% trua17 = recode_truancy_s4(trua17_raw) ) %>% mutate(across( - starts_with("trua"), - ~ factor( + c(trua14, trua15, trua16, trua17), + ~ labelled( .x, - levels = c(0, 1, 2, 3, 4, -1, -2, -3, -8, -9), labels = c( - "Never played truant", - "For weeks at a time", - "Several days at a time", - "Particular days or lessons", - "Odd day or lesson", - "Item not applicable", - "Script error/information lost", - "Not asked at the fieldwork stage/participated/interviewed", - "Don’t know/insufficient information", - "Refusal" + "Never played truant" = 0, + "For weeks at a time" = 1, + "Several days at a time" = 2, + "Particular days or lessons" = 3, + "Odd day or lesson" = 4, + common_missing_labels ) ) - )) %>% + )) + +# Cross-tabs +truancy_rec %>% + count(trua14_ever, trua14_type, trua14) + +truancy_rec %>% + count(trua15_ever, trua15_type, trua15) + +truancy_rec %>% + count(trua16_ever, trua16_type, trua16) + +truancy_rec %>% + count(trua17, trua17_raw) + +truancy_all <- truancy_rec %>% select(NSID, trua14, trua15, trua16, trua17) # Police Contact -------------------------------------------------------------------- + # Load data for police contact police_vars <- list( S1 = ns_data[["S1youngperson"]] %>% @@ -1089,7 +1389,12 @@ police_vars <- 
list( ) # Merge datasets -police_all <- reduce(police_vars, full_join, by = "NSID") +police_all <- reduce(police_vars, full_join, by = "NSID") %>% + # Add '_raw' suffix to all variable names for simpler re-coding & cross-checks + rename_with( + .fn = ~ stringr::str_c(.x, "_raw"), + .cols = !contains("NSID") + ) # Recode function for binary variables (pol15,16) recode_pol <- function(x) { @@ -1098,10 +1403,10 @@ recode_pol <- function(x) { x == 2 ~ 0, x %in% c(-97, -92) ~ -9, x == -91 ~ -1, - x %in% c(-96, -1) ~ -8, + x %in% c(-1) ~ -8, x %in% c(-998, -997, -995) ~ -2, x %in% c(-99) ~ -3, - is.na(x) ~ -3 + .default = -3 ) } @@ -1110,22 +1415,101 @@ recode_cnt <- function(x, ever) { case_when( ever %in% c(2, 3) ~ 0, x >= 0 ~ x, - x %in% c(-97, -92) ~ -9, - x == -91 ~ -1, - x %in% c(-96, -1) ~ -8, + # Use hierarchy for missing values: + ever %in% c(-92, -97) | x %in% c(-92, -97) ~ -9, + ever == -1 | x == -1 ~ -8, + # Script errors/information lost: x %in% c(-998, -997, -995) ~ -2, - x %in% c(-99, -996) ~ -3, - is.na(x) ~ -3, + # Else is not interviewed/asked etc. 
-> -3 + .default = -3 ) } -# Apply recoding -police_all <- police_all %>% +## Police contact -------------------------------------------------------------------- +police_rec_contact <- police_all %>% + mutate( + # Police contact - binary: + pol14 = case_when( + pol14_raw %in% c(1, 3) ~ 1, + pol14_raw == 2 ~ 0, + pol14_raw %in% c(-97, -92) ~ -9, + pol14_raw == -91 ~ -1, + pol14_raw %in% c(-1) ~ -8, + pol14_raw %in% c(-99) ~ -3, + .default = -3 + ), + pol15 = recode_pol(pol15_raw), + pol16 = recode_pol(pol16_raw), + pol17 = case_when( + pol17_raw %in% c(1, 3) ~ 1, + pol17_raw == 2 ~ 0, + pol17_raw %in% c(-97, -92) ~ -9, + pol17_raw == -91 ~ -1, + pol17_raw %in% c(-1) ~ -8, + pol17_raw %in% c(-99) ~ -3, + .default = -3 + ), + # Police contact - count: + polcnt14 = recode_cnt(polcnt14_raw, pol14_raw), + polcnt15 = recode_cnt(polcnt15_raw, pol15_raw), + polcnt16 = recode_cnt(polcnt16_raw, pol16_raw), + polcnt17 = recode_cnt(polcnt17_raw, pol17_raw), + # Add labels + across( + c(pol14, pol15, pol16, pol17), + ~ labelled( + .x, + labels = c( + "No" = 0, + "Yes/not in last 3 years" = 1, + common_missing_labels + ) + ) + ), + across( + c(starts_with("polcnt") & !ends_with("raw")), + ~ labelled( + .x, + labels = c( + "Item not applicable" = -1, + "Script error/information lost" = -2, + "Not asked at the fieldwork stage/participated/interviewed" = -3, + "Don’t know/insufficient information" = -8, + "Refusal" = -9 + ) + ) + ) + ) + + +# Cross-tabs +police_rec_contact %>% + count(pol14_raw, pol14) +police_rec_contact %>% + count(pol15_raw, pol15) +police_rec_contact %>% + count(pol16_raw, pol16) +police_rec_contact %>% + count(pol17_raw, pol17) + +police_rec_contact %>% + count(pol14_raw, polcnt14_raw, polcnt14) %>% + print(n = Inf) +police_rec_contact %>% + count(pol15_raw, polcnt15_raw, polcnt15) %>% + print(n = Inf) +# For polcnt16_raw, -1 is dk/insufficient info (checked the data dictionary, label only missing in the Stata file). 
+police_rec_contact %>% + count(pol16_raw, polcnt16_raw, polcnt16) %>% + print(n = Inf) +police_rec_contact %>% + count(pol17_raw, polcnt17_raw, polcnt17) %>% + print(n = Inf) + +## Police warning, arrest, caution -------------------------------------------------------------------- + +police_rec_warning <- police_rec_contact %>% mutate( - polcnt14 = recode_cnt(polcnt14, pol14), - polcnt15 = recode_cnt(polcnt15, pol15), - polcnt16 = recode_cnt(polcnt16, pol16), - polcnt17 = recode_cnt(polcnt17, pol17), across( starts_with("polwrn"), ~ case_when( @@ -1133,7 +1517,8 @@ police_all <- police_all %>% .x == 2 ~ 0, .x < 0 ~ .x, TRUE ~ -3 - ) + ), + .names = "{stringr::str_remove(.col, '_raw$')}" ), across( starts_with("polars"), @@ -1142,7 +1527,8 @@ police_all <- police_all %>% .x == 2 ~ 0, .x < 0 ~ .x, TRUE ~ -3 - ) + ), + .names = "{stringr::str_remove(.col, '_raw$')}" ), across( starts_with("polcau"), @@ -1151,8 +1537,60 @@ police_all <- police_all %>% .x == 2 ~ 0, .x < 0 ~ .x, TRUE ~ -3 - ) + ), + .names = "{stringr::str_remove(.col, '_raw$')}" ), + # Add labels + across( + c( + starts_with("polwrn") & !ends_with("raw"), + starts_with("polars") & !ends_with("raw"), + starts_with("polcau") & !ends_with("raw") + ), + ~ labelled( + .x, + labels = c( + "No" = 0, + "Yes" = 1, + common_missing_labels + ) + ) + ) + ) + +# Cross-tabs +pol_warn_pairs <- tibble::tibble( + raw_var = names(police_rec_warning) |> + stringr::str_subset("^(polwrn|polars|polcau).*_raw$") +) |> + dplyr::mutate( + rec_var = stringr::str_remove(raw_var, "_raw$"), + has_rec = rec_var %in% names(police_rec_warning) + ) |> + dplyr::filter(has_rec) |> + dplyr::select(raw_var, rec_var) + +police_warn_crosstabs <- pol_warn_pairs |> + purrr::pmap(function(raw_var, rec_var) { + police_rec_warning |> + dplyr::count( + raw = .data[[raw_var]], + rec = .data[[rec_var]], + name = "n" + ) |> + dplyr::mutate( + raw_var = raw_var, + rec_var = rec_var, + .before = 1 + ) + }) + +police_warn_crosstabs + +## Found guilty, 
penalty notice -------------------------------------------------------------------- + +police_rec_guilty <- police_rec_warning %>% + mutate( across( starts_with("polglt"), ~ case_when( @@ -1160,7 +1598,8 @@ police_all <- police_all %>% .x == 2 ~ 0, .x < 0 ~ .x, TRUE ~ -3 - ) + ), + .names = "{stringr::str_remove(.col, '_raw$')}" ), across( starts_with("polpnd"), @@ -1169,84 +1608,38 @@ police_all <- police_all %>% .x == 2 ~ 0, .x < 0 ~ .x, TRUE ~ -3 - ) - ) - ) %>% - mutate( - pol14 = case_when( - pol14 %in% c(1, 3) ~ 1, - pol14 == 2 ~ 0, - pol14 %in% c(-97, -92) ~ -9, - pol14 == -91 ~ -1, - pol14 %in% c(-96, -1) ~ -8, - pol14 %in% c(-99) ~ -3, - TRUE ~ -3 + ), + .names = "{stringr::str_remove(.col, '_raw$')}" ), - pol15 = recode_pol(pol15), - pol16 = recode_pol(pol16), - pol17 = case_when( - pol17 %in% c(1, 3) ~ 1, - pol17 == 2 ~ 0, - pol17 %in% c(-97, -92) ~ -9, - pol17 == -91 ~ -1, - pol17 %in% c(-96, -1) ~ -8, - pol17 %in% c(-99) ~ -3, - TRUE ~ -3 - ) - ) %>% - mutate( + # Add labels across( c( - starts_with("polwrn"), - starts_with("polars"), - starts_with("polcau"), - starts_with("polglt"), - starts_with("polpnd") + starts_with("polglt") & !ends_with("raw"), + starts_with("polpnd") & !ends_with("raw") ), - ~ factor( - .x, - levels = c(0, 1, -1, -2, -3, -8, -9), - labels = c( - "No", - "Yes", - "Item not applicable", - "Script error/information lost", - "Not asked at the fieldwork stage/participated/interviewed", - "Don’t know/insufficient information", - "Refusal" - ) - ) - ), - across( - c(pol14, pol15, pol16, pol17), - ~ factor( - .x, - levels = c(0, 1, -1, -2, -3, -8, -9), - labels = c( - "No", - "Yes/not in last 3 years", - "Item not applicable", - "Script error/information lost", - "Not asked at the fieldwork stage/participated/interviewed", - "Don’t know/insufficient information", - "Refusal" - ) - ) - ), - across( - c(starts_with("polcnt")), ~ labelled( .x, labels = c( - "Item not applicable" = -1, - "Script error/information lost" = -2, - "Not asked 
at the fieldwork stage/participated/interviewed" = -3, - "Don’t know/insufficient information" = -8, - "Refusal" = -9 + "No" = 0, + "Yes" = 1, + common_missing_labels ) ) ) - ) %>% + ) + +# Cross-tabs +police_rec_guilty %>% + count(polglt25_raw, polglt25) +police_rec_guilty %>% + count(polglt32_raw, polglt32) +police_rec_guilty %>% + count(polpnd25_raw, polpnd25) +police_rec_guilty %>% + count(polpnd32_raw, polpnd32) + +police_all <- police_rec_guilty %>% + select(!ends_with("raw")) %>% select( NSID, pol14, @@ -1264,15 +1657,16 @@ police_all <- police_all %>% starts_with("polpnd") ) -# Bully -------------------------------------------------------------------- +# Bullying -------------------------------------------------------------------- + # Load and harmonise bullying variables across sweeps 1–4, 7–8 bully_vars <- list( S1 = ns_data[["S1youngperson"]] %>% - select(NSID, bul14 = W1bulrc), + select(NSID, bul14_raw = W1bulrc), S2 = ns_data[["S2youngperson"]] %>% - select(NSID, bul15 = W2bulrc), + select(NSID, bul15_raw = W2bulrc), S3 = ns_data[["S3youngperson"]] %>% - select(NSID, bul16 = W3bulrc), + select(NSID, bul16_raw = W3bulrc), S4 = ns_data[["S4youngperson"]] %>% select( NSID, @@ -1298,69 +1692,77 @@ recode_yesno <- function(x) { x == 1 ~ 1, x == 2 ~ 0, x %in% c(-92, -97) ~ -9, - x %in% c(-96, -8) ~ -8, - x == -99 ~ -3, - is.na(x) ~ -3, - TRUE ~ -2 + x %in% c(-8, -1) ~ -8, + .default = -3 ) } # Apply recodes -bully_all <- bully_all %>% +bully_rec <- bully_all %>% mutate( - bul14 = recode_yesno(bul14), - bul15 = recode_yesno(bul15), - bul16 = recode_yesno(bul16), + bul14 = recode_yesno(bul14_raw), + bul15 = recode_yesno(bul15_raw), + bul16 = recode_yesno(bul16_raw), bul17 = case_when( - rowSums(across(starts_with("bul17_")) == 1, na.rm = TRUE) > 0 ~ 1, - rowSums( - across(c("bul17_1", "bul17_2", "bul17_4", "bul17_5", "bul17_6")) == 2, - na.rm = TRUE - ) == - 5 ~ 0, - rowSums( - across(c("bul17_1", "bul17_2", "bul17_4", "bul17_5", "bul17_6")) < 0, - na.rm = 
TRUE - ) > - 0 ~ -8, - rowSums(is.na(across(starts_with("bul17_")))) == 6 ~ -3, - TRUE ~ -2 + # If any of the 6 indicators of bullying in S4 is reported as "yes" (1) -> bullied (1) + if_any(starts_with("bul17_"), ~ .x == 1) ~ 1, + # Else if all 6 indicators of bullying in S4 are reported as "no" (2) -> not bullied (0) + if_all(starts_with("bul17_"), ~ .x == 2) ~ 0, + # Else if any of the 6 indicators of bullying in S4 is refused (-92, -97) -> refused (-9) + if_any(starts_with("bul17_"), ~ .x == -92 | .x == -97) ~ -9, + # Else if any of the 6 indicators of bullying in S4 is don't know/insufficient info (-1) -> -8 + if_any(starts_with("bul17_"), ~ .x == -1) ~ -8, + # Else if any of the 6 indicators of bullying in S4 is not applicable (-91) -> not applicable (-1) + if_any(starts_with("bul17_"), ~ .x == -91) ~ -1, + # Else -> -3 (i.e. not interviewed/asked etc.) + .default = -3 ), bul20 = case_when( - rowSums( - across(starts_with("W7BullyTypeYP0")) > 0 & - across(starts_with("W7BullyTypeYP0")) < 8, - na.rm = TRUE - ) > - 0 ~ 1, - rowSums(across(starts_with("W7BullyTypeYP0")) == 8, na.rm = TRUE) == - 6 ~ 0, - rowSums(across(starts_with("W7BullyTypeYP0")) < 0, na.rm = TRUE) > 0 ~ -8, - rowSums(is.na(across(starts_with("W7BullyTypeYP0")))) == 6 ~ -3, - TRUE ~ -2 + # If any of the indicators of bullying in S7 is reported as between 1 (every day) and 7 (i.e. any frequency other than 8, never) -> bullied (1) + if_any(starts_with("W7BullyTypeYP0"), ~ .x %in% 1:7) ~ 1, + # Else if all of the indicators of bullying in S7 are reported as 8 (never) -> not bullied (0) + if_all(starts_with("W7BullyTypeYP0"), ~ .x == 8) ~ 0, + # Else if any of the indicators of bullying in S7 is refused (-92, -97) -> refused (-9) + if_any(starts_with("W7BullyTypeYP0"), ~ .x %in% c(-92, -97)) ~ -9, + # Else if any of the indicators of bullying in S7 is not applicable (-91) -> -1 + if_any(starts_with("W7BullyTypeYP0"), ~ .x == -91) ~ -1, + # Else -> -3 (i.e. not interviewed/asked etc.)
+ .default = -3 ), bul25 = case_when( - rowSums(across(starts_with("W8BULLYTYPE0")) == 1, na.rm = TRUE) > 0 ~ 1, - rowSums(across(starts_with("W8BULLYTYPE0")) == 2, na.rm = TRUE) == 7 ~ 0, - rowSums(across(starts_with("W8BULLYTYPE0")) < 0, na.rm = TRUE) > 0 ~ -8, - rowSums(is.na(across(starts_with("W8BULLYTYPE0")))) == 7 ~ -3, - TRUE ~ -2 + # If any of the indicators of bullying in S8 reported as 1 (yes) -> bullied (1), + if_any(starts_with("W8BULLYTYPE0"), ~ .x == 1) ~ 1, + # Else if all of the indicators of bullying in S8 are reported as 2 (no) -> not bullied (0) + if_all(starts_with("W8BULLYTYPE0"), ~ .x == 2) ~ 0, + # Else if any of the indicators of bullying in S8 is refused (-9) -> refused (-9) + if_any(starts_with("W8BULLYTYPE0"), ~ .x == -9) ~ -9, + # Else if any of the indicators of bullying in S8 is don't know/insufficient info (-8) -> -8 + if_any(starts_with("W8BULLYTYPE0"), ~ .x == -8) ~ -8, + # Else if any of the indicators of bullying in S8 is not applicable (-1) -> not applicable (-1) + if_any(starts_with("W8BULLYTYPE0"), ~ .x == -1) ~ -1, + # Else if any of the indicators of bullying in S8 is not interviewed/asked etc. 
(-3) -> -3 + if_any(starts_with("W8BULLYTYPE0"), ~ .x == -3) ~ -3 ) ) %>% mutate(across( - starts_with("bul"), - ~ factor( + starts_with("bul") & !ends_with("raw"), + ~ labelled( .x, - levels = c(0, 1, -1, -2, -3, -8, -9), labels = c( - "No", - "Yes", - "Item not applicable", - "Script error/information lost", - "Not asked at the fieldwork stage/participated/interviewed", - "Don’t know/insufficient information", - "Refusal" + "No" = 0, + "Yes" = 1, + common_missing_labels ) ) - )) %>% + )) + +# Cross-tabs +bully_rec %>% + count(bul14_raw, bul14) +bully_rec %>% + count(bul15_raw, bul15) +bully_rec %>% + count(bul16_raw, bul16) + +bully_all <- bully_rec %>% select(NSID, bul14, bul15, bul16, bul17, bul20, bul25) diff --git a/build-core-dataset.R b/build-core-dataset.R index be2f3e8..494f02f 100644 --- a/build-core-dataset.R +++ b/build-core-dataset.R @@ -58,26 +58,24 @@ long_vars <- read_dta(file.path(data_path, sweeps$longitudinal)) %>% W8OUTCOME, W9OUTCOME ), - ~ factor( + ~ labelled( .x, - levels = c(1, 2, 3, 4, 5, 6, -1), labels = c( - "Productive", - "Refusal", - "Non-contact and other unproductive", - "Ineligible", - "Untraced", - "Not issued", - "No contact" + "Productive" = 1, + "Refusal" = 2, + "Non-contact and other unproductive" = 3, + "Ineligible" = 4, + "Untraced" = 5, + "Not issued" = 6, + "No contact" = -1 ) ) ), - DATA_AVAILABILITY = factor( + DATA_AVAILABILITY = labelled( DATA_AVAILABILITY, - levels = c(0, 1), - labels = c("Not available", "Available for research") + labels = c("Not available" = 0, "Available for research" = 1) ), - MAINBOOST = factor(MAINBOOST, levels = c(1, 2), labels = c("Main", "Boost")) + MAINBOOST = labelled(MAINBOOST, labels = c("Main" = 1, "Boost" = 2)) ) # Merge All Datasets -------------------------------------------------------------------- @@ -113,7 +111,7 @@ derived_vars <- list( lsi_all, smoking_all, alc_all_clean, - drug_final, + drug_all_clean, spt_all, absence_all, suspend_expel_all,