diff --git a/R/06-behaviour.R b/R/06-behaviour.R
index 5760954..0c5c0b2 100644
--- a/R/06-behaviour.R
+++ b/R/06-behaviour.R
@@ -8,41 +8,78 @@
 # or manually run 00-load-raw-data.R before this script.
 
 # Smoke --------------------------------------------------------------------
-
 # Load smoking data from relevant sweeps
 smoking_vars <- list(
   S1 = ns_data[["S1youngperson"]] %>%
-    select(NSID, smknw14 = W1cignowYP, smk14 = W1cigfreqYP),
+    select(NSID, smk14_ever = W1cignowYP, smk14_freq = W1cigfreqYP),
   S2 = ns_data[["S2youngperson"]] %>%
-    select(NSID, smknw15 = W2cignowYP, smk15 = W2cigfreqYP),
+    select(NSID, smk15_ever = W2cignowYP, smk15_freq = W2cigfreqYP),
   S3 = ns_data[["S3youngperson"]] %>%
-    select(NSID, smknw16 = W3cignowYP, smk16 = W3cigfreqYP),
+    select(NSID, smk16_ever = W3cignowYP, smk16_freq = W3cigfreqYP),
   S4 = ns_data[["S4youngperson"]] %>%
     select(NSID),
   S8 = ns_data[["S8selfcompletion"]] %>%
-    select(NSID, smk25 = W8SMOKING),
+    select(NSID, smk25_ever_freq = W8SMOKING),
   S9 = ns_data[["S9maininterview"]] %>%
-    select(NSID, smk32 = W9SMOKING)
+    select(NSID, smk32_ever_freq = W9SMOKING)
 )
 
 # Merge all sweeps
 smoking_all <- reduce(smoking_vars, full_join, by = "NSID")
 
-# Recode smoke ever/frequency
-recode_smk14_16 <- function(x) {
+
+# Merge all sweeps
+# smoking_all <- reduce(smoking_vars, full_join, by = "NSID") %>%
+# Rename all smknw to smk_ever and smk to smk_freq for readability
+#  rename_with(
+#    ~ stringr::str_replace()
+# Add '_raw' suffix to all 'smk*' variable names for simpler re-coding & cross-checks
+#  rename_with(
+#    ~ stringr::str_c(.x, "_raw"),
+#    contains("smk")
+#  )
+
+# Note on smoking variables:
+## In some sweeps (here: ages 14-16), participants were first asked whether they ever smoke [smk*_ever variables].
+## If they answered yes, they were then asked how often they smoke(d) [smk*_freq variables].
+## This means that a person who did not smoke has a missing frequency ['Not applicable'].
+
+## Standardise values --------------------------------------------------------------------
+
+## The following code will convert missing values and responses to a common coding scheme.
+
+# Recode 'ever smoke' (yes/no) for ages 14-16 to 1/0 plus the missing codes -1/-3/-8/-9
+recode_smk_ever_14_16 <- function(x) {
+  case_when(
+    x == 1 ~ 1, # Yes
+    x == 2 ~ 0, # No
+    x == -96 ~ -3, # -> not interviewed/asked (same target as the fallback; kept explicit)
+    x %in% c(-92, -97) ~ -9, # -> refusal
+    x == -91 ~ -1, # -> not applicable
+    x == -1 ~ -8, # -> don't know/insufficient information
+    x == -99 ~ -3, # YP not interviewed
+    TRUE ~ -3 # any other value, including NA -> not interviewed/asked
+  )
+}
+
+# Recode smoking frequency for age 14-16
+recode_smk_freq_14_16 <- function(x) {
   case_when(
-    x %in% c(1, 2, -91) ~ 0, # Never
+    x %in% c(1, 2) ~ 0, # Never
     x == 3 ~ 1, # used to, don’t at all now
     x %in% c(4, 5) ~ 2, #  smoke cigs occasionally – not every day
     x == 6 ~ 3, # smoke cigs almost every day
-    x %in% c(-99, -97, -96) ~ -2,
-    x == -92 ~ -9,
+    x == -91 ~ -1,
+    x == -96 ~ -3,
+    x %in% c(-92, -97) ~ -9,
     x == -1 ~ -8,
+    x == -99 ~ -3, # YP not interviewed
     TRUE ~ -3
   )
 }
 
-recode_smk25_32 <- function(x) {
+# Derive smoking frequency for age 25 and 32
+recode_smk_25_32_to_freq <- function(x) {
   case_when(
     x > 0 ~ x - 1, # Convert 1-4 to 0-3
     x == -9 ~ -9,
@@ -52,20 +89,8 @@ recode_smk25_32 <- function(x) {
   )
 }
 
-# Recode smoke now
-recode_smknw14_16 <- function(x) {
-  case_when(
-    x == 1 ~ 1, # Yes
-    x == 2 ~ 0, # No
-    x %in% c(-99, -97, -96) ~ -2,
-    x == -92 ~ -9,
-    x == -91 ~ -1,
-    x == -1 ~ -8,
-    TRUE ~ -3
-  )
-}
-
-recode_smknw25_32 <- function(x) {
+# Derive binary smoking status for age 25 and 32
+recode_smk_25_32_to_ever <- function(x) {
   case_when(
     x %in% c(0, 1) ~ 0,
     x %in% c(2, 3) ~ 1,
@@ -73,70 +98,151 @@ recode_smknw25_32 <- function(x) {
   )
 }
 
-# Apply recoding
-smoking_all <- smoking_all %>%
+smoking_std <- smoking_all %>%
   mutate(
-    smk14 = recode_smk14_16(smk14),
-    smk15 = recode_smk14_16(smk15),
-    smk16 = recode_smk14_16(smk16),
-    smk25 = recode_smk25_32(smk25),
-    smk32 = recode_smk25_32(smk32)
-  ) %>%
+    # Smoking ever age 14-16
+    across(
+      c(smk14_ever, smk15_ever, smk16_ever),
+      recode_smk_ever_14_16,
+      .names = "{col}_std"
+    ),
+    # Smoking freq age 14-16
+    across(
+      c(smk14_freq, smk15_freq, smk16_freq),
+      recode_smk_freq_14_16,
+      .names = "{col}_std"
+    ),
+    # Smoking freq age 25-32 (derived from combined ever/freq fields)
+    smk25_freq_std = recode_smk_25_32_to_freq(smk25_ever_freq),
+    smk32_freq_std = recode_smk_25_32_to_freq(smk32_ever_freq),
+    # Smoking ever age 25-32 (binary, derived from recoded freq)
+    smk25_ever_std = recode_smk_25_32_to_ever(smk25_freq_std),
+    smk32_ever_std = recode_smk_25_32_to_ever(smk32_freq_std)
+  )
+
+## Derive smoking status variables --------------------------------------------------------------------
+
+# Helpers
+
+# Derive binary current smoking status ('smknw[age]') from the standardised EVER and FREQ variables.
+# A person counts as 'not currently smoking' if either of the following conditions is met:
+# i) indicated not smoking when asked 'Do you ever smoke cigarettes at all?' [EVER questions]
+# ii) indicated they never smoke, they tried smoking only once, or they used to smoke but not anymore [FREQ questions]
+derive_smk_now <- function(ever_var, freq_var) {
+  case_when(
+    ever_var == 0 ~ 0L, # If EVER: 'Not smoking' -> "No"
+    freq_var %in% c(0, 1) ~ 0L, # otherwise if FREQ: 'Never smoked'/'Only once' or 'Used to smoke but never now' -> 'No'
+    ever_var == 1 ~ 1L, # otherwise if EVER: 'Yes' to smoking -> 'Yes'
+    # Missing values (checked in order of priority: -9, then -8, then -1):
+    freq_var == -9 | ever_var == -9 ~ -9L, # otherwise: if either refused -> 'Refusal'
+    freq_var == -8 | ever_var == -8 ~ -8L, # otherwise: if either dk/insufficient info -> 'dk/insufficient info'
+    freq_var == -1 | ever_var == -1 ~ -1L, # otherwise: if either not applicable -> 'not applicable'
+    .default = -3L # everything else (including NA) defaults to 'not interviewed/asked etc.'
+  )
+}
+
+# Derive detailed smoking status ('smk[age]') as an integer code; all branches are integer so the result is not promoted to double.
+derive_smk_detailed <- function(ever_var, freq_var) {
+  case_when(
+    freq_var >= 0 ~ as.integer(freq_var), # a valid FREQ answer takes precedence
+    ever_var == 0 ~ 0L, # Those who replied 'not ever smoke' -> 'never' (limitation: they were not asked if they never smoked)
+    freq_var == -9 | ever_var == -9 ~ -9L, # otherwise: if either refused -> 'Refusal'
+    freq_var == -8 | ever_var == -8 ~ -8L, # otherwise: if either dk/insufficient info -> 'dk/insufficient info'
+    freq_var == -1 | ever_var == -1 ~ -1L, # otherwise: if either not applicable -> 'not applicable'
+    .default = -3L # everything else (including NA) -> 'not interviewed/asked etc.'
+  )
+}
+
+smoking_rec <- smoking_std %>%
   mutate(
-    smknw14 = case_when(
-      smk14 == 0 ~ 0,
-      smknw14 > 0 ~ recode_smknw14_16(smknw14),
-      TRUE ~ recode_smknw14_16(smknw14)
+    # Detailed smoking frequency
+    smk14 = derive_smk_detailed(
+      ever_var = smk14_ever_std,
+      freq_var = smk14_freq_std
+    ),
+    smk15 = derive_smk_detailed(
+      ever_var = smk15_ever_std,
+      freq_var = smk15_freq_std
+    ),
+    smk16 = derive_smk_detailed(
+      ever_var = smk16_ever_std,
+      freq_var = smk16_freq_std
+    ),
+    smk25 = derive_smk_detailed(
+      ever_var = smk25_ever_std,
+      freq_var = smk25_freq_std
+    ),
+    smk32 = derive_smk_detailed(
+      ever_var = smk32_ever_std,
+      freq_var = smk32_freq_std
+    ),
+
+    # Binary current smoking status
+    smknw14 = derive_smk_now(
+      ever_var = smk14_ever_std,
+      freq_var = smk14_freq_std
+    ),
+    smknw15 = derive_smk_now(
+      ever_var = smk15_ever_std,
+      freq_var = smk15_freq_std
     ),
-    smknw15 = case_when(
-      smk15 == 0 ~ 0,
-      smknw15 > 0 ~ recode_smknw14_16(smknw15),
-      TRUE ~ recode_smknw14_16(smknw15)
+    smknw16 = derive_smk_now(
+      ever_var = smk16_ever_std,
+      freq_var = smk16_freq_std
     ),
-    smknw16 = case_when(
-      smk16 == 0 ~ 0,
-      smknw16 > 0 ~ recode_smknw14_16(smknw16),
-      TRUE ~ recode_smknw14_16(smknw16)
+    smknw25 = derive_smk_now(
+      ever_var = smk25_ever_std,
+      freq_var = smk25_freq_std
     ),
-    smknw25 = recode_smknw25_32(smk25),
-    smknw32 = recode_smknw25_32(smk32)
+    smknw32 = derive_smk_now(
+      ever_var = smk32_ever_std,
+      freq_var = smk32_freq_std
+    )
   ) %>%
   mutate(
     across(
       c(smk14, smk15, smk16, smk25, smk32),
-      ~ factor(
+      ~ labelled(
         .x,
-        levels = c(0, 1, 2, 3, -1, -2, -3, -8, -9),
         labels = c(
-          "Never",
-          "Used to smoke, don’t at all now",
-          "Smoke occasionally – not every day",
-          "Smoke almost every day",
-          "Item not applicable",
-          "Script error/information lost",
-          "Not asked at the fieldwork stage/participated/interviewed",
-          "Don’t know/insufficient information",
-          "Refusal"
+          "Never" = 0,
+          "Used to smoke, don’t at all now" = 1,
+          "Smoke occasionally – not every day" = 2,
+          "Smoke almost every day" = 3,
+          common_missing_labels
         )
       )
     ),
     across(
       c(smknw14, smknw15, smknw16, smknw25, smknw32),
-      ~ factor(
+      ~ labelled(
         .x,
-        levels = c(0, 1, -1, -2, -3, -8, -9),
         labels = c(
-          "No",
-          "Yes",
-          "Item not applicable",
-          "Script error/information lost",
-          "Not asked at the fieldwork stage/participated/interviewed",
-          "Don’t know/insufficient information",
-          "Refusal"
+          "No" = 0,
+          "Yes" = 1,
+          common_missing_labels
         )
       )
     )
-  ) %>%
+  )
+
+# Checks: cross-tabulate the raw inputs against the derived smk*/smknw* variables (only shown where top-level values auto-print, e.g. interactively)
+smoking_rec %>%
+  count(smk14_ever, smk14_freq, smk14, smknw14)
+
+smoking_rec %>%
+  count(smk15_ever, smk15_freq, smk15, smknw15)
+
+smoking_rec %>%
+  count(smk16_ever, smk16_freq, smk16, smknw16)
+
+smoking_rec %>%
+  count(smk25_ever_freq, smk25, smknw25)
+
+smoking_rec %>%
+  count(smk32_ever_freq, smk32, smknw32)
+
+smoking_all <- smoking_rec %>%
   select(
     NSID,
     smknw14,
@@ -178,24 +284,47 @@ alc_vars <- list(
 )
 
 # Merge all alcohol variables by NSID
-alc_all <- reduce(alc_vars, full_join, by = "NSID")
-
-# First Time Had Alcohol
-# Code review / MD (2025-12-05): Vectorised rewrite of the original rowwise() code to improve speed.
-# This reproduces the original behaviour, including the current treatment of cases
-# with all alcohol indicators missing as "never had alcohol" (99).
-# Substantive logic (especially the 'never had alcohol' definition) to be revisited
-# at a later debugging/clean-up stage.
-alc_all <- alc_all |>
+alc_all <- reduce(alc_vars, full_join, by = "NSID") %>%
+  # Add '_raw' suffix to all 'audit*' variable names for simpler re-coding & cross-checks
+  rename_with(
+    ~ stringr::str_c(.x, "_raw"),
+    contains("audit")
+  )
+
+
+## First time had alcohol --------------------------------------------------------------------
+
+# Helpers: Derive 'not drinking' from alcever and audita
+# This will be used to derive never drinkers.
+never_from_alcever <- function(x) {
+  # Returns 1 = "never" (code 2), 0 = "ever" (code 1), NA for negative/other/missing codes
+  dplyr::case_when(
+    x == 1 ~ 0,
+    x == 2 ~ 1,
+    .default = NA_real_
+  )
+}
+
+never_from_audita <- function(x) {
+  # Returns 1 = "never" (code 1), 0 = "ever" (codes above 1), NA for negative/other/missing codes
+  dplyr::case_when(
+    x > 1 ~ 0,
+    x == 1 ~ 1,
+    .default = NA_real_
+  )
+}
+
+alc_first_age_rec <- alc_all |>
   mutate(
+    # Derive age first known drinking
     ever14 = if_else(alcever_S1 == 1 & alcmon_S1 == 1, 14, NA_real_),
     ever15 = if_else(alcever_S2 == 1, 15, NA_real_),
     ever16 = if_else(alcever_S3 == 1, 16, NA_real_),
     ever17 = if_else(alcever_S4 == 1, 17, NA_real_),
     ever19 = if_else(alcever_S6 == 1, 19, NA_real_),
     ever20 = if_else(alcever_S7 == 1, 20, NA_real_),
-    ever25 = if_else(audita25 > 1, 25, NA_real_),
-    ever32 = if_else(audita32 > 1, 32, NA_real_),
+    ever25 = if_else(audita25_raw > 1, 25, NA_real_),
+    ever32 = if_else(audita32_raw > 1, 32, NA_real_),
 
     first_age_raw = pmin(
       ever14,
@@ -208,38 +337,65 @@ alc_all <- alc_all |>
       ever32,
       na.rm = TRUE
     ),
-    first_age_raw = if_else(
-      is.infinite(first_age_raw),
-      NA_real_,
-      first_age_raw
-    ),
-
-    # This reproduces `all(..., na.rm = TRUE)` but vectorised
-    never_alc = rowSums(
-      cbind(
-        alcever_S1 == 2,
-        alcever_S2 == 2,
-        alcever_S3 == 2,
-        alcever_S4 == 2,
-        alcever_S6 == 2,
-        alcever_S7 == 2,
-        audita25 == 1,
-        audita32 == 1
-      ) ==
-        FALSE,
-      na.rm = TRUE
-    ) ==
-      0,
 
+    # Derive known never drinking
+    # recode to 1 = "never", 0 = "ever", NA = missing
+    across(
+      c(alcever_S1, alcever_S2, alcever_S3, alcever_S4, alcever_S6, alcever_S7),
+      never_from_alcever,
+      .names = "never_{.col}"
+    ),
+    across(
+      c(audita25_raw, audita32_raw),
+      never_from_audita,
+      .names = "never_{.col}"
+    ),
+    # Derive never drinkers:
+    # - 1 = all items observed & all never
+    # - 0 = at least one drinker
+    # - NA = no drinkers but some/all missing
+    never_alc = case_when(
+      # any 0 -> drinker
+      if_any(starts_with("never_"), ~ dplyr::coalesce(.x == 0, FALSE)) ~ 0L,
+      # all observed & 1
+      if_all(starts_with("never_"), ~ !is.na(.x) & .x == 1) ~ 1L,
+      # otherwise NA
+      .default = NA
+    ),
+    # First age -> use the first age when not missing.
+    # If never drinking, set to 99.
+    # Anything else is missing.
     alcfst = case_when(
       !is.na(first_age_raw) ~ first_age_raw,
-      never_alc ~ 99,
-      TRUE ~ -8
+      never_alc == 1 ~ 99,
+      .default = -8
     )
   ) |>
-  select(-starts_with("ever"), -first_age_raw, -never_alc)
+  select(-starts_with(c("ever", "never")), -first_age_raw)
+
+# Add labels
+alc_first_age_rec <- alc_first_age_rec %>%
+  mutate(
+    alcfst = labelled(
+      alcfst,
+      labels = c(
+        "Age 14" = 14,
+        "Age 15" = 15,
+        "Age 16" = 16,
+        "Age 17" = 17,
+        "Age 19" = 19,
+        "Age 20" = 20,
+        "Age 25" = 25,
+        "Age 32" = 32,
+        "Never had alcohol" = 99,
+        common_missing_labels
+      )
+    )
+  )
 
-# function - Frequency Recode Across Sweeps
+## Alcohol frequency --------------------------------------------------------------------
+
+# Helpers
 recode_freq <- function(x, sweep, ever) {
   case_when(
     sweep %in% c("S2", "S3", "S4") ~ case_when(
@@ -249,11 +405,11 @@ recode_freq <- function(x, sweep, ever) {
       x == 5 ~ 1, # once every couple of month
       x == 6 ~ 0, # less often/not at all
       ever == 2 ~ 0, # less often/not at all
-      x %in% c(-99, -97, -96) ~ -2,
-      x == -92 ~ -9,
-      x == -1 ~ -1,
-      x == -91 ~ -1,
-      TRUE ~ -3
+      # Missing  values:
+      x %in% c(-97, -92) | ever %in% c(-97, -92) ~ -9, # refusal if either refused,
+      x == -1 | ever == -1 ~ -8, # dk/missing info
+      x == -91 | ever == -91 ~ -1, # not applicable
+      .default = -3
     ),
     sweep %in% c("S6", "S7") ~ case_when(
       x %in% c(1, 2) ~ 4,
@@ -263,17 +419,17 @@ recode_freq <- function(x, sweep, ever) {
       x %in% c(7, 8) ~ 0,
       ever == 2 ~ 0,
       x == -997 ~ -2,
-      x == -97 ~ -9,
-      x == -92 ~ -9,
-      x == -91 ~ -1,
-      x == -1 ~ -1,
-      TRUE ~ -3
+      # Missing  values:
+      x %in% c(-97, -92) | ever %in% c(-97, -92) ~ -9, # refusal if either refused,
+      x == -1 | ever == -1 ~ -8, # dk/missing info
+      x == -91 | ever == -91 ~ -1, # not applicable
+      .default = -3
     )
   )
 }
 
-# recode frequency Variables
-alc_all <- alc_all %>%
+# Recode frequency variables
+alc_freq_rec <- alc_first_age_rec %>%
   mutate(
     alcfreq14 = case_when(
       alcfreq_S1 == 1 ~ 4,
@@ -284,158 +440,147 @@ alc_all <- alc_all %>%
       alcfreq_S1 == 6 ~ 0,
       alcever_S1 == 2 ~ 0,
       alcmon_S1 == 2 ~ 0,
-      alcfreq_S1 %in% c(-99, -97, -96) ~ -2,
-      alcfreq_S1 == -92 ~ -9,
-      alcfreq_S1 == -1 ~ -1,
-      alcfreq_S1 == -91 ~ -1,
-      TRUE ~ -3
+      # Missing  values:
+      alcfreq_S1 %in%
+        c(-97, -92) |
+        alcmon_S1 %in% c(-97, -92) |
+        alcever_S1 %in% c(-97, -92) ~ -9, # refusal if either refused,
+      alcfreq_S1 == -1 | alcmon_S1 == -1 | alcever_S1 == -1 ~ -8, # dk/missing info
+      alcfreq_S1 == -91 | alcmon_S1 == -91 | alcever_S1 == -91 ~ -1, # not applicable
+      .default = -3
     ),
     alcfreq15 = recode_freq(alcfreq_S2, "S2", alcever_S2),
     alcfreq16 = recode_freq(alcfreq_S3, "S3", alcever_S3),
     alcfreq17 = recode_freq(alcfreq_S4, "S4", alcever_S4),
     alcfreq19 = recode_freq(alcfreq_S6, "S6", alcever_S6),
     alcfreq20 = recode_freq(alcfreq_S7, "S7", alcever_S7),
+  ) %>%
+  # Add labels
+  mutate(
+    across(
+      c(alcfreq14, alcfreq15, alcfreq16, alcfreq17, alcfreq19, alcfreq20),
+      ~ labelled(
+        .x,
+        labels = c(
+          "Less often/not at all" = 0,
+          "Once every couple of months" = 1,
+          "Once to three times a month" = 2,
+          "Once or twice a week" = 3,
+          "Most days" = 4,
+          common_missing_labels
+        )
+      )
+    )
   )
 
+# Check
+## Cross-tabs for alcfreq
+{
+  # Build separate cross-tabs for each sweep with alcfreq_S*, alcever_S* first and alcfreq* last
+  freq_map <- list(
+    S1 = c("alcever_S1", "alcfreq_S1", "alcmon_S1", "alcfreq14"),
+    S2 = c("alcever_S2", "alcfreq_S2", "alcfreq15"),
+    S3 = c("alcever_S3", "alcfreq_S3", "alcfreq16"),
+    S4 = c("alcever_S4", "alcfreq_S4", "alcfreq17"),
+    S6 = c("alcever_S6", "alcfreq_S6", "alcfreq19"),
+    S7 = c("alcever_S7", "alcfreq_S7", "alcfreq20")
+  )
+
+  alc_freq_crosstabs <- purrr::imap(freq_map, function(cols, sweep) {
+    alc_freq_rec %>%
+      dplyr::group_by(dplyr::across(dplyr::all_of(cols))) %>%
+      dplyr::summarise(n = dplyr::n(), .groups = "drop")
+  })
+}
 
-alc_all_clean <- alc_all %>%
+## AUDIT-C --------------------------------------------------------------------
+
+alc_all_clean <- alc_freq_rec %>%
   mutate(
     audita25 = case_when(
-      audita25 > 0 ~ audita25 - 1,
-      audita25 < 0 ~ audita25,
-      is.na(audita25) ~ -3,
+      audita25_raw > 0 ~ audita25_raw - 1,
+      audita25_raw < 0 ~ audita25_raw,
+      is.na(audita25_raw) ~ -3,
     ),
     audita32 = case_when(
-      audita32 > 0 ~ audita32 - 1,
-      audita32 < 0 ~ audita32,
-      is.na(audita32) ~ -3,
+      audita32_raw > 0 ~ audita32_raw - 1,
+      audita32_raw < 0 ~ audita32_raw,
+      is.na(audita32_raw) ~ -3,
     )
   ) %>%
   mutate(
     auditb25 = case_when(
       audita25 == 0 ~ 0,
-      audita25 > 0 & auditb25 > 0 ~ auditb25,
-      auditb25 < 0 ~ auditb25,
-      is.na(auditb25) ~ -3
+      audita25 > 0 & auditb25_raw > 0 ~ auditb25_raw,
+      auditb25_raw < 0 ~ auditb25_raw,
+      is.na(auditb25_raw) ~ -3
     ),
     auditb32 = case_when(
       audita32 == 0 ~ 0,
-      audita32 > 0 & auditb32 > 0 ~ auditb32,
-      auditb32 < 0 ~ auditb32,
-      is.na(auditb32) ~ -3
+      audita32 > 0 & auditb32_raw > 0 ~ auditb32_raw,
+      auditb32_raw < 0 ~ auditb32_raw,
+      is.na(auditb32_raw) ~ -3
     ),
     auditc25 = case_when(
       audita25 == 0 ~ 0,
-      is.na(auditc25) ~ -3,
-      auditc25 < 0 ~ auditc25,
-      TRUE ~ auditc25 - 1
+      is.na(auditc25_raw) ~ -3,
+      auditc25_raw < 0 ~ auditc25_raw,
+      TRUE ~ auditc25_raw - 1
     ),
     auditc32 = case_when(
       audita32 == 0 ~ 0,
-      is.na(auditc32) ~ -3,
-      auditc32 < 0 ~ auditc32,
-      TRUE ~ auditc32 - 1
+      is.na(auditc32_raw) ~ -3,
+      auditc32_raw < 0 ~ auditc32_raw,
+      TRUE ~ auditc32_raw - 1
     )
   ) %>%
   mutate(
-    alcfst = factor(
-      alcfst,
-      levels = c(14, 15, 16, 17, 19, 20, 25, 32, 99, -1, -2, -3, -8, -9),
-      labels = c(
-        "Age 14",
-        "Age 15",
-        "Age 16",
-        "Age 17",
-        "Age 19",
-        "Age 20",
-        "Age 25",
-        "Age 32",
-        "Never had alcohol",
-        "Item not applicable",
-        "Script error/information lost",
-        "Not asked at the fieldwork stage/participated/interviewed",
-        "Don’t know/insufficient information",
-        "Refusal"
-      )
-    ),
-    across(
-      c(alcfreq14, alcfreq15, alcfreq16, alcfreq17, alcfreq19, alcfreq20),
-      ~ factor(
-        .x,
-        levels = c(0, 1, 2, 3, 4, -1, -2, -3, -8, -9),
-        labels = c(
-          "Less often/not at all",
-          "Once every couple of months",
-          "Once to three times a month",
-          "Once or twice a week",
-          "Most days",
-          "Item not applicable",
-          "Script error/information lost",
-          "Not asked at the fieldwork stage/participated/interviewed",
-          "Don’t know/insufficient information",
-          "Refusal"
-        )
-      )
-    ),
     across(
       c(audita25, audita32),
-      ~ factor(
+      ~ labelled(
         .x,
-        levels = c(0, 1, 2, 3, 4, -1, -2, -3, -8, -9),
         labels = c(
-          "Never",
-          "Monthly or less",
-          "2–4 times a month",
-          "2–3 times a week",
-          "4 or more times a week",
-          "Item not applicable",
-          "Script error/information lost",
-          "Not asked at the fieldwork stage/participated/interviewed",
-          "Don’t know/insufficient information",
-          "Refusal"
+          "Never" = 0,
+          "Monthly or less" = 1,
+          "2–4 times a month" = 2,
+          "2–3 times a week" = 3,
+          "4 or more times a week" = 4,
+          common_missing_labels
         )
       )
     ),
     across(
       c(auditb25, auditb32),
-      ~ factor(
+      ~ labelled(
         .x,
-        levels = c(0, 1, 2, 3, 4, 5, -1, -2, -3, -8, -9),
         labels = c(
-          "0",
-          "1–2 drinks",
-          "3–4 drinks",
-          "5–6 drinks",
-          "7–9 drinks",
-          "10+",
-          "Item not applicable",
-          "Script error/information lost",
-          "Not asked at the fieldwork stage/participated/interviewed",
-          "Don’t know/insufficient information",
-          "Refusal"
+          "0" = 0,
+          "1–2 drinks" = 1,
+          "3–4 drinks" = 2,
+          "5–6 drinks" = 3,
+          "7–9 drinks" = 4,
+          "10+" = 5,
+          common_missing_labels
         )
       )
     ),
     across(
       c(auditc25, auditc32),
-      ~ factor(
+      ~ labelled(
         .x,
-        levels = c(0, 1, 2, 3, 4, -1, -2, -3, -8, -9),
         labels = c(
-          "Never",
-          "Less than monthly",
-          "Monthly",
-          "Weekly",
-          "Daily or almost daily",
-          "Item not applicable",
-          "Script error/information lost",
-          "Not asked at the fieldwork stage/participated/interviewed",
-          "Don’t know/insufficient information",
-          "Refusal"
+          "Never" = 0,
+          "Less than monthly" = 1,
+          "Monthly" = 2,
+          "Weekly" = 3,
+          "Daily or almost daily" = 4,
+          common_missing_labels
         )
       )
     )
-  ) %>%
+  )
+
+alc_all_clean <- alc_all_clean %>%
   select(
     NSID,
     alcfst,
@@ -454,7 +599,15 @@ alc_all_clean <- alc_all %>%
   )
 
 # Drug Use --------------------------------------------------------------------
+
 # Load drug use data from relevant sweeps
+## Naming convention:
+## canevr - ever used cannabis
+## now_cann - currently using cannabis
+## yr_cann - used cannabis in past 12 months
+## othevr - ever used other drugs
+## now_oth - currently using other drugs
+## yr_oth - used other drugs in past 12 months
 drug_vars <- list(
   S1 = ns_data[["S1youngperson"]] %>%
     select(NSID, canevr14 = W1canntryYP),
@@ -478,32 +631,39 @@ drug_vars <- list(
       canevr20 = W7DrugYP1YP0a,
       othevr20 = W7DrugYP1YP0b,
       now_cann20 = W7DrugOftenYP0a,
-      now_oth20 = starts_with("W7DrugOftenYP0")
+      now_oth20 = starts_with("W7DrugOftenYP0") & !any_of("W7DrugOftenYP0a")
     ),
   S8 = ns_data[["S8selfcompletion"]] %>%
     select(
       NSID,
       canevr25 = W8DRUGYP10A,
-      othevr25 = starts_with("W8DRUGYP10"),
+      othevr25 = starts_with("W8DRUGYP10") & !any_of("W8DRUGYP10A"),
       yr_cann25 = W8DRUGYP20A,
-      yr_oth25 = starts_with("W8DRUGYP20"),
+      yr_oth25 = starts_with("W8DRUGYP20") & !any_of("W8DRUGYP20A"),
       now_cann25 = W8DRUGOFTEN0A,
-      now_oth25 = starts_with("W8DRUGOFTEN0")
+      now_oth25 = starts_with("W8DRUGOFTEN0") & !any_of("W8DRUGOFTEN0A")
     ),
   S9 = ns_data[["S9maininterview"]] %>%
     select(
       NSID,
       canevr32 = W9DRUGYP10A,
-      othevr32 = starts_with("W9DRUGYP1"),
+      othevr32 = starts_with("W9DRUGYP1") & !any_of("W9DRUGYP10A"),
       yr_cann32 = W9DRUGYP20A,
-      yr_oth32 = starts_with("W9DRUGYP2"),
+      yr_oth32 = starts_with("W9DRUGYP2") & !any_of("W9DRUGYP20A"),
       now_cann32 = W9DRUGOFTEN0A,
-      now_oth32 = starts_with("W9DRUGOFTEN0")
+      now_oth32 = starts_with("W9DRUGOFTEN0") & !any_of("W9DRUGOFTEN0A")
     )
 )
 
 # Merge all datasets
-drug_all <- reduce(drug_vars, full_join, by = "NSID")
+drug_all <- reduce(drug_vars, full_join, by = "NSID") %>%
+  # Add '_raw' suffix to all variable names for simpler re-coding & cross-checks
+  rename_with(
+    .fn = ~ stringr::str_c(.x, "_raw"),
+    .cols = !contains("NSID")
+  )
+
+## Recode original response options --------------------------------------------------------------------
 
 # functions: Recode function preserving missing values
 # recode for whether using drug ever for each sweep
@@ -544,32 +704,39 @@ recode_drugbin89 <- function(x) {
 }
 
 # Recode cannabis & other drug variables
-drug_all <- drug_all %>%
+drug_rec <- drug_all %>%
   mutate(
     across(
       c(
-        canevr14,
-        canevr15,
-        canevr16,
-        canevr17,
-        canevr19,
-        canevr20,
-        now_cann19,
-        othevr19,
-        othevr20,
-        now_oth19
+        canevr14_raw,
+        canevr15_raw,
+        canevr16_raw,
+        canevr17_raw,
+        canevr19_raw,
+        canevr20_raw,
+        now_cann19_raw,
+        othevr19_raw,
+        othevr20_raw,
+        now_oth19_raw
       ),
-      recode_drugbin1_7
+      recode_drugbin1_7,
+      # Remove '_raw' suffix from new variable names
+      .names = "{stringr::str_remove(.col, '_raw$')}"
+    ),
+    across(
+      c(now_cann20_raw, starts_with("now_oth20")),
+      recode_drugoft7,
+      # Remove '_raw' suffix from new variable names
+      .names = "{stringr::str_remove(.col, '_raw$')}"
     ),
-    across(c(now_cann20, starts_with("now_oth20")), recode_drugoft7),
     across(
       c(
-        canevr25,
-        canevr32,
-        yr_cann25,
-        yr_cann32,
-        now_cann25,
-        now_cann32,
+        canevr25_raw,
+        canevr32_raw,
+        yr_cann25_raw,
+        yr_cann32_raw,
+        now_cann25_raw,
+        now_cann32_raw,
         starts_with("yr_oth25"),
         starts_with("yr_oth32"),
         starts_with("othevr25"),
@@ -577,65 +744,130 @@ drug_all <- drug_all %>%
         starts_with("now_oth25"),
         starts_with("now_oth32")
       ),
-      recode_drugbin89
+      recode_drugbin89,
+      # Remove '_raw' suffix from new variable names
+      .names = "{stringr::str_remove(.col, '_raw$')}"
     )
   )
 
-# Derive oth7–9 and now_oth7–9 from multiple variables
-# Code review / MD (2025-12-05): Vectorised rewrite of the original rowwise() code to improve speed.
-# This reproduces the original behaviour.
-# Substantive logic to be revisited at a later debugging/clean-up stage.
-row_max_df <- function(df) {
-  do.call(pmax, c(df, list(na.rm = TRUE)))
+# Check: cross-tabs
+drug_pairs <- tibble::tibble(
+  raw_var = names(drug_rec) |>
+    stringr::str_subset("_raw$")
+) |>
+  dplyr::mutate(
+    rec_var = stringr::str_remove(raw_var, "_raw$"),
+    has_rec = rec_var %in% names(drug_rec)
+  ) |>
+  dplyr::filter(has_rec) |>
+  dplyr::select(raw_var, rec_var)
+
+drug_crosstabs <- drug_pairs |>
+  purrr::pmap(function(raw_var, rec_var) {
+    drug_rec |>
+      dplyr::count(
+        raw = .data[[raw_var]],
+        rec = .data[[rec_var]],
+        name = "n"
+      ) |>
+      dplyr::mutate(
+        raw_var = raw_var,
+        rec_var = rec_var,
+        .before = 1
+      )
+  })
+
+## Derive 'Other' --------------------------------------------------------------------
+
+# Helper function: Derive 'Other' drug use within a sweep.
+## Used to derive a single variable indicating if a person used drugs other than cannabis within a sweep.
+## It checks whether ANY drug was used (based on the separate recoded yes/no indicators for each drug).
+## Coded as 1 = 'yes' if any drug was used.
+## Else, coded as 0 = 'no' if all drugs were reported as not used (conservative).
+## Else, missing values follow a hierarchy.
+derive_drug_other_within <- function(cols) { # cols: tidyselect expression; call inside a dplyr verb such as mutate()
+  dplyr::case_when(
+    ## If any variable reported as yes -> 'yes' (1).
+    dplyr::if_any({{ cols }}, \(x) dplyr::coalesce(x == 1, FALSE)) ~ 1L,
+    ## ELSE: If all drugs reported as NOT having used -> 'no' (0).
+    dplyr::if_all({{ cols }}, \(x) !is.na(x) & x == 0) ~ 0L,
+    ## ELSE: If any -9 (refusal) -> -9.
+    dplyr::if_any({{ cols }}, \(x) dplyr::coalesce(x == -9, FALSE)) ~ -9L,
+    ## ELSE: If any -8 (dk/insufficient info) -> -8.
+    dplyr::if_any({{ cols }}, \(x) dplyr::coalesce(x == -8, FALSE)) ~ -8L,
+    ## ELSE: If any -1 (not applicable) -> -1.
+    dplyr::if_any({{ cols }}, \(x) dplyr::coalesce(x == -1, FALSE)) ~ -1L,
+    ## ELSE: -3 (not interviewed/asked etc.)
+    TRUE ~ -3L
+  )
 }
 
-drug_all <- drug_all |>
-  mutate(
-    # 25-sweep: use columns 2:9 (8 "other" vars)
-    othevr25 = row_max_df(
-      pick(starts_with("othevr25"))[2:9]
+drug_rec <- drug_rec |>
+  dplyr::mutate(
+    othevr25 = derive_drug_other_within(
+      starts_with("othevr25") & !ends_with("raw")
     ),
-    now_oth20 = row_max_df(
-      pick(starts_with("now_oth20"))[2:9]
+    othevr32 = derive_drug_other_within(
+      starts_with("othevr32") & !ends_with("raw")
     ),
-    now_oth25 = row_max_df(
-      pick(starts_with("now_oth25"))[2:9]
+    yr_oth25 = derive_drug_other_within(
+      starts_with("yr_oth25") & !ends_with("raw")
     ),
-    yr_oth25 = row_max_df(
-      pick(starts_with("yr_oth25"))[2:9]
+    yr_oth32 = derive_drug_other_within(
+      starts_with("yr_oth32") & !ends_with("raw")
     ),
-
-    # 32-sweep: use columns 2:10 (9 "other" vars)
-    othevr32 = row_max_df(
-      pick(starts_with("othevr32"))[2:10]
+    now_oth20 = derive_drug_other_within(
+      starts_with("now_oth20") & !ends_with("raw")
     ),
-    now_oth32 = row_max_df(
-      pick(starts_with("now_oth32"))[2:10]
+    now_oth25 = derive_drug_other_within(
+      starts_with("now_oth25") & !ends_with("raw")
     ),
-    yr_oth32 = row_max_df(
-      pick(starts_with("yr_oth32"))[2:10]
+    now_oth32 = derive_drug_other_within(
+      starts_with("now_oth32") & !ends_with("raw")
     )
   )
 
+## Derive EVER used --------------------------------------------------------------------
 
-# Derive: Ever used
-drug_all <- drug_all %>%
+# Derive 'ever used' for cannabis and other drugs across sweeps.
+# The function takes indicators from each sweep (1 = reported EVER using drug, 0 = reported not EVER using drug)
+# and derives a single EVER indicator.
+# Coded as 1 = 'yes' if drug was EVER used in any sweep.
+# Else, coded as 0 = 'no' if any sweep reported not EVER using drug (liberal).
+# Otherwise, missing values follow a hierarchy (-9, then -8, then -1, then -3).
+derive_drug_ever_across <- function(cols) { # cols: tidyselect expression; call inside a dplyr verb such as mutate()
+  dplyr::case_when(
+    ## If any sweep reported as ever used -> 'yes' (1).
+    dplyr::if_any({{ cols }}, \(x) dplyr::coalesce(x == 1, FALSE)) ~ 1L,
+    ## ELSE: If any sweep reported as NOT having ever used -> 'no' (0). NA is handled explicitly, like the other branches.
+    dplyr::if_any({{ cols }}, \(x) dplyr::coalesce(x == 0, FALSE)) ~ 0L,
+    ## ELSE: If any -9 (refusal) -> -9.
+    dplyr::if_any({{ cols }}, \(x) dplyr::coalesce(x == -9, FALSE)) ~ -9L,
+    ## ELSE: If any -8 (dk/insufficient info) -> -8.
+    dplyr::if_any({{ cols }}, \(x) dplyr::coalesce(x == -8, FALSE)) ~ -8L,
+    ## ELSE: If any -1 (not applicable) -> -1.
+    dplyr::if_any({{ cols }}, \(x) dplyr::coalesce(x == -1, FALSE)) ~ -1L,
+    ## ELSE: -3 (not interviewed/asked etc.)
+    TRUE ~ -3L
+  )
+}
+
+drug_rec_ever <- drug_rec %>%
   mutate(
-    drgcnbevr = pmax(
-      canevr14,
-      canevr15,
-      canevr16,
-      canevr17,
-      canevr19,
-      canevr20,
-      canevr25,
-      canevr32,
-      na.rm = FALSE
+    drgcnbevr = derive_drug_ever_across(
+      starts_with("canevr") & !ends_with("raw")
     ),
-    drgothevr = pmax(othevr19, othevr20, othevr25, othevr32, na.rm = FALSE)
+    drgothevr = derive_drug_ever_across(c(
+      othevr19,
+      othevr20,
+      othevr25,
+      othevr32
+    ))
   )
 
-# Derive: First time use
+## Derive first time use --------------------------------------------------------------------
+
+# This variable will record the first known age of using cannabis/other drugs.
 first_wave_age <- c(14, 15, 16, 17, 19, 20, 25, 32)
 cann_vars <- c(
   "canevr14",
@@ -649,40 +881,38 @@ cann_vars <- c(
 )
 oth_vars <- c("othevr19", "othevr20", "othevr25", "othevr32")
 
-drug_all <- drug_all %>%
+drug_rec_first <- drug_rec_ever |>
   mutate(
-    # first time reported using cannabis (14-32)
     drgcnbfst = case_when(
-      canevr14 == 1 ~ 14,
-      canevr15 == 1 ~ 15,
-      canevr16 == 1 ~ 16,
-      canevr17 == 1 ~ 17,
-      canevr19 == 1 ~ 19,
-      canevr20 == 1 ~ 20,
-      canevr25 == 1 ~ 25,
-      canevr32 == 1 ~ 32,
-      all(is.na(canevr14:canevr32)) ~ -3,
-      rowSums(select(., canevr14:canevr32), na.rm = TRUE) == 0 ~ 99,
-      TRUE ~ -2
+      canevr14 == 1 ~ 14L,
+      canevr15 == 1 ~ 15L,
+      canevr16 == 1 ~ 16L,
+      canevr17 == 1 ~ 17L,
+      canevr19 == 1 ~ 19L,
+      canevr20 == 1 ~ 20L,
+      canevr25 == 1 ~ 25L,
+      canevr32 == 1 ~ 32L,
+
+      # conservative "never": all included sweeps are exactly 0
+      if_all(all_of(cann_vars), ~ .x == 0) ~ 99L,
+
+      .default = -3L
     ),
-    # first time reported using other drugs (19-32)
     drgothfst = case_when(
-      othevr19 == 1 ~ 19,
-      othevr20 == 1 ~ 20,
-      othevr25 == 1 ~ 25,
-      othevr32 == 1 ~ 32,
-      all(is.na(c_across(c(othevr19, othevr20, othevr25, othevr32)))) ~ -3,
-      rowSums(
-        select(., othevr19, othevr20, othevr25, othevr32),
-        na.rm = TRUE
-      ) ==
-        0 ~ 99,
-      TRUE ~ -2
+      othevr19 == 1 ~ 19L,
+      othevr20 == 1 ~ 20L,
+      othevr25 == 1 ~ 25L,
+      othevr32 == 1 ~ 32L,
+
+      if_all(all_of(oth_vars), ~ .x == 0) ~ 99L,
+
+      .default = -3L
     )
   )
 
-# Derive: Current use
-drug_all <- drug_all %>%
+## Derive current use --------------------------------------------------------------------
+
+drug_rec_current <- drug_rec_first %>%
   mutate(
     drgcnbnw19 = case_when(
       canevr19 == 0 ~ 0,
@@ -723,45 +953,36 @@ drug_all <- drug_all %>%
     )
   )
 
-# Final selection
-drug_final <- drug_all %>%
+# Add labels and select variables
+drug_all_clean <- drug_rec_current %>%
+  select(-ends_with("raw")) %>%
   mutate(
     across(
       c(drgcnbevr, drgothevr, starts_with("drgcnbnw"), starts_with("drgothnw")),
-      ~ factor(
+      ~ labelled(
         .x,
-        levels = c(0, 1, -1, -2, -3, -8, -9),
         labels = c(
-          "No",
-          "Yes",
-          "Item not applicable",
-          "Script error/information lost",
-          "Not asked at the fieldwork stage/participated/interviewed",
-          "Don’t know/insufficient information",
-          "Refusal"
+          "No" = 0,
+          "Yes" = 1,
+          common_missing_labels
         )
       )
     ),
     across(
       c(drgcnbfst, drgothfst),
-      ~ factor(
+      ~ labelled(
         .x,
-        levels = c(14, 15, 16, 17, 19, 20, 25, 32, 99, -1, -2, -3, -8, -9),
         labels = c(
-          "Age 14",
-          "Age 15",
-          "Age 16",
-          "Age 17",
-          "Age 19",
-          "Age 20",
-          "Age 25",
-          "Age 32",
-          "Never used",
-          "Item not applicable",
-          "Script error/information lost",
-          "Not asked at the fieldwork stage/participated/interviewed",
-          "Don’t know/insufficient information",
-          "Refusal"
+          "Age 14" = 14,
+          "Age 15" = 15,
+          "Age 16" = 16,
+          "Age 17" = 17,
+          "Age 19" = 19,
+          "Age 20" = 20,
+          "Age 25" = 25,
+          "Age 32" = 32,
+          "Never used" = 99,
+          common_missing_labels
         )
       )
     )
@@ -796,7 +1017,12 @@ exercise_vars <- list(
 )
 
 # Merge all datasets
-spt_all <- reduce(exercise_vars, full_join, by = "NSID")
+spt_all <- reduce(exercise_vars, full_join, by = "NSID") %>%
+  # Add '_raw' suffix to all variable names for simpler re-coding & cross-checks
+  rename_with(
+    .fn = ~ stringr::str_c(.x, "_raw"),
+    .cols = !contains("NSID")
+  )
 
 # Recode function
 recode_exercise <- function(x) {
@@ -812,57 +1038,86 @@ recode_exercise <- function(x) {
   )
 }
 
-# Apply recoding
-spt_all <- spt_all %>%
+# Re-coding
+## At sweeps 8-9, the question wording changed,
+## asking about the number of days per week doing exercise for 30 mins or more.
+## For these sweeps, re-coding was done as follows:
+## - 5-7 days = "most days" (0)
+## - 2-4 days = "more than once a week" (1)
+## - 1 day = "once a week" (2)
+## - 0 days = "less than once a week/hardly ever/never" (3)
+spt_rec <- spt_all %>%
   mutate(
-    spt14 = recode_exercise(spt14),
-    spt15 = recode_exercise(spt15),
-    spt17 = recode_exercise(spt17),
-    spt19 = recode_exercise(spt19),
-    spt20 = recode_exercise(spt20),
+    spt14 = recode_exercise(spt14_raw),
+    spt15 = recode_exercise(spt15_raw),
+    spt17 = recode_exercise(spt17_raw),
+    spt19 = recode_exercise(spt19_raw),
+    spt20 = recode_exercise(spt20_raw),
     spt25 = case_when(
       # values from 0–7 days
-      spt25 %in% c(5, 6, 7) ~ 0,
-      spt25 %in% c(2, 3, 4) ~ 1,
-      spt25 == 1 ~ 2,
-      spt25 == 0 ~ 3,
-      spt25 == -9 ~ -9,
-      spt25 == -8 ~ -8,
-      spt25 == -1 ~ -1,
-      is.na(spt25) ~ -3
+      spt25_raw %in% c(5, 6, 7) ~ 0,
+      spt25_raw %in% c(2, 3, 4) ~ 1,
+      spt25_raw == 1 ~ 2,
+      spt25_raw == 0 ~ 3,
+      spt25_raw == -9 ~ -9,
+      spt25_raw == -8 ~ -8,
+      spt25_raw == -1 ~ -1,
+      is.na(spt25_raw) ~ -3
     ),
     spt32 = case_when(
-      spt32 %in% c(5, 6, 7) ~ 0,
-      spt32 %in% c(2, 3, 4) ~ 1,
-      spt32 == 1 ~ 2,
-      spt32 == 0 ~ 3,
-      spt32 == -9 ~ -9,
-      spt32 == -8 ~ -8,
-      spt32 == -1 ~ -1,
-      is.na(spt32) | spt32 == -3 ~ -3
+      spt32_raw %in% c(5, 6, 7) ~ 0,
+      spt32_raw %in% c(2, 3, 4) ~ 1,
+      spt32_raw == 1 ~ 2,
+      spt32_raw == 0 ~ 3,
+      spt32_raw == -9 ~ -9,
+      spt32_raw == -8 ~ -8,
+      spt32_raw == -1 ~ -1,
+      is.na(spt32_raw) | spt32_raw == -3 ~ -3
     )
   ) %>%
-  mutate(across(
-    c(starts_with("spt")),
-    ~ factor(
-      .x,
-      levels = c(0, 1, 2, 3, -1, -2, -3, -8, -9),
-      labels = c(
-        "Most days",
-        "More than once a week",
-        "Once a week",
-        "Less than once a week/hardly ever/never",
-        "Item not applicable",
-        "Script error/information lost",
-        "Not asked at the fieldwork stage/participated/interviewed",
-        "Don’t know/insufficient information",
-        "Refusal"
+  mutate(
+    across(
+      c(starts_with("spt") & !ends_with("raw")),
+      ~ labelled(
+        .x,
+        labels = c(
+          "Most days" = 0,
+          "More than once a week" = 1,
+          "Once a week" = 2,
+          "Less than once a week/hardly ever/never" = 3,
+          common_missing_labels
+        )
       )
     )
-  )) %>%
+  )
+
+# Cross-tabs
+spt_rec %>%
+  count(spt14_raw, spt14)
+
+spt_rec %>%
+  count(spt15_raw, spt15)
+
+spt_rec %>%
+  count(spt17_raw, spt17)
+
+spt_rec %>%
+  count(spt19_raw, spt19)
+
+spt_rec %>%
+  count(spt20_raw, spt20)
+
+spt_rec %>%
+  count(spt25_raw, spt25)
+
+spt_rec %>%
+  count(spt32_raw, spt32)
+
+spt_all <- spt_rec %>%
   select(NSID, spt14, spt15, spt17, spt19, spt20, spt25, spt32)
 
-# Absence --------------------------------------------------------------------
+# School absence --------------------------------------------------------------------
+
 # Load relevant sweep files and select variables
 absence_vars <- list(
   S1 = ns_data[["S1youngperson"]] %>%
@@ -876,7 +1131,12 @@ absence_vars <- list(
 )
 
 # Merge the datasets by NSID
-absence_all <- reduce(absence_vars, full_join, by = "NSID")
+absence_all <- reduce(absence_vars, full_join, by = "NSID") %>%
+  # Add '_raw' suffix to all variable names for simpler re-coding & cross-checks
+  rename_with(
+    .fn = ~ stringr::str_c(.x, "_raw"),
+    .cols = !contains("NSID")
+  )
 
 # Recode function for harmonised values
 recode_absence <- function(x) {
@@ -885,39 +1145,45 @@ recode_absence <- function(x) {
     x == 2 ~ 0, # no
     x %in% c(-97, -92) ~ -9,
     x %in% c(-91) ~ -1,
-    x %in% c(-96, -1) ~ -8,
-    x %in% c(-998, -997, -995, -99) ~ -2,
-    is.na(x) ~ -3,
-    TRUE ~ -3
+    x %in% c(-1) ~ -8,
+    x %in% c(-998, -997, -995) ~ -2,
+    .default = -3
   )
 }
 
 # Apply recode to each sweep
-absence_all <- absence_all %>%
+absence_rec <- absence_all %>%
   mutate(
-    abs1m14 = recode_absence(abs1m14),
-    abs1m15 = recode_absence(abs1m15),
-    abs1m16 = recode_absence(abs1m16)
+    abs1m14 = recode_absence(abs1m14_raw),
+    abs1m15 = recode_absence(abs1m15_raw),
+    abs1m16 = recode_absence(abs1m16_raw)
   ) %>%
   mutate(across(
-    starts_with("abs1m"),
-    ~ factor(
+    starts_with("abs1m") & !ends_with("raw"),
+    ~ labelled(
       .x,
-      levels = c(0, 1, -1, -2, -3, -8, -9),
       labels = c(
-        "No",
-        "Yes",
-        "Item not applicable",
-        "Script error/information lost",
-        "Not asked at the fieldwork stage/participated/interviewed",
-        "Don’t know/insufficient information",
-        "Refusal"
+        "No" = 0,
+        "Yes" = 1,
+        common_missing_labels
       )
     )
-  )) %>%
-  select(NSID, abs1m14, abs1m15, abs1m16)
+  ))
+
+absence_rec %>%
+  count(abs1m14_raw, abs1m14)
+
+absence_rec %>%
+  count(abs1m15_raw, abs1m15)
+
+absence_rec %>%
+  count(abs1m16_raw, abs1m16)
+
+absence_all <- absence_rec %>%
+  select(NSID, starts_with("abs1m") & !ends_with("raw"))
 
 # Suspended/Expelled --------------------------------------------------------------------
+
 # Load suspension and expulsion variables from each sweep
 suspend_expel_vars <- list(
   S1 = ns_data[["S1youngperson"]] %>%
@@ -931,7 +1197,12 @@ suspend_expel_vars <- list(
 )
 
 # Merge all datasets by NSID
-suspend_expel_all <- reduce(suspend_expel_vars, full_join, by = "NSID")
+suspend_expel_all <- reduce(suspend_expel_vars, full_join, by = "NSID") %>%
+  # Add '_raw' suffix to all variable names for simpler re-coding & cross-checks
+  rename_with(
+    .fn = ~ stringr::str_c(.x, "_raw"),
+    .cols = !contains("NSID")
+  )
 
 # Recode function
 recode_school_discipline <- function(x) {
@@ -940,7 +1211,7 @@ recode_school_discipline <- function(x) {
     x == 2 ~ 0, # no
     x %in% c(-97, -92) ~ -9,
     x %in% c(-91) ~ -1,
-    x %in% c(-96, -1) ~ -8,
+    x %in% c(-1) ~ -8,
     x %in% c(-99) ~ -3,
     is.na(x) ~ -3,
     TRUE ~ -3
@@ -948,33 +1219,51 @@ recode_school_discipline <- function(x) {
 }
 
 # Apply recoding
-suspend_expel_all <- suspend_expel_all %>%
+suspend_expel_rec <- suspend_expel_all %>%
   mutate(
-    susp14 = recode_school_discipline(susp14),
-    susp15 = recode_school_discipline(susp15),
-    susp16 = recode_school_discipline(susp16),
-    susp17 = recode_school_discipline(susp17),
-    expl14 = recode_school_discipline(expl14),
-    expl15 = recode_school_discipline(expl15),
-    expl16 = recode_school_discipline(expl16),
-    expl17 = recode_school_discipline(expl17)
+    susp14 = recode_school_discipline(susp14_raw),
+    susp15 = recode_school_discipline(susp15_raw),
+    susp16 = recode_school_discipline(susp16_raw),
+    susp17 = recode_school_discipline(susp17_raw),
+    expl14 = recode_school_discipline(expl14_raw),
+    expl15 = recode_school_discipline(expl15_raw),
+    expl16 = recode_school_discipline(expl16_raw),
+    expl17 = recode_school_discipline(expl17_raw)
   ) %>%
   mutate(across(
-    c(starts_with("abs1m"), starts_with("expl")),
-    ~ factor(
+    c(
+      starts_with("susp") & !ends_with("raw"),
+      starts_with("expl") & !ends_with("raw")
+    ),
+    ~ labelled(
       .x,
-      levels = c(0, 1, -1, -2, -3, -8, -9),
       labels = c(
-        "No",
-        "Yes",
-        "Item not applicable",
-        "Script error/information lost",
-        "Not asked at the fieldwork stage/participated/interviewed",
-        "Don’t know/insufficient information",
-        "Refusal"
+        "No" = 0,
+        "Yes" = 1,
+        common_missing_labels
       )
     )
-  )) %>%
+  ))
+
+# Cross-tabs
+suspend_expel_rec %>%
+  count(susp14_raw, susp14)
+suspend_expel_rec %>%
+  count(susp15_raw, susp15)
+suspend_expel_rec %>%
+  count(susp16_raw, susp16)
+suspend_expel_rec %>%
+  count(susp17_raw, susp17)
+suspend_expel_rec %>%
+  count(expl14_raw, expl14)
+suspend_expel_rec %>%
+  count(expl15_raw, expl15)
+suspend_expel_rec %>%
+  count(expl16_raw, expl16)
+suspend_expel_rec %>%
+  count(expl17_raw, expl17)
+
+suspend_expel_all <- suspend_expel_rec %>% # keep derived vars, drop *_raw
-  select(NSID, starts_with("susp"), starts_with("expl"))
+  select(NSID, (starts_with("susp") | starts_with("expl")) & !ends_with("raw"))
 
 # Truancy --------------------------------------------------------------------
@@ -1001,13 +1290,13 @@ recode_truancy_early <- function(ever, type) {
     type == 2 ~ 2, # several days at a time
     type == 3 ~ 3, # particular days or lessons
     type == 4 ~ 4, # odd day or lesson
-    type %in% c(-96, -1) ~ -8,
-    type %in% c(-97, -92) ~ -9,
-    type %in% c(-99) ~ -3,
-    type %in% c(-91) ~ -1,
-    is.na(type) & ever == 1 ~ -2,
-    is.na(type) & is.na(ever) ~ -3,
-    TRUE ~ -3
+    # If missing, use hierarchy derived from both ever and type:
+    # Else if either is refusal -> refusal (-9)
+    ever %in% c(-92, -97) | type %in% c(-92, -97) ~ -9,
+    # Else if either is don't know / insufficient info -> -8
+    ever == -1 | type == -1 ~ -8,
+    # Else is not interviewed/asked etc. -> -3
+    .default = -3
   )
 }
 
@@ -1019,7 +1308,7 @@ recode_truancy_s4 <- function(x) {
     x == 2 ~ 2,
     x == 3 ~ 3,
     x == 4 ~ 4,
-    x %in% c(-96, -1) ~ -8,
+    x %in% c(-1) ~ -8,
     x %in% c(-97, -92) ~ -9,
     x == -99 ~ -3,
     x == -91 ~ -1,
@@ -1029,7 +1318,7 @@ recode_truancy_s4 <- function(x) {
 }
 
 # Apply recoding
-truancy_all <- truancy_all %>%
+truancy_rec <- truancy_all %>%
   mutate(
     trua14 = recode_truancy_early(trua14_ever, trua14_type),
     trua15 = recode_truancy_early(trua15_ever, trua15_type),
@@ -1037,27 +1326,38 @@ truancy_all <- truancy_all %>%
     trua17 = recode_truancy_s4(trua17_raw)
   ) %>%
   mutate(across(
-    starts_with("trua"),
-    ~ factor(
+    c(trua14, trua15, trua16, trua17),
+    ~ labelled(
       .x,
-      levels = c(0, 1, 2, 3, 4, -1, -2, -3, -8, -9),
       labels = c(
-        "Never played truant",
-        "For weeks at a time",
-        "Several days at a time",
-        "Particular days or lessons",
-        "Odd day or lesson",
-        "Item not applicable",
-        "Script error/information lost",
-        "Not asked at the fieldwork stage/participated/interviewed",
-        "Don’t know/insufficient information",
-        "Refusal"
+        "Never played truant" = 0,
+        "For weeks at a time" = 1,
+        "Several days at a time" = 2,
+        "Particular days or lessons" = 3,
+        "Odd day or lesson" = 4,
+        common_missing_labels
       )
     )
-  )) %>%
+  ))
+
+# Cross-tabs
+truancy_rec %>%
+  count(trua14_ever, trua14_type, trua14)
+
+truancy_rec %>%
+  count(trua15_ever, trua15_type, trua15)
+
+truancy_rec %>%
+  count(trua16_ever, trua16_type, trua16)
+
+truancy_rec %>%
+  count(trua17, trua17_raw)
+
+truancy_all <- truancy_rec %>%
   select(NSID, trua14, trua15, trua16, trua17)
 
 # Police Contact --------------------------------------------------------------------
+
 # Load data for police contact
 police_vars <- list(
   S1 = ns_data[["S1youngperson"]] %>%
@@ -1089,7 +1389,12 @@ police_vars <- list(
 )
 
 # Merge datasets
-police_all <- reduce(police_vars, full_join, by = "NSID")
+police_all <- reduce(police_vars, full_join, by = "NSID") %>%
+  # Add '_raw' suffix to all variable names for simpler re-coding & cross-checks
+  rename_with(
+    .fn = ~ stringr::str_c(.x, "_raw"),
+    .cols = !contains("NSID")
+  )
 
 # Recode function for binary variables (pol15,16)
 recode_pol <- function(x) {
@@ -1098,10 +1403,10 @@ recode_pol <- function(x) {
     x == 2 ~ 0,
     x %in% c(-97, -92) ~ -9,
     x == -91 ~ -1,
-    x %in% c(-96, -1) ~ -8,
+    x %in% c(-1) ~ -8,
     x %in% c(-998, -997, -995) ~ -2,
     x %in% c(-99) ~ -3,
-    is.na(x) ~ -3
+    .default = -3
   )
 }
 
@@ -1110,22 +1415,101 @@ recode_cnt <- function(x, ever) {
   case_when(
     ever %in% c(2, 3) ~ 0,
     x >= 0 ~ x,
-    x %in% c(-97, -92) ~ -9,
-    x == -91 ~ -1,
-    x %in% c(-96, -1) ~ -8,
+    # Use hierarchy for missing values:
+    ever %in% c(-92, -97) | x %in% c(-92, -97) ~ -9,
+    ever == -1 | x == -1 ~ -8,
+    # Script errors/information lost:
     x %in% c(-998, -997, -995) ~ -2,
-    x %in% c(-99, -996) ~ -3,
-    is.na(x) ~ -3,
+    # Else is not interviewed/asked etc. -> -3
+    .default = -3
   )
 }
 
-# Apply recoding
-police_all <- police_all %>%
+## Police contact --------------------------------------------------------------------
+police_rec_contact <- police_all %>%
+  mutate(
+    # Police contact - binary:
+    pol14 = case_when(
+      pol14_raw %in% c(1, 3) ~ 1,
+      pol14_raw == 2 ~ 0,
+      pol14_raw %in% c(-97, -92) ~ -9,
+      pol14_raw == -91 ~ -1,
+      pol14_raw %in% c(-1) ~ -8,
+      pol14_raw %in% c(-99) ~ -3,
+      .default = -3
+    ),
+    pol15 = recode_pol(pol15_raw),
+    pol16 = recode_pol(pol16_raw),
+    pol17 = case_when(
+      pol17_raw %in% c(1, 3) ~ 1,
+      pol17_raw == 2 ~ 0,
+      pol17_raw %in% c(-97, -92) ~ -9,
+      pol17_raw == -91 ~ -1,
+      pol17_raw %in% c(-1) ~ -8,
+      pol17_raw %in% c(-99) ~ -3,
+      .default = -3
+    ),
+    # Police contact - count:
+    polcnt14 = recode_cnt(polcnt14_raw, pol14_raw),
+    polcnt15 = recode_cnt(polcnt15_raw, pol15_raw),
+    polcnt16 = recode_cnt(polcnt16_raw, pol16_raw),
+    polcnt17 = recode_cnt(polcnt17_raw, pol17_raw),
+    # Add labels
+    across(
+      c(pol14, pol15, pol16, pol17),
+      ~ labelled(
+        .x,
+        labels = c(
+          "No" = 0,
+          "Yes/not in last 3 years" = 1,
+          common_missing_labels
+        )
+      )
+    ),
+    across(
+      c(starts_with("polcnt") & !ends_with("raw")),
+      ~ labelled(
+        .x,
+        labels = c(
+          "Item not applicable" = -1,
+          "Script error/information lost" = -2,
+          "Not asked at the fieldwork stage/participated/interviewed" = -3,
+          "Don’t know/insufficient information" = -8,
+          "Refusal" = -9
+        )
+      )
+    )
+  )
+
+
+# Cross-tabs
+police_rec_contact %>%
+  count(pol14_raw, pol14)
+police_rec_contact %>%
+  count(pol15_raw, pol15)
+police_rec_contact %>%
+  count(pol16_raw, pol16)
+police_rec_contact %>%
+  count(pol17_raw, pol17)
+
+police_rec_contact %>%
+  count(pol14_raw, polcnt14_raw, polcnt14) %>%
+  print(n = Inf)
+police_rec_contact %>%
+  count(pol15_raw, polcnt15_raw, polcnt15) %>%
+  print(n = Inf)
+# For polcnt16_raw, -1 is dk/insufficient info (checked the data dictionary, label only missing in the Stata file).
+police_rec_contact %>%
+  count(pol16_raw, polcnt16_raw, polcnt16) %>%
+  print(n = Inf)
+police_rec_contact %>%
+  count(pol17_raw, polcnt17_raw, polcnt17) %>%
+  print(n = Inf)
+
+## Police warning, arrest, caution  --------------------------------------------------------------------
+
+police_rec_warning <- police_rec_contact %>%
   mutate(
-    polcnt14 = recode_cnt(polcnt14, pol14),
-    polcnt15 = recode_cnt(polcnt15, pol15),
-    polcnt16 = recode_cnt(polcnt16, pol16),
-    polcnt17 = recode_cnt(polcnt17, pol17),
     across(
       starts_with("polwrn"),
       ~ case_when(
@@ -1133,7 +1517,8 @@ police_all <- police_all %>%
         .x == 2 ~ 0,
         .x < 0 ~ .x,
         TRUE ~ -3
-      )
+      ),
+      .names = "{stringr::str_remove(.col, '_raw$')}"
     ),
     across(
       starts_with("polars"),
@@ -1142,7 +1527,8 @@ police_all <- police_all %>%
         .x == 2 ~ 0,
         .x < 0 ~ .x,
         TRUE ~ -3
-      )
+      ),
+      .names = "{stringr::str_remove(.col, '_raw$')}"
     ),
     across(
       starts_with("polcau"),
@@ -1151,8 +1537,60 @@ police_all <- police_all %>%
         .x == 2 ~ 0,
         .x < 0 ~ .x,
         TRUE ~ -3
-      )
+      ),
+      .names = "{stringr::str_remove(.col, '_raw$')}"
     ),
+    # Add labels
+    across(
+      c(
+        starts_with("polwrn") & !ends_with("raw"),
+        starts_with("polars") & !ends_with("raw"),
+        starts_with("polcau") & !ends_with("raw")
+      ),
+      ~ labelled(
+        .x,
+        labels = c(
+          "No" = 0,
+          "Yes" = 1,
+          common_missing_labels
+        )
+      )
+    )
+  )
+
+# Cross-tabs
+pol_warn_pairs <- tibble::tibble(
+  raw_var = names(police_rec_warning) |>
+    stringr::str_subset("^(polwrn|polars|polcau).*_raw$")
+) |>
+  dplyr::mutate(
+    rec_var = stringr::str_remove(raw_var, "_raw$"),
+    has_rec = rec_var %in% names(police_rec_warning)
+  ) |>
+  dplyr::filter(has_rec) |>
+  dplyr::select(raw_var, rec_var)
+
+police_warn_crosstabs <- pol_warn_pairs |>
+  purrr::pmap(function(raw_var, rec_var) {
+    police_rec_warning |>
+      dplyr::count(
+        raw = .data[[raw_var]],
+        rec = .data[[rec_var]],
+        name = "n"
+      ) |>
+      dplyr::mutate(
+        raw_var = raw_var,
+        rec_var = rec_var,
+        .before = 1
+      )
+  })
+
+police_warn_crosstabs
+
+## Found guilty, penalty notice  --------------------------------------------------------------------
+
+police_rec_guilty <- police_rec_warning %>%
+  mutate(
     across(
       starts_with("polglt"),
       ~ case_when(
@@ -1160,7 +1598,8 @@ police_all <- police_all %>%
         .x == 2 ~ 0,
         .x < 0 ~ .x,
         TRUE ~ -3
-      )
+      ),
+      .names = "{stringr::str_remove(.col, '_raw$')}"
     ),
     across(
       starts_with("polpnd"),
@@ -1169,84 +1608,38 @@ police_all <- police_all %>%
         .x == 2 ~ 0,
         .x < 0 ~ .x,
         TRUE ~ -3
-      )
-    )
-  ) %>%
-  mutate(
-    pol14 = case_when(
-      pol14 %in% c(1, 3) ~ 1,
-      pol14 == 2 ~ 0,
-      pol14 %in% c(-97, -92) ~ -9,
-      pol14 == -91 ~ -1,
-      pol14 %in% c(-96, -1) ~ -8,
-      pol14 %in% c(-99) ~ -3,
-      TRUE ~ -3
+      ),
+      .names = "{stringr::str_remove(.col, '_raw$')}"
     ),
-    pol15 = recode_pol(pol15),
-    pol16 = recode_pol(pol16),
-    pol17 = case_when(
-      pol17 %in% c(1, 3) ~ 1,
-      pol17 == 2 ~ 0,
-      pol17 %in% c(-97, -92) ~ -9,
-      pol17 == -91 ~ -1,
-      pol17 %in% c(-96, -1) ~ -8,
-      pol17 %in% c(-99) ~ -3,
-      TRUE ~ -3
-    )
-  ) %>%
-  mutate(
+    # Add labels
     across(
       c(
-        starts_with("polwrn"),
-        starts_with("polars"),
-        starts_with("polcau"),
-        starts_with("polglt"),
-        starts_with("polpnd")
+        starts_with("polglt") & !ends_with("raw"),
+        starts_with("polpnd") & !ends_with("raw")
       ),
-      ~ factor(
-        .x,
-        levels = c(0, 1, -1, -2, -3, -8, -9),
-        labels = c(
-          "No",
-          "Yes",
-          "Item not applicable",
-          "Script error/information lost",
-          "Not asked at the fieldwork stage/participated/interviewed",
-          "Don’t know/insufficient information",
-          "Refusal"
-        )
-      )
-    ),
-    across(
-      c(pol14, pol15, pol16, pol17),
-      ~ factor(
-        .x,
-        levels = c(0, 1, -1, -2, -3, -8, -9),
-        labels = c(
-          "No",
-          "Yes/not in last 3 years",
-          "Item not applicable",
-          "Script error/information lost",
-          "Not asked at the fieldwork stage/participated/interviewed",
-          "Don’t know/insufficient information",
-          "Refusal"
-        )
-      )
-    ),
-    across(
-      c(starts_with("polcnt")),
       ~ labelled(
         .x,
         labels = c(
-          "Item not applicable" = -1,
-          "Script error/information lost" = -2,
-          "Not asked at the fieldwork stage/participated/interviewed" = -3,
-          "Don’t know/insufficient information" = -8,
-          "Refusal" = -9
+          "No" = 0,
+          "Yes" = 1,
+          common_missing_labels
         )
       )
     )
-  ) %>%
+  )
+
+# Cross-tabs
+police_rec_guilty %>%
+  count(polglt25_raw, polglt25)
+police_rec_guilty %>%
+  count(polglt32_raw, polglt32)
+police_rec_guilty %>%
+  count(polpnd25_raw, polpnd25)
+police_rec_guilty %>%
+  count(polpnd32_raw, polpnd32)
+
+police_all <- police_rec_guilty %>%
+  select(!ends_with("raw")) %>%
   select(
     NSID,
     pol14,
@@ -1264,15 +1657,16 @@ police_all <- police_all %>%
     starts_with("polpnd")
   )
 
-# Bully --------------------------------------------------------------------
+# Bullying --------------------------------------------------------------------
+
 # Load and harmonise bullying variables across sweeps 1–4, 7–8
 bully_vars <- list(
   S1 = ns_data[["S1youngperson"]] %>%
-    select(NSID, bul14 = W1bulrc),
+    select(NSID, bul14_raw = W1bulrc),
   S2 = ns_data[["S2youngperson"]] %>%
-    select(NSID, bul15 = W2bulrc),
+    select(NSID, bul15_raw = W2bulrc),
   S3 = ns_data[["S3youngperson"]] %>%
-    select(NSID, bul16 = W3bulrc),
+    select(NSID, bul16_raw = W3bulrc),
   S4 = ns_data[["S4youngperson"]] %>%
     select(
       NSID,
@@ -1298,69 +1692,77 @@ recode_yesno <- function(x) {
     x == 1 ~ 1,
     x == 2 ~ 0,
     x %in% c(-92, -97) ~ -9,
-    x %in% c(-96, -8) ~ -8,
-    x == -99 ~ -3,
-    is.na(x) ~ -3,
-    TRUE ~ -2
+    x %in% c(-8, -1) ~ -8,
+    .default = -3
   )
 }
 
 # Apply recodes
-bully_all <- bully_all %>%
+bully_rec <- bully_all %>%
   mutate(
-    bul14 = recode_yesno(bul14),
-    bul15 = recode_yesno(bul15),
-    bul16 = recode_yesno(bul16),
+    bul14 = recode_yesno(bul14_raw),
+    bul15 = recode_yesno(bul15_raw),
+    bul16 = recode_yesno(bul16_raw),
     bul17 = case_when(
-      rowSums(across(starts_with("bul17_")) == 1, na.rm = TRUE) > 0 ~ 1,
-      rowSums(
-        across(c("bul17_1", "bul17_2", "bul17_4", "bul17_5", "bul17_6")) == 2,
-        na.rm = TRUE
-      ) ==
-        5 ~ 0,
-      rowSums(
-        across(c("bul17_1", "bul17_2", "bul17_4", "bul17_5", "bul17_6")) < 0,
-        na.rm = TRUE
-      ) >
-        0 ~ -8,
-      rowSums(is.na(across(starts_with("bul17_")))) == 6 ~ -3,
-      TRUE ~ -2
+      # If any of the 6 indicators of bullying in S4 is reported as "yes" (1) -> bullied (1)
+      if_any(starts_with("bul17_"), ~ .x == 1) ~ 1,
+      # Else if all 6 indicators of bullying in S4 are reported as "no" (2) -> not bullied (0)
+      if_all(starts_with("bul17_"), ~ .x == 2) ~ 0,
+      # Else if any of the 6 indicators of bullying in S4 is refused (-92, -97) -> refused (-9)
+      if_any(starts_with("bul17_"), ~ .x == -92 | .x == -97) ~ -9,
+      # Else if any of the 6 indicators of bullying in S4 is don't know/insufficient info (-1) -> -8
+      if_any(starts_with("bul17_"), ~ .x == -1) ~ -8,
+      # Else if any of the 6 indicators of bullying in S4 is not applicable (-91) -> not applicable (-1)
+      if_any(starts_with("bul17_"), ~ .x == -91) ~ -1,
+      # Else -> -3 (i.e. not interviewed/asked etc.)
+      .default = -3
     ),
     bul20 = case_when(
-      rowSums(
-        across(starts_with("W7BullyTypeYP0")) > 0 &
-          across(starts_with("W7BullyTypeYP0")) < 8,
-        na.rm = TRUE
-      ) >
-        0 ~ 1,
-      rowSums(across(starts_with("W7BullyTypeYP0")) == 8, na.rm = TRUE) ==
-        6 ~ 0,
-      rowSums(across(starts_with("W7BullyTypeYP0")) < 0, na.rm = TRUE) > 0 ~ -8,
-      rowSums(is.na(across(starts_with("W7BullyTypeYP0")))) == 6 ~ -3,
-      TRUE ~ -2
+      # If any of the indicators of bullying in S7 reported as between 1 (every day) and 7 (<once a month) -> bullied (1)
+      if_any(starts_with("W7BullyTypeYP0"), ~ .x %in% 1:7) ~ 1,
+      # Else if all of the indicators of bullying in S7 are reported as 8 (never) -> not bullied (0)
+      if_all(starts_with("W7BullyTypeYP0"), ~ .x == 8) ~ 0,
+      # Else if any of the indicators of bullying in S7 is refused (-92, -97) -> refused (-9)
+      if_any(starts_with("W7BullyTypeYP0"), ~ .x %in% c(-92, -97)) ~ -9,
+      # Else if any of the indicators of bullying in S7 is not applicable (-91) -> -1
+      if_any(starts_with("W7BullyTypeYP0"), ~ .x == -91) ~ -1,
+      # Else -> -3 (i.e. not interviewed/asked etc.)
+      .default = -3
     ),
     bul25 = case_when(
-      rowSums(across(starts_with("W8BULLYTYPE0")) == 1, na.rm = TRUE) > 0 ~ 1,
-      rowSums(across(starts_with("W8BULLYTYPE0")) == 2, na.rm = TRUE) == 7 ~ 0,
-      rowSums(across(starts_with("W8BULLYTYPE0")) < 0, na.rm = TRUE) > 0 ~ -8,
-      rowSums(is.na(across(starts_with("W8BULLYTYPE0")))) == 7 ~ -3,
-      TRUE ~ -2
+      # If any of the indicators of bullying in S8 reported as 1 (yes) -> bullied (1)
+      if_any(starts_with("W8BULLYTYPE0"), ~ .x == 1) ~ 1,
+      # Else if all of the indicators of bullying in S8 are reported as 2 (no) -> not bullied (0)
+      if_all(starts_with("W8BULLYTYPE0"), ~ .x == 2) ~ 0,
+      # Else if any of the indicators of bullying in S8 is refused (-9) -> refused (-9)
+      if_any(starts_with("W8BULLYTYPE0"), ~ .x == -9) ~ -9,
+      # Else if any of the indicators of bullying in S8 is don't know/insufficient info (-8) -> -8
+      if_any(starts_with("W8BULLYTYPE0"), ~ .x == -8) ~ -8,
+      # Else if any of the indicators of bullying in S8 is not applicable (-1) -> not applicable (-1)
+      if_any(starts_with("W8BULLYTYPE0"), ~ .x == -1) ~ -1,
+      # Else (any -3, all indicators NA, or any other unmatched pattern) -> -3, so bul25 is never left NA (same fallback as bul17/bul20)
+      .default = -3
     )
   ) %>%
   mutate(across(
-    starts_with("bul"),
-    ~ factor(
+    c(bul14, bul15, bul16, bul17, bul20, bul25), # not the bul17_* items
+    ~ labelled(
       .x,
-      levels = c(0, 1, -1, -2, -3, -8, -9),
       labels = c(
-        "No",
-        "Yes",
-        "Item not applicable",
-        "Script error/information lost",
-        "Not asked at the fieldwork stage/participated/interviewed",
-        "Don’t know/insufficient information",
-        "Refusal"
+        "No" = 0,
+        "Yes" = 1,
+        common_missing_labels
       )
     )
-  )) %>%
+  ))
+
+# Cross-tabs
+bully_rec %>%
+  count(bul14_raw, bul14)
+bully_rec %>%
+  count(bul15_raw, bul15)
+bully_rec %>%
+  count(bul16_raw, bul16)
+
+bully_all <- bully_rec %>%
   select(NSID, bul14, bul15, bul16, bul17, bul20, bul25)
diff --git a/build-core-dataset.R b/build-core-dataset.R
index be2f3e8..494f02f 100644
--- a/build-core-dataset.R
+++ b/build-core-dataset.R
@@ -58,26 +58,24 @@ long_vars <- read_dta(file.path(data_path, sweeps$longitudinal)) %>%
         W8OUTCOME,
         W9OUTCOME
       ),
-      ~ factor(
+      ~ labelled(
         .x,
-        levels = c(1, 2, 3, 4, 5, 6, -1),
         labels = c(
-          "Productive",
-          "Refusal",
-          "Non-contact and other unproductive",
-          "Ineligible",
-          "Untraced",
-          "Not issued",
-          "No contact"
+          "Productive" = 1,
+          "Refusal" = 2,
+          "Non-contact and other unproductive" = 3,
+          "Ineligible" = 4,
+          "Untraced" = 5,
+          "Not issued" = 6,
+          "No contact" = -1
         )
       )
     ),
-    DATA_AVAILABILITY = factor(
+    DATA_AVAILABILITY = labelled(
       DATA_AVAILABILITY,
-      levels = c(0, 1),
-      labels = c("Not available", "Available for research")
+      labels = c("Not available" = 0, "Available for research" = 1)
     ),
-    MAINBOOST = factor(MAINBOOST, levels = c(1, 2), labels = c("Main", "Boost"))
+    MAINBOOST = labelled(MAINBOOST, labels = c("Main" = 1, "Boost" = 2))
   )
 
 # Merge All Datasets --------------------------------------------------------------------
@@ -113,7 +111,7 @@ derived_vars <- list(
   lsi_all,
   smoking_all,
   alc_all_clean,
-  drug_final,
+  drug_all_clean,
   spt_all,
   absence_all,
   suspend_expel_all,