diff --git a/R/03-socioeconomic.R b/R/03-socioeconomic.R index 0d4e2bd..279bf92 100644 --- a/R/03-socioeconomic.R +++ b/R/03-socioeconomic.R @@ -12,152 +12,162 @@ ecoact_vars <- list( S1 = ns_data[["S1youngperson"]] %>% select(NSID), S4 = ns_data[["S4youngperson"]] %>% - select(NSID, ecoact17 = W4empsYP), + select(NSID, ecoact17_raw = W4empsYP), S5 = ns_data[["S5youngperson"]] %>% - select(NSID, ecoact18 = W5mainactYP), + select(NSID, ecoact18_raw = W5mainactYP), S6 = ns_data[["S6youngperson"]] %>% - select(NSID, ecoact19 = W6TCurrentAct), + select(NSID, ecoact19_raw = W6TCurrentAct), S7 = ns_data[["S7youngperson"]] %>% - select(NSID, ecoact20 = W7TCurrentAct), + select(NSID, ecoact20_raw = W7TCurrentAct), S8 = ns_data[["S8derivedvariable"]] %>% - select(NSID, ecoactadu25 = W8DACTIVITYC), + select(NSID, ecoactadu25_raw = W8DACTIVITYC), S9 = ns_data[["S9derivedvariable"]] %>% - select(NSID, ecoactadu32 = W9DACTIVITYC) + select(NSID, ecoactadu32_raw = W9DACTIVITYC) ) + # Merge by NSID ecoact_all <- reduce(ecoact_vars, full_join, by = "NSID") # Harmonise missing values and derive economic activity variables -ecoact_all <- ecoact_all %>% +ecoact_rec <- ecoact_all %>% mutate( ## Sweep 4 ecoact17 = case_when( - ecoact17 %in% 1:2 ~ 1, # In paid work - ecoact17 == 4 ~ 2, # Apprenticeship/government training scheme/training - ecoact17 == 5 | ecoact17 == -91 ~ 3, # Education - ecoact17 == 3 ~ 4, # Unemployed - ecoact17 == 6 ~ 5, # Looking after home/family - ecoact17 %in% c(7, 8, 9) ~ 6, # Sick/disabled, other, doing something else - ecoact17 == -92 ~ -9, - ecoact17 == -999 ~ -2, - ecoact17 == -94 ~ -8, + ecoact17_raw %in% 1:2 ~ 1, # In paid work + ecoact17_raw == 4 ~ 2, # Apprenticeship/government training scheme/training + ecoact17_raw == 5 | ecoact17_raw == -91 ~ 3, # Education + ecoact17_raw == 3 ~ 4, # Unemployed + ecoact17_raw == 6 ~ 5, # Looking after home/family + ecoact17_raw %in% c(7, 8, 9) ~ 6, # Sick/disabled, other, doing something else + ecoact17_raw == -92 ~ 
-9, + ecoact17_raw == -999 ~ -2, + ecoact17_raw == -94 ~ -8, TRUE ~ -3 ), ## Sweep 5 ecoact18 = case_when( - ecoact18 == 3 ~ 1, - ecoact18 %in% c(1, 5, 6) ~ 2, - ecoact18 %in% c(2, 4) ~ 3, - ecoact18 == 7 ~ 4, - ecoact18 == 8 ~ 5, - ecoact18 %in% 9:11 ~ 6, - ecoact18 == -94 ~ -8, + ecoact18_raw == 3 ~ 1, + ecoact18_raw %in% c(1, 5, 6) ~ 2, + ecoact18_raw %in% c(2, 4) ~ 3, + ecoact18_raw == 7 ~ 4, + ecoact18_raw == 8 ~ 5, + ecoact18_raw %in% 9:11 ~ 6, + ecoact18_raw == -94 ~ -8, TRUE ~ -3 ), ## Sweep 6 ecoact19 = case_when( - ecoact19 == 3 ~ 1, - ecoact19 %in% c(4, 5) ~ 2, - ecoact19 %in% c(1, 2, 10) ~ 3, - ecoact19 == 8 ~ 4, - ecoact19 == 7 ~ 5, - ecoact19 %in% c(6, 9, 11) ~ 6, - ecoact19 == -91 ~ -8, + ecoact19_raw == 3 ~ 1, + ecoact19_raw %in% c(4, 5) ~ 2, + ecoact19_raw %in% c(1, 2, 10) ~ 3, + ecoact19_raw == 8 ~ 4, + ecoact19_raw == 7 ~ 5, + ecoact19_raw %in% c(6, 9, 11) ~ 6, + ecoact19_raw == -91 ~ -8, TRUE ~ -3 ), ## Sweep 7 ecoact20 = case_when( - ecoact20 == 3 ~ 1, - ecoact20 %in% c(4, 5, 11) ~ 2, - ecoact20 %in% c(1, 2, 9) ~ 3, - ecoact20 == 8 ~ 4, - ecoact20 == 7 ~ 5, - ecoact20 %in% c(6, 10, 12:15) ~ 6, - ecoact20 == -91 ~ -1, + ecoact20_raw == 3 ~ 1, + ecoact20_raw %in% c(4, 5, 11) ~ 2, + ecoact20_raw %in% c(1, 2, 9) ~ 3, + ecoact20_raw == 8 ~ 4, + ecoact20_raw == 7 ~ 5, + ecoact20_raw %in% c(6, 10, 12:15) ~ 6, + ecoact20_raw == -91 ~ -1, TRUE ~ -3 ), ## Sweep 8 ecoact25 = case_when( - ecoactadu25 %in% c(1, 2) ~ 1, - ecoactadu25 %in% c(6, 7) ~ 2, - ecoactadu25 == 5 ~ 3, - ecoactadu25 == 4 ~ 4, - ecoactadu25 == 9 ~ 5, - ecoactadu25 %in% c(3, 8, 10) ~ 6, - ecoactadu25 == -9 ~ -9, - ecoactadu25 == -8 ~ -8, - ecoactadu25 == -1 ~ -1, + ecoactadu25_raw %in% c(1, 2) ~ 1, + ecoactadu25_raw %in% c(6, 7) ~ 2, + ecoactadu25_raw == 5 ~ 3, + ecoactadu25_raw == 4 ~ 4, + ecoactadu25_raw == 9 ~ 5, + ecoactadu25_raw %in% c(3, 8, 10) ~ 6, + ecoactadu25_raw == -9 ~ -9, + ecoactadu25_raw == -8 ~ -8, + ecoactadu25_raw == -1 ~ -1, TRUE ~ -3 ), ## Sweep 9 ecoact32 = 
case_when( - ecoactadu32 %in% c(1, 2) ~ 1, - ecoactadu32 %in% c(6, 7) ~ 2, - ecoactadu32 == 5 ~ 3, - ecoactadu32 == 4 ~ 4, - ecoactadu32 == 9 ~ 5, - ecoactadu32 %in% c(3, 8, 10) ~ 6, - ecoactadu32 == -9 ~ -9, - ecoactadu32 == -8 ~ -8, - ecoactadu32 == -1 ~ -1, + ecoactadu32_raw %in% c(1, 2) ~ 1, + ecoactadu32_raw %in% c(6, 7) ~ 2, + ecoactadu32_raw == 5 ~ 3, + ecoactadu32_raw == 4 ~ 4, + ecoactadu32_raw == 9 ~ 5, + ecoactadu32_raw %in% c(3, 8, 10) ~ 6, + ecoactadu32_raw == -9 ~ -9, + ecoactadu32_raw == -8 ~ -8, + ecoactadu32_raw == -1 ~ -1, TRUE ~ -3 ), ## Detailed versions (S8, S9 only) ecoactadu25 = case_when( - !is.na(ecoactadu25) ~ ecoactadu25, - is.na(ecoactadu25) ~ -3 + !is.na(ecoactadu25_raw) ~ ecoactadu25_raw, + is.na(ecoactadu25_raw) ~ -3 ), ecoactadu32 = case_when( - !is.na(ecoactadu32) ~ ecoactadu32, - is.na(ecoactadu32) ~ -3 + !is.na(ecoactadu32_raw) ~ ecoactadu32_raw, + is.na(ecoactadu32_raw) ~ -3 ) ) %>% mutate( across( c(ecoact17, ecoact18, ecoact19, ecoact20, ecoact25, ecoact32), - ~ factor( + ~ labelled( .x, - levels = c(1, 2, 3, 4, 5, 6, -1, -2, -3, -8, -9), labels = c( - "In paid work", - "Apprenticeship/government training scheme/training", - "Education", - "Unemployed", - "Looking after home", - "Other", - "Item not applicable", - "Script error/information lost", - "Not asked at the fieldwork stage/participated/interviewed", - "Don’t know/insufficient information", - "Refusal" + "In paid work" = 1L, + "Apprenticeship/government training scheme/training" = 2L, + "Education" = 3L, + "Unemployed" = 4L, + "Looking after home" = 5L, + "Other" = 6L, + common_missing_labels ) ) ), across( c(ecoactadu25, ecoactadu32), - ~ factor( + ~ labelled( .x, - levels = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -2, -3, -8, -9), labels = c( - "employee – in paid work", - "self employed", - "voluntary work", - "Unemployed", - "Education", - "Apprenticeship", - "government employment scheme", - "sick/disabled", - "Looking after home/family", - "Something else", - "Item 
not applicable", - "Script error/information lost", - "Not asked at the fieldwork stage/participated/interviewed", - "Don’t know/insufficient information", - "Refusal" + "employee – in paid work" = 1L, + "self employed" = 2L, + "voluntary work" = 3L, + "Unemployed" = 4L, + "Education" = 5L, + "Apprenticeship" = 6L, + "government employment scheme" = 7L, + "sick/disabled" = 8L, + "Looking after home/family" = 9L, + "Something else" = 10L, + common_missing_labels ) ) ) - ) %>% + ) + +# Checks +ecoact_rec %>% + count(ecoact17_raw, ecoact17) + +ecoact_rec %>% + count(ecoact18_raw, ecoact18) + +ecoact_rec %>% + count(ecoact19_raw, ecoact19) + +ecoact_rec %>% + count(ecoact20_raw, ecoact20) + +ecoact_rec %>% + count(ecoactadu25_raw, ecoact25) + +# Extract variables +ecoact_all <- ecoact_rec %>% select( NSID, ecoact17, @@ -171,6 +181,7 @@ ecoact_all <- ecoact_all %>% ) # Economic Activity Parents -------------------------------------------------------------------- + # Load & select parental employment variables for Sweeps 1–4 ecoactDT_parents_vars <- list( S1 = ns_data[["S1familybackground"]] %>% @@ -183,11 +194,17 @@ ecoactDT_parents_vars <- list( select(NSID, ecoactdtma17 = w4empsmum, ecoactdtpa17 = w4empsdad) ) -# Merge all -ecoactDT_parents_all <- reduce(ecoactDT_parents_vars, full_join, by = "NSID") +ecoactDT_parents_all <- ecoactDT_parents_vars %>% + # Merge all + reduce(full_join, by = "NSID") %>% + # Add '_raw' suffix to all 'ecoact*' variable names for simpler re-coding & cross-checks + rename_with( + ~ stringr::str_c(.x, "_raw"), + contains("ecoact") + ) # Recode helper function -recode_detailed <- function(x) { +recode_ecoactDT <- function(x) { case_when( x == 1 ~ 1, # FT x == 2 ~ 2, # PT @@ -208,44 +225,76 @@ recode_detailed <- function(x) { } # Apply recode to each sweep -ecoactDT_parents_all <- ecoactDT_parents_all %>% +ecoactDT_parents_rec <- ecoactDT_parents_all %>% mutate( - ecoactdtma14 = recode_detailed(ecoactdtma14), - ecoactdtpa14 = 
recode_detailed(ecoactdtpa14), - ecoactdtma15 = recode_detailed(ecoactdtma15), - ecoactdtpa15 = recode_detailed(ecoactdtpa15), - ecoactdtma16 = recode_detailed(ecoactdtma16), - ecoactdtpa16 = recode_detailed(ecoactdtpa16), - ecoactdtma17 = recode_detailed(ecoactdtma17), - ecoactdtpa17 = recode_detailed(ecoactdtpa17) + ecoactdtma14 = recode_ecoactDT(ecoactdtma14_raw), + ecoactdtpa14 = recode_ecoactDT(ecoactdtpa14_raw), + ecoactdtma15 = recode_ecoactDT(ecoactdtma15_raw), + ecoactdtpa15 = recode_ecoactDT(ecoactdtpa15_raw), + ecoactdtma16 = recode_ecoactDT(ecoactdtma16_raw), + ecoactdtpa16 = recode_ecoactDT(ecoactdtpa16_raw), + ecoactdtma17 = recode_ecoactDT(ecoactdtma17_raw), + ecoactdtpa17 = recode_ecoactDT(ecoactdtpa17_raw) ) %>% - mutate(across( - c(starts_with("ecoactdtma"), starts_with("ecoactdtpa")), - ~ factor( - .x, - levels = c(1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -2, -3, -8, -9), - labels = c( - "FT paid work", - "PT paid work", - "Unemployed", - "Training", - "Education", - "Looking after home/family", - "Retired from work altogether", - "Sick/disabled", - "Other", - "Item not applicable", - "Script error/information lost", - "Not asked at the fieldwork stage/participated/interviewed", - "Don’t know/insufficient information", - "Refusal" + mutate( + across( + c(starts_with("ecoactdt") & !ends_with("raw")), + ~ labelled( + .x, + labels = c( + "FT paid work" = 1, + "PT paid work" = 2, + "Unemployed" = 3, + "Training" = 4, + "Education" = 5, + "Looking after home/family" = 6, + "Retired from work altogether" = 7, + "Sick/disabled" = 8, + "Other" = 9, + common_missing_labels + ) ) ) - )) %>% + ) + +# Checks + +ecoactDT_names <- ecoactDT_parents_rec %>% + dplyr::select(!ends_with("raw"), -NSID) %>% + names() + +ecoactDT_pairs <- tibble( + y = ecoactDT_names, + x = str_c(ecoactDT_names, "_raw") +) + +make_crosstab <- function(data, x, y) { + data |> + count( + across(all_of(c(x, y))), + name = "n" + ) +} + +ecoactDT_crosstabs <- ecoactDT_pairs |> + mutate( + crosstab = 
map2( + x, + y, + ~ make_crosstab(ecoactDT_parents_rec, .x, .y) + ) + ) + +ecoactDT_crosstabs %>% + pull(crosstab) + +# Extract variables +ecoactDT_parents_all <- ecoactDT_parents_rec %>% select(NSID, starts_with("ecoactdtma"), starts_with("ecoactdtpa")) # NS-SEC Own -------------------------------------------------------------------- -# Load NS-SEC variables from relevant sweeps + +# Import NS-SEC variables from relevant sweeps nssec_vars <- list( S1 = ns_data[["S1youngperson"]] %>% select(NSID), S4 = ns_data[["S4youngperson"]] %>% @@ -263,118 +312,199 @@ nssec_vars <- list( ) # Merge all NS-SEC variables by NSID -nssec_all <- reduce(nssec_vars, full_join, by = "NSID") +nssec_all <- reduce(nssec_vars, full_join, by = "NSID") %>% + # Add '_raw' suffix to all 'nssec*' variable names for simpler re-coding & cross-checks + rename_with( + ~ stringr::str_c(.x, "_raw"), + contains("nssec") + ) -# Harmonise NS-SEC values and derive categories +## Fix source variable labels -------------------------------------------------------------------- + +# This is done for simpler validation. 
+nssec_s4_s5_s6_s7_missing <- c( + `YP Not interviewed` = -99, + `Not applicable` = -91 +) + +nssec_s8_missing <- c( + `Refused` = -9, + `Insufficient information` = -8, + `Not applicable` = -1 +) + +nssec_labels_core <- c( + `Employers in large organisations` = 1, + `Higher managerial occupations` = 2, + `Higher professional traditional employee` = 3.1, + `Higher professional new employee` = 3.2, + `Higher professional traditional self emp` = 3.3, + `Higher professional new self emp` = 3.4, + `Lower professional traditional employee` = 4.1, + `Lower professional new employee` = 4.2, + `Lower professional traditional self emp` = 4.3, + `Lower professional new self emp` = 4.4, + `Lower managerial occupations` = 5, + `Higher supervisory occupations` = 6, + `Intermediate clerical and administrative` = 7.1, + `Intermediate sales and service` = 7.2, + `Intermediate technical and auxiliary` = 7.3, + `Intermediate engineering` = 7.4, + `Employers in small orgs non-professional` = 8.1, + `Employers in small orgs agriculture` = 8.2, + `Own account workers non professional` = 9.1, + `Own account workers agriculture` = 9.2, + `Lower supervisory occupations` = 10, + `Lower technical craft` = 11.1, + `Lower technical process operative` = 11.2, + `Semi routine sales` = 12.1, + `Semi routine services` = 12.2, + `Semi routine technical` = 12.3, + `Semi routine operative` = 12.4, + `Semi routine agricultural` = 12.5, + `Semi routine clerical` = 12.6, + `Semi routine childcare` = 12.7, + `Routine sales and service` = 13.1, + `Routine production` = 13.2, + `Routine technical` = 13.3, + `Routine operative` = 13.4, + `Routine agricultural` = 13.5, + `Never worked` = 14.1, + `Long-term unemployed` = 14.2, + `Not working` = 14.3, + `Full-time students` = 15, + `Not classified or inadequately stated` = 16, + `Not classifiable for other reasons` = 17 +) + +# Apply common labels & sweep-specific features nssec_all <- nssec_all %>% + mutate( + # Sweeps with common labels + across( + 
c(nssec17_raw, nssec18_raw, nssec19_raw, nssec20_raw), + ~ labelled(.x, labels = c(nssec_s4_s5_s6_s7_missing, nssec_labels_core)) + ), + # S8: Same occupation labels & values, different missing values + nssec25_raw = labelled( + nssec25_raw, + labels = c(nssec_s8_missing, nssec_labels_core) + ) + ) + +## Recode -------------------------------------------------------------------- + +# Harmonise NS-SEC values and derive categories +nssec_rec <- nssec_all %>% mutate( ## Sweep 4 (age 17) nssec17 = case_when( - is.na(nssec17) ~ -3, - floor(nssec17) %in% 1:17 ~ floor(nssec17), - nssec17 == -91 ~ -1, - nssec17 == -99 ~ -3, + is.na(nssec17_raw) ~ -3, + floor(nssec17_raw) %in% 1:17 ~ floor(nssec17_raw), + nssec17_raw == -91 ~ -1, + nssec17_raw == -99 ~ -3, TRUE ~ -3 ), ## Sweep 5 (age 18) nssec18 = case_when( - is.na(nssec18) ~ -3, - floor(nssec18) %in% 1:17 ~ floor(nssec18), - nssec18 == -91 ~ -1, - nssec18 == -99 ~ -3, + is.na(nssec18_raw) ~ -3, + floor(nssec18_raw) %in% 1:17 ~ floor(nssec18_raw), + nssec18_raw == -91 ~ -1, + nssec18_raw == -99 ~ -3, TRUE ~ -3 ), ## Sweep 6 (age 19) nssec19 = case_when( - is.na(nssec19) ~ -3, - floor(nssec19) %in% 1:17 ~ floor(nssec19), - nssec19 == -91 ~ -1, - nssec19 == -99 ~ -3, + is.na(nssec19_raw) ~ -3, + floor(nssec19_raw) %in% 1:17 ~ floor(nssec19_raw), + nssec19_raw == -91 ~ -1, + nssec19_raw == -99 ~ -3, TRUE ~ -3 ), ## Sweep 7 (age 20) nssec20 = case_when( - is.na(nssec20) ~ -3, - floor(nssec20) %in% 1:17 ~ floor(nssec20), - nssec20 == -91 ~ -1, - nssec20 == -99 ~ -3, + is.na(nssec20_raw) ~ -3, + floor(nssec20_raw) %in% 1:17 ~ floor(nssec20_raw), + nssec20_raw == -91 ~ -1, + nssec20_raw == -99 ~ -3, TRUE ~ -3 ), ## Sweep 8 (age 25) nssec25 = case_when( - is.na(nssec25) ~ -3, - floor(nssec25) %in% 1:14 ~ floor(nssec25), + is.na(nssec25_raw) ~ -3, + floor(nssec25_raw) %in% 1:14 ~ floor(nssec25_raw), ecoactadu25 == 5 ~ 15, # full-time student - nssec25 == -9 ~ -9, - nssec25 == -8 ~ -8, - nssec25 == -1 ~ -1, + nssec25_raw == -9 ~ 
-9, + nssec25_raw == -8 ~ -8, + nssec25_raw == -1 ~ -1, ), ## Sweep 9 (age 32) nssec32 = case_when( - is.na(nssec32) ~ -3, - nssec32 %in% 1:17 ~ nssec32, - nssec32 == -9 ~ -9, - nssec32 == -8 ~ -8, - nssec32 == -1 ~ -1 + is.na(nssec32_raw) ~ -3, + nssec32_raw %in% 1:17 ~ nssec32_raw, + nssec32_raw == -9 ~ -9, + nssec32_raw == -8 ~ -8, + nssec32_raw == -1 ~ -1 ) ) %>% - mutate(across( - starts_with("nssec"), - ~ factor( - .x, - levels = c( - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - -1, - -2, - -3, - -8, - -9 - ), - labels = c( - "Employers in large organisations", - "Higher managerial and administrative occupations", - "Higher professional occupations", - "Lower professional and higher technical occupations", - "Lower managerial and administrative occupations", - "Higher supervisory occupations", - "Intermediate occupations", - "Employers in small establishments", - "Own account workers", - "Lower supervisory occupations", - "Lower technical occupations", - "Semi-routine occupations", - "Routine occupations", - "Never worked and long-term unemployed", - "Full-time student", - "Not classified or inadequately stated", - "Not classifiable for other reasons", - "Item not applicable", - "Script error/information lost", - "Not asked at the fieldwork stage/participated/interviewed", - "Don’t know/insufficient information", - "Refusal" + mutate( + across( + starts_with("nssec") & !ends_with("raw"), + ~ labelled( + .x, + labels = c( + "Employers in large organisations" = 1, + "Higher managerial and administrative occupations" = 2, + "Higher professional occupations" = 3, + "Lower professional and higher technical occupations" = 4, + "Lower managerial and administrative occupations" = 5, + "Higher supervisory occupations" = 6, + "Intermediate occupations" = 7, + "Employers in small establishments" = 8, + "Own account workers" = 9, + "Lower supervisory occupations" = 10, + "Lower technical occupations" = 11, + "Semi-routine 
occupations" = 12, + "Routine occupations" = 13, + "Never worked and long-term unemployed" = 14, + "Full-time student" = 15, + "Not classified or inadequately stated" = 16, + "Not classifiable for other reasons" = 17, + common_missing_labels + ) ) ) - )) %>% + ) + +## Checks -------------------------------------------------------------------- + +nssec_names <- nssec_rec %>% + dplyr::select(starts_with("nssec") & !ends_with("raw")) %>% + names() + +nssec_pairs <- tibble( + y = nssec_names, + x = str_c(nssec_names, "_raw") +) + +nssec_crosstabs <- nssec_pairs |> + mutate( + crosstab = map2( + x, + y, + ~ make_crosstab(nssec_rec, .x, .y) + ) + ) + +nssec_crosstabs %>% + pull(crosstab) %>% + purrr::walk(~ print(.x, n = Inf)) + +nssec_all <- nssec_rec %>% select(NSID, nssec17, nssec18, nssec19, nssec20, nssec25, nssec32) # NS-SEC Parents -------------------------------------------------------------------- + # Load and select parental NS-SEC variables from Sweeps 1–5 nssec_parents_vars <- list( S1 = ns_data[["S1familybackground"]] %>% @@ -390,86 +520,114 @@ nssec_parents_vars <- list( ) # Merge all parental NS-SEC variables by NSID -nssec_parents_all <- reduce(nssec_parents_vars, full_join, by = "NSID") +nssec_parents_all <- reduce(nssec_parents_vars, full_join, by = "NSID") %>% + # Add '_raw' suffix to all 'nssec*' variable names for simpler re-coding & cross-checks + rename_with( + ~ stringr::str_c(.x, "_raw"), + contains("nssec") + ) -# Harmonise values (preserve decimals, apply missing codes) -recode_nssec_detail <- function(x) { +# Fix labels +## Missing labels +nssec_parents_labels_missing <- c( + "Missing - household data lost" = -999, + "Parent not interviewed" = -99, + "Parent not present" = -98, + "Insufficient information" = -94 +) + +nssec_parents_all <- nssec_parents_all %>% + mutate( + # Sweeps with common labels + across( + -NSID, + ~ labelled( + .x, + ## Re-using nssec_labels_core as the occupation codes/labels are the same for parent variables as well. 
+ labels = c(nssec_parents_labels_missing, nssec_labels_core) + ) + ) + ) + +# Function to harmonise values (handle decimals, recode missing codes) +recode_nssec_parents <- function(x) { case_when( floor(x) %in% 1:17 ~ floor(x), - x %in% c(-999, -94) ~ -2, + x == -999 ~ -2, x %in% c(-99, -98) | is.na(x) ~ -3, + x == -94 ~ -8, TRUE ~ x ) } # Apply recode and assign to derived variables -nssec_parents_all <- nssec_parents_all %>% +nssec_parents_rec <- nssec_parents_all %>% mutate( - nssecma14 = recode_nssec_detail(nssecma14), - nssecpa14 = recode_nssec_detail(nssecpa14), - nssecma15 = recode_nssec_detail(nssecma15), - nssecpa15 = recode_nssec_detail(nssecpa15), - nssecma16 = recode_nssec_detail(nssecma16), - nssecpa16 = recode_nssec_detail(nssecpa16), - nssecma17 = recode_nssec_detail(nssecma17), - nssecpa17 = recode_nssec_detail(nssecpa17), - nssecma18 = recode_nssec_detail(nssecma18), - nssecpa18 = recode_nssec_detail(nssecpa18) + nssecma14 = recode_nssec_parents(nssecma14_raw), + nssecpa14 = recode_nssec_parents(nssecpa14_raw), + nssecma15 = recode_nssec_parents(nssecma15_raw), + nssecpa15 = recode_nssec_parents(nssecpa15_raw), + nssecma16 = recode_nssec_parents(nssecma16_raw), + nssecpa16 = recode_nssec_parents(nssecpa16_raw), + nssecma17 = recode_nssec_parents(nssecma17_raw), + nssecpa17 = recode_nssec_parents(nssecpa17_raw), + nssecma18 = recode_nssec_parents(nssecma18_raw), + nssecpa18 = recode_nssec_parents(nssecpa18_raw) ) %>% - mutate(across( - c(starts_with("nssecma"), starts_with("nssecpa")), - ~ factor( - .x, - levels = c( - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - -1, - -2, - -3, - -8, - -9 - ), - labels = c( - "Employers in large organisations", - "Higher managerial and administrative occupations", - "Higher professional occupations", - "Lower professional and higher technical occupations", - "Lower managerial and administrative occupations", - "Higher supervisory occupations", - "Intermediate 
occupations", - "Employers in small establishments", - "Own account workers", - "Lower supervisory occupations", - "Lower technical occupations", - "Semi-routine occupations", - "Routine occupations", - "Never worked and long-term unemployed", - "Full-time student", - "Not classified or inadequately stated", - "Not classifiable for other reasons", - "Item not applicable", - "Script error/information lost", - "Not asked at the fieldwork stage/participated/interviewed", - "Don’t know/insufficient information", - "Refusal" + mutate( + across( + c(starts_with("nssec") & !ends_with("raw")), + ~ labelled( + .x, + labels = c( + "Employers in large organisations" = 1, + "Higher managerial and administrative occupations" = 2, + "Higher professional occupations" = 3, + "Lower professional and higher technical occupations" = 4, + "Lower managerial and administrative occupations" = 5, + "Higher supervisory occupations" = 6, + "Intermediate occupations" = 7, + "Employers in small establishments" = 8, + "Own account workers" = 9, + "Lower supervisory occupations" = 10, + "Lower technical occupations" = 11, + "Semi-routine occupations" = 12, + "Routine occupations" = 13, + "Never worked and long-term unemployed" = 14, + "Full-time student" = 15, + "Not classified or inadequately stated" = 16, + "Not classifiable for other reasons" = 17, + common_missing_labels + ) ) ) - )) %>% + ) + +# Cross-checks +nssec_parents_names <- nssec_parents_rec %>% + dplyr::select(starts_with("nssec") & !ends_with("raw")) %>% + names() + +nssec_parents_pairs <- tibble( + y = nssec_parents_names, + x = str_c(nssec_parents_names, "_raw") +) + +nssec_parents_crosstabs <- nssec_parents_pairs |> + mutate( + crosstab = map2( + x, + y, + ~ make_crosstab(nssec_parents_rec, .x, .y) + ) + ) + +nssec_parents_crosstabs %>% + pull(crosstab) %>% + purrr::walk(~ print(.x, n = Inf)) + +# Extract derived variables +nssec_parents_all <- nssec_parents_rec %>% select( NSID, nssecma14, @@ -484,8 +642,8 @@ 
nssec_parents_all <- nssec_parents_all %>% nssecpa18 ) - # House Ownership -------------------------------------------------------------------- + # Load and select house ownership variables from relevant sweeps housing_vars <- list( S1 = ns_data[["S1familybackground"]] %>% @@ -497,11 +655,26 @@ housing_vars <- list( S4 = ns_data[["S4familybackground"]] %>% select(NSID, hown17 = W4Hous12HH), S5 = ns_data[["S5familybackground"]] %>% - select(NSID, W5Hous12HH, W5Hous12BHH, W5Hous12CHH), + select( + NSID, + s5_tenure_type = W5Hous12HH, + s5_tenure_owned = W5Hous12BHH, + s5_tenure_rented = W5Hous12CHH + ), S6 = ns_data[["S6youngperson"]] %>% - select(NSID, W6Hous12YP, W6Hous12bYP, W6Hous12cYP), + select( + NSID, + s6_tenure_type = W6Hous12YP, + s6_tenure_owned = W6Hous12bYP, + s6_tenure_rented = W6Hous12cYP + ), S7 = ns_data[["S7youngperson"]] %>% - select(NSID, W7Hous12YP, W7Hous12bYP, W7Hous12cYP), + select( + NSID, + s7_tenure_type = W7Hous12YP, + s7_tenure_owned = W7Hous12bYP, + s7_tenure_rented = W7Hous12cYP + ), S8 = ns_data[["S8maininterview"]] %>% select(NSID, hown25 = W8TENURE), S9 = ns_data[["S9derivedvariable"]] %>% @@ -509,253 +682,288 @@ housing_vars <- list( ) # Merge all datasets -hown_all <- reduce(housing_vars, full_join, by = "NSID") +hown_all <- reduce(housing_vars, full_join, by = "NSID") %>% + # Add '_raw' suffix to all 'hown*' variable names for simpler re-coding & cross-checks + rename_with( + ~ stringr::str_c(.x, "_raw"), + contains("hown") + ) + +# Helpers -------------------------------------------------------------------- + +# Note: +## For most sweeps, tenure is derived from a single source variable. +## This is different for sweeps 5-7, where each sweep provides three indicators: +## 1) Indicates whether owned/rented. +## 2) Specifies further tenure if owned (e.g. if owned outright, mortgage, ...). +## 3) Specifies further tenure if rented (e.g. rented privately, from a council, rent free, ...) 
+ +# Helper for detailed tenure +recode_tenure_detailed <- function(tenure_type, tenure_owned, tenure_rented) { + case_when( + tenure_owned == 1 ~ 1, + tenure_owned == 2 ~ 2, + tenure_owned == 3 ~ 3, + tenure_rented == 1 ~ 4, + tenure_rented == 2 ~ 5, + tenure_rented == 3 ~ 6, + tenure_rented == 4 ~ 7, + tenure_type == 3 | tenure_owned == 4 | tenure_rented == 5 ~ 8, # Other + tenure_owned == -999 | tenure_rented == -999 ~ -2, # Information loss + tenure_owned == -92 | tenure_rented == -92 ~ -9, # Refusal + tenure_owned == -91 | tenure_rented == -91 ~ -1, # Not applicable + tenure_owned == -1 | tenure_rented == -1 ~ -8, # Don't know + .default = -3 # Anything else -> Not asked or similar + ) +} + +# Helper for collapsed tenure +recode_tenure_collapsed <- function(tenure_type, tenure_owned, tenure_rented) { + dplyr::case_when( + tenure_owned == 1 ~ 1, # own outright + tenure_owned == 2 ~ 2, # mortgage/loan + tenure_owned == 3 ~ 3, # shared ownership + tenure_rented %in% 1:3 ~ 4, # rent it (any rent source collapsed) + tenure_rented == 4 ~ 5, # live rent free + tenure_type == 3 | tenure_owned == 4 | tenure_rented == 5 ~ 6, # other + tenure_owned == -999 | tenure_rented == -999 ~ -2, # information loss + tenure_owned == -92 | tenure_rented == -92 ~ -9, # refusal + tenure_owned == -91 | tenure_rented == -91 ~ -1, # not applicable + tenure_owned == -1 | tenure_rented == -1 ~ -8, # don't know + .default = -3 + ) +} + +## Recode: detailed tenure -------------------------------------------------------------------- -# Derive harmonised variables -hown_all <- hown_all %>% +hown_rec_detailed <- hown_all %>% mutate( - # Detailed versions for S1-S7 hownteen14 = case_when( - hown14 > 0 ~ hown14, - hown14 == -999 ~ -2, - hown14 == -92 ~ -9, - hown14 == -91 ~ -1, - hown14 == -1 ~ -8, - is.na(hown14) ~ -3 + hown14_raw > 0 ~ hown14_raw, + hown14_raw == -999 ~ -2, + hown14_raw == -92 ~ -9, + hown14_raw == -91 ~ -1, + hown14_raw == -1 ~ -8, + is.na(hown14_raw) ~ -3 ), hownteen15 = 
case_when( - hown15 > 0 ~ hown15, - hown15 %in% c(-998, -997, -995, -99) ~ -2, - hown15 == -92 ~ -9, - hown15 == -91 ~ -1, - hown15 == -1 ~ -8, - is.na(hown15) ~ -3 + hown15_raw > 0 ~ hown15_raw, + hown15_raw %in% c(-998, -997, -995, -99) ~ -2, + hown15_raw == -92 ~ -9, + hown15_raw == -91 ~ -1, + hown15_raw == -1 ~ -8, + is.na(hown15_raw) ~ -3 ), hownteen16 = case_when( - hown16 > 0 ~ hown16, - hown16 == -999 ~ -2, - hown16 == -92 ~ -9, - hown16 == -91 ~ -1, - hown16 == -1 ~ -8, - is.na(hown16) ~ -3 + hown16_raw > 0 ~ hown16_raw, + hown16_raw == -999 ~ -2, + hown16_raw == -92 ~ -9, + hown16_raw == -91 ~ -1, + hown16_raw == -1 ~ -8, + is.na(hown16_raw) ~ -3 ), hownteen17 = case_when( - hown17 > 0 ~ hown17, - hown17 %in% c(-999, -997) ~ -2, - hown17 == -92 ~ -9, - hown17 == -91 ~ -1, - hown17 == -1 ~ -8, - is.na(hown17) ~ -3 + hown17_raw > 0 ~ hown17_raw, + hown17_raw %in% c(-999, -997) ~ -2, + hown17_raw == -92 ~ -9, + hown17_raw == -91 ~ -1, + hown17_raw == -1 ~ -8, + is.na(hown17_raw) ~ -3 ), - hownteen18 = case_when( - W5Hous12BHH == 1 ~ 1, # Owned outright - W5Hous12BHH == 2 ~ 2, # Being bought on a mortgage/bank loan - W5Hous12BHH == 3 ~ 3, # Shared ownership (owns & rents property) - W5Hous12CHH == 1 ~ 4, # Rented from a Council or New Town - W5Hous12CHH == 2 ~ 5, # Rented from a Housing Association - W5Hous12CHH == 3 ~ 6, # Rented privately - W5Hous12CHH == 4 ~ 7, # Rent free - W5Hous12HH == 3 | W5Hous12BHH == 4 | W5Hous12CHH == 5 ~ 8, # Other - W5Hous12BHH %in% c(-999, -92) | W5Hous12CHH == -92 ~ -9, - W5Hous12BHH == -91 | W5Hous12CHH == -91 ~ -1, - W5Hous12BHH == -1 | W5Hous12CHH == -1 ~ -8, - is.na(W5Hous12BHH) & is.na(W5Hous12CHH) ~ -3 + hownteen18 = recode_tenure_detailed( + tenure_type = s5_tenure_type, + tenure_owned = s5_tenure_owned, + tenure_rented = s5_tenure_rented ), - hownteen19 = case_when( - W6Hous12bYP == 1 ~ 1, - W6Hous12bYP == 2 ~ 2, - W6Hous12bYP == 3 ~ 3, - W6Hous12cYP == 1 ~ 4, - W6Hous12cYP == 2 ~ 5, - W6Hous12cYP == 3 ~ 6, - 
W6Hous12cYP == 4 ~ 7, - W6Hous12YP == 3 | W6Hous12bYP == 4 | W6Hous12cYP == 5 ~ 8, - W6Hous12bYP %in% c(-999, -92) | W6Hous12cYP == -92 ~ -9, - W6Hous12bYP == -91 | W6Hous12cYP == -91 ~ -1, - W6Hous12bYP == -1 | W6Hous12cYP == -1 ~ -8, - is.na(W6Hous12bYP) & is.na(W6Hous12cYP) ~ -3 + hownteen19 = recode_tenure_detailed( + tenure_type = s6_tenure_type, + tenure_owned = s6_tenure_owned, + tenure_rented = s6_tenure_rented ), - hownteen20 = case_when( - W7Hous12bYP == 1 ~ 1, - W7Hous12bYP == 2 ~ 2, - W7Hous12bYP == 3 ~ 3, - W7Hous12cYP == 1 ~ 4, - W7Hous12cYP == 2 ~ 5, - W7Hous12cYP == 3 ~ 6, - W7Hous12cYP == 4 ~ 7, - W7Hous12YP == 3 | W7Hous12bYP == 4 | W7Hous12cYP == 5 ~ 8, - W7Hous12bYP %in% c(-999, -92) | W7Hous12cYP == -92 ~ -9, - W7Hous12bYP == -91 | W7Hous12cYP == -91 ~ -1, - W7Hous12bYP == -1 | W7Hous12cYP == -1 ~ -8, - is.na(W7Hous12bYP) & is.na(W7Hous12cYP) ~ -3 + hownteen20 = recode_tenure_detailed( + tenure_type = s7_tenure_type, + tenure_owned = s7_tenure_owned, + tenure_rented = s7_tenure_rented ) - ) %>% + ) + +# Add labels +hown_rec_detailed <- hown_rec_detailed %>% + mutate( + across( + c( + hownteen14, + hownteen15, + hownteen16, + hownteen17, + hownteen18, + hownteen19, + hownteen20 + ), + ~ labelled( + .x, + labels = c( + "Owned outright" = 1, + "Being bought on a mortgage/bank loan" = 2, + "Shared ownership (owns & rents property)" = 3, + "Rented from a Council or New Town" = 4, + "Rented from a Housing Association" = 5, + "Rented privately" = 6, + "Rent free" = 7, + "Some other arrangement" = 8, + common_missing_labels + ) + ) + ) + ) + +## Recode: simple tenure -------------------------------------------------------------------- + +hown_rec_simple <- hown_rec_detailed %>% mutate( hown14 = case_when( - hown14 == 1 ~ 1, # own outright - hown14 == 2 ~ 2, # own, buying with help of mortgage/loan - hown14 == 3 ~ 3, # part rent, part mortgage - hown14 %in% 4:6 ~ 4, # rent it - hown14 == 7 ~ 5, # live-in rent free - hown14 == 8 ~ 6, # other - hown14 == 
-999 ~ -2, - hown14 == -92 ~ -9, - hown14 == -91 ~ -1, - hown14 == -1 ~ -8, - is.na(hown14) ~ -3 + hown14_raw == 1 ~ 1, # own outright + hown14_raw == 2 ~ 2, # own, buying with help of mortgage/loan + hown14_raw == 3 ~ 3, # part rent, part mortgage + hown14_raw %in% 4:6 ~ 4, # rent it + hown14_raw == 7 ~ 5, # live-in rent free + hown14_raw == 8 ~ 6, # other + hown14_raw == -999 ~ -2, + hown14_raw == -92 ~ -9, + hown14_raw == -91 ~ -1, + hown14_raw == -1 ~ -8, + is.na(hown14_raw) ~ -3 ), hown15 = case_when( - hown15 == 1 ~ 1, - hown15 == 2 ~ 2, - hown15 == 3 ~ 3, - hown15 %in% 4:6 ~ 4, - hown15 == 7 ~ 5, - hown15 == 8 ~ 6, - hown15 %in% c(-998, -997, -995, -99) ~ -2, - hown15 == -92 ~ -9, - hown15 == -91 ~ -1, - hown15 == -1 ~ -8, - is.na(hown15) ~ -3 + hown15_raw == 1 ~ 1, + hown15_raw == 2 ~ 2, + hown15_raw == 3 ~ 3, + hown15_raw %in% 4:6 ~ 4, + hown15_raw == 7 ~ 5, + hown15_raw == 8 ~ 6, + hown15_raw %in% c(-998, -997, -995, -99) ~ -2, + hown15_raw == -92 ~ -9, + hown15_raw == -91 ~ -1, + hown15_raw == -1 ~ -8, + is.na(hown15_raw) ~ -3 ), hown16 = case_when( - hown16 == 1 ~ 1, - hown16 == 2 ~ 2, - hown16 == 3 ~ 3, - hown16 %in% 4:6 ~ 4, - hown16 == 7 ~ 5, - hown16 == 8 ~ 6, - hown16 == -999 ~ -2, - hown16 == -92 ~ -9, - hown16 == -91 ~ -1, - hown16 == -1 ~ -8, - is.na(hown16) ~ -3 + hown16_raw == 1 ~ 1, + hown16_raw == 2 ~ 2, + hown16_raw == 3 ~ 3, + hown16_raw %in% 4:6 ~ 4, + hown16_raw == 7 ~ 5, + hown16_raw == 8 ~ 6, + hown16_raw == -999 ~ -2, + hown16_raw == -92 ~ -9, + hown16_raw == -91 ~ -1, + hown16_raw == -1 ~ -8, + is.na(hown16_raw) ~ -3 ), hown17 = case_when( - hown17 == 1 ~ 1, - hown17 == 2 ~ 2, - hown17 == 3 ~ 3, - hown17 %in% 4:6 ~ 4, - hown17 == 7 ~ 5, - hown17 == 8 ~ 6, - hown17 %in% c(-999, -997) ~ -2, - hown17 == -92 ~ -9, - hown17 == -91 ~ -1, - hown17 == -1 ~ -8, - is.na(hown17) ~ -3 + hown17_raw == 1 ~ 1, + hown17_raw == 2 ~ 2, + hown17_raw == 3 ~ 3, + hown17_raw %in% 4:6 ~ 4, + hown17_raw == 7 ~ 5, + hown17_raw == 8 ~ 6, + hown17_raw %in% 
c(-999, -997) ~ -2, + hown17_raw == -92 ~ -9, + hown17_raw == -91 ~ -1, + hown17_raw == -1 ~ -8, + is.na(hown17_raw) ~ -3 ), - hown18 = case_when( - W5Hous12BHH == 1 ~ 1, - W5Hous12BHH == 2 ~ 2, - W5Hous12BHH == 3 ~ 3, - W5Hous12CHH %in% 1:3 ~ 4, - W5Hous12CHH == 4 ~ 5, - W5Hous12BHH == 4 | W5Hous12CHH == 5 ~ 6, - W5Hous12BHH %in% c(-999, -92) | W5Hous12CHH == -92 ~ -9, - W5Hous12BHH == -91 | W5Hous12CHH == -91 ~ -1, - W5Hous12BHH == -1 | W5Hous12CHH == -1 ~ -8, - is.na(W5Hous12BHH) & is.na(W5Hous12CHH) ~ -3 + hown18 = recode_tenure_collapsed( + tenure_type = s5_tenure_type, + tenure_owned = s5_tenure_owned, + tenure_rented = s5_tenure_rented ), - hown19 = case_when( - W6Hous12bYP == 1 ~ 1, - W6Hous12bYP == 2 ~ 2, - W6Hous12bYP == 3 ~ 3, - W6Hous12cYP %in% 1:3 ~ 4, - W6Hous12cYP == 4 ~ 5, - W6Hous12bYP == 4 | W6Hous12cYP == 5 ~ 6, - W6Hous12bYP %in% c(-999, -92) | W6Hous12cYP == -92 ~ -9, - W6Hous12bYP == -91 | W6Hous12cYP == -91 ~ -1, - W6Hous12bYP == -1 | W6Hous12cYP == -1 ~ -8, - is.na(W6Hous12bYP) & is.na(W6Hous12cYP) ~ -3 + hown19 = recode_tenure_collapsed( + tenure_type = s6_tenure_type, + tenure_owned = s6_tenure_owned, + tenure_rented = s6_tenure_rented ), - hown20 = case_when( - W7Hous12bYP == 1 ~ 1, - W7Hous12bYP == 2 ~ 2, - W7Hous12bYP == 3 ~ 3, - W7Hous12cYP %in% 1:3 ~ 4, - W7Hous12cYP == 4 ~ 5, - W7Hous12bYP == 4 | W7Hous12cYP == 5 ~ 6, - W7Hous12bYP %in% c(-999, -92) | W7Hous12cYP == -92 ~ -9, - W7Hous12bYP == -91 | W7Hous12cYP == -91 ~ -1, - W7Hous12bYP == -1 | W7Hous12cYP == -1 ~ -8, - is.na(W7Hous12bYP) & is.na(W7Hous12cYP) ~ -3 + hown20 = recode_tenure_collapsed( + tenure_type = s7_tenure_type, + tenure_owned = s7_tenure_owned, + tenure_rented = s7_tenure_rented ), hown25 = case_when( - hown25 == 1 ~ 1, - hown25 == 2 ~ 2, - hown25 == 3 ~ 3, - hown25 == 4 ~ 4, - hown25 == 5 ~ 5, - hown25 %in% 6:7 ~ 6, - hown25 == -9 ~ -9, - hown25 == -8 ~ -8, - hown25 == -1 ~ -1, - is.na(hown25) ~ -3 + hown25_raw == 1 ~ 1, + hown25_raw == 2 ~ 2, + hown25_raw == 3 ~ 
3, + hown25_raw == 4 ~ 4, + hown25_raw == 5 ~ 5, + hown25_raw %in% 6:7 ~ 6, + hown25_raw == -9 ~ -9, + hown25_raw == -8 ~ -8, + hown25_raw == -1 ~ -1, + is.na(hown25_raw) ~ -3 ), hown32 = case_when( - hown32 == 1 ~ 1, - hown32 == 2 ~ 2, - hown32 == 3 ~ 3, - hown32 == 4 ~ 4, - hown32 == 5 ~ 5, - hown32 %in% 6:7 ~ 6, - hown32 == -9 ~ -9, - hown32 == -8 ~ -8, - hown32 == -1 ~ -1, - is.na(hown32) ~ -3 + hown32_raw == 1 ~ 1, + hown32_raw == 2 ~ 2, + hown32_raw == 3 ~ 3, + hown32_raw == 4 ~ 4, + hown32_raw == 5 ~ 5, + hown32_raw %in% 6:7 ~ 6, + hown32_raw == -9 ~ -9, + hown32_raw == -8 ~ -8, + hown32_raw == -1 ~ -1, + is.na(hown32_raw) ~ -3 ) - ) %>% + ) + +# Recode labels +hown_rec_simple <- hown_rec_simple %>% mutate( - across( - c( - hownteen14, - hownteen15, - hownteen16, - hownteen17, - hownteen18, - hownteen19, - hownteen20 - ), - ~ factor( - .x, - levels = c(1, 2, 3, 4, 5, 6, 7, 8, -1, -2, -3, -8, -9), - labels = c( - "Owned outright", - "Being bought on a mortgage/bank loan", - "Shared ownership (owns & rents property)", - "Rented from a Council or New Town", - "Rented from a Housing Association", - "Rented privately", - "Rent free", - "Some other arrangement", - "Item not applicable", - "Script error/information lost", - "Not asked at the fieldwork stage/participated/interviewed", - "Don’t know/insufficient information", - "Refusal" - ) - ) - ), across( c(hown14, hown15, hown16, hown17, hown18, hown19, hown20, hown25, hown32), - ~ factor( + ~ labelled( .x, - levels = c(1, 2, 3, 4, 5, 6, -1, -2, -3, -8, -9), labels = c( - "Owned outright", - "Owned, buying with help of mortgage/loan", - "Spart rent, part mortgage", - "Rent it", - "live rent-free", - "Other", - "Item not applicable", - "Script error/information lost", - "Not asked at the fieldwork stage/participated/interviewed", - "Don’t know/insufficient information", - "Refusal" + "Owned outright" = 1, + "Owned, buying with help of mortgage/loan" = 2, + "Part rent, part mortgage" = 3, + "Rent it" = 4, + "live 
rent-free" = 5, + "Other" = 6, + common_missing_labels ) ) ) - ) %>% + ) + + +## Cross-checks for single-variable re-codes -------------------------------------------------------------------- + +hown_names <- hown_rec_simple %>% + dplyr::select(starts_with("hown") & !ends_with(c("raw", 18:20))) %>% + names() + +hown_pairs <- tibble( + y = hown_names, + x = str_c(hown_names, "_raw") +) %>% + # 'hownteen' must be cross-tabulated against 'hown_raw' + mutate(x = str_replace(x, "hownteen", "hown")) + +hown_crosstabs <- hown_pairs |> + mutate( + crosstab = map2( + x, + y, + ~ make_crosstab(hown_rec_simple, .x, .y) + ) + ) + +hown_crosstabs %>% + pull(crosstab) %>% + purrr::walk(~ print(.x, n = Inf)) + +# Extract derived variables + +hown_all <- hown_rec_simple %>% select( NSID, hown14, @@ -776,85 +984,70 @@ hown_all <- hown_all %>% hownteen20 ) - # Income Own + Partner -------------------------------------------------------------------- + # Load and select income variables from relevant sweeps income_vars <- list( S1 = ns_data[["S1youngperson"]] %>% select(NSID), S4 = ns_data[["S4youngperson"]] %>% select(NSID), S8 = ns_data[["S8derivedvariable"]] %>% - select(NSID, inc25 = W8DINCB), + select(NSID, inc25_raw = W8DINCB), S9 = ns_data[["S9derivedvariable"]] %>% - select(NSID, inc32 = W9DINCB) + select(NSID, inc32_raw = W9DINCB) ) # Merge all income variables by NSID income_all <- reduce(income_vars, full_join, by = "NSID") # Recode -income_all <- income_all %>% +income_rec <- income_all %>% mutate( inc25 = case_when( - is.na(inc25) ~ -3, - TRUE ~ inc25 + is.na(inc25_raw) ~ -3, + TRUE ~ inc25_raw ), inc32 = case_when( - is.na(inc32) ~ -3, - TRUE ~ inc32 + is.na(inc32_raw) ~ -3, + TRUE ~ inc32_raw ) ) %>% - mutate(across( - c(inc25, inc32), - ~ factor( - .x, - levels = c( - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - -1, - -2, - -3, - -8, - -9 - ), - labels = c( - "less than £25 per week", - "25-50", - "50-90", - "90-140", - 
"140-240", - "240-300", - "300-350", - "350-400", - "400-500", - "500-600", - "600-700", - "700-800", - "800-900", - "900-1200", - "1200-1400", - "more than 1400", - "Item not applicable", - "Script error/information lost", - "Not asked at the fieldwork stage/participated/interviewed", - "Don’t know/insufficient information", - "Refusal" + mutate( + across( + c(inc25, inc32), + ~ labelled( + .x, + labels = c( + "less than £25 per week" = 1, + "25-50" = 2, + "50-90" = 3, + "90-140" = 4, + "140-240" = 5, + "240-300" = 6, + "300-350" = 7, + "350-400" = 8, + "400-500" = 9, + "500-600" = 10, + "600-700" = 11, + "700-800" = 12, + "800-900" = 13, + "900-1200" = 14, + "1200-1400" = 15, + "more than 1400" = 16, + common_missing_labels + ) ) ) - )) %>% + ) + +# Checks +income_rec %>% + count(inc25_raw, inc25) + +income_rec %>% + count(inc32_raw, inc32) + +# Extract derived variables +income_all <- income_rec %>% select(NSID, inc25, inc32) # Income Parents -------------------------------------------------------------------- @@ -871,7 +1064,12 @@ hh_income_vars <- list( ) # Merge all household income variables by NSID -hh_income_all <- reduce(hh_income_vars, full_join, by = "NSID") +hh_income_all <- reduce(hh_income_vars, full_join, by = "NSID") %>% + # Add '_raw' suffix to all 'incwhh*' variable names for simpler re-coding & cross-checks + rename_with( + ~ stringr::str_c(.x, "_raw"), + contains("incwhh") + ) # Derive banded income for continuous measures (S1–S2) convert_to_band <- function(x) { @@ -896,120 +1094,93 @@ convert_to_band <- function(x) { ) } -hh_income_all <- hh_income_all %>% +hh_income_rec <- hh_income_all %>% mutate( # Sweep 1 incwhh14 = case_when( - is.na(incwhh14) ~ -3, - incwhh14 == -92 ~ -9, - incwhh14 %in% c(-999, -992, -94) ~ -2, - incwhh14 == -99 ~ -3, - incwhh14 == -91 ~ -1, - incwhh14 == -1 ~ -8, - incwhh14 == -3 ~ -3, - TRUE ~ convert_to_band(incwhh14) + is.na(incwhh14_raw) ~ -3, + incwhh14_raw %in% c(-92, -992) ~ -9, + incwhh14_raw == -999 ~ -2, + 
incwhh14_raw == -99 ~ -3, + incwhh14_raw == -91 ~ -1, + incwhh14_raw %in% c(-1, -94) ~ -8, + incwhh14_raw == -3 ~ -1, + TRUE ~ convert_to_band(incwhh14_raw) ), incwhhcnt14 = case_when( - is.na(incwhh14) ~ -3, - incwhh14 == -92 ~ -9, - incwhh14 %in% c(-999, -992, -94) ~ -2, - incwhh14 == -99 ~ -3, - incwhh14 == -91 ~ -1, - incwhh14 == -1 ~ -8, - incwhh14 == -3 ~ -3, - TRUE ~ incwhh14 + is.na(incwhh14_raw) ~ -3, + incwhh14_raw %in% c(-92, -992) ~ -9, + incwhh14_raw == -999 ~ -2, + incwhh14_raw == -99 ~ -3, + incwhh14_raw == -91 ~ -1, + incwhh14_raw %in% c(-1, -94) ~ -8, + incwhh14_raw == -3 ~ -1, + TRUE ~ incwhh14_raw ), # Sweep 2 incwhh15 = case_when( - is.na(incwhh15) ~ -3, - incwhh15 == -92 ~ -9, - incwhh15 %in% c(-999, -992, -94) ~ -2, - incwhh15 == -99 ~ -3, - incwhh15 == -91 ~ -1, - incwhh15 == -1 ~ -8, - incwhh15 == -3 ~ -3, - TRUE ~ convert_to_band(incwhh15) + is.na(incwhh15_raw) ~ -3, + incwhh15_raw %in% c(-92, -992) ~ -9, + incwhh15_raw == -999 ~ -2, + incwhh15_raw == -99 ~ -3, + incwhh15_raw == -91 ~ -1, + incwhh15_raw %in% c(-1, -94) ~ -8, + incwhh15_raw == -3 ~ -1, + TRUE ~ convert_to_band(incwhh15_raw) ), incwhhcnt15 = case_when( - is.na(incwhh15) ~ -3, - incwhh15 == -92 ~ -9, - incwhh15 %in% c(-999, -992, -94) ~ -2, - incwhh15 == -99 ~ -3, - incwhh15 == -91 ~ -1, - incwhh15 == -1 ~ -8, - incwhh15 == -3 ~ -3, - TRUE ~ incwhh15 + is.na(incwhh15_raw) ~ -3, + incwhh15_raw %in% c(-92, -992) ~ -9, + incwhh15_raw == -999 ~ -2, + incwhh15_raw == -99 ~ -3, + incwhh15_raw == -91 ~ -1, + incwhh15_raw %in% c(-1, -94) ~ -8, + incwhh15_raw == -3 ~ -1, + TRUE ~ incwhh15_raw ), # Sweep 3 incwhh16 = case_when( - is.na(incwhh16) ~ -3, - incwhh16 == -99 ~ -3, - incwhh16 == -92 ~ -9, - incwhh16 == -1 ~ -8, - incwhh16 >= 1 & incwhh16 <= 12 ~ incwhh16 + is.na(incwhh16_raw) ~ -3, + incwhh16_raw == -99 ~ -3, + incwhh16_raw == -92 ~ -9, + incwhh16_raw == -1 ~ -8, + incwhh16_raw >= 1 & incwhh16_raw <= 12 ~ incwhh16_raw ), # Sweep 4 incwhh17 = case_when( - is.na(incwhh17) ~ -3, 
- incwhh17 %in% c(-996, -99) ~ -3, - incwhh17 == -92 ~ -9, - incwhh17 == -1 ~ -8, - incwhh17 >= 1 & incwhh17 <= 12 ~ incwhh17 + is.na(incwhh17_raw) ~ -3, + incwhh17_raw %in% c(-996, -99) ~ -3, + incwhh17_raw == -92 ~ -9, + incwhh17_raw == -1 ~ -8, + incwhh17_raw >= 1 & incwhh17_raw <= 12 ~ incwhh17_raw ) ) %>% mutate( across( c(incwhh14, incwhh15), - ~ factor( + ~ labelled( .x, - levels = c( - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - -1, - -2, - -3, - -8, - -9 - ), labels = c( - "less than £25 per week", - "25-50", - "50-90", - "90-140", - "140-240", - "240-300", - "300-350", - "350-400", - "400-500", - "500-600", - "600-700", - "700-800", - "800-900", - "900-1200", - "1200-1400", - "more than 1400", - "Item not applicable", - "Script error/information lost", - "Not asked at the fieldwork stage/participated/interviewed", - "Don’t know/insufficient information", - "Refusal" + "less than £25 per week" = 1, + "25-50" = 2, + "50-90" = 3, + "90-140" = 4, + "140-240" = 5, + "240-300" = 6, + "300-350" = 7, + "350-400" = 8, + "400-500" = 9, + "500-600" = 10, + "600-700" = 11, + "700-800" = 12, + "800-900" = 13, + "900-1200" = 14, + "1200-1400" = 15, + "more than 1400" = 16, + common_missing_labels ) ) ), @@ -1028,31 +1199,28 @@ hh_income_all <- hh_income_all %>% ), across( c(incwhh16, incwhh17), - ~ factor( + ~ labelled( .x, - levels = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -2, -3, -8, -9), labels = c( - "up to 49", - "50-99", - "100-199", - "200-299", - "300-399", - "400-499", - "500-599", - "600-699", - "700-799", - "800-899", - "900-999", - "1000 or more", - "Item not applicable", - "Script error/information lost", - "Not asked at the fieldwork stage/participated/interviewed", - "Don’t know/insufficient information", - "Refusal" + "up to 49" = 1, + "50-99" = 2, + "100-199" = 3, + "200-299" = 4, + "300-399" = 5, + "400-499" = 6, + "500-599" = 7, + "600-699" = 8, + "700-799" = 9, + "800-899" = 10, + "900-999" = 11, + 
"1000 or more" = 12, + common_missing_labels ) ) ) - ) %>% + ) + +hh_income_all <- hh_income_rec %>% select(NSID, incwhh14, incwhh15, incwhhcnt14, incwhhcnt15, incwhh16, incwhh17) # IMD -------------------------------------------------------------------- @@ -1061,47 +1229,60 @@ imd_vars <- list( S1 = ns_data[["S1youngperson"]] %>% select(NSID), S4 = ns_data[["S4youngperson"]] %>% select(NSID), S2 = ns_data[["S2familybackground"]] %>% - select(NSID, imd15 = IMDRSCORE), + select(NSID, imd15_raw = IMDRSCORE), S3 = ns_data[["S3familybackground"]] %>% - select(NSID, imd16 = IMDRSCORE), + select(NSID, imd16_raw = IMDRSCORE), S9 = ns_data[["S9derivedvariable"]] %>% - select(NSID, imd32 = W9DIMDD) + select(NSID, imd32_raw = W9DIMDD) ) # Merge all IMD variables by NSID imd_all <- reduce(imd_vars, full_join, by = "NSID") # Recode derived variables -imd_all <- imd_all %>% +imd_rec <- imd_all %>% mutate( imd15 = case_when( - is.na(imd15) ~ -3, - imd15 == -94 ~ -8, - TRUE ~ imd15 + is.na(imd15_raw) ~ -3, + imd15_raw == -94 ~ -8, + TRUE ~ imd15_raw ), imd16 = case_when( - is.na(imd16) ~ -3, - imd16 == -94 ~ -8, - TRUE ~ imd16 + is.na(imd16_raw) ~ -3, + imd16_raw == -94 ~ -8, + TRUE ~ imd16_raw ), imd32 = case_when( - is.na(imd32) ~ -3, - imd32 == -8 ~ -8, - TRUE ~ imd32 + is.na(imd32_raw) ~ -3, + imd32_raw == -8 ~ -8, + TRUE ~ imd32_raw ) ) %>% - mutate(across( - c(imd15, imd16, imd32), - ~ labelled( - .x, - labels = c( - "Item not applicable" = -1, - "Script error/information lost" = -2, - "Not asked at the fieldwork stage/participated/interviewed" = -3, - "Don’t know/insufficient information" = -8 + mutate( + across( + c(imd15, imd16, imd32), + ~ labelled( + .x, + labels = c( + "Item not applicable" = -1, + "Script error/information lost" = -2, + "Not asked at the fieldwork stage/participated/interviewed" = -3, + "Don’t know/insufficient information" = -8 + ) ) ) - )) %>% + ) + +imd_rec %>% + count(imd15_raw, imd15) + +imd_rec %>% + count(imd16_raw, imd16) + +imd_rec %>% + 
count(imd32_raw, imd32) + +imd_all <- imd_rec %>% select(NSID, imd15, imd16, imd32)