3.1 What is labelled data?
3.1.1 The basics
When reading a dataset using haven, variables have labels and other metadata attached as attributes.
Standard attributes included regardless of variable type are:
- A label attribute with the variable label
- A format.stata, format.spss, or format.sas attribute, depending on the input type, storing the variable format for the specified file type (e.g. "F1.0")
library(haven)
library(labelled)
library(dplyr, warn.conflicts = FALSE)
gss <- read_sav("data/gss/GSS2018.sav", user_na = TRUE)
gss_dta <- read_dta("data/gss/GSS2018.dta")
# A standard numeric variable, with additional attributes
class(gss$YEAR)
#> [1] "numeric"
str(gss$YEAR)
#> num [1:2348] 2018 2018 2018 2018 2018 ...
#> - attr(*, "label")= chr "GSS year for this respondent"
#> - attr(*, "format.spss")= chr "F4.0"
attributes(gss$YEAR)
#> $label
#> [1] "GSS year for this respondent"
#>
#> $format.spss
#> [1] "F4.0"
class(gss_dta$year)
#> [1] "numeric"
str(gss_dta$year)
#> num [1:2348] 2018 2018 2018 2018 2018 ...
#> - attr(*, "label")= chr "gss year for this respondent "
#> - attr(*, "format.stata")= chr "%8.0g"
attributes(gss_dta$year)
#> $label
#> [1] "gss year for this respondent "
#>
#> $format.stata
#> [1] "%8.0g"
If a variable contains labelled values it will be imported as a haven_labelled
vector, which stores the value labels in the labels
attribute.
If we’re reading an SPSS file and the variable contains user-defined missing values it will be imported as a haven_labelled_spss
vector. This is an extension of the haven_labelled
class that also records user-defined missing values in the na_values
or na_range
attribute as appropriate.
# A "labelled" categorical variable
class(gss$HEALTH)
#> [1] "haven_labelled_spss" "haven_labelled" "vctrs_vctr"
#> [4] "double"
str(gss$HEALTH)
#> dbl+lbl [1:2348] 2, 1, 0, 0, 1, 2, 2, 1, 0, 0, 1, 0, 4, 1, 0, 2, 0, 1, 1, ...
#> @ label : chr "Condition of health"
#> @ na_values : num [1:3] 0 8 9
#> @ format.spss: chr "F1.0"
#> @ labels : Named num [1:7] 0 1 2 3 4 8 9
#> ..- attr(*, "names")= chr [1:7] "IAP" "EXCELLENT" "GOOD" "FAIR" ...
attributes(gss$HEALTH)
#> $label
#> [1] "Condition of health"
#>
#> $na_values
#> [1] 0 8 9
#>
#> $class
#> [1] "haven_labelled_spss" "haven_labelled" "vctrs_vctr"
#> [4] "double"
#>
#> $format.spss
#> [1] "F1.0"
#>
#> $labels
#> IAP EXCELLENT GOOD FAIR POOR DK NA
#> 0 1 2 3 4 8 9
class(gss_dta$health)
#> [1] "haven_labelled" "vctrs_vctr" "double"
str(gss_dta$health)
#> dbl+lbl [1:2348] 2, 1, NA(i), NA(i), 1, 2, 2, 1, N...
#> @ label : chr "condition of health"
#> @ format.stata: chr "%8.0g"
#> @ labels : Named num [1:7] 1 2 3 4 NA NA NA
#> ..- attr(*, "names")= chr [1:7] "excellent" "good" "fair" "poor" ...
attributes(gss_dta$health)
#> $label
#> [1] "condition of health"
#>
#> $format.stata
#> [1] "%8.0g"
#>
#> $class
#> [1] "haven_labelled" "vctrs_vctr" "double"
#>
#> $labels
#> excellent good fair poor DK IAP NA
#> 1 2 3 4 NA NA NA
One immediate advantage of labelled vectors is that value labels are used in data frame printing when using tibble (and by extension the wider tidyverse) and other packages using the pillar printing methods.
# Print helpers
gss %>% count(HEALTH)
#> # A tibble: 6 x 2
#> HEALTH n
#> <dbl+lbl> <int>
#> 1 0 (NA) [IAP] 774
#> 2 1 [EXCELLENT] 359
#> 3 2 [GOOD] 771
#> 4 3 [FAIR] 355
#> 5 4 [POOR] 84
#> 6 8 (NA) [DK] 5
gss %>% count(HELPSICK)
#> # A tibble: 8 x 2
#> HELPSICK n
#> <dbl+lbl> <int>
#> 1 0 (NA) [IAP] 785
#> 2 1 [GOVT SHOULD HELP] 528
#> 3 2 306
#> 4 3 [AGREE WITH BOTH] 448
#> 5 4 129
#> 6 5 [PEOPLE HELP SELVES] 117
#> 7 8 (NA) [DK] 33
#> 8 9 (NA) [NA] 2
Using head()
on a variable will print a nicely formatted summary of the attached metadata, excluding formats.
head(gss$HEALTH)
#> <labelled_spss<double>[6]>: Condition of health
#> [1] 2 1 0 0 1 2
#> Missing values: 0, 8, 9
#>
#> Labels:
#> value label
#> 0 IAP
#> 1 EXCELLENT
#> 2 GOOD
#> 3 FAIR
#> 4 POOR
#> 8 DK
#> 9 NA
head(gss_dta$health)
#> <labelled<double>[6]>: condition of health
#> [1] 2 1 NA(i) NA(i) 1 2
#>
#> Labels:
#> value label
#> 1 excellent
#> 2 good
#> 3 fair
#> 4 poor
#> NA(d) DK
#> NA(i) IAP
#> NA(n) NA
3.1.2 Missing values
3.1.2.1 User-defined missing values (SPSS)
SPSS allows for user-defined missing values, where the user can tag a discrete set or a range of values to be treated as missing.
These are relatively simple to deal with in haven, and allow for easy differential treatment of missing values in formatting and recoding methods, as we’ll see later. They are flagged with a handy (NA) marker when printed in a tibble and return TRUE from is.na().
# Missing values 0, 8 and 9
head(gss$HEALTH)
#> <labelled_spss<double>[6]>: Condition of health
#> [1] 2 1 0 0 1 2
#> Missing values: 0, 8, 9
#>
#> Labels:
#> value label
#> 0 IAP
#> 1 EXCELLENT
#> 2 GOOD
#> 3 FAIR
#> 4 POOR
#> 8 DK
#> 9 NA
gss %>% count(HEALTH, is.na(HEALTH))
#> # A tibble: 6 x 3
#> HEALTH `is.na(HEALTH)` n
#> <dbl+lbl> <lgl> <int>
#> 1 0 (NA) [IAP] TRUE 774
#> 2 1 [EXCELLENT] FALSE 359
#> 3 2 [GOOD] FALSE 771
#> 4 3 [FAIR] FALSE 355
#> 5 4 [POOR] FALSE 84
#> 6 8 (NA) [DK] TRUE 5
One gotcha in our experience is that although they return TRUE
from is.na()
they are not considered equivalent to NA
in other contexts.
# These are not equivalent!
gss %>% count(HEALTH, is.na(HEALTH), HEALTH %in% NA)
#> # A tibble: 6 x 4
#> HEALTH `is.na(HEALTH)` `HEALTH %in% NA` n
#> <dbl+lbl> <lgl> <lgl> <int>
#> 1 0 (NA) [IAP] TRUE FALSE 774
#> 2 1 [EXCELLENT] FALSE FALSE 359
#> 3 2 [GOOD] FALSE FALSE 771
#> 4 3 [FAIR] FALSE FALSE 355
#> 5 4 [POOR] FALSE FALSE 84
#> 6 8 (NA) [DK] TRUE FALSE 5
Ranges work similarly to discrete values: every value that falls within the range is treated as missing, as you would expect.
# Missing value range 13 - 99, plus discrete value 0
head(gss$RINCOME)
#> <labelled_spss<double>[6]>: Respondents income
#> [1] 13 0 12 12 0 0
#> Missing values: 0
#> Missing range: [13, 99]
#>
#> Labels:
#> value label
#> 0 IAP
#> 1 LT $1000
#> 2 $1000 TO 2999
#> 3 $3000 TO 3999
#> 4 $4000 TO 4999
#> 5 $5000 TO 5999
#> 6 $6000 TO 6999
#> 7 $7000 TO 7999
#> 8 $8000 TO 9999
#> 9 $10000 - 14999
#> 10 $15000 - 19999
#> 11 $20000 - 24999
#> 12 $25000 OR MORE
#> 13 REFUSED
#> 98 DK
#> 99 NA
gss %>% count(RINCOME, is.na(RINCOME))
#> # A tibble: 15 x 3
#> RINCOME `is.na(RINCOME)` n
#> <dbl+lbl> <lgl> <int>
#> 1 0 (NA) [IAP] TRUE 899
#> 2 1 [LT $1000] FALSE 33
#> 3 2 [$1000 TO 2999] FALSE 32
#> 4 3 [$3000 TO 3999] FALSE 32
#> 5 4 [$4000 TO 4999] FALSE 21
#> 6 5 [$5000 TO 5999] FALSE 21
#> 7 6 [$6000 TO 6999] FALSE 12
#> 8 7 [$7000 TO 7999] FALSE 18
#> 9 8 [$8000 TO 9999] FALSE 33
#> 10 9 [$10000 - 14999] FALSE 94
#> 11 10 [$15000 - 19999] FALSE 61
#> 12 11 [$20000 - 24999] FALSE 107
#> 13 12 [$25000 OR MORE] FALSE 851
#> 14 13 (NA) [REFUSED] TRUE 108
#> 15 98 (NA) [DK] TRUE 26
3.1.2.2 Tagged missing values (SAS, Stata)
SAS and Stata take the opposite approach to SPSS - rather than tagging a value as missing, they tag missing data with a “type”. This is also supported by haven, albeit in a slightly different way.
Tagged missing values appear in the label set as an NA
with an attached letter flagging the type.
head(gss_dta$health)
#> <labelled<double>[6]>: condition of health
#> [1] 2 1 NA(i) NA(i) 1 2
#>
#> Labels:
#> value label
#> 1 excellent
#> 2 good
#> 3 fair
#> 4 poor
#> NA(d) DK
#> NA(i) IAP
#> NA(n) NA
Treatment of tagged missing values can be a bit funny compared to user-defined missing values. Note that, in the example below, doing a straight count for “IAP” does not match the SPSS example and is actually combining the “IAP” and “DK” values.
gss_dta %>% count(health, is.na(health))
#> # A tibble: 5 x 3
#> health `is.na(health)` n
#> <dbl+lbl> <lgl> <int>
#> 1 1 [excellent] FALSE 359
#> 2 2 [good] FALSE 771
#> 3 3 [fair] FALSE 355
#> 4 4 [poor] FALSE 84
#> 5 NA(i) [IAP] TRUE 779
In many circumstances tagged NA
values will be grouped together like this, which can be misleading, so they need to be treated a bit differently.
You can use na_tag()
to extract the tagged type of the NA
values, or is_tagged_na()
to check for values with a particular tag.
gss_dta %>%
count(
health,
is.na(health),
na_tag(health)
)
#> # A tibble: 6 x 4
#> health `is.na(health)` `na_tag(health)` n
#> <dbl+lbl> <lgl> <chr> <int>
#> 1 1 [excellent] FALSE <NA> 359
#> 2 2 [good] FALSE <NA> 771
#> 3 3 [fair] FALSE <NA> 355
#> 4 4 [poor] FALSE <NA> 84
#> 5 NA(d) [DK] TRUE d 5
#> 6 NA(i) [IAP] TRUE i 774
gss_dta %>%
count(
health,
na_tag(health),
is_tagged_na(health),
is_tagged_na(health, "d")
)
#> # A tibble: 6 x 5
#> health `na_tag(health)` `is_tagged_na(heal… `is_tagged_na(healt… n
#> <dbl+lbl> <chr> <lgl> <lgl> <int>
#> 1 1 [excell… <NA> FALSE FALSE 359
#> 2 2 [good] <NA> FALSE FALSE 771
#> 3 3 [fair] <NA> FALSE FALSE 355
#> 4 4 [poor] <NA> FALSE FALSE 84
#> 5 NA(d) [DK] d TRUE TRUE 5
#> 6 NA(i) [IAP] i TRUE FALSE 774
3.1.2.3 Zapping
To convert tagged or user-defined missing values to a standard R NA, you can use the zap_missing() function on either a vector or a data frame.
gss %>% count(HEALTH, zap_missing(HEALTH))
#> # A tibble: 6 x 3
#> HEALTH `zap_missing(HEALTH)` n
#> <dbl+lbl> <dbl+lbl> <int>
#> 1 0 (NA) [IAP] NA 774
#> 2 1 [EXCELLENT] 1 [EXCELLENT] 359
#> 3 2 [GOOD] 2 [GOOD] 771
#> 4 3 [FAIR] 3 [FAIR] 355
#> 5 4 [POOR] 4 [POOR] 84
#> 6 8 (NA) [DK] NA 5
gss_dta %>% count(health, na_tag(health), zap_missing(health))
#> # A tibble: 6 x 4
#> health `na_tag(health)` `zap_missing(health)` n
#> <dbl+lbl> <chr> <dbl+lbl> <int>
#> 1 1 [excellent] <NA> 1 [excellent] 359
#> 2 2 [good] <NA> 2 [good] 771
#> 3 3 [fair] <NA> 3 [fair] 355
#> 4 4 [poor] <NA> 4 [poor] 84
#> 5 NA(d) [DK] d NA 5
#> 6 NA(i) [IAP] i NA 774
You may recall the user_na = TRUE argument we passed to read_sav() earlier. If you use user_na = FALSE (the default), user-defined missing values are converted to NA on the way in.
read_sav("data/gss/GSS2018.sav", user_na = TRUE) %>%
zap_missing() %>%
count(HEALTH)
#> # A tibble: 5 x 2
#> HEALTH n
#> <dbl+lbl> <int>
#> 1 1 [EXCELLENT] 359
#> 2 2 [GOOD] 771
#> 3 3 [FAIR] 355
#> 4 4 [POOR] 84
#> 5 NA 779
read_sav("data/gss/GSS2018.sav", user_na = FALSE) %>%
count(HEALTH)
#> # A tibble: 5 x 2
#> HEALTH n
#> <dbl+lbl> <int>
#> 1 1 [EXCELLENT] 359
#> 2 2 [GOOD] 771
#> 3 3 [FAIR] 355
#> 4 4 [POOR] 84
#> 5 NA 779