3.1 What is labelled data?

3.1.1 The basics

When reading a dataset using haven, variables have labels and other metadata attached as attributes.

Standard attributes included regardless of variable type are:

A label attribute with the variable label
A format.stata, format.spss, or format.sas attribute, depending on the input type, storing the variable format for the specified file type (e.g. "F1.0")

library(haven)
library(labelled)
library(dplyr, warn.conflicts = FALSE)

gss <- read_sav("data/gss/GSS2018.sav", user_na = TRUE)
gss_dta <- read_dta("data/gss/GSS2018.dta")

# A standard numeric variable, with additional attributes
class(gss$YEAR)
#> [1] "numeric"
str(gss$YEAR)
#>  num [1:2348] 2018 2018 2018 2018 2018 ...
#>  - attr(*, "label")= chr "GSS year for this respondent"
#>  - attr(*, "format.spss")= chr "F4.0"
attributes(gss$YEAR)
#> $label
#> [1] "GSS year for this respondent"
#> 
#> $format.spss
#> [1] "F4.0"

class(gss_dta$year)
#> [1] "numeric"
str(gss_dta$year)
#>  num [1:2348] 2018 2018 2018 2018 2018 ...
#>  - attr(*, "label")= chr "gss year for this respondent "
#>  - attr(*, "format.stata")= chr "%8.0g"
attributes(gss_dta$year)
#> $label
#> [1] "gss year for this respondent "
#> 
#> $format.stata
#> [1] "%8.0g"

If a variable contains labelled values it will be imported as a haven_labelled vector, which stores the value labels in the labels attribute.

If we’re reading an SPSS file with user_na = TRUE and the variable contains user-defined missing values, it will be imported as a haven_labelled_spss vector. This is an extension of the haven_labelled class that also records user-defined missing values in the na_values or na_range attribute as appropriate.

# A "labelled" categorical variable
class(gss$HEALTH)
#> [1] "haven_labelled_spss" "haven_labelled"      "vctrs_vctr"         
#> [4] "double"
str(gss$HEALTH)
#>  dbl+lbl [1:2348] 2, 1, 0, 0, 1, 2, 2, 1, 0, 0, 1, 0, 4, 1, 0, 2, 0, 1, 1, ...
#>  @ label      : chr "Condition of health"
#>  @ na_values  : num [1:3] 0 8 9
#>  @ format.spss: chr "F1.0"
#>  @ labels     : Named num [1:7] 0 1 2 3 4 8 9
#>   ..- attr(*, "names")= chr [1:7] "IAP" "EXCELLENT" "GOOD" "FAIR" ...
attributes(gss$HEALTH)
#> $label
#> [1] "Condition of health"
#> 
#> $na_values
#> [1] 0 8 9
#> 
#> $class
#> [1] "haven_labelled_spss" "haven_labelled"      "vctrs_vctr"         
#> [4] "double"             
#> 
#> $format.spss
#> [1] "F1.0"
#> 
#> $labels
#>       IAP EXCELLENT      GOOD      FAIR      POOR        DK        NA 
#>         0         1         2         3         4         8         9

class(gss_dta$health)
#> [1] "haven_labelled" "vctrs_vctr"     "double"
str(gss_dta$health)
#>  dbl+lbl [1:2348]     2,     1, NA(i), NA(i),     1,     2,     2,     1, N...
#>  @ label       : chr "condition of health"
#>  @ format.stata: chr "%8.0g"
#>  @ labels      : Named num [1:7] 1 2 3 4 NA NA NA
#>   ..- attr(*, "names")= chr [1:7] "excellent" "good" "fair" "poor" ...
attributes(gss_dta$health)
#> $label
#> [1] "condition of health"
#> 
#> $format.stata
#> [1] "%8.0g"
#> 
#> $class
#> [1] "haven_labelled" "vctrs_vctr"     "double"        
#> 
#> $labels
#> excellent      good      fair      poor        DK       IAP        NA 
#>         1         2         3         4        NA        NA        NA

One immediate advantage of labelled vectors is that value labels are used in data frame printing when using tibble (and by extension the wider tidyverse) and other packages using the pillar printing methods.

# Print helpers
gss %>% count(HEALTH)
#> # A tibble: 6 x 2
#>          HEALTH     n
#>       <dbl+lbl> <int>
#> 1 0 (NA) [IAP]    774
#> 2 1 [EXCELLENT]   359
#> 3 2 [GOOD]        771
#> 4 3 [FAIR]        355
#> 5 4 [POOR]         84
#> 6 8 (NA) [DK]       5

gss %>% count(HELPSICK)
#> # A tibble: 8 x 2
#>                 HELPSICK     n
#>                <dbl+lbl> <int>
#> 1 0 (NA) [IAP]             785
#> 2 1 [GOVT SHOULD HELP]     528
#> 3 2                        306
#> 4 3 [AGREE WITH BOTH]      448
#> 5 4                        129
#> 6 5 [PEOPLE HELP SELVES]   117
#> 7 8 (NA) [DK]               33
#> 8 9 (NA) [NA]                2

Using head() on a variable will print a nicely formatted summary of the attached metadata, excluding formats.

head(gss$HEALTH)
#> <labelled_spss<double>[6]>: Condition of health
#> [1] 2 1 0 0 1 2
#> Missing values: 0, 8, 9
#> 
#> Labels:
#>  value     label
#>      0       IAP
#>      1 EXCELLENT
#>      2      GOOD
#>      3      FAIR
#>      4      POOR
#>      8        DK
#>      9        NA

head(gss_dta$health)
#> <labelled<double>[6]>: condition of health
#> [1]     2     1 NA(i) NA(i)     1     2
#> 
#> Labels:
#>  value     label
#>      1 excellent
#>      2      good
#>      3      fair
#>      4      poor
#>  NA(d)        DK
#>  NA(i)       IAP
#>  NA(n)        NA

3.1.2 Missing values

3.1.2.1 User-defined missing values (SPSS)

SPSS allows for user-defined missing values, where the user can tag a discrete set or a range of values to be treated as missing.

These are relatively simple to deal with in haven, and allow for easy differential treatment of missing values in formatting and recoding methods as we’ll see later. They get a handy (NA) prefix when printed in a tibble and return TRUE from is.na().

# Missing values 0, 8 and 9
head(gss$HEALTH)
#> <labelled_spss<double>[6]>: Condition of health
#> [1] 2 1 0 0 1 2
#> Missing values: 0, 8, 9
#> 
#> Labels:
#>  value     label
#>      0       IAP
#>      1 EXCELLENT
#>      2      GOOD
#>      3      FAIR
#>      4      POOR
#>      8        DK
#>      9        NA

gss %>% count(HEALTH, is.na(HEALTH))
#> # A tibble: 6 x 3
#>          HEALTH `is.na(HEALTH)`     n
#>       <dbl+lbl> <lgl>           <int>
#> 1 0 (NA) [IAP]  TRUE              774
#> 2 1 [EXCELLENT] FALSE             359
#> 3 2 [GOOD]      FALSE             771
#> 4 3 [FAIR]      FALSE             355
#> 5 4 [POOR]      FALSE              84
#> 6 8 (NA) [DK]   TRUE                5

One gotcha in our experience is that although they return TRUE from is.na() they are not considered equivalent to NA in other contexts.

# These are not equivalent!
gss %>% count(HEALTH, is.na(HEALTH), HEALTH %in% NA)
#> # A tibble: 6 x 4
#>          HEALTH `is.na(HEALTH)` `HEALTH %in% NA`     n
#>       <dbl+lbl> <lgl>           <lgl>            <int>
#> 1 0 (NA) [IAP]  TRUE            FALSE              774
#> 2 1 [EXCELLENT] FALSE           FALSE              359
#> 3 2 [GOOD]      FALSE           FALSE              771
#> 4 3 [FAIR]      FALSE           FALSE              355
#> 5 4 [POOR]      FALSE           FALSE               84
#> 6 8 (NA) [DK]   TRUE            FALSE                5

Ranges work similarly to discrete values: every value that falls within the range is treated as missing, as you would expect.

# Missing value range 13 - 99, plus discrete value 0
head(gss$RINCOME)
#> <labelled_spss<double>[6]>: Respondents income
#> [1] 13  0 12 12  0  0
#> Missing values: 0
#> Missing range:  [13, 99]
#> 
#> Labels:
#>  value          label
#>      0            IAP
#>      1       LT $1000
#>      2  $1000 TO 2999
#>      3  $3000 TO 3999
#>      4  $4000 TO 4999
#>      5  $5000 TO 5999
#>      6  $6000 TO 6999
#>      7  $7000 TO 7999
#>      8  $8000 TO 9999
#>      9 $10000 - 14999
#>     10 $15000 - 19999
#>     11 $20000 - 24999
#>     12 $25000 OR MORE
#>     13        REFUSED
#>     98             DK
#>     99             NA

gss %>% count(RINCOME, is.na(RINCOME))
#> # A tibble: 15 x 3
#>                RINCOME `is.na(RINCOME)`     n
#>              <dbl+lbl> <lgl>            <int>
#>  1  0 (NA) [IAP]       TRUE               899
#>  2  1 [LT $1000]       FALSE               33
#>  3  2 [$1000 TO 2999]  FALSE               32
#>  4  3 [$3000 TO 3999]  FALSE               32
#>  5  4 [$4000 TO 4999]  FALSE               21
#>  6  5 [$5000 TO 5999]  FALSE               21
#>  7  6 [$6000 TO 6999]  FALSE               12
#>  8  7 [$7000 TO 7999]  FALSE               18
#>  9  8 [$8000 TO 9999]  FALSE               33
#> 10  9 [$10000 - 14999] FALSE               94
#> 11 10 [$15000 - 19999] FALSE               61
#> 12 11 [$20000 - 24999] FALSE              107
#> 13 12 [$25000 OR MORE] FALSE              851
#> 14 13 (NA) [REFUSED]   TRUE               108
#> 15 98 (NA) [DK]        TRUE                26

3.1.2.2 Tagged missing values (SAS, Stata)

SAS and Stata take the opposite approach to SPSS - rather than tagging a value as missing, they tag missing data with a “type”. This is also supported by haven, albeit in a slightly different way.

Tagged missing values appear in the label set as an NA with an attached letter flagging the type.

head(gss_dta$health)
#> <labelled<double>[6]>: condition of health
#> [1]     2     1 NA(i) NA(i)     1     2
#> 
#> Labels:
#>  value     label
#>      1 excellent
#>      2      good
#>      3      fair
#>      4      poor
#>  NA(d)        DK
#>  NA(i)       IAP
#>  NA(n)        NA

Treatment of tagged missing values can be a bit funny compared to user-defined missing values. Note that, in the example below, a straight count for “IAP” gives 779 rather than the 774 seen in the SPSS example, because it is actually combining the “IAP” (774) and “DK” (5) values.

gss_dta %>% count(health, is.na(health))
#> # A tibble: 5 x 3
#>              health `is.na(health)`     n
#>           <dbl+lbl> <lgl>           <int>
#> 1     1 [excellent] FALSE             359
#> 2     2 [good]      FALSE             771
#> 3     3 [fair]      FALSE             355
#> 4     4 [poor]      FALSE              84
#> 5 NA(i) [IAP]       TRUE              779

In many circumstances tagged NA values will be grouped together like this, which can be misleading, so they need to be treated a bit differently.

You can use na_tag() to extract the tagged type of the NA values, or is_tagged_na() to check for values with a particular tag.

gss_dta %>%
  count(
    health,
    is.na(health),
    na_tag(health)
  )
#> # A tibble: 6 x 4
#>              health `is.na(health)` `na_tag(health)`     n
#>           <dbl+lbl> <lgl>           <chr>            <int>
#> 1     1 [excellent] FALSE           <NA>               359
#> 2     2 [good]      FALSE           <NA>               771
#> 3     3 [fair]      FALSE           <NA>               355
#> 4     4 [poor]      FALSE           <NA>                84
#> 5 NA(d) [DK]        TRUE            d                    5
#> 6 NA(i) [IAP]       TRUE            i                  774

gss_dta %>%
  count(
    health,
    na_tag(health),
    is_tagged_na(health),
    is_tagged_na(health, "d")
  )
#> # A tibble: 6 x 5
#>           health `na_tag(health)` `is_tagged_na(heal… `is_tagged_na(healt…     n
#>        <dbl+lbl> <chr>            <lgl>               <lgl>                <int>
#> 1     1 [excell… <NA>             FALSE               FALSE                  359
#> 2     2 [good]   <NA>             FALSE               FALSE                  771
#> 3     3 [fair]   <NA>             FALSE               FALSE                  355
#> 4     4 [poor]   <NA>             FALSE               FALSE                   84
#> 5 NA(d) [DK]     d                TRUE                TRUE                     5
#> 6 NA(i) [IAP]    i                TRUE                FALSE                  774

3.1.2.3 Zapping

To convert tagged or user-defined missing values to a standard R NA, you can use the zap_missing() function on either a vector or a data frame.

gss %>% count(HEALTH, zap_missing(HEALTH))
#> # A tibble: 6 x 3
#>          HEALTH `zap_missing(HEALTH)`     n
#>       <dbl+lbl>             <dbl+lbl> <int>
#> 1 0 (NA) [IAP]         NA               774
#> 2 1 [EXCELLENT]         1 [EXCELLENT]   359
#> 3 2 [GOOD]              2 [GOOD]        771
#> 4 3 [FAIR]              3 [FAIR]        355
#> 5 4 [POOR]              4 [POOR]         84
#> 6 8 (NA) [DK]          NA                 5

gss_dta %>% count(health, na_tag(health), zap_missing(health))
#> # A tibble: 6 x 4
#>              health `na_tag(health)` `zap_missing(health)`     n
#>           <dbl+lbl> <chr>                        <dbl+lbl> <int>
#> 1     1 [excellent] <NA>                     1 [excellent]   359
#> 2     2 [good]      <NA>                     2 [good]        771
#> 3     3 [fair]      <NA>                     3 [fair]        355
#> 4     4 [poor]      <NA>                     4 [poor]         84
#> 5 NA(d) [DK]        d                       NA                 5
#> 6 NA(i) [IAP]       i                       NA               774

You may recall that we passed the user_na = TRUE argument to read_sav() earlier. If you use user_na = FALSE (the default), it will convert user-defined missing values to NA on the way in, which gives the same result as reading with user_na = TRUE and then calling zap_missing().

read_sav("data/gss/GSS2018.sav", user_na = TRUE) %>%
  zap_missing() %>%
  count(HEALTH)
#> # A tibble: 5 x 2
#>           HEALTH     n
#>        <dbl+lbl> <int>
#> 1  1 [EXCELLENT]   359
#> 2  2 [GOOD]        771
#> 3  3 [FAIR]        355
#> 4  4 [POOR]         84
#> 5 NA               779

read_sav("data/gss/GSS2018.sav", user_na = FALSE) %>%
  count(HEALTH)
#> # A tibble: 5 x 2
#>           HEALTH     n
#>        <dbl+lbl> <int>
#> 1  1 [EXCELLENT]   359
#> 2  2 [GOOD]        771
#> 3  3 [FAIR]        355
#> 4  4 [POOR]         84
#> 5 NA               779