3.2 Converting labelled vectors

Labelled datasets are great for accessing metadata in the R console, but many functions need base R data types.

3.2.1 Factors

The labelled package has a couple of helper functions for converting labelled vectors to factors and character vectors. The to_factor() function is versatile, and can manipulate labels in various ways on the way to factor levels.

The levels argument controls how levels are derived from the value labels.

# Convert to factors, using the labels as levels
gss %>% count(HEALTH = to_factor(HEALTH))
#> # A tibble: 6 x 2
#>   HEALTH        n
#>   <fct>     <int>
#> 1 IAP         774
#> 2 EXCELLENT   359
#> 3 GOOD        771
#> 4 FAIR        355
#> 5 POOR         84
#> 6 DK            5

# Include the category code in the label
gss %>% count(HEALTH = to_factor(HEALTH, levels = "prefixed"))
#> # A tibble: 6 x 2
#>   HEALTH            n
#>   <fct>         <int>
#> 1 [0] IAP         774
#> 2 [1] EXCELLENT   359
#> 3 [2] GOOD        771
#> 4 [3] FAIR        355
#> 5 [4] POOR         84
#> 6 [8] DK            5

# Use the category code instead of the label
gss %>% count(HEALTH = to_factor(HEALTH, levels = "values"))
#> # A tibble: 6 x 2
#>   HEALTH     n
#>   <fct>  <int>
#> 1 0        774
#> 2 1        359
#> 3 2        771
#> 4 3        355
#> 5 4         84
#> 6 8          5

User-defined missing values can be removed from the levels and converted to NA using user_na_to_na = TRUE.

# Remove user-defined NA values
gss %>% count(HEALTH = to_factor(HEALTH, user_na_to_na = TRUE))
#> # A tibble: 5 x 2
#>   HEALTH        n
#>   <fct>     <int>
#> 1 EXCELLENT   359
#> 2 GOOD        771
#> 3 FAIR        355
#> 4 POOR         84
#> 5 <NA>        779

Labels that don’t exist in the data can be dropped from the levels using drop_unused_labels = TRUE.

# Drop unused labels
table(to_factor(gss$HEALTH))
#> 
#>       IAP EXCELLENT      GOOD      FAIR      POOR        DK        NA 
#>       774       359       771       355        84         5         0
table(to_factor(gss$HEALTH, drop_unused_labels = TRUE))
#> 
#>       IAP EXCELLENT      GOOD      FAIR      POOR        DK 
#>       774       359       771       355        84         5

Factor levels can easily be sorted by either value or label using the sort_levels argument. By default, they are sorted by value.

# Sort by value
levels(to_factor(gss$HEALTH, levels = "prefixed", sort_levels = "values"))
#> [1] "[0] IAP"       "[1] EXCELLENT" "[2] GOOD"      "[3] FAIR"     
#> [5] "[4] POOR"      "[8] DK"        "[9] NA"

# Sort by label
levels(to_factor(gss$HEALTH, levels = "prefixed", sort_levels = "labels"))
#> [1] "[8] DK"        "[1] EXCELLENT" "[3] FAIR"      "[2] GOOD"     
#> [5] "[0] IAP"       "[9] NA"        "[4] POOR"

# Sort descending
levels(to_factor(gss$HEALTH, levels = "prefixed", sort_levels = "values", decreasing = TRUE))
#> [1] "[9] NA"        "[8] DK"        "[4] POOR"      "[3] FAIR"     
#> [5] "[2] GOOD"      "[1] EXCELLENT" "[0] IAP"

By default, unlabelled values are kept, with the value itself used as the factor level. They can be converted to NA instead with nolabel_to_na = TRUE.

gss %>% count(HELPSICK = to_factor(HELPSICK))
#> # A tibble: 8 x 2
#>   HELPSICK               n
#>   <fct>              <int>
#> 1 IAP                  785
#> 2 GOVT SHOULD HELP     528
#> 3 2                    306
#> 4 AGREE WITH BOTH      448
#> 5 4                    129
#> 6 PEOPLE HELP SELVES   117
#> 7 DK                    33
#> 8 NA                     2

# Convert unlabelled levels to NA
gss %>% count(HELPSICK = to_factor(HELPSICK, nolabel_to_na = TRUE))
#> # A tibble: 7 x 2
#>   HELPSICK               n
#>   <fct>              <int>
#> 1 IAP                  785
#> 2 GOVT SHOULD HELP     528
#> 3 AGREE WITH BOTH      448
#> 4 PEOPLE HELP SELVES   117
#> 5 DK                    33
#> 6 NA                     2
#> 7 <NA>                 435

And all labelled vectors in the data frame can be converted to factors in one go.

# Convert all labelled vectors to factors
to_factor(gss)
#> # A tibble: 2,348 x 1,065
#>    ABANY ABDEFECT ABFELEGL ABHELP1 ABHELP2 ABHELP3 ABHELP4 ABHLTH ABINSPAY
#>    <fct> <fct>    <fct>    <fct>   <fct>   <fct>   <fct>   <fct>  <fct>   
#>  1 NO    YES      IAP      Yes     Yes     Yes     Yes     YES    People …
#>  2 YES   YES      It depe… No      No      No      No      YES    People …
#>  3 IAP   IAP      IAP      Yes     No      Yes     Yes     IAP    People …
#>  4 IAP   IAP      Should   Yes     Yes     Yes     Yes     IAP    People …
#>  5 NO    YES      IAP      No      No      No      Yes     YES    People …
#>  6 YES   YES      Should   Yes     Yes     Yes     Yes     YES    People …
#>  7 YES   YES      It depe… Yes     No      Yes     Yes     YES    People …
#>  8 NO    YES      IAP      Yes     No      Yes     Yes     YES    No answ…
#>  9 IAP   IAP      It depe… Yes     Yes     Yes     Yes     IAP    People …
#> 10 IAP   IAP      IAP      Yes     No      No      Yes     IAP    People …
#> # … with 2,338 more rows, and 1,056 more variables: ABMEDGOV1 <fct>,
#> #   ABMEDGOV2 <fct>, ABMELEGL <fct>, ABMORAL <fct>, ABNOMORE <fct>,
#> #   ABPOOR <fct>, ABPOORW <fct>, ABRAPE <fct>, ABSINGLE <fct>, ABSTATE1 <fct>,
#> #   ABSTATE2 <fct>, ACQNTSEX <fct>, ACTSSOC <fct>, ADMINCONSENT <fct>,
#> #   ADULTS <fct>, ADVFRONT <fct>, AFFRMACT <fct>, AFRAIDOF <fct>,
#> #   AFTERLIF <fct>, AGE <fct>, AGED <fct>, AGEKDBRN <fct>, ANCESTRS <fct>,
#> #   ARTHRTIS <fct>, ASTROLGY <fct>, ASTROSCI <fct>, ATHEISTS <fct>,
#> #   ATTEND <fct>, ATTEND12 <fct>, ATTENDMA <fct>, ATTENDPA <fct>, AWAY1 <fct>,
#> #   AWAY11 <fct>, AWAY2 <fct>, AWAY3 <fct>, AWAY4 <fct>, AWAY5 <fct>,
#> #   AWAY6 <fct>, AWAY7 <fct>, BABIES <fct>, BACKPAIN <fct>, BALLOT <fct>,
#> #   BALNEG <fct>, BALPOS <fct>, BEFAIR <fct>, BETRLANG <fct>, BIBLE <fct>,
#> #   BIGBANG <fct>, BIGBANG1 <fct>, BIGBANG2 <fct>, BIRD <fct>, BIRDB4 <fct>,
#> #   BORN <fct>, BOYORGRL <fct>, BREAKDWN <fct>, BUDDHSTS <fct>, BUYESOP <fct>,
#> #   BUYVALUE <fct>, CANTRUST <fct>, CAPPUN <fct>, CAT <fct>, CATB4 <fct>,
#> #   CHARACTR <fct>, CHEMGEN <fct>, CHILDS <fct>, CHLDIDEL <fct>,
#> #   CHRISTNS <fct>, CHURHPOW <fct>, CLASS <fct>, CLERGVTE <fct>,
#> #   CLOSETO1 <fct>, CLOSETO2 <fct>, CLOSETO3 <fct>, CLOSETO4 <fct>,
#> #   CLOSETO5 <fct>, CNTCTFAM <fct>, CNTCTFRD <fct>, CNTCTKID <fct>,
#> #   CNTCTPAR <fct>, CNTCTSIB <fct>, CODEG <fct>, CODEN <fct>, COEDUC <fct>,
#> #   COEVWORK <fct>, COFUND <fct>, COHORT <fct>, COHRS1 <fct>, COHRS2 <fct>,
#> #   COIND10 <fct>, COISCO08 <fct>, COJew <fct>, COLATH <fct>, COLCOM <fct>,
#> #   COLDEG1 <fct>, COLHOMO <fct>, COLMIL <fct>, COLMSLM <fct>, COLRAC <fct>,
#> #   COLSCI <fct>, COLSCINM <fct>, …

3.2.2 Character vectors

The to_character() function allows you to convert to a character vector instead of a factor, using the same general conversion arguments as to_factor().

# Convert to a character variable
gss %>% count(HEALTH = to_character(HEALTH, levels = "prefixed"))
#> # A tibble: 6 x 2
#>   HEALTH            n
#>   <chr>         <int>
#> 1 [0] IAP         774
#> 2 [1] EXCELLENT   359
#> 3 [2] GOOD        771
#> 4 [3] FAIR        355
#> 5 [4] POOR         84
#> 6 [8] DK            5

# Remove user-defined NA values
gss %>% count(HEALTH = to_character(HEALTH, user_na_to_na = TRUE))
#> # A tibble: 5 x 2
#>   HEALTH        n
#>   <chr>     <int>
#> 1 EXCELLENT   359
#> 2 FAIR        355
#> 3 GOOD        771
#> 4 POOR         84
#> 5 <NA>        779