3.2 Converting labelled vectors
Labelled datasets are great for accessing metadata in the R console, but many functions need base R data types.
3.2.1 Factors
The labelled package has a couple of helper functions for converting labelled vectors to factors and character vectors. The `to_factor()` function is versatile and can manipulate labels in various ways on the way to factor levels.
The `levels` argument controls how levels are derived from the value labels.
# Convert to factors, using the labels as levels
gss %>% count(HEALTH = to_factor(HEALTH))
#> # A tibble: 6 x 2
#> HEALTH n
#> <fct> <int>
#> 1 IAP 774
#> 2 EXCELLENT 359
#> 3 GOOD 771
#> 4 FAIR 355
#> 5 POOR 84
#> 6 DK 5
# Include the category code in the label
gss %>% count(HEALTH = to_factor(HEALTH, levels = "prefixed"))
#> # A tibble: 6 x 2
#> HEALTH n
#> <fct> <int>
#> 1 [0] IAP 774
#> 2 [1] EXCELLENT 359
#> 3 [2] GOOD 771
#> 4 [3] FAIR 355
#> 5 [4] POOR 84
#> 6 [8] DK 5
# Use the category code instead of the label
gss %>% count(HEALTH = to_factor(HEALTH, levels = "values"))
#> # A tibble: 6 x 2
#> HEALTH n
#> <fct> <int>
#> 1 0 774
#> 2 1 359
#> 3 2 771
#> 4 3 355
#> 5 4 84
#> 6 8 5
User-defined missing values can be removed from the levels and converted to `NA` using `user_na_to_na = TRUE`.
# Remove user-defined NA values
gss %>% count(HEALTH = to_factor(HEALTH, user_na_to_na = TRUE))
#> # A tibble: 5 x 2
#> HEALTH n
#> <fct> <int>
#> 1 EXCELLENT 359
#> 2 GOOD 771
#> 3 FAIR 355
#> 4 POOR 84
#> 5 <NA> 779
Labels that don’t exist in the data can be dropped from the levels using `drop_unused_labels = TRUE`.
# Drop unused labels
table(to_factor(gss$HEALTH))
#>
#> IAP EXCELLENT GOOD FAIR POOR DK NA
#> 774 359 771 355 84 5 0
table(to_factor(gss$HEALTH, drop_unused_labels = TRUE))
#>
#> IAP EXCELLENT GOOD FAIR POOR DK
#> 774 359 771 355 84 5
Factor levels can easily be sorted by either value or label using the `sort_levels` argument. By default, they are sorted by value.
# Sort by value
levels(to_factor(gss$HEALTH, levels = "prefixed", sort_levels = "values"))
#> [1] "[0] IAP" "[1] EXCELLENT" "[2] GOOD" "[3] FAIR"
#> [5] "[4] POOR" "[8] DK" "[9] NA"
# Sort by label
levels(to_factor(gss$HEALTH, levels = "prefixed", sort_levels = "labels"))
#> [1] "[8] DK" "[1] EXCELLENT" "[3] FAIR" "[2] GOOD"
#> [5] "[0] IAP" "[9] NA" "[4] POOR"
# Sort descending
levels(to_factor(gss$HEALTH, levels = "prefixed", sort_levels = "values", decreasing = TRUE))
#> [1] "[9] NA" "[8] DK" "[4] POOR" "[3] FAIR"
#> [5] "[2] GOOD" "[1] EXCELLENT" "[0] IAP"
By default, unlabelled values will be included, with the value itself used as the factor level. They can be converted to `NA` instead with `nolabel_to_na = TRUE`.
gss %>% count(HELPSICK = to_factor(HELPSICK))
#> # A tibble: 8 x 2
#> HELPSICK n
#> <fct> <int>
#> 1 IAP 785
#> 2 GOVT SHOULD HELP 528
#> 3 2 306
#> 4 AGREE WITH BOTH 448
#> 5 4 129
#> 6 PEOPLE HELP SELVES 117
#> 7 DK 33
#> 8 NA 2
# Convert unlabelled levels to NA
gss %>% count(HELPSICK = to_factor(HELPSICK, nolabel_to_na = TRUE))
#> # A tibble: 7 x 2
#> HELPSICK n
#> <fct> <int>
#> 1 IAP 785
#> 2 GOVT SHOULD HELP 528
#> 3 AGREE WITH BOTH 448
#> 4 PEOPLE HELP SELVES 117
#> 5 DK 33
#> 6 NA 2
#> 7 <NA> 435
And all labelled vectors in the data frame can be converted to factors in one go.
# Convert all labelled vectors to factors
to_factor(gss)
#> # A tibble: 2,348 x 1,065
#> ABANY ABDEFECT ABFELEGL ABHELP1 ABHELP2 ABHELP3 ABHELP4 ABHLTH ABINSPAY
#> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
#> 1 NO YES IAP Yes Yes Yes Yes YES People …
#> 2 YES YES It depe… No No No No YES People …
#> 3 IAP IAP IAP Yes No Yes Yes IAP People …
#> 4 IAP IAP Should Yes Yes Yes Yes IAP People …
#> 5 NO YES IAP No No No Yes YES People …
#> 6 YES YES Should Yes Yes Yes Yes YES People …
#> 7 YES YES It depe… Yes No Yes Yes YES People …
#> 8 NO YES IAP Yes No Yes Yes YES No answ…
#> 9 IAP IAP It depe… Yes Yes Yes Yes IAP People …
#> 10 IAP IAP IAP Yes No No Yes IAP People …
#> # … with 2,338 more rows, and 1,056 more variables: ABMEDGOV1 <fct>,
#> # ABMEDGOV2 <fct>, ABMELEGL <fct>, ABMORAL <fct>, ABNOMORE <fct>,
#> # ABPOOR <fct>, ABPOORW <fct>, ABRAPE <fct>, ABSINGLE <fct>, ABSTATE1 <fct>,
#> # ABSTATE2 <fct>, ACQNTSEX <fct>, ACTSSOC <fct>, ADMINCONSENT <fct>,
#> # ADULTS <fct>, ADVFRONT <fct>, AFFRMACT <fct>, AFRAIDOF <fct>,
#> # AFTERLIF <fct>, AGE <fct>, AGED <fct>, AGEKDBRN <fct>, ANCESTRS <fct>,
#> # ARTHRTIS <fct>, ASTROLGY <fct>, ASTROSCI <fct>, ATHEISTS <fct>,
#> # ATTEND <fct>, ATTEND12 <fct>, ATTENDMA <fct>, ATTENDPA <fct>, AWAY1 <fct>,
#> # AWAY11 <fct>, AWAY2 <fct>, AWAY3 <fct>, AWAY4 <fct>, AWAY5 <fct>,
#> # AWAY6 <fct>, AWAY7 <fct>, BABIES <fct>, BACKPAIN <fct>, BALLOT <fct>,
#> # BALNEG <fct>, BALPOS <fct>, BEFAIR <fct>, BETRLANG <fct>, BIBLE <fct>,
#> # BIGBANG <fct>, BIGBANG1 <fct>, BIGBANG2 <fct>, BIRD <fct>, BIRDB4 <fct>,
#> # BORN <fct>, BOYORGRL <fct>, BREAKDWN <fct>, BUDDHSTS <fct>, BUYESOP <fct>,
#> # BUYVALUE <fct>, CANTRUST <fct>, CAPPUN <fct>, CAT <fct>, CATB4 <fct>,
#> # CHARACTR <fct>, CHEMGEN <fct>, CHILDS <fct>, CHLDIDEL <fct>,
#> # CHRISTNS <fct>, CHURHPOW <fct>, CLASS <fct>, CLERGVTE <fct>,
#> # CLOSETO1 <fct>, CLOSETO2 <fct>, CLOSETO3 <fct>, CLOSETO4 <fct>,
#> # CLOSETO5 <fct>, CNTCTFAM <fct>, CNTCTFRD <fct>, CNTCTKID <fct>,
#> # CNTCTPAR <fct>, CNTCTSIB <fct>, CODEG <fct>, CODEN <fct>, COEDUC <fct>,
#> # COEVWORK <fct>, COFUND <fct>, COHORT <fct>, COHRS1 <fct>, COHRS2 <fct>,
#> # COIND10 <fct>, COISCO08 <fct>, COJew <fct>, COLATH <fct>, COLCOM <fct>,
#> # COLDEG1 <fct>, COLHOMO <fct>, COLMIL <fct>, COLMSLM <fct>, COLRAC <fct>,
#> # COLSCI <fct>, COLSCINM <fct>, …
3.2.2 Character vectors
The `to_character()` function allows you to convert to a character vector instead of a factor, using the same general conversion arguments as `to_factor()`.
# Convert to a character variable
gss %>% count(HEALTH = to_character(HEALTH, levels = "prefixed"))
#> # A tibble: 6 x 2
#> HEALTH n
#> <chr> <int>
#> 1 [0] IAP 774
#> 2 [1] EXCELLENT 359
#> 3 [2] GOOD 771
#> 4 [3] FAIR 355
#> 5 [4] POOR 84
#> 6 [8] DK 5
# Remove user-defined NA values
gss %>% count(HEALTH = to_character(HEALTH, user_na_to_na = TRUE))
#> # A tibble: 5 x 2
#> HEALTH n
#> <chr> <int>
#> 1 EXCELLENT 359
#> 2 FAIR 355
#> 3 GOOD 771
#> 4 POOR 84
#> 5 <NA> 779