Applying the same operation with multiple columns of similar names in R - r

I'm wondering if there is a way to simplify this code to avoid repetition givent that the column names are similar excepting one character that increases for each operation.
out <- df %>%
mutate (ATN1.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch1/RefCh1)),
ATN2.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch2/RefCh2)),
ATN3.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch3/RefCh3)),
ATN4.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch4/RefCh4)),
ATN5.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch5/RefCh5)),
ATN6.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch6/RefCh6)),
ATN7.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch7/RefCh7)))
This is a small subset of my data if you wanna play with it
df = structure(list(Status = c(1, 17, 1, 1, 1, 1, 2, 0, 0, 0), ATN1.1 = c(NA,
NA, NA, NA, NA, NA, 0, 0.187761662304176, 0.373310604025045,
0.570139498143909), ATN2.1 = c(NA, NA, NA, NA, NA, NA, 0, 0.136443172947395,
0.269071359915515, 0.407552762179439), ATN3.1 = c(NA, NA, NA,
NA, NA, NA, 0, 0.113733164068766, 0.224219770615697, 0.336923929839777
), ATN4.1 = c(NA, NA, NA, NA, NA, NA, 0, 0.0942969310983806,
0.186894753425896, 0.279629737677226), ATN5.1 = c(NA, NA, NA,
NA, NA, NA, 0, 0.0753327883349684, 0.149617411430523, 0.22690457078205
), ATN6.1 = c(NA, NA, NA, NA, NA, NA, 0, 0.0493106158715682,
0.100348708536177, 0.155828822066352), ATN7.1 = c(NA, NA, NA,
NA, NA, NA, 0, 0.0526398637123631, 0.103191368342497, 0.154644102801848
), ATN0.1.1 = c(NA, NA, NA, NA, NA, NA, 15.054824247419, 15.054824247419,
15.054824247419, 15.054824247419), ATN0.2.1 = c(NA, NA, NA, NA,
NA, NA, 24.1338734012274, 24.1338734012274, 24.1338734012274,
24.1338734012274), ATN0.3.1 = c(NA, NA, NA, NA, NA, NA, 27.4233147524393,
27.4233147524393, 27.4233147524393, 27.4233147524393), ATN0.4.1 = c(NA,
NA, NA, NA, NA, NA, 20.8560560826831, 20.8560560826831, 20.8560560826831,
20.8560560826831), ATN0.5.1 = c(NA, NA, NA, NA, NA, NA, 17.1645092239121,
17.1645092239121, 17.1645092239121, 17.1645092239121), ATN0.6.1 = c(NA,
NA, NA, NA, NA, NA, 4.4180613710882, 4.4180613710882, 4.4180613710882,
4.4180613710882), ATN0.7.1 = c(NA, NA, NA, NA, NA, NA, 10.8192165605015,
10.8192165605015, 10.8192165605015, 10.8192165605015), Sen1Ch1 = c(0,
99, 0, 783198, 785643, 787093, 786717, 785935, 784922, 783784
), Sen2Ch1 = c(0, 324, 0, 793643, 796398, 798041, 798658, 798957,
799003, 798951), Sen1Ch2 = c(0, 53, 0, 739627, 741339, 742308,
741804, 741195, 740403, 739520), Sen2Ch2 = c(0, 416, 0, 743716,
745420, 746399, 746532, 746599, 746467, 746279), Sen1Ch3 = c(0,
49, 0, 720709, 722113, 722900, 722515, 722002, 721364, 720681
), Sen2Ch3 = c(0, 294, 0, 734485, 735877, 736650, 736749, 736783,
736664, 736513), Sen1Ch4 = c(0, 61, 0, 732332, 732529, 732487,
731524, 730678, 729723, 728756), Sen2Ch4 = c(0, 222, 0, 737261,
737172, 736976, 736329, 735869, 735302, 734762), Sen1Ch5 = c(0,
59, 0, 765776, 767327, 768116, 767883, 767617, 767121, 766567
), Sen2Ch5 = c(0, 248, 0, 775632, 777074, 777800, 777883, 777970,
777832, 777655), Sen1Ch6 = c(0, 57, 0, 899145, 901398, 902644,
902723, 902737, 902436, 902095), Sen2Ch6 = c(0, 352, 0, 926157,
928263, 929423, 929746, 930043, 930042, 930025), Sen1Ch7 = c(0,
45, 0, 845802, 848332, 849736, 849960, 850137, 849979, 849764
), Sen2Ch7 = c(0, 360, 0, 867160, 869852, 871321, 871830, 872308,
872428, 872500), RefCh1 = c(0, 10100, 0, 908802, 911770, 913546,
914536, 915344, 915862, 916336), RefCh2 = c(0, 6200, 0, 940232,
942473, 943743, 944281, 944794, 945037, 945218), RefCh3 = c(0,
6200, 0, 947069, 948944, 950017, 950484, 950890, 951100, 951271
), RefCh4 = c(0, 14700, 0, 900977, 901433, 901543, 901167, 900974,
900630, 900271), RefCh5 = c(0, 8250, 0, 908355, 910304, 911295,
911674, 912045, 912133, 912179), RefCh6 = c(0, 6200, 0, 939365,
941703, 942978, 943500, 943980, 944147, 944314), RefCh7 = c(0,
6200, 0, 941728, 944713, 946375, 947078, 947774, 948077, 948325
)), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"
))

You can feed dynamic variable names to mutate with !!sym for example:
for(i in 1:7){
out <- df %>%
mutate(!!sym(sprintf("ATN%s.1",i)) := ifelse(Status == 1, NA_integer_, -100 * log(!!sym(paste0("Sen1Ch",i))/!!sym(paste0("RefCh",i)))))
}
Note you need := inside the mutate.

Here is a base r solution with mapply. First define an auxiliary function f to make the code more readable, then get the column names to be changed and that take part in the formula with regular expressions, finally, csall the function f in a mapply loop.
f <- function(x, y, Status) {
ifelse(Status == 1, NA_integer_, -100 * log(x/y))
}
atn <- grep("^ATN\\d\\.1$", names(df), value = TRUE)
sen1ch <- grep("^Sen1Ch", names(df), value = TRUE)
refch <- grep("^RefCh", names(df), value = TRUE)
df[atn] <- mapply(f, df[sen1ch], df[refch], MoreArgs = list(Status = df$Status))
df
#> # A tibble: 10 x 36
#> Status ATN1.1 ATN2.1 ATN3.1 ATN4.1 ATN5.1 ATN6.1 ATN7.1 ATN0.1.1 ATN0.2.1
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 NA NA NA NA NA NA NA NA NA
#> 2 17 463. 476. 484. 548. 494. 469. 493. NA NA
#> 3 1 NA NA NA NA NA NA NA NA NA
#> 4 1 NA NA NA NA NA NA NA NA NA
#> 5 1 NA NA NA NA NA NA NA NA NA
#> 6 1 NA NA NA NA NA NA NA NA NA
#> 7 2 15.1 24.1 27.4 20.9 17.2 4.42 10.8 15.1 24.1
#> 8 0 15.2 24.3 27.5 21.0 17.2 4.47 10.9 15.1 24.1
#> 9 0 15.4 24.4 27.6 21.0 17.3 4.52 10.9 15.1 24.1
#> 10 0 15.6 24.5 27.8 21.1 17.4 4.57 11.0 15.1 24.1
#> # ... with 26 more variables: ATN0.3.1 <dbl>, ATN0.4.1 <dbl>, ATN0.5.1 <dbl>,
#> # ATN0.6.1 <dbl>, ATN0.7.1 <dbl>, Sen1Ch1 <dbl>, Sen2Ch1 <dbl>,
#> # Sen1Ch2 <dbl>, Sen2Ch2 <dbl>, Sen1Ch3 <dbl>, Sen2Ch3 <dbl>, Sen1Ch4 <dbl>,
#> # Sen2Ch4 <dbl>, Sen1Ch5 <dbl>, Sen2Ch5 <dbl>, Sen1Ch6 <dbl>, Sen2Ch6 <dbl>,
#> # Sen1Ch7 <dbl>, Sen2Ch7 <dbl>, RefCh1 <dbl>, RefCh2 <dbl>, RefCh3 <dbl>,
#> # RefCh4 <dbl>, RefCh5 <dbl>, RefCh6 <dbl>, RefCh7 <dbl>
Created on 2022-04-14 by the reprex package (v2.0.1)

Related

Data cleaning, from cross-sectional (multiple files) to panel in RStudio: merge/gather?

I have yearly observations for individuals on different variables from 2008-2020. I have data on family (25 variables), income (15 variables), and schooling (22 variables).
Right now, have 'cleaned' every single dataset so that every column of every category has the same column name. For context, this is what my R looks like now.
The thing is, I would like to have one big dataset with all of the individuals and years in one dataframe. I know that I should/could use the innerjoin or merge function first of all sorting by 'Householdmember', and that I could use the gather function, but I am truly struggling in what order I should do this and where I should start. I've been trying a lot of things, but considering the number of dataframes, it's hard to keep track of what I'm doing. I also created lists of every category for every year because this was recommended in one method, but that did not work out...
I want to end up with a dataframe that looks similar to this:
Individual
Year
Var1
Var2
1
2008
value
value
1
2009
value
value
1
2010
value
value
2
2008
value
value
2
2009
value
value
2
2010
value
value
What I should do as first step... If I merge the dataframes, I don't think R knows which values correspond to which year...
> head(fam08)
# A tibble: 6 x 25
HouseholdMember RandomChild YearBirthRandom Gender Age FatherBirth FatherAlive MotherBirth MotherAlive Divorce SeeFather SeeMother
<dbl> <dbl+lbl> <dbl> <dbl+l> <dbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+l> <dbl+lbl> <dbl+lbl>
1 800033 16 [not ap… NA 1 [mal… 16 1952 1 [yes] 1961 1 [yes] 1 [yes] 7 [ever… 7 [ever…
2 800042 16 [not ap… NA 2 [fem… 32 1946 1 [yes] 1948 1 [yes] 2 [no] 4 [at l… 4 [at l…
3 800045 16 [not ap… NA 1 [mal… 65 1913 2 [no] 1915 2 [no] 2 [no] NA NA
4 800057 16 [not ap… NA 1 [mal… 33 1939 1 [yes] 1945 1 [yes] 1 [yes] 4 [at l… 4 [at l…
5 800076 16 [not ap… NA 2 [fem… 22 1955 1 [yes] 1955 1 [yes] 1 [yes] 5 [at l… 3 [a fe…
6 800119 16 [not ap… NA 2 [fem… 57 1908 2 [no] 1918 2 [no] 2 [no] NA NA
# … with 13 more variables: Married <dbl+lbl>, Child <dbl+lbl>, NumChild <dbl>, SchoolCH1 <dbl+lbl>, SchoolCH2 <dbl+lbl>,
# SchoolCH3 <dbl+lbl>, SchoolCH4 <dbl+lbl>, BirthCH1 <dbl>, BirthCH2 <dbl>, BirthCH3 <dbl>, BirthCH4 <dbl>, FamSatisfaction <dbl+lbl>,
# Year <dbl>
> head(fam09)
# A tibble: 6 x 25
HouseholdMember RandomChild YearBirthRandom Gender Age FatherBirth FatherAlive MotherBirth MotherAlive Divorce SeeFather SeeMother
<dbl> <dbl+lbl> <dbl> <dbl+l> <dbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+l> <dbl+lbl> <dbl+lbl>
1 800033 16 [not ap… NA 1 [mal… 17 1952 1 [yes] 1961 1 [yes] NA 5 [at l… 7 [ever…
2 800042 16 [not ap… NA 2 [fem… 33 1946 1 [yes] 1948 1 [yes] NA 4 [at l… 4 [at l…
3 800057 16 [not ap… NA 1 [mal… 34 1939 1 [yes] 1945 1 [yes] NA 3 [a fe… 3 [a fe…
4 800076 16 [not ap… NA 2 [fem… 23 1955 1 [yes] 1955 1 [yes] NA 5 [at l… 3 [a fe…
5 800119 16 [not ap… NA 2 [fem… 58 NA NA NA NA NA NA NA
6 800125 16 [not ap… NA 2 [fem… 50 NA NA 1928 1 [yes] NA NA 1 [neve…
# … with 13 more variables: Married <dbl+lbl>, Child <dbl+lbl>, NumChild <dbl>, SchoolCH1 <dbl+lbl>, SchoolCH2 <dbl+lbl>,
# SchoolCH3 <dbl+lbl>, SchoolCH4 <dbl+lbl>, BirthCH1 <dbl>, BirthCH2 <dbl>, BirthCH3 <dbl>, BirthCH4 <dbl>, FamSatisfaction <dbl+lbl>,
# Year <dbl>
dput(head(fam09,10))
structure(list(HouseholdMember = c(800033, 800042, 800057, 800076,
800119, 800125, 800170, 800186, 800201, 800204), RandomChild = structure(c(16,
16, 16, 16, 16, 16, 3, 16, 16, 16), label = "Randomly chosen child", labels = c(`child 1` = 1,
`child 2` = 2, `child 3` = 3, `child 4` = 4, `child 5` = 5, `child 6` = 6,
`child 7` = 7, `child 8` = 8, `child 9` = 9, `child 10` = 10,
`child 11` = 11, `child 12` = 12, `child 13` = 13, `child 14` = 14,
`child 15` = 15, `not applicable` = 16), class = "haven_labelled"),
YearBirthRandom = c(NA, NA, NA, NA, NA, NA, 1999, NA, NA,
NA), Gender = structure(c(1, 2, 1, 2, 2, 2, 2, 2, 1, 1), label = "Gender respondent", labels = c(male = 1,
female = 2), class = "haven_labelled"), Age = c(17, 33, 34,
23, 58, 50, 50, 69, 35, 67), FatherBirth = structure(c(1952,
1946, 1939, 1955, NA, NA, 1926, NA, 1948, NA), label = "What is the year of birth of your father?", labels = c(`I don't know` = 99999), class = "haven_labelled"),
FatherAlive = structure(c(1, 1, 1, 1, NA, NA, 1, NA, 1, NA
), label = "Is your father still alive?", labels = c(yes = 1,
no = 2, `I don't know` = 99), class = "haven_labelled"),
MotherBirth = structure(c(1961, 1948, 1945, 1955, NA, 1928,
1931, NA, 1950, NA), label = "What is the year of birth of your mother?", labels = c(`I don't know` = 99999), class = "haven_labelled"),
MotherAlive = structure(c(1, 1, 1, 1, NA, 1, 1, NA, 1, NA
), label = "Is your mother still alive?", labels = c(yes = 1,
no = 2, `I don't know` = 99), class = "haven_labelled"),
Divorce = structure(c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_
), label = "Did your own parents ever divorce?", labels = c(yes = 1,
no = 2, `my parents never had a relationship` = 3, `I don't know` = 99
), class = "haven_labelled"), SeeFather = structure(c(5,
4, 3, 5, NA, NA, 6, NA, 3, NA), label = "How often did you see your father over the past 12 months?", labels = c(never = 1,
once = 2, `a few times` = 3, `at least every month` = 4,
`at least every week` = 5, `a few times per week` = 6, `every day` = 7
), class = "haven_labelled"), SeeMother = structure(c(7,
4, 3, 3, NA, 1, 6, NA, 3, NA), label = "How often did you see your mother over the past 12 months?", labels = c(never = 1,
once = 2, `a few times` = 3, `at least every month` = 4,
`at least every week` = 5, `a few times per week` = 6, `every day` = 7
), class = "haven_labelled"), Married = structure(c(NA, 1,
2, 2, 1, 2, 1, 1, 1, 1), label = "Are you married to this partner?", labels = c(yes = 1,
no = 2), class = "haven_labelled"), Child = structure(c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), label = "Have you had any children?", labels = c(yes = 1,
no = 2), class = "haven_labelled"), NumChild = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), SchoolCH1 = structure(c(NA,
NA, NA, NA, NA, NA, 4, NA, NA, NA), label = "What school does child 1 (born in the years 1991 through 2004) attend?", labels = c(`primary school` = 1,
`school for special primary education` = 2, `secondary school` = 3,
other = 4), class = "haven_labelled"), SchoolCH2 = structure(c(NA,
NA, NA, NA, NA, NA, 3, NA, NA, NA), label = "What school does child 2 (born in the years 1991 through 2004) attend?", labels = c(`primary school` = 1,
`school for special primary education` = 2, `secondary school` = 3,
other = 4), class = "haven_labelled"), SchoolCH3 = structure(c(NA,
NA, NA, NA, NA, NA, 1, NA, NA, NA), label = "What school does child 3 (born in the years 1991 through 2004) attend?", labels = c(`primary school` = 1,
`school for special primary education` = 2, `secondary school` = 3,
other = 4), class = "haven_labelled"), SchoolCH4 = structure(c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), label = "What school does child 4 (born in the years 1991 through 2004) attend?", labels = c(`primary school` = 1,
`school for special primary education` = 2, `secondary school` = 3,
other = 4), class = "haven_labelled"), BirthCH1 = c(NA, 2005,
2007, NA, 1983, NA, 1991, 1964, NA, 1974), BirthCH2 = c(NA,
2007, NA, NA, 1985, NA, 1994, 1966, NA, 1976), BirthCH3 = c(NA,
NA, NA, NA, NA, NA, 1999, 1970, NA, NA), BirthCH4 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), FamSatisfaction = structure(c(NA,
8, 9, NA, 8, NA, 8, NA, NA, NA), label = "How satisfied are you with your family life?", labels = c(`entirely dissatisfied` = 0,
`entirely satisfied` = 10, `I don’t know` = 999), class = "haven_labelled"),
Year = c(2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009,
2009, 2009)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
I believe you could do something along these lines:
fam = bind_rows(fam_list)
inc = bind_rows(inc_list)
ws = bind_rows(ws_list)
result = fam %>%
left_join(inc, by=c("HouseholdMember", "Year")) %>%
left_join(ws, by=c("HouseholdMember", "Year"))
Output:
HouseholdMember Year fam_v1 fam_v2 fam_v3 inc_v1 inc_v2 inc_v3 ws_v1 ws_v2 ws_v3
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 8001 2008 0.609 -0.253 -1.30 0.0147 0.719 -0.765 0.120 0.974 -0.764
2 8002 2008 0.395 1.73 -0.503 0.119 -3.33 -0.798 0.325 0.664 1.65
3 8003 2008 0.562 0.157 0.243 -1.18 -0.260 0.105 1.09 0.855 1.19
4 8004 2008 1.32 0.737 -1.18 0.725 -1.82 0.356 0.362 2.04 1.76
5 8005 2008 -0.497 -0.444 -0.632 -0.534 1.63 0.984 1.29 0.614 0.576
6 8006 2008 -1.70 -0.989 -1.32 0.868 0.0979 0.468 -0.0146 1.11 0.957
7 8007 2008 -2.19 -0.419 1.69 1.34 -0.404 -1.43 -0.156 0.648 -0.186
8 8008 2008 1.48 0.350 -0.595 0.785 -0.609 1.28 -1.01 1.04 0.845
9 8009 2008 -0.315 -0.530 0.419 0.390 -0.0951 -0.755 0.135 0.696 -1.97
10 8010 2008 -0.882 1.38 2.06 -0.0757 1.53 -0.494 -1.03 1.14 1.87
Note:
I manufactured the data for this example by creating a lists of tibbles; I believe the fam_list, inc_list, and ws_list are similar to the list objects in your image. These are list of data frames / tibbles. I then use bind_rows to bind these similar structure tibbles together so that I have a three large tibbles.
I then use left_join twice to join inc and ws to fam
Input Data:
library(tidyverse)
fam_list = lapply(8:20, function(x) {
tibble(HouseholdMember = c(8000+seq(1:100)),
Year=2000+x,
fam_v1=rnorm(100),
fam_v2=rnorm(100),
fam_v3=rnorm(100)
)
})
names(fam_list) = paste0("fam_20", 8:20)
inc_list = lapply(8:20, function(x) {
tibble(HouseholdMember = c(8000+seq(1:100)),
Year=2000+x,
inc_v1=rnorm(100),
inc_v2=rnorm(100),
inc_v3=rnorm(100)
)
})
names(inc_list) = paste0("inc_20", 8:20)
ws_list = lapply(8:20, function(x) {
tibble(HouseholdMember = c(8000+seq(1:100)),
Year=2000+x,
ws_v1=rnorm(100),
ws_v2=rnorm(100),
ws_v3=rnorm(100)
)
})
names(ws_list) = paste0("ws_20", 8:20)
Input

Capitalizing the first letter of characters in a column using substr function

I have this data frame
head(df)
## patnum hospstay lowph pltct race bwt gest inout twn lol magsulf
## 1 1 34 NA 100 white 1250 35 born at duke 0 NA NA
## 2 2 9 7.250000 244 white 1370 32 born at duke 0 NA NA
## 3 3 -2 7.059998 114 black 620 23 born at duke 0 NA NA
## 4 4 40 7.250000 182 black 1480 32 born at duke 0 NA NA
## 5 5 2 6.969997 54 black 925 28 born at duke 0 NA NA
## 6 6 62 7.189999 NA white 940 28 born at duke 0 NA NA
## meth toc delivery apg1 vent pneumo pda cld sex dead
## 1 0 0 abdominal 8 0 0 0 0 female 0
## 2 1 0 abdominal 7 0 0 0 0 female 0
## 3 0 1 vaginal 1 1 0 0 NA female 1
## 4 1 0 vaginal 8 0 0 0 0 male 0
## 5 0 0 abdominal 5 1 1 0 0 female 1
## 6 1 0 abdominal 8 1 0 0 0 female 0
The race variable has 4 entries, "white", "black", "native american", "oriental". I am wanting to replace this string with capitalized versions "White", "Black", "Native American", "Oriental". I would like to do this using the substr function. I'm not sure what code to use to accomplish this. I was provided an example below, where the
day_full = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")
substr(day_full_1, nchar(day_full_1)-2, nchar(day_full_1)) = "DAY"
The result is: "SunDAY", "MonDAY", "TuesDAY", "WednesDAY", "ThursDAY", "FriDAY", "SaturDAY", "SunDAY"
This is similar to what I want to do, but I only want to have the first letter of each of the races to be capitalized. How would I translate this to make each first letter of the 4 races capital?
This is the solution I've tried now.
substr(SB_xlsx$race, 1, 1) <- toupper(substr(SB_xlsx$race, 1, 1))
substr(SB_xlsx$race, 1, 1)
## structure(list(patnum = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
## 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), hospstay = c(34,
## 9, -2, 40, 2, 62, 32, NA, NA, 28, 38, NA, 62, 69, 1, 93, 44,
## 50, 66, 65, 44, 70, 85, NA), lowph = c(NA, 7.25, 7.059998, 7.25,
## 6.969997, 7.189999, 7.32, NA, NA, 7.16, 7.039997, NA, 7.179996,
## 7.419998, 7.119999, 7.239998, 7.129997, 7.269997, 7.179996, 7.07,
## 7.289997, 7.129997, 7.189999, NA), pltct = c(100, 244, 114, 182,
## 54, NA, 282, NA, NA, 153, 229, NA, 182, 361, 378, 255, 186, NA,
## 260, 183, 134, 229, 68, NA), race = c("white", "white", "black",
## "black", "black", "white", "black", NA, NA, "black", "white",
## NA, "black", "white", "white", "black", "white", "black", "black",
## "white", "white", "black", "white", NA), bwt = c(1250, 1370,
## 620, 1480, 925, 940, 1255, 600, 700, 1350, 1310, 550, 1110, 1180,
## 970, 770, 1490, 1170, 1360, 1330, 1000, 1120, 740, NA), gest = c(35,
## 32, 23, 32, 28, 28, 29.5, 26, 24, 34, 32, 24, 28, 28, 28, 26,
## 33, 31, 31, 31, 28, 29, 26, NA), inout = c("born at duke", "born at duke",
## "born at duke", "born at duke", "born at duke", "born at duke",
## "born at duke", "born at duke", "born at duke", "born at duke",
## "born at duke", "born at duke", "born at duke", "born at duke",
## "born at duke", "born at duke", "born at duke", "born at duke",
## "born at duke", "born at duke", "born at duke", "born at duke",
## "born at duke", NA), twn = c(0, 0, 0, 0, 0, 0, 0, NA, NA, 0,
## 0, NA, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, NA), lol = c(NA, NA,
## NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
## NA, NA, NA, NA, NA, NA), magsulf = c(NA, NA, NA, NA, NA, NA,
## NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
## NA, NA), meth = c(0, 1, 0, 1, 0, 1, 1, NA, NA, 1, 0, NA, 0, 0,
## 1, 1, 1, 1, 1, 1, 0, 1, 0, NA), toc = c(0, 0, 1, 0, 0, 0, 0,
## NA, NA, 0, 0, NA, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, NA), delivery = c("abdominal",
## "abdominal", "vaginal", "vaginal", "abdominal", "abdominal",
## "vaginal", NA, NA, "abdominal", "vaginal", NA, "vaginal", "abdominal",
## "vaginal", "vaginal", "abdominal", "vaginal", "vaginal", "vaginal",
## "vaginal", "vaginal", "abdominal", NA), apg1 = c(8, 7, 1, 8,
## 5, 8, 9, NA, NA, 4, 6, NA, 6, 6, 2, 4, 8, 7, 1, 8, 5, 9, 9, NA
## ), vent = c(0, 0, 1, 0, 1, 1, 0, NA, NA, 0, 1, NA, 0, 0, 1, 1,
## 0, 0, 1, 1, 0, 1, 0, NA), pneumo = c(0, 0, 0, 0, 1, 0, 0, NA,
## NA, 0, 0, NA, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, NA), pda = c(0,
## 0, 0, 0, 0, 0, 0, NA, NA, 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0,
## 0, 0, NA), cld = c(0, 0, NA, 0, 0, 0, 0, NA, NA, 0, 0, NA, 1,
## 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, NA), sex = c("female", "female",
## "female", "male", "female", "female", "female", NA, NA, "female",
## "male", NA, "male", "male", "female", "male", "male", "female",
## "male", "male", "female", "female", "female", NA), dead = c(0,
## 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
## 0, NA)), class = "data.frame", row.names = c(NA, -24L))
Two solutions:
df <- structure(list(patnum = 1:6, hospstay = c(34L, 9L, -2L, 40L, 2L, 62L), lowph = c(NA, 7.25, 7.059998, 7.25, 6.969997, 7.189999), pltct = c(100L, 244L, 114L, 182L, 54L, NA), race = c("white", "white", "black", "black", "black", "white"), bwt = c(1250L, 1370L, 620L, 1480L, 925L, 940L), gest = c(35L, 32L, 23L, 32L, 28L, 28L), inout = c("born_at_duke", "born_at_duke", "born_at_duke", "born_at_duke", "born_at_duke", "born_at_duke"), twn = c(0L, 0L, 0L, 0L, 0L, 0L), lol = c(NA, NA, NA, NA, NA, NA), magsulf = c(NA, NA, NA, NA, NA, NA)), class = "data.frame", row.names = c("1", "2", "3", "4", "5", "6"))
tools::toTitleCase(df$race)
# [1] "White" "White" "Black" "Black" "Black" "White"
But those are simpler with no spaces, let's create one for this exercise:
vec <- c("white", "black", "native american")
tools::toTitleCase(vec)
# [1] "White" "Black" "Native American"
We can also use gregexpr/regmatches to do it:
gre <- gregexpr("(?<=^| ).", vec, perl=TRUE)
regmatches(vec, gre)
# [[1]]
# [1] "w"
# [[2]]
# [1] "b"
# [[3]]
# [1] "n" "a"
regmatches(vec, gre) <- lapply(regmatches(vec, gre), toupper)
vec
# [1] "White" "Black" "Native American"
I'm sure there's a stringr-variant out there as well.
As for substr, it's feasible to use regex to find all (1) first-chars and (2) all chars that follow a space, then extract each one, then toupper-them, then put that back into place ... but at that point you're still using regex and effectively doing what toTitleCase is doing natively and what this gregexpr/regmatches code is doing a little more verbosely.
If all you wanted to do was replace the first character, though, and not care about letters after spaces, then
substr(vec, 1, 1) <- toupper(substr(vec, 1, 1))
vec
# [1] "White" "Black" "Native american"
though in this case, I think the lower-case "a" in "Native american" is wrong, so I don't think this is the best approach.
Scaling
Since you are concerned about scaling (I'm assuming you're venturing into 100K or more, since less than that is not going to be an issue with any method demonstrated), here's a comparison:
bench::mark(
toTitleCase = tools::toTitleCase(vec),
gregexpr = {
gre <- gregexpr("(?<=^| ).", vec, perl=TRUE)
regmatches(vec, gre) <- lapply(regmatches(vec, gre), toupper)
vec
}
)
# # A tibble: 2 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 toTitleCase 401us 474us 1735. 4.15KB 0 868 0 500ms <chr [3]> <Rprofmem [9 x 3]> <bench_~ <tibble~
# 2 gregexpr 111us 205us 5240. 24.28KB 2.26 2315 1 442ms <chr [3]> <Rprofmem [6 x 3]> <bench_~ <tibble~
Granted, vec size 3 is pretty small, let's scale that up a bit.
vec30000 <- rep(vec, 10000) # 30000 length
bench::mark(
toTitleCase = tools::toTitleCase(vec30000),
gregexpr = {
gre <- gregexpr("(?<=^| ).", vec30000, perl=TRUE)
regmatches(vec30000, gre) <- lapply(regmatches(vec30000, gre), toupper)
vec30000
}
)
# Warning: Some expressions had a GC in every iteration; so filtering is disabled.
# # A tibble: 2 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 toTitleCase 6.01s 6.01s 0.166 36MB 0.832 1 5 6.01s <chr [30,000]> <Rprofmem [~ <bench_t~ <tibble~
# 2 gregexpr 773.13ms 773.13ms 1.29 241MB 2.59 1 2 773.13ms <chr [30,000]> <Rprofmem [~ <bench_t~ <tibble~
Looking at the `itr/sec` column showing iterations per second, it appears that even at scale, the gregexpr method works better. (If you look at the source code for toTitleCase, you'll see why: it's consider a lot more than just space-delimited words, it's also consider linking words, exception-words, etc.)
Another way is to use perl substitution:
gsub('\\b(\\w)', '\\U\\1', vec, perl = TRUE)
[1] "White" "Black" "Native American"
This method is way faster (ie 35+ times Faster) than the gregexpr method mentioned before:
microbenchmark::microbenchmark(
gsub = gsub('\\b(\\w)', '\\U\\1', vec30000, perl = TRUE),
gregexpr = {
gre <- gregexpr("(?<=^| ).", vec30000, perl=TRUE)
regmatches(vec30000, gre) <- lapply(regmatches(vec30000, gre), toupper)
vec30000 },
unit = 'relative', check = 'equal')
Unit: relative
expr min lq mean median uq max neval
gsub 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 5
gregexpr 37.37549 41.10014 29.00345 24.49221 25.39978 25.54325 5

transpose from one variable under another in R

here example of my data
mydat=structure(list(ADR.N.14.0 = c(8140010250001, 8140010250002),
NOMYAR.N.16.6 = c(1, 1), KOFPOR1.N.16.6 = c(7, 10), POR1.C.254 = c("о",
"BB"), VOZPOR1.N.16.6 = c(80, 45), VYSPOR1.N.16.6 = c(24,
17), DEMPOR1.N.16.6 = c(36, 16), POLNOT1.N.16.6 = c(0.6,
0.9), ZAPZAH1.N.16.6 = c(210, 160), NOMYAR2.N.16.6 = c(1,
1), KOFSOCT2.N.16.6 = c(3, 0), POR2.C.254 = c("BB", "о"),
VOZPOR2.N.16.6 = c(70, 45), VYSPOR2.N.16.6 = c(22, 17), DEMPOR2.N.16.6 = c(26,
22), POLNOT2.N.16.6 = c(0, 0), ZAPZAH2.N.16.6 = c(0, 0)), class = "data.frame", row.names = c(NA,
-2L))
how for each value of ADR,N,14,0move data from one variable under another.
To be more clear
here variables with prefix1
NOMYAR,N,16,6 KOFPOR**1**,N,16,6 POR**1**,C,254 VOZPOR**1**,N,16,6 VYSPOR**1**,N,16,6 DEMPOR**1**,N,16,6 POLNOT**1**,N,16,6 ZAPZAH**1**,N,16,6
and near rows with prefix2
NOMYAR**2**,N,16,6 KOFPOR**2**,N,16,6 POR**2**,C,254 VOZPOR**2**,N,16,6 VYSPOR**1**,N,16,6 DEMPOR**2**,N,16,6 POLNOT**2**,N,16,6 ZAPZAH**2**,N,16,6
so i need that for for ADR,N,14,0 =8140010250001
the content of the fields with the prefix 2 was under the content of the fields with the prefix 1
like this
result=structure(list(ADR.N.14.0 = c(8140010250001, 8140010250001, 8140010250002,
8140010250002, NA, NA, NA, NA, NA, NA), NOMYAR.N.16.6 = c(1,
1, 1, 1, NA, NA, NA, NA, NA, NA), KOFPOR1.N.16.6 = c(7, 3, 10,
0, NA, NA, NA, NA, NA, NA), POR1.C.254 = c("о", "BB", "BB", "о",
"", "", "", "", "", ""), VOZPOR1.N.16.6 = c(80, 70, 45, 45, NA,
NA, NA, NA, NA, NA), VYSPOR1.N.16.6 = c(24, 22, 17, 17, NA, NA,
NA, NA, NA, NA), DEMPOR1.N.16.6 = c(36, 26, 16, 22, NA, NA, NA,
NA, NA, NA), POLNOT1.N.16.6 = c(0.6, 0, 0.9, 0, NA, NA, NA, NA,
NA, NA), ZAPZAH1.N.16.6 = c(210, 0, 160, 0, NA, NA, NA, NA, NA,
NA)), class = "data.frame", row.names = c(NA, -10L))
How can i do such transpose?
You can use pivot_longer and specify names_pattern to include pattern of names that you want together.
tidyr::pivot_longer(mydat, cols = -ADR.N.14.0,
names_to = c('.value'),
names_pattern = '(.*?)\\d?\\..*')
# ADR.N.14.0 NOMYAR KOFPOR POR VOZPOR VYSPOR DEMPOR POLNOT ZAPZAH KOFSOCT
# <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 8140010250001 1 7 о 80 24 36 0.6 210 3
#2 8140010250001 1 NA BB 70 22 26 0 0 NA
#3 8140010250002 1 10 BB 45 17 16 0.9 160 0
#4 8140010250002 1 NA о 45 17 22 0 0 NA

R reshape name value pairs from wide to long using pivot_longer

I am trying to figure out how to reshape a dataset of the names of political parties from wide to long using dplyr and pivot_longer.
For each Party_ID, there is a number of constant columns attached (Party_Name_Short, Party_Name, Country, Party_in_orig_title) and a number of time changing factors as well: election, Date, Rename, Reason, Party_Title, alliance, member_parties, split, parent_party, merger, child_party, successor, predecessor. The time changing factors were recorded up to 11 times for each party, as reflected by the index in the colname.
In order to provide a sample I selected the first three time changing columns for each party and a sample of 5 random rows:
structure(list(Party_Name_Short = c("LZJ-PS", "ZiZi", "MNR",
"MDP", "E200"), Party_Name = c("Lista Zorana Jankovica – Pozitivna Slovenija",
"Živi zid", "Mouvement national républicain", "Movimento Democrático Português",
"Erakond Eesti 200"), Country = c("SVN", "HRV", "FRA", "PRT",
"EST"), Party_ID = c(1987, 2612, 1263, 1281, 2720), Party_in_orig_title = c(0,
0, 0, 0, 0), Date1 = c(2011, NA, 1999, 1987, NA), Rename1 = c("Lista Zorana Jankovica – Pozitivna Slovenija",
NA, "Mouvement national républicain", "ID", NA), Reason1 = c("foundation",
NA, "split from FN", "split", NA), Party_Title1 = c(0, NA, 0,
0, NA), alliance1 = c(0, NA, 0, 0, NA), member_parties1 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_),
split1 = c(0, NA, 1, 1, NA), parent_party1 = c(NA, NA, "FN",
"MDP", NA), merger1 = c(0, NA, 0, 0, NA), child_party1 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), successor1 = c(0, NA, 0, 0, NA), predecessor1 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), Date2 = c(2012, NA, NA, NA, NA), Rename2 = c("Pozitivna Slovenija",
NA, NA, NA, NA), Reason2 = c("renamed", NA, NA, NA, NA),
Party_Title2 = c(0, NA, NA, NA, NA), alliance2 = c(0, NA,
NA, NA, NA), member_parties2 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), split2 = c(0,
NA, NA, NA, NA), parent_party2 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), merger2 = c(0,
NA, NA, NA, NA), child_party2 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), successor2 = c(0,
NA, NA, NA, NA), predecessor2 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), Date3 = c(2014,
NA, NA, NA, NA), Rename3 = c("ZaAB", NA, NA, NA, NA), Reason3 = c("split",
NA, NA, NA, NA), Party_Title3 = c(0, NA, NA, NA, NA), alliance3 = c(0,
NA, NA, NA, NA), member_parties3 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), split3 = c(1,
NA, NA, NA, NA), parent_party3 = c("LZJ-PS", NA, NA, NA,
NA), merger3 = c(0, NA, NA, NA, NA), child_party3 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), successor3 = c(0, NA, NA, NA, NA), predecessor3 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), election1 = structure(c(15309, 16740, 11839, 6390, 17956
), class = "Date"), election2 = structure(c(16252, NA, NA,
NA, NA), class = "Date"), election3 = structure(c(16344,
NA, NA, NA, NA), class = "Date")), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"))
I would like the data to follow a "long" structure where each party_id and the constant factors are repeated 11 times and there are single columns for the time changing factors. Following the top-rated answer formulated here I tried different variations of the following command:
pivot_longer(cols = starts_with(c("election", "Date", "Rename", "Reason", "Party_Title",
"alliance", "member_parties", "split", "parent_party",
"merger", "child_party", "successor", "predecessor")),
names_to = c(".value", "election", "Date", "Rename", "Reason", "Party_Title",
"alliance", "member_parties", "split", "parent_party",
"merger", "child_party", "successor", "predecessor"), names_sep = "_") %>%
select(-matches("election[1-9]"), -matches("Date[1-9]"), -matches("Rename[1-9]"),
-matches("Reason[1-9]"), -matches("alliance[1-9]"), -matches("member_parties[1-9]"),
-matches("split[1-9]"), -matches("parent_party[1-9]"), -matches("merger[1-9]"),
-matches("child_party[1-9]"), -matches("successor[1-9]"), -matches("predecessor[1-9]"),
-matches("Party_Title[1-9]"), -matches("election1[0-2]"), -matches("Date1[0-2]"), -matches("Rename1[0-2]"),
-matches("Reason1[0-2]"), -matches("alliance1[0-2]"), -matches("member_parties1[0-2]"),
-matches("split1[0-2]"), -matches("parent_party1[0-2]"), -matches("merger1[0-2]"),
-matches("child_party1[0-2]"), -matches("successor1[0-2]"), -matches("predecessor1[0-2]"),
-matches("Party_Title1[0-2]"))
However, for some reason, I get a lot of missing values and do not achieve the shape of the data I would like to have. I'd appreciate any hint if you have an idea of how to do this. Thanks!
Update:
I would like the final output to look something like:
structure(list(Party_Name_Short = c("LZJ-PS", "ZiZi", "MNR",
"MDP", "E200", "LZJ-PS", "ZiZi", "MNR", "MDP", "E200", "LZJ-PS",
"ZiZi", "MNR", "MDP", "E200"), Party_Name = c("Lista Zorana Jankovica – Pozitivna Slovenija",
"Živi zid", "Mouvement national républicain", "Movimento Democrático Português",
"Erakond Eesti 200", "Lista Zorana Jankovica – Pozitivna Slovenija",
"Živi zid", "Mouvement national républicain", "Movimento Democrático Português",
"Erakond Eesti 200", "Lista Zorana Jankovica – Pozitivna Slovenija",
"Živi zid", "Mouvement national républicain", "Movimento Democrático Português",
"Erakond Eesti 200"), Country = c("SVN", "HRV", "FRA", "PRT",
"EST", "SVN", "HRV", "FRA", "PRT", "EST", "SVN", "HRV", "FRA",
"PRT", "EST"), Party_ID = c(1987, 2612, 1263, 1281, 2720, 1987,
2612, 1263, 1281, 2720, 1987, 2612, 1263, 1281, 2720), Party_in_orig_title = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), time = c(1, 1, 1,
1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3), Date = c(2011, NA, 1999,
1987, NA, 2012, NA, NA, NA, NA, 2014, NA, NA, NA, NA), Rename = c("Lista Zorana Jankovica – Pozitivna Slovenija",
NA, "Mouvement national républicain", "ID", NA, "Pozitivna Slovenija",
NA, NA, NA, NA, "ZaAB", NA, NA, NA, NA), Reason = c("foundation",
NA, "split from FN", "split", NA, "renamed", NA, NA, NA, NA,
"split", NA, NA, NA, NA), Party_Title = c(0, NA, 0, 0, NA, 0,
NA, NA, NA, NA, 0, NA, NA, NA, NA), alliance = c(0, NA, 0, 0,
NA, 0, NA, NA, NA, NA, 0, NA, NA, NA, NA), member_parties = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), split = c(0,
NA, 1, 1, NA, 0, NA, NA, NA, NA, 1, NA, NA, NA, NA), parent_party = c(NA,
NA, "FN", "MDP", NA, NA, NA, NA, NA, NA, "LZJ-PS", NA, NA, NA,
NA), merger = c(0, NA, 0, 0, NA, 0, NA, NA, NA, NA, 0, NA, NA,
NA, NA), child_party = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), successor = c(0, NA, 0, 0, NA, 0, NA,
NA, NA, NA, 0, NA, NA, NA, NA), predecessor = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), election = structure(c(1322697600,
1446336000, 1022889600, 552096000, 1551398400, 1404172800, NA,
NA, NA, NA, 1412121600, NA, NA, NA, NA), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -15L), class = c("tbl_df",
"tbl", "data.frame"))
Notice: the newly added time column and notice that this is only for example purposes, with three time changing factors, whereas in fact there are 11 in the data.
Using pivot_longer with names_sep to split between a non-digit and a digit at the end of the string
library(tidyr)
library(dplyr)
df1 %>%
pivot_longer(cols = matches('\\d+$'), names_to = c(".value", 'time'),
names_sep="(?<=\\D)(?=\\d+$)") %>%
arrange(time)
# A tibble: 15 x 19
# Party_Name_Short Party_Name Country Party_ID Party_in_orig_t… time Date Rename Reason Party_Title alliance member_parties split
# <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl> <chr> <chr> <dbl> <dbl> <chr> <dbl>
# 1 LZJ-PS Lista Zor… SVN 1987 0 1 2011 Lista… found… 0 0 <NA> 0
# 2 ZiZi Živi zid HRV 2612 0 1 NA <NA> <NA> NA NA <NA> NA
# 3 MNR Mouvement… FRA 1263 0 1 1999 Mouve… split… 0 0 <NA> 1
# 4 MDP Movimento… PRT 1281 0 1 1987 ID split 0 0 <NA> 1
# 5 E200 Erakond E… EST 2720 0 1 NA <NA> <NA> NA NA <NA> NA
# 6 LZJ-PS Lista Zor… SVN 1987 0 2 2012 Pozit… renam… 0 0 <NA> 0
# 7 ZiZi Živi zid HRV 2612 0 2 NA <NA> <NA> NA NA <NA> NA
# 8 MNR Mouvement… FRA 1263 0 2 NA <NA> <NA> NA NA <NA> NA
# 9 MDP Movimento… PRT 1281 0 2 NA <NA> <NA> NA NA <NA> NA
#10 E200 Erakond E… EST 2720 0 2 NA <NA> <NA> NA NA <NA> NA
#11 LZJ-PS Lista Zor… SVN 1987 0 3 2014 ZaAB split 0 0 <NA> 1
#12 ZiZi Živi zid HRV 2612 0 3 NA <NA> <NA> NA NA <NA> NA
#13 MNR Mouvement… FRA 1263 0 3 NA <NA> <NA> NA NA <NA> NA
#14 MDP Movimento… PRT 1281 0 3 NA <NA> <NA> NA NA <NA> NA
#15 E200 Erakond E… EST 2720 0 3 NA <NA> <NA> NA NA <NA> NA
# … with 6 more variables: parent_party <chr>, merger <dbl>, child_party <chr>, successor <dbl>, predecessor <chr>, election <date>

Keep column names for createDummyFeatures "reference" (n-1)

I have this kind of data.
library(dplyr)
glimpse(samp)
Observations: 5
Variables: 5
$ review_count <int> 68, 3, 7, 9, 5
$ Alcohol <fct> full_bar, NA, full_bar, beer_and_wi...
$ BikeParking <fct> True, NA, False, NA, NA
$ BusinessAcceptsBitcoin <fct> NA, NA, NA, NA, NA
$ BusinessAcceptsCreditCards <fct> True, NA, NA, True, True
I want to create 1-p dummy features. The createDummyFeatures function of the mlr package has the option reference to do this.
library(mlr)
dummy = createDummyFeatures(samp, target = "review_count", method = "reference")
The problem is that it doesn´t keep the original column names.
glimpse(dummy)
Observations: 5
Variables: 6
$ review_count <int> 68, 3, 7, 9, 5
$ Alcohol.full_bar <dbl> 1, NA, 1, 0, NA
$ Alcohol.none <dbl> 0, NA, 0, 0, NA
$ True <dbl> 1, NA, 0, NA, NA
$ True.1 <dbl> NA, NA, NA, NA, NA
$ True.2 <dbl> 1, NA, NA, 1, 1
The question is how can I keep them?
An Idea is to create them by the 1-of-nmethod and then remove all columns which contain "False".
dummy2 = createDummyFeatures(samp, target = "review_count")
dummy2 = dummy2 %>%
select(-contains("False"))
glimpse(dummy2)
Observations: 5
Variables: 7
$ review_count <int> 68, 3, 7, 9, 5
$ Alcohol.beer_and_wine <dbl> 0, NA, 0, 1, NA
$ Alcohol.full_bar <dbl> 1, NA, 1, 0, NA
$ Alcohol.none <dbl> 0, NA, 0, 0, NA
$ BikeParking.True <dbl> 1, NA, 0, NA, NA
$ BusinessAcceptsBitcoin.True <dbl> NA, NA, NA, NA, NA
$ BusinessAcceptsCreditCards.True <dbl> 1, NA, NA, 1, 1
However, I don´t know if it is the same as n-1 especially for the factors with more then 2 levels (The dummy coding is for an XGBoost regression where "review count" is the target variable).
dput(samp)
structure(list(review_count = c(68L, 3L, 7L, 9L, 5L), Alcohol = structure(c(2L,
NA, 2L, 1L, NA), .Label = c("beer_and_wine", "full_bar", "none"
), class = "factor"), BikeParking = structure(c(2L, NA, 1L, NA,
NA), .Label = c("False", "True"), class = "factor"), BusinessAcceptsBitcoin = structure(c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_), .Label = c("False",
"True"), class = "factor"), BusinessAcceptsCreditCards = structure(c(2L,
NA, NA, 2L, 2L), .Label = c("False", "True"), class = "factor")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -5L))
Edit
For those who have the same problem, I fixed this issue using caret.
library(caret)
dummy_dat = dummyVars("~ .", data = samp, fullRank = T)
dat = data.frame(predict(dummy_dat, newdata = samp))

Resources