I have a dataframe apcd_hud_ex. I want to take some column names (e.g. x2014_03_15), and change the value of the columns based on the current value of the columns, the parsed date in the column names, and another column in the dataframe (SMOKEFREE_DATE). I can do it in a loop over the columns, but I would really like to know how to do it with dplyr and mutate. Any help would be much appreciated!
apcd_hud_ex = structure(list(studyid = 1:5, SMOKEFREE_DATE = structure(c(16283,
16283, 16071, 16071, 16648), class = "Date"), x2014_03_15 = c(1,
1, 1, 0, 1), x2014_04_15 = c(1, 1, 1, 1, 1), x2014_05_15 = c(1,
1, 1, 1, 1), x2014_06_15 = c(1, 1, 1, 1, 1), x2014_07_15 = c(1,
1, 1, 1, 1), x2014_08_15 = c(1, 1, 1, 1, 1), x2014_09_15 = c(1,
1, 1, 1, 1), x2014_10_15 = c(1, 1, 1, 1, 1), x2014_11_15 = c(1,
1, 1, 1, 1), x2014_12_15 = c(1, 1, 1, 1, 1), x2015_01_15 = c(1,
1, 1, 1, 1)), row.names = c(NA, -5L), class = c("tbl_df", "tbl",
"data.frame"))
> apcd_hud_ex
# A tibble: 5 x 13
studyid SMOKEFREE_DATE x2014_03_15 x2014_04_15 x2014_05_15 x2014_06_15 x2014_07_15 x2014_08_15 x2014_09_15 x2014_10_15
<int> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2014-08-01 1 1 1 1 1 1 1 1
2 2 2014-08-01 1 1 1 1 1 1 1 1
3 3 2014-01-01 1 1 1 1 1 1 1 1
4 4 2014-01-01 0 1 1 1 1 1 1 1
5 5 2015-08-01 1 1 1 1 1 1 1 1
# ... with 3 more variables: x2014_11_15 <dbl>, x2014_12_15 <dbl>, x2015_01_15 <dbl>
>
#function for loop
assign_PHRes_enrollIns_fn <- function(SFdate,insValue,insDate){
val = if_else(insValue == 0,
0,
if_else(as.Date(insDate) < as.Date(SFdate,"%Y-%m-%d"),
1,
2))
return(val)
}
#vectorized function
assign_PHRes_enrollIns_fn_vec <- Vectorize(assign_PHRes_enrollIns_fn)
dateCols = names(apcd_hud_ex)[which(names(apcd_hud_ex) == "x2014_03_15"):which(names(apcd_hud_ex) == "x2015_01_15")]
This loop over the column names (dateCols) works:
for(i in 1:length(dateCols)){
dateCol = dateCols[i]
insDate = as.Date(paste0(str_sub(dateCol,2,5),"/",str_sub(dateCol,7,8),"/",str_sub(dateCol,10,11)),"%Y/%m/%d")
apcd_hud_ex[,dateCol] = assign_PHRes_enrollIns_fn_vec(apcd_hud_ex[,"SMOKEFREE_DATE"],apcd_hud_ex[,dateCol],insDate)
}
Now the manipulated dataframe looks like this, which is what I want:
> apcd_hud_ex
# A tibble: 5 x 13
studyid SMOKEFREE_DATE x2014_03_15 x2014_04_15 x2014_05_15 x2014_06_15 x2014_07_15 x2014_08_15 x2014_09_15 x2014_10_15
<int> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2014-08-01 1 1 1 1 1 2 2 2
2 2 2014-08-01 1 1 1 1 1 2 2 2
3 3 2014-01-01 2 2 2 2 2 2 2 2
4 4 2014-01-01 0 2 2 2 2 2 2 2
5 5 2015-08-01 1 1 1 1 1 1 1 1
# ... with 3 more variables: x2014_11_15 <dbl>, x2014_12_15 <dbl>, x2015_01_15 <dbl>
However, I would like to learn how to do this with dynamic programming and dplyr. I've tried 2 functions:
newInsValCols_fn1 <- function(df,dateCols){
insDate = as.Date(paste0(str_sub(dateCols,2,5),"/",str_sub(dateCols,7,8),"/",str_sub(dateCols,10,11)),"%Y/%m/%d")
df1 <- df %>%
mutate({{dateCols}} := if_else({{dateCols}} == 0,
0,
if_else(as.Date(insDate) < as.Date(SMOKEFREE_DATE,"%Y-%m-%d"),
1,
2)))
return(df1)
}
newInsValCols_fn1(apcd_hud_ex,dateCols)
Which gives error:
Error: The LHS of `:=` must be a string or a symbol
So I tried using symbols:
newInsValCols_fn2 <- function(df,dateCols){
dateCols_syms = syms(dateCols)
insDate = as.Date(paste0(str_sub(dateCols,2,5),"/",str_sub(dateCols,7,8),"/",str_sub(dateCols,10,11)),"%Y/%m/%d")
df1 <- df %>%
mutate(!!dateCols_syms := if_else({{dateCols}} == 0,
0,
if_else(as.Date(insDate) < as.Date(SMOKEFREE_DATE,"%Y-%m-%d"),
1,
2)))
return(df1)
}
newInsValCols_fn2(apcd_hud_ex,dateCols)
which gives the same error:
Error: The LHS of `:=` must be a string or a symbol
I also tried using !!! instead of !!, but that resulted in the following error:
Error: The LHS of `:=` can't be spliced with `!!!`
Something in my understanding is lacking.
Here's how I'd do it with dplyr.
library(dplyr)
library(lubridate)
apcd_hud_ex %>%
mutate(across(
starts_with('x'),
~ case_when(. == 0 ~ 0,
ymd(gsub('x', '', cur_column())) < SMOKEFREE_DATE ~ 1,
TRUE ~ 2)
))
#> # A tibble: 5 x 13
#> studyid SMOKEFREE_DATE x2014_03_15 x2014_04_15 x2014_05_15 x2014_06_15 x2014_07_15 x2014_08_15 x2014_09_15 x2014_10_15 x2014_11_15 x2014_12_15 x2015_01_15
#> <int> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2014-08-01 1 1 1 1 1 2 2 2 2 2 2
#> 2 2 2014-08-01 1 1 1 1 1 2 2 2 2 2 2
#> 3 3 2014-01-01 2 2 2 2 2 2 2 2 2 2 2
#> 4 4 2014-01-01 0 2 2 2 2 2 2 2 2 2 2
#> 5 5 2015-08-01 1 1 1 1 1 1 1 1 1 1 1
You can use pivot_longer to have just one column to modify, which is an alternative to mutate(across()).
You can use case_when to have multiple conditions, so you do not need to nest multiple if statements. The value will be the one of the first true statement.
library(tidyverse)
apcd_hud_ex <- structure(list(studyid = 1:5, SMOKEFREE_DATE = structure(c(
16283,
16283, 16071, 16071, 16648
), class = "Date"), x2014_03_15 = c(
1,
1, 1, 0, 1
), x2014_04_15 = c(1, 1, 1, 1, 1), x2014_05_15 = c(
1,
1, 1, 1, 1
), x2014_06_15 = c(1, 1, 1, 1, 1), x2014_07_15 = c(
1,
1, 1, 1, 1
), x2014_08_15 = c(1, 1, 1, 1, 1), x2014_09_15 = c(
1,
1, 1, 1, 1
), x2014_10_15 = c(1, 1, 1, 1, 1), x2014_11_15 = c(
1,
1, 1, 1, 1
), x2014_12_15 = c(1, 1, 1, 1, 1), x2015_01_15 = c(
1,
1, 1, 1, 1
)), row.names = c(NA, -5L), class = c(
"tbl_df", "tbl",
"data.frame"
))
apcd_hud_ex %>%
pivot_longer(starts_with("x")) %>%
mutate(
insDate = name %>% str_remove("^x") %>% str_replace_all("_", "-") %>% as.Date(),
value = case_when(
value == 0 ~ 0,
insDate < SMOKEFREE_DATE ~ 1,
insDate >= SMOKEFREE_DATE ~ 2
)
) %>%
select(-insDate) %>%
pivot_wider()
#> # A tibble: 5 × 13
#> studyid SMOKEFREE_DATE x2014_03_15 x2014_04_15 x2014_05_15 x2014_06_15
#> <int> <date> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2014-08-01 1 1 1 1
#> 2 2 2014-08-01 1 1 1 1
#> 3 3 2014-01-01 2 2 2 2
#> 4 4 2014-01-01 0 2 2 2
#> 5 5 2015-08-01 1 1 1 1
#> # … with 7 more variables: x2014_07_15 <dbl>, x2014_08_15 <dbl>,
#> # x2014_09_15 <dbl>, x2014_10_15 <dbl>, x2014_11_15 <dbl>, x2014_12_15 <dbl>,
#> # x2015_01_15 <dbl>
Created on 2022-05-05 by the reprex package (v2.0.0)
Related
I have this dataset
structure(list(ID = c(1, 2, 3, 4, 6, 7), V = c(0, 0, 1, 1,
1, 0), Mus = c(1, 0, 1, 1, 1, 0), R = c(1, 0, 1, 1, 1, 1),
E = c(1, 0, 0, 1, 0, 0), S = c(1, 0, 1, 1, 1, 0), t = c(0,
0, 0, 1, 0, 0), score = c(1, 0.4, 1, 0.4, 0.4, 0.4)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"), na.action = structure(c(`5` = 5L,
`12` = 12L, `15` = 15L, `21` = 21L, `22` = 22L, `23` = 23L, `34` = 34L,
`44` = 44L, `46` = 46L, `52` = 52L, `56` = 56L, `57` = 57L, `58` = 58L
), class = "omit"))
I would like to remove rows equal to zero in columns that ranges from the second to the sixth columns and that in last column have 0.4. This would imply - if possible - to readjust the ID value properly.
My expected outcome would be:
ID Vid Mus Rea Ema SMS tel MMT
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 0 1 1 1 1 0 1
2 3 1 1 1 0 1 0 1
3 4 1 1 1 1 1 1 0.4
4 6 1 1 1 0 1 0 0.4
5 7 0 0 1 0 0 0 0.4
that does not have the second ID rows from the original dataset. Does anyone has any clue for doing this via dplyr or another iterative method (for loop and so on)?
Thanks
You can use if_all() in filter() like you can use across() in other functions. Just use row_number() to regenerate ID.
library(dplyr)
df %>%
filter(
!(if_all(
.cols = V:t,
.fns = ~ .x == 0
) & score == 0.4)
) %>%
mutate(
ID = row_number()
)
#> # A tibble: 5 × 8
#> ID V Mus R E S t score
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0 1 1 1 1 0 1
#> 2 2 1 1 1 0 1 0 1
#> 3 3 1 1 1 1 1 1 0.4
#> 4 4 1 1 1 0 1 0 0.4
#> 5 5 0 0 1 0 0 0 0.4
You can use rowSums with filter:
library(dplyr)
df %>%
filter(!(rowSums(across(V:t)) == 0 & score == 0.4)) %>%
mutate(ID = row_number())
ID V Mus R E S t score
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 0 1 1 1 1 0 1
2 2 1 1 1 0 1 0 1
3 3 1 1 1 1 1 1 0.4
4 4 1 1 1 0 1 0 0.4
5 5 0 0 1 0 0 0 0.4
I would like to replace 2003 with x, 2004 with y, and 2005 with z, in the column suffixes.
For example, I would like to transform:
In:
Here's the reproducible example:
structure(list(id = c(1, 1, 1, 1, 1), xd_2004 = c(1, 1, 1, 1,
1), xd_2003 = c(1, 1, 1, 1, 1), xe_2004 = c(1, 1, 1, 1, 1), xe_2003 = c(1,
1, 1, 1, 1), xd_2005 = c(1, 1, 1, 1, 1), xe_2005 = c(1, 1, 1,
1, 1)), class = "data.frame", row.names = c(NA, -5L))
We may also use a named vector in str_replace_all
library(dplyr)
library(stringr)
df %>%
rename_with(~ str_replace_all(.x, setNames(c('x', 'y', 'z'), 2003:2005)))
-output
id xd_y xd_x xe_y xe_x xd_z xe_z
1 1 1 1 1 1 1 1
2 1 1 1 1 1 1 1
3 1 1 1 1 1 1 1
4 1 1 1 1 1 1 1
5 1 1 1 1 1 1 1
Using rename_with we could do:
library(dplyr)
library(stringr)
df %>%
rename_with(., ~str_replace_all(., '2004', "y")) %>%
rename_with(., ~str_replace_all(., '2003', "x")) %>%
rename_with(., ~str_replace_all(., '2005', "z"))
id xd_y xd_x xe_y xe_x xd_z xe_z
1 1 1 1 1 1 1 1
2 1 1 1 1 1 1 1
3 1 1 1 1 1 1 1
4 1 1 1 1 1 1 1
5 1 1 1 1 1 1 1
I want to use summarize and across from dplyrto count the number of non-NA values by my grouping variable. For example, using these data:
library(tidyverse)
d <- tibble(ID = c(1, 1, 1, 2, 2, 2, 3, 3, 3),
Col1 = c(5, 8, 2, NA, 2, 2, NA, NA, 1),
Col2 = c(NA, 2, 1, NA, NA, NA, 1, NA, NA),
Col3 = c(1, 5, 2, 4, 1, NA, NA, NA, NA))
# A tibble: 9 x 4
ID Col1 Col2 Col3
<dbl> <dbl> <dbl> <dbl>
1 1 5 NA 1
2 1 8 2 5
3 1 2 1 2
4 2 NA NA 4
5 2 2 NA 1
6 2 2 NA NA
7 3 NA 1 NA
8 3 NA NA NA
9 3 1 NA NA
With a solution resembling:
d %>%
group_by(ID) %>%
summarize(across(matches("^Col[1-3]$"),
#function to count non-NA per column per ID
))
With the following result:
# A tibble: 3 x 4
ID Col1 Col2 Col3
<dbl> <dbl> <dbl> <dbl>
1 1 3 2 3
2 2 2 0 2
3 3 1 1 0
I hope this is what you are looking for:
library(dplyr)
d %>%
group_by(ID) %>%
summarise(across(Col1:Col3, ~ sum(!is.na(.x)), .names = "non-{.col}"))
# A tibble: 3 x 4
ID `non-Col1` `non-Col2` `non-Col3`
<dbl> <int> <int> <int>
1 1 3 2 3
2 2 2 0 2
3 3 1 1 0
Or if you would like to select columns by their shared string you can use this:
d %>%
group_by(ID) %>%
summarise(across(contains("Col"), ~ sum(!is.na(.x)), .names = "non-{.col}"))
I have a df that looks like this:
It can be build using codes:
structure(list(ID = c(1, 2, 3, 4, 5), Pass = c(0, 1, 1, 1, 1),
Math = c(0, 0, 1, 1, 1), ELA = c(0, 1, 0, 1, 0), PE = c(0,
0, 1, 1, 1)), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"))
Where pass stand for a student pass any test or not. Now I want to build a new var Result to capture a student's test results like following, what should I do?
Try the base R code below
q <- with(data.frame(which(df[-(1:2)] == 1, arr.ind = TRUE)),
tapply(names(df[-(1:2)])[col], factor(row, levels = 1:nrow(df)), toString))
df$Result <- ifelse(is.na(q), "Not Pass", paste0("Pass: ", q))
which gives
> df
# A tibble: 5 x 6
ID Pass Math ELA PE Result
<dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 1 0 0 0 0 Not Pass
2 2 1 0 1 0 Pass: ELA
3 3 1 1 0 1 Pass: Math, PE
4 4 1 1 1 1 Pass: Math, ELA, PE
5 5 1 1 0 1 Pass: Math, PE
Using dplyr with rowwise
library(dplyr)
library(stringr)
df1 %>%
rowwise %>%
mutate(Result = if(as.logical(Pass))
str_c('Pass: ', toString(names(select(., Math:PE))[as.logical(c_across(Math:PE))])) else 'Not pass' ) %>%
ungroup
# A tibble: 5 x 6
# ID Pass Math ELA PE Result
# <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#1 1 0 0 0 0 Not pass
#2 2 1 0 1 0 Pass: ELA
#3 3 1 1 0 1 Pass: Math, PE
#4 4 1 1 1 1 Pass: Math, ELA, PE
#5 5 1 1 0 1 Pass: Math, PE
data
df1 <- structure(list(ID = c(1, 2, 3, 4, 5), Pass = c(0, 1, 1, 1, 1),
Math = c(0, 0, 1, 1, 1), ELA = c(0, 1, 0, 1, 0), PE = c(0,
0, 1, 1, 1)), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"))
Here's one solution:
library(dplyr)
library(magrittr)
library(stringr)
df <- structure(list(ID = c(1, 2, 3, 4, 5), Pass = c(0, 1, 1, 1, 1),
Math = c(0, 0, 1, 1, 1), ELA = c(0, 1, 0, 1, 0), PE = c(0,
0, 1, 1, 1)), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"))
df %<>% pivot_longer(cols = -c(ID, Pass), names_to = "sub", values_to = "done")
df %<>% group_by(ID) %>% mutate(Result = paste0(ifelse(done == 1, sub, NA), collapse = ", ")) %>% ungroup()
df %<>% pivot_wider(names_from = sub, values_from = done)
df %<>% mutate(Result = paste0("Pass: ", str_replace_all(Result, "NA[, ]*", "")))
df %<>% mutate(Result = ifelse(str_detect(Result, "Pass: $"), "Not pass", str_replace_all(Result, ",[\\s]*$", "")))
df
# # A tibble: 5 x 6
# ID Pass Result Math ELA PE
# <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
# 1 1 0 Not pass 0 0 0
# 2 2 1 Pass: ELA 0 1 0
# 3 3 1 Pass: Math, PE 1 0 1
# 4 4 1 Pass: Math, ELA, PE 1 1 1
# 5 5 1 Pass: Math, PE 1 0 1
I can provide an explanation of what the code is doing if necessary.
I have data as follows:
DT <- structure(list(Area = c("A", "A", "A", "A", "B", "B", "B", "B"
), Year = c(1, 1, 2, 2, 1, 1, 2, 2), Group = c(1, 2, 1, 2, 1,
2, 1, 2), Population_Count = c(10, 12, 10, 12, 10, 13, 10, 11
), Male_Count = c(5, 7, 5, 4, 5, 8, 5, 6), Female_Count = c(5,
5, 5, 8, 5, 5, 5, 5)), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
# A tibble: 8 x 6
Area Year Group Population_Count Male_Count Female_Count
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 1 10 5 5
2 A 1 2 12 7 5
3 A 2 1 10 5 5
4 A 2 2 12 4 8
5 B 1 1 10 5 5
6 B 1 2 13 8 5
7 B 2 1 10 5 5
8 B 2 2 11 6 5
I would like to keep one observations per Area-Year, without losing any information. I tried to do
DTcast <- dcast(DT, Area + Year ~ Group + Population_Count + Male_Count + Female_Count)
But that results in a lot of rubbish:
Area Year 1_10_5_5 2_11_6_5 2_12_4_8 2_12_7_5 2_13_8_5
1 A 1 5 NA NA 5 NA
2 A 2 5 NA 8 NA NA
3 B 1 5 NA NA NA 5
4 B 2 5 5 NA NA NA
In addition, when I apply it to the actual data, I get:
Using 'H_FEMALE' as value column. Use 'value.var' to override
Error in CJ(1:72284, 1:1333365) :
Cross product of elements provided to CJ() would result in 96380955660 rows which exceeds .Machine$integer.max == 2147483647
So I think I am doing something wrong. I think it maybe has to do with the value.var which I do not know how to select.
Desired result:
# A tibble: 4 x 9
Area Year Group `Population_Count_ Group_1` `Male_Count_ Group_1` `Female_Count_ Group_1` `Population_Count_ Group_2` `Male_Count_ Group_2` `Female_Count_ Group_2`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 1 10 5 5 12 7 5
2 A 2 1 10 5 5 12 4 8
3 B 1 1 10 5 5 13 8 5
4 B 2 1 10 5 5 11 6 5
library(tidyverse)
DT %>% pivot_wider(id_cols = c("Area", "Year"), names_from = "Group", values_from = 4:6)
> DT %>% pivot_wider(id_cols = c("Area", "Year"), names_from = "Group", values_from = 4:6)
# A tibble: 4 x 8
Area Year Population_Count_1 Population_Count_2 Male_Count_1 Male_Count_2 Female_Count_1 Female_Count_2
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 10 12 5 7 5 5
2 A 2 10 12 5 4 5 8
3 B 1 10 13 5 8 5 5
4 B 2 10 11 5 6 5 5
This will name your columns as desired
DT %>% pivot_wider(id_cols = c("Area", "Year"),
names_from = "Group",
values_from = 4:6,
names_sep = "_Group_")
use data.table
library(data.table)
dt <- structure(list(Area = c("A", "A", "A", "A", "B", "B", "B", "B"
), Year = c(1, 1, 2, 2, 1, 1, 2, 2), Group = c(1, 2, 1, 2, 1,
2, 1, 2), Population_Count = c(10, 12, 10, 12, 10, 13, 10, 11
), Male_Count = c(5, 7, 5, 4, 5, 8, 5, 6), Female_Count = c(5,
5, 5, 8, 5, 5, 5, 5)), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
setDT(dt)
dcast(
dt,
formula = Area + Year ~ Group,
value.var = grep("_Count", names(dt), value = T)
)
#> Area Year Population_Count_1 Population_Count_2 Male_Count_1 Male_Count_2
#> 1: A 1 10 12 5 7
#> 2: A 2 10 12 5 4
#> 3: B 1 10 13 5 8
#> 4: B 2 10 11 5 6
#> Female_Count_1 Female_Count_2
#> 1: 5 5
#> 2: 5 8
#> 3: 5 5
#> 4: 5 5
Created on 2020-12-18 by the reprex package (v0.3.0)