mutate using values in a nest for each group using map - r

Consider the case below for an experiment where group is different treatments, init are the initial values for each sample, change is expected change after treatment and sd_change is standard deviation of the change.
library(tidyverse)
set.seed(001)
data1 <- tibble(group = rep(c("a", "b"), each = 4),
init = rpois(8, 10)) %>%
group_by(group, init) %>%
expand(change = seq(2, 6, 2)) %>%
mutate(sd_change = 2)
as_tibble(data1)
> data1
# A tibble: 24 x 4
# Groups: group, init [8]
group init change sd_change
<chr> <int> <dbl> <dbl>
1 a 7 2 2
2 a 7 4 2
3 a 7 6 2
4 a 8 2 2
5 a 8 4 2
6 a 8 6 2
7 a 10 2 2
8 a 10 4 2
9 a 10 6 2
10 a 11 2 2
# ... with 14 more rows
I generate final values and obtain mean and variance for each group and change as below
data2a <- data1 %>%
rowwise %>%
mutate(final = rnorm(1, change, sd_change) + init) %>%
ungroup
data2a %>%
group_by(group, change) %>%
summarise(mu_start = mean(init), mu_end = mean(final),
v_start = var(init), v_end = var(final))
# A tibble: 6 x 6
# Groups: group [2]
group change mu_start mu_end v_start v_end
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 a 2 9 10.9 3.33 13.9
2 a 4 9 14.7 3.33 4.90
3 a 6 9 15.5 3.33 10.2
4 b 2 11.5 13.2 4.33 3.69
5 b 4 11.5 14.8 4.33 17.8
6 b 6 11.5 17.7 4.33 9.77
I want to repeat the above procedure R times by generating one final random value. I can do this with a for loop but I'm learning purrr and I'm stuck when summarising. See one version below:
# function to generate final values where R = 3
f <- function(n=3, x, y, z){
out <- rnorm(n, x, y)
out <- out + z
}
data2b <- data1 %>%
mutate(final = pmap(list(z = init,
x = change,
y = sd_change),
f)) %>%
ungroup
as_tibble(data2b)
# A tibble: 24 x 5
group init change sd_change final
<chr> <int> <dbl> <dbl> <list>
1 a 7 2 2 <dbl [3]>
2 a 7 4 2 <dbl [3]>
3 a 7 6 2 <dbl [3]>
4 a 8 2 2 <dbl [3]>
5 a 8 4 2 <dbl [3]>
6 a 8 6 2 <dbl [3]>
7 a 10 2 2 <dbl [3]>
8 a 10 4 2 <dbl [3]>
9 a 10 6 2 <dbl [3]>
10 a 11 2 2 <dbl [3]>
# ... with 14 more rows
summarise to get mu_end that should be a list of length R=3 in this example. The following gives an error
data2b %>%
split(.$group, .$change) %>%
mutate(mu_end = map(final, mean),
v_end = map(final, var)
Error in UseMethod("mutate_") :
no applicable method for 'mutate_' applied to an object of class "list"
The output should be like this
# A tibble: 6 x 4
# Groups: group [2]
group change mu_end v_end
<chr> <dbl> <dbl> <dbl>
1 a 2 10.9 13.9
2 a 4 14.7 4.90
3 a 6 15.5 10.2
4 b 2 13.2 3.69
5 b 4 14.8 17.8
6 b 6 17.7 9.77
but each row of mu_end and v_end should be a list of length R
any help?

We can either do a group_split and then map through the list of tibbles, mutate to create the mean and var of the list column 'final' by looping with map
data2b %>%
group_split(group, change) %>%
map_df(~ .x %>%
mutate(mu_end = map_dbl(final, mean),
v_end = map_dbl(final, var)))
Or without splitting
data2b %>%
group_by(group, change) %>%
mutate(mu_end = map_dbl(final, mean), v_end = map_dbl(final, var))

Related

curly curly operator doesnt work with map() in R

library(tidyverse)
mean_by <- function(data,by,conti){
data %>% group_by({{by}}) %>% summarise(mean=mean({{conti}})) %>%
print() %>%
ggplot(aes(x={{by}},y=mean))+geom_col()
}
map(mtcars %>% select_if(is.numeric),~mean_by(mtcars,cyl,.))
# Not quite the same
mean_by(mtcars,cyl,carb)
I was toying around with the curly curly operator in R (just learned about it!) and then when iterating using map it seemd like the grouping isnt working very well, and I cant get my hands around the problem. What am I doing wrong?
Btw, When trying the explicit pmap way, I couldnt get around using the cyl variable in a clever way
pmap(mtcars %>% select_if(is.numeric),mean_by,..1=mtcars,..2=cyl,..3=.)
Error in pmap():
i In index: 1.
Caused by error in withCallingHandlers():
! object 'cyl' not found
Run rlang::last_error() to see where the error occurred.
It is expecting the column names and not the values - here, the select_if returns a subset of columns that are numeric. We may need the names to loop which would be a string, thus it is better to convert to symbol and evaluate (!!)
library(dplyr)
library(purrr)
mean_by <- function(data,by,conti){
by_sym <- rlang::ensym(by)
conti <- rlang::ensym(conti)
data %>% group_by(!! by_sym) %>%
summarise(mean=mean(!!conti)) %>%
print() %>%
ggplot(aes(x= !!by_sym,y=mean))+geom_col()
}
map(mtcars %>%
select_if(is.numeric) %>%
names,~mean_by(mtcars,cyl, !!.x))
-output (graphs removed)
# A tibble: 3 × 2
cyl mean
<dbl> <dbl>
1 4 26.7
2 6 19.7
3 8 15.1
# A tibble: 3 × 2
cyl mean
<dbl> <dbl>
1 4 4
2 6 6
3 8 8
# A tibble: 3 × 2
cyl mean
<dbl> <dbl>
1 4 105.
2 6 183.
3 8 353.
# A tibble: 3 × 2
cyl mean
<dbl> <dbl>
1 4 82.6
2 6 122.
3 8 209.
# A tibble: 3 × 2
cyl mean
<dbl> <dbl>
1 4 4.07
2 6 3.59
3 8 3.23
# A tibble: 3 × 2
cyl mean
<dbl> <dbl>
1 4 2.29
2 6 3.12
3 8 4.00
# A tibble: 3 × 2
cyl mean
<dbl> <dbl>
1 4 19.1
2 6 18.0
3 8 16.8
# A tibble: 3 × 2
cyl mean
<dbl> <dbl>
1 4 0.909
2 6 0.571
3 8 0
# A tibble: 3 × 2
cyl mean
<dbl> <dbl>
1 4 0.727
2 6 0.429
3 8 0.143
# A tibble: 3 × 2
cyl mean
<dbl> <dbl>
1 4 4.09
2 6 3.86
3 8 3.29
# A tibble: 3 × 2
cyl mean
<dbl> <dbl>
1 4 1.55
2 6 3.43
3 8 3.5
I've not seen the tilde syntax with map, but if you change that it seems to work.
map(mtcars %>% select_if(is.numeric), mean_by, data=mtcars, by=cyl)
Side note, you don't need that print() statement in mean_by.
mean_by <- function(data,by,conti){
data %>% group_by({{by}}) %>% summarise(mean=mean({{conti}})) %>%
ggplot(aes(x={{by}},y=mean))+geom_col()
}

consecutive grouped ID list R

If I have a df and want to do a grouped ID i would do:
df <- data.frame(id= rep(c(1,8,4), each = 3), score = runif(9))
df %>% group_by(id) %>% mutate(ID = cur_group_id())
following(How to create a consecutive group number answer of #Ronak Shah).
Now I have a list of those dfs and want to give consecutive group numbers, but they shall not start in every lists element new. In other words the ID column in listelement is 1 to 10, and in list two 11 to 15 and so on (so I can´t simply run the same code with lapply).
I guess I could do something like:
names(df)<-c("a", "b")
df<- mapply(cbind,df, "list"=names(df), SIMPLIFY=F)
df <- do.call(rbind, list)
df<-df %>% group_by(id) %>% mutate(ID = cur_group_id())
split(df, list)
but maybe some have more direct, clever ways?
A dplyr way could be using bind_rows as group_split (experimental):
library(dplyr)
df_list |>
bind_rows(.id = "origin") |>
mutate(ID = consecutive_id(id)) |> # If dplyr v.<1.1.0, use ID = cumsum(!duplicated(id))
group_split(origin, .keep = FALSE)
Output:
[[1]]
# A tibble: 9 × 3
id score ID
<dbl> <dbl> <int>
1 1 0.187 1
2 1 0.232 1
3 1 0.317 1
4 8 0.303 2
5 8 0.159 2
6 8 0.0400 2
7 4 0.219 3
8 4 0.811 3
9 4 0.526 3
[[2]]
# A tibble: 9 × 3
id score ID
<dbl> <dbl> <int>
1 3 0.915 4
2 3 0.831 4
3 3 0.0458 4
4 5 0.456 5
5 5 0.265 5
6 5 0.305 5
7 2 0.507 6
8 2 0.181 6
9 2 0.760 6
Data:
set.seed(1234)
df1 <- tibble(id = rep(c(1,8,4), each = 3), score = runif(9))
df2 <- tibble(id = rep(c(3,5,2), each = 3), score = runif(9))
df_list <- list(df1, df2)
Or using cur_group_id() for the group number, this approach, however, gives another order than you expect in your question:
library(dplyr)
df_list |>
bind_rows(.id = "origin") |>
mutate(ID = cur_group_id(), .by = "id") |> # If dplyr v.<1.1.0, use group_by()-notation
group_split(origin, .keep = FALSE)
Output:
[[1]]
# A tibble: 9 × 3
id score ID
<dbl> <dbl> <int>
1 1 0.187 1
2 1 0.232 1
3 1 0.317 1
4 8 0.303 6
5 8 0.159 6
6 8 0.0400 6
7 4 0.219 4
8 4 0.811 4
9 4 0.526 4
[[2]]
# A tibble: 9 × 3
id score ID
<dbl> <dbl> <int>
1 3 0.915 3
2 3 0.831 3
3 3 0.0458 3
4 5 0.456 5
5 5 0.265 5
6 5 0.305 5
7 2 0.507 2
8 2 0.181 2
9 2 0.760 2

Calculating R squared from multiple columns

I'm very new to R and have been trying to figure out how to calculate R^2 from a few columns within a large data set of approx 300+ columns.
Example:
rcalc <- data.frame('x1' = c(694, 702, 701), 'x2'=c(652, 659, 655),
'x3'=c(614, 612, 613), 'y1'= c(17.97, 17.95, 17.96), 'y2' = c(12.03, 12.0,
12.1), 'y3' = c(0.09, 0.1, 0.1))
From here I am stuck.
The formula in excel I can do, and looks like this:
RSQ(X1:X3, Y1:Y3) or RSQ(694:652:614, 17.97:12.03:0.09)
So, each row needs to be calculated for R^2. I was able to use the 'lm' command but was only able to do this for 1 row:
I had to take the value from each column of x (x1:x3) and stack them into 1 column, then each value from each column y (y1:y3) and stack into 1 column. Then performed the following:
rsqrd = lm(x~y, data=rcalc)
summary(rsqrd)$r.squared
This worked but again, only for 1 row. I'm not sure how to do this for thousands of rows. I hope this wasn't too confusing. Any help is greatly appreciated.
Troubleshooting:
with pivot_longer:
row col obs value
1 c 300_0 DUT Ip2_comp 784.9775
1 c 300_12 DUT Ip2_comp 864.4234
1 c 300_18 DUT Ip2_comp 919.3384
1 c 300_0 REF O2 0.09
1 c 300_12 REF O2 11.95
1 c 300_18 REF O2 17.98
2 c 300_0 DUT Ip2_comp 781.5785
2 c 300_12 DUT Ip2_comp 865.5541
2 c 300_18 DUT Ip2_comp 921.0646
2 c 300_0 REF O2 0.09
With Pivot_wider:
row obs c
1 300_0 DUT Ip2_comp 784.9775
1 300_12 DUT Ip2_comp 864.4234
1 300_18 DUT Ip2_comp 919.3384
1 300_0 REF O2 0.09
1 300_12 REF O2 11.95
1 300_18 REF O2 17.98
2 300_0 DUT Ip2_comp 781.5785
2 300_12 DUT Ip2_comp 865.5541
2 300_18 DUT Ip2_comp 921.0646
I'm sure this could be done more concisely, but here's one approach using tidyverse functions. First, I do some reshaping to add a row number and make it into a longer shape, with columns for row, observation # (1-3), x, and y.
Then I "nest" all the data except row number so that I can run a separate regression on each row's data, and then extract r squared (and a variety of other stats) from each regression.
library(tidyverse)
rcalc %>% # your data
# reshape to get matched columns for all x and for all y values
mutate(row = row_number()) %>%
pivot_longer(-row, names_to = c("col", "obs"), names_sep = 1) %>% # split column name into two fields after first character
pivot_wider(names_from = col, values_from = value) %>%
# nest data, regression, unnest
nest(-row) %>%
mutate(model = map(data, function(df) lm(y ~ x, data = df)),
tidied = map(model, broom::glance)) %>%
unnest(tidied)
Result
# A tibble: 3 x 15
row data model r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
<int> <list> <list> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int>
1 1 <tibble [3 × 3]> <lm> 0.952 0.905 2.81 20.0 0.140 1 -5.71 17.4 14.7 7.91 1 3
2 2 <tibble [3 × 3]> <lm> 0.973 0.946 2.10 36.3 0.105 1 -4.84 15.7 13.0 4.43 1 3
3 3 <tibble [3 × 3]> <lm> 0.951 0.903 2.84 19.6 0.141 1 -5.74 17.5 14.8 8.05 1 3
Edit: for troubleshooting, I am adding here the results I see at each stage:
after the pivot_longer step:
# A tibble: 18 x 4
row col obs value
<int> <chr> <chr> <dbl>
1 1 x 1 694
2 1 x 2 652
3 1 x 3 614
4 1 y 1 18.0
5 1 y 2 12.0
6 1 y 3 0.09
7 2 x 1 702
8 2 x 2 659
9 2 x 3 612
10 2 y 1 18.0
11 2 y 2 12
12 2 y 3 0.1
13 3 x 1 701
14 3 x 2 655
15 3 x 3 613
16 3 y 1 18.0
17 3 y 2 12.1
18 3 y 3 0.1
after the pivot_wider step:
# A tibble: 9 x 4
row obs x y
<int> <chr> <dbl> <dbl>
1 1 1 694 18.0
2 1 2 652 12.0
3 1 3 614 0.09
4 2 1 702 18.0
5 2 2 659 12
6 2 3 612 0.1
7 3 1 701 18.0
8 3 2 655 12.1
9 3 3 613 0.1

R user-defined/dynamic summary function within dplyr::summarise

Somewhat hard to define this question without sounding like lots of similar questions!
I have a function for which I want one of the parameters to be a function name, that will be passed to dplyr::summarise, e.g. "mean" or "sum":
data(mtcars)
f <- function(x = mtcars,
groupcol = "cyl",
zCol = "disp",
zFun = "mean") {
zColquo = quo_name(zCol)
cellSummaries <- x %>%
group_by(gear, !!sym(groupcol)) %>% # 1 preset grouper, 1 user-defined
summarise(Count = n(), # 1 preset summary, 1 user defined
!!zColquo := mean(!!sym(zColquo))) # mean should be zFun, user-defined
ungroup
}
(this groups by gear and cyl, then returns, per group, count and mean(disp))
Per my note, I'd like 'mean' to be dynamic, performing the function defined by zFun, but I can't for the life of me work out how to do it! Thanks in advance for any advice.
You can use match.fun to make the function dynamic. I also removed zColquo as it's not needed.
library(dplyr)
library(rlang)
f <- function(x = mtcars,
groupcol = "cyl",
zCol = "disp",
zFun = "mean") {
cellSummaries <- x %>%
group_by(gear, !!sym(groupcol)) %>%
summarise(Count = n(),
!!zCol := match.fun(zFun)(!!sym(zCol))) %>%
ungroup
return(cellSummaries)
}
You can then check output
f()
# A tibble: 8 x 4
# gear cyl Count disp
# <dbl> <dbl> <int> <dbl>
#1 3 4 1 120.
#2 3 6 2 242.
#3 3 8 12 358.
#4 4 4 8 103.
#5 4 6 4 164.
#6 5 4 2 108.
#7 5 6 1 145
#8 5 8 2 326
f(zFun = "sum")
# A tibble: 8 x 4
# gear cyl Count disp
# <dbl> <dbl> <int> <dbl>
#1 3 4 1 120.
#2 3 6 2 483
#3 3 8 12 4291.
#4 4 4 8 821
#5 4 6 4 655.
#6 5 4 2 215.
#7 5 6 1 145
#8 5 8 2 652
We can use get
library(dplyr)
f <- function(x = mtcars,
groupcol = "cyl",
zCol = "disp",
zFun = "mean") {
zColquo = quo_name(zCol)
x %>%
group_by(gear, !!sym(groupcol)) %>% # 1 preset grouper, 1 user-defined
summarise(Count = n(), # 1 preset summary, 1 user defined
!!zColquo := get(zFun)(!!sym(zCol))) %>%
ungroup
}
f()
# A tibble: 8 x 4
# gear cyl Count disp
# <dbl> <dbl> <int> <dbl>
#1 3 4 1 120.
#2 3 6 2 242.
#3 3 8 12 358.
#4 4 4 8 103.
#5 4 6 4 164.
#6 5 4 2 108.
#7 5 6 1 145
#8 5 8 2 326
f(zFun = "sum")
# A tibble: 8 x 4
# gear cyl Count disp
# <dbl> <dbl> <int> <dbl>
#1 3 4 1 120.
#2 3 6 2 483
#3 3 8 12 4291.
#4 4 4 8 821
#5 4 6 4 655.
#6 5 4 2 215.
#7 5 6 1 145
#8 5 8 2 652
In addition, we could remove the sym evaluation in group_by and in summarise if we wrap with across
f <- function(x = mtcars,
groupcol = "cyl",
zCol = "disp",
zFun = "mean") {
x %>%
group_by(across(c(gear, groupcol))) %>% # 1 preset grouper, 1 user-defined
summarise(Count = n(), # 1 preset summary, 1 user defined
across(zCol, ~ get(zFun)(.))) %>%
ungroup
}
f()
# A tibble: 8 x 4
# gear cyl Count disp
# <dbl> <dbl> <int> <dbl>
#1 3 4 1 120.
#2 3 6 2 242.
#3 3 8 12 358.
#4 4 4 8 103.
#5 4 6 4 164.
#6 5 4 2 108.
#7 5 6 1 145
#8 5 8 2 326

Arrange values within a specific group

I'm trying to arrange values in decreasing order within a exact group in a nested dataframe. My input data looks like this. I've got two grouping variables (group1 and group2) and three values (i.e. id, value2, value3).
library(tidyverse)
set.seed(1234)
df <- tibble(group1 = c(rep(LETTERS[1:3], 4)),
group2 = c(rep(0, 6), rep(2, 6)),
value2 = rnorm(12, 20, sd = 10),
value3 = rnorm(12, 20, sd = 50)) %>%
group_by(group1) %>%
mutate(id = c(1:4)) %>%
ungroup()
I decided to group them by group1 and group2 and then nest():
df_nested <- df %>%
group_by(group1, group2) %>%
nest()
# A tibble: 6 x 3
# Groups: group1, group2 [6]
group1 group2 data
<chr> <dbl> <list>
1 A 0 <tibble [2 x 3]>
2 B 0 <tibble [2 x 3]>
3 C 0 <tibble [2 x 3]>
4 A 2 <tibble [2 x 3]>
5 B 2 <tibble [2 x 3]>
6 C 2 <tibble [2 x 3]>
Perfect. Now I need to sort only those data which group2 is equal to 2 by id. However I'm receiving a following error:
df_nested %>%
mutate(data = map2_df(.x = data, .y = group2,
~ifelse(.y == 2, arrange(-.x$id),
.x)))
Error: Argument 1 must have names
You could do :
library(dplyr)
library(purrr)
df_nested$data <- map2(df_nested$data, df_nested$group2,~if(.y == 2)
arrange(.x, -.x$id) else .x)
So data where group2 is not equal to 2 is not sorted
df_nested$data[[1]]
# A tibble: 2 x 3
# value2 value3 id
# <dbl> <dbl> <int>
#1 13.1 -89.0 1
#2 9.76 -3.29 2
and where group2 is 2 is sorted.
df_nested$data[[4]]
# A tibble: 2 x 3
#value2 value3 id
# <dbl> <dbl> <int>
#1 15.0 -28.4 4
#2 31.0 -22.8 3
If you want to combine them do :
map2_df(df_nested$data, df_nested$group2,~if(.y == 2) arrange(.x, -.x$id) else .x)
I would suggest creating an additional variable id_ which will be equal to the original id variable when group2 == 2 and NA otherwise. This way if we use it in sorting it'll make no effect when group2 != 2.
df %>%
mutate(id_ = if_else(group2 == 2, id, NA_integer_)) %>%
arrange(group1, group2, -id_)
#> # A tibble: 12 x 6
#> group1 group2 value2 value3 id id_
#> <chr> <dbl> <dbl> <dbl> <int> <int>
#> 1 A 0 17.6 50.2 1 NA
#> 2 A 0 33.8 -14.4 2 NA
#> 3 A 2 23.1 22.6 4 4
#> 4 A 2 13.7 50.2 3 3
#> 5 B 0 15.4 49.9 1 NA
#> 6 B 0 16.2 63.7 2 NA
#> 7 B 2 41.7 -2.90 4 4
#> 8 B 2 16.6 46.7 3 3
#> 9 C 0 19.9 -64.3 1 NA
#> 10 C 0 19.9 59.7 2 NA
#> 11 C 2 34.1 48.5 4 4
#> 12 C 2 32.3 23.1 3 3
Then if needed we can group and nest the result.

Resources