I want to aggregate data by group, but each observation can belong to several groups (e.g. observation 1 belongs to groups A and B). I could not find a nice way to achieve this with data.table. Currently I have created, for each possible group, a logical variable which takes the value TRUE if the observation belongs to that group. I am looking for a better way than the one presented below. I would also like to know how I could achieve this with the tidyverse.
library(data.table)
# Data
set.seed(1)
TF <- c(TRUE, FALSE)
time <- rep(1:4, each = 5)
df <- data.table(time = time, x = rnorm(20),
                 groupA = sample(TF, size = 20, replace = TRUE),
                 groupB = sample(TF, size = 20, replace = TRUE),
                 groupC = sample(TF, size = 20, replace = TRUE))
# This should be nicer and less repetitive
df[groupA == TRUE, .(A = sum(x)), by = time][
  df[groupB == TRUE, .(B = sum(x)), by = time], on = "time"][
    df[groupC == TRUE, .(C = sum(x)), by = time], on = "time"]
# desired output
time A B C
1: 1 NA 0.9432955 0.1331984
2: 2 1.2257538 0.2427420 0.1882493
3: 3 -0.1992284 -0.1992284 1.9016244
4: 4 0.5327774 0.9438362 0.9276459
Here is a solution with data.table:
df[, lapply(.SD[, .(groupA, groupB, groupC)]*x, sum), time]
# > df[, lapply(.SD[, .(groupA, groupB, groupC)]*x, sum), time]
# time groupA groupB groupC
# 1: 1 0.0000000 0.9432955 0.1331984
# 2: 2 1.2257538 0.2427420 0.1882493
# 3: 3 -0.1992284 -0.1992284 1.9016244
# 4: 4 0.5327774 0.9438362 0.9276459
or (thanks to @chinsoon12 for the comment) more programmatically:
df[, lapply(.SD*x, sum), by=.(time), .SDcols=paste0("group", c("A","B","C"))]
If you want the result in the long format you can do:
df[, colSums(.SD*x), by=.(time), .SDcols=paste0("group", c("A","B","C"))]
### with an indicator for the group:
df[, .(colSums(.SD*x), c("A","B","C")), by=.(time), .SDcols=paste0("group", c("A","B","C"))]
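The same aggregation can also be done in long form with data.table alone; a sketch using melt() and dcast() (missing time/group combinations come back as sum(numeric(0)) = 0, as in the wide result above):
# melt the group flags to long format, keep only the TRUE rows, then cast back
dcast(melt(df, id.vars = c("time", "x"))[value == TRUE],
      time ~ variable, value.var = "x", fun.aggregate = sum)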
I think it's easier to work in long format here. First I gather the observations into long format, then keep only the rows where the observation belongs to the corresponding group. I then drop the logical column, rename the groups to single letters, and aggregate across groups and time (summarise in dplyr).
Finally I spread back to wide format.
library(dplyr)
library(tidyr)
set.seed(1)
TF <- c(TRUE, FALSE)
time <- rep(1:4, each = 5)
df <- data.frame(time = time, x = rnorm(20),
                 groupA = sample(TF, size = 20, replace = TRUE),
                 groupB = sample(TF, size = 20, replace = TRUE),
                 groupC = sample(TF, size = 20, replace = TRUE))
df %>%
  gather(group, belongs, groupA:groupC) %>%
  filter(belongs) %>%
  select(-belongs) %>%
  mutate(group = gsub("group", "", group)) %>%
  group_by(time, group) %>%
  summarise(x = sum(x)) %>%
  spread(group, x)
Output
# A tibble: 4 x 4
# Groups: time [4]
time A B C
<int> <dbl> <dbl> <dbl>
1 1 NA 0.943 0.133
2 2 1.23 0.243 0.188
3 3 -0.199 -0.199 1.90
4 4 0.533 0.944 0.928
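For reference, gather() and spread() have since been superseded in tidyr; with tidyr >= 1.0.0 the same pipeline can be written with the pivot verbs. A sketch on the same data:
library(tidyr) # >= 1.0.0
df %>%
  pivot_longer(starts_with("group"), names_to = "group",
               names_prefix = "group", values_to = "belongs") %>%
  filter(belongs) %>%
  group_by(time, group) %>%
  summarise(x = sum(x)) %>%
  ungroup() %>%
  pivot_wider(names_from = group, values_from = x)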
Another option is to use the tidyr and dplyr packages in combination with data.table: work on the data in long format (via melt()), then change it back to wide format.
library(dplyr)
library(tidyr)
melt(df, id.vars = c("time", "x")) %>%
  filter(value) %>%
  group_by(time, variable) %>%
  summarise(sum = sum(x)) %>%
  spread(variable, sum)
# # A tibble: 4 x 4
# # Groups: time [4]
# time groupA groupB groupC
# * <int> <dbl> <dbl> <dbl>
# 1 1 NA 0.943 0.133
# 2 2 1.23 0.243 0.188
# 3 3 -0.199 -0.199 1.90
# 4 4 0.533 0.944 0.928
Related
I am trying to calculate means for some data along a non-regular date sequence. For example, I have minute-level data for specific periods of time during the day, and I am interested in calculating 5-minute averages. However, I am not sure how the width parameter of rollapply works when it is specified as a list.
library(tidyverse)
library(zoo)
length = 16
set.seed(10)
dxf <- data.frame(
  date = seq(Sys.time(), by = "59 sec", length.out = length),
  value = runif(length)
)
# Create a "discontinuity"
dxf$date[8:length] <- dxf$date[8:length] + 3600*24
# Add some noise
dxf$date <- dxf$date + runif(length, 0, 1)
diff(dxf$date)
dxf %>%
  arrange(date) %>%
  mutate(
    diff = c(as.numeric(diff(date)), NA),
    mean = rollapply(value, width = 5, mean, partial = TRUE, align = "left")
  )
# This is what I need: a variable width that adjusts to the discontinuity in the rows.
mean1 <- mean(dxf$value[1:5])
mean2 <- mean(dxf$value[2:6])
mean3 <- mean(dxf$value[3:7])
mean4 <- NA # only 4 values available: mean(dxf$value[4:7])
mean5 <- NA # only 3 values available: mean(dxf$value[5:7])
mean6 <- NA # only 2 values available: mean(dxf$value[6:7])
mean7 <- NA # only 1 value available: mean(dxf$value[7:7])
mean8 <- mean(dxf$value[7:11])
etc.
I think this is a tricky problem. Here is one approach:
1. Generate a 1-minute sequence from the first to the last datetime.
2. Interpolate so we have a value at every minute. This includes interpolating across the discontinuity.
3. Calculate the running 5-minute mean based on the 1-minute interpolated values.
4. Remove the values where the gap in the original datetime values is too large.
Also, take care with time zones; it is best to set these to some deliberately chosen value, or to UTC, which the lubridate functions use by default.
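To illustrate the time zone point (a minimal aside: lubridate's parsers default to UTC, while base as.POSIXct() falls back to the session's local time zone):
library(lubridate)
ymd_hms("2019-08-14 09:06:05")                # parsed as UTC by default
as.POSIXct("2019-08-14 09:06:05", tz = "UTC") # base R, with the time zone made explicit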
library(tidyverse)
library(RcppRoll)
library(lubridate)
dxf <- tibble(
  date = seq(from = ymd_hms('2019-08-14 09:06:05'), by = "59 sec", length.out = 30),
  value = runif(30)
)
dxf$date[15:30] <- dxf$date[15:30] + 3600*24 # discontinuity
dxf$date <- dxf$date + round(runif(30)) # noise
dxf <- dxf %>%
  mutate(date = ymd_hms(date),
         date_num = as.numeric(date),
         diff = date_num - lag(date_num))
discontinuity <- which(dxf$diff > 70)
n = nrow(dxf)
date_seq <- seq(from = dxf$date_num[1], to = dxf$date_num[n], by = 60) # create a 1-min sequence
value_interp <- approx(x = dxf$date_num, y = dxf$value, xout = date_seq) # interpolate values onto the 1-min sequence
df <- tibble(
  date = as_datetime(date_seq),
  mean_value = RcppRoll::roll_mean(value_interp$y, n = 5, fill = NA, align = 'left'))
df %>%
  filter(date < dxf$date[discontinuity - 1] | date > dxf$date[discontinuity])
We could extract the date, group by it, and then use rollmean:
library(dplyr)
dxf %>%
  mutate(d1 = as.Date(date)) %>%
  group_by(d1) %>%
  mutate(mean = zoo::rollmean(value, 5, align = "left", fill = NA)) %>%
  ungroup %>%
  select(-d1)
# date value mean
# <dttm> <dbl> <dbl>
# 1 2019-08-14 12:49:09 0.507 0.404
# 2 2019-08-14 12:50:08 0.307 0.347
# 3 2019-08-14 12:51:07 0.427 0.341
# 4 2019-08-14 12:52:07 0.693 NA
# 5 2019-08-14 12:53:06 0.0851 NA
# 6 2019-08-14 12:54:05 0.225 NA
# 7 2019-08-14 12:55:04 0.275 NA
# 8 2019-08-15 12:56:02 0.272 0.507
# 9 2019-08-15 12:57:01 0.616 0.476
#10 2019-08-15 12:58:01 0.430 0.472
#11 2019-08-15 12:59:00 0.652 0.457
#12 2019-08-15 12:59:58 0.568 0.413
#13 2019-08-15 13:00:58 0.114 NA
#14 2019-08-15 13:01:56 0.596 NA
#15 2019-08-15 13:02:56 0.358 NA
#16 2019-08-15 13:03:54 0.429 NA
data
set.seed(10)
length <- 16 # number of observations, as in the question
dxf <- data.frame(
  date = seq(Sys.time(), by = "59 sec", length.out = length),
  value = runif(length)
)
dxf$date[8:length] <- dxf$date[8:length] + 3600*24
dxf$date <- dxf$date + runif(length, 0, 1)
Here w[i] is the number of elements of date that are less than or equal to date[i] + 300, minus i - 1 (300 refers to 300 seconds, i.e. the 5-minute window).
library(zoo)

date <- dxf$date
# w[i]: how many observations fall in the 5-minute window starting at date[i]
w <- findInterval(date + 300, date) - seq_along(date) + 1
# variable-width, left-aligned rolling mean; windows with fewer than 5 points become NA
rollapply(dxf$value, w, mean, align = "left") * ifelse(w < 5, NA, 1)
# same, without zoo
sapply(seq_along(w), function(i) mean(dxf$value[seq(i, length = w[i])])) *
  ifelse(w < 5, NA, 1)
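For intuition, here is a toy check of the width formula on hypothetical, evenly spaced dates (59 seconds apart):
d <- as.POSIXct("2019-08-14 09:00:00", tz = "UTC") + c(0, 59, 118, 177, 236, 295, 354)
findInterval(d + 300, d) - seq_along(d) + 1
# [1] 6 6 5 4 3 2 1  (six dates fall within 300 seconds of d[1])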
There are 10 projects split between groups A and B, each with different start and end dates. For each day within a given period, the sum of outputX and outputY needs to be calculated. I managed to do this for all projects together, but how can I split the results per group?
I've made several attempts with lapply() and purrr::map(), and also looked at filters and splits, but to no avail. An example that doesn't distinguish between groups is below.
library(tidyverse)
library(lubridate)
df <- data.frame(
  project = 1:10,
  group = c("A","B"),
  outputX = rnorm(2),
  outputY = rnorm(5),
  start_date = sample(seq(as.Date('2018-01-3'), as.Date('2018-1-13'), by = "day"), 10),
  end_date = sample(seq(as.Date('2018-01-13'), as.Date('2018-01-31'), by = "day"), 10))
df$interval <- interval(df$start_date, df$end_date)
period <- data.frame(date = seq(as.Date("2018-01-08"), as.Date("2018-01-17"), by = 1))
df_sum <- do.call(rbind, lapply(period$date, function(x){
  index <- x %within% df$interval
  list("X" = sum(df$outputX[index]),
       "Y" = sum(df$outputY[index]))}))
outcome <- cbind(period, df_sum) %>% gather("id", "value", 2:3)
outcome
Ultimately, it should be a 40x4 table. Some suggestions are much appreciated!
If I understand you correctly, you need an inner join. SO suggests using sqldf; see https://stackoverflow.com/a/11895368/9300556.
With your data we can do something like the following. There is no need to calculate df$interval, but we need to add an id to period, otherwise sqldf won't work.
df <- data.frame(
  project = 1:10,
  group = c("A","B"),
  outputX = rnorm(2),
  outputY = rnorm(5),
  start = sample(seq(as.Date('2018-01-3'), as.Date('2018-1-13'), by = "day"), 10),
  end = sample(seq(as.Date('2018-01-13'), as.Date('2018-01-31'), by = "day"), 10))
# df$interval <- interval(df$start_date, df$end_date)

period <- data.frame(date = seq(as.Date("2018-01-08"), as.Date("2018-01-17"), by = 1)) %>%
  mutate(id = 1:nrow(.))
Then we can use sqldf
sqldf::sqldf("select * from period inner join df
              on (period.date > df.start and period.date <= df.end)") %>%
  as_tibble() %>%
  group_by(date, group) %>%
  summarise(X = sum(outputX),
            Y = sum(outputY)) %>%
  gather(id, value, -group, -date)
# A tibble: 40 x 4
# Groups: date [10]
date group id value
<date> <fct> <chr> <dbl>
1 2018-01-08 A X 3.04
2 2018-01-08 B X 2.34
3 2018-01-09 A X 3.04
4 2018-01-09 B X 3.51
5 2018-01-10 A X 3.04
6 2018-01-10 B X 4.68
7 2018-01-11 A X 4.05
8 2018-01-11 B X 4.68
9 2018-01-12 A X 4.05
10 2018-01-12 B X 5.84
# ... with 30 more rows
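As an aside, dplyr 1.1.0 and later supports non-equi joins directly via join_by(), which removes the need for sqldf and the id column. A sketch with the same df and period:
library(dplyr) # >= 1.1.0
inner_join(period, df, by = join_by(date > start, date <= end)) %>%
  group_by(date, group) %>%
  summarise(X = sum(outputX),
            Y = sum(outputY)) %>%
  tidyr::gather(id, value, -group, -date)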
I have a toy example of a tibble.
What is the most efficient way to sum two consecutive rows of y, grouped by x?
library(tibble)
l = list(x = c("a", "b", "a", "b", "a", "b"), y = c(1, 4, 3, 3, 7, 0))
df <- as_tibble(l)
df
#> # A tibble: 6 x 2
#> x y
#> <chr> <dbl>
#> 1 a 1
#> 2 b 4
#> 3 a 3
#> 4 b 3
#> 5 a 7
#> 6 b 0
So the output would be something like this:
group sum seq
a 4 1
a 10 2
b 7 1
b 3 2
I'd like to use the tidyverse, possibly roll_sum() from the RcppRoll package,
and have the code allow a variable number of consecutive rows, since the real-world data has many groups.
TIA
One way to do this is to use group_by %>% do, where you can customize the returned data frame inside do:
library(RcppRoll); library(tidyverse)
n = 2
df %>%
  group_by(x) %>%
  do(
    data.frame(
      sum = roll_sum(.$y, n),
      seq = seq_len(length(.$y) - n + 1)
    )
  )
# A tibble: 4 x 3
# Groups: x [2]
# x sum seq
# <chr> <dbl> <int>
#1 a 4 1
#2 a 10 2
#3 b 7 1
#4 b 3 2
Edit: since this is not as efficient, probably due to the data frame construction overhead and binding data frames on the fly, here is an improved version (still somewhat slower than data.table, but not by as much):
df %>%
  group_by(x) %>%
  summarise(sum = list(roll_sum(y, n)), seq = list(seq_len(n() - n + 1))) %>%
  unnest()
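Note that in tidyr >= 1.0 unnest() expects its list-columns to be spelled out, so the last step there would become:
unnest(c(sum, seq))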
Timing, using @Matt's data and setup:
library(tibble)
library(dplyr)
library(RcppRoll)
library(stringi) ## Only included for ability to generate random strings
## Generate data with arbitrary number of groups and rows --------------
rowCount <- 100000
groupCount <- 10000
sumRows <- 2L
set.seed(1)
l <- tibble(x = sample(stri_rand_strings(groupCount, 3), rowCount, rep = TRUE),
            y = sample(0:10, rowCount, rep = TRUE))
## Using dplyr and tibble -----------------------------------------------
ptm <- proc.time() ## Start the clock
dplyr_result <- l %>%
  group_by(x) %>%
  summarise(sum = list(roll_sum(y, n)), seq = list(seq_len(n() - n + 1))) %>%
  unnest()
dplyr_time <- proc.time() - ptm ## Stop the clock
## Using data.table instead ----------------------------------------------
library(data.table)
ptm <- proc.time() ## Start the clock
setDT(l) ## Convert l to a data.table
dt_result <- l[, .(sum = RcppRoll::roll_sum(y, n = sumRows, fill = NA, align = "left"),
                   seq = seq_len(.N)),
               keyby = .(x)][!is.na(sum)]
data.table_time <- proc.time() - ptm
Result is:
dplyr_time
# user system elapsed
# 0.688 0.003 0.689
data.table_time
# user system elapsed
# 0.422 0.009 0.430
Here is one approach for you. Since you want to sum two consecutive rows, you can use lead() to compute the sum. For seq, you can simply take the row number, given your expected outcome. Once these operations are done, arrange the data by x (or by x and seq, if necessary). Finally, drop the rows with NAs. If needed, you can also drop y by adding select(-y) at the end of the code.
group_by(df, x) %>%
  mutate(sum = y + lead(y),
         seq = row_number()) %>%
  arrange(x) %>%
  ungroup %>%
  filter(complete.cases(.))
# x y sum seq
# <chr> <dbl> <dbl> <int>
#1 a 1 4 1
#2 a 3 10 2
#3 b 4 7 1
#4 b 3 3 2
I notice you asked for the most efficient way. If you are looking at scaling this up to a much larger dataset, I would strongly recommend data.table.
library(data.table)
library(RcppRoll)
l[, .(sum = RcppRoll::roll_sum(y, n = 2L, fill = NA, align = "left"),
      seq = seq_len(.N)),
  keyby = .(x)][!is.na(sum)]
A rough benchmark comparison of this vs an answer using the tidyverse packages with 100,000 rows and 10,000 groups illustrates the significant difference.
(I used Psidom's answer instead of jazzurro's, since jazzurro's did not allow for an arbitrary number of rows to be summed.)
library(tibble)
library(dplyr)
library(RcppRoll)
library(stringi) ## Only included for ability to generate random strings
## Generate data with arbitrary number of groups and rows --------------
rowCount <- 100000
groupCount <- 10000
sumRows <- 2L
set.seed(1)
l <- tibble(x = sample(stri_rand_strings(groupCount, 3), rowCount, rep = TRUE),
            y = sample(0:10, rowCount, rep = TRUE))
## Using dplyr and tibble -----------------------------------------------
ptm <- proc.time() ## Start the clock
dplyr_result <- l %>%
  group_by(x) %>%
  do(
    data.frame(
      sum = roll_sum(.$y, sumRows),
      seq = seq_len(length(.$y) - sumRows + 1)
    )
  )
dplyr_time <- proc.time() - ptm ## Stop the clock
## Using data.table instead ----------------------------------------------
library(data.table)
ptm <- proc.time() ## Start the clock
setDT(l) ## Convert l to a data.table
dt_result <- l[, .(sum = RcppRoll::roll_sum(y, n = sumRows, fill = NA, align = "left"),
                   seq = seq_len(.N)),
               keyby = .(x)][!is.na(sum)]
data.table_time <- proc.time() - ptm ## Stop the clock
Results:
> dplyr_time
user system elapsed
10.28 0.04 10.36
> data.table_time
user system elapsed
0.35 0.02 0.36
> all.equal(dplyr_result,as.tibble(dt_result))
[1] TRUE
A solution using tidyverse and zoo. This is similar to Psidom's approach.
library(tidyverse)
library(zoo)
df2 <- df %>%
  group_by(x) %>%
  do(data_frame(x = unique(.$x),
                sum = rollapplyr(.$y, width = 2, FUN = sum))) %>%
  mutate(seq = 1:n()) %>%
  ungroup()
df2
# A tibble: 4 x 3
x sum seq
<chr> <dbl> <int>
1 a 4 1
2 a 10 2
3 b 7 1
4 b 3 2
zoo + dplyr
library(zoo)
library(dplyr)
library(tidyr) # needed for drop_na()

df %>%
  group_by(x) %>%
  mutate(sum = c(NA, rollapply(y, width = 2, sum)),
         seq = row_number() - 1) %>%
  drop_na()
# A tibble: 4 x 4
# Groups: x [2]
x y sum seq
<chr> <dbl> <dbl> <dbl>
1 a 3 4 1
2 b 3 7 1
3 a 7 10 2
4 b 0 3 2
If the moving window is only of width 2, you can use lag:
df %>%
  group_by(x) %>%
  mutate(sum = y + lag(y),
         seq = row_number() - 1) %>%
  drop_na()
# A tibble: 4 x 4
# Groups: x [2]
x y sum seq
<chr> <dbl> <dbl> <dbl>
1 a 3 4 1
2 b 3 7 1
3 a 7 10 2
4 b 0 3 2
EDIT: for a general moving window of width n:
n = 3 # your moving window
df %>%
  group_by(x) %>%
  mutate(sum = c(rep(NA, n - 1), rollapply(y, width = n, sum)),
         seq = row_number() - n + 1) %>%
  drop_na()
A small variant on existing answers: first convert the data to list-column format, then use purrr to map() roll_sum() onto the data.
l = list(x = c("a", "b", "a", "b", "a", "b"), y = c(1, 4, 3, 3, 7, 0))
as.tibble(l) %>%
  group_by(x) %>%
  summarize(list_y = list(y)) %>%
  mutate(rollsum = map(list_y, ~ roll_sum(.x, 2))) %>%
  select(x, rollsum) %>%
  unnest %>%
  group_by(x) %>%
  mutate(seq = row_number())
I think if you have the latest version of purrr you could get rid of the last two lines (the final group_by() and mutate()) by using imap() instead of map.
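I haven't tested the imap() route, but a related shortcut is to split y by x and let imap_dfr() carry the group name along. A sketch (window of 2, same packages as above):
library(purrr)
library(tibble)
library(RcppRoll)
split(l$y, l$x) %>%
  imap_dfr(~ tibble(x = .y, sum = roll_sum(.x, 2), seq = seq_along(sum)))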
Often I need to spread multiple value columns, as in this question. But I do it often enough that I'd like to be able to write a function that does this.
For example, given the data:
set.seed(42)
dat <- data_frame(id = rep(1:2, each = 2),
                  grp = rep(letters[1:2], times = 2),
                  avg = rnorm(4),
                  sd = runif(4))
> dat
# A tibble: 4 x 4
id grp avg sd
<int> <chr> <dbl> <dbl>
1 1 a 1.3709584 0.6569923
2 1 b -0.5646982 0.7050648
3 2 a 0.3631284 0.4577418
4 2 b 0.6328626 0.7191123
I'd like to create a function that returns something like:
# A tibble: 2 x 5
id a_avg b_avg a_sd b_sd
<int> <dbl> <dbl> <dbl> <dbl>
1 1 1.3709584 -0.5646982 0.6569923 0.7050648
2 2 0.3631284 0.6328626 0.4577418 0.7191123
How can I do that?
We'll return to the answer provided in the question linked to, but for the moment let's start with a more naive approach.
One idea would be to spread each value column individually, and then join the results, i.e.
library(dplyr)
library(tidyr)
library(tibble)
dat_avg <- dat %>%
  select(-sd) %>%
  spread(key = grp, value = avg) %>%
  rename(a_avg = a,
         b_avg = b)

dat_sd <- dat %>%
  select(-avg) %>%
  spread(key = grp, value = sd) %>%
  rename(a_sd = a,
         b_sd = b)
> full_join(dat_avg,
            dat_sd,
            by = 'id')
# A tibble: 2 x 5
id a_avg b_avg a_sd b_sd
<int> <dbl> <dbl> <dbl> <dbl>
1 1 1.3709584 -0.5646982 0.6569923 0.7050648
2 2 0.3631284 0.6328626 0.4577418 0.7191123
(I used a full_join just in case we run into situations where not all combinations of the join columns appear in all of them.)
Let's start with a function that works like spread but allows you to pass the key and value columns as characters:
spread_chr <- function(data, key_col, value_cols, fill = NA,
                       convert = FALSE, drop = TRUE, sep = NULL){
  n_val <- length(value_cols)
  result <- vector(mode = "list", length = n_val)
  id_cols <- setdiff(names(data), c(key_col, value_cols))
  for (i in seq_along(result)){
    result[[i]] <- spread(data = data[, c(id_cols, key_col, value_cols[i]), drop = FALSE],
                          key = !!key_col,
                          value = !!value_cols[i],
                          fill = fill,
                          convert = convert,
                          drop = drop,
                          sep = paste0(sep, value_cols[i], sep))
  }
  result %>%
    purrr::reduce(.f = full_join, by = id_cols)
}
> dat %>%
    spread_chr(key_col = "grp",
               value_cols = c("avg","sd"),
               sep = "_")
# A tibble: 2 x 5
id grp_avg_a grp_avg_b grp_sd_a grp_sd_b
<int> <dbl> <dbl> <dbl> <dbl>
1 1 1.3709584 -0.5646982 0.6569923 0.7050648
2 2 0.3631284 0.6328626 0.4577418 0.7191123
The key ideas here are to unquote the arguments key_col and value_cols[i] using the !! operator, and using the sep argument in spread to control the resulting value column names.
If we wanted to convert this function to accept unquoted arguments for the key and value columns, we could modify it like so:
spread_nq <- function(data, key_col, ..., fill = NA,
                      convert = FALSE, drop = TRUE, sep = NULL){
  val_quos <- rlang::quos(...)
  key_quo <- rlang::enquo(key_col)
  value_cols <- unname(tidyselect::vars_select(names(data), !!!val_quos))
  key_col <- unname(tidyselect::vars_select(names(data), !!key_quo))
  n_val <- length(value_cols)
  result <- vector(mode = "list", length = n_val)
  id_cols <- setdiff(names(data), c(key_col, value_cols))
  for (i in seq_along(result)){
    result[[i]] <- spread(data = data[, c(id_cols, key_col, value_cols[i]), drop = FALSE],
                          key = !!key_col,
                          value = !!value_cols[i],
                          fill = fill,
                          convert = convert,
                          drop = drop,
                          sep = paste0(sep, value_cols[i], sep))
  }
  result %>%
    purrr::reduce(.f = full_join, by = id_cols)
}
> dat %>%
    spread_nq(key_col = grp, avg, sd, sep = "_")
# A tibble: 2 x 5
id grp_avg_a grp_avg_b grp_sd_a grp_sd_b
<int> <dbl> <dbl> <dbl> <dbl>
1 1 1.3709584 -0.5646982 0.6569923 0.7050648
2 2 0.3631284 0.6328626 0.4577418 0.7191123
The change here is that we capture the unquoted arguments with rlang::quos and rlang::enquo and then simply convert them back to characters using tidyselect::vars_select.
Returning to the solution in the linked question that uses a sequence of gather, unite and spread, we can use what we've learned to make a function like this:
spread_nt <- function(data, key_col, ..., fill = NA,
                      convert = TRUE, drop = TRUE, sep = "_"){
  key_quo <- rlang::enquo(key_col)
  val_quos <- rlang::quos(...)
  value_cols <- unname(tidyselect::vars_select(names(data), !!!val_quos))
  key_col <- unname(tidyselect::vars_select(names(data), !!key_quo))
  data %>%
    gather(key = ..var.., value = ..val.., !!!val_quos) %>%
    unite(col = ..grp.., c(key_col, "..var.."), sep = sep) %>%
    spread(key = ..grp.., value = ..val.., fill = fill,
           convert = convert, drop = drop, sep = NULL)
}
> dat %>%
    spread_nt(key_col = grp, avg, sd, sep = "_")
# A tibble: 2 x 5
id a_avg a_sd b_avg b_sd
* <int> <dbl> <dbl> <dbl> <dbl>
1 1 1.3709584 0.6569923 -0.5646982 0.7050648
2 2 0.3631284 0.4577418 0.6328626 0.7191123
This relies on the same rlang techniques as the last example. We're using some unusual names, like ..var.., for our intermediate variables in order to reduce the chance of name collisions with existing columns in the data frame.
Also, we're using the sep argument in unite to control the resulting column names, which is why we force sep = NULL when we spread.
Spreading operations can also be done by unnesting a properly reformatted table. Here's an alternative using the tidyverse:
# helper function that returns a horizontal one-line named tibble wrapped in a list
lhframe <- function(x, nms) list(setNames(as_tibble(t(x)), nms))
dat %>% group_by(id) %>%
  summarize(avg = lhframe(avg, grp),
            sd = lhframe(sd, grp)) %>%
  unnest(.sep = "_")
# # A tibble: 2 x 5
# id avg_a avg_b sd_a sd_b
# <int> <dbl> <dbl> <dbl> <dbl>
# 1 1 -1.7631631 0.4600974 0.7595443 0.5664884
# 2 2 -0.6399949 0.4554501 0.8496897 0.1894739
Unfortunately the following doesn't work:
dat %>% group_by(id) %>%
  summarize_at(vars(avg, sd), lhframe, grp) %>%
  unnest(.sep = "_")
Since tidyr version 1.0.0 this can be done directly with pivot_wider():
tidyr::pivot_wider(data = dat, id_cols = id, names_from = grp, values_from = avg:sd)
# # A tibble: 2 x 5
# id avg_a avg_b sd_a sd_b
# <int> <dbl> <dbl> <dbl> <dbl>
# 1 1 1.37 -0.565 0.657 0.705
# 2 2 0.363 0.633 0.458 0.719
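If you want exactly the names asked for (a_avg, b_avg, ...), the names_glue argument of pivot_wider() controls the naming pattern; a sketch:
tidyr::pivot_wider(data = dat, id_cols = id, names_from = grp,
                   values_from = avg:sd, names_glue = "{grp}_{.value}")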
Given a situation such as the following
library(dplyr)
myData <- tbl_df(data.frame(var1 = rnorm(100),
                            var2 = letters[1:3] %>%
                              sample(100, replace = TRUE) %>%
                              factor(),
                            var3 = LETTERS[1:3] %>%
                              sample(100, replace = TRUE) %>%
                              factor(),
                            var4 = month.abb[1:3] %>%
                              sample(100, replace = TRUE) %>%
                              factor()))
I would like to group myData to eventually find summary data grouped by all possible combinations of var2, var3, and var4.
I can create a list with all possible combinations of variables as character values with
groupNames <- names(myData)[2:4]
myGroups <- Map(combn,
                list(groupNames),
                seq_along(groupNames),
                simplify = FALSE) %>%
  unlist(recursive = FALSE)
My plan was to make separate data sets for each variable combination with a for() loop, something like
### This Does Not Work
for (i in 1:length(myGroups)){
  assign(myGroups[i] %>%
           unlist() %>%
           paste0(collapse = "") %>%
           paste0("Data"),
         myData %>%
           group_by_(lapply(myGroups[[i]], as.symbol)) %>%
           summarise(n = length(var1),
                     avgVar2 = var2 %>%
                       mean()))
}
Admittedly I am not very good with lists, and looking up this issue was a bit challenging since dplyr updates have altered how grouping works a bit.
If there is a better way to do this than separate data sets I would love to know.
I've gotten a loop similar to above working when I am only grouping by a single variable.
Any and all help is greatly appreciated! Thank you!
This seems convoluted, and there's probably a way to simplify or fancy it up with a do, but it works. Using your myData and myGroups,
results = lapply(myGroups, FUN = function(x) {
  do.call(what = group_by_, args = c(list(myData), x)) %>%
    summarise(n = length(var1),
              avgVar1 = mean(var1))
})
> results[[1]]
Source: local data frame [3 x 3]
var2 n avgVar1
1 a 31 0.38929738
2 b 31 -0.07451717
3 c 38 -0.22522129
> results[[4]]
Source: local data frame [9 x 4]
Groups: var2
var2 var3 n avgVar1
1 a A 11 -0.1159160
2 a B 11 0.5663312
3 a C 9 0.7904056
4 b A 7 0.0856384
5 b B 13 0.1309756
6 b C 11 -0.4192895
7 c A 15 -0.2783099
8 c B 10 -0.1110877
9 c C 13 -0.2517602
> results[[7]]
# I won't paste them here, but it has all 27 rows, grouped by var2, var3 and var4.
I changed your summarise call to average var1 since var2 isn't numeric.
I have created a function based on the answer of @Gregor and the comments that followed:
library(magrittr)
myData <- tbl_df(data.frame(var1 = rnorm(100),
                            var2 = letters[1:3] %>%
                              sample(100, replace = TRUE) %>%
                              factor(),
                            var3 = LETTERS[1:3] %>%
                              sample(100, replace = TRUE) %>%
                              factor(),
                            var4 = month.abb[1:3] %>%
                              sample(100, replace = TRUE) %>%
                              factor()))
Function combSummarise
combSummarise <- function(data, variables = ..., summarise = ...){
  # Get all different combinations of selected variables (credit to @Michael)
  myGroups <- lapply(seq_along(variables), function(x) {
    combn(c(variables), x, simplify = FALSE)}) %>%
    unlist(recursive = FALSE)
  # Group by selected variables (credit to @konvas)
  df <- eval(parse(text = paste("lapply(myGroups, function(x){
    dplyr::group_by_(data, .dots = x) %>%
    dplyr::summarize_(\"", paste(summarise, collapse = "\",\""), "\")})"))) %>%
    do.call(plyr::rbind.fill, .)
  groupNames <- c(myGroups[[length(myGroups)]])
  newNames <- names(df)[!(names(df) %in% groupNames)]
  df <- cbind(df[, groupNames], df[, newNames])
  names(df) <- c(groupNames, newNames)
  df
}
Call of combSummarise
combSummarise(myData, variables = c("var2", "var3", "var4"),
              summarise = c("length(var1)", "mean(var1)", "max(var1)"))
or
combSummarise(myData, variables = c("var2", "var4"),
              summarise = c("length(var1)", "mean(var1)", "max(var1)"))
or
combSummarise(myData, variables = c("var2", "var4"),
              summarise = c("length(var1)"))
etc
Inspired by the answers by Gregor and dimitris_ps, I wrote a dplyr-style function that runs summarise for all combinations of grouping variables.
summarise_combo <- function(data, ...) {
  groupVars <- group_vars(data) %>% map(as.name)
  groupCombos <- map(0:length(groupVars), ~ combn(groupVars, ., simplify = FALSE)) %>%
    unlist(recursive = FALSE)
  results <- groupCombos %>%
    map(function(x) {data %>% group_by(!!!x) %>% summarise(...)}) %>%
    bind_rows()
  results %>% select(!!!groupVars, everything())
}
Example
library(tidyverse)
mtcars %>% group_by(cyl, vs) %>% summarise_combo(cyl_n = n(), mean(mpg))
Using unite to create a new column is the simplest way:
library(tidyverse)
df = tibble(
  a = c(1,1,2,2,1,1,2,2),
  b = c(3,4,3,4,3,4,3,4),
  val = c(1,2,3,4,5,6,7,8)
)
print(df)  # output1

df_2 = unite(df, 'combined_header', a, b, sep = '_', remove = FALSE)  # remove = FALSE keeps the original columns
print(df_2)  # output2

df_2 %>% group_by(combined_header) %>%
  summarize(avg_val = mean(val)) %>% print()  # output3
# avg 1_3 = mean(1,5) = 3;  avg 1_4 = mean(2,6) = 4
Output:
output1
a b val
<dbl> <dbl> <dbl>
1 1 3 1
2 1 4 2
3 2 3 3
4 2 4 4
5 1 3 5
6 1 4 6
7 2 3 7
8 2 4 8
output2
combined_header a b val
<chr> <dbl> <dbl> <dbl>
1 1_3 1 3 1
2 1_4 1 4 2
3 2_3 2 3 3
4 2_4 2 4 4
5 1_3 1 3 5
6 1_4 1 4 6
7 2_3 2 3 7
8 2_4 2 4 8
output3
combined_header avg_val
<chr> <dbl>
1 1_3 3
2 1_4 4
3 2_3 5
4 2_4 6