Mutate a dynamic subset of variable

Mutate a dynamic subset of variable - r

a = tibble(x = runif(1000,0,10),
t = rpois(1000,4)
) %>% arrange(t)
I want a column l that averages the subset of x for the values associated to a t < t(x).
Expected result:
for x[t=0], l = NaN
for x[t=1], l = mean(x[t<1])
for x[t=2], l = mean(x[t<2])
etc.
A code that does not work:
a %>%
mutate(
l = mean(x[a$t < .$t])
) -> a
Now this could would work:
for (i in c(1:1000)) {
a$l[i] = mean(a$x[a$t < a$t[i]])
}
But is not a mutate. I'd like a mutate so I can apply it to groups etc.
To understand better the issue: imagine that you have to average all the x before a date. Now: this, dynamically, in a mutate.
I think that purrr may be necessary but I hate it.

You can use map with mutate:
library(tidyverse)
f <- function(lim) mean(a$x[a$t < lim])
a %>% mutate(l = map_dbl(t, f))
Testing against OP solution:
res <- a %>% mutate(l = map_dbl(t, f))
l <- vector(mode = "numeric", length = 1000)
for (i in c(1:1000)) l[i] = mean(a$x[a$t < a$t[i]])
assertthat::are_equal(res$l, l) # TRUE

For each t value you can calculate average value of x and then calculate lag value of cumulative mean.
library(dplyr)
a %>%
group_by(t) %>%
summarise(l = mean(x)) %>%
mutate(l = lag(cummean(l)))
# t l
# <int> <dbl>
# 1 0 NA
# 2 1 5.33
# 3 2 5.45
# 4 3 5.36
# 5 4 5.26
# 6 5 5.16
# 7 6 5.10
# 8 7 5.07
# 9 8 5.12
#10 9 4.96
#11 10 4.98
#12 11 5.15
#13 12 4.93
If you want to maintain number of rows in the dataframe add %>% left_join(a, by = 't') to the above answer.
data
set.seed(123)
a = tibble(x = runif(1000,0,10),
t = rpois(1000,4)
) %>% arrange(t)

Related

Add grouped means to dataframe with grouped frequencies and proportions

I have this sort of data:
df <- data.frame(
id = sample(1:5, 100, replace = TRUE),
dur = sample(c(NA, rnorm(10)), 100, replace = TRUE),
char = sample(LETTERS, 100, replace = TRUE)
)
From this I can compute counts and proportions of the variable char:
library(dplyr)
df %>%
filter(!is.na(dur) & id != lag(id)) %>%
count(char, name = 'freq', sort = TRUE) %>%
mutate(prop = prop.table(freq) * 100)
char freq prop
1 C 6 8.571429
2 M 6 8.571429
3 X 5 7.142857
4 Y 5 7.142857
5 Z 5 7.142857
6 E 4 5.714286
7 I 4 5.714286
8 K 4 5.714286
9 J 3 4.285714
10 Q 3 4.285714
... clipped
Now, in df, the char values also have duration values. So I want to add another column, say mean_dur, with the mean dur values grouped by char in df. Adding on something like group_by(char) etc. to the above code doesn't work as the variable char is no longer recognized. How can that be achieved?
EDIT:
It can be done in steps, like this:
# Step 1 -- make df with counts and proportions:
df1 <- df %>%
filter(!is.na(dur) & id != lag(id)) %>%
count(char, name = 'freq', sort = TRUE) %>%
mutate(prop = prop.table(freq) * 100)
# Step 2 -- make another df with mean dur values:
df2 <- df %>%
filter(!is.na(dur) & id != lag(id)) %>%
group_by(char) %>%
summarise(mean_dur = mean(dur, na.rm = TRUE))
# Step 3 -- transfer mean dur values by matching `char`in `df1`and `df2`
df1$mean_dur <- df2$mean_dur[match(df1$char, df2$char)]
But is there a cleaner and tidyer dplyr way?
EDIT 2:
Thanks to #Anoushiravan R's solution, from which I picked the left_join idea, this seems like a clean and tidy solution (and it does not require the package janitor):
df %>%
filter(!is.na(dur) & id != lag(id)) %>%
count(char, name = 'freq', sort = TRUE) %>%
mutate(prop = prop.table(freq) * 100) %>%
left_join(df %>%
filter(!is.na(dur) & id != lag(id)) %>%
group_by(char) %>%
summarise(mean_dur = mean(dur)), by = "char")

I hope this is what you are looking for:
library(dplyr)
library(janitor)
df %>%
filter(!is.na(dur) & !id == lag(id)) %>%
tabyl(char) %>%
rename(freq = percent) %>%
mutate(freq = freq * 100) %>%
select(-n) %>%
arrange(desc(freq)) %>%
left_join(df %>%
filter(!is.na(dur) & id != lag(id)) %>%
group_by(char) %>%
summarise(mean_dur = mean(dur)), by = "char")
char freq mean_dur
T 7.894737 -0.4861708
Z 7.894737 -0.2867046
A 6.578947 -0.5056797
B 5.263158 0.3513478
E 5.263158 0.5113139
K 5.263158 -1.4560764
L 5.263158 0.8235192
N 5.263158 0.9037481
X 5.263158 -1.4669529
C 3.947368 -0.4064762
I 3.947368 -0.7722133
P 3.947368 -0.1076928
U 3.947368 0.5573875
Y 3.947368 0.2404896
D 2.631579 0.5942473
F 2.631579 1.2381883
G 2.631579 -0.2155605
J 2.631579 1.0528329
M 2.631579 -1.5482806
O 2.631579 0.2813264
S 2.631579 1.2132490
V 2.631579 0.6157874
H 1.315789 -1.2664754
Q 1.315789 1.1027114
R 1.315789 0.1288634
W 1.315789 1.0528329

If you're prepared to give up prop.table, then I think this gives you what you want...
df %>%
filter(!is.na(dur) & id != lag(id)) %>%
group_by(char) %>%
summarise(
n=n(),
prop = 100*n/nrow(.),
mean_dur=mean(dur, na.rm=TRUE),
.groups="drop"
)
# A tibble: 25 x 4
char n prop mean_dur
* <fct> <int> <dbl> <dbl>
1 A 6 8.82 0.158
2 B 5 7.35 -0.144
3 C 2 2.94 0.951
4 D 2 2.94 0.518
5 E 5 7.35 0.211
6 F 3 4.41 0.333
7 G 2 2.94 0.951
8 H 3 4.41 0.624
9 I 2 2.94 -0.422
10 J 2 2.94 -0.347
# … with 15 more rows
[It took me a while to notice you were working with random data. set.seed() would have been helpful! ;=) ]
Edited in line with comment below

Another option:
mean_dur <- df %>% group_by(char) %>% summarise(mean_dur=mean(dur,na.rm=T))
tab <- df %>%
filter(!is.na(dur) & id != lag(id)) %>%
count(char, name = 'freq') %>%
mutate(prop = prop.table(freq) * 100)
tab <- merge.data.frame(tab,mean_dur)
tab <- tab[order(tab$freq,decreasing = T),]
char freq prop mean_dur
17 R 6 8.108108 -0.75610907
3 D 5 6.756757 -0.61657511
5 F 5 6.756757 -0.34153689
10 K 5 6.756757 -0.90688768
19 T 5 6.756757 0.33628707
6 G 4 5.405405 -0.93390134
9 J 4 5.405405 0.27471673
11 L 4 5.405405 0.87029782
13 N 4 5.405405 0.17163797
16 Q 4 5.405405 -0.67554378
22 X 4 5.405405 -0.42108346
7 H 3 4.054054 0.36290234
14 O 3 4.054054 -0.56712470
15 P 3 4.054054 0.08316665
2 C 2 2.702703 -1.15398142
4 E 2 2.702703 -0.31271923
12 M 2 2.702703 -0.96001502
18 S 2 2.702703 -0.88921047
20 U 2 2.702703 0.24299241
21 W 2 2.702703 -1.32772406
1 A 1 1.351351 0.24299241
8 I 1 1.351351 -1.07336407
23 Z 1 1.351351 -1.07336407

R combine rows with similar values

I have a dataframe and the row values are first ordered from smallest to largest. I compute row value differences between adjacent rows, combine rows with similar differences (e.g., smaller than 1), and return averaged values of combined rows. I could check each row differences with a for loop, but seems a very inefficient way. Any better ideas? Thanks.
library(dplyr)
DF <- data.frame(ID=letters[1:12],
Values=c(1, 2.2, 3, 5, 6.2, 6.8, 7, 8.5, 10, 12.2, 13, 14))
DF <- DF %>%
mutate(Diff=c(0, diff(Values)))
The expected output of DF would be
ID Values
a 1.0
b/c 2.6 # (2.2+3.0)/2
d 5.0
e/f/g 6.67 # (6.2+6.8+7.0)/3
h 8.5
i 10.0
j/k 12.6 # (12.2+13.0)/2
i 14.0

Here is an option with data.table
library(data.table)
setDT(DF)[, .(ID = toString(ID), Values = round(mean(Values), 2)),
by = .(Diff = cumsum(c(TRUE, diff(Values)>=1)))][, -1, with = FALSE]
# ID Values
#1: a 1.00
#2: b, c 2.60
#3: d 5.00
#4: e, f, g 6.67
#5: h 8.50
#6: i 10.00
#7: j, k 12.60
#8: l 14.00

Calculate difference between Values of every row and check if those are >= 1. Cumulative sum of that >=1 will provide you distinct group on which one can summarize to get desired result.
library(dplyr)
DF %>% arrange(Values) %>%
group_by(Diff = cumsum(c(1,diff(Values)) >= 1) ) %>%
summarise(ID = paste0(ID, collapse = "/"), Values = mean(Values)) %>%
ungroup() %>% select(-Diff)
# # A tibble: 8 x 2
# ID Values
# <chr> <dbl>
# 1 a 1.00
# 2 b/c 2.60
# 3 d 5.00
# 4 e/f/g 6.67
# 5 h 8.50
# 6 i 10.0
# 7 j/k 12.6
# 8 l 14.0

library(magrittr)
df <- DF[order(DF$Values),]
df$Values %>%
#Find groups for each row
outer(., ., function(x, y) x >= y & x < y + 1) %>%
# Remove sub-groups
`[<-`(apply(., 1, cumsum) > 1, F) %>%
# Remove sub-group columns
.[, colSums(.) > 0] %>%
# select these groups from data
apply(2, function(x) data.frame(ID = paste(df$ID[x], collapse = '/')
, Values = mean(df$Values[x]))) %>%
# bind results by row
do.call(what = rbind)
# ID Values
# 1 a 1.000000
# 2 b/c 2.600000
# 4 d 5.000000
# 5 e/f/g 6.666667
# 8 h 8.500000
# 9 i 10.000000
# 10 j/k 12.600000
# 12 l 14.000000
Note:
This method is different from those using diff because it groups rows together only if all Values are within < 1 of each other.
Example:
Change the dataset so that Value is 7.3 at ID g.
Above method: The IDs e, f, and g are no longer grouped together because the value at ID e is 6.2 and 7.2 - 6.2 > 1.
Diff Method: IDs e, f, and g are still grouped together because the diff of IDs at e and f is < 1 and the diff of IDs F and G is < 1

R dplyr join by range or virtual column

I want to join two tibbles by a range or a virtual column. but it seems the by - parameter just allow to handle chr oder vector(chr) of existing column names.
In my example I have a tibble d with a column value, and a tibble r with a from and a to column.
d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
> d
# A tibble: 26 x 1
value
<dbl>
1 1.0
2 1.2
3 1.4
4 1.6
5 1.8
6 2.0
7 2.2
8 2.4
9 2.6
10 2.8
# ... with 16 more rows
> r
# A tibble: 6 x 3
from to class
<int> <dbl> <chr>
1 1 2 A
2 2 3 B
3 3 4 C
4 4 5 D
5 5 6 E
6 6 Inf F
now I want to join the value column in d within the range of from and to in r:
d %>% inner_join(r, by = "value between from and to") # >= and <
I can't find a way to do this so decided to join the floor of value in d with the from column in r
d %>% inner_join(r, by = c("floor(value)" = "from"))
of course i can create a second column to solve that:
d %>%
mutate(join_value = floor(value)) %>%
inner_join(r, by = c("join_value" = "from")) %>%
select(value, class)
# A tibble: 26 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
# ... with 16 more rows
but isn't there a more comfortable way?
Thanks

I don't think inequality joins is implemented in dplyr yet, or it ever will (see this discussion on Join on inequality constraints), but this is a good situation to use an SQL join:
library(tibble)
library(sqldf)
as.tibble(sqldf("select d.value, r.class from d
join r on d.value >= r.'from' and
d.value < r.'to'"))
Alternatively, if you want to integrate the join into your dplyr chain, you can use fuzzyjoin::fuzzy_join:
library(dplyr)
library(fuzzyjoin)
d %>%
fuzzy_join(r, by = c("value" = "from", "value" = "to"),
match_fun = list(`>=`, `<`)) %>%
select(value, class)
Result:
# A tibble: 31 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 A
7 2.0 B
8 2.2 B
9 2.4 B
10 2.6 B
# ... with 21 more rows
Notice I added single quotes around from and to since those are reserved words for the SQL language.

Ok thanks for advices, this was pretty interesting. I finally wrote a function range_join (inspired by #ycw's code) and compared all described solution in view of runtime.
I like fuzzy_join but with only 50k rows in d it needs more than 40sec. Thats too slow.
Here the result with 5k rows in d
library(dplyr)
library(fuzzyjoin)
library(sqldf)
#join by range by #WiWeber
range_join <- function(x, y, value, left, right){
x_result <- tibble()
for (y_ in split(y, 1:nrow(y)))
x_result <- x_result %>% bind_rows(x[x[[value]] >= y_[[left]] & x[[value]] < y_[[right]],] %>% cbind(y_))
return(x_result)
}
#dynamic join by #ycw
dynamic_join <- function(d, r){
d$type <- NA_character_
for (r_ in split(r, r$type))
d <- d %>% mutate(type = ifelse(value >= r_$from & value < r_$to, r_$type, type))
return(d)
}
d <- tibble(value = seq(1,6, by = 0.001), join = TRUE)
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), type = LETTERS[seq(1,6)], join = TRUE)
# #useR sqldf - fast and intuitive but extra library with horrible code
start <- Sys.time()
d2 <- tbl_df(sqldf("select d.value, r.type from d
join r on d.value >= r.'from' and
d.value < r.'to'"))
Sys.time() - start
# #useR fuzzy_join .... very cool but veeeeeeeeeeeeeeeery slow
start <- Sys.time()
d2 <- d %>%
fuzzy_join(r, by = c("value" = "from", "value" = "to"), match_fun = list(`>=`, `<`)) %>%
select(value, type)
Sys.time() - start
# #jonathande4 cut pretty fast
start <- Sys.time()
d2 <- d
d2$type <- cut(d$value, unique(c(r$from, r$to)), r$type, right = FALSE)
Sys.time() - start
# #WiWeber floor
start <- Sys.time()
d2 <- d %>%
mutate(join_value = floor(value)) %>%
inner_join(r, by = c("join_value" = "from")) %>%
select(value, type)
Sys.time() - start
# #WiWeber cross join - filter
start <- Sys.time()
d2 <- d %>%
inner_join(r, by = "join") %>%
filter(value >= from, value < to) %>%
select(value, type)
Sys.time() - start
# #hardik-gupta sapply
start <- Sys.time()
d2 <- d %>%
mutate(
type = unlist(sapply(value, function (x) r[which(x >= r$from & x < r$to), "type"]))
) %>%
select(value, type)
Sys.time() - start
# #ycw re-dynamic join
start <- Sys.time()
d2 <- d %>% dynamic_join(r)
Sys.time() - start
# #WiWeber range_join
start <- Sys.time()
d2 <- d %>%
range_join(r, "value", "from", "to") %>%
select(value, type)
Sys.time() - start
Results:
# #useR sqldf - fast and intuitive but extra library with horrible code
Time difference of 0.06221986 secs
# #useR fuzzy_join .... very cool but veeeeeeeeeeeeeeeery slow
Time difference of 4.765595 secs
# #jonathande4 cut pretty fast
Time difference of 0.004637003 secs
# #WiWeber floor
Time difference of 0.02223396 secs
# #WiWeber cross join - filter
Time difference of 0.0201931 secs
# #hardik-gupta sapply
Time difference of 5.166633 secs
# #ycw dynamic join
Time difference of 0.03124094 secs
# #WiWeber range_join
Time difference of 0.02691698 secs
greez WiWeber

You use the cut function to create a "class" in object d and then use a left join.
d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
d[["class"]] <- cut(d[["value"]], c(0,2,3,4,5,6,Inf), c('A',"B", "C", "D", "E", "F"), right = FALSE)
d <- left_join(d, r)
To get the right buckets, you just need to work with the cut function to get what you want.

We can use sapply for this
library(tibble)
d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
d <- cbind(d, data.frame(class = (unlist(sapply(d$value, function (x) r[which(x >= r$from & x < r$to), "class"]))) ) )
d
value class
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
11 3.0 C
12 3.2 C
13 3.4 C
14 3.6 C
15 3.8 C
16 4.0 D
17 4.2 D
18 4.4 D
19 4.6 D
20 4.8 D
21 5.0 E
22 5.2 E
23 5.4 E
24 5.6 E
25 5.8 E
26 6.0 F

We can use mutate and case_when from dplyr.
library(dplyr)
d2 <- d %>%
mutate(class = case_when(
value >= 1 & value < 2 ~ "A",
value >= 2 & value < 3 ~ "B",
value >= 3 & value < 4 ~ "C",
value >= 4 & value < 5 ~ "D",
value >= 5 & value < 6 ~ "E",
value >= 6 ~ "F"
))
d2
# A tibble: 26 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
# ... with 16 more rows
Update
Here is a workaround by defining a function for this task.
d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
library(dplyr)
# Define a function for dynamic join
dynamic_join <- function(d, r){
if (!("class" %in% colnames(d))){
d[["class"]] <- NA_character_
}
d <- d %>%
mutate(class = ifelse(value >= r$from & value < r$to, r$class, class))
return(d)
}
re_dynamic_join <- function(d, r){
r_list <- split(r, r$class)
for (i in 1:length(r_list)){
d <- dynamic_join(d, r_list[[i]])
}
return(d)
}
# Apply the function
d2 <- d %>% re_dynamic_join(r)
d2
# A tibble: 26 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
# ... with 16 more rows

I really liked #WiWeber's range_join function, but it gives an error if a record is not within range. Here's a modification
library(dplyr)
d <- tibble(value = c(seq(1,4, by = 0.2),9))
r <- tibble(from = seq(1,5), to = c(seq(2,5),8), class = LETTERS[seq(1,5)])
range_join <- function(x, y, value, left, right){
all_matches <- tibble()
x = as.data.frame(x)
y = as.data.frame(y)
x$index=x[,value]
for (i in 1:nrow(y)){
matches = x %>% filter(index>=y[i,left] & index<= y[i,right])
if (nrow(matches)>0){
all_matches = all_matches %>% bind_rows(matches %>% cbind(y[i,]))
}
}
all_matches = all_matches %>% select(-index)
return(all_matches)
}
data <- d %>%
range_join(r, "value", "from", "to")
data

How do I do a rolling cumsum over consecutive rows of a tibble in R

I have a toy example of a tibble.
What is the most efficient way to sum two consecutive rows of y grouped by x
library(tibble)
l = list(x = c("a", "b", "a", "b", "a", "b"), y = c(1, 4, 3, 3, 7, 0))
df <- as_tibble(l)
df
#> # A tibble: 6 x 2
#> x y
#> <chr> <dbl>
#> 1 a 1
#> 2 b 4
#> 3 a 3
#> 4 b 3
#> 5 a 7
#> 6 b 0
So the output would be something like this
group sum seq
a 4 1
a 10 2
b 7 1
b 3 2
I'd like to use the tidyverse and possibly roll_sum() from the RcppRoll package
and have the code so that a variable length of consecutive rows could be used for real world data in which there would be many groups
TIA

One way to do this is use group_by %>% do where you can customize the returned data frame in do:
library(RcppRoll); library(tidyverse)
n = 2
df %>%
group_by(x) %>%
do(
data.frame(
sum = roll_sum(.$y, n),
seq = seq_len(length(.$y) - n + 1)
)
)
# A tibble: 4 x 3
# Groups: x [2]
# x sum seq
# <chr> <dbl> <int>
#1 a 4 1
#2 a 10 2
#3 b 7 1
#4 b 3 2
Edit: Since this is not as efficient, probably due to the data frame construction header and binding data frames on the go, here is an improved version (still somewhat slower than data.table but not as much now):
df %>%
group_by(x) %>%
summarise(sum = list(roll_sum(y, n)), seq = list(seq_len(n() -n + 1))) %>%
unnest()
Timing, use #Matt's data and setup:
library(tibble)
library(dplyr)
library(RcppRoll)
library(stringi) ## Only included for ability to generate random strings
## Generate data with arbitrary number of groups and rows --------------
rowCount <- 100000
groupCount <- 10000
sumRows <- 2L
set.seed(1)
l <- tibble(x = sample(stri_rand_strings(groupCount,3),rowCount,rep=TRUE),
y = sample(0:10,rowCount,rep=TRUE))
## Using dplyr and tibble -----------------------------------------------
ptm <- proc.time() ## Start the clock
dplyr_result <- l %>%
group_by(x) %>%
summarise(sum = list(roll_sum(y, n)), seq = list(seq_len(n() -n + 1))) %>%
unnest()
dplyr_time <- proc.time() - ptm ## Stop the clock
## Using data.table instead ----------------------------------------------
library(data.table)
ptm <- proc.time() ## Start the clock
setDT(l) ## Convert l to a data.table
dt_result <- l[,.(sum = RcppRoll::roll_sum(y, n = sumRows, fill = NA, align = "left"),
seq = seq_len(.N)),
keyby = .(x)][!is.na(sum)]
data.table_time <- proc.time() - ptm
Result is:
dplyr_time
# user system elapsed
# 0.688 0.003 0.689
data.table_time
# user system elapsed
# 0.422 0.009 0.430

Here is one approach for you. Since you want to sum up two consecutive rows, you could use lead() and do the calculation for sum. For seq, I think you can simply take row numbers, seeing your expected outcome. Once you are done with these operations, you arrange your data by x (if necessary, x and seq). Finally, you drop rows with NAs. If necessary, you may want to drop y by writing select(-y) at the end of the code.
group_by(df, x) %>%
mutate(sum = y + lead(y),
seq = row_number()) %>%
arrange(x) %>%
ungroup %>%
filter(complete.cases(.))
# x y sum seq
# <chr> <dbl> <dbl> <int>
#1 a 1 4 1
#2 a 3 10 2
#3 b 4 7 1
#4 b 3 3 2

I notice you asked for the most efficient way-- if you are looking at scaling this up to a much larger set, I would strongly recommend data.table.
library(data.table)
library(RcppRoll)
l[, .(sum = RcppRoll::roll_sum(y, n = 2L, fill = NA, align = "left"),
seq = seq_len(.N)),
keyby = .(x)][!is.na(sum)]
A rough benchmark comparison of this vs an answer using the tidyverse packages with 100,000 rows and 10,000 groups illustrates the significant difference.
(I used Psidom's answer instead of jazzurro's since jazzuro's did not allow for an arbritary number of rows to be summed.)
library(tibble)
library(dplyr)
library(RcppRoll)
library(stringi) ## Only included for ability to generate random strings
## Generate data with arbitrary number of groups and rows --------------
rowCount <- 100000
groupCount <- 10000
sumRows <- 2L
set.seed(1)
l <- tibble(x = sample(stri_rand_strings(groupCount,3),rowCount,rep=TRUE),
y = sample(0:10,rowCount,rep=TRUE))
## Using dplyr and tibble -----------------------------------------------
ptm <- proc.time() ## Start the clock
dplyr_result <- l %>%
group_by(x) %>%
do(
data.frame(
sum = roll_sum(.$y, sumRows),
seq = seq_len(length(.$y) - sumRows + 1)
)
)
|========================================================0% ~0 s remaining
dplyr_time <- proc.time() - ptm ## Stop the clock
## Using data.table instead ----------------------------------------------
library(data.table)
ptm <- proc.time() ## Start the clock
setDT(l) ## Convert l to a data.table
dt_result <- l[,.(sum = RcppRoll::roll_sum(y, n = sumRows, fill = NA, align = "left"),
seq = seq_len(.N)),
keyby = .(x)][!is.na(sum)]
data.table_time <- proc.time() - ptm ## Stop the clock
Results:
> dplyr_time
user system elapsed
10.28 0.04 10.36
> data.table_time
user system elapsed
0.35 0.02 0.36
> all.equal(dplyr_result,as.tibble(dt_result))
[1] TRUE

A solution using tidyverse and zoo. This is similar to Psidom's approach.
library(tidyverse)
library(zoo)
df2 <- df %>%
group_by(x) %>%
do(data_frame(x = unique(.$x),
sum = rollapplyr(.$y, width = 2, FUN = sum))) %>%
mutate(seq = 1:n()) %>%
ungroup()
df2
# A tibble: 4 x 3
x sum seq
<chr> <dbl> <int>
1 a 4 1
2 a 10 2
3 b 7 1
4 b 3 2

zoo + dplyr
library(zoo)
library(dplyr)
df %>%
group_by(x) %>%
mutate(sum = c(NA, rollapply(y, width = 2, sum)),
seq = row_number() - 1) %>%
drop_na()
# A tibble: 4 x 4
# Groups: x [2]
x y sum seq
<chr> <dbl> <dbl> <dbl>
1 a 3 4 1
2 b 3 7 1
3 a 7 10 2
4 b 0 3 2
If the moving window only equal to 2 using lag
df %>%
group_by(x) %>%
mutate(sum = y + lag(y),
seq = row_number() - 1) %>%
drop_na()
# A tibble: 4 x 4
# Groups: x [2]
x y sum seq
<chr> <dbl> <dbl> <dbl>
1 a 3 4 1
2 b 3 7 1
3 a 7 10 2
4 b 0 3 2
EDIT :
n = 3 # your moving window
df %>%
group_by(x) %>%
mutate(sum = c(rep(NA, n - 1), rollapply(y, width = n, sum)),
seq = row_number() - n + 1) %>%
drop_na()

A small variant on existing answers: first convert the data to list-column format, then use purrr to map() roll_sum() onto the data.
l = list(x = c("a", "b", "a", "b", "a", "b"), y = c(1, 4, 3, 3, 7, 0))
as.tibble(l) %>%
group_by(x) %>%
summarize(list_y = list(y)) %>%
mutate(rollsum = map(list_y, ~roll_sum(.x, 2))) %>%
select(x, rollsum) %>%
unnest %>%
group_by(x) %>%
mutate(seq = row_number())
I think if you have the latest version of purrr you could get rid of the last two lines (the final group_by() and mutate()) by using imap() instead of map.

Grouping Over All Possible Combinations of Several Variables With dplyr

Given a situation such as the following
library(dplyr)
myData <- tbl_df(data.frame( var1 = rnorm(100),
var2 = letters[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var3 = LETTERS[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var4 = month.abb[1:3] %>%
sample(100, replace = TRUE) %>%
factor()))
I would like to group `myData' to eventually find summary data grouping by all possible combinations of var2, var3, and var4.
I can create a list with all possible combinations of variables as character values with
groupNames <- names(myData)[2:4]
myGroups <- Map(combn,
list(groupNames),
seq_along(groupNames),
simplify = FALSE) %>%
unlist(recursive = FALSE)
My plan was to make separate data sets for each variable combination with a for() loop, something like
### This Does Not Work
for (i in 1:length(myGroups)){
assign( myGroups[i]%>%
unlist() %>%
paste0(collapse = "")%>%
paste0("Data"),
myData %>%
group_by_(lapply(myGroups[[i]], as.symbol)) %>%
summarise( n = length(var1),
avgVar2 = var2 %>%
mean()))
}
Admittedly I am not very good with lists, and looking up this issue was a bit challenging since dpyr updates have altered how grouping works a bit.
If there is a better way to do this than separate data sets I would love to know.
I've gotten a loop similar to above working when I am only grouping by a single variable.
Any and all help is greatly appreciated! Thank you!

This seems convulated, and there's probably a way to simplify or fancy it up with a do, but it works. Using your myData and myGroups,
results = lapply(myGroups, FUN = function(x) {
do.call(what = group_by_, args = c(list(myData), x)) %>%
summarise( n = length(var1),
avgVar1 = mean(var1))
}
)
> results[[1]]
Source: local data frame [3 x 3]
var2 n avgVar1
1 a 31 0.38929738
2 b 31 -0.07451717
3 c 38 -0.22522129
> results[[4]]
Source: local data frame [9 x 4]
Groups: var2
var2 var3 n avgVar1
1 a A 11 -0.1159160
2 a B 11 0.5663312
3 a C 9 0.7904056
4 b A 7 0.0856384
5 b B 13 0.1309756
6 b C 11 -0.4192895
7 c A 15 -0.2783099
8 c B 10 -0.1110877
9 c C 13 -0.2517602
> results[[7]]
# I won't paste them here, but it has all 27 rows, grouped by var2, var3 and var4.
I changed your summarise call to average var1 since var2 isn't numeric.

I have created a function based on the answer of #Gregor and the comments that followed:
library(magrittr)
myData <- tbl_df(data.frame( var1 = rnorm(100),
var2 = letters[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var3 = LETTERS[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var4 = month.abb[1:3] %>%
sample(100, replace = TRUE) %>%
factor()))
Function combSummarise
combSummarise <- function(data, variables=..., summarise=...){
# Get all different combinations of selected variables (credit to #Michael)
myGroups <- lapply(seq_along(variables), function(x) {
combn(c(variables), x, simplify = FALSE)}) %>%
unlist(recursive = FALSE)
# Group by selected variables (credit to #konvas)
df <- eval(parse(text=paste("lapply(myGroups, function(x){
dplyr::group_by_(data, .dots=x) %>%
dplyr::summarize_( \"", paste(summarise, collapse="\",\""),"\")})"))) %>%
do.call(plyr::rbind.fill,.)
groupNames <- c(myGroups[[length(myGroups)]])
newNames <- names(df)[!(names(df) %in% groupNames)]
df <- cbind(df[, groupNames], df[, newNames])
names(df) <- c(groupNames, newNames)
df
}
Call of combSummarise
combSummarise (myData, var=c("var2", "var3", "var4"),
summarise=c("length(var1)", "mean(var1)", "max(var1)"))
or
combSummarise (myData, var=c("var2", "var4"),
summarise=c("length(var1)", "mean(var1)", "max(var1)"))
or
combSummarise (myData, var=c("var2", "var4"),
summarise=c("length(var1)"))
etc

Inspired by the answers by Gregor and dimitris_ps, I wrote a dplyr style function that runs summarise for all combinations of group variables.
summarise_combo <- function(data, ...) {
groupVars <- group_vars(data) %>% map(as.name)
groupCombos <- map( 0:length(groupVars), ~combn(groupVars, ., simplify=FALSE) ) %>%
unlist(recursive = FALSE)
results <- groupCombos %>%
map(function(x) {data %>% group_by(!!! x) %>% summarise(...)} ) %>%
bind_rows()
results %>% select(!!! groupVars, everything())
}
Example
library(tidyverse)
mtcars %>% group_by(cyl, vs) %>% summarise_combo(cyl_n = n(), mean(mpg))

Using unite to create a new column is the simplest way
library(tidyverse)
df = tibble(
a = c(1,1,2,2,1,1,2,2),
b = c(3,4,3,4,3,4,3,4),
val = c(1,2,3,4,5,6,7,8)
)
print(df)#output1
df_2 = unite(df, 'combined_header', a, b, sep='_', remove=FALSE) #remove=F doesn't remove existing columns
print(df_2)#output2
df_2 %>% group_by(combined_header) %>%
summarize(avg_val=mean(val)) %>% print()#output3
#avg 1_3 = mean(1,5)=3 avg 1_4 = mean(2, 6) = 4
RESULTS
Output:
output1
a b val
<dbl> <dbl> <dbl>
1 1 3 1
2 1 4 2
3 2 3 3
4 2 4 4
5 1 3 5
6 1 4 6
7 2 3 7
8 2 4 8
output2
combined_header a b val
<chr> <dbl> <dbl> <dbl>
1 1_3 1 3 1
2 1_4 1 4 2
3 2_3 2 3 3
4 2_4 2 4 4
5 1_3 1 3 5
6 1_4 1 4 6
7 2_3 2 3 7
8 2_4 2 4 8
output3
combined_header avg_val
<chr> <dbl>
1 1_3 3
2 1_4 4
3 2_3 5
4 2_4 6

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Mutate a dynamic subset of variable - r

Related

Add grouped means to dataframe with grouped frequencies and proportions

R combine rows with similar values

R dplyr join by range or virtual column

How do I do a rolling cumsum over consecutive rows of a tibble in R

Grouping Over All Possible Combinations of Several Variables With dplyr

Categories

Resources