dplyr Rolling Conditional Counts

I have a data frame as follows:
df <- data.frame(
  Item  = c("A","A","A","A","A","B","B","B","B","B"),
  Date  = c("2018-1-1","2018-2-1","2018-3-1","2018-4-1","2018-5-1",
            "2018-1-1","2018-2-1","2018-3-1","2018-4-1","2018-5-1"),
  Value = rnorm(10))
I want to mutate a new column, grouped by Item, that counts the number of values higher than 0 within a rolling window of 3 (or any other integer I specify).
I am familiar with the tidyverse, so a dplyr solution would be most welcome.

Think of the zoo package whenever you want to roll anything. Ungrouped, over the whole column:
df$new <- zoo::rollsum(df$Value > 0, 3, fill = NA)
#    Item     Date      Value new
# 1     A 2018-1-1  0.5852699  NA
# 2     A 2018-2-1 -0.7383377   1
# 3     A 2018-3-1 -0.3157693   1
# 4     A 2018-4-1  1.2475237   1
# 5     A 2018-5-1 -1.5479757   1
# 6     B 2018-1-1 -0.6913331   0
# 7     B 2018-2-1 -0.2423809   0
# 8     B 2018-3-1 -1.6363024   0
# 9     B 2018-4-1 -0.3256263   1
# 10    B 2018-5-1  0.3563144  NA
You also have a choice of window position: have a closer look at the argument align = c("center", "left", "right").
So as a dplyr chain (grouped by Item, so the window does not roll across item boundaries):
df %>%
  group_by(Item) %>%
  dplyr::mutate(new = zoo::rollsum(Value > 0, 3, fill = NA))
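If you want a trailing window instead, where each row counts the positives among the current and two preceding values, a sketch using align = "right":
df %>%
  group_by(Item) %>%
  # right-aligned window: NA for the first two rows of each Item,
  # then a count over the current and previous two values
  dplyr::mutate(new = zoo::rollsum(Value > 0, 3, align = "right", fill = NA))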

You could use the RcppRoll package.
require(RcppRoll)
df$new <- RcppRoll::roll_sum(df$Value > 0, 3, fill = NA)
Using the tidyverse:
df %>%
  group_by(Item) %>%
  dplyr::mutate(new = RcppRoll::roll_sum(Value > 0, 3, fill = NA))
Speed-wise this is faster than the zoo package:
n <- 10000
df <- data.frame(
  Item = sample(LETTERS, n, replace = TRUE),
  Value = rnorm(n))
df_grouped <- df %>% group_by(Item)
microbenchmark::microbenchmark(
  RcppRoll = df_grouped <- df_grouped %>%
    dplyr::mutate(new_RcppRoll = RcppRoll::roll_sum(Value > 0, 3, fill = NA)),
  zoo = df_grouped <- df_grouped %>%
    dplyr::mutate(new_zoo = zoo::rollsum(Value > 0, 3, fill = NA))
)
Results in:
Unit: milliseconds
     expr       min        lq      mean   median        uq       max neval
 RcppRoll  2.509003  2.741993  2.929227  2.83913  2.983726  5.832962   100
      zoo 11.172920 11.785113 13.288970 12.43320 13.607826 25.879754   100
And
all.equal(df_grouped$new_RcppRoll, df_grouped$new_zoo)
# TRUE

Changed the rnorm example for clarity, using sample(-5:5), so the data looks like:
   Item  Date        Value
   <fct> <date>      <int>
 1 A     2018-01-01      3
 2 B     2018-01-01      2
 3 B     2018-02-01     -5
 4 A     2018-02-01     -3
 5 A     2018-03-01      4
 6 B     2018-03-01     -2
 7 A     2018-04-01      5
 8 B     2018-04-01      0
 9 A     2018-05-01      1
10 B     2018-05-01     -4
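For reproducibility, here is a hand-copied reconstruction of that data frame (the original sample() call and seed were not posted, so the values below are transcribed from the table above):
df <- data.frame(
  Item  = c("A","B","B","A","A","B","A","B","A","B"),
  Date  = as.Date(c("2018-01-01","2018-01-01","2018-02-01","2018-02-01",
                    "2018-03-01","2018-03-01","2018-04-01","2018-04-01",
                    "2018-05-01","2018-05-01")),
  Value = c(3, 2, -5, -3, 4, -2, 5, 0, 1, -4))
With that in place, the rolling computation is: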
df <- df %>%
  mutate(greater_than = (Value > 0) * Value) %>%
  group_by(Item) %>%
  arrange(Date) %>%
  mutate(greater_than = zoo::rollapplyr(greater_than, 3, sum, partial = TRUE))
df %>% arrange(Item) %>% head(10)
It should look like this (note this sums the positive values in each window; to count them instead, roll over Value > 0):
   Item  Date        Value greater_than
 1 A     2018-01-01      3            3
 2 A     2018-02-01     -3            3
 3 A     2018-03-01      4            7
 4 A     2018-04-01      5            9
 5 A     2018-05-01      1           10
 6 B     2018-01-01      2            2
 7 B     2018-02-01     -5            2
 8 B     2018-03-01     -2            2
 9 B     2018-04-01      0            0
10 B     2018-05-01     -4            0


Sum most recent scores in 3 unique areas

I have dataset of areas and scores in those areas.
I want to maintain an aggregated score (agg_score) that is equal to the sum of the most recent scores for A, B, and C.
For instance, you will see that my expected_output for row 4 is 7, because the value of C is now 2 while the most recent values of A and B are still 1 and 4.
All I have been able to do so far is sum the three most recent scores, which results in agg_score values that equal the sum of C, C, and B at times. It is important that I have an accurate agg_score at each possible date.
library(dplyr)
ds <-
  tibble(
    area = c("A", "B", "C", "C", "B", "A", "A", "B", "C"),
    score = c(1, 4, 5, 2, 6, 3, 4, 6, 3),
    scoring_date = seq.Date(
      from = as.Date("2019-01-01"),
      to = as.Date("2019-01-09"),
      by = "days"
    ),
    expected_output = c(NA, NA, 10, 7, 9, 11, 12, 12, 13)
  ) %>%
  arrange(scoring_date)
# Inadequate code for summing the last three scores
ds %>%
  mutate(agg_score = score + lag(score) + lag(score, 2))
Using dplyr::last we can find the most recent value for each area, then sum those values once all three areas have appeared.
# small helper function to clarify: most recent score per area within rows 1..x
sum_fun <- function(x){
  lc_vec  <- ds[1:x, ] %>% group_by(area) %>% summarise(lc = last(score)) %>% pull(lc)
  lc_vecf <- ifelse(length(lc_vec) == 3, sum(lc_vec), NA)
  return(lc_vecf)
}
library(dplyr)
ds %>% mutate(Output = sapply(1:nrow(.), sum_fun)) # instead of sapply we can use purrr::map_dbl
# A tibble: 9 x 5
  area  score scoring_date expected_output Output
  <chr> <dbl> <date>                 <dbl>  <dbl>
1 A         1 2019-01-01                NA     NA
2 B         4 2019-01-02                NA     NA
3 C         5 2019-01-03                10     10
4 C         2 2019-01-04                 7      7
5 B         6 2019-01-05                 9      9
6 A         3 2019-01-06                11     11
7 A         4 2019-01-07                12     12
8 B         6 2019-01-08                12     12
9 C         3 2019-01-09                13     13
There might be a data.table self-merge option out there, but I couldn't quite figure it out. Here's an idea implementing your fill approach in data.table; it should be flexible for more areas:
library(data.table)
setDT(ds)  # ds must be a data.table for := to work
lapply(unique(ds$area), function(a){
  ds[, paste0("val_", a) := zoo::na.locf0(ifelse(area == a, score, NA))]
  invisible(NULL)
})
ds[, agg_score := rowSums(.SD), .SDcols = paste0("val_", unique(ds$area))
   ][, paste0("val_", unique(ds$area)) := NULL]
ds
#   area score scoring_date agg_score
# 1    A     1   2019-01-01        NA
# 2    B     4   2019-01-02        NA
# 3    C     5   2019-01-03        10
# 4    C     2   2019-01-04         7
# 5    B     6   2019-01-05         9
# 6    A     3   2019-01-06        11
# 7    A     4   2019-01-07        12
# 8    B     6   2019-01-08        12
# 9    C     3   2019-01-09        13
Original solution:
Alternatively you could try sapply. The function is a little long, but that's because it has a lot of work to do! If you wanted to run this on more areas you wouldn't have to manually fill each one, so that could be a benefit:
ds$agg_score <- sapply(1:nrow(ds), function(i) {
  other_areas <- setdiff(unique(ds$area), ds[i, "area"])
  # locate rows for the other areas that come before the current index
  f_idxs <- Filter(function(x) x < i, which(ds$area %in% other_areas))
  if (length(f_idxs) == 0) return(NA)
  # split by area so we can take the maximum (most recent) index before our date
  idxs <- sapply(split(f_idxs, ds[f_idxs, "area"]), max)
  if (length(idxs) < length(other_areas)) return(NA)
  sum(ds[c(idxs, i), "score"])  # sum up our scores
})
So I found a way to do this using fill() to ensure the most recent value is always carried forward until replaced by a more recent value.
library(tidyr)
ds %>%
select(area, score, scoring_date) %>%
spread(area, score) %>%
fill(A, .direction = "down") %>%
fill(B, .direction = "down") %>%
fill(C, .direction = "down") %>%
rowwise() %>%
mutate(agg_score = sum(A, B, C))
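The three fill() calls can also be collapsed into one, since fill() accepts several columns, and rowwise() can be avoided with rowSums(); a sketch of the same pipeline:
ds %>%
  select(area, score, scoring_date) %>%
  spread(area, score) %>%
  fill(A, B, C, .direction = "down") %>%
  # rowSums() without na.rm keeps the NA rows before all areas have scored
  mutate(agg_score = rowSums(select(., A, B, C)))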
nuevoDs <- ds %>% arrange(desc(scoring_date)) %>% as.data.frame()
# get the length of the data frame
longitud <- nrow(nuevoDs)
# iterate over each value up until (longitud - 2) and save the results to a vector
elVector <- vector()
for(i in 1:(longitud - 2)){
  elVector[i] <- nuevoDs[i, "score"] + nuevoDs[i + 1, "score"] + nuevoDs[i + 2, "score"]
}
# before cbinding we need to make the vector the same length as the data frame
elVector[longitud - 1] <- 0
elVector[longitud] <- 0
elVector
cbind(nuevoDs, elVector)
  area score scoring_date elVector
1    C     3   2019-01-09       13
2    B     6   2019-01-08       13
3    A     4   2019-01-07       13
4    A     3   2019-01-06       11
5    B     6   2019-01-05       13
6    C     2   2019-01-04       11
7    C     5   2019-01-03       10
8    B     4   2019-01-02        0
9    A     1   2019-01-01        0
Another possible data.table approach.
ds[, output :=
     ds[,
        ds[.(area = unique(area), scd = .BY$scoring_date),
           sum(score),
           on = .(area = area, scoring_date <= scd),
           mult = "last"],
        by = .(area, scoring_date)]$V1
   ]
output:
area score scoring_date output
1: A 1 2019-01-01 NA
2: B 4 2019-01-02 NA
3: C 5 2019-01-03 10
4: C 2 2019-01-04 7
5: B 6 2019-01-05 9
6: A 3 2019-01-06 11
7: A 4 2019-01-07 12
8: B 6 2019-01-08 12
9: C 3 2019-01-09 13
data:
library(data.table)
ds <- data.table(
  area = c("A", "B", "C", "C", "B", "A", "A", "B", "C"),
  score = c(1, 4, 5, 2, 6, 3, 4, 6, 3),
  scoring_date = seq.Date(from = as.Date("2019-01-01"),
                          to = as.Date("2019-01-09"), by = "days"))
Explanation:
The gist of the above code is:
ds[.(area=unique(area), scd=.BY$scoring_date),
sum(score),
on=.(area=area, scoring_date<=scd),
mult="last"]
It means that for each date (scd = .BY$scoring_date) we perform a non-equi self join to find the latest (mult = "last") score for every area (area = unique(area)); sum(score) then adds those per-area scores together.
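To see what that inner expression does, it can be run for a single fixed date; using the ds defined above, 2019-01-04 should return 7 (a sketch):
ds[.(area = unique(area), scd = as.Date("2019-01-04")),
   sum(score),
   on = .(area = area, scoring_date <= scd),
   mult = "last"]
# [1] 7   (last A = 1, last B = 4, last C = 2)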

Join and sum columns together in R

I have a dataframe:
df <- data.frame(ca = c("a", "b", "a", "c", "b", "b"),
                 f  = c(3, 4, 0, NA, 3, 4),
                 f2 = c(NA, 5, 6, 1, 9, 7),
                 f3 = c(3, 0, 6, 3, 0, 8))
I want to sum my columns "f" and "f2" together and name the result "f_new". Example:
df <- data.frame(ca = c("a", "b", "a", "c", "b", "b"),
                 f_new = c(3, 9, 6, 1, 12, 11),
                 f3 = c(3, 0, 6, 3, 0, 8))
Do you have an idea of how to do this with summarise, spread, group_by?
Using dplyr you can do this:
df %>%
  rowwise() %>%
  mutate(f_new = sum(f, f2, na.rm = TRUE))
# A tibble: 6 x 5
#   ca        f    f2    f3 f_new
#   <fct> <dbl> <dbl> <dbl> <dbl>
# 1 a         3    NA     3     3
# 2 b         4     5     0     9
# 3 a         0     6     6     6
# 4 c        NA     1     3     1
# 5 b         3     9     0    12
# 6 b         4     7     8    11
This method will retain any NA values in the original columns; na.rm = TRUE simply keeps them out of the sum rather than propagating them.
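For reference, the difference na.rm makes on a single row:
sum(3, NA)               # NA
sum(3, NA, na.rm = TRUE) # 3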
Here is an answer using tidyverse methods from dplyr and tidyr:
library(tidyverse)
df <- data.frame(ca = c("a", "b", "a", "c", "b", "b"),
                 f  = c(3, 4, 0, NA, 3, 4),
                 f2 = c(NA, 5, 6, 1, 9, 7),
                 f3 = c(3, 0, 6, 3, 0, 8))
df %>%
  replace_na(list(f = 0, f2 = 0)) %>%
  mutate(f_new = f + f2)
#> ca f f2 f3 f_new
#> 1 a 3 0 3 3
#> 2 b 4 5 0 9
#> 3 a 0 6 6 6
#> 4 c 0 1 3 1
#> 5 b 3 9 0 12
#> 6 b 4 7 8 11
dplyr can do this quite nicely with the following code. rowwise() allows you to consider each row separately, and the mutate() call sums whatever columns you want. na.rm = TRUE handles the case where you have NAs and want to ignore them; as a comment mentioned, without it the result is NA whenever any of the summed values is NA.
library(dplyr)
df %>%
  rowwise() %>%
  mutate(f_new = sum(f, f2, na.rm = TRUE))
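Another idiomatic option, as a sketch, is dplyr::coalesce, which replaces each NA with 0 before adding, avoiding rowwise() entirely:
df %>%
  # coalesce(x, 0) returns x where it is not NA, otherwise 0
  mutate(f_new = coalesce(f, 0) + coalesce(f2, 0)) %>%
  select(ca, f_new, f3)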

Run length sequence by time and ID

This problem does not seem to have been put out here before.
I want to find the number of subjects that score 1 for 6 consecutive hours.
The subjects have not been scored every hour so if an hour is missing the hours are not consecutive and the output for that 6-hour period should be NA.
The reason for assigning NA is that we do not know how the subject would have scored in the missing hour. This approach counts consecutive hits, but only counts them when a subject has actually participated.
My dataframe looks like this:
ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour<-c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)
df<-data.frame(ID,hour,A)
I have tried to use the rle function (I am sure it's possible with it) but I cannot get it to condition on both hour and ID.
The output would be like this:
ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour<-c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)
six<-c(NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA)
df<-data.frame(ID,hour,A,six)
Thank you in advance.
I believe the original data set I gave was too small to show whether the solutions generalize.
I just tried the posted code with this dataset and found that it gives a wrong result.
ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4)
hour<-c(1,2,3,7,8,9,10,12,13,17,18,19,1,2,3,4,5,6,8,9,15,1:23,27,28,29,30,31)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,rep(1,28))
df<-data.frame(ID,hour,A)
For the new dataset the output should be:
ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4)
hour<-c(1,2,3,7,8,9,10,12,13,17,18,19,1,2,3,4,5,6,8,9,15,1:23,27,28,29,30,31)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,rep(1,28))
six<-c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA)
df<-data.frame(ID,hour,A,six)
Here is an approach in tidyverse with the updated data set:
library(tidyverse)
df %>%
  group_by(ID) %>%
  expand(hour = seq(min(hour), max(hour))) %>%
  left_join(df) %>%
  mutate(rle = rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
  group_by(ID, rle) %>%
  mutate(sum = cumsum(A),
         six = ifelse(rle >= 6 & A == 1, 0, NA),
         six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
  filter(!is.na(A)) %>%
  ungroup() %>%
  select(ID, hour, A, six) %>%
  as.data.frame() -> df_out2
check requested output:
ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4)
hour<-c(1,2,3,7,8,9,10,12,13,17,18,19,1,2,3,4,5,6,8,9,15,1:23,27,28,29,30,31)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,rep(1,28))
six<-c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA)
df<-data.frame(ID,hour,A,six)
all.equal(df, df_out2)
#output
TRUE
The old answer:
df %>%
  mutate(rle = rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
  group_by(ID, rle) %>%
  mutate(sum = cumsum(A),
         six = ifelse(rle >= 6 & A == 1, 0, NA),
         six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
  ungroup() %>%
  select(ID, hour, A, six) %>%
  as.data.frame() -> df_out2
Let's check that the result is as requested:
ID <- c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour <- c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A <- c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)
six <- c(NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA)
df1 <- data.frame(ID, hour, A, six)
df1 is the requested output:
all.equal(df1, df_out2)
#output
TRUE
some benchmarking:
library(microbenchmark)
library(data.table)
akrun <- function(df){
  setDT(df)[, grp := rleid(A)][, Anew := A * ((hour - shift(hour, fill = hour[1])) == 1), grp
            ][, sixnew := if(sum(A) >= 5) rep(c(0, 1), c(.N - 1, 1)) else NA_real_,
              .(rleid(Anew), grp)]
  i1 <- df[, .I[which(is.na(sixnew) & shift(sixnew == 0, type = 'lead'))], grp]$V1
  df[i1, sixnew := 0][, c("Anew", "grp") := NULL][]
}
missuse <- function(df){
  df %>%
    mutate(rle = rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
    group_by(ID, rle) %>%
    mutate(sum = cumsum(A),
           six = ifelse(rle >= 6 & A == 1, 0, NA),
           six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
    ungroup() %>%
    select(ID, hour, A, six)
}
Mike <- function(df){
  ave(df$A,
      cumsum(!(df$hour == shift(df$hour, fill = 0) + 1)),
      FUN = function(x) {
        if(all(x == 1) & length(x) >= 6) return(c(rep(0, length(x) - 1), 1))
        else return(rep(NA, length(x)))})
}
microbenchmark(Mike(df),
               akrun(df),
               missuse(df))
#output
Unit: microseconds
        expr       min         lq       mean     median         uq       max neval
    Mike(df)   491.291   575.7115   704.2213   597.7155   629.0295  9578.684   100
   akrun(df)  6568.313  6725.5175  7867.4059  6843.5790  7279.2240 69790.755   100
 missuse(df) 11042.822 11321.0505 12434.8671 11512.3200 12616.3485 43170.935   100
way to go Mike H.!
To get the groupings you can compare the current hour to the lagged hour to see if they are "consecutive" or 1 integer apart and then take the cumsum of that. Once you have the groupings you can use a simple ave to get the output you want.
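A small illustration of that grouping trick in isolation (shift() here is data.table's): a new group id starts whenever the hours stop being consecutive.
h <- c(1, 2, 3, 7, 8, 9)
cumsum(!(h == data.table::shift(h, fill = 0) + 1))
# [1] 0 0 0 1 1 1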
library(data.table)
df$six <- ave(df$A,
              cumsum(!(df$hour == shift(df$hour, fill = 0) + 1)),
              FUN = function(x) {
                if(all(x == 1) & length(x) >= 6) return(c(rep(0, length(x) - 1), 1))
                else return(rep(NA, length(x)))}
)
df
# ID hour A six
#1 1 1 0 NA
#2 1 2 1 NA
#3 1 3 0 NA
#4 1 7 1 0
#5 1 8 1 0
#6 1 9 1 0
#7 1 10 1 0
#8 1 11 1 0
#9 1 12 1 1
#10 1 17 0 NA
#11 1 18 0 NA
#12 1 19 0 NA
#13 2 1 1 0
#14 2 2 1 0
#15 2 3 1 0
#16 2 4 1 0
#17 2 5 1 0
#18 2 6 1 1
#19 2 8 1 NA
#20 2 9 1 NA
#21 2 15 1 NA
If you only want to select a patient at most once, this will select the last time period:
df$six_adj <- ave(df$six, df$ID, df$six, FUN = function(x) {
  if(all(x == 1)) return(c(rep(0, length(x) - 1), 1))
  else return(x)}
)
We can use rleid from data.table. Create a grouping column with the run-length id of 'A' ('grp'). Take the difference of 'hour' with the previous value of 'hour', check whether it equals 1, and multiply by 'A' to create 'Anew'. Grouped by the run-length id of 'Anew' and 'grp', if the sum of 'A' is large enough, fill the group with 0s and a final 1, otherwise return NA. In the last phase, set some spillover NAs to 0 by creating an index ('i1'):
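For intuition, rleid() just numbers the runs of equal values (a quick illustration):
data.table::rleid(c(0, 1, 0, 1, 1, 1))
# [1] 1 2 3 4 4 4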
library(data.table)
setDT(df)[, grp := rleid(A)][, Anew := A * ((hour - shift(hour, fill = hour[1])) == 1), grp
          ][, sixnew := if(sum(A) >= 5) rep(c(0, 1), c(.N - 1, 1)) else NA_real_,
            .(rleid(Anew), grp)]
i1 <- df[, .I[which(is.na(sixnew) & shift(sixnew == 0, type = 'lead'))], grp]$V1
df[i1, sixnew := 0][, c("Anew", "grp") := NULL][]
# ID hour A six sixnew
# 1: 1 1 0 NA NA
# 2: 1 2 1 NA NA
# 3: 1 3 0 NA NA
# 4: 1 7 1 0 0
# 5: 1 8 1 0 0
# 6: 1 9 1 0 0
# 7: 1 10 1 0 0
# 8: 1 11 1 0 0
# 9: 1 12 1 1 1
#10: 1 17 0 NA NA
#11: 1 18 0 NA NA
#12: 1 19 0 NA NA
#13: 2 1 1 0 0
#14: 2 2 1 0 0
#15: 2 3 1 0 0
#16: 2 4 1 0 0
#17: 2 5 1 0 0
#18: 2 6 1 1 1
#19: 2 8 1 NA NA
#20: 2 9 1 NA NA
#21: 2 15 1 NA NA
Or a slightly more compact option would be:
i1 <- setDT(df)[, if(sum(A)>= 5) .I[.N] , rleid(c(TRUE, diff(hour)==1), A)]$V1
df[i1, sixnew := 1][unlist(Map(`:`, i1-5, i1-1)), sixnew := 0]
df
# ID hour A six sixnew
# 1: 1 1 0 NA NA
# 2: 1 2 1 NA NA
# 3: 1 3 0 NA NA
# 4: 1 7 1 0 0
# 5: 1 8 1 0 0
# 6: 1 9 1 0 0
# 7: 1 10 1 0 0
# 8: 1 11 1 0 0
# 9: 1 12 1 1 1
#10: 1 17 0 NA NA
#11: 1 18 0 NA NA
#12: 1 19 0 NA NA
#13: 2 1 1 0 0
#14: 2 2 1 0 0
#15: 2 3 1 0 0
#16: 2 4 1 0 0
#17: 2 5 1 0 0
#18: 2 6 1 1 1
#19: 2 8 1 NA NA
#20: 2 9 1 NA NA
#21: 2 15 1 NA NA
Benchmarks
Created a slightly bigger dataset and tested the solutions
-data
ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour<-c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)
ID <- rep(1:5000, rep(c(12, 9), 2500))
A <- rep(A, 2500)
hour <- rep(hour, 2500)
dftest <- data.frame(ID, hour, A)
-functions
akrun <- function(df){
  df1 <- copy(df)
  setDT(df1)[, grp := rleid(A)][, Anew := A * ((hour - shift(hour, fill = hour[1])) == 1), grp
             ][, sixnew := if(sum(A) >= 5) rep(c(0, 1), c(.N - 1, 1))
               else NA_real_, .(rleid(Anew), grp)]
  i1 <- df1[, .I[which(is.na(sixnew) & shift(sixnew == 0, type = 'lead'))], grp]$V1
  df1[i1, sixnew := 0][, c("Anew", "grp") := NULL][]
}
akrun2 <- function(df) {
  df1 <- copy(df)
  i1 <- setDT(df1)[, if(sum(A) >= 5) .I[.N], rleid(c(TRUE, diff(hour) == 1), A)]$V1
  df1[i1, sixnew := 1][unlist(Map(`:`, i1 - 5, i1 - 1)), sixnew := 0]
}
missuse <- function(df){
  df %>%
    mutate(rle = rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
    group_by(ID, rle) %>%
    mutate(sum = cumsum(A),
           six = ifelse(rle >= 6 & A == 1, 0, NA),
           six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
    ungroup() %>%
    select(ID, hour, A, six)
}
Mike <- function(df){
  ave(df$A,
      cumsum(!(df$hour == shift(df$hour, fill = 0) + 1)),
      FUN = function(x) {
        if(all(x == 1) & length(x) >= 6) return(c(rep(0, length(x) - 1), 1))
        else return(rep(NA, length(x)))})
}
-benchmark
microbenchmark(Mike(dftest),
               akrun(dftest),
               akrun2(dftest),
               missuse(dftest),
               times = 10L, unit = 'relative')
-output
#Unit: relative
#            expr       min        lq      mean   median        uq       max neval cld
#    Mike(dftest)  1.682794  1.754494  1.673811  1.68806  1.632765  1.640221    10  a
#   akrun(dftest) 13.159245 12.950117 12.176965 12.33716 11.856271 11.095228    10  b
#  akrun2(dftest)  1.000000  1.000000  1.000000  1.00000  1.000000  1.000000    10  a
# missuse(dftest) 37.899905 36.773837 34.726845 34.87672 33.155939 30.665840    10  c

Count number of values in row using dplyr

This question should have a simple, elegant solution but I can't figure it out, so here it goes:
Let's say I have the following dataset and I want to count the number of 2s present in each row using dplyr.
set.seed(1)
ID <- LETTERS[1:5]
X1 <- sample(1:5, 5, replace = TRUE)
X2 <- sample(1:5, 5, replace = TRUE)
X3 <- sample(1:5, 5, replace = TRUE)
df <- data.frame(ID, X1, X2, X3)
library(dplyr)
Now, the following works:
df %>%
  rowwise %>%
  mutate(numtwos = sum(c(X1, X2, X3) == 2))
But how do I avoid typing out all of the column names?
I know this is probably easier to do without dplyr, but more generally I want to know how I can use dplyr's mutate with multiple columns without typing out all the column names.
Try rowSums:
> set.seed(1)
> ID <- LETTERS[1:5]
> X1 <- sample(1:5, 5,T)
> X2 <- sample(1:5, 5,T)
> X3 <- sample(1:5, 5,T)
> df <- data.frame(ID,X1,X2,X3)
> df
ID X1 X2 X3
1 A 2 5 2
2 B 2 5 1
3 C 3 4 4
4 D 5 4 2
5 E 2 1 4
> rowSums(df == 2)
[1] 2 1 0 1 1
Alternatively, with dplyr:
> df %>% mutate(numtwos = rowSums(. == 2))
ID X1 X2 X3 numtwos
1 A 2 5 2 2
2 B 2 5 1 1
3 C 3 4 4 0
4 D 5 4 2 1
5 E 2 1 4 1
Here's another alternative using purrr:
library(purrr)
df %>%
by_row(function(x) {
sum(x[-1] == 2) },
.to = "numtwos",
.collate = "cols"
)
Which gives:
#Source: local data frame [5 x 5]
#
# ID X1 X2 X3 numtwos
# <fctr> <int> <int> <int> <int>
#1 A 2 5 2 2
#2 B 2 5 1 1
#3 C 3 4 4 0
#4 D 5 4 2 1
#5 E 2 1 4 1
As mentioned in the NEWS, row-based functionals are still maturing in dplyr:
We are still figuring out what belongs in dplyr and what belongs in
purrr. Expect much experimentation and many changes with these
functions.
Benchmark
We can see how rowwise() and do() compare to purrr::by_row() for this type of problem and how they "perform" against rowSums() and the tidy data way:
largedf <- df[rep(seq_len(nrow(df)), 10e3), ]
library(microbenchmark)
microbenchmark(
  steven = largedf %>%
    by_row(function(x) {
      sum(x[-1] == 2) },
      .to = "numtwos",
      .collate = "cols"),
  psidom = largedf %>%
    rowwise %>%
    do(data_frame(numtwos = sum(.[-1] == 2))) %>%
    cbind(largedf, .),
  gopala = largedf %>%
    gather(key, value, -ID) %>%
    group_by(ID) %>%
    summarise(numtwos = sum(value == 2)) %>%
    inner_join(largedf, .),
  evan = largedf %>%
    mutate(numtwos = rowSums(. == 2)),
  times = 10L,
  unit = "relative"
)
Results:
#Unit: relative
#   expr         min          lq        mean      median         uq         max neval cld
# steven 1225.190659 1261.466936 1267.737126 1227.762573 1276.07977 1339.841636    10  b
# psidom 3677.603240 3759.402212 3726.891458 3678.717170 3728.78828 3777.425492    10  c
# gopala    2.715005    2.684599    2.638425    2.612631    2.59827    2.572972    10  a
#   evan    1.000000    1.000000    1.000000    1.000000    1.00000    1.000000    10  a
Just wanted to add to the answer of @evan.oman in case you only want to sum rows for specific columns, not all of them. You can use the regular select and/or select helper functions. In this example, we don't want to include X1 in rowSums:
df %>%
mutate(numtwos = rowSums(select(., -X1) == 2))
ID X1 X2 X3 numtwos
1 A 2 5 2 1
2 B 2 5 1 0
3 C 3 4 4 0
4 D 5 4 2 1
5 E 2 1 4 0
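The select helpers work the same way inside rowSums; a sketch that assumes the target columns share the "X" prefix:
df %>%
  mutate(numtwos = rowSums(select(., starts_with("X")) == 2))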
One approach is to use a combination of dplyr and tidyr to convert data into long format, and do the computation:
library(dplyr)
library(tidyr)
df %>%
gather(key, value, -ID) %>%
group_by(ID) %>%
summarise(numtwos = sum(value == 2)) %>%
inner_join(df, .)
Output is as follows:
ID X1 X2 X3 numtwos
1 A 2 5 2 2
2 B 2 5 1 1
3 C 3 4 4 0
4 D 5 4 2 1
5 E 2 1 4 1
You can use do, which doesn't add the column to your original data frame, so you then need to bind the result back to the original data frame yourself.
df %>%
rowwise %>%
do(numtwos = sum(.[-1] == 2)) %>%
data.frame
numtwos
1 2
2 1
3 0
4 1
5 1
Add a cbind to bind the new column to the original data frame:
df %>%
rowwise %>%
do(numtwos = sum(.[-1] == 2)) %>%
data.frame %>% cbind(df, .)
ID X1 X2 X3 numtwos
1 A 2 5 2 2
2 B 2 5 1 1
3 C 3 4 4 0
4 D 5 4 2 1
5 E 2 1 4 1

Operations between groups with dplyr

I have a data frame, as follows, where I would like to group the data by grp and index and use group a as a reference to perform some simple calculations. I would like to subtract the values of group a from the values of the other groups.
df <- data.frame(grp = rep(letters[1:3], each = 2),
                 index = rep(1:2, times = 3),
                 value = seq(10, 60, length.out = 6))
df
## grp index value
## 1 a 1 10
## 2 a 2 20
## 3 b 1 30
## 4 b 2 40
## 5 c 1 50
## 6 c 2 60
The desired output would be like:
## grp index value
## 1 b 1 20
## 2 b 2 20
## 3 c 1 40
## 4 c 2 40
My guess is it will be something close to:
group_by(df, grp, index) %>%
  mutate(diff = value - value[grp == "a"])
Ideally I would like to do it using dplyr.
Regards, Philippe
We can filter for 'grp' values that are not 'a' and then do the difference within mutate:
df %>%
  filter(grp != "a") %>%
  mutate(value = value - df$value[df$grp == "a"])
Or another option would be a join:
df %>%
  filter(grp != "a") %>%
  left_join(., subset(df, grp == "a", select = -1), by = "index") %>%
  mutate(value = value.x - value.y) %>%
  select(1, 2, 5)
# grp index value
#1 b 1 20
#2 b 2 20
#3 c 1 40
#4 c 2 40
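As an aside, the OP's guess is close: grouping by index alone (rather than by grp and index) makes value[grp == "a"] the per-index reference. A sketch:
df %>%
  group_by(index) %>%
  # within each index, subtract group a's value from every row
  mutate(diff = value - value[grp == "a"]) %>%
  filter(grp != "a")
# diff: 20, 20, 40, 40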
