There are 10 projects split between group A & B, each with different start and end dates. For each day within a given period the sum of outputX and outputY needs to be calculated. I manage to do this for all projects together, but how to split the results per group?
I've made several attempts with lapply() and purrr:map(), also looking at filters and splits, but to no avail. An example that doesn't distinguish between groups is found below.
library(tidyverse)
library(lubridate)
df <- data.frame(
project = 1:10,
group = c("A","B"),
outputX = rnorm(2),
outputY = rnorm(5),
start_date = sample(seq(as.Date('2018-01-3'), as.Date('2018-1-13'), by="day"), 10),
end_date = sample(seq(as.Date('2018-01-13'), as.Date('2018-01-31'), by="day"), 10))
df$interval <- interval(df$start_date, df$end_date)
period <- data.frame(date = seq(as.Date("2018-01-08"), as.Date("2018-01-17"), by = 1))
df_sum <- do.call(rbind, lapply(period$date, function(x){
index <- x %within% df$interval;
list("X" = sum(df$outputX[index]),
"Y" = sum(df$outputY[index]))}))
outcome <- cbind(period, df_sum) %>% gather("id", "value", 2:3)
outcome
Ultimately, it should be a 40x4 table. Some suggestions are much appreciated!
If I understand you correctly, you need to use inner join. SO can suggest us to use sqldf. See https://stackoverflow.com/a/11895368/9300556
With your data we can do smth like this. There is no need to calculate df$interval but we need to add ID to period, otherwise sqldf wont work.
df <- data.frame(
project = 1:10,
group = c("A","B"),
outputX = rnorm(2),
outputY = rnorm(5),
start = sample(seq(as.Date('2018-01-3'), as.Date('2018-1-13'), by="day"), 10),
end = sample(seq(as.Date('2018-01-13'), as.Date('2018-01-31'), by="day"), 10))
# df$interval <- interval(df$start_date, df$end_date)
period <- data.frame(date = seq(as.Date("2018-01-08"), as.Date("2018-01-17"), by = 1)) %>%
mutate(id = 1:nrow(.))
Then we can use sqldf
sqldf::sqldf("select * from period inner join df
on (period.date > df.start and period.date <= df.end) ") %>%
as_tibble() %>%
group_by(date, group) %>%
summarise(X = sum(outputX),
Y = sum(outputY)) %>%
gather(id, value, -group, -date)
# A tibble: 40 x 4
# Groups: date [10]
date group id value
<date> <fct> <chr> <dbl>
1 2018-01-08 A X 3.04
2 2018-01-08 B X 2.34
3 2018-01-09 A X 3.04
4 2018-01-09 B X 3.51
5 2018-01-10 A X 3.04
6 2018-01-10 B X 4.68
7 2018-01-11 A X 4.05
8 2018-01-11 B X 4.68
9 2018-01-12 A X 4.05
10 2018-01-12 B X 5.84
# ... with 30 more rows
Related
My dataframe is
df <- data.frame(x = c(4,4,4,2,2,2), y = c(1,2,3,1,2,3), y_share = c(0.2,0.4,0.2,0.5,0.3,0.2))
I want to have an aggregation df with 2 columns of y and z with
z = sum(x*y_share)/sum(y_share).
In this case, the resulted dataframe should be like this:
result = data.frame(y = c(1,2,3), z = c(2.57, 3.14, 3))
I tried this
func = function(x) {y=sum(vector(x[1])*vector(x[3]))/sum(vector(x[3]))
return(y)}
agg = aggregate(df, by=list(df$y), FUN=func)
but it doesn't work.
Thank you
We can use data.table
library(data.table)
setDT(df)[, .(z = sum(x * y_share)/sum(y_share)), by = y]
# y z
#1: 1 2.571429
#2: 2 3.142857
#3: 3 3.000000
Or if we want to use base R, here is an option with by
stack(by(df, list(df$y), FUN = function(z)
with(z, sum(x * y_share)/sum(y_share))))[2:1]
data
df <- data.frame(x=c(4,4,4,2,2,2), y=c(1,2,3,1,2,3),
y_share=c(0.2,0.4,0.2,0.5,0.3,0.2))
Tidyverse approach (using dplyr):
library(dplyr)
result <- df %>%
group_by(y) %>%
summarise(z = sum(x*y_share)/sum(y_share)) %>%
ungroup()
Result
result
# A tibble: 3 x 2
# y z
# <dbl> <dbl>
# 1 1 2.57
# 2 2 3.14
# 3 3 3.
Data
df <- data.frame(x = c(4,4,4,2,2,2),
y = c(1,2,3,1,2,3),
y_share = c(0.2,0.4,0.2,0.5,0.3,0.2))
result <- data.frame(y = c(1,2,3),
z = c(2.57, 3.14, 3))
Data:
set.seed(42)
df1 = data.frame(
Date = seq.Date(as.Date("2018-01-01"),as.Date("2018-01-30"),1),
value = sample(1:30),
Y = sample(c("yes", "no"), 30, replace = TRUE)
)
df2 = data.frame(
Date = seq.Date(as.Date("2018-01-01"),as.Date("2018-01-30"),7)
)
I want for each date in df2$Date calculate the sum of df1$Value if date in df1$Date falls within df2$Date and df2$Date+6
Inshort I need to calculate weekly sums
Using data.table, create a range start/end, then merge on overlap, then get sum over group:
library(data.table)
df1$start <- df1$Date
df1$end <- df1$Date
df2$start <- df2$Date
df2$end <- df2$Date + 6
setDT(df1, key = c("start", "end"))
setDT(df2, key = c("start", "end"))
foverlaps(df1, df2)[, list(mySum = sum(value)), by = Date ]
# Date mySum
# 1: 2018-01-01 138
# 2: 2018-01-08 96
# 3: 2018-01-15 83
# 4: 2018-01-22 109
# 5: 2018-01-29 39
Check out library lubridate and dplyr, those two are quiet common.
library(lubridate)
library(dplyr)
df1$last_week_day <- ceiling_date(df1$Date, "week") + 1
df1 %>% group_by(last_week_day) %>% summarize(week_value = sum(value))
We can use fuzzyjoin
library(dplyr)
library(fuzzyjoin)
df2$EndDate <- df2$Date+6
fuzzy_left_join(
df1, df2,
by = c(
"Date" = "Date",
"Date" = "EndDate"
), match_fun = list(`>=`, `<=`)) %>%
group_by(Date.y) %>% summarise(Sum=sum(value))
# A tibble: 5 x 2
Date.y Sum
<date> <int>
1 2018-01-01 138
2 2018-01-08 96
3 2018-01-15 83
4 2018-01-22 109
5 2018-01-29 39
In R, how do deal with messy data frame with mixed up row and column as variables?
days <- c(as.Date("2011-07-01") + 0:9)
set.seed(10)
d <- data.frame(days,replicate(9,round(runif(10,0,10),3)))
names(d) <- c("Date", "x.astreet.1", "x.astreet.2", "x.astreet.3",
"x.Bstreet.1", "x.Bstreet.2", "x.Bstreet.3",
"x.Cstreet.1", "x.Cstreet.2", "x.Cstreet.3")
streetnames <- c(NA,rep(c("Astr.","Bstr.","Cstr."),3))
molecule <- c(NA, rep(c("SO","CO","O3"),3))
d <- rbind(streetnames, molecule,d)
see df as tbl in this printscreen
in this case idealy should have only 5 rows (Date, SO, NO, O3, Station)
Here's my approach. The advantage of doing it this way is that it's completely programmatic. It's fine to have a solution where you manually rename the variables if the dataset is complete, but this approach can scale to the dataset if you're still adding new stations and gases.
# OP changed the 'streetnames' vector, below is the correct one they've provided.
days <- c(as.Date("2011-07-01") + 0:9)
set.seed(10)
d <- data.frame(days,replicate(9,round(runif(10,0,10),3)))
names(d) <- c("Date", "x.astreet.1", "x.astreet.2", "x.astreet.3",
"x.Bstreet.1", "x.Bstreet.2", "x.Bstreet.3",
"x.Cstreet.1", "x.Cstreet.2", "x.Cstreet.3")
streetnames <- c(NA,rep(c("Astr."),3),rep(c("Bstr."),3),rep(c("Cstr."),3))
molecule <- c(NA, rep(c("SO","CO","O3"),3))
d <- rbind(streetnames, molecule, d)
# ---------------
library(tidyr)
library(dplyr)
library(janitor)
# Replace column names with the combined first two rows. This is tricky to do inside
# a dplyr pipeline so I do it outside.
names(d) <- paste(d[1,], d[2,])
d2 <-
d %>%
slice(3:n()) %>% # Remove first 2 rows
clean_names() %>% # Janitor standardises column names
rename(date = na_na) %>%
gather(measure, value, -date) %>% # Collapse wide to long
separate(measure, # Break this column into several columns
into = c("station", "gas")) %>%
mutate_at("value", as.numeric) %>%
# You can stop there to have a long table. To get a wide table:
spread(gas, value) %>%
identity()
head(d2)
#> date station co o3 so
#> 1 2011-07-01 astr 6.517 8.647 5.075
#> 2 2011-07-01 bstr 2.755 3.543 5.356
#> 3 2011-07-01 cstr 0.756 8.614 0.319
#> 4 2011-07-02 astr 5.677 6.154 3.068
#> 5 2011-07-02 bstr 2.289 9.364 0.931
#> 6 2011-07-02 cstr 5.344 4.644 1.145
str(d2)
#> 'data.frame': 30 obs. of 5 variables:
#> $ date : Date, format: "2011-07-01" "2011-07-01" "2011-07-01" ...
#> $ station: chr "astr" "bstr" "cstr" "astr" ...
#> $ co : num 6.517 2.755 0.756 5.677 2.289 ...
#> $ o3 : num 8.65 3.54 8.61 6.15 9.36 ...
#> $ so : num 5.075 5.356 0.319 3.068 0.931 ...
Note: I always throw an identity() at the end of pipelines for debugging purposes. It lets you comment out entire lines of the pipe without having to worry about trailing %>% raising errors.
A base R approach could be the following.
res <- lapply(seq(2, ncol(d), by = 3), function(i){
Date <- d[-(1:2), "Date"]
SO <- d[-(1:2), i]
CO <- d[-(1:2), i + 1]
O3 <- d[-(1:2), i + 2]
data.frame(Date, SO, CO, O3)
})
res <- do.call(rbind, res)
res$Date <- as.Date(res$Date)
row.names(res) <- NULL
head(res)
# Date SO CO O3
#1 2011-07-01 5.075 6.517 8.647
#2 2011-07-02 3.068 5.677 6.154
#3 2011-07-03 4.269 1.135 7.751
#4 2011-07-04 6.931 5.959 3.556
#5 2011-07-05 0.851 3.58 4.058
#6 2011-07-06 2.254 4.288 7.066
Starting from the beginning of your code sample with your rbind calls omitted:
days <- c(as.Date("2011-07-01") + 0:9)
set.seed(10)
d <- data.frame(days,replicate(9,round(runif(10,0,10),3)))
names(d) <- c("Date", "x.astreet.1", "x.astreet.2", "x.astreet.3",
"x.Bstreet.1", "x.Bstreet.2", "x.Bstreet.3",
"x.Cstreet.1", "x.Cstreet.2", "x.Cstreet.3")
d %<>% gather(col_name, value, -Date) %>%
separate(col_name, c("x", "street_name", "molecule_number"), sep = "\\.", convert = TRUE) %>%
select(-x) %>%
spread(molecule_number, value) %>%
rename(SO = `1`, NO = `2`, O3 = `3`)
I think this is what you're trying to get to. There is likely a more elegant solution, but this will work.
I assumed that the suffix 1, 2, 3 correspond to SO, CO, and O3.
This solution does not use the streetnames or molucule_number vectors that you created, so you can leave off the rbind() call that you made.
library(dplyr)
library(tidyr)
e <- d %>% gather(key = "station", value = "val", x.astreet.1:x.Cstreet.3)
SO <- e %>% filter(grepl("1", station))
CO <- e %>% filter(grepl("2", station))
O3 <- e %>% filter(grepl("3", station))
f <- data.frame(SO, CO %>% select(val), O3 %>% select(val))
g <- f %>% mutate(Station = case_when(station == "x.astreet.1" ~ "Astr",
station == "x.Bstreet.1" ~ "Bstr",
station == "x.Cstreet.1" ~ "Cstr"),
SO = val,
CO = val.1,
O3 = val.2) %>%
select(Date, SO, CO, O3, Station)
I left in the DF renaming so you could see the result after each step.
I want to aggregate Date by group. However, each observation can belong to several groups (e.g. observation 1 belongs to group A and B). I could not find a nice way to achieve this with data.table. Currently I created for each of the possible groups a logical variable which takes the value TRUE if the observation belongs to that group. I am looking for a better way to do this than presented below. I would also like to know how I could achieve this with the tidyverse.
library(data.table)
# Data
set.seed(1)
TF <- c(TRUE, FALSE)
time <- rep(1:4, each = 5)
df <- data.table(time = time, x = rnorm(20), groupA = sample(TF, size = 20, replace = TRUE),
groupB = sample(TF, size = 20, replace = TRUE),
groupC = sample(TF, size = 20, replace = TRUE))
# This should be nicer and less repetitive
df[groupA == TRUE, .(A = sum(x)), by = time][
df[groupB == TRUE, .(B = sum(x)), by = time], on = "time"][
df[groupC == TRUE, .(C = sum(x)), by = time], on = "time"]
# desired output
time A B C
1: 1 NA 0.9432955 0.1331984
2: 2 1.2257538 0.2427420 0.1882493
3: 3 -0.1992284 -0.1992284 1.9016244
4: 4 0.5327774 0.9438362 0.9276459
Here is a solution with data.table:
df[, lapply(.SD[, .(groupA, groupB, groupC)]*x, sum), time]
# > df[, lapply(.SD[, .(groupA, groupB, groupC)]*x, sum), time]
# time groupA groupB groupC
# 1: 1 0.0000000 0.9432955 0.1331984
# 2: 2 1.2257538 0.2427420 0.1882493
# 3: 3 -0.1992284 -0.1992284 1.9016244
# 4: 4 0.5327774 0.9438362 0.9276459
or (thx to #chinsoon12 for the comment) more programmatically:
df[, lapply(.SD*x, sum), by=.(time), .SDcols=paste0("group", c("A","B","C"))]
If you want the result in the long format you can do:
df[, colSums(.SD*x), by=.(time), .SDcols=paste0("group", c("A","B","C"))]
### with indicator for the group:
df[, .(colSums(.SD*x), c("A","B","C")), by=.(time), .SDcols=paste0("group", c("A","B","C"))]
I think it's easier here to work in long format. First I gather the observations to long format, then keep only the values where the observation belongs to the corresponding group. Then I remove the logical column, and rename the groups to single letters. Then I aggregate across groups and time (summarise in dplyr).
Finally I spread back to wide format.
library(dplyr)
library(tidyr)
set.seed(1)
TF <- c(TRUE, FALSE)
time <- rep(1:4, each = 5)
df <- data.frame(time = time, x = rnorm(20), groupA = sample(TF, size = 20, replace = TRUE),
groupB = sample(TF, size = 20, replace = TRUE),
groupC = sample(TF, size = 20, replace = TRUE))
df %>%
gather(group, belongs, groupA:groupC) %>%
filter(belongs) %>%
select(-belongs) %>%
mutate(group = gsub("group", "", group)) %>%
group_by(time, group) %>%
summarise(x = sum(x)) %>%
spread(group, x)
Output
# A tibble: 4 x 4
# Groups: time [4]
time A B C
<int> <dbl> <dbl> <dbl>
1 1 NA 0.943 0.133
2 2 1.23 0.243 0.188
3 3 -0.199 -0.199 1.90
4 4 0.533 0.944 0.928
An option can be using tidyr and dplyr packages in combination with data.table. Try to work on data in long format and then change it to wide format.
library(dplyr)
library(tidyr)
melt(df, id.vars = c("time", "x")) %>%
filter(value) %>%
group_by(time, variable) %>%
summarise(sum = sum(x)) %>%
spread(variable, sum)
# # A tibble: 4 x 4
# # Groups: time [4]
# time groupA groupB groupC
# * <int> <dbl> <dbl> <dbl>
# 1 1 NA 0.943 0.133
# 2 2 1.23 0.243 0.188
# 3 3 - 0.199 -0.199 1.90
# 4 4 0.533 0.944 0.928
I have a toy example of a tibble.
What is the most efficient way to sum two consecutive rows of y grouped by x
library(tibble)
l = list(x = c("a", "b", "a", "b", "a", "b"), y = c(1, 4, 3, 3, 7, 0))
df <- as_tibble(l)
df
#> # A tibble: 6 x 2
#> x y
#> <chr> <dbl>
#> 1 a 1
#> 2 b 4
#> 3 a 3
#> 4 b 3
#> 5 a 7
#> 6 b 0
So the output would be something like this
group sum seq
a 4 1
a 10 2
b 7 1
b 3 2
I'd like to use the tidyverse and possibly roll_sum() from the RcppRoll package
and have the code so that a variable length of consecutive rows could be used for real world data in which there would be many groups
TIA
One way to do this is use group_by %>% do where you can customize the returned data frame in do:
library(RcppRoll); library(tidyverse)
n = 2
df %>%
group_by(x) %>%
do(
data.frame(
sum = roll_sum(.$y, n),
seq = seq_len(length(.$y) - n + 1)
)
)
# A tibble: 4 x 3
# Groups: x [2]
# x sum seq
# <chr> <dbl> <int>
#1 a 4 1
#2 a 10 2
#3 b 7 1
#4 b 3 2
Edit: Since this is not as efficient, probably due to the data frame construction header and binding data frames on the go, here is an improved version (still somewhat slower than data.table but not as much now):
df %>%
group_by(x) %>%
summarise(sum = list(roll_sum(y, n)), seq = list(seq_len(n() -n + 1))) %>%
unnest()
Timing, use #Matt's data and setup:
library(tibble)
library(dplyr)
library(RcppRoll)
library(stringi) ## Only included for ability to generate random strings
## Generate data with arbitrary number of groups and rows --------------
rowCount <- 100000
groupCount <- 10000
sumRows <- 2L
set.seed(1)
l <- tibble(x = sample(stri_rand_strings(groupCount,3),rowCount,rep=TRUE),
y = sample(0:10,rowCount,rep=TRUE))
## Using dplyr and tibble -----------------------------------------------
ptm <- proc.time() ## Start the clock
dplyr_result <- l %>%
group_by(x) %>%
summarise(sum = list(roll_sum(y, n)), seq = list(seq_len(n() -n + 1))) %>%
unnest()
dplyr_time <- proc.time() - ptm ## Stop the clock
## Using data.table instead ----------------------------------------------
library(data.table)
ptm <- proc.time() ## Start the clock
setDT(l) ## Convert l to a data.table
dt_result <- l[,.(sum = RcppRoll::roll_sum(y, n = sumRows, fill = NA, align = "left"),
seq = seq_len(.N)),
keyby = .(x)][!is.na(sum)]
data.table_time <- proc.time() - ptm
Result is:
dplyr_time
# user system elapsed
# 0.688 0.003 0.689
data.table_time
# user system elapsed
# 0.422 0.009 0.430
Here is one approach for you. Since you want to sum up two consecutive rows, you could use lead() and do the calculation for sum. For seq, I think you can simply take row numbers, seeing your expected outcome. Once you are done with these operations, you arrange your data by x (if necessary, x and seq). Finally, you drop rows with NAs. If necessary, you may want to drop y by writing select(-y) at the end of the code.
group_by(df, x) %>%
mutate(sum = y + lead(y),
seq = row_number()) %>%
arrange(x) %>%
ungroup %>%
filter(complete.cases(.))
# x y sum seq
# <chr> <dbl> <dbl> <int>
#1 a 1 4 1
#2 a 3 10 2
#3 b 4 7 1
#4 b 3 3 2
I notice you asked for the most efficient way-- if you are looking at scaling this up to a much larger set, I would strongly recommend data.table.
library(data.table)
library(RcppRoll)
l[, .(sum = RcppRoll::roll_sum(y, n = 2L, fill = NA, align = "left"),
seq = seq_len(.N)),
keyby = .(x)][!is.na(sum)]
A rough benchmark comparison of this vs an answer using the tidyverse packages with 100,000 rows and 10,000 groups illustrates the significant difference.
(I used Psidom's answer instead of jazzurro's since jazzuro's did not allow for an arbritary number of rows to be summed.)
library(tibble)
library(dplyr)
library(RcppRoll)
library(stringi) ## Only included for ability to generate random strings
## Generate data with arbitrary number of groups and rows --------------
rowCount <- 100000
groupCount <- 10000
sumRows <- 2L
set.seed(1)
l <- tibble(x = sample(stri_rand_strings(groupCount,3),rowCount,rep=TRUE),
y = sample(0:10,rowCount,rep=TRUE))
## Using dplyr and tibble -----------------------------------------------
ptm <- proc.time() ## Start the clock
dplyr_result <- l %>%
group_by(x) %>%
do(
data.frame(
sum = roll_sum(.$y, sumRows),
seq = seq_len(length(.$y) - sumRows + 1)
)
)
|========================================================0% ~0 s remaining
dplyr_time <- proc.time() - ptm ## Stop the clock
## Using data.table instead ----------------------------------------------
library(data.table)
ptm <- proc.time() ## Start the clock
setDT(l) ## Convert l to a data.table
dt_result <- l[,.(sum = RcppRoll::roll_sum(y, n = sumRows, fill = NA, align = "left"),
seq = seq_len(.N)),
keyby = .(x)][!is.na(sum)]
data.table_time <- proc.time() - ptm ## Stop the clock
Results:
> dplyr_time
user system elapsed
10.28 0.04 10.36
> data.table_time
user system elapsed
0.35 0.02 0.36
> all.equal(dplyr_result,as.tibble(dt_result))
[1] TRUE
A solution using tidyverse and zoo. This is similar to Psidom's approach.
library(tidyverse)
library(zoo)
df2 <- df %>%
group_by(x) %>%
do(data_frame(x = unique(.$x),
sum = rollapplyr(.$y, width = 2, FUN = sum))) %>%
mutate(seq = 1:n()) %>%
ungroup()
df2
# A tibble: 4 x 3
x sum seq
<chr> <dbl> <int>
1 a 4 1
2 a 10 2
3 b 7 1
4 b 3 2
zoo + dplyr
library(zoo)
library(dplyr)
df %>%
group_by(x) %>%
mutate(sum = c(NA, rollapply(y, width = 2, sum)),
seq = row_number() - 1) %>%
drop_na()
# A tibble: 4 x 4
# Groups: x [2]
x y sum seq
<chr> <dbl> <dbl> <dbl>
1 a 3 4 1
2 b 3 7 1
3 a 7 10 2
4 b 0 3 2
If the moving window only equal to 2 using lag
df %>%
group_by(x) %>%
mutate(sum = y + lag(y),
seq = row_number() - 1) %>%
drop_na()
# A tibble: 4 x 4
# Groups: x [2]
x y sum seq
<chr> <dbl> <dbl> <dbl>
1 a 3 4 1
2 b 3 7 1
3 a 7 10 2
4 b 0 3 2
EDIT :
n = 3 # your moving window
df %>%
group_by(x) %>%
mutate(sum = c(rep(NA, n - 1), rollapply(y, width = n, sum)),
seq = row_number() - n + 1) %>%
drop_na()
A small variant on existing answers: first convert the data to list-column format, then use purrr to map() roll_sum() onto the data.
l = list(x = c("a", "b", "a", "b", "a", "b"), y = c(1, 4, 3, 3, 7, 0))
as.tibble(l) %>%
group_by(x) %>%
summarize(list_y = list(y)) %>%
mutate(rollsum = map(list_y, ~roll_sum(.x, 2))) %>%
select(x, rollsum) %>%
unnest %>%
group_by(x) %>%
mutate(seq = row_number())
I think if you have the latest version of purrr you could get rid of the last two lines (the final group_by() and mutate()) by using imap() instead of map.