I have two data frames:
Date <- seq(as.Date("2013/1/1"), by = "day", length.out = 17)
x <-data.frame(Date)
x$discharge <- c("1000","1100","1200","1300","1400","1200","1300","1300","1200","1100","1200","1200","1100","1400","1200","1100","1400")
x$discharge <- as.numeric(x$discharge)
And:
Date2 <- c("2013-01-01","2013-01-08","2013-01-12","2013-01-17")
y <- data.frame(Date2)
y$concentration <- c("1.5","2.5","1.5","3.5")
y$Date2 <- as.Date(y$Date2)
y$concentration <- as.numeric(y$concentration)
What I am desperately trying to do is to the following:
In data frame y the first measurement is for the period 2013-01-01 to 2013-01-07
Calculate the average discharge for this period in data frame x
Return the average discharge to data frame y in a new column next to the first measurement and continue with the next measurement
I was having a look into function such as dplyr or apply but was not able to figure it out.
library(dplyr)
x %>%
mutate(period = cut(as.Date(Date), c(as.Date("1900-01-01"), as.Date(y$Date2[-1]), as.Date("2100-01-01")), c(1:length(y$Date2)))) %>%
group_by(period) %>%
mutate(meandischarge = mean(discharge, na.rm = T)) %>%
right_join(y, by = c("Date" = "Date2"))
Date discharge period meandischarge concentration
<date> <dbl> <fctr> <dbl> <dbl>
1 2013-01-01 1000 1 1214.286 1.5
2 2013-01-08 1300 2 1200.000 2.5
3 2013-01-12 1200 3 1200.000 1.5
4 2013-01-17 1400 4 1400.000 3.5
If you only want the original y variables, you could do this:
x %>%
mutate(period = cut(as.Date(Date), c(as.Date("1900-01-01"), as.Date(y$Date2[-1]), as.Date("2100-01-01")), c(1:length(y$Date2)))) %>%
group_by(period) %>%
mutate(meandischarge = mean(discharge, na.rm = T)) %>%
ungroup() %>%
right_join(y, by = c("Date" = "Date2")) %>%
select(Date2 = Date, concentration, meandischarge)
Date2 concentration meandischarge
<date> <dbl> <dbl>
1 2013-01-01 1.5 1214.286
2 2013-01-08 2.5 1200.000
3 2013-01-12 1.5 1200.000
4 2013-01-17 3.5 1400.000
Related
I have a data set that I would like to split into 10-day intervals. The code that I included below does that, but for the last week or so there are days that (e.g., the 31st or 30th of a month) that remain end up by itself.
I would like to either remove the intervals that create this or include them in the previous intervals.
For example:
If I separate the month of January by 10-day intervals, it would put the first 10 days in a element of a list, the second 10 days into another element and the third 10 days into another one. It would then put January 31st into a element of list by itself.
My desired output would be to either remove these elements from the list or more preferably include them in the third 10-day interval. Can that be done? If so, what would be the best way to do so?
library(lubridate)
library(tidyverse)
date <- rep_len(seq(dmy("26-12-2010"), dmy("20-12-2013"), by = "days"), 500)
ID <- rep(seq(1, 5), 100)
df <- data.frame(date = date,
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID)
int <- df %>%
arrange(ID) %>%
mutate(new = ceiling_date(date, '10 day')) %>%
# mutate(cut = data.table::rleid(cut(new, breaks = "10 day"))) %>%
group_by(new) %>%
group_split()
Here is a solution which splits the months by 10-day intervals but corrects new to assign day 31 of a month to the last period. So,
days 1 to 10 belong to the first third of a month,
days 11 to 20 to the second third, and
days 21 to 31 to the third third.
int <- df %>%
# arrange(ID) %>% # skipped for readability of result
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(new) %>%
group_split()
int[[1]]
# A tibble: 6 x 5
date x y ID new
<date> <dbl> <dbl> <int> <date>
1 2010-12-26 71469. 819084. 1 2010-12-21
2 2010-12-27 69417. 893227. 2 2010-12-21
3 2010-12-28 70865. 831341. 3 2010-12-21
4 2010-12-29 68322. 812423. 4 2010-12-21
5 2010-12-30 65643. 837395. 5 2010-12-21
6 2010-12-31 63638. 892200. 1 2010-12-21
Now, 2010-12-31 was assigned to the third third of December.
Note that new indicates the start of the interval by calling floor_date() instead of ceiling_date(). This is due to avoid potential problems with day arithmetic across month boundaries and to clarify to which month the interval belongs to. For instance, for the last day of February, ceiling_date(ymd('2011-02-28'), '10 day') returns "2011-03-03" which is a date in March.
If there is a single row in a group give it the previous new value. Try this -
library(dplyr)
library(lubridate)
df %>%
arrange(ID, date) %>%
mutate(new = ceiling_date(date, '10 day')) %>%
add_count(new) %>%
mutate(new = if_else(n == 1, lag(new), new)) %>%
select(-n) %>%
group_split(new)
Above would only work to combine groups that has 1 observation in a group. If we want to combine more than 1 day use the below code which counts numbers of days in a group. It combines the group if number of day is less than n number of days.
n <- 2
df %>%
arrange(ID, date) %>%
mutate(new = ceiling_date(date, '10 day'),
ID = match(new, unique(new))) -> tmp
tmp %>%
group_by(new, ID) %>%
summarise(count_unique = n_distinct(date)) %>%
ungroup %>%
mutate(new = if_else(count_unique < n, lag(new), new)) %>%
inner_join(tmp, by = 'ID') %>%
select(new = new.x, date, x, y) %>%
group_split(new)
Alternative solution
library(lubridate)
library(tidyverse)
dt <- rep_len(seq(dmy("26-12-2010"), dmy("20-12-2013"), by = "days"), 500)
ID <- rep(seq(1, 5), 100)
df <- data.frame(dt = dt,
x = runif(length(dt), min = 60000, max = 80000),
y = runif(length(dt), min = 800000, max = 900000),
ID)
Include extra days (31st) into the last third
int_df <- df %>%
# arrange(ID) %>%
mutate(day_date = day(dt),
day_new = case_when(
day_date <= 10 ~ 1,
day_date <= 20 ~ 11,
TRUE ~ 21
),
new = ymd(paste(year(dt), month(dt), day_new, sep = "-"))) %>%
select(-c(day_date, day_new)) %>%
group_by(new) %>%
group_split()
int_df[[1]]
#> # A tibble: 6 x 5
#> dt x y ID new
#> <date> <dbl> <dbl> <int> <date>
#> 1 2010-12-26 62395. 837491. 1 2010-12-21
#> 2 2010-12-27 66236. 836481. 2 2010-12-21
#> 3 2010-12-28 79918. 818399. 3 2010-12-21
#> 4 2010-12-29 67613. 807213. 4 2010-12-21
#> 5 2010-12-30 72980. 899380. 5 2010-12-21
#> 6 2010-12-31 61004. 876191. 1 2010-12-21
Exclude extra days (31st)
int_df <- df %>%
# arrange(ID) %>%
mutate(day_date = day(dt),
day_new = case_when(
day_date <= 10 ~ 1,
day_date <= 20 ~ 11,
day_date <= 30 ~ 21,
TRUE ~ 31
),
new = ymd(paste(year(dt), month(dt), day_new, sep = "-"))) %>%
filter(day_date != 31) %>%
select(-c(day_date, day_new)) %>%
group_by(new) %>%
group_split()
int_df[[1]]
#> # A tibble: 5 x 5
#> dt x y ID new
#> <date> <dbl> <dbl> <int> <date>
#> 1 2010-12-26 62395. 837491. 1 2010-12-21
#> 2 2010-12-27 66236. 836481. 2 2010-12-21
#> 3 2010-12-28 79918. 818399. 3 2010-12-21
#> 4 2010-12-29 67613. 807213. 4 2010-12-21
#> 5 2010-12-30 72980. 899380. 5 2010-12-21
Created on 2021-07-03 by the reprex package (v2.0.0)
Consider the following example:
library(tidyverse)
library(lubridate)
df = tibble(client_id = rep(1:3, each=24),
date = rep(seq(ymd("2016-01-01"), (ymd("2016-12-01") + years(1)), by='month'), 3),
expenditure = runif(72))
In df you have stored information on monthly expenditure from a bunch of clients for the past 2 years. Now you want to calculate the monthly difference between this year and the previous year for each client.
Is there any way of doing this maintaining the "long" format of the dataset? Here I show you the way I am doing it nowadays, which implies going wide:
df2 = df %>%
mutate(date2 = paste0('val_',
year(date),
formatC(month(date), width=2, flag="0"))) %>%
select(client_id, date2, value) %>%
pivot_wider(names_from = date2,
values_from = value)
df3 = (df2[,2:13] - df2[,14:25])
However I find tihs unnecessary complex, and in large datasets going from long to wide can take quite a lot of time, so I think there must be a better way of doing it.
If you want to keep data in long format, one way would be to group by month and date value for each client_id and calculate the difference using diff.
library(dplyr)
df %>%
group_by(client_id, month_date = format(date, "%m-%d")) %>%
summarise(diff = -diff(expenditure))
# client_id month_date diff
# <int> <chr> <dbl>
# 1 1 01-01 0.278
# 2 1 02-01 -0.0421
# 3 1 03-01 0.0117
# 4 1 04-01 -0.0440
# 5 1 05-01 0.855
# 6 1 06-01 0.354
# 7 1 07-01 -0.226
# 8 1 08-01 0.506
# 9 1 09-01 0.119
#10 1 10-01 0.00819
# … with 26 more rows
An option with data.table
library(data.table)
library(zoo)
setDT(df)[, .(diff = -diff(expenditure)), .(client_id, month_date = as.yearmon(date))]
I have a series of large matrices and I am just getting used to navigating them in this format and working with functions.
I have minute data for a number of parameters which i have been able to reduce to daily averages - i would like to align each mean output with a date sequence and from there extract the daily average for each year.
In the singular form i have done it like this
A <- matrix(c(1:3285),nrow=3)
AA <- sapply(1:1095, function(x) mean(A [,x], na.rm = TRUE))
D <- seq(from = as.Date("2013-01-01"), to = as.Date("2015-12-31"), by= 1)
df <- cbind.data.frame(D,AA)
Which gets me the means per column aligned to a date for 2013-2015
library(lubridate)
years <- year(as.Date(df$D, "%d-%b-%y"))
day <- yday(as.Date(df$D, "%d-%b-%y"))
#to get the average of DOY over three years
avg <- as.data.frame(tapply(df$AA,day, mean, na.rm=T)) #gives average value on day of year
#Average for specific DOY for each year
av <- as.data.frame(tapply(df$AA,list(day,years), mean, na.rm=T)) #gets the DOY average per year
#bind to get yearly averages and overall average in a data frame format
DF <- cbind(av,avg)
head(DF)
colnames(DF)[4] <- "avg" #rename ts average column
Now say i have multiple matrices (all the same dimension just different parameters) that i want to do this for... is there an efficient way to loop through this so i get a data frame (DF) output for each A-C?
#extra matrices to play with:
B <- matrix(c(3285:6570),nrow=3)
C <- matrix(c(6570:9855),nrow=3)
I have gotten thus far with some initial help on stackoverflow:
#column means for each matrices
vapply(list(A, B, C), colMeans, numeric(1095))
Here's a tidyverse solution. Let
dates <- seq(from = as.Date("2013-01-01"), to = as.Date("2015-12-31"), by = 1)
A <- data.frame(matrix(c(1:3285), ncol = 3, byrow = TRUE))
since I understand that dates are the same to all the matrices. Also, I made A long rather than wide, that's better when working with tidyverse. Then perhaps you would prefer the output in the form of
A %>% group_by(year = year(dates), day = yday(dates)) %>%
summarise(dayYearAvg = mean(c(X1, X2, X3))) %>%
group_by(day) %>% mutate(dayAvg = mean(dayYearAvg))
# A tibble: 1,095 x 4
# Groups: day [365]
# year day dayYearAvg dayAvg
# <dbl> <dbl> <dbl> <dbl>
# 1 2013 1 2 1097
# 2 2013 2 5 1100
# 3 2013 3 8 1103
# ...
If not, we get the same as in your example with
A %>% group_by(year = year(dates), day = yday(dates)) %>%
summarise(dayYearAvg = mean(c(X1, X2, X3))) %>%
group_by(day) %>% mutate(dayAvg = mean(dayYearAvg)) %>%
spread(year, dayYearAvg) %>% ungroup %>% select(-day)
# A tibble: 365 x 4
# dayAvg `2013` `2014` `2015`
# <dbl> <dbl> <dbl> <dbl>
# 1 1097 2 1097 2192
# 2 1100 5 1100 2195
# 3 1103 8 1103 2198
# 4 1106 11 1106 2201
# ...
Now let also
B <- data.frame(matrix(c(3285:6569), ncol = 3, byrow = TRUE))
C <- data.frame(matrix(c(6570:9854), ncol = 3, byrow = TRUE))
l <- list(A, B, C)
This gives
map(l, . %>% group_by(year = year(dates), day = yday(dates)) %>%
summarise(dayYearAvg = mean(c(X1, X2, X3))) %>%
group_by(day) %>% mutate(dayAvg = mean(dayYearAvg)) %>%
spread(year, dayYearAvg) %>% ungroup %>% select(-day))
# [[1]]
# A tibble: 365 x 4
# dayAvg `2013` `2014` `2015`
# <dbl> <dbl> <dbl> <dbl>
# 1 1097 2 1097 2192
# 2 1100 5 1100 2195
# ...
# [[2]]
# A tibble: 365 x 4
# dayAvg `2013` `2014` `2015`
# <dbl> <dbl> <dbl> <dbl>
# 1 4381 3286 4381 5476
# 2 4384 3289 4384 5479
# ...
# [[3]]
# A tibble: 365 x 4
# dayAvg `2013` `2014` `2015`
# <dbl> <dbl> <dbl> <dbl>
# 1 7666 6571 7666 8761
# 2 7669 6574 7669 8764
# ...
Here's a tinyverse solution (i.e., no third-party packages) that wraps your process in a function to receive a matrix as input and return data frame as output. Then run lapply on a list of matrices.
df_process <- function(mat) {
# CREATE DF AND ADD NEW COLUMNS
df <- within(data.frame(D=seq(from = as.Date("2013-01-01"),
to = as.Date("2015-12-31"), by= 1),
AA=sapply(1:1095, function(x) mean(mat[,x], na.rm=TRUE))),
{
year <- format(as.Date(df$D, origin="1970-01-01"), "%Y")
day <- format(as.Date(df$D, origin="1970-01-01"), "%d")
})
# CREATE DF WITH TAPPLY CALLS, RENAME COLUMNS
df <- setNames(data.frame(tapply(df$AA,list(day,years), mean, na.rm=T),
avg = c(tapply(df$AA, day, mean, na.rm=T))),
c("2013", "2014", "2015", "avg"))
}
A <- matrix(c(1:3285),nrow=3)
B <- matrix(c(3286:6570),nrow=3)
C <- matrix(c(6571:9855),nrow=3)
# NAMED LIST OF DATA FRAMES
DF_list <- setNames(lapply(list(A, B, C), df_process), c("A", "B", "C"))
all.equal(DF, DF_list$A)
# [1] TRUE
identical(DF, DF_list$A)
# [1] TRUE
Output
lapply(DF_list, head)
# $A
# 2013 2014 2015 avg
# 01 501.5 1596.5 2691.5 1596.5
# 02 504.5 1599.5 2694.5 1599.5
# 03 507.5 1602.5 2697.5 1602.5
# 04 510.5 1605.5 2700.5 1605.5
# 05 513.5 1608.5 2703.5 1608.5
# 06 516.5 1611.5 2706.5 1611.5
# $B
# 2013 2014 2015 avg
# 01 3786.5 4881.5 5976.5 4881.5
# 02 3789.5 4884.5 5979.5 4884.5
# 03 3792.5 4887.5 5982.5 4887.5
# 04 3795.5 4890.5 5985.5 4890.5
# 05 3798.5 4893.5 5988.5 4893.5
# 06 3801.5 4896.5 5991.5 4896.5
# $C
# 2013 2014 2015 avg
# 01 7071.5 8166.5 9261.5 8166.5
# 02 7074.5 8169.5 9264.5 8169.5
# 03 7077.5 8172.5 9267.5 8172.5
# 04 7080.5 8175.5 9270.5 8175.5
# 05 7083.5 8178.5 9273.5 8178.5
# 06 7086.5 8181.5 9276.5 8181.5
I'd like to use dplyr and tbrf to calculate a 90 day rolling geometric mean and 90th percentile for each group 'Type'. The code below is generating percentiles for each date, not every 90 days. It also is wrongly generating duplicate rows.
side note:I first tried using %within% and creating an interval in the lubridate package.However, class Interval from lubridate is currently not supported in dplyr hence wanting to try tbrf. I have also tried tibbletime,RcppRoll and zoo's Rollapply
##sample data###
Value=c(50,900,25,25,125,50,25,25,2000,25,25,
25,25,25,25,25,25,325,25,300,475,25)
Dates = as.Date(c("2015-02-23","2015-04-20","2015-06-17",
"2015-08-20","2015-10-05","2015-12-22",
"2016-01-19","2016-03-29","2016-05-03",
"2016-07-21","2016-09-08","2016-11-07",
"2017-02-27","2017-04-19","2017-06-29",
"2017-08-24","2017-10-23","2017-12-28",
"2018-01-16","2018-03-14","2018-05-29",
"2018-07-24"))
Type = c(rep("A", 11), rep("B", 11))
df=data.frame(Value,Dates,Type)
######failed attempt 1####
df2=df %>% group_by(Type) %>%
tbr_gmean(Value, Dates, "days", 90) %>%
tbr_misc(Value, Dates,"days", 90, quantile, .(0.9))
## failed attempt #2##
start.date = min(df$Dates)
breaks = seq(start.date - 30*3600*24, start.date + 30*3600*24, "90 days")
df$group = cut(df$Dates, breaks=breaks)
DF= df %>% group_by(Type,group) %>%
mutate(Count=n(),gm=geoMean(Value),
percentile_90=quantile(Value,0.90))
Edited: Try this:
library(psych)
library(dplyr)
library(zoo)
dfmod<-df %>%
group_by(Type) %>%
arrange(Dates) %>%
mutate(rnk = cumsum(c(TRUE, diff(Dates) > 5)))%>% #changed it from !=1 to reflect that you want the date difference to be within 5 days or less
group_by(Type,rnk) %>%
mutate(GM = rollapply(Value, 2, geometric.mean, fill=NA, align="right"),
qt=rollapply(Value, 2, quantile, p=0.90, fill=NA, align="right")) #changed 5 to 2 so that the rolling sum is calculated for every 2 rows
head(dfmod)
## A tibble: 6 x 6
## Groups: Type, rnk [1]
# Dates Type Value rnk GM qt
# <date> <fct> <dbl> <int> <dbl> <dbl>
#1 2018-10-03 A 35.3 1 NA NA
#2 2018-10-04 A 34.3 1 NA NA
#3 2018-10-05 A 34.6 1 NA NA
#4 2018-10-06 A 34.3 1 NA NA
#5 2018-10-07 A 34.1 1 34.5 35.1
#6 2018-10-08 A 34.7 1 34.4 34.6
Want to calculate conditional sum based on specified dates in r. My sample df is
start_date = c("7/24/2017", "7/1/2017", "7/25/2017")
end_date = c("7/27/2017", "7/4/2017", "7/28/2017")
`7/23/2017` = c(1,5,1)
`7/24/2017` = c(2,0,2)
`7/25/2017` = c(0,0,10)
`7/26/2017` = c(2,2,2)
`7/27/2017` = c(0,0,0)
df = data.frame(start_date,end_date,`7/23/2017`,`7/24/2017`,`7/25/2017`,`7/26/2017`,`7/27/2017`)
In Excel it looks like:
I want to perform calculations as specified in Column H which is a conditional sum of columns C through G based on the dates specified in columns A and B.
Apparently, Excel allows columns to be dates but not R.
#wide to long format
dat <- reshape(df, direction="long", varying=list(names(df)[3:7]), v.names="Value",
idvar=c("start_date","end_date"), timevar="Date",
times=seq(as.Date("2017/07/23"),as.Date("2017/07/27"), "day"))
#convert from factor to date class
dat$end_date <- as.Date(dat$end_date, format = "%m/%d/%Y")
dat$start_date <- as.Date(dat$start_date, format = "%m/%d/%Y")
library(dplyr)
dat %>% group_by(start_date, end_date) %>%
mutate(mval = ifelse(between(Date, start_date, end_date), Value, 0)) %>%
summarise(conditional_sum=sum(mval))
# # A tibble: 3 x 3
# # Groups: start_date [?]
# start_date end_date conditional_sum
# <date> <date> <dbl>
# 1 2017-07-01 2017-07-04 0
# 2 2017-07-24 2017-07-27 4
# 3 2017-07-25 2017-07-28 12
You could achieve that as follows:
# number of trailing columns without numeric values
c = 2
# create a separate vector with the dates
dates = as.Date(gsub("X","",tail(colnames(df),-c)),format="%m.%d.%Y")
# convert date columns in dataframe
df$start_date = as.Date(df$start_date,format="%m/%d/%Y")
df$end_date = as.Date(df$end_date,format="%m/%d/%Y")
# calculate sum
sapply(1:nrow(df),function(x) {y = df[x,(c+1):ncol(df)][dates %in%
seq(df$start_date[x],df$end_date[x],by="day") ]; ifelse(length(y)>0,sum(y),0) })
returns:
[1] 4 0 12
Hope this helps!
Here's a solution all in one dplyr pipe:
library(dplyr)
library(lubridate)
library(tidyr)
df %>%
gather(date, value, -c(1, 2)) %>%
mutate(date = gsub('X', '', date)) %>%
mutate(date = gsub('\\.', '/', date)) %>%
mutate(date = mdy(date)) %>%
filter(date >= mdy(start_date) & date <=mdy(end_date)) %>%
group_by(start_date, end_date) %>%
summarize(Conditional_Sum = sum(value)) %>%
right_join(df) %>%
mutate(Conditional_Sum = ifelse(is.na(Conditional_Sum), 0, Conditional_Sum)) %>%
select(-one_of('Conditional_Sum'), one_of('Conditional_Sum'))
## start_date end_date X7.23.2017 X7.24.2017 X7.25.2017 X7.26.2017 X7.27.2017 Conditional_Sum
## <fctr> <fctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7/24/2017 7/27/2017 1 2 0 2 0 4
## 2 7/1/2017 7/4/2017 5 0 0 2 0 0
## 3 7/25/2017 7/28/2017 1 2 10 2 0 12