Consider the following example:
library(tidyverse)
library(lubridate)
df = tibble(client_id = rep(1:3, each=24),
date = rep(seq(ymd("2016-01-01"), (ymd("2016-12-01") + years(1)), by='month'), 3),
expenditure = runif(72))
In df you have stored information on monthly expenditure from a bunch of clients for the past 2 years. Now you want to calculate the monthly difference between this year and the previous year for each client.
Is there any way of doing this maintaining the "long" format of the dataset? Here I show you the way I am doing it nowadays, which implies going wide:
df2 = df %>%
mutate(date2 = paste0('val_',
year(date),
formatC(month(date), width=2, flag="0"))) %>%
select(client_id, date2, value) %>%
pivot_wider(names_from = date2,
values_from = value)
df3 = (df2[,2:13] - df2[,14:25])
However I find tihs unnecessary complex, and in large datasets going from long to wide can take quite a lot of time, so I think there must be a better way of doing it.
If you want to keep data in long format, one way would be to group by month and date value for each client_id and calculate the difference using diff.
library(dplyr)
df %>%
group_by(client_id, month_date = format(date, "%m-%d")) %>%
summarise(diff = -diff(expenditure))
# client_id month_date diff
# <int> <chr> <dbl>
# 1 1 01-01 0.278
# 2 1 02-01 -0.0421
# 3 1 03-01 0.0117
# 4 1 04-01 -0.0440
# 5 1 05-01 0.855
# 6 1 06-01 0.354
# 7 1 07-01 -0.226
# 8 1 08-01 0.506
# 9 1 09-01 0.119
#10 1 10-01 0.00819
# … with 26 more rows
An option with data.table
library(data.table)
library(zoo)
setDT(df)[, .(diff = -diff(expenditure)), .(client_id, month_date = as.yearmon(date))]
I'm struggling to find a solution for the following problem. I have a df with id's/ dob's and another monthbucket df as following
set.seed(33)
df <- data.frame(dob = sample(seq(as.Date('1940/01/01'), as.Date('2010/01/01'), by="day"), 10),
id = seq(1:10) )
monthbucket <- data.frame(month = format(seq(as.Date("2010-01-01"),as.Date("2011-01-01"),by="months"),'%Y-%m'),
startmonth = seq(as.Date("2010-01-01"),as.Date("2011-01-01"),by="months"),
endmonth = seq(as.Date("2010-02-01"),as.Date("2011-02-01"),by="months")-1)
I want to get an output which gives me the count of members within age groups (<19, 19-64, >64) for each of my monthly buckets. The count obviously switches over the year when people have birthdays.
I got the age calculation with something like:
age.fct <- function(dob, bucketdate) {
period <- as.period(interval(dob, bucketdate),unit = "year")
period$year}
I guess the general approach would be to calculate the age for each monthbucket, assign into one of the 3 age groups and count it up by month. Any suggestions?
EDIT 1.
Thanks for all the different approaches, I just run a brief benchmark on the solutions to determine which answer to accept. Somehow the data table solution didn't work on my test data set but I will check as soon as I have a few minutes in the next days.
set.seed(33)
df <- data.frame(dob = sample(seq(as.Date('1940/01/01'), as.Date('2010/01/01'), by="day"), 10000),
id = seq(1:10000) )
monthbucket <- data.frame(month = format(seq(as.Date("2010-01-01"),as.Date("2011-01-01"),by="months"),'%Y-%m'),
startmonth = seq(as.Date("2010-01-01"),as.Date("2011-01-01"),by="months"),
endmonth = seq(as.Date("2010-02-01"),as.Date("2011-02-01"),by="months")-1)
birth_days <- df$dob
month_bucket <- monthbucket$startmonth
and the benchmark
microbenchmark::microbenchmark(
MM= monthbucket %>% group_by_all %>% expand(id=df$id) %>% left_join(.,{df %>% mutate(birth_month =cut(dob, "month"))},by="id") %>% mutate(age=time_length(difftime(startmonth, birth_month),"years")) %>%
mutate(age_cat=case_when(age<19 ~ "<19", age>64 ~ ">64",TRUE ~ "19-64")) %>% group_by(month) %>% count(age_cat) %>% gather(variable, count, n) %>%
unite(variable, age_cat) %>% spread(variable, count)
,
AkselA = {ages <- as.data.frame(t(unclass(outer(monthbucket$startmonth, df$dob, "-")/365.25)))
ages <- do.call(data.frame, lapply(ages, cut, c(0, 19, 64, Inf), c("0-19", "19-64", "64+")))
ages <- sapply(ages, table)
colnames(ages) <- monthbucket$month
},
Cole1 ={t(table(apply(X = outer(month_bucket, birth_days, `-`) / 365.25, MARGIN = 2, FUN = cut, c(0,19,65, Inf)), rep(format(month_bucket,'%Y-%m'), length(birth_days))))
},
# cole2={ cast(CJ(month_bucket, birth_days)[, .N, by = .(month_bucket , cut(as.numeric(month_bucket - birth_days)/365.25, c(0,19,65,Inf)))], month_bucket ~ cut, value.var = 'N')
# },
#
Cole3={crossing(month_bucket, birth_days)%>%count(month_bucket, age_range = cut(as.numeric(month_bucket - birth_days) / 365.25, c(0,19,65,Inf)))%>%spread(age_range, n)
},
Cole4={all_combos <- expand.grid(month_bucket = month_bucket, birth_days = birth_days)
all_combos$age <- as.numeric(all_combos$month_bucket - all_combos$birth_days) / 365.25
all_combos$cut_r <- cut(all_combos$age, c(0,19,65,Inf))
reshape(data = aggregate( all_combos$month_bucket, by = list(bucket = all_combos$month_bucket,age_group = all_combos$cut_r), FUN = length), timevar = 'age_group' , idvar = 'bucket', direction = 'wide' )
},
times = 1L)
Unit: milliseconds
expr min lq mean median uq max neval
MM 4249.02810 4249.02810 4249.02810 4249.02810 4249.02810 4249.02810 1
AkselA 17.12697 17.12697 17.12697 17.12697 17.12697 17.12697 1
Cole1 3237.94534 3237.94534 3237.94534 3237.94534 3237.94534 3237.94534 1
Cole3 23.63945 23.63945 23.63945 23.63945 23.63945 23.63945 1
Cole4 877.92782 877.92782 877.92782 877.92782 877.92782 877.92782 1
Based on speed AkselA's approach seems to be the fastest but I get a different result for M-M's approach compared to all others (once AkselA's changes to 65 in the cut part cut, c(0, 19, 64, Inf)... I will accept answer based on speed but will look into the differences in the results!
Not very sophisticated but I joined the two tables (first expanded monthbucket on df$id) and then calculated the age (as you have the whole month, I just calculated difftime with the first day of month of birth and startmonth). Then, for each month (bucket) I counted number of different age groups and at the end converted long format to wide for better illustration.
library(lubridate)
library(tidyverse)
monthbucket %>%
group_by_all %>%
expand(id=df$id) %>%
left_join(.,{df %>%
mutate(birth_month =cut(dob, "month"))},
by="id") %>%
mutate(age=time_length(difftime(startmonth, birth_month),"years")) %>%
mutate(age_cat=case_when(age<19 ~ "<19",
age>64 ~ ">64",
TRUE ~ "19-64")) %>%
group_by(month) %>%
count(age_cat) %>%
gather(variable, count, n) %>%
unite(variable, age_cat) %>%
spread(variable, count)
#> # A tibble: 13 x 4
#> # Groups: month [13]
#> month `<19` `>64` `19-64`
#> <fct> <int> <int> <int>
#> 1 2010-01 3 2 5
#> 2 2010-02 3 2 5
#> 3 2010-03 3 2 5
#> 4 2010-04 3 2 5
#> 5 2010-05 3 2 5
#> 6 2010-06 3 2 5
#> 7 2010-07 3 2 5
#> 8 2010-08 3 2 5
#> 9 2010-09 3 2 5
#> 10 2010-10 3 2 5
#> 11 2010-11 3 2 5
#> 12 2010-12 3 2 5
#> 13 2011-01 3 2 5
Created on 2019-07-03 by the reprex package (v0.3.0)
Assuming I understand your request.
ages <- as.data.frame(t(unclass(outer(monthbucket$startmonth, df$dob, "-")/365.25)))
ages <- do.call(data.frame,
lapply(ages, cut, c(0, 19, 64, Inf), c("0-19", "19-64", "64+")))
ages <- sapply(ages, table)
colnames(ages) <- monthbucket$month
ages
# 2010-01 2010-02 2010-03 2010-04 2010-05 2010-06 2010-07 2010-08 2010-09 2010-10 2010-11 2010-12 2011-01
# 0-19 2 2 2 2 2 2 2 2 2 2 2 2 2
# 19-64 7 7 7 7 7 7 7 7 7 7 7 7 7
# 64+ 1 1 1 1 1 1 1 1 1 1 1 1 1
#
There are some similarities to #AkselA's answer as it depends on outer(), cut(), and table().
set.seed(33)
birth_days <- sample(seq(as.Date('1940/01/01'), as.Date('2010/01/01'), by="day"), 10)
month_bucket <- seq(as.Date("2010-01-01"),as.Date("2011-01-01"),by="months")
t(
table(
apply(
X = outer(month_bucket, birth_days, `-`) / 365.25
, MARGIN = 2
, FUN = cut, c(0,19,65, Inf)
)
, rep(format(month_bucket,'%Y-%m'), length(birth_days))
)
)
(0,19] (19,65] (65,Inf]
2010-01 2 7 1
2010-02 2 7 1
2010-03 2 7 1
2010-04 2 7 1
2010-05 2 7 1
2010-06 2 7 1
2010-07 2 7 1
2010-08 2 7 1
2010-09 2 7 1
2010-10 2 7 1
2010-11 2 7 1
2010-12 2 7 1
2011-01 2 7 1
I felt weird having such a similar solution so here is data.table:
library(data.table)
dcast(
CJ(month_bucket, birth_days
)[, .N
, by = .(month_bucket
, cut(as.numeric(month_bucket - birth_days)/365.25, c(0,19,65,Inf)))
]
, month_bucket ~ cut
, value.var = 'N')
dplyr and tidyr:
library(dplyr)
library(tidyr)
crossing(month_bucket, birth_days)%>%
count(month_bucket
, age_range = cut(as.numeric(month_bucket - birth_days) / 365.25, c(0,19,65,Inf))
)%>%
spread(age_range, n)
And a similar approach in base that I'm not completely happy with.
all_combos <- expand.grid(month_bucket = month_bucket, birth_days = birth_days)
all_combos$age <- as.numeric(all_combos$month_bucket - all_combos$birth_days) / 365.25
all_combos$cut_r <- cut(all_combos$age, c(0,19,65,Inf))
reshape(
data = aggregate(
all_combos$month_bucket
, by = list(bucket = all_combos$month_bucket
,age_group = all_combos$cut_r)
, FUN = length)
, timevar = 'age_group'
, idvar = 'bucket'
, direction = 'wide'
)
I have measured hourly data of ground O3 but with some missing data (marked as NA). I want to calculate daily maximums, but only in case there are more than 17 hourly measurements per date. In case it is less than 18 measurement per date I want to write NA.
head(o3sat)
date hour O3
1/1/2010 0 50.2
1/1/2010 1 39.8
1/1/2010 2 41.8
1/1/2010 3 NA
1/1/2010 4 9.2
1/1/2010 5 6.0
Is there a possibility to add some argument to this function to indicate that at least 75% of the data must be available in a day for the value to be calculated, else the data is removed
maximums <- aggregate(o3sat["dnevnik"], list(Date = as.Date(o3sat$datum)), max, na.rm = TRUE)
It is better to provide a reproducible example when asking a question. Here, I created an example data frame based on the information you provided. This data frame contains hourly O3 measurements from 2010-01-01 to 2010-01-03.
library(dplyr)
library(tidyr)
library(lubridate)
o3sat <- read.table(text = " date hour O3
'1/1/2010' 0 50.2
'1/1/2010' 1 39.8
'1/1/2010' 2 41.8
'1/1/2010' 3 NA
'1/1/2010' 4 9.2
'1/1/2010' 5 6.0 ",
stringsAsFactors = FALSE, header = TRUE)
set.seed(1234)
o3sat_ex <- o3sat %>%
mutate(date = mdy(date)) %>%
complete(date = seq.Date(ymd("2010-01-01"), ymd("2010-01-03"), 1), hour = 0:23) %>%
mutate(O3 = c(o3sat$O3, rnorm(66, 30, 10))) %>%
mutate(O3 = ifelse(row_number() %in% sample(7:72, 18), NA, O3))
We can count how many non-NA value per day using the following code.
o3sat_ex %>%
group_by(date) %>%
summarize(sum(!is.na(O3)))
# # A tibble: 3 x 2
# date `sum(!is.na(O3))`
# <date> <int>
# 1 2010-01-01 18
# 2 2010-01-02 17
# 3 2010-01-03 18
Based on your description, we would like to calculate the maximum for 2010-01-01 and 2010-01-03, but not 2010-01-02 as it only contains 17 non-NA values.
Here is one way to achieve the task, we can define a function, max_helper, that only returns maximum if the count of non-NA values is larger than 17.
max_helper <- function(x, threshold){
if (sum(!is.na(x)) >= threshold) {
r <- max(x, na.rm = TRUE)
} else {
r <- NA
}
return(r)
}
We can apply this number using the dplyr code to get the answer.
o3sat_ex2 <- o3sat_ex %>%
group_by(date) %>%
summarize(O3 = max_helper(O3, 18))
o3sat_ex2
# # A tibble: 3 x 2
# date O3
# <date> <dbl>
# 1 2010-01-01 50.2
# 2 2010-01-02 NA
# 3 2010-01-03 47.8
I apologize in advance for the simplicity of this question.
How can I generate a monthly time series data set using something like set seed? I have a question about results from two packages but need to create a sample data set to show as an example. My data set needs to have some NA values within in it.
Regards,
Simon
Here's a random list of 1000 dates +- 5 years from today with some missing data using the simstudy package (please provide sample data and expected output for a more specific answer):
library(simstudy)
library(dplyr)
library(lubridate)
set.seed(1724)
# define data
def <- defData(varname = "tmp", dist = "uniform", formula = "0;1") # sumstudy seems to crash when adding missing data with only 1 column
def <- defData(def, varname = "date", dist = "uniform", formula = "-5;5") # +- 5 years
df_full <- genData(1000, def)
##### missing data ----
defM <- defMiss(varname = "date", formula = 0.1, logit.link = F)
df_missing <- genMiss(df_full, defM, idvars = "id")
# Create data with missing values
df <- genObs(df_full, df_missing, idvars = "id")
df %>%
as_tibble() %>%
select(-tmp) %>%
mutate(date = ymd(floor_date(as.POSIXct(Sys.Date()) + date * 365 * 24 * 60 * 60, unit = "day")), # +- 5 years from today
month = format(date, "%Y-%m"))
# A tibble: 1,000 x 3
id date month
<int> <date> <chr>
1 1 NA NA
2 2 2021-09-12 2021-09
3 3 2023-11-08 2023-11
4 4 2015-03-02 2015-03
5 5 2021-08-12 2021-08
6 6 2021-10-20 2021-10
7 7 2017-05-17 2017-05
8 8 2019-04-12 2019-04
9 9 NA NA
10 10 NA NA
# ... with 990 more rows
Want to calculate conditional sum based on specified dates in r. My sample df is
start_date = c("7/24/2017", "7/1/2017", "7/25/2017")
end_date = c("7/27/2017", "7/4/2017", "7/28/2017")
`7/23/2017` = c(1,5,1)
`7/24/2017` = c(2,0,2)
`7/25/2017` = c(0,0,10)
`7/26/2017` = c(2,2,2)
`7/27/2017` = c(0,0,0)
df = data.frame(start_date,end_date,`7/23/2017`,`7/24/2017`,`7/25/2017`,`7/26/2017`,`7/27/2017`)
In Excel it looks like:
I want to perform calculations as specified in Column H which is a conditional sum of columns C through G based on the dates specified in columns A and B.
Apparently, Excel allows columns to be dates but not R.
#wide to long format
dat <- reshape(df, direction="long", varying=list(names(df)[3:7]), v.names="Value",
idvar=c("start_date","end_date"), timevar="Date",
times=seq(as.Date("2017/07/23"),as.Date("2017/07/27"), "day"))
#convert from factor to date class
dat$end_date <- as.Date(dat$end_date, format = "%m/%d/%Y")
dat$start_date <- as.Date(dat$start_date, format = "%m/%d/%Y")
library(dplyr)
dat %>% group_by(start_date, end_date) %>%
mutate(mval = ifelse(between(Date, start_date, end_date), Value, 0)) %>%
summarise(conditional_sum=sum(mval))
# # A tibble: 3 x 3
# # Groups: start_date [?]
# start_date end_date conditional_sum
# <date> <date> <dbl>
# 1 2017-07-01 2017-07-04 0
# 2 2017-07-24 2017-07-27 4
# 3 2017-07-25 2017-07-28 12
You could achieve that as follows:
# number of trailing columns without numeric values
c = 2
# create a separate vector with the dates
dates = as.Date(gsub("X","",tail(colnames(df),-c)),format="%m.%d.%Y")
# convert date columns in dataframe
df$start_date = as.Date(df$start_date,format="%m/%d/%Y")
df$end_date = as.Date(df$end_date,format="%m/%d/%Y")
# calculate sum
sapply(1:nrow(df),function(x) {y = df[x,(c+1):ncol(df)][dates %in%
seq(df$start_date[x],df$end_date[x],by="day") ]; ifelse(length(y)>0,sum(y),0) })
returns:
[1] 4 0 12
Hope this helps!
Here's a solution all in one dplyr pipe:
library(dplyr)
library(lubridate)
library(tidyr)
df %>%
gather(date, value, -c(1, 2)) %>%
mutate(date = gsub('X', '', date)) %>%
mutate(date = gsub('\\.', '/', date)) %>%
mutate(date = mdy(date)) %>%
filter(date >= mdy(start_date) & date <=mdy(end_date)) %>%
group_by(start_date, end_date) %>%
summarize(Conditional_Sum = sum(value)) %>%
right_join(df) %>%
mutate(Conditional_Sum = ifelse(is.na(Conditional_Sum), 0, Conditional_Sum)) %>%
select(-one_of('Conditional_Sum'), one_of('Conditional_Sum'))
## start_date end_date X7.23.2017 X7.24.2017 X7.25.2017 X7.26.2017 X7.27.2017 Conditional_Sum
## <fctr> <fctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7/24/2017 7/27/2017 1 2 0 2 0 4
## 2 7/1/2017 7/4/2017 5 0 0 2 0 0
## 3 7/25/2017 7/28/2017 1 2 10 2 0 12