Get the mean between two dates - r

I would like to calculate the mean of a variable between two dates.
Here the example of data frames
library(lubridate) #ymd function
day= rep(seq.Date(from=ymd("2020-03-01"),to=ymd("2020-04-15"),by="day"), times=4)
center= rep(c("A", "B", "C", "D"), each=46)
ocupation= as.numeric(round(runif(184,20,40),1))
df <- data.frame(day,center,ocupation)
start <- mdy("03/15/2020","04/12/2020","05/01/2020","02/13/2020")
end <- mdy("03/20/2020","04/28/2020","05/14/2020","03/01/2020")
center<-c("A", "A", "B", "C")
id<-c(1,2,3,4)
patients <- data.frame(id, center,start,end)
The shown data frame of patients it is just a sample, the original contains more than 12.000 ids
From each id, I would like to get the mean occupation between the start and the end dates in the center

You can do this using the dplyr package from tidyverse.
df <- as_tibble(df)
library(dplyr) # 1.0.0
df %>%
# find only the days in df corresponding to day ranges in patients
filter(day %in% c(seq(patients[1, 3], patients[1, 4], by = "days"),
seq(patients[2, 3], patients[2, 4], by = "days"),
seq(patients[3, 3], patients[3, 4], by = "days"),
seq(patients[4, 3], patients[4, 4], by = "days"))) %>%
# add id column
mutate(id = ifelse(day %in% seq(patients[1, 3], patients[1, 4], by = "days"), patients$id[1],
ifelse(day %in% seq(patients[2, 3], patients[2, 4], by = "days"), patients$id[2],
ifelse(day %in% seq(patients[3, 3], patients[3, 4], by = "days"), patients$id[3], patients$id[4])))) %>%
# group by id
group_by(id) %>%
# find mean occuption for each id
summarise(mean_occupation = mean(ocupation))
# A tibble: 3 x 2
id mean_occupation
<dbl> <dbl>
1 1 29.7
2 2 31.7
3 4 32.2
EDIT
Version with for loops for many id's:
df <- as_tibble(df)
library(dplyr)
# create days vector from patients
days <- list()
for (i in 1:nrow(patients)) {
dates <- seq(patients[i, 3], patients[i, 4], by = "days")
for (j in 1:length(dates)) {
names(dates)[j] <- patients$id[i]
}
days[[i]] <- dates
}
days <- as.Date(unlist(days), origin = "1970-01-01")
# filter df for days
mid <- df %>%
filter(day %in% days)
# create id col (I couldn't do this directly in mutate())
id <- character()
for (i in 1:nrow(mid)) {
id[i] <- names(days)[which(days == mid$day[i])]
}
# bind together and finish
final <- mid %>%
cbind(id) %>% as_tibble() %>%
group_by(id) %>%
summarise(mean_occupation = mean(ocupation))
> final
# A tibble: 3 x 2
id mean_occupation
<chr> <dbl>
1 1 29.7
2 2 31.7
3 4 32.2

I would create a function that returns the average occupancy for one id:
mean.occ = function(id, patients, occupency, day, center){
to.select = day > patients[id, "start"] & day < patients[id, "end"] & center == patients[id, "center"]
return(mean(occupency[to.select]))
}
here, day > patients[id, "start"] & day < patients[id, "end"] & center == patients[id, "center"] select the ocuupency values between the start and end dates for a specific id and corresponding to the given center.
Then use sapply to apply it to each ids:
mean.occupancies = sapply(patients$id, FUN = mean.occ, patients, ocupation, day, center)
It is finally possible to add the results to the patients data frame:
patients = cbind.data.frame(patients, mean.occupancies)

Related

Error in 'group by' stating its a character for looping through summarise R

still getting to grips with R, and as a newbie, as I have been doing most of my coding manual, ie copy and paste the same block of code 20 times. Here, I was trying to learn about looping and summarising. When I do the summarise with one individual dataset, it works fine, but now I try and loop it, it states its a character, so I added in 'as.numeric' to no prevail. Any advice would be helpful.
Error:
Error in UseMethod("group_by_") :
no applicable method for 'group_by_' applied to an object of class "character"
#educationYears
fiv0_education <- subset(sf_education, Year == '2005')
six0_education <- subset(sf_education, Year == '2006')
sev0_education <- subset(sf_education, Year == '2007')
eig0_education <- subset(sf_education, Year == '2008')
nin0_education <- subset(sf_education, Year == '2009')
ten_education <- subset(sf_education, Year == '2010')
one_education <- subset(sf_education, Year == '2011')
two_education <- subset(sf_education, Year == '2012')
thr_education <- subset(sf_education, Year == '2013')
for_education <- subset(sf_education, Year == '2014')
fiv_education <- subset(sf_education, Year == '2015')
six_education <- subset(sf_education, Year == '2016')
sev_education <- subset(sf_education, Year == '2017')
eig_education <- subset(sf_education, Year == '2018')
nin_education <- subset(sf_education, Year == '2019')
names <- c('fiv0', 'six0', 'sev0', 'eig0', 'nin0', 'ten', 'one', 'two', 'thr', 'for', 'fiv', 'six', 'sev', 'eig', 'nin')
test <- vector("list", length(names))
for (i in 1:length(names)) {
test[i] <- paste(names[i], '_education', sep = "", collapse = NULL) %>%
group_by(as.numeric(as.character(Kod))) %>%
summarise(Count=sum(as.numeric(as.character(Count))))
}
Here is a solution using built-in data set iris as an example. I believe it's easy to adapt to the problem in the question.
1. A solution with a for loop, like in the question.
library(dplyr)
names <- c('fiv0', 'six0', 'sev0')
test <- vector("list", length(names))
for (i in 1:length(names)) {
tmp <- paste0(names[i], '_education')
test[[i]] <- get(tmp, envir = .GlobalEnv) %>%
mutate(Count = as.numeric(as.character(Count))) %>%
group_by(Kod) %>%
summarise(Count = sum(Count))
}
test
#[[1]]
## A tibble: 4 x 2
# Kod Count
# <int> <dbl>
#1 1 1.6
#2 2 3.7
#3 3 2.4
#4 4 4.6
#
#[[2]]
## A tibble: 4 x 2
# Kod Count
# <int> <dbl>
#1 1 24.5
#2 2 27.2
#3 3 19.1
#4 4 30.5
#
#[[3]]
## A tibble: 4 x 2
# Kod Count
# <int> <dbl>
#1 1 15.9
#2 2 18.9
#3 3 15.5
#4 4 16
2. Here is another way, with purrr::map.
This code uses the data set already split in several with subset.
paste0(names, '_education') %>%
mget(envir = .GlobalEnv) %>%
purrr::map(
function(X){
X %>%
mutate(Count = as.numeric(as.character(Count))) %>%
group_by(Kod) %>%
summarise(Count = sum(Count))
}
)
3. Another purrr:map way, but this time from the original data set, with no need to subset multiple times first.
Note that the splitting column here is Species, in the question it's Year.
df1 %>%
group_split(Species) %>%
purrr::map(
function(X){
X %>%
mutate(Count = as.numeric(as.character(Count))) %>%
group_by(Kod) %>%
summarise(Count = sum(Count))
}
)
Data creation code.
set.seed(1234)
df1 <- iris[4:5]
names(df1)[1] <- "Count"
df1$Kod <- sample(4, 150, TRUE)
fiv0_education <- subset(df1, Species == 'setosa')
six0_education <- subset(df1, Species == 'virginica')
sev0_education <- subset(df1, Species == 'versicolor')

I want to return a season and year value from a continuous list of dates

I have a continuous list of dates (yyyy-mm-dd) from 1985 to 2018 in one column (Colname = date). What I wish to do is generate another column which outputs a water season and year given the date.
To make it clearer I have two water season:
Summer = yyyy-04-01 to yyyy-09-31;
Winter = yyyy-10-01 to yyyy(+1)-03-31.
So for 2018 - Summer = 2018-04-01 to 2018-09-31; Winter 2018-10-01 to 2019-03-31.
What I would like to output is something like the following:
Many thanks.
A tidy verse approach
library(tidyverse)
df <-tibble(date = seq(from = as.Date('2000-01-01'), to = as.Date('2001-12-31'), by = '1 month'))
df
df %>%
mutate(water_season_year = case_when(
lubridate::month(date) %in% c(4:9) ~str_c('Su_', lubridate::year(date)),
lubridate::month(date) %in% c(10:12) ~str_c('Wi_', lubridate::year(date)),
lubridate::month(date) %in% c(1:3)~str_c('Wi_', lubridate::year(date) -1),
TRUE ~ 'Error'))
You can compare just the month part of the data to get the season, in base R consider doing
month <- as.integer(format(df$date, "%m"))
year <- format(df$date, "%Y")
inds <- month >= 4 & month <= 9
df$water_season_year <- NA
df$water_season_year[inds] <- paste("Su", year[inds], sep = "_")
df$water_season_year[!inds] <- paste("Wi", year[!inds], sep = "_")
#To add previous year for month <= 3 do
df$water_season_year[month <= 3] <- paste("Wi",
as.integer(year[month <= 3]) - 1, sep = "_")
df
# date water_season_year
#1 2019-01-03 Wi_2019
#2 2000-06-01 Su_2000
Make sure that date variable is of "Date" class.
data
df <-data.frame(date = as.Date(c("2019-01-03", "2000-06-01")))

Drop ID with NA in a conditional group

Extending this question:
I have some data prepared using the below code:
# # Data Preparation ----------------------
library(lubridate)
start_date <- "2018-10-30 00:00:00"
start_date <- as.POSIXct(start_date, origin="1970-01-01")
dates <- c(start_date)
for(i in 1:287) {
dates <- c(dates, start_date + minutes(i * 10))
}
dates <- as.POSIXct(dates, origin="1970-01-01")
date_val <- format(dates, '%d-%m-%Y')
weather.forecast.data <- data.frame(dateTime = dates, date = date_val)
weather.forecast.data <- rbind(weather.forecast.data, weather.forecast.data, weather.forecast.data, weather.forecast.data)
weather.forecast.data$id <- c(rep('GH1', 288), rep('GH2', 288), rep('GH3', 288), rep('GH4', 288))
weather.forecast.data$radiation <- round(runif(nrow(weather.forecast.data)), 2)
weather.forecast.data$hour <- as.integer(format(weather.forecast.data$dateTime, '%H'))
weather.forecast.data$day_night <- ifelse(weather.forecast.data$hour < 6, 'night', ifelse(weather.forecast.data$hour < 19, 'day', 'night'))
# # GH2: Total Morning missing # #
weather.forecast.data$radiation[(weather.forecast.data$id == 'GH2') & (weather.forecast.data$date == '30-10-2018') & (weather.forecast.data$day_night == 'day')] = NA
weather.forecast.data$hour <- NULL
weather.forecast.data$day_night <- NULL
My task is to remove ids from the weather.forecast.data where for each id and each date, morning half (06 hours to 18 hours), the radiation values are missing (NA) using dplyr in R.
I want to eliminate rows for a given id and date which has the entire morning radiation value as missing. i.e. if an id for a date has morning radiation missing. I drop all the rows with that particular id and date. So, we drop all the 144 records because its morning has radiation missing.
We can see that GH2 has entire morning radiation missing on date 30-10-2018. We therefore drop all 144 records with id == 'GH2' and date = '30-10-2018'.
setDT(weather.forecast.data)
weather.forecast.data[, sum(is.na(radiation)), .(id, date)]
id date V1
1: GH1 30-10-2018 0
2: GH1 31-10-2018 0
3: GH2 30-10-2018 78
4: GH2 31-10-2018 0
5: GH3 30-10-2018 0
6: GH3 31-10-2018 0
7: GH4 30-10-2018 0
8: GH4 31-10-2018 0
I have the code using data.table:
setDT(weather.forecast.data)
weather.forecast.data[, hour:= hour(dateTime)]
weather.forecast.data[, day_night:=c("night", "day")[(6 <= hour & hour < 19) + 1L]]
weather.forecast.data[, date_id := paste(date, id, sep = "__")]
weather.forecast.data[, all_is_na := all(is.na(radiation)), .(date_id, day_night)]
weather.forecast.data[!(date_id %in% unique(weather.forecast.data[(all_is_na == TRUE) & (day_night == 'day'), date_id]))]
I need the code using dplyr and I have tried the following. It is dropping many rows than required:
library(dplyr)
weather.forecast.data <- weather.forecast.data %>%
mutate(hour = as.integer(format(dateTime, '%H'))) %>%
mutate(day_night = ifelse(hour < 6, 'night', ifelse(hour < 19, 'day', 'night'))) %>%
group_by(date, day_night, id) %>%
filter((!all(is.na(radiation))) & (day_night == 'day')) %>%
select (-c(hour, day_night)) %>%
as.data.frame
Note: Output should return the data by dropping the rows where id = 'GH2' and date = '30-10-2018'
I believe you are complicating a bit. The following code does what you describe in the question.
library(lubridate)
library(dplyr)
weather.forecast.data %>%
mutate(hour = hour(dateTime),
day_night = c("night", "day")[(6 <= hour & hour < 19) + 1L]) %>%
group_by(date, id) %>%
mutate(delete = all(!(is.na(radiation) & day_night == "day"))) %>%
ungroup() %>%
filter(delete) %>%
select(-hour, -day_night, -delete) %>%
as.data.frame() -> df1
See if it worked giving the expected 144 deleted rows.
nrow(weather.forecast.data) - nrow(df1)
#[1] 144
Data.
I repost the data generation code, simplified in two places and with a call to set.seed.
set.seed(4192)
start_date <- "2018-10-30 00:00:00"
start_date <- as.POSIXct(start_date, origin="1970-01-01")
dates <- start_date + minutes(0:287 * 10)
dates <- as.POSIXct(dates, origin="1970-01-01")
date_val <- format(dates, '%d-%m-%Y')
weather.forecast.data <- data.frame(dateTime = dates, date = date_val)
weather.forecast.data <- rbind(weather.forecast.data, weather.forecast.data, weather.forecast.data, weather.forecast.data)
weather.forecast.data$id <- c(rep('GH1', 288), rep('GH2', 288), rep('GH3', 288), rep('GH4', 288))
weather.forecast.data$radiation <- round(runif(nrow(weather.forecast.data)), 2)
weather.forecast.data$hour <- hour(weather.forecast.data$dateTime)
weather.forecast.data$day_night <- ifelse(weather.forecast.data$hour < 6, 'night', ifelse(weather.forecast.data$hour < 19, 'day', 'night'))
# # GH2: Total Morning missing # #
weather.forecast.data$radiation[(weather.forecast.data$id == 'GH2') & (weather.forecast.data$date == '30-10-2018') & (weather.forecast.data$day_night == 'day')] = NA
weather.forecast.data$hour <- NULL
weather.forecast.data$day_night <- NULL
You are filtering for rows that only contain "day" in the day_night column. If I understood you correctly you want the following:
library(dplyr)
weather.forecast.data <- weather.forecast.data %>%
mutate(hour = as.integer(format(dateTime, '%H'))) %>%
mutate(day_night = ifelse(hour < 6, 'night', ifelse(hour < 19, 'day',
'night'))) %>%
group_by(date, day_night, id) %>%
filter((!(all(is.na(radiation))) & (day_night == 'day'))) %>%
select (-c(hour, day_night)) %>%
as.data.frame
This would remove all IDs that have all NAs during the day.

Identifying duplicates with different IDs

I have the following data:
set.seed(26312)
id <- rep(c(1, 2, 3, 4, 5), each = 9)
wrc <- round(runif(36, 20, 100))
wrc <- c(wrc, wrc[10:18])
x <- rep(1:9, 5)
dat <- data.frame(id, wrc, x)
In this data set, id 2 and id 5 contain the exact same data but with different IDs. This can be verified by running,
dat[dat$id == 2, ]
dat[dat$id == 5, ]
I have a much larger data set, with 4321 IDs, and I want to remove these duplicates because even though they have different IDs, they really are duplicates.
Presently I am do a combo of really awful and extremely slow for() and while() loops. In English, what the code is doing is subsetting an id and then comparing that id to every other id that I have subsetted within a while loop. When I find a duplicate, meaning all the rows of data are identical, it should throw away the first id that is a duplicate. The resulting cleaned_data is what I want, it is just unbearable slow to get there. Because it takes roughly 1 minute to do a comparison when I have 4321 ids, so that's about 4321 minutes to run this awful loop. Can someone help?
library("dplyr")
id_check = 1:5
cleaned_data <- data.frame()
for(i in id_check){
compare_tmp <- dat %>% filter(id == i)
compare_check <- compare_tmp %>% select(wrc, x)
duplicate = FALSE
if(i == length(id_check)){
cleaned_data <- rbind(cleaned_data, compare_tmp)
break
} else {
id_tmp = i + 1
}
while(duplicate == FALSE){
check <- dat %>% filter(id == id_tmp) %>% select(wrc, x)
if(nrow(check) == 0) break
duplicate = identical(compare_check, check)
id_tmp = id_tmp + 1
if(id_tmp == (length(id_check) + 1)) {
break
}
}
if(duplicate == FALSE){
cleaned_data <- rbind(cleaned_data, compare_tmp)
}
}
cleaned_data
This is in response to why duplicated won't work. Below ids 2 and 5 are not the same subjects because there data aren't always identical.
set.seed(26312)
id <- rep(c(1, 2, 3, 4, 5), each = 9)
wrc <- round(runif(36, 20, 100))
wrc <- c(wrc, wrc[c(1, 11:18)])
x <- rep(1:9, 5)
dat <- data.frame(id, wrc, x)
dat[dat$id == 2,]
dat[dat$id == 5,]
If I run dat[!duplicated(dat[2:3]),] it removes id 5, when it shouldn't.
If the column structure is accurate, you could convert to wide format for duplicate detection:
dat_wide = reshape2::dcast(dat, id ~ x, value.var = "wrc")
dupes = dat_wide$id[duplicated(dat_wide[-1], fromLast = T)]
no_dupes = dat[!dat$id %in% dupes, ]
Maybe something along the lines of:
do.call(
rbind,
split(dat, dat$id)[!duplicated(lapply(split(dat[2:3], dat$id), `rownames<-`, NULL), fromLast = TRUE)]
)
This splits by id, identifies duplicates, then binds again the non-duplicates.
Edit
Since time is of the essence here, I ran a benchmark of the solutions so far:
set.seed(26312)
p <- 4321
id <- rep(1:p, each = 9)
dats <- replicate(p %/% 2, round(runif(9, 20, 100)), simplify = FALSE)
wrc <- unlist(sample(dats, p, replace = TRUE))
x <- rep(1:9, times = p)
dat <- data.frame(id, wrc, x)
microbenchmark::microbenchmark(
base = {
do.call(
rbind,
split(dat, dat$id)[!duplicated(lapply(split(dat[2:3], dat$id), `rownames<-`, NULL), fromLast = TRUE)]
)
},
tidyr = {
as_tibble(dat) %>%
nest(-id) %>%
filter(!duplicated(data, fromLast = TRUE)) %>%
unnest()
},
reshape = {
dat_wide = reshape2::dcast(dat, id ~ x, value.var = "wrc")
dupes = dat_wide$id[duplicated(dat_wide[-1], fromLast = T)]
no_dupes = dat[!dat$id %in% dupes, ]
},
times = 10L
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# base 892.8239 980.36553 1090.87505 1096.12514 1187.98810 1232.47244 10 c
# tidyr 944.8156 953.10558 977.71756 976.83703 990.58672 1033.27664 10 b
# reshape 49.9955 50.13347 52.20539 51.91833 53.91568 55.64506 10 a
With tidyr:
library(tidyr)
library(dplyr)
as_tibble(dat) %>%
nest(-id) %>%
filter(!duplicated(data, fromLast = TRUE)) %>%
unnest()
# # A tibble: 36 x 3
# id wrc x
# <dbl> <dbl> <int>
# 1 1 53 1
# 2 1 44 2
# 3 1 70 3
# 4 1 31 4
# 5 1 67 5
# 6 1 50 6
# 7 1 70 7
# 8 1 40 8
# 9 1 52 9
# 10 3 95 1
# # ... with 26 more rows
(Note: not sure about the Stackoverflow policy about multiple answers, but this one is different enough to deserve a separate answer IMHO (if it's not, please say so and I'll edit my initial answer and delete this one).

Most effective way to group data in quarters and fiscal years in R

I have a large database (POY) with data from 2011 to 2017 which contains a date column. I would need to do two things: make it possible to split by quarters and by fiscal year.
Our fiscal year unfortunately does not run in parallel with calendar years but goes from July to June. Which also means that my Quarter 1 runs from July to September.
I've written some code that seems to work fine but it seems rather lengthy (especially the second part). Does anyone have any advice for this beginner to make it more efficient?
#Copy of date column and splitting it in 3 columns for year, month and day
library(tidyr)
POY$Date2 <- POY$Date
POY<-separate(POY, Date2, c("year","month","day"), sep = "-", convert=TRUE)
#Making a quarter variable
POY$quarter[POY$month<=3] <- "Q3"
POY$quarter[POY$month>3 & POY$month <=6] <- "Q4"
POY$quarter[POY$month>6 & POY$month <=9] <- "Q1"
POY$quarter[POY$month>9 & POY$month <=12] <- "Q2"
POY$quarter <- as.factor(POY$quarter)
For the Fiscal Year variable: it runs July - June, so:
June'15 should become FY1415
July'15 should become FY1516
Or: Q1 and Q2 in 2015 should become FY1516, while Q3 and Q4 of 2015 are actually FY1415.
#Making a FY variable
for (i in 1:nrow(POY)) {
if (POY$quarter[i] == "Q1" | POY$quarter[i] == "Q2") {
year1 <- as.character(POY$year[i])
year2 <- as.character(POY$year[i] + 1)
} else {
year1 <- as.character(POY$year[i]- 1)
year2 <- as.character(POY$year[i])
}
POY$FY[i] <- paste0("FY", substr(year1, start=3, stop=4), substr(year2, start=3, stop=4))
}
POY$FY <- as.factor(POY$FY)
summary(POY$FY)
Any suggestions?
Thank you!
Not sure if this was available at the time but the lubridate package contains a quarter function which allows you to create your fiscal quarter and year columns.
The documentation is here.
Examples for your case would be:
x <- ymd("2011-07-01")
quarter(x)
quarter(x, with_year = TRUE)
quarter(x, with_year = TRUE, fiscal_start = 7)
You can then use dplyr and paste function to mutate your own columns in creating fiscal quarters and years.
I've used a combination of base R, lubridate and dplyr;
# make a blank dataframe with sequential dates ...
df <- data.frame(date = seq (as.Date('2011-07-01'), as.Date('2015-07-01'), by = 'month'))
# similar to original poster, separate year/month/day
df <-
df %>%
separate(col = date, into = c('yr', 'mnth', 'dy'), sep = '-', convert = TRUE, remove = FALSE)
# extract last 2 digits of year
df$yr_small <- strftime(x = df$date, format = '%y', tz = 'GMT')
df$yr_small <- as.numeric(df$yr_small)
# Use dplyr's "case_when" to categorise quarters
df <-
df %>%
# make quarters
mutate(
quarter = case_when(
mnth >= 7 & mnth <= 9 ~ 'Q1'
, mnth >= 10 & mnth <= 12 ~ 'Q2'
, mnth >= 1 & mnth <= 3 ~ 'Q3'
, mnth >= 4 & mnth <= 6 ~ 'Q4' ) ) %>%
# ... the financial year is
mutate (
financial_year = case_when(
quarter == 'Q1' | quarter == 'Q2' ~ (yr_small + 1)
, quarter == 'Q3' | quarter == 'Q4' ~ (yr_small) ) )
# final column to make the full financial year start/end
df <- df %>% mutate (FY = paste('FY',df$financial_year, df$financial_year + 1, sep = '') )
Should give you this:
You could use this to replace the for-loop, I think. If you'd supply some data I could test it.
#Making a FY variable
POY$year1 <- as.character(POY$year - 1)
POY$year2 <- as.character(POY$year)
POY$year1[(POY$quarter == "Q1") | (POY$quarter == "Q2")] <-
as.character(POY$year[(POY$quarter == "Q1") |(POY$quarter == "Q2")])
POY$year2[(POY$quarter == "Q1") | (POY$quarter == "Q2")] <-
as.character(POY$year[(POY$quarter == "Q1") | (POY$quarter == "Q2")] + 1)
POY$FY <-
paste0("FY", substr(POY$year1, 3, 4), substr(POY$year2, 3, 4))
POY$FY <- as.factor(POY$FY)
summary(POY$FY)

Resources