Drop IDs with NA in a conditional group in R

Extending an earlier question, I have some data prepared using the code below:
# # Data Preparation ----------------------
library(lubridate)
start_date <- "2018-10-30 00:00:00"
start_date <- as.POSIXct(start_date, origin="1970-01-01")
dates <- c(start_date)
for (i in 1:287) {
  dates <- c(dates, start_date + minutes(i * 10))
}
dates <- as.POSIXct(dates, origin="1970-01-01")
date_val <- format(dates, '%d-%m-%Y')
weather.forecast.data <- data.frame(dateTime = dates, date = date_val)
weather.forecast.data <- rbind(weather.forecast.data, weather.forecast.data, weather.forecast.data, weather.forecast.data)
weather.forecast.data$id <- c(rep('GH1', 288), rep('GH2', 288), rep('GH3', 288), rep('GH4', 288))
weather.forecast.data$radiation <- round(runif(nrow(weather.forecast.data)), 2)
weather.forecast.data$hour <- as.integer(format(weather.forecast.data$dateTime, '%H'))
weather.forecast.data$day_night <- ifelse(weather.forecast.data$hour < 6, 'night', ifelse(weather.forecast.data$hour < 19, 'day', 'night'))
# # GH2: Total Morning missing # #
weather.forecast.data$radiation[(weather.forecast.data$id == 'GH2') & (weather.forecast.data$date == '30-10-2018') & (weather.forecast.data$day_night == 'day')] = NA
weather.forecast.data$hour <- NULL
weather.forecast.data$day_night <- NULL
My task is to remove, using dplyr, every id/date combination from weather.forecast.data whose entire daytime half (06:00 to 18:59) has missing (NA) radiation values.
That is, if an id has all of its daytime radiation missing on a given date, I drop every row with that particular id and date. Each id has 144 rows per date (one every 10 minutes), so all 144 records go.
We can see that GH2 has its entire daytime radiation missing on 30-10-2018. We therefore drop all 144 records with id == 'GH2' and date == '30-10-2018'.
setDT(weather.forecast.data)
weather.forecast.data[, sum(is.na(radiation)), .(id, date)]
id date V1
1: GH1 30-10-2018 0
2: GH1 31-10-2018 0
3: GH2 30-10-2018 78
4: GH2 31-10-2018 0
5: GH3 30-10-2018 0
6: GH3 31-10-2018 0
7: GH4 30-10-2018 0
8: GH4 31-10-2018 0
I already have working code using data.table:
setDT(weather.forecast.data)
weather.forecast.data[, hour:= hour(dateTime)]
weather.forecast.data[, day_night:=c("night", "day")[(6 <= hour & hour < 19) + 1L]]
weather.forecast.data[, date_id := paste(date, id, sep = "__")]
weather.forecast.data[, all_is_na := all(is.na(radiation)), .(date_id, day_night)]
weather.forecast.data[!(date_id %in% unique(weather.forecast.data[(all_is_na == TRUE) & (day_night == 'day'), date_id]))]
I need equivalent code using dplyr. I have tried the following, but it drops more rows than required:
library(dplyr)
weather.forecast.data <- weather.forecast.data %>%
  mutate(hour = as.integer(format(dateTime, '%H'))) %>%
  mutate(day_night = ifelse(hour < 6, 'night', ifelse(hour < 19, 'day', 'night'))) %>%
  group_by(date, day_night, id) %>%
  filter((!all(is.na(radiation))) & (day_night == 'day')) %>%
  select(-c(hour, day_night)) %>%
  as.data.frame
Note: the output should be the data with all rows where id == 'GH2' and date == '30-10-2018' dropped.

I believe you are overcomplicating this a bit. The following code does what you describe in the question.
library(lubridate)
library(dplyr)
weather.forecast.data %>%
  mutate(hour = hour(dateTime),
         day_night = c("night", "day")[(6 <= hour & hour < 19) + 1L]) %>%
  group_by(date, id) %>%
  mutate(delete = all(!(is.na(radiation) & day_night == "day"))) %>%
  ungroup() %>%
  filter(delete) %>%
  select(-hour, -day_night, -delete) %>%
  as.data.frame() -> df1
Check that it worked by confirming the expected 144 deleted rows:
nrow(weather.forecast.data) - nrow(df1)
#[1] 144
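As an extra sanity check, only the GH2 / 30-10-2018 block should be gone:
df1 %>% filter(id == "GH2", date == "30-10-2018") %>% nrow()
#[1] 0
table(df1$id)
# GH1 GH2 GH3 GH4
# 288 144 288 288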
Data.
I repost the data generation code, simplified in two places and with a call to set.seed.
set.seed(4192)
start_date <- "2018-10-30 00:00:00"
start_date <- as.POSIXct(start_date, origin="1970-01-01")
dates <- start_date + minutes(0:287 * 10)
dates <- as.POSIXct(dates, origin="1970-01-01")
date_val <- format(dates, '%d-%m-%Y')
weather.forecast.data <- data.frame(dateTime = dates, date = date_val)
weather.forecast.data <- rbind(weather.forecast.data, weather.forecast.data, weather.forecast.data, weather.forecast.data)
weather.forecast.data$id <- c(rep('GH1', 288), rep('GH2', 288), rep('GH3', 288), rep('GH4', 288))
weather.forecast.data$radiation <- round(runif(nrow(weather.forecast.data)), 2)
weather.forecast.data$hour <- hour(weather.forecast.data$dateTime)
weather.forecast.data$day_night <- ifelse(weather.forecast.data$hour < 6, 'night', ifelse(weather.forecast.data$hour < 19, 'day', 'night'))
# # GH2: Total Morning missing # #
weather.forecast.data$radiation[(weather.forecast.data$id == 'GH2') & (weather.forecast.data$date == '30-10-2018') & (weather.forecast.data$day_night == 'day')] = NA
weather.forecast.data$hour <- NULL
weather.forecast.data$day_night <- NULL

Your filter keeps only the rows whose day_night column is "day". If I understood you correctly, you want the following:
library(dplyr)
weather.forecast.data <- weather.forecast.data %>%
  mutate(hour = as.integer(format(dateTime, '%H'))) %>%
  mutate(day_night = ifelse(hour < 6, 'night', ifelse(hour < 19, 'day', 'night'))) %>%
  group_by(date, id) %>%
  filter(!all(is.na(radiation[day_night == 'day']))) %>%
  ungroup() %>%
  select(-c(hour, day_night)) %>%
  as.data.frame
This removes every row of an id/date combination whose daytime radiation is entirely NA.
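A quick check of what remains (every surviving id/date combination keeps its full 144 records); the expected counts are:
table(weather.forecast.data$id, weather.forecast.data$date)
#      30-10-2018 31-10-2018
#  GH1        144        144
#  GH2          0        144
#  GH3        144        144
#  GH4        144        144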

Related

Get the mean between two dates

I would like to calculate the mean of a variable between two dates.
Here is an example of the data frames:
library(lubridate) #ymd function
day <- rep(seq.Date(from = ymd("2020-03-01"), to = ymd("2020-04-15"), by = "day"), times = 4)
center <- rep(c("A", "B", "C", "D"), each = 46)
ocupation <- as.numeric(round(runif(184, 20, 40), 1))
df <- data.frame(day,center,ocupation)
start <- mdy("03/15/2020","04/12/2020","05/01/2020","02/13/2020")
end <- mdy("03/20/2020","04/28/2020","05/14/2020","03/01/2020")
center<-c("A", "A", "B", "C")
id<-c(1,2,3,4)
patients <- data.frame(id, center,start,end)
The patients data frame shown is just a sample; the original contains more than 12,000 ids.
For each id, I would like to get the mean occupation between the start and end dates at that id's center.
You can do this using the dplyr package from tidyverse.
df <- as_tibble(df)
library(dplyr) # 1.0.0
df %>%
# find only the days in df corresponding to day ranges in patients
filter(day %in% c(seq(patients[1, 3], patients[1, 4], by = "days"),
seq(patients[2, 3], patients[2, 4], by = "days"),
seq(patients[3, 3], patients[3, 4], by = "days"),
seq(patients[4, 3], patients[4, 4], by = "days"))) %>%
# add id column
mutate(id = ifelse(day %in% seq(patients[1, 3], patients[1, 4], by = "days"), patients$id[1],
ifelse(day %in% seq(patients[2, 3], patients[2, 4], by = "days"), patients$id[2],
ifelse(day %in% seq(patients[3, 3], patients[3, 4], by = "days"), patients$id[3], patients$id[4])))) %>%
# group by id
group_by(id) %>%
# find mean occuption for each id
summarise(mean_occupation = mean(ocupation))
# A tibble: 3 x 2
id mean_occupation
<dbl> <dbl>
1 1 29.7
2 2 31.7
3 4 32.2
EDIT
A version with for loops that handles many ids:
df <- as_tibble(df)
library(dplyr)
# create days vector from patients
days <- list()
for (i in 1:nrow(patients)) {
  dates <- seq(patients[i, 3], patients[i, 4], by = "days")
  for (j in 1:length(dates)) {
    names(dates)[j] <- patients$id[i]
  }
  days[[i]] <- dates
}
days <- as.Date(unlist(days), origin = "1970-01-01")
# filter df for days
mid <- df %>%
filter(day %in% days)
# create id col (I couldn't do this directly in mutate())
id <- character()
for (i in 1:nrow(mid)) {
  id[i] <- names(days)[which(days == mid$day[i])]
}
# bind together and finish
final <- mid %>%
cbind(id) %>% as_tibble() %>%
group_by(id) %>%
summarise(mean_occupation = mean(ocupation))
> final
# A tibble: 3 x 2
id mean_occupation
<chr> <dbl>
1 1 29.7
2 2 31.7
3 4 32.2
I would create a function that returns the average occupancy for one id:
mean.occ = function(id, patients, occupency, day, center){
  to.select = day > patients[id, "start"] & day < patients[id, "end"] & center == patients[id, "center"]
  return(mean(occupency[to.select]))
}
Here, day > patients[id, "start"] & day < patients[id, "end"] & center == patients[id, "center"] selects the occupancy values between the start and end dates for a specific id at the corresponding center.
Then use sapply to apply it to each id:
mean.occupancies = sapply(patients$id, FUN = mean.occ, patients, ocupation, day, center)
It is finally possible to add the results to the patients data frame:
patients = cbind.data.frame(patients, mean.occupancies)
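As a side note, since dplyr 1.1.0 the same result can be obtained with a native non-equi join via join_by(); a sketch, assuming df and patients as defined above:
library(dplyr)  # >= 1.1.0 for join_by()
df %>%
  inner_join(patients, by = join_by(center, between(day, start, end))) %>%
  group_by(id) %>%
  summarise(mean_occupation = mean(ocupation))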

I want to return a season and year value from a continuous list of dates

I have a continuous list of dates (yyyy-mm-dd) from 1985 to 2018 in one column (colname = date). What I wish to do is generate another column which outputs a water season and year for each date.
To make it clearer, I have two water seasons:
Summer = yyyy-04-01 to yyyy-09-30;
Winter = yyyy-10-01 to yyyy(+1)-03-31.
So for 2018: Summer = 2018-04-01 to 2018-09-30; Winter = 2018-10-01 to 2019-03-31.
What I would like to output is a combined season/year label (e.g. Su_2018 or Wi_2018) for each date.
Many thanks.
A tidyverse approach:
library(tidyverse)
df <-tibble(date = seq(from = as.Date('2000-01-01'), to = as.Date('2001-12-31'), by = '1 month'))
df
df %>%
  mutate(water_season_year = case_when(
    lubridate::month(date) %in% 4:9   ~ str_c('Su_', lubridate::year(date)),
    lubridate::month(date) %in% 10:12 ~ str_c('Wi_', lubridate::year(date)),
    lubridate::month(date) %in% 1:3   ~ str_c('Wi_', lubridate::year(date) - 1),
    TRUE ~ 'Error'))
You can compare just the month part of the date to get the season; in base R, consider:
month <- as.integer(format(df$date, "%m"))
year <- format(df$date, "%Y")
inds <- month >= 4 & month <= 9
df$water_season_year <- NA
df$water_season_year[inds] <- paste("Su", year[inds], sep = "_")
df$water_season_year[!inds] <- paste("Wi", year[!inds], sep = "_")
#To add previous year for month <= 3 do
df$water_season_year[month <= 3] <- paste("Wi",
as.integer(year[month <= 3]) - 1, sep = "_")
df
# date water_season_year
#1 2019-01-03 Wi_2018
#2 2000-06-01 Su_2000
Make sure that date variable is of "Date" class.
data
df <-data.frame(date = as.Date(c("2019-01-03", "2000-06-01")))
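A compact variant of the same idea: shift every date back three months with lubridate's %m-%, so each water year lines up with a calendar year (a sketch, reusing the same df):
library(lubridate)
m <- month(df$date)
df$water_season_year <- paste(ifelse(m %in% 4:9, "Su", "Wi"),
                              year(df$date %m-% months(3)), sep = "_")
df
# date water_season_year
#1 2019-01-03 Wi_2018
#2 2000-06-01 Su_2000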

Get the non-intersecting/complementary part of several date intervals

I want to get the missing parts of several date intervals in 2017.
Here, for example, is a data frame with the intervals for each "id":
df <- data.frame(id = c(rep("a", 3), rep("b", 2)),
                 start = c("2017-01-01", "2017-01-10", "2017-02-10", "2017-03-01", "2017-04-20"),
                 end = c("2017-01-15", "2017-01-20", "2017-02-20", "2017-03-28", "2017-04-29"))
id start end
a 2017-01-01 2017-01-15
a 2017-01-10 2017-01-20
a 2017-02-10 2017-02-20
b 2017-03-01 2017-03-28
b 2017-04-20 2017-04-29
I want to get:
df_final <- data.frame(id = c(rep("a", 2), rep("b", 3)),
                       start = c("2017-01-21", "2017-02-21", "2017-01-01", "2017-03-29", "2017-04-30"),
                       end = c("2017-02-09", "2017-12-31", "2017-02-28", "2017-04-19", "2017-12-31"))
id start end
a 2017-01-21 2017-02-09
a 2017-02-21 2017-12-31
b 2017-01-01 2017-02-28
b 2017-03-29 2017-04-19
b 2017-04-30 2017-12-31
Thank you!
First, make sure start and end are of class Date:
df$start <- as.Date(df$start)
df$end <- as.Date(df$end)
Use by() to split the data into a list of two data frames according to the ids.
library(purrr)
by(df, df$id, function(x){
  year <- seq(as.Date("2017-01-01"), as.Date("2017-12-31"), 1)
  ind <- map2(x$start, x$end, function(start, end){
    which(year < start | year > end)
  }) %>% reduce(intersect)
  gap <- which(diff(ind) > 1)
  head <- ind[c(1, gap + 1)] ; tail <- ind[c(gap, length(ind))]
  return(data.frame(id = unique(x$id), start = year[head], end = year[tail]))
}) %>% reduce(rbind)
Description:
year : all days in 2017.
ind : drop the dates covered by any [start, end] interval; what remains are the indices of the missing dates.
gap : the positions where those indices jump, i.e. where one missing-date run ends and the next begins.
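To see how head and tail recover contiguous runs from ind, consider a tiny hypothetical example:
ind <- c(3, 4, 5, 9, 10)       # indices of missing dates
gap <- which(diff(ind) > 1)    # 3: the jump from 5 to 9
ind[c(1, gap + 1)]             # run starts: 3 9
ind[c(gap, length(ind))]       # run ends: 5 10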
Output:
# id start end
# 1 a 2017-01-21 2017-02-09
# 2 a 2017-02-21 2017-12-31
# 3 b 2017-01-01 2017-02-28
# 4 b 2017-03-29 2017-04-19
# 5 b 2017-04-30 2017-12-31
My solution is admittedly a bit cumbersome, but I hope it helps.
I encountered a similar problem recently, and I found that expanding the table to get one row for each relevant date, and then collapsing back down to ranges, was easier than trying to work out the correct logic from the range endpoints alone.
Here's how that approach would work. Interval-join approaches are also possible, but they don't directly handle the "not in range" requirement you're dealing with.
library(dplyr)
library(fuzzyjoin)
library(lubridate)
df <- data.frame(id = c(rep("a", 3), rep("b", 2)),
                 start = c("2017-01-01", "2017-01-10", "2017-02-10", "2017-03-01", "2017-04-20"),
                 end = c("2017-01-15", "2017-01-20", "2017-02-20", "2017-03-28", "2017-04-29"))
# All the dates in 2017.
all.2017.dates = data.frame(date = seq.Date(as.Date("2017-01-01"), as.Date("2017-12-31"), by = "day"))
# Start by expanding the original dataframe so that we get one record for each
# id for each date in any of that id's ranges.
df.expanded = df %>%
# Convert the strings to real dates.
mutate(start.date = as.Date(start),
end.date = as.Date(end)) %>%
# Left join to 2017 dates on dates that are in the range of this record.
fuzzy_left_join(all.2017.dates,
by = c("start.date" = "date", "end.date" = "date"),
match_fun = list(`<=`, `>=`)) %>%
# Filter to distinct ids/dates.
select(id, date) %>%
distinct()
# Now, do an anti-join that gets dates NOT in an id's ranges, and collapse back
# down to ranges.
df.final = expand.grid(id = unique(df$id),
date = all.2017.dates$date) %>%
# Anti-join on id and date.
anti_join(df.expanded,
by = c("id", "date")) %>%
# Sort by id, then date, so that the lead/lag functions behave as expected.
arrange(id, date) %>%
# Check whether this record is an endpoint (i.e., is it adjacent to the
# previous/next record?).
mutate(prev.day.included = coalesce(date == lag(date) + 1 &
                                      id == lag(id), F),
       next.day.included = coalesce(date == lead(date) - 1 &
                                      id == lead(id), F)) %>%
# Filter to just endpoint records.
filter(!prev.day.included | !next.day.included) %>%
# Fill in both start and end dates on "start" records. The start date is the
# date in the record; the end date is the date of the next record.
mutate(start.date = as.Date(ifelse(!prev.day.included, date, NA),
origin = lubridate::origin),
end.date = as.Date(ifelse(!prev.day.included, lead(date), NA),
origin = lubridate::origin)) %>%
filter(!is.na(start.date))
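A final cleanup step (using the column names built above) puts the result into the requested id/start/end shape:
df.final %>% select(id, start = start.date, end = end.date)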
Here's my solution:
library(tidyverse)
library(lubridate)
library(wrapr)
df %>%
  mutate_at(2:3, ymd) %>%
  group_by(id) %>%
  gather('start_end', 'date', start:end) %>%
  mutate(date = if_else(start_end == 'start', min(date), max(date))) %>%
  unique() %>%
  mutate(
    start = if_else(
      start_end == 'start',
      date %>% min() %>% year() %>% paste0('-01-01') %>% ymd(),
      date
    ),
    end = if_else(
      start_end == 'end',
      date %>% max() %>% year() %>% paste0('-12-31') %>% ymd(),
      date
    )) %>%
  filter(start != end) %>%
  select(id, start, end) %>%
  mutate(supp = TRUE) %>%
  bind_rows(mutate(df, supp = FALSE) %>% mutate_at(2:3, ymd)) %>%
  arrange(id, start) %>%
  mutate(rn = row_number()) %.>%
  left_join(., mutate(., rn = rn - 1), by = c('id', 'rn')) %>%
  na.omit() %>%
  mutate(
    start = case_when(
      (start.y >= end.x) & !supp.x ~ end.x + 1,
      (start.y >= end.x) & supp.x ~ start.x,
      TRUE ~ as.Date(NA)
    ),
    end = case_when(
      (start.y >= end.x) & supp.y ~ end.y,
      (start.y >= end.x) & !supp.y ~ start.y - 1,
      TRUE ~ as.Date(NA)
    )
  ) %>%
  select(id, start, end) %>%
  na.omit()

group_by/summarise outside of the grouping in dplyr

I'm trying to group ids with date in this dataset, but I want to summarise based on one of the features outside of the group.
library(dplyr)
library(lubridate)
set.seed(100)
df <- data.frame(ids = sample(c('436247', '2465347', '346654645'), 10000, replace=TRUE),
date = sample(seq.Date(ymd('2018-03-01'), ymd('2018-05-01'), by=1), 10000, replace=TRUE))
new_df <- df %>%
group_by(ids, date) %>%
summarise(events = length(ids[date >= date - 30 & date <= date]))
I'm trying to take this data frame and answer the question: "for each id and each date, how many other records within that id are within the past 30 days of that date?" Unfortunately, when I group_by both ids and date, it only looks within the grouped date. I've created the solution below, but is there a better one with dplyr?
groupby_function <- function(df, spec_date){
  result <- df %>%
    group_by(ids) %>%
    summarise(events = length(ids[date >= spec_date - 30 & date <= spec_date])) %>%
    mutate(date = spec_date)
  return(result)
}
date_vector <- seq.Date(ymd('2018-03-01'), ymd('2018-05-01'), by=1)
list_results <- lapply(date_vector, groupby_function, df=df)
x <- do.call(rbind, list_results)
"for each of the ids, and each date, how many other records within that id, are within the past 30 days of that date"
For that, a "join by" condition makes sense, but isn't yet included in dplyr. Until it is, you could use data.table inside your dplyr chain:
# enumerate id-date combos of interest
grid_df = expand.grid(
  id = unique(df$ids),
  d = seq(min(df$date), max(df$date), by = "day")
)
# helper function
library(data.table)
count_matches = function(DF, targetDF, ...){
  onexpr = substitute(list(...))
  data.table(targetDF)[DF, on = eval(onexpr), .N, by = .EACHI]$N
}
# use a non-equi join to count matching rows
res = grid_df %>%
  mutate(d_dn = d - 30) %>%
  mutate(n = count_matches(., df, ids = id, date >= d_dn, date <= d)) %>%
  as_tibble()
# A tibble: 186 x 4
id d d_dn n
<fctr> <date> <date> <int>
1 436247 2018-03-01 2018-01-30 72
2 2465347 2018-03-01 2018-01-30 69
3 346654645 2018-03-01 2018-01-30 51
4 436247 2018-03-02 2018-01-31 123
5 2465347 2018-03-02 2018-01-31 120
6 346654645 2018-03-02 2018-01-31 100
7 436247 2018-03-03 2018-02-01 170
8 2465347 2018-03-03 2018-02-01 166
9 346654645 2018-03-03 2018-02-01 154
10 436247 2018-03-04 2018-02-02 228
# ... with 176 more rows
It should work fine for equality conditions to write either ids = id or ids == id, I think.
If you're interested, the syntax is x[i, on=, j, by=.EACHI] where x and i are tables. For each row of i, we look up rows of x based on the on= criteria (left-hand side refers to columns in x; right-hand to columns in i); then we do j for each ("by each row of i" so by=.EACHI). In this case, j = .N means that we count matched rows of x, returned as a column of counts N.
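(As an aside, dplyr has since gained native non-equi joins via join_by() in version 1.1.0.) A minimal standalone illustration of the by=.EACHI pattern, with made-up values:
library(data.table)
x <- data.table(v = c(1, 5, 9))
i <- data.table(lo = c(0, 4), hi = c(6, 10))
# for each row of i, count the rows of x with v in [lo, hi]
x[i, on = .(v >= lo, v <= hi), .N, by = .EACHI]$N
#[1] 2 2
# (1 and 5 fall in [0, 6]; 5 and 9 fall in [4, 10])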
You can look at the "ungrouped" data by just going back to the original data frame (calling df$date or df$ids). So I think what you are after is:
test_df <- df %>%
  group_by(ids, date) %>%
  summarise(events = length(df$ids[df$date >= date[1] - 30 & df$date <= date[1] & df$ids == ids[1]]))
Also, I ran your proposed function, but I did not see any difference in the result from your original group_by solution, so I don't think that is what you want.
If a 'non dplyr' solution is acceptable, this gives you what you want.
df$diff <- as.vector(
  sapply(unique(df$ids), function(x)
    sapply(df$date[df$ids == x], function(y)
      # count this id's records falling in the 30 days up to and including y
      sum(y - df$date[df$ids == x] >= 0 & y - df$date[df$ids == x] <= 30)
    )
  )
)
Alternatively, in dplyr, you can get a result like the above using:
f <- function(x) {
  sapply(x, function(y) sum(y - x >= 0 & y - x <= 30))
}
df$diff <- unlist(
  df %>%
    group_by(ids) %>%
    do(diff = f(.$date)) %>%
    .$diff
)
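Note that do() is superseded in current dplyr; because f() returns one value per input element, the same per-group computation can be written directly with mutate() (a sketch using the f defined above):
df %>%
  group_by(ids) %>%
  mutate(diff = f(date)) %>%
  ungroup()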
Here's an answer. But it assumes there's a continuous sequence of dates in each id.
df %>%
  group_by(ids, date) %>%
  count() %>%
  arrange(ids, date) %>%
  group_by(ids) %>%
  mutate(
    events = cumsum(n) - cumsum(lag(n, 30, 0))
  )
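If the dates are not continuous, one option is to pad the missing days first, e.g. with tidyr::complete() (a sketch building on the counting step above):
library(tidyr)
df %>%
  count(ids, date) %>%
  group_by(ids) %>%
  # insert a zero-count row for every missing day within each id
  complete(date = seq(min(date), max(date), by = "day"), fill = list(n = 0)) %>%
  arrange(ids, date) %>%
  mutate(events = cumsum(n) - cumsum(lag(n, 30, 0)))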

Grouping by date and collapsing rows into a single one

I'm looking for a way to transform df into dfres.
dfres is:
obj <- date       # where type == I5
min <- min(date)  # where type == I6
max <- max(date)  # where type == I6
all of this grouped by year.
year <- c('2014','2015','2016','2017','2014','2015','2016','2017','2016','2014','2015')
type <- c('I6','I6','I6','I6','I6','I6','I6','I6','I5','I5','I5')
date <- c('2014-06-03','2015-08-01','2016-06-01','2017-05-15',
          '2014-04-11','2015-03-14','2016-03-17','2017-03-08','2016-11-05',
          '2014-09-04','2015-05-01')
df <- data.frame(year, type, date)
year <- c('2014','2015','2016','2017')
obj <- c('2014-09-04','2015-05-01','2016-11-05',NA)
min <- c('2014-04-11','2015-03-14','2016-03-17','2017-03-08')
max <- c('2014-06-03','2015-08-01','2016-06-01','2017-05-15')
dfres <- data.frame(year, obj, min, max)
If anyone can help me solve this directly with a single "easy" statement, rather than by reshaping the data first, I would be grateful.
An idea using dplyr would be:
library(dplyr)
df %>%
  filter(type == 'I6') %>%
  group_by(year) %>%
  summarise(min_d = min(date), max_d = max(date)) %>%
  full_join(df[df$type == 'I5',], ., by = 'year') %>%
  select(-type) %>%
  arrange(year)
# year date min_d max_d
#1 2014 2014-09-04 2014-04-11 2014-06-03
#2 2015 2015-05-01 2015-03-14 2015-08-01
#3 2016 2016-11-05 2016-03-17 2016-06-01
#4 2017 <NA> 2017-03-08 2017-05-15
A data.table approach would be:
library(data.table)
setDT(df)
i5 <- df[type == 'I5', .(obj = date), by = year]
i6 <- df[type == 'I6', .(min = min(as.Date(date)), max = max(as.Date(date))), by = year]
dfres <- merge(i5, i6, by = 'year', all = TRUE)
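The same result can also be computed in a single grouped expression, avoiding the merge; a sketch under the same setDT(df):
dfres2 <- df[, .(obj = date[type == 'I5'][1],
                 min = min(as.Date(date[type == 'I6'])),
                 max = max(as.Date(date[type == 'I6']))),
             by = year][order(year)]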
