I want to get the missing parts of several date intervals in 2017.
For example, for each "id" of the following dataframe:
df <- data.frame(id=c(rep("a",3),rep("b",2)),
start=c("2017-01-01","2017-01-10","2017-02-10","2017-03-01","2017-04-20"),
end=c("2017-01-15","2017-01-20","2017-02-20","2017-03-28","2017-04-29"))
id start end
a 2017-01-01 2017-01-15
a 2017-01-10 2017-01-20
a 2017-02-10 2017-02-20
b 2017-03-01 2017-03-28
b 2017-04-20 2017-04-29
I want to get:
df_final <- data.frame(id=c(rep("a",2),rep("b",3)),
start=c("2017-01-21","2017-02-21","2017-01-01","2017-03-29","2017-04-30"),
end=c("2017-02-09","2017-12-31","2017-02-28","2017-04-19","2017-12-31"))
id start end
a 2017-01-21 2017-02-09
a 2017-02-21 2017-12-31
b 2017-01-01 2017-02-28
b 2017-03-29 2017-04-19
b 2017-04-30 2017-12-31
Thank you!
First, make sure start and end are of class Date:
df$start <- as.Date(df$start)
df$end <- as.Date(df$end)
Use by() to split the data into a list of two data frames according to the ids.
library(purrr)
by(df, df$id, function(x){
year <- seq(as.Date("2017-01-01"), as.Date("2017-12-31"), 1)
ind <- map2(x$start, x$end, function(start, end){
which(year < start | year > end)
}) %>% reduce(intersect)
gap <- which(diff(ind) > 1)
head <- ind[c(1, gap + 1)] ; tail <- ind[c(gap, length(ind))]
return(data.frame(id = unique(x$id), start = year[head], end = year[tail]))
}) %>% reduce(rbind)
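Note that by() returns a list-like object (class "by") with one data frame per id, so the purrr::reduce(rbind) at the end collapses those per-id results back into a single data frame.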
Description:
year : All days in 2017.
ind : The indices of the days in year that fall outside every [start, end] interval for the given id, i.e. the missing dates.
gap : The positions where ind jumps by more than one, i.e. the breaks between consecutive runs of missing dates.
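To see how the endpoint extraction works, here is a minimal sketch with hypothetical indices:
ind <- c(21, 22, 23, 40, 41)      # indices of missing dates
gap <- which(diff(ind) > 1)       # 3: the jump from 23 to 40
ind[c(1, gap + 1)]                # 21 40 (starts of each missing range)
ind[c(gap, length(ind))]          # 23 41 (ends of each missing range)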
Output:
# id start end
# 1 a 2017-01-21 2017-02-09
# 2 a 2017-02-21 2017-12-31
# 3 b 2017-01-01 2017-02-28
# 4 b 2017-03-29 2017-04-19
# 5 b 2017-04-30 2017-12-31
I think my solution is still a bit cumbersome, but I hope it helps.
I encountered a similar problem recently, and I found that expanding the table to one row per relevant date, then collapsing back down to ranges, was easier than trying to work out the correct logic from the range endpoints alone.
Here's how that approach would work. Alternatively, it might be possible to do something like this or this, but those approaches don't handle the "not in range" requirement you're dealing with.
library(dplyr)
library(fuzzyjoin)
library(lubridate)
df <- data.frame(id=c(rep("a",3),rep("b",2)),
start=c("2017-01-01","2017-01-10","2017-02-10","2017-03-01","2017-04-20"),
end=c("2017-01-15","2017-01-20","2017-02-20","2017-03-28","2017-04-29"))
# All the dates in 2017.
all.2017.dates = data.frame(date = seq.Date(as.Date("2017-01-01"), as.Date("2017-12-31"), by = "day"))
# Start by expanding the original dataframe so that we get one record for each
# id for each date in any of that id's ranges.
df.expanded = df %>%
# Convert the strings to real dates.
mutate(start.date = as.Date(start),
end.date = as.Date(end)) %>%
# Left join to 2017 dates on dates that are in the range of this record.
fuzzy_left_join(all.2017.dates,
by = c("start.date" = "date", "end.date" = "date"),
match_fun = list(`<=`, `>=`)) %>%
# Filter to distinct ids/dates.
select(id, date) %>%
distinct()
# Now, do an anti-join that gets dates NOT in an id's ranges, and collapse back
# down to ranges.
df.final = expand.grid(id = unique(df$id),
date = all.2017.dates$date) %>%
# Anti-join on id and date.
anti_join(df.expanded,
by = c("id", "date")) %>%
# Sort by id, then date, so that the lead/lag functions behave as expected.
arrange(id, date) %>%
# Check whether this record is an endpoint (i.e., is it adjacent to the
# previous/next record?).
mutate(prev.day.included = coalesce(date == lag(date) + 1 &
id == lag(id), F),
next.day.included = coalesce(date == lead(date) - 1 &
id == lead(id), F)) %>%
# Filter to just endpoint records.
filter(!prev.day.included | !next.day.included) %>%
# Fill in both start and end dates on "start" records. The start date is the
# date in the record; the end date is the date of the next record.
mutate(start.date = as.Date(ifelse(!prev.day.included, date, NA),
origin = lubridate::origin),
end.date = as.Date(ifelse(!prev.day.included, lead(date), NA),
origin = lubridate::origin)) %>%
filter(!is.na(start.date))
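The result at this point still carries the helper columns, so a hypothetical final tidy-up (not part of the original answer) keeps only the requested columns and sorts to match df_final above:
# Hypothetical tidy-up: keep id/start/end and sort.
df.final %>%
  select(id, start = start.date, end = end.date) %>%
  arrange(id, start)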
Here's my solution:
library(tidyverse)
library(lubridate)
library(wrapr)
df %>%
mutate_at(2:3, ymd) %>%
group_by(id) %>%
gather('start_end', 'date', start:end) %>%
mutate(date = if_else(start_end == 'start', min(date), max(date))) %>%
unique() %>%
mutate(
start = if_else(
start_end == 'start',
date %>% min() %>% year() %>% paste0('-01-01') %>% ymd(),
date
),
end = if_else(
start_end == 'end',
date %>% max() %>% year() %>% paste0('-12-31') %>% ymd(),
date
)) %>%
filter(start != end) %>%
select(id, start, end) %>%
mutate(supp = TRUE) %>%
bind_rows(mutate(df, supp = FALSE) %>% mutate_at(2:3, ymd)) %>%
arrange(id, start) %>%
mutate(rn = row_number()) %.>%
left_join(., mutate(., rn = rn - 1), by = c('id', 'rn')) %>%
na.omit() %>%
mutate(
start = case_when(
(start.y >= end.x) & !supp.x ~ end.x + 1,
(start.y >= end.x) & supp.x ~ start.x,
TRUE ~ as.Date(NA)
),
end = case_when(
(start.y >= end.x) & supp.y ~ end.y,
(start.y >= end.x) & !supp.y ~ start.y - 1,
TRUE ~ as.Date(NA)
)
) %>%
select(id, start, end) %>%
na.omit()
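The key trick here is the self-join on a shifted row number: mutate(rn = rn - 1) on the right-hand copy pairs each interval with the next one for the same id, so each gap can be computed from an interval's end and its successor's start.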
Apologies, I am not sure this warrants another question, but I am basing this off of another question (Group by ID and Outcome and take the earliest Dates of specific outcomes and assign numbers (i.e. outcome1, outcome2)).
mydata = data.frame (Id =c (1,1,1,1,1,1,1,1,2,2,2,2),
Date = c("2001-01-31", "2001-02-13","2001-05-31",
"2001-06-02","2018-01-31","2018-03-31","2018-07-31",
"2019-04-04","2014-01-31","2014-02-02","2014-04-31",
"2014-05-18"),Outcome = c("Relapse","CR","Relapse","Relapse",
"CR","CR","CR","Relapse","CR", "CR","Relapse","CR"))
Which outputs the below.
Id Date Outcome
1 2001-01-31 Relapse
1 2001-02-13 CR
1 2001-05-31 Relapse
1 2001-06-02 Relapse
1 2018-01-31 CR
1 2018-03-31 CR
1 2018-07-31 CR
1 2019-04-04 Relapse
2 2014-01-31 CR
2 2014-02-02 CR
2 2014-04-31 Relapse
2 2014-05-18 CR
As you can see, each patient is in certain phases at different times, and I would like to capture the earliest date when each new phase starts per patient. I would then like to rename these phases CR1, Relapse1, CR2, Relapse2 and so forth. But I would only like to start naming AFTER the patient achieves their first CR. So if a patient has a relapse before CR, those rows would be ignored.
I'm hoping to get an output like this:
Id CR1 Relapse1 CR2 Relapse2
1 2001-02-13 2001-05-31 2018-01-31 2019-04-04
2 2014-01-31 2014-04-31 2014-05-18 NA
As you see, patient 1 had a relapse on 2001-01-31, but it was ignored because it occurred before the first CR.
The following code was very helpful for the previous question, but I was wondering if I can modify it so that it starts counting after the first CR? Thank you!!
library(dplyr)
library(tidyr)      # pivot_wider()
library(data.table) # rleid()

mydata %>%
group_by(Id) %>%
mutate(Grp = rleid(Outcome)) %>%
group_by(Grp, .add = T) %>%
slice(1) %>%
group_by(Id, Outcome) %>%
mutate(n = row_number()) %>%
pivot_wider(id_cols = Id, names_from = c(Outcome, n), values_from = Date)
Using the data.table package:
library(data.table)
mydata <- setDT(mydata)
# identify consecutive same Outcome for each Id
mydata[, flag:=rleid(Id, Outcome)]
# collapse identified consecutive values (keep min date)
tmp <- mydata[, by="flag,Id,Outcome", .(Date=min(Date))]
# mark relapse if first
tmp <- tmp[, by=Id,
flag := (Outcome[1]=="Relapse") & (seq_len(.N)==1)
]
# remove marked relapse
tmp <- tmp[(!flag)]
# Outcome numbering
tmp[, by="Id,Outcome", n:=seq_len(.N)]
# cast (widen)
dcast(tmp, Id~paste0(Outcome, n), value.var = "Date")
Which gives:
Id CR1 CR2 Relapse1 Relapse2
1: 1 2001-02-13 2018-01-31 2001-05-31 2019-04-04
2: 2 2014-01-31 2014-05-18 2014-04-31 <NA>
Using %>% notation (note that data.table does not export the pipe, so this assumes magrittr or dplyr is also loaded):
mydata[, flag:=rleid(Id,Outcome)] %>%
.[, by="flag,Id,Outcome", .( Date=min(Date) )] %>%
.[, by="Id", flag := (Outcome[1]=="Relapse") & (seq_len(.N)==1)] %>%
.[(!flag)] %>%
.[, by="Id,Outcome", n := seq_len(.N)] %>%
dcast(., Id ~ paste0(Outcome, n), value.var = "Date")
Here is my approach:
library(data.table)
my_df <- data.frame (Id =c (1,1,1,1,1,1,1,1,2,2,2,2),
Date = c("2001-01-31", "2001-02-13","2001-05-31",
"2001-06-02","2018-01-31","2018-03-31","2018-07-31",
"2019-04-04","2014-01-31","2014-02-02","2014-04-31",
"2014-05-18"),
Outcome = c("Relapse","Relapse","Relapse","Relapse",
"CR","CR","CR","Relapse","CR", "CR","Relapse","CR"),
stringsAsFactors = FALSE)
library(dplyr)
library(tidyr) # pivot_wider()

my_df <- my_df %>%
  group_by(Id) %>%
  arrange(Id, Date) %>%
  mutate(Value = seq_along(Outcome)) %>%
  # Flag the first CR row per Id.
  mutate(first_v = Outcome == "CR" & !duplicated(Outcome == "CR")) %>%
  # Keep CR rows and everything after the first CR.
  mutate(first_a = (!first_v & Value > Value[first_v]) | Outcome == "CR") %>%
  filter(first_a)
my_df %>%
group_by(Id) %>%
mutate(Grp = rleid(Outcome)) %>%
group_by(Grp, .add = T) %>%
slice(1) %>%
group_by(Id, Outcome) %>%
mutate(n = row_number()) %>%
pivot_wider(id_cols = Id, names_from = c(Outcome, n), values_from = Date)
You could try adding:
filter(Date >= first(Date[Outcome == "CR"]))
to filter out rows before the first "CR". This assumes your Date is sorted/arranged first.
library(tidyverse)
library(data.table)
mydata %>%
group_by(Id) %>%
filter(Date >= first(Date[Outcome == "CR"])) %>%
mutate(Grp = rleid(Outcome)) %>%
group_by(Grp, .add = T) %>%
slice(1) %>%
group_by(Id, Outcome) %>%
mutate(n = row_number()) %>%
pivot_wider(id_cols = Id, names_from = c(Outcome, n), values_from = Date)
Output
Id CR_1 Relapse_1 CR_2 Relapse_2
<dbl> <chr> <chr> <chr> <chr>
1 1 2001-02-13 2001-05-31 2018-01-31 2019-04-04
2 2 2014-01-31 2014-04-31 2014-05-18 NA
I have a dataset which has multiple start dates and end dates for each Id. I would like to take the earliest date from the "startDate" column and the latest date from the endDate column.
data = data.frame(ID=c(1,1,1,1,2,2,2),
startDate= c("2018-01-31", "2018-01-31", "2018-01-31", "2019-06-06",
"2002-06-07", "2002-06-07", "2002-09-12"),
endDate = c(NA,NA,NA,"2019-07-09",NA,NA, "2002-10-02"))
This is the output I was hoping to get:
data = data.frame(ID=c(1,2),
startDate= c("2018-01-31","2002-06-07"),
endDate = c("2019-07-09","2002-10-02"))
After some trying I have figured out how to do this with the following code, but I would prefer something more efficient if at all possible. I need to do this repeatedly, and I would rather not have to create two separate dataframes. Thank you guys for your help!
data_start <- data %>%
group_by(ID) %>%
arrange(startDate) %>%
slice(1L)
data_end <- data %>%
group_by(ID) %>%
arrange(desc(endDate)) %>%
slice(1L)
data <- left_join(data_start[,c(1,2)], data_end[,c(1,3)], by="ID")
Or with first and last:
library(dplyr)
data %>%
group_by(ID) %>%
summarise(
startDate = first(startDate),
endDate = last(endDate)
)
# A tibble: 2 x 3
ID startDate endDate
* <dbl> <chr> <chr>
1 1 2018-01-31 2019-07-09
2 2 2002-06-07 2002-10-02
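Note that first() and last() rely on the rows already being in date order (and, for last(endDate), on the NA rows not coming last within a group); with unsorted data you would arrange() first, or use min()/max() as in the answer below.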
You can use min and max, treating the variables as dates:
data %>% group_by(ID) %>%
summarise(startDate = min(as.Date(startDate),na.rm = T),
endDate = max(as.Date(endDate),na.rm = T))
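For reference, this should return the following (a sketch of the expected console output; the columns are now of class Date):
# A tibble: 2 x 3
#      ID startDate  endDate
#   <dbl> <date>     <date>
# 1     1 2018-01-31 2019-07-09
# 2     2 2002-06-07 2002-10-02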
I have two dataframes:
In DF1, for every ID, the param have been recorded on various dates.
In DF2, for every ID, a number of dates are given.
For every ID, I would like to get all the corresponding param and value from DF1, depending on the dates:
either the value corresponding to the most recent date1 (in DF1) before date2 (in DF2) for a given param, or
if there is no such date1, the value with the closest date1 after date2.
DF1 is (I have marked with * the correct rows for the result):
ID date1 param value
1 id1 1/1/2020 pA pA_1_1
2 id1 2/1/2020 pA pA_1_2 *
3 id1 17/1/2020 pA pA_1_3
4 id1 20/1/2020 pB pB_1_1 *
5 id1 21/1/2020 pB pB_1_2
6 id2 21/12/2022 pA pA_2_1 *
7 id2 22/12/2022 pA pA_2_2
8 id2 18/12/2022 pB pB_2_1 *
9 id2 19/12/2022 pB pB_2_2
DF2 is:
ID date2
1 id1 15/1/2020
2 id2 20/12/2020
The result should be:
ID date2 param value date1
1 id1 15/1/2020 pA pA_1_2 2/1/2020
2 id1 15/1/2020 pB pB_1_1 20/1/2020
3 id2 20/12/2020 pA pA_2_1 21/12/2022
4 id2 20/12/2020 pB pB_2_1 18/12/2022
Code to reproduce the DF1 and DF2:
DF1= data.frame(
stringsAsFactors = FALSE,
ID = c("id1","id1","id1","id1",
"id1","id2","id2","id2","id2"),
date1 = c("1/1/2020","2/1/2020",
"17/1/2020","20/1/2020","21/1/2020","21/12/2022",
"22/12/2022","18/12/2022","19/12/2022"),
param = c("pA", "pA", "pA", "pB", "pB", "pA", "pA", "pB", "pB"),
value = c("pA_1_1","pA_1_2","pA_1_3",
"pB_1_1","pB_1_2","pA_2_1","pA_2_2","pB_2_1","pB_2_2")
)
DF2=data.frame(
stringsAsFactors = FALSE,
ID = c("id1", "id2"),
date2 = c("15/1/2020", "20/12/2020")
)
This is my solution. I'm sure there is a way to write this with less code (using one dataframe instead of two and merging later), but I don't know it right now.
library(tidyverse)
library(lubridate)
# Get before date2
before <- DF1 %>%
left_join(DF2,by = "ID") %>%
mutate(diff = dmy(date1)-dmy(date2)) %>%
mutate(Grp = data.table::rleid(param)) %>%
filter(diff < 0) %>%
group_by(Grp) %>%
filter(diff == max(diff)) %>%
ungroup
# Get after date2
after <- DF1 %>%
left_join(DF2,by = "ID") %>%
mutate(diff = dmy(date1)-dmy(date2)) %>%
mutate(Grp = data.table::rleid(param)) %>%
filter(diff > 0) %>%
group_by(Grp) %>%
filter(! Grp %in% before$Grp, diff == min(diff)) %>%
ungroup
result <- bind_rows(before,after) %>%
select(ID,date2, param, value, date1) %>%
arrange(ID, param)
Explanation: I'm using the lubridate library to compare the dates. I follow the same process to create two dataframes: the first one (before) holds the groups that satisfy the first condition (closest date1 in DF1 before date2 in DF2), and the second one (after) holds the groups that go the other way round (closest date1 in DF1 after date2 in DF2).
I will explain the first one:
# Get before date2
before <- DF1 %>%
left_join(DF2,by = "ID") %>%
mutate(diff = dmy(date1)-dmy(date2)) %>%
mutate(Grp = data.table::rleid(param)) %>%
filter(diff < 0) %>%
group_by(Grp) %>%
filter(diff == max(diff)) %>%
ungroup
Here, we merge DF1 and DF2 by ID, so rows with the same ID share the same date2. Then we calculate the differences date1 - date2, first converting the character columns to dates with dmy(); dates before date2 therefore give a negative difference. With data.table::rleid(param) we enumerate the consecutive runs of param (which in this data also line up with the ID changes), giving subgroup identifiers that we can group and filter by.
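A quick illustration of the rleid() step, assuming DF1 as defined above:
data.table::rleid(DF1$param)
# [1] 1 1 1 2 2 3 3 4 4
# one id per run of consecutive equal param values; in this data the runs
# also line up with the ID changes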
At the end:
result <- bind_rows(before,after) %>%
select(ID,date2, param, value, date1) %>%
arrange(ID, param)
We bind the two dataframes by rows and select the columns you are looking for, dropping the helper columns we created to operate with (Grp and diff).
PS: I added arrange() to make sure the final df is sorted by ID and param values.
The code below produces the number of avalanches in SLC for each year-month during the ski season (Dec-Mar). Since this code totals only the year-months present in the data, it does not add in the year-months that had 0 avalanches. How do I fill in my table so it provides all year-months?
# write the webscraper
library(XML)
library(RCurl)
library(dplyr)
library(zoo) # for as.yearmon()
avalanche<-data.frame()
avalanche.url<-"https://utahavalanchecenter.org/observations?page="
all.pages<-0:202
for(page in all.pages){
this.url<-paste(avalanche.url, page, sep="")
this.webpage<-htmlParse(getURL(this.url))
thispage.avalanche<-readHTMLTable(this.webpage, which=1, header=T,stringsAsFactors=F)
names(thispage.avalanche)<-c('Date','Region','Location','Observer')
avalanche<-rbind(avalanche,thispage.avalanche)
}
# subset the data to the Salt Lake Region
avalancheslc<-subset(avalanche, Region=="Salt Lake")
str(avalancheslc)
# convert the dates and get the total the number of avalanches
avalancheslc <- avalancheslc %>%
group_by(Date = format(as.yearmon(Date, "%m/%d/%Y"), "%Y-%m")) %>%
summarise(AvalancheTotal = n())
# pipe to only include Dec-Mar of each year
avalancheslc <- avalancheslc %>% filter(as.integer(substr(Date, 6, 7)) %in% c(12, 1:3))
# the data right now looks like this
Date AvalancheTotal
1980-01 1
1981-02 1
.
.
.
# the data needs to look like this
Date AvalancheTotal
1980-01 1
1980-02 0
1980-03 0
1980-12 0
1981-01 0
1981-02 1
1981-03 1
library("tidyverse")
library("lubridate")
# Your data here...
# Simpler version
avalancheslc %>%
separate(Date, c("year", "month")) %>%
# Some years might be missing (no avalanches at all)
# We can fill in those with `full_seq` but
# `full_seq` works with numbers not characters
mutate(year = as.integer(year)) %>%
complete(year = full_seq(year, 1), month,
fill = list(AvalancheTotal = 0)) %>%
unite("Date", year, month, sep = "-")
# Alternative version (fills in all months, so needs filtering afterwards)
avalancheslc <- avalancheslc %>%
# In case `Date` needs parsing
mutate(Date = parse_date_time(Date, "ym"))
# A full data frame of months
all_months <- avalancheslc %>%
expand(Date = seq(first(Date), last(Date), by = "month"))
# Join to `avalanches` and fill in with 0s
avalancheslc %>%
right_join(all_months) %>%
replace_na(list(AvalancheTotal = 0))
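Since the alternative version fills in every month of the year, a hypothetical final step (not in the original answer) is to re-apply the ski-season filter; here df.filled is a hypothetical name for the result of the right_join/replace_na pipeline above:
# Keep only Dec-Mar, as in the original pipeline.
df.filled %>%
  filter(lubridate::month(Date) %in% c(12, 1:3))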
Extending this question:
I have some data prepared using the below code:
# # Data Preparation ----------------------
library(lubridate)
start_date <- "2018-10-30 00:00:00"
start_date <- as.POSIXct(start_date, origin="1970-01-01")
dates <- c(start_date)
for(i in 1:287) {
dates <- c(dates, start_date + minutes(i * 10))
}
dates <- as.POSIXct(dates, origin="1970-01-01")
date_val <- format(dates, '%d-%m-%Y')
weather.forecast.data <- data.frame(dateTime = dates, date = date_val)
weather.forecast.data <- rbind(weather.forecast.data, weather.forecast.data, weather.forecast.data, weather.forecast.data)
weather.forecast.data$id <- c(rep('GH1', 288), rep('GH2', 288), rep('GH3', 288), rep('GH4', 288))
weather.forecast.data$radiation <- round(runif(nrow(weather.forecast.data)), 2)
weather.forecast.data$hour <- as.integer(format(weather.forecast.data$dateTime, '%H'))
weather.forecast.data$day_night <- ifelse(weather.forecast.data$hour < 6, 'night', ifelse(weather.forecast.data$hour < 19, 'day', 'night'))
# # GH2: Total Morning missing # #
weather.forecast.data$radiation[(weather.forecast.data$id == 'GH2') & (weather.forecast.data$date == '30-10-2018') & (weather.forecast.data$day_night == 'day')] = NA
weather.forecast.data$hour <- NULL
weather.forecast.data$day_night <- NULL
My task is to remove ids from weather.forecast.data where, for a given id and date, the morning half (06 to 18 hours) has all radiation values missing (NA), using dplyr in R.
That is, I want to eliminate the rows for a given id and date whose entire morning radiation is missing: if an id has the whole morning radiation missing on a date, I drop all the rows with that particular id and date, i.e. all 144 records for that day.
We can see that GH2 has the entire morning radiation missing on 30-10-2018. We therefore drop all 144 records with id == 'GH2' and date == '30-10-2018'.
setDT(weather.forecast.data)
weather.forecast.data[, sum(is.na(radiation)), .(id, date)]
id date V1
1: GH1 30-10-2018 0
2: GH1 31-10-2018 0
3: GH2 30-10-2018 78
4: GH2 31-10-2018 0
5: GH3 30-10-2018 0
6: GH3 31-10-2018 0
7: GH4 30-10-2018 0
8: GH4 31-10-2018 0
I already have working code using data.table:
setDT(weather.forecast.data)
weather.forecast.data[, hour:= hour(dateTime)]
weather.forecast.data[, day_night:=c("night", "day")[(6 <= hour & hour < 19) + 1L]]
weather.forecast.data[, date_id := paste(date, id, sep = "__")]
weather.forecast.data[, all_is_na := all(is.na(radiation)), .(date_id, day_night)]
weather.forecast.data[!(date_id %in% unique(weather.forecast.data[(all_is_na == TRUE) & (day_night == 'day'), date_id]))]
I need the code using dplyr, and I have tried the following, but it drops more rows than required:
library(dplyr)
weather.forecast.data <- weather.forecast.data %>%
mutate(hour = as.integer(format(dateTime, '%H'))) %>%
mutate(day_night = ifelse(hour < 6, 'night', ifelse(hour < 19, 'day', 'night'))) %>%
group_by(date, day_night, id) %>%
filter((!all(is.na(radiation))) & (day_night == 'day')) %>%
select (-c(hour, day_night)) %>%
as.data.frame
Note: the output should return the data with the rows where id == 'GH2' and date == '30-10-2018' dropped.
I believe you are overcomplicating this a bit. The following code does what you describe in the question:
library(lubridate)
library(dplyr)
weather.forecast.data %>%
mutate(hour = hour(dateTime),
day_night = c("night", "day")[(6 <= hour & hour < 19) + 1L]) %>%
group_by(date, id) %>%
mutate(delete = !all(is.na(radiation[day_night == "day"]))) %>%
ungroup() %>%
filter(delete) %>%
select(-hour, -day_night, -delete) %>%
as.data.frame() -> df1
Check that it worked by verifying the expected 144 deleted rows:
nrow(weather.forecast.data) - nrow(df1)
#[1] 144
Data.
I repost the data generation code, simplified in two places and with a call to set.seed.
set.seed(4192)
start_date <- "2018-10-30 00:00:00"
start_date <- as.POSIXct(start_date, origin="1970-01-01")
dates <- start_date + minutes(0:287 * 10)
dates <- as.POSIXct(dates, origin="1970-01-01")
date_val <- format(dates, '%d-%m-%Y')
weather.forecast.data <- data.frame(dateTime = dates, date = date_val)
weather.forecast.data <- rbind(weather.forecast.data, weather.forecast.data, weather.forecast.data, weather.forecast.data)
weather.forecast.data$id <- c(rep('GH1', 288), rep('GH2', 288), rep('GH3', 288), rep('GH4', 288))
weather.forecast.data$radiation <- round(runif(nrow(weather.forecast.data)), 2)
weather.forecast.data$hour <- hour(weather.forecast.data$dateTime)
weather.forecast.data$day_night <- ifelse(weather.forecast.data$hour < 6, 'night', ifelse(weather.forecast.data$hour < 19, 'day', 'night'))
# # GH2: Total Morning missing # #
weather.forecast.data$radiation[(weather.forecast.data$id == 'GH2') & (weather.forecast.data$date == '30-10-2018') & (weather.forecast.data$day_night == 'day')] = NA
weather.forecast.data$hour <- NULL
weather.forecast.data$day_night <- NULL
Your filter keeps only the rows where day_night == 'day', which is why too many rows are dropped. If I understood you correctly, you want the following:
library(dplyr)
weather.forecast.data <- weather.forecast.data %>%
  mutate(hour = as.integer(format(dateTime, '%H'))) %>%
  mutate(day_night = ifelse(hour < 6, 'night', ifelse(hour < 19, 'day',
                                                      'night'))) %>%
  # Group by id and date so the whole day is dropped when its daytime
  # radiation is entirely missing.
  group_by(date, id) %>%
  filter(!all(is.na(radiation[day_night == 'day']))) %>%
  ungroup() %>%
  select(-c(hour, day_night)) %>%
  as.data.frame
This removes every row for an id and date whose daytime radiation values are all NA (here, the 144 GH2 rows on 30-10-2018).
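A quick sanity check, assuming the generated weather.forecast.data above (the GH2 rows for 30-10-2018 should all be gone):
sum(weather.forecast.data$id == 'GH2' & weather.forecast.data$date == '30-10-2018')
# expected: 0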