BACKGROUD
dplyr has window functions. When you want to control the order of window functions,
you can use order_by.
DATA
mydf <- data.frame(id = c("ana", "bob", "caroline",
"bob", "ana", "caroline"),
order = as.POSIXct(c("2015-01-01 18:00:00", "2015-01-01 18:05:00",
"2015-01-01 19:20:00", "2015-01-01 09:07:00",
"2015-01-01 08:30:00", "2015-01-01 11:11:00"),
format = "%Y-%m-%d %H:%M:%S"),
value = runif(6, 10, 20),
stringsAsFactors = FALSE)
# id order value
#1 ana 2015-01-01 18:00:00 19.00659
#2 bob 2015-01-01 18:05:00 13.64010
#3 caroline 2015-01-01 19:20:00 12.08506
#4 bob 2015-01-01 09:07:00 14.40996
#5 ana 2015-01-01 08:30:00 17.45165
#6 caroline 2015-01-01 11:11:00 14.50865
Suppose you want to use lag(), you can do the following.
arrange(mydf, id, order) %>%
group_by(id) %>%
mutate(check = lag(value))
# id order value check
#1 ana 2015-01-01 08:30:00 17.45165 NA
#2 ana 2015-01-01 18:00:00 19.00659 17.45165
#3 bob 2015-01-01 09:07:00 14.40996 NA
#4 bob 2015-01-01 18:05:00 13.64010 14.40996
#5 caroline 2015-01-01 11:11:00 14.50865 NA
#6 caroline 2015-01-01 19:20:00 12.08506 14.50865
However, you can avoid using arrange() with order_by().
group_by(mydf, id) %>%
mutate(check = lag(value, order_by = order))
# id order value check
#1 ana 2015-01-01 18:00:00 19.00659 17.45165
#2 bob 2015-01-01 18:05:00 13.64010 14.40996
#3 caroline 2015-01-01 19:20:00 12.08506 14.50865
#4 bob 2015-01-01 09:07:00 14.40996 NA
#5 ana 2015-01-01 08:30:00 17.45165 NA
#6 caroline 2015-01-01 11:11:00 14.50865 NA
EXPERIMENT
I wanted to apply the same procedure to the case in which I wanted
to assign row number to a new column. Using the sample data, you can do the folowing.
group_by(mydf, id) %>%
arrange(order) %>%
mutate(num = row_number())
# id order value num
#1 ana 2015-01-01 08:30:00 17.45165 1
#2 ana 2015-01-01 18:00:00 19.00659 2
#3 bob 2015-01-01 09:07:00 14.40996 1
#4 bob 2015-01-01 18:05:00 13.64010 2
#5 caroline 2015-01-01 11:11:00 14.50865 1
#6 caroline 2015-01-01 19:20:00 12.08506 2
Can we omit the arrange line? Seeing the CRAN manual, I did the following.
Both attempts were not successful.
### Not working
group_by(mydf, id) %>%
mutate(num = row_number(order_by = order))
### Not working
group_by(mydf, id) %>%
mutate(num = order_by(order, row_number()))
How can we achieve this?
I did not mean to answer this question by myself. But, I decided to share
what I found given I have not seen many posts using order_by and particularly
with_order. My answer was to use with_order() instead of order_by().
group_by(mydf, id) %>%
mutate(num = with_order(order_by = order, fun = row_number, x = order))
# id order value num
#1 ana 2015-01-01 18:00:00 19.00659 2
#2 bob 2015-01-01 18:05:00 13.64010 2
#3 caroline 2015-01-01 19:20:00 12.08506 2
#4 bob 2015-01-01 09:07:00 14.40996 1
#5 ana 2015-01-01 08:30:00 17.45165 1
#6 caroline 2015-01-01 11:11:00 14.50865 1
I wanted to see if there would be any difference between the two
approaches in terms of speed. It seems that they are pretty similar in this case.
library(microbenchmark)
mydf2 <- data.frame(id = rep(c("ana", "bob", "caroline",
"bob", "ana", "caroline"), times = 200000),
order = seq(as.POSIXct("2015-03-01 18:00:00", format = "%Y-%m-%d %H:%M:%S"),
as.POSIXct("2015-01-01 18:00:00", format = "%Y-%m-%d %H:%M:%S"),
length.out = 1200000),
value = runif(1200000, 10, 20),
stringsAsFactors = FALSE)
jazz1 <- function() {group_by(mydf2, id) %>%
arrange(order) %>%
mutate(num = row_number())}
jazz2 <- function() {group_by(mydf2, id) %>%
mutate(num = with_order(order_by = order, fun = row_number, x = order))}
res <- microbenchmark(jazz1, jazz2, times = 1000000L)
res
#Unit: nanoseconds
# expr min lq mean median uq max neval cld
# jazz1 32 36 47.17647 38 47 12308 1e+06 a
# jazz2 32 36 47.02902 38 47 12402 1e+06 a
Related
I am working on a project where I have to only include patients who had lab tests ordered at least 12 hours apart, and to keep the timestamp of each included lab test. The issue is that many patients get several labs done within the 12 hour window, but the client has asked to not include those tests. I have made it this far:
#Create dummy dataset
df = data.frame(
"Encounter" = c(rep("12345", times=16), rep("67890", times = 5)),
"Timestamp" = c("01/06/2022 04:00:00", "01/07/2022 08:00:00",
"01/08/2022 00:00:00", "01/08/2022 04:00:00",
"01/08/2022 08:00:00", "01/08/2022 20:00:00",
"01/09/2022 04:00:00", "01/09/2022 08:00:00",
"01/09/2022 20:00:00", "01/09/2022 23:26:00",
"01/10/2022 00:00:00", "01/10/2022 08:00:00",
"01/10/2022 20:00:00", "01/11/2022 00:00:00",
"01/11/2022 20:00:00", "01/12/2022 04:00:00",
"11/10/2021 11:00:00", "11/10/2021 12:00:00",
"11/10/2021 13:00:00", "11/10/2021 14:00:00",
"11/11/2021 00:00:00"))
#Convert timestamp to POSIXlt format
df$Timestamp <- strptime(as.character(df$Timestamp), format="%m/%d/%Y %H:%M")
#Calculate time (in hours) between each previous timestamp by Encounter
df <- df %>%
group_by(Encounter) %>%
arrange(Encounter, Timestamp) %>%
mutate(difftime(Timestamp, lag(Timestamp), units="hours"))
I can't seem to figure out what to do next. It seems like I need to calculate a rolling 12-hours that then resets to 0 once a row hits 12 hours, but I'm not sure how to go about it. Below is my ideal result:
df$Keep.Row <- c(1,1,1,0,0,1,0,1,1,0,0,1,1,0,1,0,1,0,0,0,1)
There is absolutely nothing elegant about this, but I believe it gives you what you’re looking for. I use a temporary variable to store the “rolling” sum before it’s reset once the hours between is 12 or greater.
library(tidyverse)
df <- df %>%
group_by(Encounter) %>%
arrange(Encounter, Timestamp) %>%
mutate(time_diff = difftime(Timestamp, lag(Timestamp), units="hours")) %>%
replace_na(list(time_diff = 0)) %>%
mutate(temp = ifelse(time_diff < 12 & lag(time_diff) >= 12, time_diff, lag(time_diff) + time_diff),
temp = ifelse(is.na(temp), 0, temp),
hours_between = ifelse(time_diff >= 12, time_diff,
ifelse(time_diff < 12 & lag(time_diff) >= 12, time_diff, lag(temp) + time_diff)),
keep = ifelse(hours_between >= 12 | is.na(hours_between), 1, 0)) %>%
select(-temp)
Created on 2022-01-27 by the reprex package (v2.0.1)
Here is an alternative option using accumulate. Here, you can use you differences, and once they exceed the threshold of 12 hours, reset by just using the diff value (starting over) instead of using the cumulative sum. To include the first time for each Encounter, you can either make that diff 12 hours, or add a separate mutate and check where Timestamp == first(Timestamp) and in those cases set keep to 1.
library(tidyverse)
thresh <- 12
df %>%
group_by(Encounter) %>%
arrange(Encounter, Timestamp) %>%
mutate(diff = difftime(Timestamp, lag(Timestamp, default = first(Timestamp) - (thresh * 60 * 60)), units = "hours"),
keep = +(accumulate(diff, ~if_else(.x >= thresh, .y, .x + .y)) >= thresh))
Output
Encounter Timestamp diff keep
<chr> <dttm> <drtn> <int>
1 12345 2022-01-06 04:00:00 12.0000000 hours 1
2 12345 2022-01-07 08:00:00 28.0000000 hours 1
3 12345 2022-01-08 00:00:00 16.0000000 hours 1
4 12345 2022-01-08 04:00:00 4.0000000 hours 0
5 12345 2022-01-08 08:00:00 4.0000000 hours 0
6 12345 2022-01-08 20:00:00 12.0000000 hours 1
7 12345 2022-01-09 04:00:00 8.0000000 hours 0
8 12345 2022-01-09 08:00:00 4.0000000 hours 1
9 12345 2022-01-09 20:00:00 12.0000000 hours 1
10 12345 2022-01-09 23:26:00 3.4333333 hours 0
11 12345 2022-01-10 00:00:00 0.5666667 hours 0
12 12345 2022-01-10 08:00:00 8.0000000 hours 1
13 12345 2022-01-10 20:00:00 12.0000000 hours 1
14 12345 2022-01-11 00:00:00 4.0000000 hours 0
15 12345 2022-01-11 20:00:00 20.0000000 hours 1
16 12345 2022-01-12 04:00:00 8.0000000 hours 0
17 67890 2021-11-10 11:00:00 12.0000000 hours 1
18 67890 2021-11-10 12:00:00 1.0000000 hours 0
19 67890 2021-11-10 13:00:00 1.0000000 hours 0
20 67890 2021-11-10 14:00:00 1.0000000 hours 0
21 67890 2021-11-11 00:00:00 10.0000000 hours 1
Probably missing something, but wouldn't this work:
library(dplyr)
df %>%
group_by(Encounter) %>%
arrange(Encounter, Timestamp) %>%
mutate(time_dif = difftime(Timestamp, lag(Timestamp), units="hours")) %>%
filter(time_dif > 12)
I have a table that looks like this;
user_id timestamp
aa 2018-01-01 12:01 UTC
ab 2018-01-01 05:01 UTC
bb 2018-06-01 09:01 UTC
bc 2018-03-03 23:01 UTC
cc 2018-01-02 11:01 UTC
I have another table that has every week in 2018.
week_id week_start week_end
1 2018-01-01 2018-01-07
2 2018-01-08 2018-01-15
3 2018-01-16 2018-01-23
4 2018-01-23 2018-01-30
... ... ...
Assume the week_start is a Monday and week_end is a Sunday.
I'd like to do two things. I'd first like to join the week_id to the first table and then I'd like to assign a day to each of the timestamps. My output would look like this:
user_id timestamp week_id day_of_week
aa 2018-01-01 12:01 UTC 1 Monday
ab 2018-01-02 05:01 UTC 1 Tuesday
bb 2018-01-13 09:01 UTC 2 Friday
bc 2018-01-28 23:01 UTC 4 Friday
cc 2018-01-06 11:01 UTC 1 Saturday
In Excel I could easily do this with a vlookup. My main interest is to learn how to join tables in cases like this. For that reason, I won't accept answers that use the weekday function.
Here are both of the tables in a more accessible format.
user_id <- c("aa", "ab", "bb", "bc", "cc")
timestamp <- c("2018-01-01 12:01", "2018-01-01 05:01", "2018-06-01 09:01", "2018-03-03 23:01", "2018-01-02 11:01")
week_id <- seq(1,52)
week_start <- seq(as.Date("2018-01-01"), as.Date("2018-12-31"), 7)
week_end <- week_start + 6
week_start <- week_start[1:52]
week_end <- week_end[1:52]
table1 <- data.frame(user_id, timestamp)
table2 <- data.frame(week_id, week_start, week_end)
Using SQL one can join two tables on a range like this. This seems the most elegant solution expressing our intent directly but we also provide some alternatives further below.
library(sqldf)
DF1$date <- as.Date(DF1$timestamp)
sqldf("select *
from DF1 a
left join DF2 b on date between week_start and week_end")
giving:
user_id timestamp date week_id week_start week_end
1 aa 2018-01-01 12:01:00 2018-01-01 1 2018-01-01 2018-01-07
2 ab 2018-01-01 05:01:00 2018-01-01 1 2018-01-01 2018-01-07
3 bb 2018-06-01 09:01:00 2018-06-01 NA <NA> <NA>
4 bc 2018-03-03 23:01:00 2018-03-04 NA <NA> <NA>
5 cc 2018-01-02 11:01:00 2018-01-02 1 2018-01-01 2018-01-07
dplyr
In a comment the poster asked for whether it could be done in dplyr. It can't be done directly since dplyr does not support complex joins but a workaound would be to do a full cross join of the two data frames which gives rise to an nrow(DF1) * nrow(DF2) intermediate result and then filter this down. dplyr does not directly support cross joins but we can simulate one by doing a full join on an identical dummy constant column that is appended to both data frames in the full join. Since we actually need a right join here to add back the unmatched rows, we do a final right join with the original DF1 data frame. Obviously this is entirely impractical for sufficiently large inputs but for the small input here we can do it. If it were known that there is a match in DF2 to every row in DF1 then the right_join at the end could be omitted.
DF1 %>%
mutate(date = as.Date(timestamp), dummy = 1) %>%
full_join(DF2 %>% mutate(dummy = 1)) %>%
filter(date >= week_start & date <= week_end) %>%
select(-dummy) %>%
right_join(DF1)
R Base
findix finds the index in DF2 corresponding to a date d. We then sapply it over the dates corresponding to rows of DF1 and put DF1 and the corresponding DF2 row together.
findix <- function(d) c(which(d >= DF2$week_start & d <= DF2$week_end), NA)[1]
cbind(DF1, DF2[sapply(as.Date(DF1$timestamp), findix), ])
Note
The input data in reproducible form used is:
Lines1 <- "user_id timestamp
aa 2018-01-01 12:01 UTC
ab 2018-01-01 05:01 UTC
bb 2018-06-01 09:01 UTC
bc 2018-03-03 23:01 UTC
cc 2018-01-02 11:01 UTC"
DF1 <- read.csv(text = gsub(" +", ",", Lines1), strip.white = TRUE)
DF1$timestamp <- as.POSIXct(DF1$timestamp)
Lines2 <- "week_id week_start week_end
1 2018-01-01 2018-01-07
2 2018-01-08 2018-01-15
3 2018-01-16 2018-01-23
4 2018-01-23 2018-01-30"
DF2 <- read.table(text = Lines2, header = TRUE)
DF2$week_start <- as.Date(DF2$week_start)
DF2$week_end <- as.Date(DF2$week_end)
This is a case for the fuzzyjoin-package. With the match_fun- argument we can specify conditions for each column. In this case table1$date >= table2$week_start and table1$date <= table2$week_end.
library(fuzzyjoin)
library(lubridate)
table1$date <- as.Date(table1$timestamp)
fuzzy_left_join(table1, table2,
by = c("date" = "week_start", "date" = "week_end"),
match_fun = list(`>=`, `<=`)) %>%
mutate(day_of_week = wday(date, label = TRUE)) %>%
select(user_id, timestamp, week_id, day_of_week)
user_id timestamp week_id day_of_week
1 aa 2018-01-01 12:01 1 Mo
2 ab 2018-01-01 05:01 1 Mo
3 bb 2018-06-01 09:01 22 Fr
4 bc 2018-03-03 23:01 9 Sa
5 cc 2018-01-02 11:01 1 Di
I'm also a smartass because I didn't use the weekday-function but wday from the lubridate-package.
I have a dataframe. Some dates fall on the weekend. However I would like to change all weekend dates to the past Friday.
as.Date(aapl_earnings$Date, "%Y/%m/%d")
[1] "2018-04-30" "2018-01-31" "2017-11-01" "2017-07-31" "2017-05-01" "2017-01-30" "2016-10-24"
[8] "2016-07-25" "2016-04-25" "2016-01-25" "2015-10-26" "2015-07-20" "2015-04-26" "2015-01-26"
[15] "2014-10-19" "2014-07-21" "2014-04-22" "2014-01-26" "2013-10-27"
We can use a nested ifelse here and check the day of the week using weekdays and adjust the date accordingly.
dates <- weekdays(as.Date(x))
as.Date(ifelse(dates == "Saturday", x - 1,
ifelse(dates == "Sunday", x - 2, x)), origin = "1970-01-01")
#[1]"2018-04-30" "2018-01-31" "2017-11-01" "2017-07-31" "2017-05-01" "2017-01-30"
#[7]"2016-10-24" "2016-07-25" "2016-04-25" "2016-01-25" "2015-10-26" "2015-07-20"
#[13]"2015-04-24" "2015-01-26" "2014-10-17" "2014-07-21" "2014-04-22" "2014-01-24"
#[19]"2013-10-25"
Or we can also use case_when from dplyr which is more verbose.
library(dplyr)
aapl_earnings <- data.frame(Date = as.Date(x))
aapl_earnings %>%
mutate(date = weekdays(Date),
new_date = case_when(date == "Saturday" ~ Date - 1,
date == "Sunday" ~ Date - 2,
TRUE ~ Date)) %>%
select(-date)
# Date new_date
#1 2018-04-30 2018-04-30
#2 2018-01-31 2018-01-31
#3 2017-11-01 2017-11-01
#4 2017-07-31 2017-07-31
#5 2017-05-01 2017-05-01
#6 2017-01-30 2017-01-30
#7 2016-10-24 2016-10-24
#8 2016-07-25 2016-07-25
#9 2016-04-25 2016-04-25
#10 2016-01-25 2016-01-25
#11 2015-10-26 2015-10-26
#12 2015-07-20 2015-07-20
#13 2015-04-26 2015-04-24
#14 2015-01-26 2015-01-26
#15 2014-10-19 2014-10-17
#16 2014-07-21 2014-07-21
#17 2014-04-22 2014-04-22
#18 2014-01-26 2014-01-24
#19 2013-10-27 2013-10-25
data
x <- c("2018-04-30","2018-01-31","2017-11-01","2017-07-31","2017-05-01",
"2017-01-30","2016-10-24","2016-07-25","2016-04-25","2016-01-25","2015-10-26",
"2015-07-20","2015-04-26","2015-01-26" ,"2014-10-19","2014-07-21","2014-04-22",
"2014-01-26", "2013-10-27")
A data frame like below. 3 staffs have hourly readings in days, but incomplete (every staff shall have 24 readings a day).
Understand that staffs had different number of readings on the days. Now only interested in the staff with most readings in the day.
There are many days. It’s wanted to insert the missing (hourly) rows for the most ones on the days. That is, 2018-03-02 to insert only for Jack’s, 2018-03-03 only for David and 2018-03-04 only for Kate.
I tried these lines from this question (even though they fill all without differentiation) but not getting there.
How can it be done in R?
date_time <- c("2/3/2018 0:00","2/3/2018 1:00","2/3/2018 2:00","2/3/2018 3:00","2/3/2018 5:00","2/3/2018 6:00","2/3/2018 7:00","2/3/2018 8:00","2/3/2018 9:00","2/3/2018 10:00","2/3/2018 11:00","2/3/2018 12:00","2/3/2018 13:00","2/3/2018 14:00","2/3/2018 16:00","2/3/2018 17:00","2/3/2018 18:00","2/3/2018 19:00","2/3/2018 21:00","2/3/2018 22:00","2/3/2018 23:00","3/3/2018 0:00","3/3/2018 0:00","3/3/2018 1:00","3/3/2018 2:00","3/3/2018 4:00","3/3/2018 5:00","3/3/2018 7:00","3/3/2018 8:00","3/3/2018 9:00","3/3/2018 11:00","3/3/2018 12:00","3/3/2018 14:00","3/3/2018 15:00","3/3/2018 17:00","3/3/2018 18:00","3/3/2018 20:00","3/3/2018 22:00","3/3/2018 23:00","4/3/2018 0:00","4/3/2018 0:00","4/3/2018 1:00","4/3/2018 2:00","4/3/2018 3:00","4/3/2018 5:00","4/3/2018 6:00","4/3/2018 7:00","4/3/2018 8:00","4/3/2018 10:00","4/3/2018 11:00","4/3/2018 12:00","4/3/2018 14:00","4/3/2018 15:00","4/3/2018 16:00","4/3/2018 17:00","4/3/2018 19:00","4/3/2018 20:00","4/3/2018 22:00","4/3/2018 23:00")
staff <- c("Jack","Jack","Kate","Jack","Jack","Jack","Jack","Jack","Jack","Jack","Jack","Jack","Kate","Jack","Jack","Jack","David","David","Jack","Kate","David","David","David","David","David","David","David","David","David","David","David","David","David","David","David","David","David","Jack","Kate","David","David","Kate","Kate","Kate","Kate","Kate","Kate","Kate","Kate","Kate","Kate","Kate","Kate","Kate","Kate","Kate","Kate","Kate","Jack")
reading <- c(7.5,8.3,7,6.9,7.1,8.1,8.4,8.8,6,7.1,8.9,7.3,7.4,6.9,11.3,18.8,4.6,6.7,7.7,7.8,7,7,6.6,6.8,6.7,6.1,7.1,6.3,7.2,6,5.8,6.6,6.5,6.4,7.2,8.4,6.5,6.5,5.5,6.7,7,7.5,6.5,7.5,7.2,6.3,7.3,8,7,8.2,6.5,6.8,7.5,7,6.1,5.7,6.7,4.3,6.3)
df <- data.frame(date_time, staff, reading)
The option would be to do this separately. Create a data.table of the dates of interest and the corresponding 'staff', and get the full sequence of date time, then we rbind this with the original dataset and using a condition, we summarise the data
library(data.table)
stf <- c("Jack", "David", "Kate")
date <- as.Date(c("2018-03-02", "2018-03-03", "2018-03-04"))
df1 <- data.table(date, staff= stf)[, .(date_time = seq(as.POSIXct(paste(date, "00:00:00"),
tz = "GMT"),
length.out = 24, by = "1 hour")), staff]
setDT(df)[, date_time := as.POSIXct(date_time, "%d/%m/%Y %H:%M", tz = "GMT")]
res <- rbindlist(list(df, df1), fill = TRUE)[,
.(reading = if(any(is.na(reading))) sum(reading, na.rm = TRUE) else reading),
.(staff, date_time)]
table(res$staff, as.Date(res$date_time))
# 2018-03-02 2018-03-03 2018-03-04
# David 3 24 2
# Jack 24 1 1
# Kate 3 1 24
head(res)
# staff date_time reading
#1: Jack 2018-03-02 00:00:00 7.5
#2: Jack 2018-03-02 01:00:00 8.3
#3: Kate 2018-03-02 02:00:00 7.0
#4: Jack 2018-03-02 03:00:00 6.9
#5: Jack 2018-03-02 05:00:00 7.1
#6: Jack 2018-03-02 06:00:00 8.1
tail(res)
# staff date_time reading
#1: Kate 2018-03-04 04:00:00 0
#2: Kate 2018-03-04 09:00:00 0
#3: Kate 2018-03-04 13:00:00 0
#4: Kate 2018-03-04 18:00:00 0
#5: Kate 2018-03-04 21:00:00 0
#6: Kate 2018-03-04 23:00:00 0
Try this code:
Identify each daily hour and all staff members
date_h<-seq(as.POSIXlt(min(date_time),format="%d/%m/%Y %H:%M"),as.POSIXlt(max(date_time),format="%d/%m/%Y %H:%M"),by=60*60)
staff_u<-unique(staff)
comb<-expand.grid(staff_u,date_h)
colnames(comb)<-c("staff","date_time")
Uniform date format in df
df$date_time<-as.POSIXlt(df$date_time,format="%d/%m/%Y %H:%M")
Merge information
out<-merge(comb,df,all.x=T)
Your output:
head(out)
staff date_time reading
1 Jack 2018-03-02 00:00:00 7.5
2 Jack 2018-03-02 01:00:00 8.3
3 Jack 2018-03-02 02:00:00 NA
4 Jack 2018-03-02 03:00:00 6.9
5 Jack 2018-03-02 04:00:00 NA
6 Jack 2018-03-02 05:00:00 7.1
Data:
DB1 <- data.frame(orderItemID = 1:10,
orderDate = c("2013-01-21","2013-03-31","2013-04-12","2013-06-01","2014-01-01", "2014-02-19","2014-02-27","2014-10-02","2014-10-31","2014-11-21"),
deliveryDate = c("2013-01-23", "2013-03-01", "NA", "2013-06-04", "2014-01-03", "NA", "2014-02-28", "2014-10-04", "2014-11-01", "2014-11-23"))
Expected Outcome:
DB1 <- data.frame(orderItemID = 1:10,
orderDate= c("2013-01-21","2013-03-31","2013-04-12","2013-06-01","2014-01-01", "2014-02-19","2014-02-27","2014-10-02","2014-10-31","2014-11-21"),
deliveryDate = c("2013-01-23", "2013-03-01", "2013-04-14", "2013-06-04", "2014-01-03", "2014-02-21", "2014-02-28", "2014-10-04", "2014-11-01", "2014-11-23"))
My question is similar to another one I posted: so don´t be confused.
As you can see above I have some missing values in the delivery dates and I want to replace them by another date. That date should be the order date of the specific item + the average delivery time in (full) days.(2days)
The average delivery time is the time calculated from the average value of all samples that do not contain Missing values = (2days+1day+3days+2days+1day+2days+1day+2days):8=1,75
So I want to replace the NA in delivery time with the order date +2days. When there´s no NA, the date should stay the same.
I tried this already (with lubridate), but it´s not working :(
DB1$deliveryDate[is.na(DB1$deliveryDate) ] <- DB1$orderDate + days(2)
Can someone plz help me?
First, convert the columns to Date objects:
DB1[,2:3]<-lapply(DB1[,2:3],as.Date)
Then, replace the NA elements:
DB1$deliveryDate[is.na(DB1$deliveryDate)] <-
DB1$orderDate[is.na(DB1$deliveryDate)] +
mean(difftime(DB1$orderDate,DB1$deliveryDate,units="days"),na.rm=TRUE)
# orderItemID orderDate deliveryDate
#1 1 2013-01-21 2013-01-23
#2 2 2013-03-31 2013-03-01
#3 3 2013-04-12 2013-04-14
#4 4 2013-06-01 2013-06-04
#5 5 2014-01-01 2014-01-03
#6 6 2014-02-19 2014-02-21
#7 7 2014-02-27 2014-02-28
#8 8 2014-10-02 2014-10-04
#9 9 2014-10-31 2014-11-01
#10 10 2014-11-21 2014-11-23
You can do:
DB1 =cbind(DB1$orderItemID,as.data.frame(lapply(DB1[-1], as.character)))
days = round(mean(DB1$deliveryDate-DB1$orderDate, na.rm=T))
mask = is.na(DB1$deliveryDate)
DB1$deliveryDate[mask] = DB1$orderDate[mask]+days
# DB1$orderItemID orderDate deliveryDate
#1 1 2013-01-21 2013-01-23
#2 2 2013-03-31 2013-04-01
#3 3 2013-04-12 2013-04-14
#4 4 2013-06-01 2013-06-04
#5 5 2014-01-01 2014-01-03
#6 6 2014-02-19 2014-02-21
#7 7 2014-02-27 2014-02-28
#8 8 2014-10-02 2014-10-04
#9 9 2014-10-31 2014-11-01
#10 10 2014-11-21 2014-11-23
I re-arrange your data since they were not clean:
DB1 <- data.frame(orderItemID = 1:10,
orderDate = c("2013-01-21","2013-03-31","2013-04-12","2013-06-01","2014-01-01", "2014-02-19","2014-02-27","2014-10-02","2014-10-31","2014-11-21"),
deliveryDate = c("2013-01-23", "2013-04-01", NA, "2013-06-04", "2014-01-03", NA, "2014-02-28", "2014-10-04", "2014-11-01", "2014-11-23"))
Assuming that you have entered your data like this (note that NAs are not enclosed in quotes so they are read as NAs and not "NA")...
DB1 <- data.frame(orderItemID = 1:10,
orderDate = c("2013-01-21","2013-03-31","2013-04-12","2013-06-01","2014-01-01", "2014-02-19","2014-02-27","2014-10-02","2014-10-31","2014-11-21"),
deliveryDate = c("2013-01-23", "2013-03-01", NA, "2013-06-04", "2014-01-03", NA, "2014-02-28", "2014-10-04", "2014-11-01", "2014-11-23"),
stringsAsFactors = FALSE)
...and, per Nicola's answer, done this to get the formatting right...
DB1[,2:3]<-lapply(DB1[,2:3],as.Date)
...this also works:
library(lubridate)
DB1$deliveryDate <- with(DB1, as.Date(ifelse(is.na(deliveryDate), orderDate + days(2), deliveryDate), origin = "1970-01-01"))
Or you could use dplyr and pipe it:
library(lubridate)
library(dplyr)
DB2 <- DB1 %>%
mutate(deliveryDate = ifelse(is.na(deliveryDate), orderDate + days(2), deliveryDate)) %>%
mutate(deliveryDate = as.Date(.[,"deliveryDate"], origin = "1970-01-01"))