Create balanced data set - r

I am using R and have a long data set as the one outlined below:
Date ID Status
2014-10-01 12 1
2015-04-01 12 1
2015-07-01 12 1
2015-09-01 12 1
2015-11-01 12 0
2016-01-01 12 0
2016-05-01 12 0
2016-08-01 12 1
2017-03-01 12 1
2017-05-01 12 1
2014-10-01 13 1
2015-04-01 13 1
2015-07-01 13 0
2015-11-01 14 0
2016-01-01 14 0
...
My goal is to create a "balanced" data set, i.e. each ID should occur for each of the 10 dates. The variable "Status" for the initially non-occurring observations should be labeled as NA. In other words, the outcome should look like this:
Date ID Status
2014-10-01 12 1
2015-04-01 12 1
2015-07-01 12 1
2015-09-01 12 1
2015-11-01 12 0
2016-01-01 12 0
2016-05-01 12 0
2016-08-01 12 1
2017-03-01 12 1
2017-05-01 12 1
2014-10-01 13 1
2015-04-01 13 1
2015-07-01 13 N/A
2015-09-01 13 N/A
2015-11-01 13 N/A
2016-01-01 13 N/A
2016-05-01 13 N/A
2016-08-01 13 N/A
2017-03-01 13 N/A
2017-05-01 13 N/A
2014-10-01 14 N/A
2015-04-01 14 N/A
2015-07-01 14 N/A
2015-09-01 14 N/A
2015-11-01 14 0
2016-01-01 14 0
2016-05-01 14 N/A
2016-08-01 14 N/A
2017-03-01 14 N/A
2017-05-01 14 N/A
...
Thank you for your help!

Here is an approach using tidyverse:
library(tidyverse)
# Within each ID, expand() builds every Date that appears anywhere in df;
# left_join() then attaches the observed Status, leaving NA for the
# ID/Date combinations that were absent from the original data.
df1 <- df %>%
  group_by(ID) %>%
  expand(Date) %>%
  left_join(df)
or save to original object (thanks to Renu's comment):
# NOTE: the assignment pipe `%<>%` comes from magrittr and is NOT attached
# by library(tidyverse), so it must be loaded explicitly for this to run.
library(magrittr)
df %<>%
group_by(ID) %>%
expand(Date) %>% # in each ID expand the dates
left_join(df)
equivalent is:
# Same pipeline using the right-assignment operator: the expanded and
# re-joined result is written back over the original df.
df %>%
group_by(ID) %>%
expand(Date) %>% #in each id expand the dates
left_join(df) -> df
The result:
ID Date Status
1 12 2014-10-01 1
2 12 2015-04-01 1
3 12 2015-07-01 1
4 12 2015-09-01 1
5 12 2015-11-01 0
6 12 2016-01-01 0
7 12 2016-05-01 0
8 12 2016-08-01 1
9 12 2017-03-01 1
10 12 2017-05-01 1
11 13 2014-10-01 1
12 13 2015-04-01 1
13 13 2015-07-01 0
14 13 2015-09-01 NA
15 13 2015-11-01 NA
16 13 2016-01-01 NA
17 13 2016-05-01 NA
18 13 2016-08-01 NA
19 13 2017-03-01 NA
20 13 2017-05-01 NA
21 14 2014-10-01 NA
22 14 2015-04-01 NA
23 14 2015-07-01 NA
24 14 2015-09-01 NA
25 14 2015-11-01 0
26 14 2016-01-01 0
27 14 2016-05-01 NA
28 14 2016-08-01 NA
29 14 2017-03-01 NA
30 14 2017-05-01 NA
the data:
> dput(df)
# Reproducible sample data. Note that Date is stored as a factor whose 10
# levels hold the date strings (convert with as.Date() if real dates are needed).
structure(list(Date = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 5L, 6L), .Label = c("2014-10-01", "2015-04-01",
"2015-07-01", "2015-09-01", "2015-11-01", "2016-01-01", "2016-05-01",
"2016-08-01", "2017-03-01", "2017-05-01"), class = "factor"),
ID = c(12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
13L, 13L, 13L, 14L, 14L), Status = c(1L, 1L, 1L, 1L, 0L,
0L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L)), .Names = c("Date",
"ID", "Status"), class = "data.frame", row.names = c(NA, -15L
))

The following worked for me:
# Build the full Date x ID grid by hand, then join the observed rows onto it.
# Fix: the question's data uses capitalised column names (Date, ID); the grid
# must use the same names or left_join() will find no common join variables.
df_b <- data.frame(Date = rep(unique(df$Date), length(unique(df$ID))),
                   ID = rep(unique(df$ID), each = length(unique(df$Date))))
balanced_data <- left_join(df_b, df)

Related

How to add missing dates to a column with NA's to the other columns? [duplicate]

This question already has answers here:
Insert rows for missing dates/times
(9 answers)
Closed 3 years ago.
Date NewArea.km2 IDday
1 2018-03-01 152.3972 0
2 2018-03-15 152.3972 0
3 2018-04-01 152.3972 0
4 2018-04-15 152.3972 0
5 2018-05-01 152.3972 0
6 2018-05-15 152.3972 0
7 2018-06-01 152.3972 0
8 2018-06-10 152.3972 0
9 2018-06-15 152.3972 0
10 2018-06-20 152.3972 0
11 2018-06-21 152.3972 1
12 2018-06-22 152.3972 3
13 2018-06-23 152.3972 4
14 2018-06-24 152.3972 6
15 2018-06-25 152.3972 6
16 2018-06-26 152.3972 16
17 2018-06-27 152.3972 22
18 2018-06-28 152.3972 22
19 2018-06-29 152.3972 24
20 2018-06-30 152.3972 24
21 2018-07-01 152.3972 27
Here's a subset of the data I have. As you may have gathered from the title, I would like to add rows to my data frame such that missing dates are also included (i.e. 2018-03-02, 2018-03-03, etc.). Ideally the other two columns would fill with NA's, such that the new row 2 would read
2 2018-03-02 NA NA
We can use complete
library(tidyr)
# complete() inserts one row per day between the first and last Date,
# filling the other columns (NewArea.km2, IDday) with NA.
# Fix: the upper bound must be max(Date) -- R is case-sensitive and the
# column is named `Date`, so max(date) would fail with "object not found".
df1 %>%
complete(Date = seq(min(Date), max(Date), by = '1 day'))
Here is a solution with base R
# Base-R version: build an all-NA template covering every day in the observed
# range (columns named after df), then overwrite the rows whose dates appear
# in df with the observed values.
day_span <- range(df$Date)
dfout <- setNames(
  data.frame(seq(day_span[1], day_span[2], by = "1 day"), NA, NA),
  names(df)
)
dfout[match(df$Date, dfout$Date), ] <- df
such that
> head(dfout,20)
Date NewArea.km2 IDday
1 2018-03-01 152.3972 0
2 2018-03-02 NA NA
3 2018-03-03 NA NA
4 2018-03-04 NA NA
5 2018-03-05 NA NA
6 2018-03-06 NA NA
7 2018-03-07 NA NA
8 2018-03-08 NA NA
9 2018-03-09 NA NA
10 2018-03-10 NA NA
11 2018-03-11 NA NA
12 2018-03-12 NA NA
13 2018-03-13 NA NA
14 2018-03-14 NA NA
15 2018-03-15 152.3972 0
16 2018-03-16 NA NA
17 2018-03-17 NA NA
18 2018-03-18 NA NA
19 2018-03-19 NA NA
20 2018-03-20 NA NA
DATA
# Reproducible data from the OP. Here Date already has class "Date" (stored as
# day counts since 1970-01-01), so seq(..., by = "1 day") works on it directly.
df <- structure(list(Date = structure(c(17591, 17605, 17622, 17636,
17652, 17666, 17683, 17692, 17697, 17702, 17703, 17704, 17705,
17706, 17707, 17708, 17709, 17710, 17711, 17712, 17713), class = "Date"),
NewArea.km2 = c(152.3972, 152.3972, 152.3972, 152.3972, 152.3972,
152.3972, 152.3972, 152.3972, 152.3972, 152.3972, 152.3972,
152.3972, 152.3972, 152.3972, 152.3972, 152.3972, 152.3972,
152.3972, 152.3972, 152.3972, 152.3972), IDday = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 3L, 4L, 6L, 6L, 16L,
22L, 22L, 24L, 24L, 27L)), row.names = c(NA, -21L), class = "data.frame")

Create group label for chunks of rows using data.table

I have something like the following dataset:
# Toy data.table: 9 rows in domain 2 and 12 rows in domain 3; id and L1 run 2:22.
myDT <- structure(list(domain = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), id = 2:22, L1 = 2:22), row.names = c(NA,
-21L), class = c("data.table", "data.frame"))
and I would like to create a new column L2 that creates an index for every 2 rows within domain. However, if there is a remainder, like in the case of domain=2 and id=8,9,10, then those ids should be indexed together, as long as they are within the same domain. Please note that the specific id values in the toy dataset are made up and not always consecutive as shown. The output would be:
structure(list(domain = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), id = 2:22, L1 = 2:22, L2=c(1L,1L,2L,2L,3L,3L,4L,4L,4L,
5L,5L,6L,6L,7L,7L,8L,8L,9L,9L,10L,10L)),
row.names = c(NA, -21L), class = c("data.table", "data.frame"))
Is there an efficient way to do this in data.table?
I've tried playing with .N/rowid and the integer division operator %/% (since every n rows should give the same value) inside the subset call but it got me nowhere. For example, I tried something like:
myDT[, L2 := rowid(domain)%/%2]
but clearly this doesn't address the requirements that the last 3 rows within in domain=2 have the same index and that the index should continue incrementing for domain=3.
EDIT Please see revised desired output data table and corresponding description.
EDIT 2
Here is an appended version of myDT:
# Appended version of myDT: adds domain 4 (2 rows) and domain 5 (16 rows);
# id and L1 now run 2:40.
myDT2 <- structure(list(domain = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), id = 2:40,
L1 = 2:40), row.names = c(NA, -39L), class = c("data.table",
"data.frame"))
When I ran #chinsoon12's code on the above, I get:
structure(list(domain = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), id = 2:40,
L1 = 2:40, L2 = c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 4L, 5L,
5L, 6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L, 10L, 11L, 11L, 11L,
11L, 12L, 12L, 13L, 13L, 14L, 14L, 15L, 15L, 16L, 16L, 17L,
17L, 18L, 18L)), row.names = c(NA, -39L), class = c("data.table",
"data.frame"))
There appears to be 4 values of L2=11, when two of them should be 12 because they are in a different domain.
An idea is to make a custom function that will create sequential vectors based on the length of each group and the remainder of that length when divided by two. The function is,
# For a group of length(x) rows, return a vector whose consecutive elements
# differ by exactly 1 only when two rows belong to the same pair; a later
# cumsum(diff(.) != 1) then recovers the pair indices. Rows are paired two at
# a time and an odd leftover row joins the final pair.
# Fix: the original used seq(floor(v1 / 2)), which for a single-row group
# evaluates seq(0) == c(1, 0) and produces a wrong-length result; seq_len()
# avoids that, and a singleton group now gets its own index.
f1 <- function(x) {
  n <- length(x)
  half <- n %/% 2L
  # Pair labels 1 1 2 2 ... for the full pairs.
  pair <- rep(seq_len(half), each = 2)
  # Odd remainder repeats the last pair id (or 1 for a singleton group).
  pair <- c(pair, rep(max(half, 1L), n %% 2L))
  pair + seq_along(pair)
}
I tried to apply it via data.table but I was getting an error about a bug so here it is with base R,
# ave() applies f1 to id within each domain; consecutive values differ by 1
# only inside a pair, so cumsum() over the non-unit jumps numbers the pairs
# consecutively across the whole table.
cumsum(c(TRUE, diff(with(myDT2, ave(id, domain, FUN = f1))) != 1))
#[1] 1 1 2 2 3 3 4 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 16 16 17 17 18 18 19 19
Here is another approach updated for the edited question (inspired by #Sotos use of cumsum):
For each domain id, create a repeated sequence 1, 0, 1, 0, 1, ..., setting the final sequence element to zero by default.
Take the cumsum over the created sequence across domain id's.
library(data.table)
setDT(myDT2)
# Within each domain build 1,0,1,0,... with rep_len, forcing the final element
# to 0 so an odd-length tail merges into the preceding pair; cumsum over the
# whole column then numbers the pairs consecutively across domains. The
# trailing [] prints the updated data.table.
myDT2[, L2 := c(head(rep_len(c(1, 0), .N), -1), 0), by = domain][, L2 := cumsum(L2)][]
#> domain id L1 L2
#> 1: 2 2 2 1
#> 2: 2 3 3 1
#> 3: 2 4 4 2
#> 4: 2 5 5 2
#> 5: 2 6 6 3
#> 6: 2 7 7 3
#> 7: 2 8 8 4
#> 8: 2 9 9 4
#> 9: 2 10 10 4
#> 10: 3 11 11 5
#> 11: 3 12 12 5
#> 12: 3 13 13 6
#> 13: 3 14 14 6
#> 14: 3 15 15 7
#> 15: 3 16 16 7
#> 16: 3 17 17 8
#> 17: 3 18 18 8
#> 18: 3 19 19 9
#> 19: 3 20 20 9
#> 20: 3 21 21 10
#> 21: 3 22 22 10
#> 22: 4 23 23 11
#> 23: 4 24 24 11
#> 24: 5 25 25 12
#> 25: 5 26 26 12
#> 26: 5 27 27 13
#> 27: 5 28 28 13
#> 28: 5 29 29 14
#> 29: 5 30 30 14
#> 30: 5 31 31 15
#> 31: 5 32 32 15
#> 32: 5 33 33 16
#> 33: 5 34 34 16
#> 34: 5 35 35 17
#> 35: 5 36 36 17
#> 36: 5 37 37 18
#> 37: 5 38 38 18
#> 38: 5 39 39 19
#> 39: 5 40 40 19
#> domain id L1 L2
Here is another option for variable number of repeats other than 2:
# Generalisation to chunks of n rows per index (here n = 4).
n <- 4
setDT(myDT)[, L2 :=
myDT[, {
# Provisional chunk index within the domain: rows n at a time.
x <- ceiling(seq_along(id)/n)
# If the final chunk is short of n rows, merge it into the previous full chunk.
if (sum(x==x[.N]) < n) x[x==x[.N]] <- floor(.N/n)
x
}, domain][, rleid(domain, V1)]
# rleid() renumbers the (domain, chunk) runs consecutively across domains.
]
Or a recursive approach:
# Same idea without rleid(): carry a running offset s across domains so the
# index keeps incrementing from one domain to the next.
n <- 4
s <- 0
setDT(myDT)[, L2 :=
myDT[, {
x <- s + ceiling(seq_along(id)/n)
if (sum(x==x[.N]) < n) x[x==x[.N]] <- s + floor(.N/n)
# NOTE(review): this relies on assignments to s persisting between
# data.table by-groups -- an implementation detail; verify on your version.
s <- if (s<max(x)) max(x) else s + 1
x
}, domain]$V1
]
output for n=2:
domain id L1 L2
1: 2 2 2 1
2: 2 3 3 1
3: 2 4 4 2
4: 2 5 5 2
5: 2 6 6 3
6: 2 7 7 3
7: 2 8 8 4
8: 2 9 9 4
9: 2 10 10 4
10: 3 11 11 5
11: 3 12 12 5
12: 3 13 13 6
13: 3 14 14 6
14: 3 15 15 7
15: 3 16 16 7
16: 3 17 17 8
17: 3 18 18 8
18: 3 19 19 9
19: 3 20 20 9
20: 3 21 21 10
21: 3 22 22 10
22: 4 23 23 11
23: 4 24 24 11
24: 5 25 25 12
25: 5 26 26 12
26: 5 27 27 13
27: 5 28 28 13
28: 5 29 29 14
29: 5 30 30 14
30: 5 31 31 15
31: 5 32 32 15
32: 5 33 33 16
33: 5 34 34 16
34: 5 35 35 17
35: 5 36 36 17
36: 5 37 37 18
37: 5 38 38 18
38: 5 39 39 19
39: 5 40 40 19
domain id L1 L2
output for n=4:
domain id L1 L2
1: 2 2 2 1
2: 2 3 3 1
3: 2 4 4 1
4: 2 5 5 1
5: 2 6 6 2
6: 2 7 7 2
7: 2 8 8 2
8: 2 9 9 2
9: 2 10 10 2
10: 3 11 11 3
11: 3 12 12 3
12: 3 13 13 3
13: 3 14 14 3
14: 3 15 15 4
15: 3 16 16 4
16: 3 17 17 4
17: 3 18 18 4
18: 3 19 19 5
19: 3 20 20 5
20: 3 21 21 5
21: 3 22 22 5
22: 4 23 23 6
23: 4 24 24 6
24: 5 25 25 7
25: 5 26 26 7
26: 5 27 27 7
27: 5 28 28 7
28: 5 29 29 8
29: 5 30 30 8
30: 5 31 31 8
31: 5 32 32 8
32: 5 33 33 9
33: 5 34 34 9
34: 5 35 35 9
35: 5 36 36 9
36: 5 37 37 10
37: 5 38 38 10
38: 5 39 39 10
39: 5 40 40 10
domain id L1 L2

Moving average and moving slope in R

I am looking to separately calculate a 7-day moving average and 7-day moving slope of 'oldvar'.
My sincere apologies that I didn't add the details below in my original post. These are repeated observations for each id which can go from a minimum of 3 observations per id to 100 observations per id. The start day can be different for different IDs, and to make things complicated, the days are not equally spaced, so some IDs have missing days.
Here is the data structure. Please note that 'average' is the variable that I am trying to create as moving 7-day average for each ID:
id day outcome average
1 1 15 100 NA
2 1 16 110 NA
3 1 17 190 NA
4 1 18 130 NA
5 1 19 140 NA
6 1 20 150 NA
7 1 21 160 140
8 1 22 100 140
9 1 23 180 150
10 1 24 120 140
12 2 16 90 NA
13 2 17 110 NA
14 2 18 120 NA
12 2 20 130 NA
15 3 16 110 NA
16 3 18 200 NA
17 3 19 180 NA
18 3 21 170 NA
19 3 22 180 168
20 3 24 210 188
21 3 25 160 180
22 3 27 200 184
Also, would appreciate advice on how to calculate a moving 7-day slope using the same.
Thank you and again many apologies for being unclear the first time around.
The real challenge is to create a data.frame after completing the missing rows. One solution could be using zoo library. The rollapply function will provide a way to assign NA value for the initial rows.
Using data from OP as is, the solution could be:
library(zoo)
library(dplyr)
# Data from OP
# 22 observations across ids 1-3; ids 2 and 3 have gaps in `day`
# (e.g. id 2 is missing day 19), which is what the merge below fills.
df <- structure(list(id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L),
day = c(15L,16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 16L, 17L, 18L, 20L,
16L, 18L, 19L, 21L, 22L, 24L, 25L, 27L),
outcome = c(100L, 110L,190L, 130L, 140L, 150L, 160L, 100L, 180L, 120L, 90L, 110L, 120L,
130L, 110L, 200L, 180L, 170L, 180L, 210L, 160L, 200L)),
.Names = c("id", "day", "outcome"), row.names = c(NA, -22L), class = "data.frame")
# Make a list without missing day for each id
# Cross every id with every day in the overall range; merge(all = TRUE) keeps
# the unmatched grid rows, so missing days appear with NA outcome.
df_complete <- merge(
expand.grid(id=unique(df$id), day=min(df$day):max(df$day)),
df, all=TRUE)
# Valid range of day for each ID group
df_id_wise_range <- df %>% group_by(id) %>%
summarise(min_day = min(day), max_day = max(day)) %>% as.data.frame()
# id min_day max_day
# 1 1 15 24
# 2 2 16 20
# 3 3 16 27
# Join original df and df_complete and then use df_id_wise_range to
# filter it for valid range of day for each group
# Join the completed grid back onto df, keep only days inside each id's own
# observed [min_day, max_day] range, and tidy the duplicated outcome.x /
# outcome.y columns produced by joining df twice.
df_final <- df_complete %>%
left_join(df, by=c("id","day")) %>%
select(-outcome.y) %>%
inner_join(df_id_wise_range, by="id") %>%
filter(day >= min_day & day <= max_day) %>%
mutate(outcome = outcome.x) %>%
select( id, day, outcome) %>%
as.data.frame()
# Now apply mean to get average
# Right-aligned 7-day rolling mean per id; na.rm = TRUE skips the NA rows
# inserted for missing days, and fill = NA pads the first six positions.
# NOTE(review): with width 7, partial = 7 appears equivalent to
# partial = FALSE (full windows only) -- verify against the zoo docs.
df_average <- df_final %>% group_by(id) %>%
mutate(average= rollapply(outcome, 7, mean, na.rm = TRUE, by = 1,
fill = NA, align = "right", partial = 7)) %>% as.data.frame()
df_average
# The result
# id day outcome average
#1 1 15 100 NA
#2 1 16 110 NA
#3 1 17 190 NA
#4 1 18 130 NA
#5 1 19 140 NA
#6 1 20 150 NA
#7 1 21 160 140.0
#8 1 22 100 140.0
#9 1 23 180 150.0
#10 1 24 120 140.0
#11 2 16 90 NA
#12 2 17 110 NA
#13 2 18 120 NA
#....
#....
#19 3 19 180 NA
#20 3 20 NA NA
#21 3 21 170 NA
#22 3 22 180 168.0
#23 3 23 NA 182.5
#24 3 24 210 188.0
#25 3 25 160 180.0
#26 3 26 NA 180.0
#27 3 27 200 184.0
The steps to calculate moving slope are:
First create a function to return slope
Use function as as part of rollapplyr
# Slope (the coefficient on a) of the least-squares line b ~ a fitted over
# the rows of window z, which is coerced to a data frame with columns a and b.
slop_e <- function(z) {
  win <- as.data.frame(z)
  fit <- lm(b ~ a, win)
  unname(coef(fit)[2])
}
#Apply function
# rollapplyr() is the right-aligned variant of rollapply(); by.column = FALSE
# hands the whole 7-row window (both columns) to slop_e at once, and fill = NA
# pads the first six rows where no full window exists.
z2$slope <- rollapplyr(zoo(z2), 7, slop_e , by.column = FALSE, fill = NA, align = "right")
z2
a b mean_a slope
1 1 21 NA NA
2 2 22 NA NA
3 3 23 NA NA
4 4 24 NA NA
5 5 25 NA NA
6 6 26 NA NA
7 7 27 4 1
8 8 28 5 1
9 9 29 6 1
10 10 30 7 1
11 11 31 8 1
12 12 32 9 1
13 13 33 10 1
14 14 34 11 1
15 15 35 12 1
16 16 36 13 1
17 17 37 14 1
18 18 38 15 1
19 19 39 16 1
20 20 40 17 1

R: generate value rows for each date extracted

I have a dataframe like this:
ID Year Week Monday Tuesday Wednesday
12 2017 42 8 9 8,5
12 2017 43 9 11 7,3
13 2017 43 9 10 6,8
I would like to change it in order to achive this:
ID day time
12 16/10/2017 8
12 17/10/2017 9
12 18/10/2017 8,5
12 23/10/2017 9
12 24/10/2017 11
12 25/10/2017 7,3
13 23/10/2017 9
13 24/10/2017 10
13 25/10/2017 6,8
I'm trying to do this using dplyr, but so far I have not found a solution.
library(dplyr)
library(tidyr)
# Reshape the weekday columns to long form, then build a real date from
# year + ISO-style week number + weekday name.
# NOTE(review): "%U %A" parsing is locale-dependent (weekday names), and the
# strptime docs describe %U as accepted-but-ignored on input on some
# platforms -- confirm the resulting dates on your system.
df %>%
gather(day, time, Monday:Wednesday) %>%
mutate(date = as.Date(paste(Year, Week, day),"%Y %U %A")) %>%
arrange(ID, Year, Week) %>%
select(-Year, -Week, -day)
# ID time date
#1 12 8 2017-10-16
#2 12 9 2017-10-17
#3 12 8,5 2017-10-18
#4 12 9 2017-10-23
#5 12 11 2017-10-24
#6 12 7,3 2017-10-25
#7 13 9 2017-10-23
#8 13 10 2017-10-24
#9 13 6,8 2017-10-25
#sample data
> dput(df)
# Sample data. Wednesday holds comma-decimal strings ("6,8" etc.), so it is
# stored as a factor rather than a numeric column.
structure(list(ID = c(12L, 12L, 13L), Year = c(2017L, 2017L,
2017L), Week = c(42L, 43L, 43L), Monday = c(8L, 9L, 9L), Tuesday = c(9L,
11L, 10L), Wednesday = structure(c(3L, 2L, 1L), .Label = c("6,8",
"7,3", "8,5"), class = "factor")), .Names = c("ID", "Year", "Week",
"Monday", "Tuesday", "Wednesday"), class = "data.frame", row.names = c(NA,
-3L))

dplyr mutate function to evaluate values within columns (current, previous, next) vertically

I have scoured SO for a way to achieve what I need without luck so here it goes.
A while back I discovered the package dplyr and its potential. I am thinking this package can do what I want, I just don't know how. This is a small subset of my data, but should be representative of my problem.
# Sample data. NOTE(review): time is a factor (not POSIXct), and station mixes
# integer (4L) and bare double (3, 5, 7) literals, so that column is double.
dummy<-structure(list(time = structure(1:20, .Label = c("2015-03-25 12:24:00",
"2015-03-25 21:08:00", "2015-03-25 21:13:00", "2015-03-25 21:47:00",
"2015-03-26 03:08:00", "2015-04-01 20:30:00", "2015-04-01 20:34:00",
"2015-04-01 20:42:00", "2015-04-01 20:45:00", "2015-09-29 18:26:00",
"2015-09-29 19:11:00", "2015-09-29 21:21:00", "2015-09-29 22:03:00",
"2015-09-29 22:38:00", "2015-09-30 00:48:00", "2015-09-30 01:38:00",
"2015-09-30 01:41:00", "2015-09-30 01:45:00", "2015-09-30 01:47:00",
"2015-09-30 01:49:00"), class = "factor"), ID = c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), station = c(1L, 1L, 1L, 2L, 3,
4L, 4L, 4L, 4L, 5L, 5L, 6L,
6L, 5, 5, 5L, 7, 7, 7L,
7)), .Names = c("time", "ID", "station"), class = "data.frame", row.names = c(NA,
-20L))
I wish to evaluate rows within the time column conditional on the ID and station column. Specifically, I would like the function (dplyr?) to evaluate each time row, and compare the time to the previous time (row-1) and next time (row+1). If the time of current row is within 1 hour of time of previous and/or next row, and the ID and station of current row match that of previous and/or next row, then I would like to add in a new row a 1, otherwise a 0.
How would I achieve this using dplyr?
The expected outcome should be like this:
time ID station new.value
1 2015-03-25 12:24:00 1 1 0
2 2015-03-25 21:08:00 1 1 1
3 2015-03-25 21:13:00 1 1 1
4 2015-03-25 21:47:00 1 2 0
5 2015-03-26 03:08:00 1 3 0
6 2015-04-01 20:30:00 1 4 1
7 2015-04-01 20:34:00 1 4 1
8 2015-04-01 20:42:00 1 4 1
9 2015-04-01 20:45:00 1 4 1
10 2015-09-29 18:26:00 2 5 1
11 2015-09-29 19:11:00 2 5 1
12 2015-09-29 21:21:00 2 6 1
13 2015-09-29 22:03:00 2 6 1
14 2015-09-29 22:38:00 2 5 0
15 2015-09-30 00:48:00 2 5 1
16 2015-09-30 01:38:00 2 5 1
17 2015-09-30 01:41:00 2 7 1
18 2015-09-30 01:45:00 2 7 1
19 2015-09-30 01:47:00 2 7 1
20 2015-09-30 01:49:00 2 7 1
Here is an option using difftime with the dplyr mutate function. First, we use a group_by operation to make sure the comparison happens within each unique combination of ID and station. difftime calculates the time difference; here the units are set to hours for convenience. The lag and lead functions, also from the dplyr package, shift the selected column backward or forward. Combined with the vectorised difftime, they give the time difference between the current row and the previous/next row. abs makes the result an absolute value, and the condition < 1 ensures the difference is within an hour. as.integer converts the logical values (TRUE or FALSE) to (1 or 0) correspondingly.
library(dplyr)
# Compare each row with its neighbours inside the same (ID, station) group;
# default = Inf makes the first/last comparison in a group fail the < 1 test,
# so boundary rows can only match on their in-group side.
# NOTE(review): in `dummy` the time column is a factor -- convert it with
# as.POSIXct() first (as the data.table answer below does) or difftime()
# will not receive a date-time.
dummy %>% group_by(ID, station) %>%
mutate(new.value = as.integer(
abs(difftime(time, lag(time, default = Inf), units = "hours")) < 1 |
abs(difftime(time, lead(time, default = Inf), units = "hours")) < 1))
Source: local data frame [20 x 4]
Groups: ID, station [7]
time ID station new.value
(time) (int) (dbl) (int)
1 2015-03-25 12:24:00 1 1 0
2 2015-03-25 21:08:00 1 1 1
3 2015-03-25 21:13:00 1 1 1
4 2015-03-25 21:47:00 1 2 0
5 2015-03-26 03:08:00 1 3 0
6 2015-04-01 20:30:00 1 4 1
7 2015-04-01 20:34:00 1 4 1
8 2015-04-01 20:42:00 1 4 1
9 2015-04-01 20:45:00 1 4 1
10 2015-09-29 18:26:00 2 5 1
11 2015-09-29 19:11:00 2 5 1
12 2015-09-29 21:21:00 2 6 1
13 2015-09-29 22:03:00 2 6 1
14 2015-09-29 22:38:00 2 5 0
15 2015-09-30 00:48:00 2 5 1
16 2015-09-30 01:38:00 2 5 1
17 2015-09-30 01:41:00 2 7 1
18 2015-09-30 01:45:00 2 7 1
19 2015-09-30 01:47:00 2 7 1
20 2015-09-30 01:49:00 2 7 1
Psidom's answer is great -- here's a data.table approach.
library(data.table)
setDT(dummy)
# you do NOT want a factor for your time variable
dummy[, time := as.POSIXct(time) ]
# Per (ID, station) group: seconds to the previous/next observation, with Inf
# padding at the group edges so boundary rows cannot match on that side.
dummy[, `:=`(lag_diff = c(Inf, diff(as.numeric(time))),
lead_diff = c(diff(as.numeric(time)), Inf)),
by = .(ID, station) ]
# Flag rows within one hour (3600 s) of a same-group neighbour.
dummy[, new.value := as.integer(lag_diff < 3600 | lead_diff < 3600) ]
dummy
Another solution using R base functions (sapply and difftime):
# Base-R alternative: for each row i, test the next row (i+1) and the previous
# row (i-1) for same ID, same station, and a time gap of at most one hour; the
# short-circuit && guards keep i-1 and i+1 in bounds.
# NOTE(review): row-wise sapply over a data frame does per-row subsetting (slow
# for large data) and assumes `time` is already a date-time, not the factor
# from the posted `dummy` -- convert first.
n=nrow(dummy)
dummy$new.value=
as.numeric(sapply(1:n, function(i)
(i<n && (dummy[i,"ID"]==dummy[i+1,"ID"] && dummy[i,"station"]==dummy[i+1,"station"])
&& abs(as.numeric(difftime(dummy[i,"time"], dummy[i+1,"time"]), "hours"))<=1)
||
(i>1 && (dummy[i,"ID"]==dummy[i-1,"ID"] && dummy[i,"station"]==dummy[i-1,"station"])
&& abs(as.numeric(difftime(dummy[i,"time"], dummy[i-1,"time"]), "hours"))<=1)
))
# > dummy
# time ID station new.value
# 1 2015-03-25 12:24:00 1 1 0
# 2 2015-03-25 21:08:00 1 1 1
# 3 2015-03-25 21:13:00 1 1 1
# 4 2015-03-25 21:47:00 1 2 0
# 5 2015-03-26 03:08:00 1 3 0
# 6 2015-04-01 20:30:00 1 4 1
# 7 2015-04-01 20:34:00 1 4 1
# 8 2015-04-01 20:42:00 1 4 1
# 9 2015-04-01 20:45:00 1 4 1
# 10 2015-09-29 18:26:00 2 5 1
# 11 2015-09-29 19:11:00 2 5 1
# 12 2015-09-29 21:21:00 2 6 1
# 13 2015-09-29 22:03:00 2 6 1
# 14 2015-09-29 22:38:00 2 5 0
# 15 2015-09-30 00:48:00 2 5 1
# 16 2015-09-30 01:38:00 2 5 1
# 17 2015-09-30 01:41:00 2 7 1
# 18 2015-09-30 01:45:00 2 7 1
# 19 2015-09-30 01:47:00 2 7 1
# 20 2015-09-30 01:49:00 2 7 1

Resources