Trying to add a new row based on a check in R

I have a data set that looks like this:
library(dplyr)
library(lubridate)
s <- c(1,1,1)
r <- c("2017-01-01 12:34:17", "2017-01-01 12:52:18", "2017-01-01 13:17:18")
t <- c(1,1,1)
g <- as.data.frame(matrix(c(s, as.POSIXct(r), t), nrow = 3, ncol = 3))
names(g) <- c("DeviceId", "Time", "Success/Fail")
g$Time <- as.POSIXct(g$Time, origin = '1970-01-01')
I am trying to write a function that loops through the data set and checks whether each row's Time and its successor's Time are more than 15 minutes apart. If they are, the loop should add a row to the data set with the same DeviceId, the earlier row's Time plus 15 minutes, and 0 in the Success/Fail column. Here's what I've come up with:
f <- function(g) {
  for (i in 2:nrow(g)) {
    if (g$Time[i] - g$Time[i-1] >= 15) {
      q <- list(g$DeviceId[i-1], g$Time[i-1] + minutes(15), 0)
      y <- data.frame()
      y <- rbind(g, q)
      arrange(y, Time)
    } else NULL
  }
}
f(g)

I think this might be what you are after. I am somewhat unclear about the success/fail indicator (here -1 is assigned to cases where the times are less than 15 minutes apart). It avoids the loop by using the lag() function in dplyr. Presumably your data has more than one device, so I added group_by(DeviceId).
x <- g %>%
  group_by(DeviceId) %>%
  mutate(
    lTime = lag(Time, order_by = Time),
    dTime = difftime(Time, lTime, units = "mins"),  # explicit units so the comparison below is in minutes
    `Success/Fail` = if_else(dTime >= 15, 0, -1),
    newTime = Time + minutes(15)
  )
y <- x %>%
  select(DeviceId, newTime, `Success/Fail`) %>%
  rename(Time = newTime) %>%
  ungroup() %>%
  rbind(g, .)

Here is another option. I think the previous example is effectively removing the first row (its lag is NA), when it is the last row that should be dropped, since there is no time period after it.
g <- data.frame(DeviceId = rep(1, 3),
                Time = ymd_hms(c("2017-01-01 12:34:17", "2017-01-01 12:52:18", "2017-01-01 13:17:18")),
                Success_Fail = rep(1, 3))
library(tidyr)  # for drop_na()

g %>%
  transmute(DeviceId = DeviceId,
            Time = Time,
            t = lead(Time)) %>%
  drop_na() %>%
  rowwise() %>%
  mutate(t2 = if (difftime(t, Time, units = "mins") > 15) {Time + minutes(15)} else {NA},
         Success_Fail = 0) %>%
  dplyr::select(DeviceId, Time = t2, Success_Fail) %>%
  bind_rows(g) %>%
  arrange(Time)

Related

Remove elements when getting the "missing element" error

I have a data set df that I subset into two lists, int1 and int2. Each element in these lists represents a 10-day period for a single individual (e.g., the first three elements in int1 represent three different 10-day periods for ID "A"). int2 is a bit different because it only has one 10-day period each for IDs "A" and "B", since the data for month 4 only covers one 10-day period.
I also have a list l1 that contains two matrices.
Setting up the example data:
library(lubridate)
library(tidyverse)
library(purrr)
date <- rep_len(seq(dmy("01-01-2011"), dmy("10-04-2011"), by = "days"), 100)
ID <- rep(c("A", "B"), length.out = length(date))  # keep ID the same length as date so data.frame() below works
df <- data.frame(date = date,
                 x = runif(length(date), min = 60000, max = 80000),
                 y = runif(length(date), min = 800000, max = 900000),
                 ID)
df$julian <- yday(df$date)
df$month <- month(df$date)
int1 <- df %>%
  mutate(new = floor_date(date, '10 day')) %>%
  mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
  group_by(ID, new) %>%
  filter(month == "1") %>%
  group_split()
int2 <- df %>%
  mutate(new = floor_date(date, '10 day')) %>%
  mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
  group_by(ID, new) %>%
  filter(month == "4") %>%
  group_split()
m1 <- matrix(1:9, nrow = 3, ncol = 3)
m2 <- matrix(20:28, nrow = 3, ncol = 3)
l1 <- list(m1, m2)
In the following code, I use the objects that I have created above. g1 is created as a sequence that is used in lstMat.
f1, a function that does a number of calculations between int1 and int2, is also created to be used in lstMat.
g1 <- rep(seq_along(l1), sapply(l1, nrow))
# Function to calculate the number of julian dates between the first date in the
# first interval and the first date of the second interval
f1 <- function(.int1, .int2) {
  t(outer(seq_along(.int1), seq_along(.int2),
          FUN = Vectorize(function(i, j) {
            min(.int1[[i]]$julian) - min(.int2[[j]]$julian)
          })))
}
This is the section of my script that I have been getting an error on.
lstMat <- purrr::map2(split(int1[seq_len(length(g1))], g1),
                      split(int2[seq_len(length(g1))], g1), f1)
Here is the error:
Error in `stop_subscript()`:
! Can't subset elements that don't exist.
x Locations 3, 4, 5, and 6 don't exist.
i There are only 2 elements.
Run `rlang::last_error()` to see where the error occurred.
I think the error is occurring due to the mismatch in length between g1 and int2 when creating the lstMat object. I was wondering how I could modify the code to drop those missing elements from g1 when splitting int2 by g1 for lstMat.
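One possible way around the length mismatch (a sketch only, assuming it is acceptable to drop the g1 groups that have no counterpart in int2 rather than padding int2) is to split each list by only as many values of g1 as that list has elements, and then pair up the group labels the two splits share:
# Sketch only: drops the g1 groups (here group 2) that have no matching
# element in int2, instead of indexing past the end of int2.
split1 <- split(int1, g1[seq_along(int1)])
split2 <- split(int2, g1[seq_along(int2)])
common <- intersect(names(split1), names(split2))  # group labels present in both splits
lstMat <- purrr::map2(split1[common], split2[common], f1)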

Set parameter of a function using difftime and dplyr

How can I make units = "mins" in the following function a parameter that the caller can set?
This is just the data frame:
library(tidyverse)
u <- runif(10, 0, 60)
w <- runif(10, 0, 60)
df <- tibble(time_1 = as.POSIXct(u, origin = "2019-02-03 08:00:00"),
             time_2 = as.POSIXct(w, origin = "2019-02-03 08:30:00"))
This is my function. I would like to be able to change the units argument of difftime() and expose it as a parameter, e.g. units = "hours".
time_diff <- function(df, stamp1, stamp2){
  stamp1 <- enquo(stamp1)
  stamp2 <- enquo(stamp2)
  name <- paste0(quo_name(stamp1), "_", quo_name(stamp2))
  df %>%
    mutate(!!name := difftime(!!stamp1, !!stamp2, units = "mins"))
}
df %>%
  time_diff(time_2, time_1)
But I would like something like this:
df %>%
  time_diff(time_2, time_1, mins)
What about just adding the units parameter? e.g.
time_diff <- function(df, stamp1, stamp2, units = "mins"){
  stamp1 <- enquo(stamp1)
  stamp2 <- enquo(stamp2)
  name <- paste0(quo_name(stamp1), "_", quo_name(stamp2))
  df %>%
    mutate(!!name := difftime(!!stamp1, !!stamp2, units = units))
}
Then you can do df %>% time_diff(time_2, time_1, "mins").
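For illustration, a few usage examples with the modified function (any unit that difftime() accepts can be passed through: "secs", "mins", "hours", "days", "weeks"):
df %>% time_diff(time_2, time_1)                   # default: minutes
df %>% time_diff(time_2, time_1, units = "hours")  # same difference expressed in hours
df %>% time_diff(time_2, time_1, "secs")           # positional form also works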

How to fill a vector from a for-loop containing a numeric sequence with a given step?

I have the following piece of code:
library(dplyr)
Q = 10000
span = 1995:2016
time = rep(span, each = Q)   # each year repeated Q times, one row per id per year
id = rep(1:Q, times = length(span))
s1 = rep(rnorm(Q, 0, 1), times = length(span))
gdp = rep(rnorm(Q, 0, 1), times = length(span))
e = rep(rnorm(Q, 0, 1), times = length(span))
dfA = data.frame(id, time, s1, e, gdp)
mgr = double()
stp = 10
for (K in seq(10, Q, stp)) {
  gr = double()
  for (t in span) {
    wt1 = dfA %>% filter(time == t-1) %>%
      arrange(desc(s1)) %>% mutate(w = s1/gdp)
    zt1 = dfA %>% filter(time == t-1) %>% mutate(z1 = log(s1/e))
    zt  = dfA %>% filter(time == t) %>% mutate(z = log(s1/e))
    gt  = left_join(zt1, zt, by = "name") %>%
      mutate(g = z - z1) %>% select(name, g) %>% na.omit()
    a = left_join(wt1, gt, by = "name") %>% na.omit()
    a = a %>% mutate(id = 1:length(a$name)) %>%
      filter(id <= Q) %>% mutate(gbar = mean(g)) %>%
      filter(id <= K) %>% mutate(sck = g - gbar,
                                 gamma = w*sck)
    gr = append(gr, sum(a$gamma))
  }
  mgr = append(mgr, mean(gr))
}
where dfA is a data frame containing an id variable and a time variable, among others. Since the time variable ranges from 1995 to 2016 and K is a sequence with step 10, I resorted to append() to store gr and mgr, respectively. The problem is that it takes too long to compute.
So my question is: Is there any way to avoid using append() to fill the vectors gr and mgr and thus reduce the time spent to compute the code?
You could initialize the 'gr' and 'mgr' vectors with a set length rather than initializing them as empty doubles and having R extend them on every iteration. The advantage is that the memory for each vector is allocated beforehand, so R does not have to copy and grow the whole mgr/gr vector each time.
## initialize vectors with a set length
mgr <- double(length = length(seq(10, Q, stp)))
gr <- double(length = length(1995:2016))
# fill the positions in each iteration
outerIteration <- (K - 10) / stp + 1
innerIteration <- t - 1994
gr[innerIteration] <- sum(a$gamma)
# after the inner loop, take the mean of the 22 yearly values (1995:2016)
mgr[outerIteration] <- mean(gr)
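Putting the pieces together, a minimal sketch of how the pre-allocated vectors slot into the loop from the question (the inner dplyr pipeline that produces `a` is unchanged and abbreviated here):
mgr <- double(length = length(seq(10, Q, stp)))
for (K in seq(10, Q, stp)) {
  outerIteration <- (K - 10) / stp + 1
  gr <- double(length = length(span))        # one slot per year, reused for each K
  for (t in span) {
    innerIteration <- t - (min(span) - 1)    # 1995 -> 1, ..., 2016 -> 22
    # ... same dplyr pipeline as in the question, producing `a` ...
    gr[innerIteration] <- sum(a$gamma)
  }
  mgr[outerIteration] <- mean(gr)
}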

How to average adjacent columns (non-overlapping) in R using a dplyr function?

This is a sample of my dataset.
library(tidyr)
library(dplyr)
resource <- c("good","good","bad","bad","good","good","bad","bad","good","good","bad","bad","good","good","bad","bad")
fertilizer <- c("none", "nitrogen","none","nitrogen","none", "nitrogen","none","nitrogen","none", "nitrogen","none","nitrogen","none", "nitrogen","none","nitrogen")
t1 <- sample(1:20, 16)
t2 <- sample(1:20, 16)
t3 <- sample(1:20, 16)
t4 <- sample(1:20, 16)
t5 <- sample(1:20, 16)
t6 <- sample(10:100, 16)
t7 <- sample(10:100, 16)
t8 <- sample(10:100, 16)
t9 <- sample(10:100, 16)
t10 <- sample(10:100, 16)
replicates <- c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16)
data <- data.frame(resource, fertilizer,replicates, t1,t2,t3,t4,t5,t6,t7,t8,t9,t10)
data$resource <- as.factor(data$resource)
data$fertilizer <- as.factor(data$fertilizer)
Here t1, t2, ..., t10 are time points. I need to average adjacent (non-overlapping) time points, e.g. (t1, t2), (t3, t4), and so on, and the new column headings need to be the average of the two times, so that the columns read t1.5, t3.5, etc.
Thus in the end I need to have only 5 columns: t1.5, t3.5, t5.5, t7.5, t9.5.
Is there any way this can be achieved using a dplyr function, or any other function in R?
Edited for OP's modified request:
If you put everything in a tidy format, you can take advantage of the lag/lead functions to average adjacent rows.
library(stringr)
library(forcats)
data %>%
  gather(key = time, value = value, -replicates, -resource, -fertilizer) %>%
  mutate(index = as.integer(str_extract(time, "[0-9]+"))) %>%
  arrange(replicates, index) %>%
  group_by(resource, fertilizer, replicates) %>%
  mutate(mid_value = (value + lead(value))/2,
         mid_index = (index + lead(index))/2,
         mid_time = str_c("t", mid_index)) %>%
  ungroup %>%
  filter(!is.na(mid_value), index %% 2 == 1) %>%
  select(replicates, resource, fertilizer, matches("mid")) %>%
  rename(value = mid_value, time = mid_time, index = mid_index) %>%
  arrange(index) %>%
  mutate(time = as_factor(time)) %>%
  select(-index) %>%
  spread(key = time, value = value) %>%
  arrange(replicates)
Solution using only base R: You need to somehow find the columns you want to calculate the average for. You can do this by searching the column names for the t + "somenumber" pattern. After that, create a sequence of sequences corresponding to the column numbers of data you want to calculate the mean for.
relevant_cols <- grep("^t[0-9]+", names(data))
start <- min(relevant_cols)
end <- max(relevant_cols)
cols <- split(start:end, rep(1:5, each = 2))
If you look at cols, you'll see that it is a list of five, each element resembling a combination of columns you want to average. This smells like a use case for sapply():
newdf <- sapply(cols, function(x) rowMeans(data[x]))
colnames(newdf) <- paste0("t", seq(1, diff(range(relevant_cols)), 2) + 0.5)
Edit: I seem to have misunderstood what you want to keep and what not. You can just cbind() (parts of) the old data frame to newdf:
cbind(data, newdf)
cbind(data[, -relevant_cols], newdf) # This is what you want, I think..
Here ya go:
transmute(data,
          t1.5 = (t1 + t2) / 2,
          t3.5 = (t3 + t4) / 2,
          t5.5 = (t5 + t6) / 2,
          t7.5 = (t7 + t8) / 2,
          t9.5 = (t9 + t10) / 2)

hourly sums with dplyr with zeros for empty hours

I have a dataset similar to the format of "my_data" below, where each line is a single count of an event. I want to obtain a summary of how many events happen in every hour. I would like to have every hour with no events be included with a 0 for its "hourly_total" value.
I can achieve this with dplyr as shown, but the empty hours are dropped instead of being set to 0.
Thank you!
set.seed(123)
library(dplyr)
library(lubridate)
latemail <- function(N, st = "2012/01/01", et = "2012/1/31") {
  st <- as.POSIXct(as.Date(st))
  et <- as.POSIXct(as.Date(et))
  dt <- as.numeric(difftime(et, st, unit = "sec"))
  ev <- sort(runif(N, 0, dt))
  rt <- st + ev
}
my_data <- data_frame(fake_times = latemail(25),
                      count = 1)
my_data %>%
  group_by(rounded_hour = floor_date(fake_times, unit = "hour")) %>%
  summarise(hourly_total = sum(count))
Assign your counts to an object:
counts <- my_data %>%
  group_by(rounded_hour = floor_date(fake_times, unit = "hour")) %>%
  summarise(hourly_total = sum(count))
Create a data frame with all the necessary hours:
complete_data = data.frame(hour = seq(floor_date(min(my_data$fake_times), unit = "hour"),
                                      floor_date(max(my_data$fake_times), unit = "hour"),
                                      by = "hour"))
Join to it and fill in the NAs:
complete_data %>%
  group_by(rounded_hour = floor_date(hour, unit = "hour")) %>%
  left_join(counts) %>%
  mutate(hourly_total = ifelse(is.na(hourly_total), 0, hourly_total))
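As an alternative sketch (not part of the answer above), tidyr::complete() can expand the summarised result to every hour in the observed range and fill the missing hours with zero in one step:
library(tidyr)
my_data %>%
  group_by(rounded_hour = floor_date(fake_times, unit = "hour")) %>%
  summarise(hourly_total = sum(count)) %>%
  complete(rounded_hour = seq(min(rounded_hour), max(rounded_hour), by = "hour"),
           fill = list(hourly_total = 0))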
