Remove elements when getting the "missing element" error - r

I have a data set df that I subset into two list int1 and int2. Each one of the elements in this list represents a 10-day period for single individual (e.g., the first three elements in int1 represent three different 10-day periods for ID "A"). int2 is a bit different because it only has one 10-day period for ID "A" and "B". This is because the data for month 4 only has one 10-day period.
I also have a list l1 that contains two matrices.
Setting up the example data:
library(lubridate)
library(tidyverse)
library(purrr)
date <- rep_len(seq(dmy("01-01-2011"), dmy("10-04-2011"), by = "days"), 100)
ID <- rep(c("A","B"), 100)
df <- data.frame(date = date,
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID)
df$julian <- yday(df$date)
df$month <- month(df$date)
int1 <- df %>%
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(month == "1") %>%
group_split()
int2 <- df %>%
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(month == "4") %>%
group_split()
m1 <- matrix(1:9, nrow = 3, ncol = 3)
m2 <- matrix(20:28, nrow = 3, ncol = 3)
l1 <- list(m1, m2)
In the following code, I use the objects that I have created above. g1 is created as a sequence that is used in lstMat.
f1 is a function that does a number of calculations between int1 and int2 is also created to be used in lstMat.
g1 <- rep(seq_along(l1), sapply(l1, nrow))
# Function to calculate the number of julian dates between the first date in the
# first interval and the first date of the second interval
f1 <- function(.int1, .int2) {
t(outer(seq_along(.int1), seq_along(.int2),
FUN = Vectorize(function(i, j) {
min(.int1[[i]]$jDate) -
min(.int2[[j]]$jDate)
})
))
}
g1 <- rep(seq_along(l1), sapply(l1, nrow))
# Function to calculate the number of julian dates between the first date in the
# first interval and the first date of the second interval
f1 <- function(.int1, .int2) {
t(outer(seq_along(.int1), seq_along(.int2),
FUN = Vectorize(function(i, j) {
min(.int1[[i]]$jDate) -
min(.int2[[j]]$jDate)
})
))
}
This is the section of my script that I have been getting an error on.
lstMat <- purrr::map2(split(int1[seq_len(length(g1))], g1),
split(int2[seq_len(length(g1))], g1), f1)
Here is the error:
Error in `stop_subscript()`:
! Can't subset elements that don't exist.
x Locations 3, 4, 5, and 6 don't exist.
i There are only 2 elements.
Run `rlang::last_error()` to see where the error occurred.
I think the error is occuring due to the mismatch in length between g1 and int2 when trying to create the lstMat object. I was wondering how I could modify the code to remove those missing elements from g1 when I try to split int2 based on the g1 when running lstMat.

Related

Match list elements based on attribute component

I have a data set that i split into two list int1 and int2.
library(lubridate)
library(tidyverse)
library(purrr)
date <- rep_len(seq(dmy("01-01-2011"), dmy("01-01-2013"), by = "days"), 300)
ID <- rep(c("A","B", "C"), 300)
df <- data.frame(date = date,
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID)
df$month <- month(df$date)
df$year <- year(df$date)
# Create first list
int1 <- df %>%
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(month == "1") %>%
group_split()
# Create second list
int2 <- df %>%
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(month == "2") %>%
group_split()
names(int1) <- sapply(int1, function(x) paste(x$ID[1],
x$year[1], sep = '_'))
names(int2) <- sapply(int2, function(x) paste(x$ID[1],
x$year[1], sep = '_'))
I then assign a attribute to each list (match). I created a function check to grab this attribute more easily. I removed some elements from one list for this exmaple.
int1 <- int1[-c(3,6)]
# Convenience function to grab the attributes for you
check <- function(x) {
return(attr(x, "match"))
}
# Add an attribute to hold the attributes of each list element
attr(int1, "match") <- data.frame(id = sapply(int1, function(x) paste(x$ID[1])),
interval_start_date = sapply(int1, function(x) paste(x$new[1]))
)
# Check the attributes
check(int1)
# Add an attribute "tab" to hold the attributes of each list element
attr(int2, "match") <- data.frame(id = sapply(int2, function(x) paste(x$ID[1])),
interval_start_date = sapply(int2, function(x) paste(x$new[1]))
)
# Check the attributes
check(int2)
I would like to remove elements that are not in another based on the attribute that I had added. Specifically I would like to remove any that don't have the same interval_start_date and ID associated with it. For the interval_start_date, only the year and the day have to match, as the month will most likely differ between the two list. In this case, I would like int2 to match int1. Any thoughts on how I could do this? A base r method is preferred, if possible.
# Expected results
expected_int2 <- list(int2[[1]], int2[[2]], int2[[3]], int2[[4]], int2[[5]],
int2[[6]], int2[[7]])
names(expected_int2) <- sapply(int1, function(x) paste(x$ID[1],
x$year[1], sep = "_"))
We may create an index with %in% after pasteing the 'id' and the formatted 'interval_start_date' i.e. after removing the 'month' part
i1 <- with(check(int2), paste(id, format(as.Date(interval_start_date),
"%Y-%d"))) %in% with(check(int1), paste(id,
format(as.Date(interval_start_date), "%Y-%d")))
> which(i1)
[1] 1 2 4 5 7 8 9
out <- int2[i1]

Remove list elements that are not present in another list based on element names

I have two list that I am working with int1 and int2. Both list have similar names for the list elements. I would like to remove specific components in one list, in this case int2 that are not present in another list int1. Is there a good way to do this in base R? I would like my results to look like the expected_int2.
library(lubridate)
library(tidyverse)
library(purrr)
date <- rep_len(seq(dmy("01-01-2011"), dmy("31-07-2011"), by = "days"), 200)
ID <- rep(c("A","B", "C"), 200)
df <- data.frame(date = date,
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID)
df$Month <- month(df$date)
# Create first list
int1 <- df %>%
# arrange(ID) %>% # skipped for readability of result
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(Month == "1") %>%
group_split()
# Assign names to int1
names(int1) <- sapply(int1, function(x) paste(x$ID[1],
x$new[1], sep = "_"))
#Remove list elements for the example
int1 <- int1[-c(6, 8, 9)]
# Create second list
int2 <- df %>%
# arrange(ID) %>% # skipped for readability of result
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(Month == "2") %>%
group_split()
# Assign names to int2
names(int2) <- sapply(int2, function(x) paste(x$ID[1],
x$new[1], sep = "_"))
# Expected results
expected_int2 <- list(int2[[1]], int2[[2]], int2[[3]], int2[[4]], int2[[5]], int2[[6]])
names(expected_int2) <- sapply(int1, function(x) paste(x$ID[1],
x$new[1], sep = "_"))
We can remove the month part from the names, to check if they are similar to subset
i1 <- sub("(.*)-\\d+-(.*)", "\\1-\\2", names(int2)) %in%
sub("(.*)-\\d+-(.*)", "\\1-\\2", names(int1))
out <- int2[i1]
names(out) <- names(int1)

Fixing mismatched matrices

I have a data set df that has been split into int1 and int2. In int1andint2, there is two elements for the IDA and three elements for theID` B.
My goal is to create a 2x2 matrix for ID A and 3x3 for ID B, and have it divided from my example list of matrices l1. Currently, my code is creating a 3x3 matrix for ID A and 2x2 matrix for ID B using a combination of the product from g1 and f2 using map2() resulting to lstmat.
Any suggestions on how I can get the desired output of a 2x2 matrix for ID A and 3x3 matrix for ID B?
Example data:
library(lubridate)
library(tidyverse)
date <- rep_len(seq(dmy("26-12-2010"), dmy("20-12-2011"), by = "days"), 500)
ID <- rep(c("A","B"), 5000)
df <- data.frame(date = date,
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID)
df$jDate <- julian(as.Date(df$date), origin = as.Date('1970-01-01'))
df$Month <- month(df$date)
df$year <- year(df$date)
t1 <- c(100,150)
t2 <- c(200,250)
mat <- cbind(t1,t2)
t1 <- c(150,150,200)
t2 <- c(250,250,350)
t3 <- c(350,350, 400)
mat2 <- cbind(t1,t2, t3)
l1 <- list(mat, mat2)
int1 <- df %>%
# arrange(ID) %>% # skipped for readability of result
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(Month == "3") %>%
group_split()
int2 <- df %>%
# arrange(ID) %>% # skipped for readability of result
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(Month == "2") %>%
group_split()
names(int1) <- sapply(int1, function(x) paste(x$ID[1],
sep = '_'))
names(int2) <- sapply(int2, function(x) paste(x$ID[1],
sep = '_'))
int1 <- int1[-1]
int2 <- int2[-1]
Any suggestions for changes to this code for the desired result? :
g1 <- as.integer(gl(length(int1), 3, length(int1)))
f2 <- function(.int1, .int2) {
t(outer(seq_along(.int1), seq_along(.int2),
FUN = Vectorize(function(i, j) min(.int1[[i]]$jDate) -
min(.int2[[j]]$jDate))))
}
lstMat <- map2(split(int1, g1), split(int2, g1), f2)
map2(l1, lstMat, `/`)
As the 'int1', 'int2' have duplicated names, split on the names instead of creating a grouping index with gl
lstMat <- map2(split(int1, names(int1)), split(int2, names(int2)), f2)
map2(l1, lstMat, `/`)
-output
[[1]]
t1 t2
[1,] 3.571429 5.263158
[2,] 8.333333 8.928571
[[2]]
t1 t2 t3
[1,] 5.357143 6.578947 7.291667
[2,] 8.333333 8.928571 9.210526
[3,] 25.000000 19.444444 14.285714

Group data by year and filter by month in R

I have a list of data frames with daily streamflow data.
I want to estimate the maximum daily flow from June to November every year for each data frame in the list that corresponds each of them to data in a station.
This is how the list of data frames looks:
and this is the code I am using:
#Peak mean daily flow summer and fall (June to November)
PeakflowSummerFall <- lapply(listDF,function(x){x %>% group_by(x %>% mutate(year = year(Date)))
%>% filter((x %>% mutate(month = month(Date)) >= 6) & (x %>% mutate(month = month(Date)) <= 11))
%>% summarise(max=max(DailyStreamflow, na.rm =TRUE))})
but I am having this error:
<error/dplyr_error>
Problem with `filter()` input `..1`.
x Input `..1` must be of size 1, not size 24601.
i Input `..1` is `&...`.
i The error occurred in group 1: Date = 1953-06-01, DailyStreamflow = 32, year = 1953.
Backtrace:
Run `rlang::last_trace()` to see the full context
Any solution to this problem?
#### This should give provide you with enough
#### sample data for answerers to work with
install.packages('purrr')
library(purrr)
sample_dat <- listDF %>%
head %>%
map( ~ head(.x))
dput(sample_dat)
#### With that being said...
#### You should flatten the data frame...
#### It's easier to work with...
install.packages('lubridate')
library(lubridate)
listDF %>%
plyr::ldply(rbind) %>%
mutate(month = floor_date(Date, unit = 'month')) %>%
filter(month(Date) > 5, month(Date) < 12) %>%
group_by(.id, month) %>%
dplyr::summarise(max_flow = max(DailyStreamflow)) %>%
split(.$.id)
Given the posted image of the data structure, the following might work.
library(lubridate)
library(dplyr)
listDF %>%
purrr::map(function(x){
x %>%
filter(month(Date) >= 6 & month(Date) <= 11) %>%
group_by(year(Date)) %>%
summarise(Max = max(DailyStreamflow, na.rm = TRUE), .groups = "keep")
})
Test data creation code.
fun <- function(year, n){
d1 <- as.Date(paste(year, 1, 1, sep = "-"))
d2 <- as.Date(paste(year + 10, 12, 31, sep = "-"))
d <- seq(d1, d2, by = "day")
d <- sort(rep(sample(d, n, TRUE), length.out = n))
flow <- sample(10*n, n, TRUE)
data.frame(Date = d, DailyStreamflow = flow)
}
set.seed(2020)
listDF <- lapply(1:3, function(i) fun(c(1953, 1965, 1980)[i], c(24601, 13270, 17761)[i]))
str(listDF)
rm(fun)

Trying to add a new row based on a check in R

I have a data set that looks like this:
library(dplyr)
library(lubridate)
s <- c(1,1,1)
r <- c("2017-01-01 12:34:17", "2017-01-01 12:52:18", "2017-01-01 13:17:18")
t <- c(1,1,1)
g <- as.data.frame(matrix(c(s, as.POSIXct(r), t), nrow = 3, ncol = 3))
names(g) <- c("DeviceId", "Time", "Success/Fail")
g$Time <- as.POSIXct(g$Time, origin = '1970-01-01')
I am trying to write a function that loops through the data set and checks to see if the row and its successor's Time are more than 15 minutes apart. Then, the loop would add a row to the data set with the same DeviceId, the row's time plus 15 minutes, and 0 in the Success/Fail column. Here's what I've come up with:
f <- function(g) {
for(i in 2:nrow(g)) {
if(g$Time[i] - g$Time[i-1] >= 15) {
q <- list(g$DeviceId[i-1], g$Time[i-1] + minutes(15), 0)
y <- data.frame()
y <- rbind(g, q)
arrange(y, Time)
} else NULL
}
}
f(g)
I think this might be what you are after. I am sort of unclear about the success/fail indicator (-1 assigned to cases where the times are less than 15 minutes apart). It avoids the loop by using the lag() function in dplyr. Presumably, your data has more than one device so I added group_by(DeviceId)
x <- g %>%
group_by(DeviceId) %>%
mutate(
lTime = lag(Time, order_by = Time),
dTime = Time - lTime,
`Success/Fail` = if_else(dTime >= 15, 0, -1),
newTime = Time + minutes(15)
)
y <- x %>%
select(DeviceId, newTime, `Success/Fail`) %>%
rename(Time = newTime) %>%
ungroup() %>%
rbind(g, .)
Here is another option. I think the previous example is removing the first row? when the last row should be dropped (no time period after it).
g <- data.frame(DeviceId = rep(1,3),
Time = ymd_hms(c("2017-01-01 12:34:17", "2017-01-01 12:52:18", "2017-01-01 13:17:18")),
Success_Fail = rep(1,3))
g %>%
transmute(DeviceId = DeviceId,
Time = Time,
t = lead(Time)) %>%
drop_na %>%
rowwise() %>%
mutate(t2 = if((t - Time) > 15) {Time + minutes(15)} else {NA},
Success_Fail = 0) %>%
dplyr::select(DeviceId, Time = t2, Success_Fail) %>%
bind_rows(g) %>%
arrange(Time)

Resources