Find the closest enclosing FALSE value positions - r

Is there a more elegant way to solve this problem?
For every TRUE value I'm looking for the positions of the closest previous and following FALSE values.
data:
vec <- c(FALSE, TRUE, TRUE, FALSE, TRUE, FALSE)
desired outcome: (something like)
pos start end
[1,] 2 1 4
[2,] 3 1 4
[3,] 5 4 6
explanation of the first row of the outcome:
pos = 2, position of the first TRUE,
start = 1, position of the closest FALSE in front of pos = 2
end = 4, position of the closest FALSE after pos = 2.
Already working solution:
pos = which(vec)
f_pos = which(!vec)
t(
sapply(pos, function(x){ s <- rev(f_pos[f_pos < x])[1]; e <- f_pos[x < f_pos][1]; return(data.frame(pos = x, start = s, end = e)) })
)

Using findInterval
pos <- which(vec)
b <- which(!vec)
ix <- findInterval(pos, b)
cbind(pos, from = b[ix], to = b[ix + 1])
# pos from to
# [1,] 2 1 4
# [2,] 3 1 4
# [3,] 5 4 6
If we stretch your "something like" slightly, a simple cut will do:
data.frame(pos, rng = cut(pos, b))
# pos rng
# 1 2 (1,4]
# 2 3 (1,4]
# 3 5 (4,6]
If the vector ends with TRUE, the findInterval solution will give NA in 'to' column. In cut, the last 'interval' is then coded as NA.

You can do as if FALSE defined intervals and use data.table::foverlaps to find the right ones:
library(data.table)
# put your objects in data.tables:
f_pos_inter <- data.table(start=head(f_pos, -1), end=tail(f_pos, -1))
pos_inter <- data.table(start=pos, end=pos)
# define the keys:
setkeyv(pos_inter, c("start", "end")); setkeyv(f_pos_inter, c("start", "end"))
res <- foverlaps(pos_inter, f_pos_inter)
# start end i.start i.end
#1: 1 4 2 2
#2: 1 4 3 3
#3: 4 6 5 5
You can further reorder the columns and keep only the ones you need:
res[, i.end:=NULL]
setcolorder(res, c(3, 1, 2))
setnames(res, "i.start", "pos")
res
# pos start end
#1: 2 1 4
#2: 3 1 4
#3: 5 4 6
N.B: this will give NA in both columns start and end if vec ends with TRUE

Related

List with output from for loop returns empty

I have written a code to obtain crosstab results of a rasterstack for different regions (delimited by a shapefile) covering the raster. However, I am getting an empty list.
This is the function:
transitions <- function(bound, themat) { # bound = shapefile # themat = rasterstack
result = vector("list", nrow(bound)) # empty result list
names(result) = bound#data$GEOCODIGO
for (i in 1:nrow(bound)) { # this is the number of polygons to iterate through
single <- bound[i,] # selects a single polygon
clip <- mask(crop(themat, single), single) # crops the raster to the polygon boundary
result[i] <- crosstab(clip, digits = 0, long = FALSE, useNA = FALSE)
return(result)
}
}
I have tested the steps for the first object in the shapefile/bound outside of the for loop; and it worked well. But I still cannot figure out why I am getting an empty list. Any ideas?
Example data:
p <- shapefile(system.file("external/lux.shp", package="raster"))
b <- brick(raster(p), nl=2)
values(b) = sample(2, 200, replace=TRUE)
fixed function:
transitions <- function(poly, rast) {
result = vector("list", nrow(poly))
for (i in 1:nrow(poly)) {
clip <- mask(crop(rast, poly[i,]), poly[i,])
result[[i]] <- crosstab(clip, digits = 0, long = FALSE, useNA = FALSE)
}
return(result)
}
transitions(p, b)
An alternative would be to use extract
e <- extract(b, p)
To tabulate as in crosstab:
ee <- lapply(e, function(x) aggregate(data.frame(count=rep(1, nrow(x))), data.frame(x), FUN=sum))
To understand that last line, you need to unpack it.
class(e)
#[1] "list"
length(e)
#[1] 12
e[[1]]
# layer.1 layer.2
#[1,] 1 1
#[2,] 1 2
#[3,] 2 2
#[4,] 2 1
#[5,] 2 1
#[6,] 1 2
#[7,] 2 2
e is a list with the same length as the number of polygons (see length(p))
Let's that the first element and aggregate it to get a table with cases and counts.
x <- e[[1]]
aggregate(data.frame(count=rep(1, nrow(x))), data.frame(x), FUN=sum)
# layer.1 layer.2 count
#1 1 1 1
#2 2 1 2
#3 1 2 2
#4 2 2 2
A similar approach via table (the difference is that you could get Freq values that are zero
as.data.frame(table(x[,1], x[,2]))
# Var1 Var2 Freq
#1 1 1 1
#2 2 1 2
#3 1 2 2
#4 2 2 2
Now wrap the function you like into a lapply
z <- lapply(e, function(x) aggregate(data.frame(count=rep(1, nrow(x))), data.frame(x), FUN=sum))
And to take it further, bind the data.frames and add an identifier to link the data back to the polygons
y <- do.call(rbind, z,)
y$id <- rep(1:length(z), sapply(z, nrow))
head(y)
# Var1 Var2 Freq id
#1 1 1 1 1
#2 2 1 2 1
#3 1 2 2 1
#4 2 2 2 1
#5 1 1 1 2
#6 2 1 2 2

Issue with local variables in r custom function

I've got a dataset
>view(interval)
# V1 V2 V3 ID
# 1 NA 1 2 1
# 2 2 2 3 2
# 3 3 NA 1 3
# 4 4 2 2 4
# 5 NA 5 1 5
>dput(interval)
structure(list(V1 = c(NA, 2, 3, 4, NA),
V2 = c(1, 2, NA, 2, 5),
V3 = c(2, 3, 1, 2, 1), ID = 1:5), row.names = c(NA, -5L), class = "data.frame")
I would like to extract the previous not NA value (or the next, if NA is in the first row) for every row, and store it as a local variable in a custom function, because I have to perform other operations on every row based on this value(which should change for every row i'm applying the function).
I've written this function to print the local variables, but when I apply it the output is not what I want
myFunction<- function(x){
position <- as.data.frame(which(is.na(interval), arr.ind=TRUE))
tempVar <- ifelse(interval$ID == 1, interval[position$row+1,
position$col], interval[position$row-1, position$col])
return(tempVar)
}
I was expecting to get something like this
# [1] 2
# [2] 2
# [3] 4
But I get something pretty messed up instead.
Here's attempt number 1:
dat <- read.table(header=TRUE, text='
V1 V2 V3 ID
NA 1 2 1
2 2 3 2
3 NA 1 3
4 2 2 4
NA 5 1 5')
myfunc1 <- function(x) {
ind <- which(is.na(x), arr.ind=TRUE)
# since it appears you want them in row-first sorted order
ind <- ind[order(ind[,1], ind[,2]),]
# catch first-row NA
ind[,1] <- ifelse(ind[,1] == 1L, 2L, ind[,1] - 1L)
x[ind]
}
myfunc1(dat)
# [1] 2 2 4
The problem with this is when there is a second "stacked" NA:
dat2 <- dat
dat2[2,1] <- NA
dat2
# V1 V2 V3 ID
# 1 NA 1 2 1
# 2 NA 2 3 2
# 3 3 NA 1 3
# 4 4 2 2 4
# 5 NA 5 1 5
myfunc1(dat2)
# [1] NA NA 2 4
One fix/safeguard against this is to use zoo::na.locf, which takes the "last observation carried forward". Since the top-row is a special case, we do it twice, second time in reverse. This gives us the "next non-NA value in the column (up or down, depending).
library(zoo)
myfunc2 <- function(x) {
ind <- which(is.na(x), arr.ind=TRUE)
# since it appears you want them in row-first sorted order
ind <- ind[order(ind[,1], ind[,2]),]
# this is to guard against stacked NA
x <- apply(x, 2, zoo::na.locf, na.rm = FALSE)
# this special-case is when there are one or more NAs at the top of a column
x <- apply(x, 2, zoo::na.locf, fromLast = TRUE, na.rm = FALSE)
x[ind]
}
myfunc2(dat2)
# [1] 3 3 2 4

Nested lapply() in a list?

I have a list l, which has the following features:
It has 3 elements
Each element is a numeric vector of length 5
Each vector contains numbers from 1 to 5
l = list(a = c(2, 3, 1, 5, 1), b = c(4, 3, 3, 5, 2), c = c(5, 1, 3, 2, 4))
I want to do two things:
First
I want to know how many times each number occurs in the entire list and I want each result in a vector (or any form that can allow me to perform computations with the results later):
Code 1:
> a <- table(sapply(l, "["))
> x <- as.data.frame(a)
> x
Var1 Freq
1 1 3
2 2 3
3 3 4
4 4 2
5 5 3
Is there anyway to do it without using the table() function. I would like to do it "manually". I try to do it right below.
Code 2: (I know this is not very efficient!)
x <- data.frame(
"1" <- sum(sapply(l, "[")) == 1
"2" <- sum(sapply(l, "[")) == 2
"3" <- sum(sapply(l, "[")) == 3
"4" <- sum(sapply(l, "[")) == 4
"5" <- sum(sapply(l, "[")) == 5)
I tried the following, but I did not work. I actually did not understand the result.
> sapply(l, "[") == 1:5
a b c
[1,] FALSE FALSE FALSE
[2,] FALSE FALSE FALSE
[3,] FALSE TRUE TRUE
[4,] FALSE FALSE FALSE
[5,] FALSE FALSE FALSE
> sum(sapply(l, "[") == 1:5)
[1] 2
Second
Now, I would like to get the number of times each number appears in the list, but now in each element $a, $b and $c. I thought about using the lapply() but I don't know how exactly. Following is what I tried, but it is inefficient just like Code 2:
lapply(l, function(x) sum(x == 1))
lapply(l, function(x) sum(x == 2))
lapply(l, function(x) sum(x == 3))
lapply(l, function(x) sum(x == 4))
lapply(l, function(x) sum(x == 5))
What I get with these 5 lines of code are 5 lists of 3 elements each containing a single numeric value. For example, the second line of code tells me how many times number 2 appears in each element of l.
Code 3:
> lapply(l, function(x) sum(x == 2))
$a
[1] 1
$b
[1] 1
$c
[1] 1
What I would like to obtain is a list with three elements containing all the information I am looking for.
Please, use the references "Code 1", "Code 2" and "Code 3" in your answers. Thank you very much.
Just use as.data.frame(l) for the second part and table(unlist(l)) for the first.
> table(unlist(l))
1 2 3 4 5
3 3 4 2 3
> data.frame(lapply(l, tabulate))
a b c
1 2 0 1
2 1 1 1
3 1 2 1
4 0 1 1
5 1 1 1`
For code 1/2, you could use sapply to obtain the counts for whichever values you wanted:
l = list(a = c(2, 3, 1, 5, 1), b = c(4, 3, 3, 5, 2), c = c(5, 1, 3, 2, 4))
data.frame(number = 1:5,
freq = sapply(1:5, function(x) sum(unlist(l) == x)))
# number freq
# 1 1 3
# 2 2 3
# 3 3 4
# 4 4 2
# 5 5 3
For code 3, if you wanted to get the counts for lists a, b, and c, you could just apply your frequency function to each element of the list with the lapply function:
freqs = lapply(l, function(y) sapply(1:5, function(x) sum(unlist(y) == x)))
data.frame(number = 1:5, a=freqs$a, b=freqs$b, c=freqs$c)
# number a b c
# 1 1 2 0 1
# 2 2 1 1 1
# 3 3 1 2 1
# 4 4 0 1 1
# 5 5 1 1 1
here you have another example with nested lapply().
created data:
list = NULL
list[[1]] = c(1:5)
list[[2]] = c(1:5)+3
list[[2]] = c(1:5)+4
list[[3]] = c(1:5)-1
list[[4]] = c(1:5)*3
list2 = NULL
list2[[1]] = rep(1,5)
list2[[2]] = rep(2,5)
list2[[3]] = rep(0,5)
The result is this; it serve to subtract each element of one list with all elements of the other list.
lapply(list, function(d){ lapply(list2, function(a,b) {a-b}, b=d)})

Label portions of a dataframe based on boolean value, including previous rows?

For a given dataframe, I'd like to split it based on some boolean value, and then apply a label to that row and the previous rows up until that point.
Assuming the following dataframe:
test <- data.frame(x = 1:10, y = c(F, F, F, T, F, F, T, F, F, F))
I'd ultimately like to create a new column that would contain a label for that specific portion of the dataframe. Ideally, something like the following:
x y z
1 F 1
2 F 1
3 F 1
4 T 1
5 F 2
6 F 2
7 T 2
8 F 3
9 F 3
10 F 3
My current thought is that I need to loop through the dataframe with a function similar to the following (but not exactly):
label.portion <- function(test) {
for (i in 1:nrow(test)) {
z <- 1
if(test$y[i]) { z <- z + 1 }
return(z)
}
}
What is the best/easiest way of doing this? Any help is much appreciated.
Your z column can be built as
z <- with(test, sum(y)-rev(cumsum(rev(y)))+1)
in order to make every new z value start at a FALSE y after a TRUE y, as per your example.
Then you can do cbind(test, z) to get what you want.
One liner solution using transform
transform(test,z= cumsum(c(0,diff(y)) == -1)+1)
x y z
1 1 FALSE 1
2 2 FALSE 1
3 3 FALSE 1
4 4 TRUE 1
5 5 FALSE 2
6 6 FALSE 2
7 7 TRUE 2
8 8 FALSE 3
9 9 FALSE 3
10 10 FALSE 3
Another one liner solution which will be slightly faster than other solutions (except data.table)
test <- data.frame(x = 1:10, y = c(F, F, F, T, F, F, T, F, F, F))
test$z <- c(1, head(cumsum(test$y), -1) + 1)
test
## x y z
## 1 1 FALSE 1
## 2 2 FALSE 1
## 3 3 FALSE 1
## 4 4 TRUE 1
## 5 5 FALSE 2
## 6 6 FALSE 2
## 7 7 TRUE 2
## 8 8 FALSE 3
## 9 9 FALSE 3
## 10 10 FALSE 3
Benchmarks with other solutions provided (excluding data.table)
test <- data.frame(x = 1:1e+05, y = sample(c(T, F), size = 1e+05, replace = TRUE))
microbenchmark(c(1, head(cumsum(test$y), -1) + 1), cumsum(c(0, diff(test$y)) == -1) + 1, with(test, sum(y) - rev(cumsum(rev(y))) +
1), times = 100)
## Unit: milliseconds
## expr min lq median uq max neval
## c(1, head(cumsum(test$y), -1) + 1) 1.685473 1.758474 1.865409 4.647218 5.091512 100
## cumsum(c(0, diff(test$y)) == -1) + 1 4.064867 4.379714 6.936561 7.338810 7.657961 100
## with(test, sum(y) - rev(cumsum(rev(y))) + 1) 2.568766 2.720395 5.396096 5.701176 30.642436 100
Here is an approach using na.locf from xts and data.table for coding elegance (and efficiency)
library(data.table)
library(xts) # for na.locf
test <- data.table(test)
test[(y), grp := seq_along(y)][, grp := na.locf(grp, fromLast = TRUE)]
test[is.na(grp), grp := max(test[, grp], na.rm =TRUE) + 1L]
And a far clearer and faster approach
test[, grp := {xx <- diff(c(0,.I[y], length(.I))); rep.int(seq_along(xx),xx)}]
Note that diff uses a for loop implemented in R, so an Rcpp sugar implementation) would be faster (I'm sure that a cpp function would blow most of these out of the water)

How to ignore case when using subset in R

How to ignore case when using subset function in R?
eos91corr.data <- subset(test.data,select=c(c(X,Y,Z,W,T)))
I would like to select columns with names x,y,z,w,t. what should i do?
Thanks
If you can live without the subset() function, the tolower() function may work:
dat <- data.frame(XY = 1:5, x = 1:5, mm = 1:5,
y = 1:5, z = 1:5, w = 1:5, t = 1:5, r = 1:5)
dat[,tolower(names(dat)) %in% c("xy","x")]
However, this will return a data.frame with the columns in the order they are in the original dataset dat: both
dat[,tolower(names(dat)) %in% c("xy","x")]
and
dat[,tolower(names(dat)) %in% c("x","xy")]
will yield the same result, although the order of the target names has been reversed.
If you want the columns in the result to be in the order of the target vector, you need to be slightly more fancy. The two following commands both return a data.frame with the columns in the order of the target vector (i.e., the results will be different, with columns switched):
dat[,sapply(c("x","xy"),FUN=function(foo)which(foo==tolower(names(dat))))]
dat[,sapply(c("xy","x"),FUN=function(foo)which(foo==tolower(names(dat))))]
You could use regular expressions with the grep function to ignore case when identifying column names to select. Once you have identified the desired column names, then you can pass these to subset.
If your data are
dat <- data.frame(xy = 1:5, x = 1:5, mm = 1:5, y = 1:5, z = 1:5,
w = 1:5, t = 1:5, r = 1:5)
# xy x mm y z w t r
# 1 1 1 1 1 1 1 1 1
# 2 2 2 2 2 2 2 2 2
# 3 3 3 3 3 3 3 3 3
# 4 4 4 4 4 4 4 4 4
# 5 5 5 5 5 5 5 5 5
Then
(selNames <- grep("^[XYZWT]$", names(dat), ignore.case = TRUE, value = TRUE))
# [1] "x" "y" "z" "w" "t"
subset(dat, select = selNames)
# x y z w t
# 1 1 1 1 1 1
# 2 2 2 2 2 2
# 3 3 3 3 3 3
# 4 4 4 4 4 4
# 5 5 5 5 5 5
EDIT If your column names are longer than one letter, the above approach won't work too well. So assuming you can get your desired column names in a vector, you could use the following:
upperNames <- c("XY", "Y", "Z", "W", "T")
(grepPattern <- paste0("^", upperNames, "$", collapse = "|"))
# [1] "^XY$|^Y$|^Z$|^W$|^T$"
(selNames2 <- grep(grepPattern, names(dat), ignore.case = TRUE, value = TRUE))
# [1] "xy" "y" "z" "w" "t"
subset(dat, select = selNames2)
# xy y z w t
# 1 1 1 1 1 1
# 2 2 2 2 2 2
# 3 3 3 3 3 3
# 4 4 4 4 4 4
# 5 5 5 5 5 5
The 'stringr' library is a very neat wrapper for all of this functionality. It has 'ignore.case' option as follows:
also, you may want to consider using match not subset.

Resources