Closest subsequent index for a specified value - r

Consider a vector:
int = c(1, 1, 0, 5, 2, 0, 0, 2)
I'd like to get the closest subsequent index (not the difference) for a specified value. The first parameter of the function should be the vector, while the second should be the value one wants to see the closest subsequent elements.
For instance,
f(int, 0)
# [1] 2 1 0 2 1 0 0 NA
Here, the first element of the vector (1) is two positions away from the first subsequent 0, (3 - 1 = 2), so it should return 2. Then the second element is 1 position away from a 0 (2 - 1 = 1). When there is no subsequent values that match the specified value, return NA (here it's the case for the last element, because no subsequent value is 0).
Other examples:
f(int, 1)
# [1] 0 0 NA NA NA NA NA NA
f(int, 2)
# [1] 4 3 2 1 0 2 1 0
f(int, 3)
# [1] NA NA NA NA NA NA NA NA
This should also work for character vectors:
char = c("A", "B", "C", "A", "A")
f(char, "A")
# [1] 0 2 1 0 0

Find the location of each value (numeric or character)
int = c(1, 1, 0, 5, 2, 0, 0, 2)
value = 0
idx = which(int == value)
## [1] 3 6 7
Expand the index to indicate the nearest value of interest, using an NA after the last value in int.
nearest = rep(NA, length(int))
nearest[1:max(idx)] = rep(idx, diff(c(0, idx))),
## [1] 3 3 3 6 6 6 7 NA
Use simple arithmetic to find the difference between the index of the current value and the index of the nearest value
abs(seq_along(int) - nearest)
## [1] 2 1 0 2 1 0 0 NA
Written as a function
f <- function(x, value) {
idx = which(x == value)
nearest = rep(NA, length(x))
if (length(idx)) # non-NA values only if `value` in `x`
nearest[1:max(idx)] = rep(idx, diff(c(0, idx)))
abs(seq_along(x) - nearest)
}
We have
> f(int, 0)
[1] 2 1 0 2 1 0 0 NA
> f(int, 1)
[1] 0 0 NA NA NA NA NA NA
> f(int, 2)
[1] 4 3 2 1 0 2 1 0
> f(char, "A")
[1] 0 2 1 0 0
> f(char, "B")
[1] 1 0 NA NA NA
> f(char, "C")
[1] 2 1 0 NA NA
The solution doesn't involve recursion or R-level loops, so should e fast even for long vectors.

Look for the match from nth position to the end of the vector, then get the 1st match:
f <- function(v, x){
sapply(seq_along(v), function(i){
which(v[ i:length(v) ] == x)[ 1 ] - 1
})
}
f(int, 0)
# [1] 2 1 0 2 1 0 0 NA
f(int, 1)
# [1] 0 0 NA NA NA NA NA NA
f(int, 2)
# [1] 4 3 2 1 0 2 1 0
f(int, 3)
# [1] NA NA NA NA NA NA NA NA
f(char, "A")
# [1] 0 2 1 0 0

Using sequence:
f <- function(v, x){
d = diff(c(0, which(v == x)))
vec <- sequence(d, d-1, by = -1)
length(vec) <- length(int)
vec
}
Output
int = c(1, 1, 0, 5, 2, 0, 0, 2)
char = c("A", "B", "C", "A", "A")
f(int, 0)
# [1] 2 1 0 2 1 0 0 NA
f(int, 1)
# [1] 0 0 NA NA NA NA NA NA
f(int, 2)
# [1] 4 3 2 1 0 2 1 0
f(char, "A")
# [1] 0 2 1 0 0
Benchmark (n = 1000):
set.seed(123)
int = sample(0:100, size = 1000, replace = T)
library(microbenchmark)
bm <- microbenchmark(
fSequence(int, 0),
fzx8754(int, 0),
fRecursive(int, 0),
fMartinMorgan(int, 0),
fMap2dbl(int, 0),
fReduce(int, 0),
fAve(int, 0),
fjblood94(int, 0),
times = 10L,
setup = gc(FALSE)
)
autoplot(bm)
Martin Morgan's solution seems to be the quickest, followed by this answer's sequence solution, sbarbit's recursive solution, and jblood94's for loop solution.
Functions used:
fSequence <- function(v, x){
vec <- sequence(diff(c(0, which(v == x))), diff(c(0, which(v == x))) - 1, by = -1)
length(vec) <- length(v)
vec
}
fzx8754 <- function(v, x){
sapply(seq_along(v), function(i){
which(v[ i:length(v) ] == x)[ 1 ] - 1
})
}
fRecursive <- function(lookup,val ) {
ind <- which(lookup == val)[1] -1
if (length(lookup) > 1) {
c(ind, f(lookup[-1], val))
} else {
ind
}
}
fMartinMorgan <- function(x, value) {
idx = which(x == value)
nearest = rep(NA, length(x))
nearest[1:max(idx)] = rep(idx, diff(c(0, idx)))
abs(seq_along(x) - nearest)
}
fMap2dbl <- function(int, num)
{
n <- length(int)
map2_dbl(num, 1:n, ~ ifelse(length(which(.x == int[.y:n])) == 0, NA,
min(which(.x == int[.y:n])) - 1))
}
fReduce <- function(vec, value) {
replace(
Reduce(
function(x, y)
x + (y * x) ,
vec != value,
right = TRUE,
accumulate = TRUE
),
max(tail(which(vec == value), 1), 0) < seq_along(vec),
NA
)
}
fAve <- function(init, k) {
ave(
seq_along(init),
c(0, head(cumsum(init == k), -1)),
FUN = function(x) if (any(x == k)) rev(seq_along(x) - 1) else NA
)
}
fjblood94 <- function(v, val) {
out <- integer(length(v))
if (v[length(v)] != val) out[length(v)] <- NA_integer_
for (i in (length(v) - 1L):1) {
if (v[i] == val) {
out[i] <- 0L
} else {
out[i] <- out[i + 1L] + 1L
}
}
return(out)
}

Here f is defined as a recursive function that calls itself over shorter tails of the lookup vector:
f <- function(lookup,val ) {
ind <- which(lookup == val)[1] -1
if (length(lookup) > 1) {
c(ind, f(lookup[-1], val))
} else {
ind
}
}

Here is an approach using Reduce() and then some fiddling to get the NA values.
f <- function(vec, value) {
replace(
Reduce(
function(x, y)
x + (y * x) ,
vec != value,
right = TRUE,
accumulate = TRUE
),
max(tail(which(vec == value), 1), 0) < seq_along(vec),
NA
)
}
f(int, 0)
[1] 2 1 0 2 1 0 0 NA
f(int, 1)
[1] 0 0 NA NA NA NA NA NA
f(int, 2)
[1] 4 3 2 1 0 2 1 0
f(int, 3)
[1] NA NA NA NA NA NA NA NA
char = c("A", "B", "C", "A", "A")
f(char, "A")
[1] 0 2 1 0 0

Another possible solution, based on purrr::map2_dbl:
library(purrr)
int = c(1, 1, 0, 5, 2, 0, 0, 2)
f <- function(int, num)
{
n <- length(int)
map2_dbl(num, 1:n, ~ ifelse(length(which(.x == int[.y:n])) == 0, NA,
min(which(.x == int[.y:n])) - 1))
}
f(int, 0)
#> [1] 2 1 0 2 1 0 0 NA
f(int, 1)
#> [1] 0 0 NA NA NA NA NA NA
f(int, 2)
#> [1] 4 3 2 1 0 2 1 0
f(int, 3)
#> [1] NA NA NA NA NA NA NA NA
char = c("A", "B", "C", "A", "A")
f(char, "A")
#> [1] 0 2 1 0 0

A single-pass for loop is simple and efficient:
f1 <- function(v, val) {
out <- integer(length(v))
if (v[length(v)] != val) out[length(v)] <- NA_integer_
for (i in (length(v) - 1L):1) {
if (v[i] == val) {
out[i] <- 0L
} else {
out[i] <- out[i + 1L] + 1L
}
}
return(out)
}
int <- c(1, 1, 0, 5, 2, 0, 0, 2)
chr <- c("A", "B", "C", "A", "A")
f1(int, 0)
#> [1] 2 1 0 2 1 0 0 NA
f1(chr, "A")
#> [1] 0 2 1 0 0
Benchmarking against other solutions:
f2 <- function(v, x){
sapply(seq_along(v), function(i){
which(v[ i:length(v) ] == x)[ 1 ] - 1
})
}
f3 <- function(lookup,val ) {
ind <- which(lookup == val)[1] -1
if (length(lookup) > 1) {
c(ind, f3(lookup[-1], val))
} else {
ind
}
}
f4 <- function(x, value) {
idx = which(x == value)
nearest = rep(NA, length(x))
nearest[1:max(idx)] = rep(idx, diff(c(0, idx)))
abs(seq_along(x) - nearest)
}
f5 <- function(vec, value) {
replace(
Reduce(
function(x, y)
x + (y * x) ,
vec != value,
right = TRUE,
accumulate = TRUE
),
max(tail(which(vec == value), 1), 0) < seq_along(vec),
NA
)
}
microbenchmark::microbenchmark(f1 = {f1(int, 0); f1(chr, "A")},
f2 = {f2(int, 0); f2(chr, "A")},
f3 = {f3(int, 0); f3(chr, "A")},
f4 = {f4(int, 0); f4(chr, "A")},
f5 = {f5(int, 0); f5(chr, "A")},
check = "equal")
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> f1 6.0 7.50 8.990 8.40 9.60 18.3 100
#> f2 54.2 61.45 71.752 65.55 79.40 131.8 100
#> f3 25.5 28.60 33.393 30.75 35.90 105.2 100
#> f4 22.3 26.30 30.599 28.00 32.65 82.4 100
#> f5 59.7 64.55 73.474 69.10 75.70 157.0 100

A base R option using ave + cumsum
f <- function(init, k) {
ave(
seq_along(init),
c(0, head(cumsum(init == k), -1)),
FUN = function(x) if (any(x == k)) rev(seq_along(x) - 1) else NA
)
}
and you will see
> f(init, 0)
[1] 2 1 0 2 1 0 0 NA
> f(init, 1)
[1] 0 0 NA NA NA NA NA NA
> f(init, 2)
[1] 4 3 2 1 0 2 1 0
> f(init, 3)
[1] NA NA NA NA NA NA NA NA

Related

Change values in a dataframe according to cases

I have the following dataframe:
> dput(df)
structure(list(x = c(0.871877138037235, 0.534444199409336, 0.677225327817723,
0.124835065566003, 0.972407285822555, 0.179870884865522, 0.468708630651236,
0.405605535488576, 0.717907374724746, 0.157441936200485), y = c(0,
1, 2, 0, 0, 0, 0, 0, 1, 0)), class = "data.frame", row.names = c(NA,
-10L))
i.e.
> df
x y
1 0.8718771 0
2 0.5344442 1
3 0.6772253 2
4 0.1248351 0
5 0.9724073 0
6 0.1798709 0
7 0.4687086 0
8 0.4056055 0
9 0.7179074 1
10 0.1574419 0
I would like to obtain a new dataframe considering the following rules:
If in column y appear 1 and 2 (or 2 and 1) sequentially, then multiply the next 3 values in column x by -1.4
If in column y appears 1 (and just 1), then multiply the next 3 values column x by -1
If in column y appear 1 and 3 (or 3 and 1) sequentially, then multiply the next 3 values column x by -0.6
If in column y appears 2 (and just 2), then multiply the next 3 values column x by 1.4
In our case the desired result is:
> df
x y
1 0.8718771 0
2 0.5344442 1
3 0.6772253 2
4 -0.1747691 0
5 -1.36137 0
6 -0.2518193 0
7 0.4687086 0
8 0.4056055 0
9 0.7179074 1
10 -0.1574419 0
This solution may sound ugly but I think it's quite stable, however it may need further testings and improvements:
library(dplyr)
# First I set out to detect every observation that falls into any of the 4 categories
df %>%
mutate(z = case_when(
lag(y, n = 2, default = 0) %in% c(1, 2) & lag(y, default = 0) %in% c(2, 1) ~ 1,
lag(y, n = 2, default = 0) == 0 & lag(y, default = 0) == 1 & y == 0 ~ 2,
lag(y, n = 2, default = 0) %in% c(1, 3) & lag(y, default = 0) %in% c(2, 1) ~ 3,
lag(y, n = 2, default = 0) == 0 & lag(y, default = 0) == 2 & y == 0 ~ 4,
TRUE ~ 0
)) -> DF
# Then I wrote a custom function to apply multiplication phase on a sequence of three rows
fn <- function(x) {
out <- x$x
for(i in 1:nrow(x)) {
if(x$z[i] == 1) {
out[i:(i+2)] <- out[i:(i+2)] * (-1.4)
} else if(x$z[i] == 2) {
out[i:(i+2)] <- out[i:(i+2)] * (-1)
} else if(x$z[i] == 3) {
out[i:(i+2)] <- out[i:(i+2)] * (-0.6)
} else if(x$z[i] == 4) {
out[i:(i+2)] <- out[i:(i+2)] * (1.4)
} else {
out[i:(i+2)] <- out[i:(i+2)] * 1
}
}
dt <- cbind(new_x = out[!is.na(out)], y = x$y) |> as.data.frame()
dt
}
fn(DF)
new_x y
1 0.8718771 0
2 0.5344442 1
3 0.6772253 2
4 -0.1747691 0
5 -1.3613702 0
6 -0.2518192 0
7 0.4687086 0
8 0.4056055 0
9 0.7179074 1
10 -0.1574419 0
A for loop
df <- structure(list(x = c(0.871877138037235, 0.534444199409336, 0.677225327817723,
0.124835065566003, 0.972407285822555, 0.179870884865522, 0.468708630651236,
0.405605535488576, 0.717907374724746, 0.157441936200485), y = c(0,
1, 2, 0, 0, 0, 0, 0, 1, 0)), class = "data.frame", row.names = c(NA,
-10L))
df
#> x y
#> 1 0.8718771 0
#> 2 0.5344442 1
#> 3 0.6772253 2
#> 4 0.1248351 0
#> 5 0.9724073 0
#> 6 0.1798709 0
#> 7 0.4687086 0
#> 8 0.4056055 0
#> 9 0.7179074 1
#> 10 0.1574419 0
for(i in 2:nrow(df)){
if((df$y[i] == 1 & df$y[i+1] ==2) | (df$y[i] == 2 & df$y[i+1] ==1)) {
df$x[seq(i+2, by = 1, length.out= min(nrow(df) - (i+1), 3))] <- df$x[seq(i+2, by = 1, length.out=min(nrow(df) - (i+1), 3))] * -1.4
} else if ((df$y[i] == 1 & df$y[i+1] ==3) | (df$y[i] == 3 & df$y[i+1] ==1)){
df$x[seq(i+2, by = 1, length.out= min(nrow(df) - (i+1), 3))] <- df$x[seq(i+2, by = 1,length.out= min(nrow(df) - (i+1), 3))] * -0.6
} else if (df$y[i] == 1 & !df$y[i+1] %in% c(1,2,3) & !df$y[i-1] %in% c(1,2,3) ) {
df$x[seq(i+1, by = 1, length.out=min(nrow(df) - (i), 3)) ] <- df$x[seq(i+1, by = 1, length.out= min(nrow(df) - (i), 3))] * -1
} else if (df$y[i] == 2 & !df$y[i+1] %in% c(1,2,3) & !df$y[i-1] %in% c(1,2,3)) {
df$x[seq(i+1, by = 1, length.out=min(nrow(df) - (i), 3)) ] <- df$x[seq(i+1, by = 1, length.out=min(nrow(df) - (i), 3))] * 1.4
}
}
df
#> x y
#> 1 0.8718771 0
#> 2 0.5344442 1
#> 3 0.6772253 2
#> 4 -0.1747691 0
#> 5 -1.3613702 0
#> 6 -0.2518192 0
#> 7 0.4687086 0
#> 8 0.4056055 0
#> 9 0.7179074 1
#> 10 -0.1574419 0
Created on 2021-06-26 by the reprex package (v2.0.0)

R: "length.out" argument from the seq() function not working properly

I'm trying to build this function to check the multiples of 3, from 0 to the half of the element "number". I'm adding "n" that limits the number of results that I will get.
function1 <- function(number, n){
half <- number / 2
lessequal <- seq.int(from = 0, to = half, length.out = n)
multiple <- (lessequal %% 3) == 0
return (lessequal [multiple])
}
When I run this function with n = 2
function1 (24, 2)
[1] 0 12
When the expected result would be:
[1] 0 3
If I run it with n = 4. The outcome is always 2 elements instead of 4.
function1 (12, 4)
[1] 0 12
When I expected to get:
[1] 0 3 6 9
What am I doing wrong?
Thanks.
Try with this code:
function1 <- function(number, n){
half <- number / 2
lessequal <- seq.int(from = 0, to = half, by=1)
multiple <- (lessequal %% 3) == 0
vals <- lessequal [multiple]
vals <- vals[1:n]
return (vals)
}
function1 (24, 4)
Output:
function1 (24, 4)
[1] 0 3 6 9
For most of the cases you can get the output with seq function itself without passing numbers.
function1 <- function(n){
seq(0, length.out = n, by = 3)
}
function1(2)
#[1] 0 3
function1(4)
#[1] 0 3 6 9
If there are going to be cases when first n multiples of 3 will be higher than number/2 you can use :
function1 <- function(number, n){
val <- seq(0, number/2, by = 3)
val[1:n]
}
function1(24, 2)
#[1] 0 3
function1(24, 4)
#[1] 0 3 6 9
function1(12, 4)
#[1] 0 3 6 NA
we can use %/% (natural division) and then grab the first n numbers that can be divided by 3 i.e 3*k-1 as we start the vector from 0
get_odd_n <- function(x, n) lapply(x, function(i) (0:(i%/%2))[3*(0:(n-1))+1])
setNames(get_odd_n(6*1:6+4, 7), 6*1:6+4)
$`10`
[1] 0 3 NA NA NA NA NA
$`16`
[1] 0 3 6 NA NA NA NA
$`22`
[1] 0 3 6 9 NA NA NA
$`28`
[1] 0 3 6 9 12 NA NA
$`34`
[1] 0 3 6 9 12 15 NA
$`40`
[1] 0 3 6 9 12 15 18

Multiple for loop time computation very high in R

I have data about machines in the following form
Number of rows - 900k
Data
A B C D E F G H I J K L M N
---- -- --- ---- --- --- --- --- --- --- --- --- --- ---
1 1 1 1 1 1 1 1 1 1 0 1 1 0 0
2 0 0 0 0 1 1 1 0 1 1 0 0 1 0
3 0 0 0 0 0 0 0 1 1 1 1 1 0 0
1 indicates that the machine was active and 0 indicates that it was inactive.
I want my output to look like
A B C D E F G H I J K L M N
---- -- --- ---- --- --- --- --- --- --- --- --- --- ---
1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
2 0 0 0 0 1 1 1 1 1 1 0 0 1 0
3 0 0 0 0 0 0 0 1 1 1 1 1 0 0
Basically all I am trying to do is look for zeros in a particular row and if that zero is surrounded by ones on either side, replace 0 with 1
example -
in row 1 you have zero in column J
but you also have 1 in column I and K
which means I replace that 0 by 1 because it is surrounded by 1s
The code I am using is this
for(j in 2:13) {
if(data[i,j]==0 && data[i,j-1]==1 && data[i,j+1]==1){
data[i,j] = 1
}
}
}
Is there a way to reduce the time computation for this? This takes me almost 30 mins to run in R. Any help would be appreciated.
this is faster because it does not require to iterate through the rows.
for(j in 2:13) {
data[,j] = ifelse(data[,j-1] * data[,j+1]==1,1,data[,j])
}
or a littlebit more optimized, without using ifelse
for(j in 2:(ncol(data) - 1)) {
data[data[, j - 1] * data[, j + 1] == 1, j] <- 1
}
You could also use gsub to replace any instances of 101 with 111 using the following code:
collapsed <- gsub('101', '111', apply(df1, 1, paste, collapse = ''))
data <- as_tibble(t(matrix(unlist(sapply(collapsed, strsplit, split = '')), nrow = numLetters)))
names(data) <- LETTERS[1:numLetters]
Here's a comparison of everyone's solutions:
library(data.table)
library(rbenchmark)
library(tidyverse)
set.seed(1)
numLetters <- 13
df <- as_tibble(matrix(round(runif(numLetters * 100)), ncol = numLetters))
names(df) <- LETTERS[1:numLetters]
benchmark(
'gsub' = {
data <- df
collapsed <- gsub('101', '111', apply(data, 1, paste, collapse = ''))
data <- as_tibble(t(matrix(unlist(sapply(collapsed, strsplit, split = '')), nrow = numLetters)))
names(data) <- LETTERS[1:numLetters]
},
'for_orig' = {
data <- df
for(i in 1:nrow(data)) {
for(j in 2:(ncol(data) - 1)) {
if(data[i, j] == 0 && data[i, j - 1] == 1 && data[i, j + 1] == 1) {
data[i, j] = 1
}
}
}
},
'for_norows' = {
data <- df
for(j in 2:(ncol(data) - 1)) {
data[, j] = ifelse(data[, j - 1] * data[, j + 1] == 1, 1, data[, j])
}
},
'vectorize' = {
data <- df
for(i in seq(ncol(data) - 2) + 1) {
condition <- data[, i - 1] == data[, i + 1] & data[, i - 1] == 1 & data[, i] == 0
data[which(condition), i] <- 1
}
},
'index' = {
data <- df
idx <- apply(data, 1, function(x) c(0, diff(x)))
data[which(idx == -1 & lead(idx == 1), arr.ind = TRUE)[, 2:1]] <- 1
},
replications = 100
)
The indexing solution (which has since been deleted) wins hands-down in terms of computational time for a 13-by-100 data frame.
test replications elapsed relative user.self sys.self user.child
3 for_norows 100 1.19 7.438 1.19 0 NA
2 for_orig 100 9.29 58.063 9.27 0 NA
1 gsub 100 0.28 1.750 0.28 0 NA
5 index 100 0.16 1.000 0.16 0 NA
4 vectorize 100 0.87 5.438 0.87 0 NA
sys.child
3 NA
2 NA
1 NA
5 NA
4 NA
Cut the time by using vectorized operations. As you are planning to do the same thing for every row, this can be done by utilizing the vectorized conditional statements.
for(i in seq(ncol(data) - 2) + 1){ #<== all but last and first column
#Find all neighbouring columns that are equal, where the the center column is equal to 0
condition <- data[, i - 1] == data[, i + 1] & data[, i - 1] == 1 & data[, i] == 0
#Overwrite only the values that holds the condition
data[which(condition), i] <- 1
}
You can avoid loops altogether and use indexing to replace all the values at once:
nc <- ncol(df)
df[, 2:(nc - 1)][df[, 1:(nc - 2)] * df[, 3:nc] == 1] <- 1

repeating apply with different combination of cols

I can´t nest an apply funtion to repeat it many times with other combination of columns
I need to get a percentage of sp1==1 & s1==1and sp2==1 & s1==1 regarding s1, and in the same way regarding s2, s3... s1000. Here a short example:
x <- data.frame("sp1"=rep(0:1, times=5),
"sp2"=rep(0:1, each=5),
"s1" = rep(0:1, times=10),
"s2" = rep(0:1, each=2),
"s3" = rep(1:0, each=2))
> x
sp1 sp2 s1 s2 s3
1 0 0 0 0 1
2 1 0 1 0 1
3 0 0 0 1 0
4 1 0 1 1 0
5 0 0 0 0 1
6 1 1 1 0 1
7 0 1 0 1 0
8 1 1 1 1 0
9 0 1 0 0 1
10 1 1 1 0 1
11 0 0 0 1 0
12 1 0 1 1 0
13 0 0 0 0 1
14 1 0 1 0 1
15 0 0 0 1 0
16 1 1 1 1 0
17 0 1 0 0 1
18 1 1 1 0 1
19 0 1 0 1 0
20 1 1 1 1 0
Now I typed a function to calculate percentage regarding s1:
r <- as.data.frame(sapply(x[,1:2],
function(i) sum(i ==1 & x$s1 == 1)/sum(i ==1)))
> r
sapply(x[, 1:2], function(i) sum(i == 1 & x$s1 == 1)/sum(i == 1))
sp1 1.0
sp2 0.6
I want to built a df with all percentages of sp1, sp2, sp3, ...sp200 regarding s1, s2, s3, ...s1000...
> r
s1 s2 s3 ... s1000
sp1 1.0 0.5 0.5
sp2 0.6 0.5 0.5
...
sp200
I've tried to do a function with both groups-variables, one for sp's and another for s's:
intento <- as.data.frame(sapply(i=x[,1:2],
j=x[,3:5],
function(i,j)sum(i ==1 & j == 1)/sum(i ==1)))
But logically that´s not the way:
Error in match.fun(FUN) : argument "FUN" is missing, with no default
We can seperate the columns based on their names and use sapply on them
sp_cols <- grep("^sp", names(x))
s_cols <- grep("^s\\d+", names(x))
sapply(x[sp_cols], function(i) sapply(x[s_cols],
function(j) sum(i == 1 & j == 1)/sum(i == 1)))
If you have only 1 and 0's as values in the columns this can be reduced to
sapply(x[s_cols], function(i) sapply(x[sp_cols], function(j) sum(i & j)/sum(j)))
# s1 s2 s3
#sp1 1.0 0.5 0.5
#sp2 0.6 0.5 0.5
You're looking for outer. Your function just needs to be Vectorized.
FUN <- Vectorize(function(i,j) sum(x[i] == 1 & x[j] == 1)/sum(x[i] == 1))
outer(1:2, 3:5, FUN)
# [,1] [,2] [,3]
# [1,] 1.0 0.5 0.5
# [2,] 0.6 0.5 0.5
You could refine this using grep to find the columns automatically
outer(grep("sp", names(x)), grep("s\\d+", names(x)), FUN)
A similar approach is to use lapply(x, function(x) which(x == 1) and then use that down the road. The thought process being that we might as well store the information instead of repeatedly checking it.
#as suggested by #Ronak
sp_cols <- grep("^sp", names(x))
s_cols <- grep("^s\\d+", names(x))
x_l_zero <- lapply(x, function(x) which(x == 1))
sapply(x_l_zero[s_cols]
, function(x) sapply(x_l_zero[sp_cols]
, function(y) length(intersect(x,y))/length(y)))
s1 s2 s3
sp1 1.0 0.5 0.5
sp2 0.6 0.5 0.5
#Ronak has the fastest solution and is more-or-less the OP's code that's been addressed.
Unit: microseconds
expr min lq mean median uq max neval
jay.sf_outer_FUN 1190.8 1240.85 1360.103 1284.50 1337.30 2627.0 100
cole_which_apply 268.4 289.00 454.609 306.05 322.00 7610.7 100
ronak_1_unsimple 181.3 193.95 321.863 209.95 233.40 6227.4 100
ronak_2_simple 228.5 241.25 342.354 250.65 276.05 7478.4 100
akrun_dplyr 5218.7 5506.05 6108.997 5721.80 6081.65 25147.3 100
Code for performance:
library(microbenchmark)
library(tidyverse)
##data set
x <- data.frame("sp1"=rep(0:1, times=5),
"sp2"=rep(0:1, each=5),
"s1" = rep(0:1, times=10),
"s2" = rep(0:1, each=2),
"s3" = rep(1:0, each=2))
#for jay.sf
FUN <- Vectorize(function(i,j) sum(x[i] == 1 & x[j] == 1)/sum(x[i] == 1))
#names of columns
sp_cols <- grep("^sp", names(x))
s_cols <- grep("^s\\d+", names(x))
sp_cols_nam <- grep("^sp", names(x), value = T)
s_cols_nam <- grep("^s\\d+", names(x), value = T)
#benchmark
microbenchmark(
outer_FUN = {
outer(sp_cols, s_cols, FUN)
}
, apply_heaven = {
x_l_zero <- lapply(x, function(x) which(x == 1))
sapply(x_l_zero[s_cols], function(x) sapply(x_l_zero[sp_cols] , function(y) length(intersect(x,y))/length(y)))
}
, ronak_1_unsimple = {
sapply(x[sp_cols], function(i) sapply(x[s_cols],
function(j) sum(i == 1 & j == 1)/sum(i == 1)))
}
, ronak_2_simple = {
sapply(x[s_cols], function(i) sapply(x[sp_cols], function(j) sum(i & j)/sum(j)))
}
, akrun_dplyr = {
crossing(nm1 = sp_cols_nam,
nm2 = s_cols_nam) %>%
mutate(val = pmap_dbl(., ~ sum(x[..1] ==1 & x[..2] == 1)/sum(x[..1]))) %>%
spread(nm2, val)
}
)
Here is an option with tidyverse
library(tidyverse)
crossing(nm1 = names(x)[startsWith(names(x), "sp")],
nm2 = grep("^s\\d+", names(x), value = TRUE)) %>%
mutate(val = pmap_dbl(., ~ sum(x[..1] ==1 & x[..2] == 1)/sum(x[..1]))) %>%
spread(nm2, val)
# A tibble: 2 x 4
# nm1 s1 s2 s3
# <chr> <dbl> <dbl> <dbl>
#1 sp1 1 0.5 0.5
#2 sp2 0.6 0.5 0.5

Problems with control statement if...else in R

I am a beginner in R and here's my code:
for (i in 1:7){
testing<-vector(length=(length(yy)-3))
if(all(yy[i:(i+2)]==0))
testing[i]<-1
else
testing[i]<-NA
}
yy refers to the following vector of length 10:
> yy
[1] 1 0 0 0 0 1 0 1 0 1
testing is like a predict function output, which will predict a 1 if the previous 3 elements in yy are all 0.If not, it will not predict anything, and so NA. Since yy has a total of 10 elements, testing will have a total of 7 elements (Therefore length 7) However, instead of giving me a output of 1s and NAs, it is giving this:
> testing
[1] FALSE FALSE FALSE FALSE FALSE FALSE NA
I cannot figure out why, some help will be great. Thank you.
You should define testing outside the loop:
testing<-vector(length=(length(yy)-3))
for (i in 1:7){
if(all(yy[i:(i+2)]==0))
testing[i]<-1
else
testing[i]<-NA
}
testing
[1] NA 1 1 NA NA NA NA
For this task you couls also use rollapply from zoo:
library(zoo)
rollapply(yy, 3, function(x) ifelse(all(x == 0), 1, NA))
[1] NA 1 1 NA NA NA NA NA
Here are some additional vectorized ways of solving this
Base r stats::filter
N <- 3
NA^(stats::filter(yy == 0, rep(1, N), sides = 1)[-(1:N-1)] != N)
# [1] NA 1 1 NA NA NA NA NA
data.table::shift
NA^(Reduce(`+`, data.table::shift(yy == 0, 0:(N-1)))[-(1:N-1)] != N)
# [1] NA 1 1 NA NA NA NA NA
RcppRoll::roll_sum
NA^(RcppRoll::roll_sum(yy == 0, N) != N)
# [1] NA 1 1 NA NA NA NA NA
Some becnmarks (I've also added a compiled version of the for loop using compiler::cmpfun and two more efficient zoo solutions)
ForLoop <- function(yy, N){
testing<-vector(length=(length(yy)-N))
for (i in 1:length(testing)){
if(all(yy[i:(i+(N-1))]==0))
testing[i]<-1
else
testing[i]<-NA
}
testing
}
ForLoopBin <- compiler::cmpfun(ForLoop)
ZOO <- function(yy, N) zoo::rollapply(yy, N, function(x) ifelse(all(x == 0), 1, NA))
ZOO2 <- function(yy, N) NA^!zoo::rollapply(yy == 0, N, all)
ZOO3 <- function(yy, N) NA^(zoo::rollsum(yy == 0, N) != N)
RCPPROLL <- function(yy, N) NA^(RcppRoll::roll_sum(yy == 0, N) != N)
BaseFilter <- function(yy, N) NA^(stats::filter(yy == 0, rep(1, N), sides = 1)[-(1:N-1)] != N)
DTShift <- function(yy, N) NA^(Reduce(`+`, data.table::shift(yy == 0, 0:(N-1)))[-(1:N-1)] != N)
set.seed(123)
yy <- sample(0:1, 1e4, replace = TRUE)
N <- 3
library(microbenchmark)
microbenchmark(
"for loop" = ForLoop(yy, N),
"Compiled for loop" = ForLoopBin(yy, N),
"zoo1" = ZOO(yy, N),
"zoo2" = ZOO2(yy, N),
"zoo3" = ZOO3(yy, N),
"Rcpproll" = RCPPROLL(yy, N),
"stats::filter" = BaseFilter(yy, N),
"data.table::shift" = DTShift(yy, N)
)
# Unit: microseconds
# expr min lq mean median uq max neval cld
# for loop 25917.837 26858.936 30157.3927 28546.2595 29334.2430 110135.205 100 d
# Compiled for loop 7559.837 8208.142 9709.7256 8882.6875 9428.9155 22683.347 100 c
# zoo1 101699.548 107857.014 112210.5929 110402.3985 113335.7745 171537.068 100 f
# zoo2 72265.949 77788.709 81275.9028 80292.8135 81917.8985 153197.948 100 e
# zoo3 4584.861 4734.778 4939.3528 4785.9770 4853.6560 13228.514 100 b
# Rcpproll 216.636 246.076 290.7211 290.0745 311.3540 663.667 100 a
# stats::filter 425.912 475.350 536.0757 509.5900 544.6295 1497.568 100 a
# data.table::shift 334.394 365.593 443.2138 409.4325 424.6320 1944.279 100 a

Resources