I have a list of vectors:
> l <- list(A=c("one", "two", "three", "four"), B=c("one", "two"), C=c("two", "four", "five", "six"), D=c("six", "seven"))
> l
$A
[1] "one" "two" "three" "four"
$B
[1] "one" "two"
$C
[1] "two" "four" "five" "six"
$D
[1] "six" "seven"
I would like to calculate the length of the overlap between all possible pairwise combinations of the list elements, i.e. (the format of the result doesn't matter):
AintB 2
AintC 2
AintD 0
BintC 1
BintD 0
CintD 1
I know combn(x, 2) can be used to get a matrix of all possible pairwise combinations in a vector and that length(intersect(a, b)) would give me the length of the overlap of two vectors, but I can't think of a way to put the two things together.
Any help is much appreciated! Thanks.
If I understand correctly, you can look at crossprod and stack:
crossprod(table(stack(l)))
# ind
# ind A B C D
# A 4 2 2 0
# B 2 2 1 0
# C 2 1 4 1
# D 0 0 1 2
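To see why this works: stack turns the list into a long two-column data.frame of value/name pairs, table turns that into a value-by-element incidence matrix, and crossprod of that matrix counts shared values. The intermediate step (shown for intuition; not part of the original answer) looks like this:
table(stack(l))
#        ind
# values  A B C D
#   five  0 0 1 0
#   four  1 0 1 0
#   one   1 1 0 0
#   seven 0 0 0 1
#   six   0 0 1 1
#   three 1 0 0 0
#   two   1 1 1 0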
You can extend the idea if you want a data.frame of just the relevant values as follows:
Write a spiffy function
listIntersect <- function(inList) {
  X <- crossprod(table(stack(inList)))  # pairwise intersection counts
  X[lower.tri(X)] <- NA                 # drop the duplicated lower triangle
  diag(X) <- NA                         # drop self-intersections
  out <- na.omit(data.frame(as.table(X)))
  out[order(out$ind), ]
}
Apply it
listIntersect(l)
# ind ind.1 Freq
# 5 A B 2
# 9 A C 2
# 13 A D 0
# 10 B C 1
# 14 B D 0
# 15 C D 1
Performance seems pretty decent.
Expand the list:
L <- unlist(replicate(100, l, FALSE), recursive=FALSE)
names(L) <- make.unique(names(L))
Set up some functions to test:
fun1 <- function(l) listIntersect(l)
fun2 <- function(l) apply( combn( l , 2 ) , 2 , function(x) length( intersect( unlist( x[1]) , unlist(x[2]) ) ) )
fun3 <- function(l) {
  m1 <- combn(names(l), 2)
  val <- sapply(split(m1, col(m1)), function(x) {
    x1 <- l[[x[1]]]; x2 <- l[[x[2]]]
    length(intersect(x1, x2))
  })
  Ind <- apply(m1, 2, paste, collapse = "int")
  data.frame(Ind, val, stringsAsFactors = FALSE)
}
Check out the timings:
system.time(F1 <- fun1(L))
# user system elapsed
# 0.33 0.00 0.33
system.time(F2 <- fun2(L))
# user system elapsed
# 4.32 0.00 4.31
system.time(F3 <- fun3(L))
# user system elapsed
# 6.33 0.00 6.33
Everyone seems to be sorting the result differently, but the numbers match:
table(F1$Freq)
#
# 0 1 2 4
# 20000 20000 29900 9900
table(F2)
# F2
# 0 1 2 4
# 20000 20000 29900 9900
table(F3$val)
#
# 0 1 2 4
# 20000 20000 29900 9900
combn works with list structures as well; you just need a little unlist'ing of the result to use intersect...
# Get the combinations of names of list elements
nms <- combn( names(l) , 2 , FUN = paste0 , collapse = "" , simplify = FALSE )
# Make the combinations of list elements
ll <- combn( l , 2 , simplify = FALSE )
# Intersect the list elements
out <- lapply( ll , function(x) length( intersect( x[[1]] , x[[2]] ) ) )
# Output with names
setNames( out , nms )
#$AB
#[1] 2
#$AC
#[1] 2
#$AD
#[1] 0
#$BC
#[1] 1
#$BD
#[1] 0
#$CD
#[1] 1
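For a more compact variant, the three steps can also be collapsed into a single call (a sketch along the same lines, not from the original answer):
setNames( combn( l , 2 , function(x) length( intersect( x[[1]] , x[[2]] ) ) ) ,
          combn( names(l) , 2 , paste0 , collapse = "" ) )
# AB AC AD BC BD CD
#  2  2  0  1  0  1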
Try:
m1 <- combn(names(l),2)
val <- sapply(split(m1, col(m1)),function(x) {x1 <- l[[x[1]]]; x2 <- l[[x[2]]]; length(intersect(x1, x2))})
Ind <- apply(m1,2,paste,collapse="int")
data.frame(Ind, val, stringsAsFactors=F)
#     Ind val
# 1 AintB   2
# 2 AintC   2
# 3 AintD   0
# 4 BintC   1
# 5 BintD   0
# 6 CintD   1
I want to find which values in df2 are also present in df1, within a certain range. Each value considers both a and b in the data frames (a and b can't be split up). For example, can I find 9,1 (row 1 of df1) in df2? It doesn't have to be at the same position. Also, we can allow a difference of, for example, 1 for a and 1 for b, i.e. I want to find all values 9±1, 1±1 in df2. a and b always go together; each row sticks together. Does anyone have a suggestion of how to code this? Many thanks!
set.seed(1)
a <- sample(10,5)
set.seed(1)
b <- sample(5,5, replace=T)
feature <- LETTERS[1:5]
df1 <- data.frame(feature,a,b)
df1
> df1
feature a b
A 9 1
B 4 4
C 7 1
D 1 2
E 2 5
set.seed(2)
a <- sample(10,5)
b <- sample(5,5, replace=T)
feature <- LETTERS[1:5]
df2 <- data.frame(feature,a,b)
df2
> df2
feature a b
A 5 1
B 6 4
C 9 5
D 1 1
E 10 2
Not correct, but I'm imagining this can be done with a for loop somehow!
for (i in df1[,1]) {
  for (j in df1[,2]) {
    s <- c(s, (df1[i,1] & df1[j,2] == df2[,1] & df2[,2])) # how to add certain allowed diff levels?
  }
}
s
Output wanted:
feature_df1 <- LETTERS[1:5]
match <- c(1,0,0,1,0)
feature_df2 <- c("E","","","D", "")
df <- data.frame(feature_df1, match, feature_df2)
df
feature_df1 match feature_df2
A 1 E
B 0
C 0
D 1 D
E 0
I loooove data.table, which is (imo) the weapon of choice for this kind of problem.
library( data.table )
#make df1 and df2 a data.table
setDT(df1, key = "feature"); setDT(df2)
#now perform a join operation on each row of df1,
# creating an on-the-fly subset of df2
df1[ df1, c( "match", "feature_df2" ) := {
  val = df2[ a %between% c( i.a - 1, i.a + 1 ) & b %between% c( i.b - 1, i.b + 1 ), ]
  unique_val = sort( unique( val$feature ) )
  num_val = length( unique_val )
  list( num_val, paste0( unique_val, collapse = ";" ) )
}, by = .EACHI ][]
# feature a b match feature_df2
# 1: A 9 1 1 E
# 2: B 4 4 0
# 3: C 7 1 0
# 4: D 1 2 1 D
# 5: E 2 5 0
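For comparison, here is a minimal base R sketch of the same row-wise range check (my own variant, assuming the df1/df2 from the question and a tolerance of 1 on both columns):
# for each row of df1, collect the df2 features whose a and b are both within +-1
hits <- lapply(seq_len(nrow(df1)), function(i) {
  as.character(df2$feature[abs(df2$a - df1$a[i]) <= 1 & abs(df2$b - df1$b[i]) <= 1])
})
data.frame(feature_df1 = df1$feature,
           match = as.integer(lengths(hits) > 0),
           feature_df2 = sapply(hits, paste, collapse = ";"))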
One way to go about this in base R would be to split the data.frames into lists of row vectors, calculate the absolute difference between each pair of rows, and then check whether that difference exceeds a given threshold.
Code
# Find the absolute difference of all row vectors
listdif <- lapply(l1, function(x) {
  lapply(l2, function(y) {
    abs(x - y)
  })
})
# Then flatten the list to a list of data.frames
listdifflat <- lapply(listdif, function(x) {
  do.call(rbind, x)
})
# Finally see if a pair of numbers is within our threshold or not
m1 <- 2
m2 <- 3
listfin <- Map(function(x) {
  x[1] > m1 | x[2] > m2
}, listdifflat)
head(listfin, 1)
[[1]]
V1
[1,] TRUE
[2,] FALSE
[3,] TRUE
[4,] TRUE
[5,] TRUE
[6,] TRUE
[7,] TRUE
[8,] TRUE
[9,] TRUE
[10,] TRUE
Data
df1 <- read.table(text = "
4 1
7 5
1 5
2 10
13 6
19 10
11 7
17 9
14 5
3 5")
df2 <- read.table(text = "
15 1
6 3
19 6
8 2
1 3
13 7
16 8
12 7
9 1
2 6")
# convert df to list of row vectors
l1 <- lapply(1:nrow(df1), function(x) df1[x, ])
l2 <- lapply(1:nrow(df2), function(x) df2[x, ])
I have written a code to obtain crosstab results of a rasterstack for different regions (delimited by a shapefile) covering the raster. However, I am getting an empty list.
This is the function:
transitions <- function(bound, themat) { # bound = shapefile, themat = rasterstack
  result = vector("list", nrow(bound))  # empty result list
  names(result) = bound@data$GEOCODIGO
  for (i in 1:nrow(bound)) {  # this is the number of polygons to iterate through
    single <- bound[i,]  # selects a single polygon
    clip <- mask(crop(themat, single), single)  # crops the raster to the polygon boundary
    result[i] <- crosstab(clip, digits = 0, long = FALSE, useNA = FALSE)
    return(result)
  }
}
I have tested the steps for the first object in the shapefile/bound outside of the for loop, and it worked well. But I still cannot figure out why I am getting an empty list. Any ideas?
Example data:
p <- shapefile(system.file("external/lux.shp", package="raster"))
b <- brick(raster(p), nl=2)
values(b) = sample(2, 200, replace=TRUE)
The problem is that return(result) sits inside the for loop, so the function returns after the first iteration; also, result[i] should be result[[i]] when assigning into a list. Fixed function:
transitions <- function(poly, rast) {
  result = vector("list", nrow(poly))
  for (i in 1:nrow(poly)) {
    clip <- mask(crop(rast, poly[i,]), poly[i,])
    result[[i]] <- crosstab(clip, digits = 0, long = FALSE, useNA = FALSE)
  }
  return(result)
}
transitions(p, b)
An alternative would be to use extract
e <- extract(b, p)
To tabulate as in crosstab:
ee <- lapply(e, function(x) aggregate(data.frame(count=rep(1, nrow(x))), data.frame(x), FUN=sum))
To understand that last line, you need to unpack it.
class(e)
#[1] "list"
length(e)
#[1] 12
e[[1]]
# layer.1 layer.2
#[1,] 1 1
#[2,] 1 2
#[3,] 2 2
#[4,] 2 1
#[5,] 2 1
#[6,] 1 2
#[7,] 2 2
e is a list with the same length as the number of polygons (see length(p))
Let's take the first element and aggregate it to get a table with cases and counts.
x <- e[[1]]
aggregate(data.frame(count=rep(1, nrow(x))), data.frame(x), FUN=sum)
# layer.1 layer.2 count
#1 1 1 1
#2 2 1 2
#3 1 2 2
#4 2 2 2
A similar approach via table (the difference is that you could get Freq values that are zero):
as.data.frame(table(x[,1], x[,2]))
# Var1 Var2 Freq
#1 1 1 1
#2 2 1 2
#3 1 2 2
#4 2 2 2
Now wrap the function you like in an lapply:
z <- lapply(e, function(x) aggregate(data.frame(count=rep(1, nrow(x))), data.frame(x), FUN=sum))
And to take it further, bind the data.frames and add an identifier to link the data back to the polygons:
y <- do.call(rbind, z)
y$id <- rep(1:length(z), sapply(z, nrow))
head(y)
# layer.1 layer.2 count id
#1 1 1 1 1
#2 2 1 2 1
#3 1 2 2 1
#4 2 2 2 1
#5 1 1 1 2
#6 2 1 2 2
I would like to replace up to n consecutive NA values in vector with latest non-NA value.
For example, if:
a <- c(1,NA,NA,NA,NA,NA,2,NA,1,NA,NA,NA)
n <- 2
I would like to obtain:
c(1,1,1,NA,NA,NA,2,2,1,1,1,NA)
(n is the maximum number of NA values that can be replaced by a given element).
I know na.locf() function, but I don't know how to set the limit n. Is it possible to do it?
Here's an option using na.locf and rle
library(zoo)
r <- rle(is.na(a))
a <- na.locf(a)
is.na(a) <- sequence(r$lengths) > n & rep(r$values, r$lengths)
a
# [1] 1 1 1 NA NA NA 2 2 1 1 1 NA
So here I first computed the run lengths of elements in a (including the NA entries), then replaced all NA's using na.locf and finally turned those elements back to NA's where the run lengths were greater than n and the elements were NA.
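To make those three steps concrete, here are the intermediate values for the example vector (my annotation; note that r is computed before na.locf overwrites a):
r$lengths                 # 1 5 1 1 1 3: lengths of the alternating non-NA/NA runs
r$values                  # FALSE TRUE FALSE TRUE FALSE TRUE: TRUE marks an NA run
sequence(r$lengths)       # 1 1 2 3 4 5 1 1 1 1 2 3: position within each run
rep(r$values, r$lengths)  # TRUE exactly where the original a was NA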
As another idea, we can find, for each position of "a", the index of the most recent non-NA value:
is = seq_along(a)
i = cummax((!is.na(a)) * is)
i
# [1] 1 1 1 1 1 1 7 7 9 9 9 9
Replace that index with the current index wherever the last non-NA value is more than "n" steps away:
wh = (is - i) > n
i[wh] = is[wh]
i
# [1] 1 1 1 4 5 6 7 7 9 9 9 12
And subset "a":
a[i]
# [1] 1 1 1 NA NA NA 2 2 1 1 1 NA
You could do this using split and replace in base R
f <- function(a, n) {
  # split the vector based on the position of non-NA values
  l <- split(a, cumsum(seq_along(a) %in% which(!is.na(a))))
  unlist(lapply(l, function(r) replace(r, 1:(n+1), r[1])[seq_along(r)]), use.names = FALSE)
}
f(a, n = 2)
#[1] 1 1 1 NA NA NA 2 2 1 1 1 NA
f(a, n = 3)
#[1] 1 1 1 1 NA NA 2 2 1 1 1 1
Benchmarking (randomly generated vector of length 7467)
library(microbenchmark)
library(dplyr)
library(zoo)
set.seed(123)
a <- unlist(replicate(1000, c(sample(10, 2), rep(NA, sample.int(10, 1)))))
length(a)
# [1] 7467
n <- 3
f_989 <- function(a, n) {
  # split the vector based on the position of non-NA values
  l <- split(a, cumsum(seq_along(a) %in% which(!is.na(a))))
  unlist(lapply(l, function(r) replace(r, 1:(n+1), r[1])[seq_along(r)]), use.names = FALSE)
}
f_zx8754 <- function(a, n)
  data.frame(a) %>%
    mutate(gr = cumsum(!is.na(a))) %>%
    group_by(gr) %>%
    mutate(res = if_else(row_number() <= n + 1, na.locf(a), a)) %>%
    .$res
f_docendo_discimus <- function(a, n) {
  r <- rle(is.na(a))
  a <- na.locf(a)
  is.na(a) <- sequence(r$lengths) > n & rep(r$values, r$lengths)
  a
}
f_akrun <- function(a, n)
  ave(a, cumsum(!is.na(a)), FUN = function(x) replace(x, pmin(length(x), seq(n+1)), x[1]))
f_alexis_laz <- function(a, n) {
  is = seq_along(a)
  i = cummax((!is.na(a)) * is)
  wh = (is - i) > n
  i[wh] = is[wh]
  a[i]
}
r <- f_989(a,n)
identical(r, f_zx8754(a,n))
# [1] TRUE
identical(r, f_docendo_discimus(a,n))
# [1] TRUE
identical(r, f_akrun(a,n))
# [1] TRUE
identical(r, f_alexis_laz(a,n))
# [1] TRUE
res <- microbenchmark("f1"=f_989(a,n), "f2"=f_zx8754(a,n),
"f3"=f_docendo_discimus(a,n), "f4"=f_akrun(a,n), "f5"=f_alexis_laz(a,n))
print(res, order="mean")
# Unit: microseconds
# expr min lq mean median uq max neval
# f5 129.804 137.014 161.106 141.6715 151.7375 1031.511 100
# f3 1249.351 1354.215 1471.478 1392.9750 1482.2140 2553.086 100
# f1 4736.895 5093.852 5630.367 5345.3450 6069.9260 8848.513 100
# f4 22165.601 23936.866 24660.990 24485.6725 24883.6440 29453.177 100
# f2 205854.339 215582.174 221524.448 218643.9540 224211.0435 261512.922 100
We can use a base R approach: create a grouping variable with cumsum and diff, then use that grouping variable in ave to replace the NA values subject to the limit 'n':
ave(a, cumsum(c(TRUE, diff(is.na(a)) < 0)),
FUN = function(x) replace(x, is.na(x) & seq_along(x) <= n + 1, x[1]))
#[1] 1 1 1 NA NA NA 2 2 1 1 1 NA
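To see what the grouping variable does (my illustration): a new group starts right after each NA run ends, so every group is a value followed by its trailing NAs:
cumsum(c(TRUE, diff(is.na(a)) < 0))
# [1] 1 1 1 1 1 1 2 2 3 3 3 3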
Or a more compact option:
ave(a, cumsum(!is.na(a)), FUN = function(x) replace(x, pmin(length(x), seq(n+1)), x[1]))
#[1] 1 1 1 NA NA NA 2 2 1 1 1 NA
Using dplyr::group_by and zoo::na.locf:
library(dplyr)
library(zoo)
data.frame(a) %>%
  mutate(gr = cumsum(!is.na(a))) %>%
  group_by(gr) %>%
  mutate(res = if_else(row_number() <= n + 1, na.locf(a), a)) %>%
  .$res
# [1] 1 1 1 NA NA NA 2 2 1 1 1 NA
I have a list in which every member has the same structure, as in the following:
config <- NULL
config[["secA"]] <- NULL
config[["secA"]]$VAL <- 0
config[["secA"]]$ARR <- c(1,2,3,4,5)
config[["secA"]]$DF <- data.frame(matrix(c(1,5,3,8),2,2))
config[["secB"]] <- NULL
config[["secB"]]$VAL <- 1
config[["secB"]]$ARR <- c(1,3,2,4,9)
config[["secB"]]$DF <- data.frame(matrix(c(2,6,1,9),2,2))
config[["secC"]] <- NULL
config[["secC"]]$VAL <- 5
config[["secC"]]$ARR <- c(4,2,1,5,8)
config[["secC"]]$DF <- data.frame(matrix(c(4,2,1,7),2,2))
and I need to obtain 3 vectors, VAL, ARR and DF, each with the concatenated elements of the corresponding member, such as:
# VAL: 0,1,5
# ARR: 1,2,3,4,5,1,3,2,4,9,4,2,1,5,8
# DF: 1,5,3,8,2,6,1,9,4,2,1,7
Looking at similar situations, I have the feeling I need to use a combination of do.call and cbind or lapply, but I have no clue. Any suggestions?
Using the config defined in the question:
sapply(names(config[[1]]), function(x)
unname(unlist(sapply(config, `[`, x))), USE.NAMES = TRUE)
# $VAL
# [1] 0 1 5
#
# $ARR
# [1] 1 2 3 4 5 1 3 2 4 9 4 2 1 5 8
#
# $DF
# [1] 1 5 3 8 2 6 1 9 4 2 1 7
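A near-equivalent spelling with lapply (my variant, using a hypothetical nms helper variable), if you prefer building the list field by field:
nms <- names(config[[1]])
setNames(lapply(nms, function(nm) unname(unlist(lapply(config, `[[`, nm)))), nms)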
Or you can use this clist function (defined under "Function" below):
(l <- Reduce(clist, config))
# $VAL
# [1] 0 1 5
#
# $ARR
# [1] 1 2 3 4 5 1 3 2 4 9 4 2 1 5 8
#
# $DF
# X1 X2 X1 X2 X1 X2
# 1 1 3 2 1 4 1
# 2 5 8 6 9 2 7
It merges data frames and matrices, so you need to unlist to get the vector you want:
l$DF <- unname(unlist(l$DF))
l
# $VAL
# [1] 0 1 5
#
# $ARR
# [1] 1 2 3 4 5 1 3 2 4 9 4 2 1 5 8
#
# $DF
# [1] 1 5 3 8 2 6 1 9 4 2 1 7
Function
clist <- function(x, y) {
  islist <- function(x) inherits(x, 'list')
  '%||%' <- function(a, b) if (!is.null(a)) a else b
  get_fun <- function(x, y)
    switch(class(x %||% y),
           matrix = cbind,
           data.frame = function(x, y)
             do.call('cbind.data.frame', Filter(Negate(is.null), list(x, y))),
           factor = function(...) unlist(list(...)),
           c)
  stopifnot(islist(x), islist(y))
  nn <- names(rapply(c(x, y), names, how = 'list'))
  if (is.null(nn) || any(!nzchar(nn)))
    stop('All non-NULL list elements should have unique names', domain = NA)
  nn <- unique(c(names(x), names(y)))
  z <- setNames(vector('list', length(nn)), nn)
  for (ii in nn)
    z[[ii]] <- if (islist(x[[ii]]) && islist(y[[ii]]))
      Recall(x[[ii]], y[[ii]]) else
        (get_fun(x[[ii]], y[[ii]]))(x[[ii]], y[[ii]])
  z
}
Another approach, with slightly less code.
un_config <- unlist(config)
un_configNAM <- names(un_config)
vecNAM <- c("VAL", "ARR", "DF")
for (n in vecNAM) {
  assign(n, un_config[grepl(n, un_configNAM)])
}
This will return 3 vectors, as the OP requested. However, it is generally more advantageous to store results in a list, as rawr suggests. You can of course adapt the above code so that the results are stored in a list:
l <- rep(list(NA), length(vecNAM))
i <- 1
for (n in vecNAM) {
  l[[i]] <- un_config[grepl(n, un_configNAM)]
  i <- i + 1
}
I have a vector: a <- rep(sample(1:5, 20, replace=T))
I determine the frequency of occurrence of each value:
tabulate(a)
I would now like to determine the position of the most frequently occurring values.
Let's say the vector is:
[1] 3 3 3 5 2 2 4 1 4 2 5 1 2 1 3 1 3 2 5 1
tabulate returns:
[1] 5 5 5 2 3
Now I determine the highest value returned by tabulate, max(tabulate(a)), which returns:
[1] 5
There are 3 values with frequency 5. I would like to know the positions of these values in the tabulate output, i.e. here the first three entries of tabulate.
Perhaps it is easier to work with table:
x <- table(a)
x
# a
# 1 2 3 4 5
# 5 5 5 2 3
names(x)[x == max(x)]
# [1] "1" "2" "3"
which(a %in% names(x)[x == max(x)])
# [1] 1 2 3 5 6 8 10 12 13 14 15 16 17 18 20
Alternatively, there's a similar approach with tabulate:
x <- tabulate(a)
sort(unique(a))[x == max(x)]
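And if you only need the positions within the tabulate output itself (which is what the question literally asks for), which does it directly:
x <- tabulate(a)
which(x == max(x))
# [1] 1 2 3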
Here are some benchmarks on numeric and character vectors. The difference in performance is more noticeable with numeric data.
Sample data
set.seed(1)
a <- sample(20, 1000000, replace = TRUE)
b <- sample(letters, 1000000, replace = TRUE)
Functions to benchmark
t1 <- function() {
  x <- table(a)
  out1 <- names(x)[x == max(x)]
  out1
}
t2 <- function() {
  x <- tabulate(a)
  out2 <- sort(unique(a))[x == max(x)]
  out2
}
t3 <- function() {
  x <- table(b)
  out3 <- names(x)[x == max(x)]
  out3
}
t4 <- function() {
  x <- tabulate(factor(b))
  out4 <- sort(unique(b))[x == max(x)]
  out4
}
The results
library(rbenchmark)
benchmark(t1(), t2(), t3(), t4(), replications = 50)
# test replications elapsed relative user.self sys.self user.child sys.child
# 1 t1() 50 30.548 24.244 30.416 0.064 0 0
# 2 t2() 50 1.260 1.000 1.240 0.016 0 0
# 3 t3() 50 8.919 7.079 8.740 0.160 0 0
# 4 t4() 50 5.680 4.508 5.564 0.100 0 0