create a set of cumulative intersection counts - r

I want to count the intersection of var1[i] and union(var2[1],...,var2[i]).
Using this data
var1 <- list('2003' = 1:3, '2004' = c(4:3), '2005' = c(6,4,1), '2006' = 1:4 )
var2 <- list('2003' = 1:3, '2004' = c(4:5), '2005' = c(2,3,6), '2006' = 2:3 )
I would like to populate a results list with:
1. intersect(var1$2003,var2$2003)
2. intersect(var1$2004,union(var2$2003,var2$2004))
3. intersect(var1$2005,union(var2$2005,union(var2$2003,var2$2004)))
and so on, until 2012 (not shown in the example)
Disclaimer: due to editing, the comments below might not make sense.

Is something like this what you want?
# Example data: one vector of ids per year in each list.
var1 <- list("2003" = 1:3, "2004" = 4:3, "2005" = c(6, 4, 1), "2006" = 1:4)
var2 <- list("2003" = 1:3, "2004" = 4:5, "2005" = c(2, 3, 6), "2006" = 2:3)
# For each year k, build the union of var2[[1]], ..., var2[[k]] with Reduce()
# and count how many elements of var1[[k]] occur in that cumulative union.
# Iterating over a named index vector keeps the year names on the result.
lapply(
  setNames(seq_along(var1), names(var1)),
  function(k) {
    seen_so_far <- Reduce(union, var2[seq_len(k)])
    length(intersect(var1[[k]], seen_so_far))
  }
)
$`2003`
[1] 3
$`2004`
[1] 2
$`2005`
[1] 3
$`2006`
[1] 4
Note that Reduce(union, var2) reduces the list var2 to a single vector by successively combining its elements with union (see ?Reduce):
Reduce(union,var2)
[1] 1 2 3 4 5 6
EDIT: a more elegant alternative
Use the accumulate = TRUE argument of Reduce, which returns every intermediate union instead of only the final one:
# Reduce(..., accumulate = TRUE) yields the running unions of var2, one per
# year; Map() intersects them pairwise with var1 and keeps var1's names.
# Map() is used instead of mapply() because mapply() simplifies its result to
# a matrix whenever every intersection happens to have the same length, and
# lapply(..., length) would then return 1 per matrix cell rather than one
# count per year.
lapply(Map(intersect, var1, Reduce(union, var2, accumulate = TRUE)), length)
Because --
Reduce(union, var2, accumulate = T)
## [[1]]
## [1] 1 2 3
##
## [[2]]
## [1] 1 2 3 4 5
##
## [[3]]
## [1] 1 2 3 4 5 6
##
## [[4]]
## [1] 1 2 3 4 5 6

Related

How to unnest a list of lists of data frame in R? [duplicate]

This question already has an answer here:
How to convert from a list of lists to a list in R retaining names?
(1 answer)
Closed 9 years ago.
I have a brief question, I would like to unnest this nested list:
mylist <- list(a = list(A=1, B=5),
b = list(C= 1, D = 2),
c = list(E = 1, F = 3))
Expected result is:
> list(a=c(1, 5), b = c(1, 2), c = c(1, 3))
$a
[1] 1 5
$b
[1] 1 2
$c
[1] 1 3
Any suggestions?
T
Slight variation on everyone else's and keeping it in base:
lapply(mylist, unlist, use.names=FALSE)
## $a
## [1] 1 5
##
## $b
## [1] 1 2
##
## $c
## [1] 1 3
Take a look at llply function from plyr package
> library(plyr)
> llply(mylist, unlist)
$a
A B
1 5
$b
C D
1 2
$c
E F
1 3
If you want to get rid of the names, then try:
> lapply(llply(mylist, unlist), unname)
$a
[1] 1 5
$b
[1] 1 2
$c
[1] 1 3
I think applying unlist() to each element in your list should give you what you're looking for:
> mylist <- list(a = list(A=1, B=5), b = list(C= 1, D = 2), c = list(E = 1, F = 3))
> mylist2 <- list(a=c(1, 5), b = c(1, 2), c = c(1, 3))
> data.frame(lapply(mylist,unlist))
a b c
A 1 1 1
B 5 2 3
> data.frame(mylist2)
a b c
1 1 1 1
2 5 2 3

Find the closest enclosing FALSE value positions

Is there a more elegant way to solve this problem?
For every TRUE value I'm looking for the positions of the closest previous and following FALSE values.
data:
vec <- c(FALSE, TRUE, TRUE, FALSE, TRUE, FALSE)
desired outcome: (something like)
pos start end
[1,] 2 1 4
[2,] 3 1 4
[3,] 5 4 6
explanation of the first row of the outcome:
pos = 2, position of the first TRUE,
start = 1, position of the closest FALSE in front of pos = 2
end = 4, position of the closest FALSE after pos = 2.
Already working solution:
# Positions of the TRUE values and of the FALSE values in `vec`.
pos = which(vec)
f_pos = which(!vec)
# For each TRUE position x: s is the last FALSE position below x and e is the
# first FALSE position above x; indexing with [1] yields NA when there is none.
# sapply() collects the one-row data.frames column-wise, so t() is applied to
# get one row per TRUE position with columns pos, start, end.
t(
sapply(pos, function(x){ s <- rev(f_pos[f_pos < x])[1]; e <- f_pos[x < f_pos][1]; return(data.frame(pos = x, start = s, end = e)) })
)
Using findInterval
# Positions of the TRUE values, and of the FALSE values that bound them.
pos <- which(vec)
b <- which(!vec)
# ix[k] is the index in b of the last FALSE before pos[k]; findInterval()
# returns 0 when no FALSE precedes pos[k] (i.e. vec starts with TRUE).
ix <- findInterval(pos, b)
# Indexing with 0 silently drops the element, which would make `from` shorter
# than `pos` and misalign the rows, so zero indices are mapped to NA instead.
# b[ix + 1] needs no such guard: it is already NA when no FALSE follows.
cbind(pos, from = b[replace(ix, ix == 0L, NA)], to = b[ix + 1])
# pos from to
# [1,] 2 1 4
# [2,] 3 1 4
# [3,] 5 4 6
If we stretch your "something like" slightly, a simple cut will do:
data.frame(pos, rng = cut(pos, b))
# pos rng
# 1 2 (1,4]
# 2 3 (1,4]
# 3 5 (4,6]
If the vector ends with TRUE, the findInterval solution will give NA in 'to' column. In cut, the last 'interval' is then coded as NA.
You can treat consecutive FALSE positions as the end points of intervals and use data.table::foverlaps to find the interval that contains each TRUE position:
library(data.table)
# put your objects in data.tables:
# each pair of consecutive FALSE positions becomes one [start, end] interval
f_pos_inter <- data.table(start=head(f_pos, -1), end=tail(f_pos, -1))
# each TRUE position becomes a zero-width interval [pos, pos]
pos_inter <- data.table(start=pos, end=pos)
# define the keys:
setkeyv(pos_inter, c("start", "end")); setkeyv(f_pos_inter, c("start", "end"))
# overlap join: pair every TRUE position with the FALSE interval it falls in;
# the columns coming from pos_inter are prefixed with "i."
res <- foverlaps(pos_inter, f_pos_inter)
# start end i.start i.end
#1: 1 4 2 2
#2: 1 4 3 3
#3: 4 6 5 5
You can further reorder the columns and keep only the ones you need:
# drop i.end from res in place
res[, i.end:=NULL]
# move the third column (i.start) to the front, then rename it to pos
setcolorder(res, c(3, 1, 2))
setnames(res, "i.start", "pos")
res
# pos start end
#1: 2 1 4
#2: 3 1 4
#3: 5 4 6
N.B: this will give NA in both columns start and end if vec ends with TRUE

How to permute list levels in R

Is there a way to permute list levels in R?
In other words, how can I simply go from the list test to the list test2?
test=list(
a=list("alpha"=1:2,"beta"=3:5),
b=list("alpha"=5:6,"omega"=7:10)
)
test2 <- list(
alpha=list("a"=1:2,"b"=5:6),
beta=list("a"=3:5),
omega=list("b"=7:10)
)
How about this:
# Flatten one level: a plain, unnamed list of all inner elements, in order.
tst <- unlist(test, recursive = FALSE, use.names = FALSE)
# Label each element with the name of the outer list it came from. The names
# are taken from `test` itself rather than parsed out of the "outer.inner"
# strings unlist() would build, so outer names of any length and inner names
# containing digits or dots are handled correctly.
names(tst) <- rep(names(test), lengths(test))
# Group the elements by their inner name; each group keeps the outer names.
split(tst, unlist(lapply(test, names), use.names = FALSE))
# $alpha
# $alpha$a
# [1] 1 2
#
# $alpha$b
# [1] 5 6
#
#
# $beta
# $beta$a
# [1] 3 4 5
#
#
# $omega
# $omega$b
# [1] 7 8 9 10

Convert a nested list to a list [duplicate]

This question already has an answer here:
How to convert from a list of lists to a list in R retaining names?
(1 answer)
Closed 9 years ago.
I have a brief question, I would like to unnest this nested list:
mylist <- list(a = list(A=1, B=5),
b = list(C= 1, D = 2),
c = list(E = 1, F = 3))
Expected result is:
> list(a=c(1, 5), b = c(1, 2), c = c(1, 3))
$a
[1] 1 5
$b
[1] 1 2
$c
[1] 1 3
Any suggestions?
T
Slight variation on everyone else's and keeping it in base:
lapply(mylist, unlist, use.names=FALSE)
## $a
## [1] 1 5
##
## $b
## [1] 1 2
##
## $c
## [1] 1 3
Take a look at llply function from plyr package
> library(plyr)
> llply(mylist, unlist)
$a
A B
1 5
$b
C D
1 2
$c
E F
1 3
If you want to get rid of the names, then try:
> lapply(llply(mylist, unlist), unname)
$a
[1] 1 5
$b
[1] 1 2
$c
[1] 1 3
I think applying unlist() to each element in your list should give you what you're looking for:
> mylist <- list(a = list(A=1, B=5), b = list(C= 1, D = 2), c = list(E = 1, F = 3))
> mylist2 <- list(a=c(1, 5), b = c(1, 2), c = c(1, 3))
> data.frame(lapply(mylist,unlist))
a b c
A 1 1 1
B 5 2 3
> data.frame(mylist2)
a b c
1 1 1 1
2 5 2 3

How to ignore case when using subset in R

How to ignore case when using subset function in R?
eos91corr.data <- subset(test.data,select=c(c(X,Y,Z,W,T)))
I would like to select the columns named x, y, z, w and t, regardless of case. What should I do?
Thanks
If you can live without the subset() function, the tolower() function may work:
dat <- data.frame(XY = 1:5, x = 1:5, mm = 1:5,
y = 1:5, z = 1:5, w = 1:5, t = 1:5, r = 1:5)
dat[,tolower(names(dat)) %in% c("xy","x")]
However, this will return a data.frame with the columns in the order they are in the original dataset dat: both
dat[,tolower(names(dat)) %in% c("xy","x")]
and
dat[,tolower(names(dat)) %in% c("x","xy")]
will yield the same result, although the order of the target names has been reversed.
If you want the columns in the result to be in the order of the target vector, you need to be slightly more fancy. The two following commands both return a data.frame with the columns in the order of the target vector (i.e., the results will be different, with columns switched):
# match() returns, for each target name in order, the position of the first
# column whose lower-cased name equals it, so the columns come back in the
# order of the target vector. Unlike sapply() + which(), it always returns a
# plain integer vector, even if a target matches no column or several columns.
dat[, match(c("x", "xy"), tolower(names(dat)))]
dat[, match(c("xy", "x"), tolower(names(dat)))]
You could use regular expressions with the grep function to ignore case when identifying column names to select. Once you have identified the desired column names, then you can pass these to subset.
If your data are
dat <- data.frame(xy = 1:5, x = 1:5, mm = 1:5, y = 1:5, z = 1:5,
w = 1:5, t = 1:5, r = 1:5)
# xy x mm y z w t r
# 1 1 1 1 1 1 1 1 1
# 2 2 2 2 2 2 2 2 2
# 3 3 3 3 3 3 3 3 3
# 4 4 4 4 4 4 4 4 4
# 5 5 5 5 5 5 5 5 5
Then
(selNames <- grep("^[XYZWT]$", names(dat), ignore.case = TRUE, value = TRUE))
# [1] "x" "y" "z" "w" "t"
subset(dat, select = selNames)
# x y z w t
# 1 1 1 1 1 1
# 2 2 2 2 2 2
# 3 3 3 3 3 3
# 4 4 4 4 4 4
# 5 5 5 5 5 5
EDIT If your column names are longer than one letter, the above approach won't work too well. So assuming you can get your desired column names in a vector, you could use the following:
upperNames <- c("XY", "Y", "Z", "W", "T")
(grepPattern <- paste0("^", upperNames, "$", collapse = "|"))
# [1] "^XY$|^Y$|^Z$|^W$|^T$"
(selNames2 <- grep(grepPattern, names(dat), ignore.case = TRUE, value = TRUE))
# [1] "xy" "y" "z" "w" "t"
subset(dat, select = selNames2)
# xy y z w t
# 1 1 1 1 1 1
# 2 2 2 2 2 2
# 3 3 3 3 3 3
# 4 4 4 4 4 4
# 5 5 5 5 5 5
The 'stringr' library is a very neat wrapper for all of this functionality. It has 'ignore.case' option as follows:
Also, you may want to consider using match() rather than subset().

Resources