Check whether elements in a list are of equal length - R

How can I check whether the elements in a list are all of equal length?
E.g.:
l <- list(c(1:3),c(2:7),c(12:13))
[[1]]
[1] 1 2 3
[[2]]
[1] 2 3 4 5 6 7
[[3]]
[1] 12 13
I have a long list with many entries and want a way to check whether every element has the same length.
For the list above this should return FALSE, as the lengths differ (3, 6, 2).

Try this:
length(unique(sapply(l, length))) == 1
# [1] FALSE
Or @PierreLafortune's way:
length(unique(lengths(l))) == 1L
Or @CathG's way:
all(sapply(l, length) == length(l[[1]]))
#or
all(lengths(l) == length(l[[1]]))
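For reference, lengths() is the fast base-R shortcut for sapply(l, length); on the example list it returns the per-element lengths directly:
lengths(l)
# [1] 3 6 2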
Some benchmarking:
#data
set.seed(123)
l <- lapply(round(runif(1000,1,100)), runif)
library(microbenchmark)
library(ggplot2)
#benchmark
bm <- microbenchmark(
zx8754 = length(unique(sapply(l, length))) == 1,
PierreLafortune=length(unique(lengths(l))) == 1L,
CathG_1 = all(lengths(l) == length(l[[1]])),
CathG_2 = all(sapply(l, length) == length(l[[1]])),
times = 10000)
# result
bm
Unit: microseconds
expr min lq mean median uq max neval cld
zx8754 326.605 355.281 392.39741 364.034 377.618 84109.597 10000 d
PierreLafortune 23.545 25.960 30.24049 27.168 28.375 3312.829 10000 b
CathG_1 9.056 11.471 13.49464 12.679 13.584 1832.847 10000 a
CathG_2 319.965 343.207 371.50327 351.659 364.940 3531.068 10000 c
#plot benchmark
autoplot(bm)

I would use:
length(unique(lengths(l))) == 1L
# [1] FALSE

How can I delete certain rows according to two columns which have symmetrical values in data.table in R?

For example, I have a table as follows:
DT <- data.table(
A = c(1,1,1,2,2,2,3,3,3),
B = c(1,2,3,1,2,3,1,2,3),
key = "A"
)
I want to delete rows under conditions such as "A" == 2 and "B" == 1, since there is already a row with "A" == 1 and "B" == 2.
In short, I want to delete any row whose (A, B) pair has already appeared symmetrically in a previous row. How can I achieve this?
Maybe not the most efficient, but this leverages the duplicated.matrix method:
DT[!duplicated(apply(cbind(A, B), 1L, sort), MARGIN = 2L)]
# A B
# 1: 1 1
# 2: 1 2
# 3: 1 3
# 4: 2 2
# 5: 2 3
# 6: 3 3
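To see why this works: apply(..., 1L, sort) returns one sorted column per input row, so duplicated(..., MARGIN = 2L) flags rows whose {A, B} set has already been seen. A minimal illustration on a hypothetical two-row pair:
m <- apply(cbind(A = c(1, 2), B = c(2, 1)), 1L, sort)
m
#      [,1] [,2]
# [1,]    1    1
# [2,]    2    2
duplicated(m, MARGIN = 2L)
# [1] FALSE  TRUE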
Another option:
DT[, g := paste(B, A, sep="_")][A < B, g := paste(A, B, sep="_")][!duplicated(g), !"g"]
A B
1: 1 1
2: 1 2
3: 1 3
4: 2 2
5: 2 3
6: 3 3
So ...
make a grouping variable g from each pair, initially as B_A,
flip it to A_B on the subset where A < B (either direction works, as long as it is consistent), so each unordered pair gets one canonical key,
dedupe on the grouping variable.
The last step could alternately be unique(DT, by="g").
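To see the canonicalisation at work, here is a hypothetical two-row toy table:
library(data.table)
toy <- data.table(A = c(1, 2), B = c(2, 1))
toy[, g := paste(B, A, sep="_")][A < B, g := paste(A, B, sep="_")]
toy
#    A B   g
# 1: 1 2 1_2
# 2: 2 1 1_2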
If you only have two columns, then you could do:
unique(do.call(function(A,B)data.table(A=pmin(A,B),B=pmax(A,B)),DT))
A B
1: 1 1
2: 1 2
3: 1 3
4: 2 2
5: 2 3
6: 3 3
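The trick here is that pmin/pmax sort each (A, B) pair elementwise, so mirrored rows become identical before unique() runs:
pmin(c(1, 2), c(2, 1))
# [1] 1 1
pmax(c(1, 2), c(2, 1))
# [1] 2 2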
Another method for the case where there are only 2 columns, using an anti-join:
# pairs that occur in both orientations (join B > A rows onto A < B rows, swapped)
dupes <- unique(DT[B > A])[unique(DT[A < B]), on=c("A"="B", "B"="A")]
# anti-join: keep only unique rows that are not such mirrored duplicates
ans <- unique(DT)[!dupes, on=.(A, B)]
timing code:
library(data.table)
set.seed(0L)
nr <- 1e5
nElem <- 1e3
mat <- matrix(sample(nElem, nr*2, replace=TRUE), ncol=2)
DT <- as.data.table(mat)
setnames(DT, c("A", "B"))
DT2 <- copy(DT)
library(microbenchmark)
mtd1 <- function() unique(data.frame(A=pmin(mat[, 1], mat[, 2]), B=pmax(mat[, 1], mat[, 2])))
mtd2 <- function() DT[!duplicated(apply(cbind(A, B), 1L, sort), MARGIN = 2L)]
mtd3 <- function() DT2[, g := paste(B, A, sep="_")][A < B, g := paste(A, B, sep="_")][!duplicated(g), !"g"]
mtd4 <- function() {
dupes <- unique(DT[B > A])[unique(DT[A < B]), on=c("A"="B", "B"="A")]
ans <- unique(DT)[!dupes, on=.(A, B)]
}
microbenchmark(mtd1(),mtd2(),mtd3(),mtd4(),times=3L)
some timings:
Unit: milliseconds
expr min lq mean median uq max neval
mtd1() 118.62051 129.50581 153.77216 140.39111 171.34799 202.30487 3
mtd2() 3500.47877 3552.80879 3732.67006 3605.13882 3848.76571 4092.39260 3
mtd3() 89.22901 92.94830 97.22658 96.66759 101.22536 105.78313 3
mtd4() 28.61628 32.37641 50.90126 36.13654 62.04375 87.95096 3
But the fastest is eddi's method, from the related question "data.table with two string columns of set elements, extract unique rows with each row unsorted":
mtd5 <- function() DT[DT[, .I[1L], by=.(pmin(A, B), pmax(A, B))]$V1]
microbenchmark(mtd1(),mtd2(),mtd3(),mtd4(),mtd5(),times=3L)
timings:
Unit: milliseconds
expr min lq mean median uq max neval
mtd1() 149.62224 150.70685 175.66394 151.79146 188.68479 225.57813 3
mtd2() 4126.51014 4140.72876 4277.37907 4154.94738 4352.81353 4550.67968 3
mtd3() 126.01679 131.26463 134.63642 136.51247 138.94624 141.38000 3
mtd4() 39.24141 42.42815 45.65804 45.61489 48.86635 52.11781 3
mtd5() 12.58396 16.68156 18.21613 20.77915 21.03221 21.28527 3
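For reference, the inner grouping in mtd5 uses .I[1L] to keep the first row index per unordered (pmin, pmax) pair; on the original 9-row DT from the question (before the benchmark data overwrote it) it returns:
DT[, .I[1L], by = .(pmin(A, B), pmax(A, B))]$V1
# [1] 1 2 3 5 6 9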

Define the value of a column in a dataframe based on 2 keys from a different dataframe

I have the following dataframe:
a <- seq(0, 5, by = 0.25)
b <-seq(0, 20, by = 1)
df <- data.frame(a, b)
and I'd like to create a new column "value", based on columns a and b, and the conversion table below:
a_min <- c(0,2, 0,2)
a_max <- c(2,5,2,5)
b_min <- c(0,0,10,10)
b_max <- c(10,10,30,30)
output <-c(1,2,3,4)
conv <- data.frame(a_min, a_max, b_min, b_max, output)
I've tried to do it using dplyr::mutate without much success...
require(dplyr)
mutate(df, value = calcula(conv, a, b))
# longer object length is not a multiple of shorter object length
My expectation would be to obtain a dataframe like the 'df' above with the additional column value as per below:
df$value <- c(rep(1,8), rep(2,2), rep(4,11))
A possible relatively simple and very efficient data.table solution using binary non-equi joins:
library(data.table) # v1.10.0
setDT(conv)[setDT(df),   # df is the lookup (i) table: one result per df row
            output,      # return the matching conv$output
            on = .(a_min <= a, a_max >= a, b_min <= b, b_max >= b)]
## [1] 1 1 1 1 1 1 1 1 1 2 2 2 4 4 4 4 4 4 4 4 4 4 4
As a side note, if the output column is just the row index within conv, you could make this join even more efficient by asking for just the row indices with which = TRUE:
setDT(conv)[setDT(df), on = .(a_min <= a, a_max >= a, b_min <= b, b_max >= b), which = TRUE]
## [1] 1 1 1 1 1 1 1 1 1 2 2 2 4 4 4 4 4 4 4 4 4 4 4
One more option, this time with matrices.
with(df, with(conv, output[max.col(
outer(a, a_min, `>=`) + outer(a, a_max, `<=`) +
outer(b, b_min, `>=`) + outer(b, b_max, `<=`))]))
## [1] 1 1 1 1 1 1 1 1 1 2 2 4 4 4 4 4 4 4 4 4 4
outer compares each element of the vector from df with each one from conv, producing a matrix of Booleans for each call. Since TRUE is 1, if you add all four matrices, the conv row you want is the column with the most TRUEs, which you can get with max.col. Subset output, and you've got your result.
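A toy check of the two building blocks (the values here are hypothetical):
outer(c(0.5, 3), c(0, 2), `>=`)   # is each a >= each a_min?
#      [,1]  [,2]
# [1,] TRUE FALSE
# [2,] TRUE  TRUE
max.col(matrix(c(2, 4, 4, 3), nrow = 2))   # column holding each row's maximum
# [1] 2 1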
The benefit of working with matrices is that they're fast. Using @Phann's benchmarks on 1,000 rows:
Unit: microseconds
expr min lq mean median uq max neval cld
alistaire 276.099 320.4565 349.1045 339.8375 357.2705 941.551 100 a
akr1 830.934 966.6705 1064.8433 1057.6610 1152.3565 1507.180 100 ab
akr2 11431.246 11731.3125 12835.5229 11947.5775 12408.4715 36767.488 100 d
Pha 11985.129 12403.1095 13330.1465 12660.4050 13044.9330 29653.842 100 d
Ron 71132.626 74300.3540 81136.9408 78034.2275 88952.8765 98950.061 100 e
Dav1 2506.205 2765.4095 2971.6738 2948.6025 3082.4025 4065.368 100 c
Dav2 2104.481 2272.9180 2480.9570 2478.8775 2575.8740 3683.896 100 bc
and on 100,000 rows:
Unit: milliseconds
expr min lq mean median uq max neval cld
alistaire 30.00677 36.49348 44.28828 39.43293 54.28207 64.36581 100 a
akr1 36.24467 40.04644 48.46986 41.59644 60.15175 77.34415 100 a
Dav1 51.74218 57.23488 67.70289 64.11002 68.86208 382.25182 100 c
Dav2 48.48227 54.82818 60.25256 59.81041 64.92611 91.20212 100 b
We can try Map with na.locf from zoo:
library(zoo)
f1 <- function(u, v, x, y, z) z * NA^!((with(df, a >= u & a <v) & (b >=x & b <y)))
na.locf(do.call(pmax, c(do.call(Map, c(f=f1, unname(conv))), na.rm = TRUE)))
#[1] 1 1 1 1 1 1 1 1 2 2 4 4 4 4 4 4 4 4 4 4 4
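The NA^! trick relies on NA^0 being 1 while NA^1 is NA, so rows matching the interval keep z and all others become NA before pmax(..., na.rm = TRUE) and na.locf do their work. A quick check:
NA^c(0, 1)
# [1]  1 NA
3 * NA^!c(TRUE, FALSE)
# [1]  3 NA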
Another way to write the Map solution is to pass the 'a' and 'b' columns as arguments, do the logical comparison against the columns of 'conv' to extract the 'output' value, and unlist the result:
unlist(Map(function(x, y)
with(conv, output[x >= a_min & a_max > x & y >= b_min & b_max > y]),
df$a, df$b))
#[1] 1 1 1 1 1 1 1 1 2 2 4 4 4 4 4 4 4 4 4 4
NOTE: The second solution should be slower, as we are looping through the rows of the dataset, while the first solution loops through the rows of 'conv' (which we assume has far fewer rows).
Another approach using apply:
df$value <- unlist(apply(df, 1, function(x){
ifelse(length(OUT <- output[which(x[1] >= a_min & x[1] <= a_max & x[2] >= b_min & x[2] <= b_max)]) > 0, OUT, 0)
}))
EDIT:
Because there are several answers so far, I checked the time needed to process the data. I created a slightly bigger example (similar to the given one, with random numbers):
set.seed(23563)
a <- runif(1000, 0, 5)
b <- runif(1000, 0, 20)
df <- data.frame(a, b)
require(microbenchmark)
library(zoo)
require(data.table)
microbenchmark(
akr1 = { #akrun 1
f1 <- function(u, v, x, y, z) z * NA^!((with(df, a >= u & a <v) & (b >=x & b <y)))
na.locf(do.call(pmax, c(do.call(Map, c(f=f1, unname(conv))), na.rm = TRUE)))
},
akr2 = { #akrun 2
unlist(Map(function(x, y)
with(conv, output[x >= a_min & a_max > x & y >= b_min & b_max > y]),
df$a, df$b))
},
Pha = { #Phann
df$value <- unlist(apply(df, 1, function(x){
ifelse(length(OUT <- output[which(x[1] >= a_min & x[1] <= a_max & x[2] >= b_min & x[2] <= b_max)]) > 0, OUT, 0)
}))
},
Ron = { #Ronak Shah
unlist(mapply(function(x, y)
conv$output[x >= conv$a_min & conv$a_max > x & y >= conv$b_min & conv$b_max > y],
df$a, df$b))
},
Dav1 ={ #David Arenburg 1
setDT(conv)[setDT(df), on = .(a_min <= a, a_max >= a, b_min <= b, b_max >= b)]$output
},
Dav2 = { #David Arenburg 2
setDT(conv)[setDT(df), on = .(a_min <= a, a_max >= a, b_min <= b, b_max >= b), which = TRUE]
},
times = 100L
)
With 1000 random numbers:
# Unit: milliseconds
# expr min lq mean median uq max neval
# akr1 4.267206 4.749576 6.259695 5.351494 6.843077 54.39187 100
# akr2 33.437853 39.912785 49.932875 47.416888 57.070369 91.55602 100
# Pha 30.433779 36.939692 48.205592 46.393800 55.800204 83.91640 100
# Ron 174.765021 199.648315 227.493117 223.314661 240.579057 370.26929 100
# Dav1 6.944759 7.814469 10.685460 8.536694 11.974102 44.47915 100
# Dav2 6.106978 6.706424 8.961821 8.161707 10.376085 28.91255 100
With 10000 random numbers (same seed), I get:
# Unit: milliseconds
# expr min lq mean median uq max neval
# akr1 23.48180 24.03962 26.16747 24.46897 26.19565 41.83238 100
# akr2 357.38290 398.69965 434.92052 409.15385 440.98210 829.85113 100
# Pha 320.39285 347.66632 376.98118 361.76852 383.08231 681.28500 100
# Ron 1661.50669 1788.06228 1873.70929 1837.28187 1912.04123 2499.23235 100
# Dav1 20.91486 21.60953 23.12278 21.94707 22.42773 44.71900 100
# Dav2 19.69506 20.22077 21.63715 20.55793 21.27578 38.96819 100
Here is another attempt, utilizing findInterval's efficiency in both memory and speed. A more convenient format for the conv "data.frame" could be
(i) a "list" of the non-overlapping interval breakpoints for each variable:
vecs = list(a = unique(c(conv$a_min, conv$a_max)),
b = unique(c(conv$b_min, conv$b_max)))
vecs
#$a
#[1] 0 2 5
#
#$b
#[1] 0 10 30
and, (ii) a lookup structure that contains the group of each paired interval between the two variables:
maps = xtabs(output ~ a_min + b_min)
maps
# b_min
#a_min 0 10
# 0 1 3
# 2 2 4
where, for example, we note that the first interval of "a" and the second of "b" are assigned a "3", etc.
Then we can use:
maps[mapply(findInterval, df, vecs, all.inside = TRUE)]
# [1] 1 1 1 1 1 1 1 1 2 2 4 4 4 4 4 4 4 4 4 4 4
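A small peek at the lookup step: findInterval maps each value to the index of the breakpoint interval it falls in, and the resulting pair of indices is used as a matrix subscript into maps (the probe values here are hypothetical):
findInterval(c(0.5, 3), vecs$a, all.inside = TRUE)
# [1] 1 2
findInterval(c(5, 15), vecs$b, all.inside = TRUE)
# [1] 1 2
maps[cbind(c(1, 2), c(1, 2))]   # (a-interval, b-interval) pairs -> output
# [1] 1 4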
And extending the benchmarks of Phann and alistaire (re-written, partly, for convenience):
n = 1e6
set.seed(23563); a = runif(n, 0, 5); b = runif(n, 0, 20); df = data.frame(a, b)
library(microbenchmark); library(zoo); library(data.table)
alistaire = function() {
with(df, with(conv, output[max.col(
outer(a, a_min, `>=`) + outer(a, a_max, `<=`) +
outer(b, b_min, `>=`) + outer(b, b_max, `<=`))]))
}
david = function() {
as.data.table(conv)[setDT(df), output, on = .(a_min <= a, a_max >= a, b_min <= b, b_max >= b)]
}
akrun = function() {
f1 = function(u, v, x, y, z) z * NA^!((with(df, a >= u & a <v) & (b >=x & b <y)))
na.locf(do.call(pmax, c(do.call(Map, c(f=f1, unname(conv))), na.rm = TRUE)))
}
alex = function() {
vecs = list(a = unique(c(conv$a_min, conv$a_max)), b = unique(c(conv$b_min, conv$b_max)))
maps = xtabs(output ~ a_min + b_min)
maps[mapply(findInterval, df, vecs, all.inside = TRUE)]
}
identical(alistaire(), david())
#[1] TRUE
identical(david(), akrun())
#[1] TRUE
identical(akrun(), alex())
#[1] TRUE
microbenchmark(alistaire(), david(), akrun(), alex(), times = 20)
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# alistaire() 592.46700 718.07148 799.28933 792.98107 860.16414 1136.4489 20 b
# david() 1363.76196 1375.43935 1398.53515 1385.11747 1425.69837 1457.1693 20 d
# akrun() 824.11962 850.88831 903.58723 906.21007 958.04310 995.2129 20 c
# alex() 70.82439 72.65993 82.87961 76.77627 81.20356 179.7669 20 a
We can use mapply on two variables a and b and find the correct output variable based on the range
unlist(mapply(function(x, y)
conv$output[x >= conv$a_min & conv$a_max > x & y >= conv$b_min & conv$b_max > y],
df$a, df$b))
#[1] 1 1 1 1 1 1 1 1 2 2 4 4 4 4 4 4 4 4 4 4

Preserve names when coercing vector from binary to `as.numeric`?

In R, when you coerce a logical vector to numeric, the names are stripped away.
There are a few possible solutions, which I've outlined below. It seems dangerous to rely on implicit conversion by adding 0 to all the values, and the sapply() adds an additional loop to my operations (which seems inefficient). Is there any other way to preserve the names when converting a vector using as.numeric?
# Set the seed
set.seed(1045)
# Create a small sample vector and give it names
example_vec <- sample(x = c(TRUE,FALSE),size = 10,replace = TRUE)
names(example_vec) <- sample(x = LETTERS,size = 10,replace = FALSE)
example_vec
# Y N M P L J H O F D
# FALSE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
as.numeric(x = example_vec)
# [1] 0 1 0 0 1 1 1 1 1 1
example_vec + 0
# Y N M P L J H O F D
# 0 1 0 0 1 1 1 1 1 1
sapply(X = example_vec,FUN = as.numeric)
# Y N M P L J H O F D
# 0 1 0 0 1 1 1 1 1 1
One possibility is to use the mode<- replacement function to change the internal storage mode (type) of the object. Also, integers are more appropriate than doubles (i.e. numerics) for this case of logical coercion.
mode(example_vec) <- "integer"
example_vec
# Y N M P L J H O F D
# 0 1 0 0 1 1 1 1 1 1
From help(mode) -
mode(x) <- "newmode" changes the mode of object x to newmode. This is only supported if there is an appropriate as.newmode function, for example "logical", "integer", "double", "complex", "raw", "character", "list", "expression", "name", "symbol" and "function". Attributes are preserved.
The documentation also notes that storage.mode<- is a more efficient primitive version of mode<-. So the following could also be used.
storage.mode(example_vec) <- "integer"
But as @joran pointed out in the comments, it looks like class<- also does the same thing.
Just to throw another option out there, since your input is a logical vector, you can use ifelse(). And one could argue this approach is more explicit and straightforward:
ifelse(example_vec,1L,0L);
## Y N M P L J H O F D
## 0 1 0 0 1 1 1 1 1 1
Benchmarking
library(microbenchmark);
ifelse. <- function(x) ifelse(x,1L,0L);
sapply. <- function(x) sapply(x,as.integer);
setstoragemode <- function(x) { storage.mode(x) <- 'integer'; x; };
setmode <- function(x) { mode(x) <- 'integer'; x; };
setclass <- function(x) { class(x) <- 'integer'; x; };
as.and.setnames <- function(x) setNames(as.integer(x),names(x));
plus <- function(x) +x;
addzero <- function(x) x+0L;
## small scale (OP's example input)
set.seed(1045L);
x <- sample(c(T,F),10L,T);
names(x) <- sample(LETTERS,10L);
ex <- ifelse.(x);
identical(ex,sapply.(x));
## [1] TRUE
identical(ex,setstoragemode(x));
## [1] TRUE
identical(ex,setmode(x));
## [1] TRUE
identical(ex,setclass(x));
## [1] TRUE
identical(ex,as.and.setnames(x));
## [1] TRUE
identical(ex,plus(x));
## [1] TRUE
identical(ex,addzero(x));
## [1] TRUE
microbenchmark(ifelse.(x),sapply.(x),setstoragemode(x),setmode(x),setclass(x),as.and.setnames(x),plus(x),addzero(x));
## Unit: nanoseconds
## expr min lq mean median uq max neval
## ifelse.(x) 6843 8126.0 9627.13 8981 9837.0 21810 100
## sapply.(x) 18817 20100.5 23234.93 21383 22666.5 71418 100
## setstoragemode(x) 856 1283.0 1745.54 1284 1711.0 15396 100
## setmode(x) 7270 8126.0 9862.36 8982 10264.0 32074 100
## setclass(x) 429 1283.0 2138.97 1284 1712.0 32075 100
## as.and.setnames(x) 1283 1711.0 1997.78 1712 2139.0 7271 100
## plus(x) 0 428.0 492.39 428 428.5 9837 100
## addzero(x) 0 428.0 539.39 428 856.0 2566 100
## large scale
set.seed(1L);
N <- 1e5L;
x <- sample(c(T,F),N,T);
names(x) <- make.unique(rep_len(LETTERS,N));
ex <- ifelse.(x);
identical(ex,sapply.(x));
## [1] TRUE
identical(ex,setstoragemode(x));
## [1] TRUE
identical(ex,setmode(x));
## [1] TRUE
identical(ex,setclass(x));
## [1] TRUE
identical(ex,as.and.setnames(x));
## [1] TRUE
identical(ex,plus(x));
## [1] TRUE
identical(ex,addzero(x));
## [1] TRUE
microbenchmark(ifelse.(x),sapply.(x),setstoragemode(x),setmode(x),setclass(x),as.and.setnames(x),plus(x),addzero(x));
## Unit: microseconds
## expr min lq mean median uq max neval
## ifelse.(x) 7633.598 7757.1900 16615.71251 7897.4600 29401.112 96503.642 100
## sapply.(x) 86353.737 102576.0945 125547.32957 123909.1120 137900.406 264442.788 100
## setstoragemode(x) 84.676 92.8015 343.46124 98.3605 113.543 23939.133 100
## setmode(x) 124.020 155.0245 603.15744 167.2125 181.111 22395.736 100
## setclass(x) 85.104 92.3740 328.25393 100.2850 118.460 21807.713 100
## as.and.setnames(x) 70.991 78.2610 656.98177 82.3235 88.953 35710.697 100
## plus(x) 40.200 42.9795 48.68026 44.9040 49.608 88.953 100
## addzero(x) 181.326 186.4580 196.34882 189.6650 201.211 282.679 100
## very large scale
set.seed(1L);
N <- 1e7L;
x <- sample(c(T,F),N,T);
names(x) <- make.unique(rep_len(LETTERS,N));
ex <- ifelse.(x);
identical(ex,sapply.(x));
## [1] TRUE
identical(ex,setstoragemode(x));
## [1] TRUE
identical(ex,setmode(x));
## [1] TRUE
identical(ex,setclass(x));
## [1] TRUE
identical(ex,as.and.setnames(x));
## [1] TRUE
identical(ex,plus(x));
## [1] TRUE
identical(ex,addzero(x));
## [1] TRUE
microbenchmark(ifelse.(x),sapply.(x),setstoragemode(x),setmode(x),setclass(x),as.and.setnames(x),plus(x),addzero(x),times=5L);
## Unit: milliseconds
## expr min lq mean median uq max neval
## ifelse.(x) 1082.220903 1308.106967 3452.639836 1473.723533 6306.320235 7092.82754 5
## sapply.(x) 16766.199371 17431.458634 18401.672635 18398.345499 18843.890150 20568.46952 5
## setstoragemode(x) 13.298283 13.648103 173.574496 19.661753 24.736278 796.52806 5
## setmode(x) 19.043796 19.878573 75.669779 19.969235 39.683589 279.77370 5
## setclass(x) 14.025292 14.119804 259.627934 14.414457 26.838618 1228.74150 5
## as.and.setnames(x) 12.889875 24.241484 178.243948 24.962934 25.103631 804.02182 5
## plus(x) 7.577576 7.676364 9.047674 8.245142 8.253266 13.48602 5
## addzero(x) 18.861615 18.960403 71.284716 26.622226 26.950662 265.02867 5
Looks like the unary plus takes the cake. (And my ifelse() idea kinda sucks.)
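For the record, unary plus works here because arithmetic on a logical vector coerces it to integer while preserving attributes such as names:
+c(a = TRUE, b = FALSE)
# a b
# 1 0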

Most efficient way to turn factor matrix into binary (indicator) matrix in R

I can think of several ways to turn a matrix (data frame) of this type:
dat = data.frame(
x1 = rep(c('a', 'b'), 100),
x2 = rep(c('x', 'y'), 100)
)
head(dat)
x1 x2
1 a x
2 b y
3 a x
4 b y
5 a x
6 b y
Into a binary (indicator) matrix (or data frame) like this:
a b x y
1 0 1 0
0 1 0 1
...
(This structure is, of course, trivial and only for illustrative purposes!)
Many thanks!
We can use table, cross-tabulating each row index (repeated once per column) against the stacked values:
tbl <- table(rep(1:nrow(dat),2),unlist(dat))
head(tbl, 2)
# a b x y
# 1 1 0 1 0
# 2 0 1 0 1
Or a possibly efficient option would be
library(Matrix)
sM <- sparse.model.matrix(~ -1 + x1 +x2, dat,
contrasts.arg = lapply(dat, contrasts, contrasts = FALSE))
colnames(sM) <- sub(".*\\d", "", colnames(sM))
head(sM, 2)
# 2 x 4 sparse Matrix of class "dgCMatrix"
# a b x y
#1 1 . 1 .
#2 . 1 . 1
It can be converted to an ordinary dense 0/1 matrix with as.matrix:
head(as.matrix(sM),2)
# a b x y
#1 1 0 1 0
#2 0 1 0 1
There are some good solutions posted already, but none are optimal for performance. We can optimize performance by looping over each input column, and then looping over each factor level index within each input column and doing a straight integer comparison of the factor indexes. It's not the most concise or elegant piece of code, but it's fairly straightforward and fast:
do.call(cbind, lapply(dat, function(col)          # for each input column ...
  `colnames<-`(
    do.call(cbind, lapply(seq_along(levels(col)), function(i)
      as.integer(as.integer(col) == i)            # 0/1 indicator for level i
    )),
    levels(col)                                   # ... label columns with its levels
  )
));
Performance:
library(Matrix);
library(data.table);
library(microbenchmark);
bgoldst <- function(dat) do.call(cbind,lapply(dat,function(col) `colnames<-`(do.call(cbind,lapply(seq_along(levels(col)),function(i) as.integer(as.integer(col)==i))),levels(col))));
akrun1 <- function(dat) table(rep(1:nrow(dat),2),unlist(dat));
akrun2 <- function(dat) sparse.model.matrix(~-1+x1+x2,dat,contrasts.arg=lapply(dat,contrasts,contrasts=FALSE));
davidar <- function(dat) { dat[,rowid:=.I]; dcast(melt(dat,id='rowid'),rowid~value,length); }; ## requires a data.table
dataminer <- function(dat) t(apply(dat,1,function(x) as.numeric(unique(unlist(dat))%in%x)));
N <- 100L; dat <- data.frame(x1=rep(c('a','b'),N),x2=rep(c('x','y'),N)); datDT <- setDT(copy(dat));
identical(unname(bgoldst(dat)),matrix(as.vector(akrun1(dat)),ncol=4L));
## [1] TRUE
identical(unname(bgoldst(dat)),unname(matrix(as.integer(as.matrix(akrun2(dat))),ncol=4L)));
## [1] TRUE
identical(bgoldst(dat),as.matrix(davidar(datDT)[,rowid:=NULL]));
## [1] TRUE
identical(unname(bgoldst(dat)),matrix(as.integer(dataminer(dat)),ncol=4L));
## [1] TRUE
N <- 100L;
dat <- data.frame(x1=rep(c('a','b'),N),x2=rep(c('x','y'),N)); datDT <- setDT(copy(dat));
microbenchmark(bgoldst(dat),akrun1(dat),akrun2(dat),davidar(datDT),dataminer(dat));
## Unit: microseconds
## expr min lq mean median uq max neval
## bgoldst(dat) 67.570 92.374 106.2853 99.6440 121.2405 188.596 100
## akrun1(dat) 581.182 652.386 773.6300 690.6605 916.4625 1192.299 100
## akrun2(dat) 4429.208 4836.119 5554.5902 5145.3135 5977.0990 11263.537 100
## davidar(datDT) 5064.273 5498.555 6104.7621 5664.9115 6203.9695 11713.856 100
## dataminer(dat) 47577.729 49529.753 55217.3726 53190.8940 60041.9020 74346.268 100
N <- 1e4L;
dat <- data.frame(x1=rep(c('a','b'),N),x2=rep(c('x','y'),N)); datDT <- setDT(copy(dat));
microbenchmark(bgoldst(dat),akrun1(dat),akrun2(dat),davidar(datDT));
## Unit: milliseconds
## expr min lq mean median uq max neval
## bgoldst(dat) 1.775617 1.820949 2.299493 1.84725 1.972124 8.362336 100
## akrun1(dat) 38.954524 41.109257 48.409613 45.60304 52.147633 162.365472 100
## akrun2(dat) 16.915832 17.762799 21.288200 19.20164 23.775180 46.494055 100
## davidar(datDT) 36.151684 38.366715 42.875940 42.38794 45.916937 58.695008 100
N <- 1e5L;
dat <- data.frame(x1=rep(c('a','b'),N),x2=rep(c('x','y'),N)); datDT <- setDT(copy(dat));
microbenchmark(bgoldst(dat),akrun1(dat),akrun2(dat),davidar(datDT));
## Unit: milliseconds
## expr min lq mean median uq max neval
## bgoldst(dat) 17.16473 22.97654 35.01815 26.76662 31.75562 152.6188 100
## akrun1(dat) 501.72644 626.14494 671.98315 680.91152 727.88262 828.8313 100
## akrun2(dat) 212.12381 242.65505 298.90254 272.28203 357.65106 429.6023 100
## davidar(datDT) 368.04924 461.60078 500.99431 511.54921 540.39358 638.3840 100
If you have a data.frame as you are showing (not a matrix), you could just as well recast the data:
library(data.table)
setDT(dat)[, rowid := .I] # Creates a row index
res <- dcast(melt(dat, id = "rowid"), rowid ~ value, length) # long/wide format
head(res)
# rowid a b x y
# 1 1 1 0 1 0
# 2 2 0 1 0 1
# 3 3 1 0 1 0
# 4 4 0 1 0 1
# 5 5 1 0 1 0
# 6 6 0 1 0 1
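The intermediate long format pairs each rowid with every value it holds; dcast then counts occurrences per (rowid, value), which is 0/1 here. A peek at the first molten rows (assuming the dat from the question, after setDT):
head(melt(dat, id = "rowid"), 4)
#    rowid variable value
# 1:     1       x1     a
# 2:     2       x1     b
# 3:     3       x1     a
# 4:     4       x1     b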
Some benchmarks
dat = data.frame(
x1 = rep(c('a', 'b'), 1e3),
x2 = rep(c('x', 'y'), 1e3)
)
library(data.table)
library(Matrix)
library(microbenchmark)
dat2 <- copy(dat)
microbenchmark("akrun1 : " = table(rep(1:nrow(dat),2),unlist(dat)),
"akrun2 : " = sparse.model.matrix(~ -1 + x1 +x2, dat, contrasts.arg = lapply(dat, contrasts, contrasts = FALSE)),
"DatamineR : " = t(apply(dat,1, function(x) as.numeric(unique(unlist(dat)) %in% x))),
"David Ar : " = {setDT(dat2)[, rowid := .I] ; dcast(melt(dat2, id = "rowid"), rowid ~ value, length)},
times = 10L)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# akrun1 : 3.826075 4.061904 6.654399 5.165376 11.26959 11.82029 10 a
# akrun2 : 5.269531 5.713672 8.794434 5.943422 13.34118 20.01961 10 a
# DatamineR : 3199.336286 3343.774160 3410.618547 3385.756972 3517.22133 3625.70909 10 b
# David Ar : 8.092769 8.254682 11.030785 8.465232 15.44893 19.83914 10 a
The apply solution is highly inefficient and will take forever on a bigger data set. Comparing on a bigger data set, excluding the apply solution:
dat = data.frame(
x1 = rep(c('a', 'b'), 1e4),
x2 = rep(c('x', 'y'), 1e4)
)
dat2 <- copy(dat)
microbenchmark("akrun1 : " = table(rep(1:nrow(dat),2),unlist(dat)),
"akrun2 : " = sparse.model.matrix(~ -1 + x1 +x2, dat, contrasts.arg = lapply(dat, contrasts, contrasts = FALSE)),
#"DatamineR : " = t(apply(dat,1, function(x) as.numeric(unique(unlist(dat)) %in% x))),
"David Ar : " = {setDT(dat2)[, rowid := .I] ; dcast(melt(dat2, id = "rowid"), rowid ~ value, length)},
times = 100L)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# akrun1 : 38.66744 41.27116 52.97982 42.72534 47.17203 161.0420 100 b
# akrun2 : 17.02006 18.93534 27.27582 19.35580 20.72022 153.2397 100 a
# David Ar : 34.15915 37.91659 46.11050 38.58536 41.40412 149.0038 100 b
Seems like the Matrix package shines for bigger data sets. It's probably worth comparing different scenarios, with more columns and more unique values per column, too.
One alternative using apply:
head(t(apply(dat,1, function(x) as.numeric(unique(unlist(dat)) %in% x))))
[,1] [,2] [,3] [,4]
[1,] 1 0 1 0
[2,] 0 1 0 1
[3,] 1 0 1 0
[4,] 0 1 0 1
[5,] 1 0 1 0
[6,] 0 1 0 1

How to omit rows with NA in only two columns in R?

I want to omit rows where NA appears in both of two columns.
I'm familiar with na.omit, is.na, and complete.cases, but can't figure out how to use these to get what I want. For example, I have the following dataframe:
(df <- structure(list(x = c(1L, 2L, NA, 3L, NA),
y = c(4L, 5L, NA, 6L, 7L),
z = c(8L, 9L, 10L, 11L, NA)),
.Names = c("x", "y", "z"),
class = "data.frame",
row.names = c(NA, -5L)))
x y z
1 4 8
2 5 9
NA NA 10
3 6 11
NA 7 NA
and I want to remove only those rows where NA appears in both the x and y columns (excluding anything in z), to give
x y z
1 4 8
2 5 9
3 6 11
NA 7 NA
Does anyone know an easy way to do this? Using na.omit, is.na, or complete.cases is not working.
df[!with(df,is.na(x)& is.na(y)),]
# x y z
#1 1 4 8
#2 2 5 9
#4 3 6 11
#5 NA 7 NA
I benchmarked on a slightly bigger dataset. Here are the results:
set.seed(237)
df <- data.frame(x=sample(c(NA,1:20), 1e6, replace=T), y= sample(c(NA, 1:10), 1e6, replace=T), z= sample(c(NA, 5:15), 1e6,replace=T))
f1 <- function() df[!with(df,is.na(x)& is.na(y)),]
f2 <- function() df[rowSums(is.na(df[c("x", "y")])) != 2, ]
f3 <- function() df[ apply( df, 1, function(x) sum(is.na(x))>1 ), ]
library(microbenchmark)
microbenchmark(f1(), f2(), f3(), unit="relative")
Unit: relative
#expr min lq median uq max neval
# f1() 1.000000 1.000000 1.000000 1.000000 1.000000 100
# f2() 1.044812 1.068189 1.138323 1.129611 0.856396 100
# f3() 26.205272 25.848441 24.357665 21.799930 22.881378 100
dplyr solution
require("dplyr")
df %>% filter_at(.vars = vars(x, y), .vars_predicate = any_vars(!is.na(.)))
can be modified to take any number of columns using the .vars argument
Update: dplyr 1.0.4
df %>%
filter(!if_all(c(x, y), is.na))
See similar answer: https://stackoverflow.com/a/66136167/6105259
You can use apply to slice up the rows:
sel <- apply( df, 1, function(x) sum(is.na(x))>1 )
Since sel marks the rows where both columns are NA, you then drop them by negating the selection:
df[ !sel, ]
To ignore the z column, just omit it from the apply:
sel <- apply( df[,c("x","y")], 1, function(x) sum(is.na(x))>1 )
If they all have to be TRUE, just change the function up a little:
sel <- apply( df[,c("x","y")], 1, function(x) all(is.na(x)) )
The other solutions here are more specific to this particular problem, but apply is worth learning as it solves many other problems. The cost is speed (usual caveats about small datasets and speed testing apply):
> microbenchmark( df[!with(df,is.na(x)& is.na(y)),], df[rowSums(is.na(df[c("x", "y")])) != 2, ], df[ apply( df, 1, function(x) sum(is.na(x))>1 ), ] )
Unit: microseconds
expr min lq median uq max neval
df[!with(df, is.na(x) & is.na(y)), ] 67.148 71.5150 76.0340 86.0155 1049.576 100
df[rowSums(is.na(df[c("x", "y")])) != 2, ] 132.064 139.8760 145.5605 166.6945 498.934 100
df[apply(df, 1, function(x) sum(is.na(x)) > 1), ] 175.372 184.4305 201.6360 218.7150 321.583 100
Use rowSums with is.na, like this:
> df[rowSums(is.na(df[c("x", "y")])) != 2, ]
x y z
1 1 4 8
2 2 5 9
4 3 6 11
5 NA 7 NA
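The intermediate vector makes the test transparent: rowSums counts the NAs per row across just x and y (here on the original 5-row df), and only rows reaching 2 are dropped.
rowSums(is.na(df[c("x", "y")]))
# [1] 0 0 2 0 1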
Jumping on the benchmarking wagon, and demonstrating what I was referring to about this being a fairly easy-to-generalize solution, consider the following:
## Sample data with 10 columns and 1 million rows
set.seed(123)
df <- data.frame(replicate(10, sample(c(NA, 1:20),
1e6, replace = TRUE)))
First, here's what things look like if you're just interested in two columns. Both solutions are pretty legible and short. Speed is quite close.
f1 <- function() {
df[!with(df, is.na(X1) & is.na(X2)), ]
}
f2 <- function() {
df[rowSums(is.na(df[1:2])) != 2, ]
}
library(microbenchmark)
microbenchmark(f1(), f2(), times = 20)
# Unit: milliseconds
# expr min lq median uq max neval
# f1() 745.8378 1100.764 1128.047 1199.607 1310.236 20
# f2() 784.2132 1101.695 1125.380 1163.675 1303.161 20
Next, let's look at the same problem, but this time, we are considering NA values across the first 5 columns. At this point, the rowSums approach is slightly faster and the syntax does not change much.
f1_5 <- function() {
df[!with(df, is.na(X1) & is.na(X2) & is.na(X3) &
is.na(X4) & is.na(X5)), ]
}
f2_5 <- function() {
df[rowSums(is.na(df[1:5])) != 5, ]
}
microbenchmark(f1_5(), f2_5(), times = 20)
# Unit: seconds
# expr min lq median uq max neval
# f1_5() 1.275032 1.294777 1.325957 1.368315 1.572772 20
# f2_5() 1.088564 1.169976 1.193282 1.225772 1.275915 20
This is also the very basic dplyr solution:
library(dplyr)
df %>%
filter(!(is.na(x) & is.na(y)))
x y z
1 1 4 8
2 2 5 9
3 3 6 11
4 NA 7 NA
