I have a named dataframe containing logicals with missings, and I want to get a vector of the column names where the values are TRUE (going down the rows and, if there are multiple TRUEs in one row, going from left to right). Here is an example:
df <- data.frame(a = c(FALSE, NA, TRUE, TRUE),
                 b = c(TRUE, FALSE, FALSE, NA),
                 c = c(TRUE, TRUE, NA, NA))
df
#       a     b    c
# 1 FALSE  TRUE TRUE
# 2    NA FALSE TRUE
# 3  TRUE FALSE   NA
# 4  TRUE    NA   NA
expected <- c("b", "c", "c", "a", "a")
Going from the first to the last row, we see TRUE in the first row. There are multiple TRUEs, so we go from left to right and get "b" and "c". In the second row we get "c", and so on.
How to do this (in an elegant way)?
You can do in base R:
pos <- which(t(df) == TRUE, arr.ind = TRUE)
names(df)[pos[, "row"]]
[1] "b" "c" "c" "a" "a"
You can also try using apply:
unlist(apply(df, 1, function(x){na.omit(names(df)[x])}))
[1] "b" "c" "c" "a" "a"
Here is a tidyverse way:
library(dplyr)
library(tidyr)
vector <- df %>%
  mutate(across(everything(), ~ case_when(. == TRUE ~ cur_column()), .names = 'new_{col}')) %>%
  unite(New_Col, starts_with('new'), na.rm = TRUE, sep = ', ') %>%
  separate_rows(New_Col) %>%
  pull(New_Col)
Or:
library(dplyr)
library(tidyr)
df %>%
  mutate(across(everything(), ~ case_when(. == TRUE ~ cur_column()))) %>%
  pivot_longer(everything()) %>%
  na.omit() %>%
  pull(value)
[1] "b" "c" "c" "a" "a"
Another possible solution, based on purrr::pmap:
library(tidyverse)
pmap(df, ~ names(df)[c(...)] %>% na.omit) %>% unlist
#> [1] "b" "c" "c" "a" "a"
You can use %% (modulo) to identify the column indices.
names(df)[(which(t(df)) - 1) %% ncol(df) + 1]
# [1] "b" "c" "c" "a" "a"
Benchmark
df <- as.data.frame(matrix(sample(c(TRUE, FALSE, NA), 1e7, TRUE), 1e5, 1e2))
# A data.frame: 100,000 × 100
# V1 V2 V3 V4 V5 ...
# 1 TRUE TRUE NA FALSE FALSE ...
# 2 NA TRUE TRUE TRUE NA ...
# 3 NA FALSE FALSE FALSE TRUE ...
# 4 NA FALSE FALSE TRUE FALSE ...
# 5 NA FALSE FALSE FALSE TRUE ...
library(microbenchmark)
bm <- microbenchmark(
  Darren = {
    x1 <- names(df)[(which(t(df)) - 1) %% ncol(df) + 1]
  },
  Clemsang = {
    x2 <- names(df)[which(t(df) == TRUE, arr.ind = TRUE)[, "row"]]
  }
)
all(x1 == x2)
# [1] TRUE
bm
# Unit: milliseconds
# expr min lq mean median uq max neval
# Darren 140.5595 153.3333 163.7934 159.4783 167.5418 284.4146 100
# Clemsang 219.7802 242.6169 254.9226 250.8673 264.0462 356.9299 100
I want to evaluate whether ColA contains a new string that is not in ColB. However, I am not interested in certain strings, for example "oil". I would like to have an indicator variable as follows:
ColA                      ColB                 Ind
------------------------  -------------------  -----
coconut+grape+pine        grape+coconut        TRUE
orange+apple+grape+pine   grape+coconut        TRUE
grape+pine                grape+oil            TRUE
oil+grape                 grape+apple          FALSE
grape                     grape+oil            FALSE
grape+pine                grape+orange+pine    FALSE
Any Suggestions using R?
Many thanks!
Since we need to split the strings, we'll start with strsplit:
strsplit(dat$ColA, '+', fixed = TRUE)
# [[1]]
# [1] "coconut" "grape" "pine"
# [[2]]
# [1] "orange" "apple" "grape" "pine"
# [[3]]
# [1] "grape" "pine"
# [[4]]
# [1] "oil" "grape"
# [[5]]
# [1] "grape"
# [[6]]
# [1] "grape" "pine"
From here, we want to determine what is in ColA that is not in ColB. I'll use Map to run setdiff on each set (ColA's [[1]] with ColB's [[1]], etc).
Map(setdiff, strsplit(dat$ColA, '+', fixed = TRUE), strsplit(dat$ColB, '+', fixed = TRUE))
# [[1]]
# [1] "pine"
# [[2]]
# [1] "orange" "apple" "pine"
# [[3]]
# [1] "pine"
# [[4]]
# [1] "oil"
# [[5]]
# character(0)
# [[6]]
# character(0)
To determine which one has "new words", we can just check for non-zero length using lengths(.) > 0:
lengths(Map(setdiff, strsplit(dat$ColA, '+', fixed = TRUE), strsplit(dat$ColB, '+', fixed = TRUE))) > 0
# [1] TRUE TRUE TRUE TRUE FALSE FALSE
But since you don't care about oil, we need to remove that as well.
lapply(Map(setdiff, strsplit(dat$ColA, '+', fixed = TRUE), strsplit(dat$ColB, '+', fixed = TRUE)), setdiff, "oil")
# [[1]]
# [1] "pine"
# [[2]]
# [1] "orange" "apple" "pine"
# [[3]]
# [1] "pine"
# [[4]]
# character(0)
# [[5]]
# character(0)
# [[6]]
# character(0)
lengths(lapply(Map(setdiff, strsplit(dat$ColA, '+', fixed = TRUE), strsplit(dat$ColB, '+', fixed = TRUE)),
               setdiff, "oil")) > 0
# [1] TRUE TRUE TRUE FALSE FALSE FALSE
@akrun suggested a tidyverse variant:
library(dplyr)
library(purrr) # map2_lgl
library(stringr) # str_extract_all
dat %>%
  mutate(
    new = map2_lgl(
      str_extract_all(ColB, "\\w+"), str_extract_all(ColA, "\\w+"),
      ~ !all(setdiff(.y, "oil") %in% .x)
    )
  )
# ColA ColB Ind new
# 1 coconut+grape+pine grape+coconut TRUE TRUE
# 2 orange+apple+grape+pine grape+coconut TRUE TRUE
# 3 grape+pine grape+oil TRUE TRUE
# 4 oil+grape grape+apple FALSE FALSE
# 5 grape grape+oil FALSE FALSE
# 6 grape+pine grape+orange+pine FALSE FALSE
Data
dat <- structure(list(ColA = c("coconut+grape+pine", "orange+apple+grape+pine", "grape+pine", "oil+grape", "grape", "grape+pine"), ColB = c("grape+coconut", "grape+coconut", "grape+oil", "grape+apple", "grape+oil", "grape+orange+pine"), Ind = c(TRUE, TRUE, TRUE, FALSE, FALSE, FALSE)), class = "data.frame", row.names = c(NA, -6L))
Here's a solution similar to r2evans's that calls strsplit only once with the help of do.call.
rid <- function(x) x[!x %in% z] ## helper FUN to get rid of the oil
z <- "oil"
L <- sapply(unname(dat), strsplit, "\\+")
dat$ind <- sapply(1:nrow(L), function(x) length(do.call(setdiff, rev(Map(rid, L[x,]))))) > 0
dat
# V1 V2 ind
# 1 grape+coconut coconut+grape+pine TRUE
# 2 grape+coconut orange+apple+grape+pine TRUE
# 3 grape+oil grape+pine TRUE
# 4 grape+apple oil+grape FALSE
# 5 grape+oil grape FALSE
# 6 grape+orange+pine grape+pine FALSE
Data:
dat <- structure(list(V1 = c("grape+coconut", "grape+coconut", "grape+oil",
"grape+apple", "grape+oil", "grape+orange+pine"), V2 = c("coconut+grape+pine",
"orange+apple+grape+pine", "grape+pine", "oil+grape", "grape",
"grape+pine")), row.names = c(NA, -6L), class = "data.frame")
Here is a toy sample with 2 character variables. I have another vector of two characters. By comparing element by element I can get the result, but is there a more graceful way of doing it?
library(data.table)
set.seed(100)
DT <- data.table(V1 = LETTERS[sample(1:5, 10, replace = TRUE)],
                 V2 = LETTERS[sample(3:7, 10, replace = TRUE)])
V1V2 <- c("B", "G")
DT[V1 %in% V1V2[1] & V2 %in% V1V2[2]]
# V1 V2
# 1: B G
Since we can directly get the row elements with apply, e.g. DT[apply(DT[, .(V1, V2)], 1, print)], there should be a way to describe a multi-condition expression in i.
I'm expecting something like:
DT[.(V1, V2) %in% V1V2]
but this does not seem to work.
Thanks for advice.
Another option is to use the join-capabilities of data.table:
setkey(DT, V1, V2)
DT[as.list(V1V2)]
or:
DT[as.list(V1V2), on = .(V1, V2)]
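One detail worth noting (my addition, not part of the original answer): if the lookup values had no match in DT, the join would return a row of NAs by default; nomatch = NULL (or nomatch = 0 in older data.table versions) drops non-matching lookups instead. With the example DT above, this returns the single matching row:
DT[as.list(V1V2), on = .(V1, V2), nomatch = NULL]
#    V1 V2
# 1:  B  G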
We could use as.list to compare every element of V1V2 column-wise:
DT == as.list(V1V2)
# V1 V2
# [1,] TRUE FALSE
# [2,] TRUE TRUE
# [3,] FALSE FALSE
# [4,] FALSE FALSE
# [5,] FALSE FALSE
# [6,] FALSE FALSE
# [7,] FALSE FALSE
# [8,] TRUE FALSE
# [9,] FALSE FALSE
#[10,] FALSE FALSE
This compares V1V2[1] with the first column of DT and V1V2[2] with the second column. Now select the rows where all elements are TRUE:
DT[rowSums(DT == as.list(V1V2)) == ncol(DT), ]
# V1 V2
#1: B G
I have a matrix (1000 x 2830) like this:
    9178 3574 3547
160  B_B  B_B  A_A
301  B_B  A_B  A_B
303  B_B  B_B  A_A
311  A_B  A_B  A_A
312  B_B  A_B  A_A
314  B_B  A_B  A_A
and I want to obtain the following (duplicating colnames and splitting each element of each column):
    9178 9178 3574 3574 3547 3547
160    B    B    B    B    A    A
301    B    B    A    B    A    B
303    B    B    B    B    A    A
311    A    B    A    B    A    A
312    B    B    A    B    A    A
314    B    B    A    B    A    A
I tried using strsplit but I got error messages because this is a matrix, not a string. Could you please provide some ideas for resolving this?
Here's an option using dplyr (for bind_cols) and tidyr (for separate_) together with lapply from base R. It assumes that your data is a data.frame (i.e. you might need to convert it to data.frame first):
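For example, if your data is currently a matrix called mymat (name assumed here), the conversion could look like:
df <- as.data.frame(mymat, stringsAsFactors = FALSE)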
library(dplyr)
library(tidyr)
lapply(names(df), function(x) separate_(df[x], x, paste0(x, "_", 1:2), sep = "_")) %>%
  bind_cols()
# X9178_1 X9178_2 X3574_1 X3574_2 X3547_1 X3547_2
#1 B B B B A A
#2 B B A B A B
#3 B B B B A A
#4 A B A B A A
#5 B B A B A A
#6 B B A B A A
I'm biased, but I would recommend using cSplit from my "splitstackshape" package. Since it appears that you have rownames in your input, use as.data.table(., keep.rownames = TRUE):
library(splitstackshape)
cSplit(as.data.table(mydf, keep.rownames = TRUE), names(mydf), "_")
# rn X9178_1 X9178_2 X3574_1 X3574_2 X3547_1 X3547_2
# 1: 160 B B B B A A
# 2: 301 B B A B A B
# 3: 303 B B B B A A
# 4: 311 A B A B A A
# 5: 312 B B A B A A
# 6: 314 B B A B A A
Less legible than cSplit (but presently likely to be faster) would be to use stri_split_fixed from "stringi", like this:
library(stringi)
`dimnames<-`(do.call(cbind,
                     lapply(mydf, stri_split_fixed, "_", simplify = TRUE)),
             list(rownames(mydf), rep(colnames(mydf), each = 2)))
# X9178 X9178 X3574 X3574 X3547 X3547
# 160 "B" "B" "B" "B" "A" "A"
# 301 "B" "B" "A" "B" "A" "B"
# 303 "B" "B" "B" "B" "A" "A"
# 311 "A" "B" "A" "B" "A" "A"
# 312 "B" "B" "A" "B" "A" "A"
# 314 "B" "B" "A" "B" "A" "A"
If speed is of the essence, I would suggest checking out the "iotools" package, particularly the mstrsplit function. The approach would be similar to the "stringi" approach:
library(iotools)
`dimnames<-`(do.call(cbind,
                     lapply(mydf, mstrsplit, "_", ncol = 2, type = "character")),
             list(rownames(mydf), rep(colnames(mydf), each = 2)))
You may need to add an lapply(mydf, as.character) step if you forgot to use stringsAsFactors = FALSE when converting from a matrix to a data.frame, but it should still beat even the stri_split approach.
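That conversion step could look like this (a sketch, assuming the data.frame is called mydf as above):
mydf[] <- lapply(mydf, as.character)  # drop factors, keep the data.frame structure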
Something you can do, although it seems a bit "twisted" (yourmat being your matrix):
inter <- data.frame(t(sapply(as.vector(yourmat), function(x) {
                       strsplit(x, "_")[[1]]
                     })),
                    row.names = paste0(rep(colnames(yourmat), each = nrow(yourmat)), 1:nrow(yourmat)),
                    stringsAsFactors = FALSE)
res <- do.call("cbind",
               split(inter, factor(substr(row.names(inter), 1, 4), levels = colnames(yourmat))))
res
# 9178.X1 9178.X2 3574.X1 3574.X2 3547.X1 3547.X2
# 91781 B B B B A A
# 91782 B B A B A B
# 91783 B B B B A A
# 91784 A B A B A A
# 91785 B B A B A A
# 91786 B B A B A A
Edit
If you want the row.names of res to be the same as in yourmat, you can do:
row.names(res)<-row.names(yourmat)
NB: If yourmat is a data.frame instead of a matrix, the as.vector function in the first line needs to be changed to unlist.
A base R solution without using data frames:
# split
z <- unlist(strsplit(m,'_'))
M <- matrix(c(z[c(T,F)],z[c(F,T)]),nrow=nrow(m))
# properly order columns
i <- 1:ncol(M)
M <- M[,order(c(i[c(T,F)],i[c(F,T)]))]
# set dimnames
rownames(M) <- rownames(m)
colnames(M) <- rep(colnames(m),each=2)
# 9178 9178 3574 3574 3547 3547
# 160 "B" "B" "A" "B" "B" "A"
# 301 "B" "A" "A" "B" "B" "B"
# 303 "B" "B" "A" "B" "B" "A"
# 311 "A" "A" "A" "B" "B" "A"
# 312 "B" "A" "A" "B" "B" "A"
# 314 "B" "A" "A" "B" "B" "A"
[Update]
Here is a small benchmarking study of the proposed solutions (I didn't include the cSplit solution because it was too slow):
Setup:
m <- matrix('A_B',nrow=1000,ncol=2830)
d <- as.data.frame(m, stringsAsFactors = FALSE)
#####
f.mtrx <- function(m) {
z <- unlist(strsplit(m,'_'))
M <- matrix(c(z[c(T,F)],z[c(F,T)]),nrow=nrow(m))
# properly order columns
i <- 1:ncol(M)
M <- M[,order(c(i[c(T,F)],i[c(F,T)]))]
# set dimnames
rownames(M) <- rownames(m)
colnames(M) <- rep(colnames(m),each=2)
M
}
library(stringi)
f.mtrx2 <- function(m) {
z <- unlist(stri_split_fixed(m,'_'))
M <- matrix(c(z[c(T,F)],z[c(F,T)]),nrow=nrow(m))
# properly order columns
i <- 1:ncol(M)
M <- M[,order(c(i[c(T,F)],i[c(F,T)]))]
# set dimnames
rownames(M) <- rownames(m)
colnames(M) <- rep(colnames(m),each=2)
M
}
#####
library(splitstackshape)
f.cSplit <- function(mydf) cSplit(as.data.table(mydf, keep.rownames = TRUE), names(mydf), "_")
#####
library(stringi)
f.stringi <- function(mydf)
  `dimnames<-`(do.call(cbind,
                       lapply(mydf, stri_split_fixed, "_", simplify = TRUE)),
               list(rownames(mydf), rep(colnames(mydf), each = 2)))
#####
library(dplyr)
library(tidyr)
f.dplyr <- function(df) lapply(names(df), function(x) separate_(df[x], x, paste0(x, "_", 1:2), sep = "_")) %>%
  bind_cols()
#####
library(iotools)
f.mstrsplit <- function(mydf)
  `dimnames<-`(do.call(cbind,
                       lapply(mydf, mstrsplit, "_", ncol = 2, type = "character")),
               list(rownames(mydf), rep(colnames(mydf), each = 2)))
#####
library(rbenchmark)
benchmark(f.mtrx(m), f.mtrx2(m), f.dplyr(d), f.stringi(d), f.mstrsplit(d), replications = 10)
Results:
test replications elapsed relative user.self sys.self user.child sys.child
3 f.dplyr(d) 10 27.722 10.162 27.360 0.269 0 0
5 f.mstrsplit(d) 10 2.728 1.000 2.607 0.098 0 0
1 f.mtrx(m) 10 37.943 13.909 34.885 0.799 0 0
2 f.mtrx2(m) 10 15.176 5.563 13.936 0.802 0 0
4 f.stringi(d) 10 8.107 2.972 7.815 0.247 0 0
In the updated benchmark, the winner is f.mstrsplit.
I have a vector x containing 13 elements:
x <- letters[c(1:9, 12:15)]
x
# [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "l" "m" "n" "o"
I want to print the vector to a file in the following format, i.e. with 3 columns:
a b c
d e f
g h i
l m n
o
I can't find any direct way to print it in this way.
I tried to convert the vector to a matrix with 3 columns:
matrix(x, ncol = 3, byrow = TRUE)
# [,1] [,2] [,3]
# [1,] "a" "b" "c"
# [2,] "d" "e" "f"
# [3,] "g" "h" "i"
# [4,] "l" "m" "n"
# [5,] "o" "a" "b" # <~~ "a" and "b" recycled
# Warning message:
# In matrix(x, ncol = 3, byrow = TRUE) :
# data length [13] is not a sub-multiple or multiple of the number of rows [5]
But this method recycles the first two values of x ("a" and "b") to fill the end of the matrix, and I don't want that.
We can use stri_list2matrix from stringi after splitting the vector ('v1') into a list ('lst') of successive groups of 3 elements. The grouping can be done with gl or with %/%, i.e. (seq_along(v1) - 1) %/% 3 + 1 (shown after the output below).
library(stringi)
lst <- split(v1, as.numeric(gl(length(v1), 3, length(v1))))
stri_list2matrix(lst, byrow=TRUE, fill='')
# [,1] [,2] [,3]
#[1,] "a" "b" "c"
#[2,] "d" "e" "f"
#[3,] "g" "h" "i"
#[4,] "l" "m" "n"
#[5,] "o" "" ""
Or, using base R, we can pad with NAs those list elements that have fewer elements than the maximum length.
t(sapply(lst, `length<-`, max(sapply(lst, length))))
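The padded result looks roughly like this (NA instead of empty strings; my illustration):
#   [,1] [,2] [,3]
# 1 "a"  "b"  "c"
# 2 "d"  "e"  "f"
# 3 "g"  "h"  "i"
# 4 "l"  "m"  "n"
# 5 "o"  NA   NA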
data
v1 <- letters[c(1:9,12:15)]
Some possibilities:
m <- matrix(x, ncol = 3, byrow = TRUE)
# create a matrix of x indices of same dimension as "m"
# find duplicated indices, retain same dimensions using MARGIN = 0
# replace duplicated x indices with `""`
m[duplicated(matrix(seq_along(x), ncol = 3, byrow = TRUE), MARGIN = 0)] <- ""
m
# [,1] [,2] [,3]
# [1,] "a" "b" "c"
# [2,] "d" "e" "f"
# [3,] "g" "h" "i"
# [4,] "l" "m" "n"
# [5,] "o" "" ""
Or:
# create row index
rr <- ceiling(seq_along(x)/3)
# create column index
cc <- ave(rr, rr, FUN = seq_along)
# create an empty matrix
m <- matrix("", nrow = max(rr), ncol = 3)
# replace values at idx with X
m[cbind(rr, cc)] <- x
m
# [,1] [,2] [,3]
# [1,] "a" "b" "c"
# [2,] "d" "e" "f"
# [3,] "g" "h" "i"
# [4,] "l" "m" "n"
# [5,] "o" "" ""
Added benchmarks
a1 <- function(x){
lst <- split(x, as.numeric(gl(length(x), 3, length(x))))
stri_list2matrix(lst, byrow = TRUE, fill = '')
}
a2 <- function(x){
lst <- split(x, as.numeric(gl(length(x), 3, length(x))))
t(sapply(lst, `length<-`, max(sapply(lst, length))))
}
h1 <- function(x){
m <- matrix(x, ncol = 3, byrow = TRUE)
m[duplicated(matrix(seq_along(x), ncol = 3, byrow = TRUE), MARGIN = 0)] <- ""
m
}
h2 <- function(x){
rr <- ceiling(seq_along(x)/3)
cc <- ave(rr, rr, FUN = seq_along)
m <- matrix("", nrow = max(rr), ncol = 3)
m[cbind(rr, cc)] <- x
m
}
x <- sample(letters, 13, replace = TRUE)
library(microbenchmark)
microbenchmark(
  a1(x),
  a2(x),
  h1(x),
  h2(x),
  times = 5)
# Unit: microseconds
# expr min lq mean median uq max neval cld
# a1(x) 135.708 140.270 171.5926 144.451 162.317 275.217 5 a
# a2(x) 270.276 270.655 277.2696 271.035 283.960 290.422 5 b
# h1(x) 191.968 217.436 246.4028 224.659 225.420 372.531 5 ab
# h2(x) 408.264 409.784 425.2176 411.685 412.445 483.910 5 c
x <- sample(letters, 1e6, replace = TRUE)
microbenchmark(
  a1(x),
  a2(x),
  h1(x),
  h2(x),
  times = 5)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# a1(x) 2406.03363 2411.56990 2554.72548 2606.9757 2623.604 2725.4443 5 b
# a2(x) 4266.47556 4292.69452 4510.61242 4513.6833 4653.349 4826.8594 5 c
# h1(x) 76.51557 76.79041 91.24598 78.8207 101.336 122.7672 5 a
# h2(x) 2419.89711 2570.22968 2636.08654 2663.2898 2751.662 2775.3540 5 b
Thus, for a small vector a1 is faster. For a larger vector h1 is about 25 times faster.
Is there an equivalent of == for which comparing a non-NA value with NA comes out as "not equal" (FALSE) rather than NA?
The following does what I want, but it's clunky:
mapply(identical, vec1, vec2)
Just replace "==" with %in%.
Example:
> df <- data.frame(col1= c("a", "b", NA), col2= 1:3)
> df
col1 col2
1 a 1
2 b 2
3 <NA> 3
> df[df$col1=="a", ]
col1 col2
1 a 1
NA <NA> NA
> df[df$col1%in%"a", ]
col1 col2
1 a 1
> "x"==NA
[1] NA
> "x"%in%NA
[1] FALSE
1 == NA returns a logical NA rather than TRUE or FALSE. If you want NA to count as FALSE, you could add a second condition:
set.seed(1)
x <- 1:10
x[4] <- NA
y <- sample(1:10, 10)
x <= y
# [1] TRUE TRUE TRUE NA FALSE TRUE TRUE FALSE TRUE FALSE
x <= y & !is.na(x)
# [1] TRUE TRUE TRUE FALSE FALSE TRUE TRUE FALSE TRUE FALSE
You could also use a second processing step to convert all the NA values from your equality test to FALSE.
foo <- x <= y
foo[is.na(foo)] <- FALSE
foo
# [1] TRUE TRUE TRUE FALSE FALSE TRUE TRUE FALSE TRUE FALSE
Also, for what it's worth, NA == NA returns NA, as does NA != NA.
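If you want this behaviour as a reusable vectorised operator, one possible helper is (my sketch, not a base R function):
`%==%` <- function(a, b) !is.na(a) & !is.na(b) & a == b  # NA on either side counts as "not equal"
c(1, NA, 3) %==% c(1, 2, NA)
# [1]  TRUE FALSE FALSE
If you'd rather have NA compared with NA count as equal, add | (is.na(a) & is.na(b)) to the helper.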
The == operator is often used when filtering data.frames. In that situation, dplyr::filter will retain only rows where your condition evaluates to TRUE, unlike [. That effectively implements ==, but where 1 == NA evaluates as FALSE.
Example:
> df <- data.frame(col1= c("a", "b", NA), col2= 1:3)
> df
col1 col2
1 a 1
2 b 2
3 <NA> 3
> dplyr::filter(df, col1=="a")
col1 col2
1 a 1
Why not use base R:
df <- data.frame(col1 = c("a", "b", NA), col2 = 1:3, col3 = 11:13)
df
subset(x = df, subset = col1=="a", select = c(col1, col2))
# col1 col2
# 1 a 1
or with a plain vector:
df <- c("a", "b", NA)
subset(x = df, subset = df == "a")
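For reference (my note), subset() drops elements where the condition is NA, whereas plain [ keeps them:
df[df == "a"]
# [1] "a" NA
subset(df, df == "a")
# [1] "a"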