Extract rows from data frame - r

#Simulated data /model output
data1 <-data.frame(col1=c(10,20,30,40,50,60,70,80,90,100),
col2=c(2,4,6,8,10,12,14,16,20,22),
col3=c(3,9,12,15,18,21,24,27,30,33),
col4= c(4,8,12,16,20,24,28,32,36,40))
#Lower 95% CI from real data
lowdata <- as.data.frame(matrix(0,1,4))
lowdata[,1] <-5
lowdata[,2] <-34
lowdata[,3] <-25
lowdata[,4] <-30
# Higher 95% CI from real data
highdata <-as.data.frame(matrix(0,1,4))
highdata[,1] <- 59
highdata[,2] <- 60
highdata[,3] <- 50
highdata[,4] <- 49
I am new to R programming and not sure how to do the following.
I want to basically find out which ones of the data1 rows have values (for each column) that are within the Cis defined by two the two dataframes (lowdata1 and highdata1).
I have tried this, but it takes each value and not row:
wmax <- which(data1[,1:4] < highdata1[,1:4])
wmin <- which(data1[,1:4] > lowdata1[,1:4])
w <- intersect(wmax, wmin)
How could I achieve extracting which rows in data1 basically "fit" between the CIs?

You can check whether each element of each column is in the corresponding interval with
is.btwn <-
Map(function(x, low, high) x >= low & x <= high,
data1, lowdata, highdata)
is.btwn
# $col1
# [1] TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
#
# $col2
# [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#
# $col3
# [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE
#
# $col4
# [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE
You can use Reduce to get rows where all columns are in the interval, but in this case there are no such rows
Reduce(`&`, is.btwn)
# [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
Or the rows with any column in the corresponding interval
Reduce(`|`, is.btwn)
# [1] TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE
All of these results can be passed to which to get indices with value TRUE

Related

Identifying positions of the last TRUEs in a sequence of TRUEs and FALSEs

I have a vector of TRUEs and FALSEs:
x <- c(F,F,F,T,T,T,F,F,F,T,T,T,F,T,T)
I'd like to elegantly (and in base) identify the position of the last TRUE before it changes to FALSE.
The following works, though, it seems like it could be simplified:
c((x[-1] != x[-length(x)]),T) & x
> FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
Input and output:
Taking advantage of diff with an appended FALSE to catch the implied TRUE-to-FALSE at the end.
diff(c(x,FALSE)) == -1
# [1] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE
#[13] FALSE FALSE TRUE
We may look where x is greater than shifted x with 0 appended.
x>c(x[-1],0)
# [1] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
Check rle
rlex = rle(x)
end = cumsum(rlex$lengths)
x&(seq(length(x)) %in% end)
[1] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
Another layout suggested by Frank
seq_along(x) %in% with(rle(x), cumsum(lengths)[values])
[1] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
Another version with rle
x[setdiff(seq_along(x), with(rle(x), cumsum(lengths) * values))] <- FALSE
x
#[1] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
An option with duplicated
library(data.table)
!duplicated(rleid(x), fromLast = TRUE) & x
#[1] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
Non-base solution for identifying the last TRUE before a FALSE.
library(dplyr)
y <- data.frame(x = c(FALSE,FALSE,FALSE,TRUE,TRUE,TRUE,FALSE,FALSE,
FALSE,TRUE,TRUE,TRUE,FALSE,TRUE,TRUE))
y %>%
mutate(lasttrue = case_when(x == TRUE & lead(x) == FALSE ~ TRUE,
TRUE ~ FALSE))
Edit:
y %>%
mutate(lasttrue = case_when(x > lead(x) ~ T,
T ~ F))
benchmarks
Thanks for all the solutions. If anyone is interested in benchmarks:
library(dplyr)
library(data.table)
set.seed(1)
x <- sample(c(TRUE, FALSE), 1000000, replace = T)
y <- data.frame(x = x) # For M. Viking's solution
x_dt <- x # For Ronak Shah's solution
microbenchmark::microbenchmark(Khaynes = {Khaynes <- c((x[-1] != x[-length(x)]),T) & x},
jay.sf = {jay.sf <- x>c(x[-1],0)},
jay.sf_2 = {jay.sf_2 <- diff(c(x,0))<0},
thelatemail = {thelatemail <- diff(c(x,FALSE)) == -1},
WeNYoBen = {rlex = rle(x); end = cumsum(rlex$lengths); WeNYoBen <- x&(seq(length(x)) %in% end)},
M._Viking = {M._Viking <- y %>% mutate(lasttrue = case_when(x > lead(x) ~ T, T ~ F))},
akrun = {akrun <- !duplicated(rleid(x), fromLast = TRUE) & x},
frank = {frank <- seq_along(x) %in% with(rle(x), cumsum(lengths)[values])},
Ronak_Shah = {x_dt[setdiff(seq_along(x_dt), with(rle(x_dt), cumsum(lengths) * values))] <- FALSE},
times = 50)
# Output:
# Unit: milliseconds
# expr min lq mean median uq max neval
# Khaynes 23.0283 26.5010 31.76180 31.71290 37.1449 46.3824 50
# jay.sf 13.0630 13.5373 17.84056 13.77135 20.5462 73.5926 50
# jay.sf_2 26.1960 27.7653 35.25296 36.39615 39.3686 61.8858 50
# thelatemail 24.8204 26.7178 32.51675 33.50165 36.6328 41.9279 50
# WeNYoBen 83.9070 98.4700 107.79965 101.88475 107.1933 170.2940 50
# M._Viking 73.5963 83.4467 93.99603 86.58535 94.0915 151.7075 50
# akrun 42.5265 43.2879 48.42697 44.98085 51.1533 105.2836 50
# frank 81.9115 90.1559 95.40261 93.97015 98.2921 129.6162 50
# Ronak_Shah 109.0678 121.8230 133.10690 125.63930 133.7222 231.5350 50
all.equal(Khaynes, jay.sf)
all.equal(Khaynes, jay.sf_2)
all.equal(Khaynes, thelatemail)
all.equal(Khaynes, WeNYoBen)
all.equal(Khaynes, M._Viking$lasttrue) # When the last element is TRUE it will return false.
all.equal(Khaynes, akrun)
all.equal(Khaynes, frank)
all.equal(Khaynes, x_dt) # Ronak Shah solution.

Data split for train test for a model

I have two vectors and a data frame. I run a random generator as to obtain the split for training and test for a model. (TRUE train while FALSE test sets), if run multiple times the number of TRUE to FALSE changes in number (FALSE ranges from 4 to 8) as well as in position. This is an example, the actual data frame is much larger.
x <- c(1,2,3,5,4,1,2,3,5,7,4,2,1,5,6,8,5,3,2,4,6,8,9,0,2)
y <- c(3,5,7,8,4,2,2,5,4,7,9,0,0,7,6,4,2,2,1,4,6,8,9,0,0)
X <- data.frame(x,y)
runif(nrow (X)) <= 0.75
[1] TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE
FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE
I would like to find a function or to be able to instruct the generation of the split TRUE and FALSE sequentially with all elements named FALSE found only at the end while prior elements should be TRUE. It should yield something in line with the example below.
[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
I Have looked for functions that could do this without luck as a function that serves this purpose createTimeSlices in the caret package implies significant changes in the model which are difficult to implement.
On another hand I have the expression below where I obtain FALSE only at the end once while the rest is random as expected, however I can not come out with an expression that would produce sequentially the number of FALSE as instruted by the split to be found only in the last possitions while prior to this yields TRUE as in the example above.
S<- runif(nrow (X)) <= 0.75
S[length(S)] <- FALSE
while(S[length(S)] [!FALSE]) { S<-runif(nrow (X)) <= 0.75}
train<-print(S)
Any help is welcomed
Many thanks
Maybe I'm misunderstanding, but couldn't you do
S <- runif(nrow(X)) <= 0.75
sort(S,decreasing = TRUE)
[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[19] TRUE TRUE TRUE FALSE FALSE FALSE FALSE
that gives you (approximately) 75% TRUE values, always at the front of the vector.
correction?
It looks like you actually want the first 75% of rows (based on your comment above). in that case, I'd do this:
crit <- floor(nrow(X) * 0.75)
train <- seq_len(nrow(X)) < crit
train
[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
[19] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
Why not use sample? Anyways, all you need to do is sort your boolean vector:
Base R:
X$sample <- runif(nrow(X)) <= 0.75
X[order(X$sample, decreasing=TRUE), ]
Using dplyr:
library(dplyr)
X %>%
mutate(sample = runif(nrow(X)) <= 0.75) %>%
arrange(desc(sample))

Faster way to make this ifelse loop in r

I have a largish data frame in R [300000, 45]. I want to add a column (or create a vector) of TRUE/FALSE where a TRUE is assigned if the value of another column is different than the value above (i-1) and FALSE if they are the same. The basic R code would be:
etS$ar1TF <- NA
mode(etS$ar1TF) <- 'logical'
etS$ar1TF[1] <- TRUE
for(i in 2:length(etS$ar1TF)) {
if(etS$siteYear[i] == etS$siteYear[i-1]) {
etS$ar1TF[i] <- FALSE
} else {
etS$ar1TF[i] <- TRUE
}
}
However, this will be incredibly slow and inefficient. Are there better ways to use existing functions or vectorization to do this quickly and efficiently? I'm not sure if a while() statement would be any more efficient. I suppose I could start by assigning everything as TRUE then using the if statement within a for loop and removing the else statement but this really isn't much better. I'm not sure if the apply function would be faster or more efficient in this case because the size and type are already assigned.
Make use of vectorization. Something like below will do the trick:
ar1TF <- logical(length(siteYear))
ar1TF[-1] <- (siteYear[-1] != siteYear[-length(siteYear)])
ar1TF[1] <- NA
etS$ar1TF <- ar1TF # to add the column to the data.frame
EDIT: It seems that the diff solution may be a bit faster:
x <- sample(1:3, 100000, replace=TRUE)
library('microbenchmark')
microbenchmark({
y1 <- logical(length(x))
y1[-1] <- (x[-1] != x[-length(x)])
y1[1] <- NA
},{
y2 <- diff(x)
y2 <- c(NA, y2 != 0)
})
## Unit: microseconds
## expr min lq median uq max neval
## [!=] 1062.651 1070.690 1088.1935 1169.500 2367.582 100
## [diff] 811.121 821.443 844.3575 892.967 2244.022 100
You could use diff to perform the differencing:
vec = sample(1:10, 100, replace = TRUE)
diff(vec) == 0
[1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[25] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[37] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
[49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
[61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
[73] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[97] FALSE FALSE FALSE
The standard setting of diff uses a lag of 1, which is what you need. To add it to your data.frame, you need to append an NA:
df$new_col = c(NA, diff(vec) == 0)
Some basic timings show that this is quite fast, also for larger vectors:
> system.time(dum <- diff(sample(1:10, 10e3, replace = TRUE)) == 0)
user system elapsed
0.001 0.000 0.001
> system.time(dum <- diff(sample(1:10, 10e5, replace = TRUE)) == 0)
user system elapsed
0.189 0.012 0.202
> system.time(dum <- diff(sample(1:10, 10e7, replace = TRUE)) == 0)
user system elapsed
6.810 1.908 10.376
So, with your datasize the processing time should be less than a second. Note that these times include creating the test dataset, so the actually differencing is almost twice as fast.
Performing a direct comparison with a for loop based solution shows the difference in speed:
diff_for_loop = function(vec) {
result_vec = vec
for(i in seq_along(vec)[-1]) {
if(vec[i] == vec[i-1]) {
result_vec <- FALSE
} else {
result_vec <- TRUE
}
}
return(result_vec)
}
vec = sample(1:10, 10e5, replace = TRUE)
system.time(dum_for_loop <- diff_for_loop(vec))
# user system elapsed
# 1.220 0.008 1.232
system.time(dum_diff <- diff(vec) == 0)
# user system elapsed
# 0.051 0.005 0.056
Which makes the diff based solution 22 times faster.

find if each row of a logical matrix has at least one TRUE

is there some way to convert this data:
(Intercept) Timecoursecdc15 Timecoursecdc28 Timecourseclb Timecoursecln
YAL001C FALSE FALSE FALSE FALSE FALSE
YAL002W FALSE FALSE FALSE FALSE FALSE
YAL003W FALSE FALSE FALSE FALSE FALSE
YAL004W FALSE FALSE FALSE FALSE FALSE
YAL005C FALSE FALSE FALSE FALSE FALSE
YAL007C FALSE FALSE FALSE FALSE TRUE
to something like this:
YPR163C YPR164W YPR165W YPR166C YPR167C YPR168W YPR169W YPR170C
FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
YPR171W YPR172W YPR173C YPR174C YPR175W YPR176C YPR177C YPR178W
FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
YPR179C YPR180W YPR181C YPR182W YPR183W YPR184W YPR185W YPR186C
FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
YPR187W YPR188C YPR189W YPR190C YPR191W YPR192W YPR193C YPR194C
FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
YPR195C YPR196W YPR197C YPR198W YPR199C YPR200C YPR201W YPR202W
FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
So, looking at the values of each row, if any row contains TRUE, then YAL007C is TRUE.
Like this:
rowSums(dat) > 0L
which will be a lot faster than using a loop or apply...
apply(df, 1, any)
# YAL001C YAL002W YAL003W YAL004W YAL005C YAL007C
# FALSE FALSE FALSE FALSE FALSE TRUE
If your data is in a data frame, this is a fast approach:
setNames(Reduce("|", dat), rownames(dat))
where dat is the name of your data frame.
If you have a matrix, you can use this:
setNames(Reduce("|", as.data.frame(mat)), rownames(dat))
where mat is the name of your matrix.
The latter command works with both matrices and data frames.
Performance check (based on the example data in the question):
testDF <- read.table(text = "(Intercept) Timecoursecdc15 Timecoursecdc28 Timecourseclb Timecoursecln
YAL001C FALSE FALSE FALSE FALSE FALSE
YAL002W FALSE FALSE FALSE FALSE FALSE
YAL003W FALSE FALSE FALSE FALSE FALSE
YAL004W FALSE FALSE FALSE FALSE FALSE
YAL005C FALSE FALSE FALSE FALSE FALSE
YAL007C FALSE FALSE FALSE FALSE TRUE",
check.names = FALSE)
applyFun <- function() apply(testDF, 1, any)
rowSumsFun <- function() rowSums(testDF) > 0L
ReduceFun <- function() setNames(Reduce("|", testDF), rownames(testDF))
library(microbenchmark)
Unit: microseconds
expr min lq median uq max neval
applyFun() 234.444 237.6535 239.7680 250.0900 823.751 100
rowSumsFun() 153.645 155.8345 157.1610 159.5245 387.071 100
ReduceFun() 55.588 57.9465 60.1465 61.9545 370.339 100
# create data frame with 10000 times as many rows as the original one
testDF <- do.call(rbind, replicate(10000, testDF, simplify = FALSE))
microbenchmark(applyFun(), rowSumsFun(), ReduceFun())
Unit: milliseconds
expr min lq median uq max neval
applyFun() 337.457512 395.721527 429.13247 474.37774 698.43850 100
rowSumsFun() 5.591884 7.765213 9.17471 10.21152 16.93731 100
ReduceFun() 9.900725 11.418231 12.95423 13.32382 16.20043 100
Summary: For a small number of rows, the approach based on Reduce is the most efficient one, but the approach based on rowSums is the best for large data frames. I would prefer using the rowSums solution for the general case.

which rows match a given vector in R

I have a matrix A,
A = as.matrix(data.frame(col1 = c(1,1,2,3,1,2), col2 = c(-1,-1,-2,-3,-1,-2), col3 = c(2,6,1,3,2,4)))
And I have a vector v,
v = c(-1, 2)
How can I get a vector of TRUE/FALSE that compares the last two columns of the matrix and returns TRUE if the last two columns match the vector, or false if they don't?
I.e., If I try,
A[,c(2:3)] == v
I obtain,
col2 col3
[1,] TRUE FALSE
[2,] FALSE FALSE
[3,] FALSE FALSE
[4,] FALSE FALSE
[5,] TRUE FALSE
[6,] FALSE FALSE
Which is not what I want, I want both columns to be the same as vector v, more like,
result = c(TRUE, FALSE, FALSE, FALSE, TRUE, FALSE)
Since the first, and 5th rows match the vector v entirely.
Here's a simple alternative
> apply(A[, 2:3], 1, function(x) all(x==v))
[1] TRUE FALSE FALSE FALSE TRUE FALSE
Ooops by looking into R mailing list I found an answer: https://stat.ethz.ch/pipermail/r-help/2010-September/254096.html,
check.equal <- function(x, y)
{
isTRUE(all.equal(y, x, check.attributes=FALSE))
}
result = apply(A[,c(2:3)], 1, check.equal, y=v)
Not sure I need to define a function and do all that, maybe there are easier ways to do it.
Here's another straightforward option:
which(duplicated(rbind(A[, 2:3], v), fromLast=TRUE))
# [1] 1 5
results <- rep(FALSE, nrow(A))
results[which(duplicated(rbind(A[, 2:3], v), fromLast=TRUE))] <- TRUE
results
# [1] TRUE FALSE FALSE FALSE TRUE FALSE
Alternatively, as one line:
duplicated(rbind(A[, 2:3], v), fromLast=TRUE)[-(nrow(A)+1)]
# [1] TRUE FALSE FALSE FALSE TRUE FALSE
A dirty one:
result <- c()
for(n in 1:nrow(A)){result[n] <-(sum(A[n,-1]==v)==2)}
> result
[1] TRUE FALSE FALSE FALSE TRUE FALSE

Resources