Select rows based on value in multiple columns defined by vector - r

I have the following data frame
# Example data: ten "A" rows followed by ten "B" rows; B2 and C3 alternate on
# every row, and D4 holds the numeric payload.
df <- data.frame(
  A1 = rep(c("A", "B"), each = 10),
  B2 = rep(c("C", "D"), times = 10),
  C3 = rep(c("E", "F"), times = 10),
  D4 = c(1, 12, 5, 41, 45, 4, 5, 6, 12, 7, 3, 4, 6, 8, 12, 4, 12, 1, 6, 7)
)
and I would like to subset all the rows for which the first 3 columns match the vector c("A","C","E")
I have tried to use which but it does not work
# Attempt from the question; it does not return the wanted rows.
vct <- c("A","C","E")
# `==` between a data frame and a vector compares element-wise, with the vector
# recycled down the columns rather than matched once per row; in addition,
# single-bracket indexing without a comma selects columns, not rows.
df[which(df[1:3] == vct)]

You can probably use paste (or interaction):
vct <- c("A", "C", "E")
# Collapse the first three columns of every row into one space-separated key
# and compare it with the key built the same way from the target vector.
row_key <- do.call(paste, df[1:3])
wanted_key <- paste(vct, collapse = " ")
row_key %in% wanted_key
# [1] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE FALSE
# [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Use that logical vector as the row index.
df[row_key %in% wanted_key, ]
# A1 B2 C3 D4
# 1 A C E 1
# 3 A C E 5
# 5 A C E 45
# 7 A C E 5
# 9 A C E 12
## with "interaction"
# interaction() joins the levels with ".", so the target key is joined with "." too.
combo <- interaction(df[1:3], drop = TRUE)
df[combo %in% paste(vct, collapse = "."), ]
You can also do something like this:
# Keep the rows whose first three columns equal "A", "C" and "E" respectively.
df[df$A1 == "A" & df$B2 == "C" & df$C3 == "E", ]

Related

Getting rows with same or different logical values in R

I have a dataframe of two columns of T and F.
I want to know
which row is T in the first and F in the second
which row is F in the first and T in the second
which row is F in both
I have very little clue on the matter; can someone shed some light?
You can use case when
library(dplyr)
# Toy data: the T/F flags are stored as strings; the last row holds an
# unexpected value ("X") to exercise the fallback branch.
df <- data.frame(x = c("T", "T", "F", "F", "F"), y = c("T", "F", "T", "F", "X"))
# Label every row by its combination of x and y.
df %>%
  mutate(condition = case_when(
    x == "T" & y == "T" ~ "Both are T",
    x == "T" & y == "F" ~ "First is T, second is F",
    x == "F" & y == "F" ~ "Both are F",
    x == "F" & y == "T" ~ "First is F, second is T",
    TRUE ~ "Something else" # any combination not listed above
  ))
#> x y condition
#> 1 T T Both are T
#> 2 T F First is T, second is F
#> 3 F T First is F, second is T
#> 4 F F Both are F
#> 5 F X Something else
Created on 2021-08-05 by the reprex package (v2.0.0)
Here is one possible way to solve your problem:
library(dplyr)
# Every TRUE/FALSE combination of the two columns, each repeated twice.
df <- data.frame(
  a = rep(c(TRUE, FALSE, TRUE, FALSE), each = 2),
  b = rep(c(TRUE, TRUE, FALSE, FALSE), each = 2)
)
# a b
# 1 TRUE TRUE
# 2 TRUE TRUE
# 3 FALSE TRUE
# 4 FALSE TRUE
# 5 TRUE FALSE
# 6 TRUE FALSE
# 7 FALSE FALSE
# 8 FALSE FALSE
# Label each row; rows where both columns are TRUE fall through to "other".
mutate(
  df,
  newcol = case_when(
    a & !b ~ "first=T second=F",
    !a & b ~ "first=F second=T",
    !a & !b ~ "both=F",
    TRUE ~ "other"
  )
)
# a b newcol
# 1 TRUE TRUE other
# 2 TRUE TRUE other
# 3 FALSE TRUE first=F second=T
# 4 FALSE TRUE first=F second=T
# 5 TRUE FALSE first=T second=F
# 6 TRUE FALSE first=T second=F
# 7 FALSE FALSE both=F
# 8 FALSE FALSE both=F
You can treat the [a,b] columns as a vector of 2-bit binary numbers, where a*2+b converts each one from binary to decimal (0 to 3). Thus, 2*a+b+1 maps each row to an index in 1,2,3,4.
Try the base R code below
# Lookup table indexed by 2 * a + b + 1 (the logical columns coerce to 0/1).
labels <- c("both=F", "first=F,second=T", "first=T,second=F", "other")
transform(df, newcol = labels[2 * a + b + 1])
which gives
a b newcol
1 TRUE TRUE other
2 TRUE TRUE other
3 FALSE TRUE first=F,second=T
4 FALSE TRUE first=F,second=T
5 TRUE FALSE first=T,second=F
6 TRUE FALSE first=T,second=F
7 FALSE FALSE both=F
8 FALSE FALSE both=F
Data
# Every TRUE/FALSE combination of the two columns, each repeated twice.
df <- data.frame(
  a = rep(c(TRUE, FALSE, TRUE, FALSE), each = 2),
  b = rep(c(TRUE, TRUE, FALSE, FALSE), each = 2)
)

Filter rows that contain specific boolean value in any column in a dataframe in R

Let's say I have a data frame:
# Example data: numeric columns w and z1 surround three logical flag columns.
data <- data.frame(
  w = c(1, 2, 3, 4),
  x = rep(FALSE, 4),
  y = c(TRUE, TRUE, FALSE, TRUE),
  z = c(TRUE, FALSE, FALSE, TRUE),
  z1 = c(12, 4, 5, 15)
)
data
#> w x y z z1
#> 1 1 FALSE TRUE TRUE 12
#> 2 2 FALSE TRUE FALSE 4
#> 3 3 FALSE FALSE FALSE 5
#> 4 4 FALSE TRUE TRUE 15
Question
How do I filter the rows in which all boolean variables are FALSE? In this case, row 3.
Or in other words, I would like to get a data frame that has at least one TRUE value per row.
Expected output
#> w x y z z1
#> 1 1 FALSE TRUE TRUE 12
#> 2 2 FALSE TRUE FALSE 4
#> 3 4 FALSE TRUE TRUE 15
Attempt
library(tidyverse)
data %>% filter(x == T | y == T | z == T)
#> w x y z z1
#> 1 1 FALSE TRUE TRUE 12
#> 2 2 FALSE TRUE FALSE 4
#> 3 4 FALSE TRUE TRUE 15
Above is a working option, but not scalable at all. Is there a more convenient option using the dplyr's filter() function?
rowSums() is a good option - TRUE is 1, FALSE is 0.
# Names of the logical columns to test.
cols <- c("x", "y", "z")
# rowSums() must be *called* on the selected columns; TRUE counts as 1 and
# FALSE as 0, so the row sum is the number of TRUE flags in each row.
## all FALSE
data[rowSums(data[cols]) == 0, ]
## at least 1 TRUE
data[rowSums(data[cols]) >= 1, ]
## etc.
With dplyr, I would use the same idea like this:
# Keep rows with at least one TRUE among `cols`. `select(., ...)` is called
# directly on the piped data frame; writing `. %>% select(...)` would build a
# function instead of a data frame, and rowSums() cannot take a function.
data %>%
  filter(
    rowSums(select(., all_of(cols))) >= 1
  )
With dplyr's filter(),
library(dplyr)
filter(data, (x + y + z) > 0 )
w x y z z1
1 1 FALSE TRUE TRUE 12
2 2 FALSE TRUE FALSE 4
3 4 FALSE TRUE TRUE 15
# All three variants drop the rows in which x, y and z are all FALSE.
# after @Gregor Thomas's suggestion on using TRUE or FALSE
data[!(apply(!data[, c('x', 'y', 'z')], 1, all)), ]
# without rowSums
data[!(apply(data[, c('x', 'y', 'z')] == FALSE, 1, all)), ]
# with rowSums
data[rowSums(data[, c('x', 'y', 'z')] == FALSE) != 3, ]
# w x y z z1
#1 1 FALSE TRUE TRUE 12
#2 2 FALSE TRUE FALSE 4
#4 4 FALSE TRUE TRUE 15

vector filling true or false based on the given conditions in R without cycle

I have sample vector and some values:
q = c(0.00000000, -0.70218526, -0.60635393, 0.32325554, -0.45921704, -0.57336113, -0.77683717,
-1.76347868, -1.90884891, -0.86157465, -0.72896622, -0.86831735, -0.79357262, -0.65279976,
0.39921356, 0.78018094, 0.75703279, 0.70898895, 1.10155383, 0.88428135, 0.81338108,
0.65611568, 0.89776945, 0.65447442, 0.16289673, 0.19464041, 0.01762445, -0.57663945,
-1.01231868, -0.81204022, -0.99165533, -0.62666993, -1.05661282, -0.78221866, -0.03129549, 1.04051915)
s = -1.59688
i = -0.6373684
z = 0
I need to create a new vector in which boolean values will be filled according to the following conditions:
if q is less than i we fill TRUE until
q becomes more than 0 (that is, z)
or until q becomes less than s.
If Filling has stopped due to the condition of the s value, then you need to wait until the q becomes greater than 0 (that is, z) and
only after that you can start filling TRUE again, otherwise fill in FALSE
As a result, for this sample data, you should get the following result (I filled it in manually):
out <- c(FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE)
out
[1] FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
I would like to do it without loops, since they are too slow in R
** FURTHER EDIT in view of OP's request to state logic/strategy**
Actually, your conditions are combination of three conditions. If we create four zones, as I created in the plot above and name the zones as 1 to 4 with
1 where q values are >= z
2 where q values are >= i
3 where q values are >= s
4 where q values are < s
Now, the conditions can be translated as
TRUE when in zone 2 and 3
But if exited once from these TRUE zones, it will become TRUE only if it arrives in zone 3
Moreover, if it has hit zone 4, it can become TRUE only if it arrives/hits zone 1, at least once.
Strategy
To integrate these all, I used tidyverse piped syntax
First divide all values in respective zones (say q1)
As a first condition (c1), mark zones 2 & 3 as `TRUE` and all others as `FALSE`.
As a second condition (c2), i.e. whether q has hit zone 1 since it last hit zone 4, mark zone 4 as F, zone 1 as T, and everything else as NA.
The first value can be NA, so replace it, if NA, with the first value of c1.
As the last condition (c3), i.e. TRUE once q has arrived in zone 3, mark zone 3 as TRUE, zones 1 and 4 as FALSE, and leave zone 2 as NA so that it can later inherit the value of the zone it was entered from.
The first value can be NA, so replace it, if NA, with FALSE.
Now the only job remaining is to fill the NAs in c2 and c3. Use zoo::na.locf or tidyr::fill, which fill each NA with the last available value.
Your final desired result is combination of all conditions so c1 & c2 & c3
q = c(0.00000000, -0.70218526, -0.60635393, 0.32325554, -0.45921704, -0.57336113, -0.77683717,
-1.76347868, -1.90884891, -0.86157465, -0.72896622, -0.86831735, -0.79357262, -0.65279976,
0.39921356, 0.78018094, 0.75703279, 0.70898895, 1.10155383, 0.88428135, 0.81338108,
0.65611568, 0.89776945, 0.65447442, 0.16289673, 0.19464041, 0.01762445, -0.57663945,
-1.01231868, -0.81204022, -0.99165533, -0.62666993, -1.05661282, -0.78221866, -0.03129549, 1.04051915)
s = -1.59688
i = -0.6373684
z = 0
library(tidyverse)
# Build a one-column data frame from q, derive the zone and the three
# conditions per row, then combine them into the final logical vector.
q %>% as.data.frame() %>% setNames('q') %>%
# q1: zone number 1-4; case_when() uses the first condition that matches.
mutate(q1 = case_when(q >= z ~ 1,
q >= i ~ 2,
q >= s ~ 3,
TRUE ~ 4),
# c1: TRUE while q sits in zone 2 or 3.
c1 = q1 %in% c(2,3),
# c2: FALSE on hitting zone 4, TRUE on hitting zone 1, NA (undecided) otherwise.
c2 = case_when(q1 == 4 ~ F,
q1 == 1 ~ T,
TRUE ~ NA),
# Seed the first row from c1 so the later fill has a starting value.
c2 = ifelse(row_number() == 1 & is.na(c2), c1, c2),
# c3: FALSE in zones 1 and 4, TRUE in zone 3, NA (undecided) in zone 2.
c3 = case_when(q1 %in% c(1,4) ~ F,
q1 == 3 ~ T,
TRUE ~ NA),
# Seed the first row with FALSE when it is still undecided.
c3 = ifelse(row_number() ==1 & is.na(c3), F, c3)) %>%
# Replace each remaining NA in c2/c3 with the previous non-NA value.
fill(c2, c3) %>%
# A row is TRUE only when all three conditions hold.
transmute(output = c1 & c2 & c3) %>% pull(output)
#> [1] FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
#> [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#> [25] FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
Created on 2021-06-02 by the reprex package (v2.0.0)
OLD ANSWER
#data given
q = c(0.00000000, -0.70218526, -0.60635393, 0.32325554, -0.45921704, -0.57336113, -0.77683717,
-1.76347868, -1.90884891, -0.86157465, -0.72896622, -0.86831735, -0.79357262, -0.65279976,
0.39921356, 0.78018094, 0.75703279, 0.70898895, 1.10155383, 0.88428135, 0.81338108,
0.65611568, 0.89776945, 0.65447442, 0.16289673, 0.19464041, 0.01762445, -0.57663945,
-1.01231868, -0.81204022, -0.99165533, -0.62666993, -1.05661282, -0.78221866, -0.03129549, 1.04051915)
s = -1.59688
i = -0.6373684
z = 0
#loading libraries
library(dplyr)
library(tidyr)
#creating zones
# Zone number per element; case_when() uses the first matching condition, so
# with s < i < z: 1 is q >= z, 2 is i <= q < z, 3 is s <= q < i, 4 is q < s.
q1 <- dplyr::case_when(q >= z ~ 1,
                       q >= i ~ 2,
                       q >= s ~ 3,
                       TRUE ~ 4)
#first condition
# TRUE while q sits in zone 2 or 3 (%in% never yields NA).
c1 <- q1 %in% c(2, 3)
#second condition (third in above statements)
# FALSE on hitting zone 4, TRUE on hitting zone 1, NA (undecided) otherwise.
c2 <- dplyr::case_when(q1 == 4 ~ FALSE,
                       q1 == 1 ~ TRUE,
                       TRUE ~ NA)
# Seed an undecided first value from c1 so the fill below has a starting value.
if (length(c2) > 0 && is.na(c2[1])) c2[1] <- c1[1]
# Carry the last decided value forward over the NAs.
c2 <- tidyr::fill(data.frame(id = seq_along(q), c2 = c2), c2)$c2
#third condition
# TRUE in zone 3, FALSE in zones 1 and 4, NA (undecided) in zone 2.
c3 <- dplyr::case_when(q1 == 3 ~ TRUE,
                       q1 %in% c(1, 4) ~ FALSE,
                       TRUE ~ NA)
# Seed an undecided first value with FALSE.
if (length(c3) > 0 && is.na(c3[1])) c3[1] <- FALSE
c3 <- tidyr::fill(data.frame(id = seq_along(q), c3 = c3), c3)$c3
#creating output
# An element is TRUE only when all three conditions hold.
output <- (c1 & c2 & c3)
> output
[1] FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[21] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
#check it with your given `out`
> which((c1 & c2 & c3) == out)
[1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
#OR
> which((c1 & c2 & c3) != out)
integer(0)
UPDATE If you want to use baseR only, use these expressions/codes for c2 and c3
#second condition
# FALSE on hitting zone 4, TRUE on hitting zone 1, NA (undecided) otherwise;
# built with base ifelse() so that no dplyr call is needed here.
c2 <- ifelse(q1 == 4, FALSE, ifelse(q1 == 1, TRUE, NA))
# Seed an undecided first value from c1; without this a series that starts in
# zone 2 or 3 would keep leading NAs and propagate them into the output.
if (length(c2) > 0 && is.na(c2[1])) c2[1] <- c1[1]
# Carry the last decided value forward: cumsum(known) gives, for every
# position, the index of the most recent decided value within c2[known].
known <- !is.na(c2)
c2 <- c2[known][cumsum(known)]
#third condition
# TRUE in zone 3, FALSE in zones 1 and 4, NA (undecided) in zone 2.
c3 <- ifelse(q1 == 3, TRUE, ifelse(q1 %in% c(1, 4), FALSE, NA))
# Seed an undecided first value with FALSE.
if (length(c3) > 0 && is.na(c3[1])) c3[1] <- FALSE
known <- !is.na(c3)
c3 <- c3[known][cumsum(known)]
for new data
q <- c(-0.01563733, -0.05829460, -0.05884189, -0.08954093, -0.13268677, -0.31748724, -0.40060792, -0.08515156, -0.14303489, -0.24525535, -0.93842637, -0.77738228, -1.29502715, -0.89000932, -1.49038656, -1.64953167, -1.67114179, -1.47482366, -0.85874778, -1.01021450, -0.90078260, -1.24313333, -0.99053914, -1.11684140, -1.34073045, -1.36406163, -1.25163185, -1.42429376, -1.48127185, -1.79040671, -2.26811789, -1.82124304, -1.85208201, -1.76394637, -1.63173292)
i = -0.489
s = -1.032
z = 0
#after running the above code
> output
[1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE
[17] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[33] FALSE FALSE FALSE
and chart
for a new vector with random values
set.seed(202)
q <- runif(35, -2, 2)

Order a matrix depending on row and column of another in R

Hello, I need to order the column and row names of a matrix according to another matrix; here is an example
M1
D E F
A 1 2 3
B 4 5 6
C 7 8 9
M2
F D E
C T F F
A F T T
So here I would like to first sort the M2 columns so that they are in the same order as in M1,
and then sort the rows (as you can see, M2 has no row B as M1 does, so I simply add a new one filled with F).
New_M2
D E F
A T T F
B F F F
C F F T
I know, for example, how to sort the columns using M2[,colnames(M1)] but that is all...
Step 1. Match column and row names of M1 and M2
# Positions of M1's row and column names within M2; a name that M2 lacks gives
# NA, which selects an all-NA row.
row_pos <- match(rownames(M1), rownames(M2))
col_pos <- match(colnames(M1), colnames(M2))
M3 <- M2[row_pos, col_pos]
# D E F
# A TRUE TRUE FALSE
# <NA> NA NA NA
# C FALSE FALSE TRUE
Step 2. Set the dimnames and replace missing values with FALSE
dimnames(M3) <- dimnames(M1)
M3[is.na(M3)] <- FALSE
# D E F
# A TRUE TRUE FALSE
# B FALSE FALSE FALSE
# C FALSE FALSE TRUE
Data
# M1: 3 x 3 integer matrix filled row by row.
M1 <- matrix(
  1:9,
  nrow = 3, ncol = 3, byrow = TRUE,
  dimnames = list(c("A", "B", "C"), c("D", "E", "F"))
)
# M2: 2 x 3 logical matrix (filled column by column) whose rows and columns
# are in a different order from M1.
M2 <- matrix(
  c(TRUE, FALSE, FALSE, TRUE, FALSE, TRUE),
  nrow = 2, ncol = 3,
  dimnames = list(c("C", "A"), c("F", "D", "E"))
)
Here's a way but perhaps it isn't the best one :
#Get the rownames which are missing
# Row names present in M1 but missing from M2 (R is case-sensitive, so the
# matrices must be referenced as M1/M2, exactly as they are defined).
diff_row <- setdiff(rownames(M1), rownames(M2))
#Create a matrix with `FALSE` values for those rownames
m3 <- matrix(FALSE, nrow = length(diff_row), ncol = ncol(M2),
             dimnames = list(diff_row, colnames(M2)))
#rbind it to M2 matrix
m4 <- rbind(M2, m3)
#rearrange based on M1 matrix
# drop = FALSE keeps the result a matrix even when M1 has one row or column.
m4[rownames(M1), colnames(M1), drop = FALSE]
# D E F
#A TRUE TRUE FALSE
#B FALSE FALSE FALSE
#C FALSE FALSE TRUE

convert dataframe to venn diagram table

So, I am writing a function that takes a dataframe and a unique number in the range <1, 5>
let say we want a unique number to be 3 in this case
# Number of observations to simulate.
how_much <- 100
# Two independent columns of `how_much` draws (with replacement) from 1..5.
A <- sample(1:5, size = how_much, replace = TRUE)
B <- sample(1:5, size = how_much, replace = TRUE)
VennData <- data.frame(A = A, B = B)
and then return a described table as below:
count A B
24 TRUE TRUE
20 TRUE FALSE
13 FALSE TRUE
43 FALSE FALSE
when we can see that we have 24 observations where both A and B is equal to 3,
20 observations have A equal to 3 and B non equal to 3,
13 observations have A not equal to 3 and B equal to 3 etc...
With set.seed(43)
library(dplyr)
VennData %>%
mutate(A = (A == 3),
B = (B == 3)) %>%
count(A, B)
## A tibble: 4 x 3
# A B n
# <lgl> <lgl> <int>
#1 FALSE FALSE 64
#2 FALSE TRUE 20
#3 TRUE FALSE 13
#4 TRUE TRUE 3
In base R,
aggregate(Count ~ ., transform(VennData, A = A == 3, B = B == 3, Count = 1), sum)
# A B Count
#1 FALSE FALSE 64
#2 TRUE FALSE 13
#3 FALSE TRUE 20
#4 TRUE TRUE 3
An option with data.table
library(data.table)
set.seed(43)
setDT(VennData)[, .N, .(A = A == 3, B = B == 3)]
# A B N
#1: FALSE FALSE 64
#2: FALSE TRUE 20
#3: TRUE TRUE 3
#4: TRUE FALSE 13

Resources