Subset a dataframe based on multiple columns simultaneously - r

Suppose I have two dataframes like
> r1 <- data.frame(replicate(5, sample(1:3, 5, replace = TRUE)))
> r2 <- data.frame(replicate(5, sample(1:3, 5, replace = TRUE)))
> r1
X1 X2 X3 X4 X5
1 2 3 1 3 1
2 1 3 1 1 3
3 3 2 3 3 3
4 1 1 1 2 3
5 1 1 3 2 3
> r2
X1 X2 X3 X4 X5
1 1 3 3 3 2
2 3 1 2 1 2
3 1 1 1 2 2
4 2 3 1 2 2
5 2 2 1 2 3
I would like to subset r1 so that the result would only contain rows where r1$X1 == r2$X1 AND r1$X2 == r2$X2 AND r1$X3 == r2$X3, e.g.
r1_example
X1 X2 X3 X4 X5
1 2 3 1 3 1
4 1 1 1 2 3
If I actually do subset, I'm getting more values, because some of the columns separately correspond to the r2 columns.
> r1_sub <- subset(r1, X1 %in% r2$X1 & X2 %in% r2$X2 & X3 %in% r2$X3)
> r1_sub
X1 X2 X3 X4 X5
1 2 3 1 3 1
2 1 3 1 1 3
3 3 2 3 3 3
4 1 1 1 2 3
5 1 1 3 2 3
I can figure out a workaround like
> r1$concat <- paste(r1$X1, '&', r1$X2, '&', r1$X3)
> r2$concat <- paste(r2$X1, '&', r2$X2, '&', r2$X3)
> r1_concat <- subset(r1, concat %in% r2$concat)
> r1_concat
X1 X2 X3 X4 X5 concat
1 2 3 1 3 1 2 & 3 & 1
4 1 1 1 2 3 1 & 1 & 1
But that's crude to say the least. Is there a more elegant solution?

You can keep only the columns X1 to X3 in r2 and merge the data:
merge(r1, r2[c('X1', 'X2', 'X3')])
X1 X2 X3 X4 X5
#1 1 1 1 2 3
#2 2 3 1 3 1
In dplyr :
library(dplyr)
r2 %>% select(X1:X3) %>% inner_join(r1)

using semi_join
library(tidyverse)
r1 <- read.table(text = " X1 X2 X3 X4 X5
1 2 3 1 3 1
2 1 3 1 1 3
3 3 2 3 3 3
4 1 1 1 2 3
5 1 1 3 2 3", header = T)
r2 <- read.table(text = " X1 X2 X3 X4 X5
1 1 3 3 3 2
2 3 1 2 1 2
3 1 1 1 2 2
4 2 3 1 2 2
5 2 2 1 2 3", header = T)
semi_join(r1, r2, by = c("X1", "X2", "X3"))
#> X1 X2 X3 X4 X5
#> 1 2 3 1 3 1
#> 4 1 1 1 2 3
Created on 2021-03-12 by the reprex package (v1.0.0)

Related

How to count the number of occurrences of a given value for each row?

I'm sure this is a really easy fix but I can't seem to find the answer... I am trying to create a column at the end of my dataframe that is a sum of the number of times a specific value (say "1") appears across that row. So for example, if I started with the following dataframe:
X1 <- c(5,1,7,8,1,5)
X2 <- c(5,0,0,2,3,7)
X3 <- c(6,2,3,4,1,7)
X4 <- c(1,1,5,2,1,7)
df <- data.frame(id,X1,X2,X3,X4)
id X1 X2 X3 X4
1 1 5 5 6 1
2 2 1 0 1 1
3 3 7 0 3 5
4 4 8 2 4 2
5 5 1 3 2 1
6 6 5 7 7 7
and I was trying to identify how many times the value "1" appears across that row, I would want the output to look like this:
id X1 X2 X3 X4 one_appears
1 1 5 5 6 1 2
2 2 1 0 1 1 3
3 3 7 0 3 5 0
4 4 8 2 4 2 0
5 5 1 3 2 1 2
6 6 5 7 7 7 0
Thanks very much in advance!
library(tidyverse)
df %>%
mutate(
one = rowSums(across(everything(), ~ .x == 1))
)
# A tibble: 6 × 6
id X1 X2 X3 X4 one
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 5 5 6 1 2
2 2 1 0 2 1 2
3 3 7 0 3 5 0
4 4 8 2 4 2 0
5 5 1 3 1 1 3
6 6 5 7 7 7 0
EDIT:
df %>%
mutate(
one = rowSums(across(starts_with("X"), ~ .x == 1))
)
df %>%
mutate(
one = rowSums(across(X1:X4, ~ .x == 1))
)
We can use rowSums on a logical matrix
df$one_appears <- rowSums(df == 1, na.rm = TRUE)
-output
> df
id X1 X2 X3 X4 one_appears
1 1 5 5 6 1 2
2 2 1 0 1 1 3
3 3 7 0 3 5 0
4 4 8 2 4 2 0
5 5 1 3 2 1 2
6 6 5 7 7 7 0
Another option using apply with sum:
id <- c(1:6)
X1 <- c(5,1,7,8,1,5)
X2 <- c(5,0,0,2,3,7)
X3 <- c(6,2,3,4,1,7)
X4 <- c(1,1,5,2,1,7)
df <- data.frame(id,X1,X2,X3,X4)
df$one_appear = apply(df, 1, \(x) sum(x == 1))
df
#> id X1 X2 X3 X4 one_appear
#> 1 1 5 5 6 1 2
#> 2 2 1 0 2 1 2
#> 3 3 7 0 3 5 0
#> 4 4 8 2 4 2 0
#> 5 5 1 3 1 1 3
#> 6 6 5 7 7 7 0
Created on 2023-01-18 with reprex v2.0.2
This answer may not be the best approach, but it is an alternative that I tried, so I thought I would share it.
code
library(dplyr)
X1 <- c(5,1,7,8,1,5)
X2 <- c(5,0,0,2,3,7)
X3 <- c(6,2,3,4,1,7)
X4 <- c(1,1,5,2,1,7)
df <- data.frame(X1,X2,X3,X4) %>% rowwise %>%
mutate(across(starts_with('X'), function(x) ifelse(x==1,1,NA), .names = 'Y_{col}'),
one_appears=sum(across(starts_with('Y')), na.rm = T)
)

Repeat a list into a data frame in R

I have a list let's say
k<-c(1,2,3,4)
I want to create a dataframe with let's say 5 rows using the same list in each row as shown below.
X1 X2 X3 X4
1 1 2 3 4
2 1 2 3 4
3 1 2 3 4
4 1 2 3 4
5 1 2 3 4
I tried doing:-
> rep(k, each = 5)
[1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 4 4 4 4 4
However I am not able to get intended result. Any suggestions?
data.frame(t(replicate(5, k)))
#OR
data.frame(matrix(rep(k, each = 5), 5))
#OR
data.frame(t(sapply(1:5, function(x) k)))
# X1 X2 X3 X4
#1 1 2 3 4
#2 1 2 3 4
#3 1 2 3 4
#4 1 2 3 4
#5 1 2 3 4
Here is one option: convert the vector to a list with as.list, change it to a data.frame (as.data.frame), and replicate the rows
as.data.frame(as.list(k))[rep(1, 5),]
# X1 X2 X3 X4
#1 1 2 3 4
#1.1 1 2 3 4
#1.2 1 2 3 4
#1.3 1 2 3 4
#1.4 1 2 3 4
Or another option is to take the transpose of the vector to get a row matrix, replicate the rows and convert to data.frame
as.data.frame(t(k)[rep(1, 5),])
In tidyverse, one option is to convert to tibble and then uncount
library(dplyr)
library(tidyr)
library(stringr)
as.list(k) %>%
set_names(str_c("X", seq_along(k))) %>%
as_tibble %>%
uncount(5)
# A tibble: 5 x 4
# X1 X2 X3 X4
# <dbl> <dbl> <dbl> <dbl>
#1 1 2 3 4
#2 1 2 3 4
#3 1 2 3 4
#4 1 2 3 4
#5 1 2 3 4
purrr::map_dfc(k, rep, 5)
# # A tibble: 5 x 4
# V1 V2 V3 V4
# <dbl> <dbl> <dbl> <dbl>
# 1 1 2 3 4
# 2 1 2 3 4
# 3 1 2 3 4
# 4 1 2 3 4
# 5 1 2 3 4
Using data.table:
k = c(1,2,3,4)
n = 5 # Number of rows
df = data.table()
df = df[, lapply(1:length(k), function(x) rep(k[x], n))]
> df
V1 V2 V3 V4
1: 1 2 3 4
2: 1 2 3 4
3: 1 2 3 4
4: 1 2 3 4
5: 1 2 3 4

Removing a different value from each column of a data frame

I have the following items
A<-data.frame(replicate(5,c(1,2,3,4)))
A= X1 X2 X3 X4 X5
1 1 1 1 1
2 2 2 2 2
3 3 3 3 3
4 4 4 4 4
B<-c(1,2,3,4,1)
B = 1 2 3 4 1
I want to find a way of removing the first element of B from the first column of A, the second element of B from the second column of A and so on so I obtain the following result
A= X1 X2 X3 X4 X5
2 1 1 1 2
3 3 2 2 3
4 4 4 3 4
Using mapply we can pass the columns of A and the elements of B in parallel, and keep only the values in each column that differ from the corresponding element of B
mapply(function(x, y) x[x != y], A, B)
# X1 X2 X3 X4 X5
#[1,] 2 1 1 1 2
#[2,] 3 3 2 2 3
#[3,] 4 4 4 3 4
PS - Make sure that ncol(A) and length(B) are the same otherwise it would lead to vector recycling giving some unexpected results.
A purrr solution:
A<-data.frame(replicate(5,c(1,2,3,4)))
# X1 X2 X3 X4 X5
# 1 1 1 1 1 1
# 2 2 2 2 2 2
# 3 3 3 3 3 3
# 4 4 4 4 4 4
B<-c(1,2,3,4,1)
# [1] 1 2 3 4 1
purrr::map2_df(A, B, ~.x[.x != .y]) # function(x,y) x[x != y]
# # A tibble: 3 x 5
# X1 X2 X3 X4 X5
# <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2 1 1 1 2
# 2 3 3 2 2 3
# 3 4 4 4 3 4

How to filter multiple columns using a single criterion

I have
4 5 6 7
1 3 3 3 3
2 1 2 2 1
3 2 1 1 NA
4 2 7 1 NA
5 1 1 1 1
I want to filter rows with either 2 or 3 in columns 1 to 4, so I only get rows 1, 2, 3 and 4
I tried
df1%>%filter_at(vars(4:7), all_vars(c(2,3)) -> df2
which returns
Error in filter_impl(.data, quo) : Result must have length 413, not 2
and
filter(d1[4:7]%in%c(1,3))
which returns
Error in filter_impl(.data, quo) : Result must have length 413, not 4
I want to avoid using
df1%>%filter(rowname1%in%c(1,3)|rowname1%in%c(1,3)| ...)
I don't get the syntax. Thanks
We can use any_vars and %in% to achieve this task.
library(dplyr)
df1 %>% filter_at(vars(1:4), any_vars(. %in% c(2, 3)))
# X4 X5 X6 X7
# 1 3 3 3 3
# 2 1 2 2 1
# 3 2 1 1 NA
# 4 2 7 1 NA
Or use == with |.
df1 %>% filter_at(vars(1:4), any_vars(. == 2 | . == 3))
# X4 X5 X6 X7
# 1 3 3 3 3
# 2 1 2 2 1
# 3 2 1 1 NA
# 4 2 7 1 NA
DATA
df1 <- read.table(text = " 4 5 6 7
1 3 3 3 3
2 1 2 2 1
3 2 1 1 NA
4 2 7 1 NA
5 1 1 1 1",
header = TRUE, stringsAsFactors = FALSE)

Deleting rows in a data frame based on the contents of the rows

If I have a code like the following:
x1 <- list(1,2,3,4,5,5)
x2 <- list(1,4,7,8)
x3 <- list(5,6)
x4 <- list(1,4,4,5,6,7)
x5 <- list(1,2,3,5,6,9)
x6 <- list(1,4, 6,7,8,7)
myList <- list(x1, x2, x3, x4,x5,x6)
df <- data.frame(t(sapply(myList, function(x){c(x, rep(tail(x, 1),max(lengths(myList)) - length(x)))
})))
Which gives a data frame like this
X1 X2 X3 X4 X5 X6
1 1 2 3 4 5 5
2 1 4 7 8 8 8
3 5 6 6 6 6 6
4 1 4 4 5 6 7
5 1 2 3 5 6 9
6 1 4 6 7 8 7
How could I delete the 2 rows that have the highest values of X6 and the 2 rows that have the lowest values of X6?
Try this (I updated my answer based on your updated sample df):
o <- order(unlist(df[names(df)[ncol(df)]]))
df[-c(head(o, 2), tail(o, 2)),]
# X1 X2 X3 X4 X5 X6
#4 1 4 4 5 6 7
#6 1 4 6 7 8 7
names(df)[ncol(df)] gives the name of the right most column in df.
In baseR, using subsetting with [:
#function sort sorts the df$X6 vector which we subset for the two highest and lowest values
mycol <- df[[rev(names(df))[1]]]
df[!mycol %in% c(sort(mycol)[1:2], rev(sort(mycol))[1:2]), ]
# X1 X2 X3 X4 X5 X6
#4 1 4 4 5 6 7
#6 1 4 6 7 8 7
In base R, a few simple steps can be used to arrive at the desired data.
# Data is:
# X1 X2 X3 X4 X5 X6
#1 1 2 3 4 5 5
#2 1 4 7 8 8 8
#3 5 6 6 6 6 6
#4 1 4 4 5 6 7
#5 1 2 3 5 6 9
#6 1 4 6 7 8 7
#order on X6
df <- df[order(df$X6),]
# > df
# X1 X2 X3 X4 X5 X6
# 1 2 3 4 5 5
# 5 6 6 6 6 6
# 1 4 4 5 6 7
# 1 4 6 7 8 7
# 1 4 7 8 8 8
# 1 2 3 5 6 9
#Remove top 2 rows
df <- tail(df, nrow(df) - 2)
#Remove bottom 2 rows (the ones with the highest values).
> df <- head(df, nrow(df) - 2)
#The result
# > df
# X1 X2 X3 X4 X5 X6
# 1 4 4 5 6 7
# 1 4 6 7 8 7

Resources