dplyr select based on multiple strings in a column name - r

I have a data frame containing the following columns:
sample.data
  a_b_c d_b_e r_f_g c_b_a
1     1     1     1     1
2     2     2     2     2
3     3     3     3     3
4     4     4     4     4
How do I select only the columns that contain both, let's say, "a" and "c" in the column name?

To select variables whose names contain both a and c we could do:
library(dplyr)
df %>%
  select(matches("(a.*c)|(c.*a)"))
  a_b_c c_b_a
1     1     1
2     2     2
3     3     3
4     4     4
Note that var a_a_e is not selected because it doesn't contain a c, and var c_f_g is not selected because it doesn't contain an a: the pattern requires at least one of each letter, in either order, no matter how many times the other letter appears.
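To see which names the pattern keeps, you can test it directly with grepl() (a quick check, not part of the original answer):
grepl("(a.*c)|(c.*a)", c("a_b_c", "a_a_e", "c_f_g", "c_b_a"))
#> [1]  TRUE FALSE FALSE  TRUE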
We could also use str_subset:
library(dplyr)
library(stringr)
df %>%
  select(str_subset(names(df), "(a.*c)|(c.*a)"))
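Recent dplyr versions warn when a plain character vector is used inside select(); wrapping it in all_of() keeps the same result and makes the intent explicit (a small variant, assuming dplyr >= 1.0):
df %>%
  select(all_of(str_subset(names(df), "(a.*c)|(c.*a)")))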
Data:
df <- data.frame(
  a_b_c = 1:4,
  a_a_e = 1:4,
  c_f_g = 1:4,
  c_b_a = 1:4
)

Try df %>% dplyr::select(matches("(a|c)")). Note that this pattern keeps any column whose name contains a or c; it reproduces the desired output here only because every name containing one of the letters also happens to contain the other.
library(dplyr)
df <- data.frame(
  a_b_c = 1:4,
  d_b_e = 1:4,
  r_f_g = 1:4,
  c_b_a = 1:4
)
Results
> df %>% dplyr::select(matches("(a|c)"))
  a_b_c c_b_a
1     1     1
2     2     2
3     3     3
4     4     4
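If you need a selection that genuinely requires both letters, tidyselect (dplyr 1.0+) lets you combine helpers with &; a sketch, assuming a recent dplyr:
df %>% dplyr::select(matches("a") & matches("c"))
This keeps only columns whose names match both patterns, so a column named d_b_e, or a hypothetical a_only column, would be dropped.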

If you want to see how such a selection can be done by hand, the following loop-based function performs the same check explicitly:
contain_both <- function(data_frame, letter_a, letter_b) {
  j <- 0
  keep_columns <- NULL
  for (i in seq_len(ncol(data_frame))) {
    # split the i-th column name on "_" into its component letters
    has_letters <- unlist(strsplit(names(data_frame)[i], "_"))
    if (is.element(letter_a, has_letters) && is.element(letter_b, has_letters)) {
      j <- j + 1
      keep_columns[j] <- i
    }
  }
  # drop = FALSE keeps a data frame even if only one column matches
  return(data_frame[, keep_columns, drop = FALSE])
}
Data:
df <- data.frame(1:4, 1:4, 1:4, 1:4)
names(df) <- c("a_b_c", "d_b_e", "r_f_g", "c_b_a")
Just pass in your data frame along with your two letter choices:
Usage:
contain_both(df, 'b', 'c')
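The same idea can be written without an explicit loop; a vectorised sketch (not from the original answer) that splits each name once and keeps columns whose parts include both letters:
contain_both2 <- function(data_frame, letter_a, letter_b) {
  parts <- strsplit(names(data_frame), "_")
  keep <- vapply(parts, function(p) all(c(letter_a, letter_b) %in% p), logical(1))
  data_frame[, keep, drop = FALSE]
}
contain_both2(df, "b", "c")  # same result as contain_both(df, "b", "c")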

Hope this is what you are looking for:
a_b_c <- c(1,2,3,4)
d_b_e <- c(1,2,3,4)
yy <- cbind(a_b_c, d_b_e)
> yy
     a_b_c d_b_e
[1,]     1     1
[2,]     2     2
[3,]     3     3
[4,]     4     4
yy <- as.data.frame(yy)
yy
  a_b_c d_b_e
1     1     1
2     2     2
3     3     3
4     4     4
y <- yy[which(names(yy) %in% "a_b_c")]
> y
  a_b_c
1     1
2     2
3     3
4     4
In your example, you can use this:
y <- sample.data[which(names(sample.data) %in% c("a_b_c", "c_b_a"))]
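If you would rather not hard-code the names, the same subsetting works with a logical mask built from grepl() (a base-R sketch along the lines of the regex answers above):
keep <- grepl("a", names(sample.data)) & grepl("c", names(sample.data))
sample.data[keep]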

Related

How to find duplicated values in two columns between two dataframes and remove non-duplicates in R?

So let's say I have two dataframes that look like this:
df1 <- data.frame(ID = c("A","B","F","G","B","B","A","G","G","F","A","A","A","B","F"),
                  code = c(1,2,2,3,3,1,2,2,1,1,3,2,2,1,1),
                  class = c(2,4,5,5,2,3,2,5,1,2,4,5,3,2,1))
df2 <- data.frame(ID = c("G","F","C","F","B","A","F","C","A","B","A","B","C","A","G"),
                  code = c(1,2,2,3,3,1,2,2,1,1,3,2,2,1,1),
                  class = c(2,4,5,5,2,3,2,5,1,2,4,5,3,2,1))
I want to check the duplicates in df1$ID and df2$ID and remove all the rows from df2 if the IDs are not present in df1 so the new dataframe would look like this:
df3 <- data.frame(ID = c("G","F","F","B","A","F","A","B","A","B","A","G"),
                  code = c(1,2,3,3,1,2,1,1,3,2,1,1),
                  class = c(2,4,5,2,3,2,1,2,4,5,2,1))
With %in%:
df2[df2$ID %in% df1$ID, ]
   ID code class
1   G    1     2
2   F    2     4
4   F    3     5
5   B    3     2
6   A    1     3
7   F    2     2
9   A    1     1
10  B    1     2
11  A    3     4
12  B    2     5
14  A    1     2
15  G    1     1
You can use the intersect() function to tackle the issue.
common_ids <- intersect(df1$ID, df2$ID)
df3 <- df2[df2$ID %in% common_ids, ]
   ID code class
1   G    1     2
2   F    2     4
4   F    3     5
5   B    3     2
6   A    1     3
7   F    2     2
9   A    1     1
10  B    1     2
11  A    3     4
12  B    2     5
14  A    1     2
15  G    1     1
I want to throw semi_join in.
library(tidyverse)
df_test <- df2 |> semi_join(df1, by = "ID")
all.equal(df3, df_test)
#> [1] TRUE
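The complementary anti_join() returns exactly the rows that get removed, which makes a handy sanity check:
df2 |> anti_join(df1, by = "ID")  # rows of df2 whose ID is absent from df1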

I have a list of data frames and a character vector. I want to rename the second column of each data frame by iterating through the vector. How do I?

I have a list of dataframes. Each of these dataframes has the same number of columns and rows, and has a similar data structure:
df.list <- list(data.frame1, data.frame2, data.frame3)
I have a vector of characters:
charvec <- c("a","b","c")
I want to replace the column name of the second column in each data frame by iterating through the above character vector. For example, the first data frame's second column should be "a". The second data frame's second column should be "b".
[[1]]
  col1 a
1    1 2
2    2 3

[[2]]
  col1 b
1    1 2
2    2 3
A reproducible example:
charvec <- c("a", "b", "c")
df_list <- list(df1 = data.frame(x = seq_len(3), y = seq_len(3)),
                df2 = data.frame(x = seq_len(4), y = seq_len(4)),
                df3 = data.frame(x = seq_len(5), y = seq_len(5)))
for (i in seq_along(df_list)) {
  names(df_list[[i]])[2] <- charvec[i]
}
> df_list
$df1
  x a
1 1 1
2 2 2
3 3 3

$df2
  x b
1 1 1
2 2 2
3 3 3
4 4 4

$df3
  x c
1 1 1
2 2 2
3 3 3
4 4 4
5 5 5
You can also use map2 from purrr. Thanks to @ismirsehregal for the example data.
library(purrr)
map2(
  df_list,
  charvec,
  \(x, y) {
    names(x)[2] <- y
    x
  }
)
Output
$df1
  x a
1 1 1
2 2 2
3 3 3

$df2
  x b
1 1 1
2 2 2
3 3 3
4 4 4

$df3
  x c
1 1 1
2 2 2
3 3 3
4 4 4
5 5 5
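In base R, Map() gives the same pairwise iteration without purrr (a minimal sketch):
df_list2 <- Map(function(d, nm) {
  names(d)[2] <- nm  # rename the second column
  d
}, df_list, charvec)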

Looping through dataframe to filter rows

In a given dataframe, I need to filter the rows on separate columns, one at a time, using the same condition. The following formulation does not work. Any suggestions?
DF <- data.frame(A = c(1,4,99),
                 B = c(2,5,6),
                 C = c(3,99,7))
r <- c("A", "C")
for (i in r) {
  column = as.formula(paste0("DF$", i))
  DF <- DF[column != 99, ]
  print(DF)
}
The desired outputs are the following two:
  A B  C
1 1 2  3
2 4 5 99

   A B C
1  1 2 3
3 99 6 7
We may use
library(dplyr)
library(purrr)
map(r, ~ DF %>%
       filter(!!rlang::sym(.x) != 99))
-output
[[1]]
  A B  C
1 1 2  3
2 4 5 99

[[2]]
   A B C
1  1 2 3
2 99 6 7
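With dplyr 1.0+ the .data pronoun expresses the same thing without the rlang::sym()/!! dance:
map(r, ~ DF %>% filter(.data[[.x]] != 99))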
Or in base R
lapply(r, \(x) subset(DF, DF[[x]] != 99))
[[1]]
  A B  C
1 1 2  3
2 4 5 99

[[2]]
   A B C
1  1 2 3
3 99 6 7
If the goal is to filter inside a loop and discard each subset after some computation:
library(data.table)
setDT(DF)
for (nm in r) {
  tmp <- DF[DF[[nm]] != 99]
  # ... do some calc ...
  rm(tmp)
  gc()
}

filter() or subset() all the dataframes stored in a list

If I want to remove all the rows that contain 0s in a specific column, I can just do:
df <- data.frame(a = c(0,1,2,3,0,5),
b = c(1,2,3,5,3,1))
df <- filter(df, a != 0)
How can I do the same if I'm working with lists?
My intuition tells me to use 'lapply' but I cannot seem to make the syntax work:
# same dataframe
df <- data.frame(a = c(0,1,2,3,0,5),
                 b = c(1,2,3,5,3,1))
df2 <- df
list.df <- list(df, df2)
lapply(list.df, filter(), a != 0) # doesn't work. How do I fix this syntax?
Many thanks in advance!
One option involving purrr could be:
library(dplyr)
library(purrr)
map(.x = list.df, ~ .x %>%
      filter(a != 0))
[[1]]
  a b
1 1 2
2 2 3
3 3 5
4 5 1

[[2]]
  a b
1 1 2
2 2 3
3 3 5
4 5 1
You have other options using lapply:
# Without dplyr
lapply(list.df, function(x) x[x$a != 0, ])
# With dplyr
library(dplyr)
lapply(list.df, function(x) filter(x, a != 0))
# Result
# [[1]]
#   a b
# 1 1 2
# 2 2 3
# 3 3 5
# 4 5 1
#
# [[2]]
#   a b
# 1 1 2
# 2 2 3
# 3 3 5
# 4 5 1

How can I subset a dataframe according to group membership?

I want to write a function so that a (potentially large) data frame can be subsetted according to group membership, where a 'group' is a unique combination of a set of column values.
For example, I would like to subset the following data frame according to unique combination of the first two columns (Loc1 and Loc2).
Loc1 <- c("A","A","A","A","B","B","B")
Loc2 <- c("a","a","b","b","a","a","b")
Dat1 <- c(1,1,1,1,1,1,1)
Dat2 <- c(1,2,1,2,1,2,2)
Dat3 <- c(2,2,4,4,6,5,3)
DF <- data.frame(Loc1, Loc2, Dat1, Dat2, Dat3)
  Loc1 Loc2 Dat1 Dat2 Dat3
1    A    a    1    1    2
2    A    a    1    2    2
3    A    b    1    1    4
4    A    b    1    2    4
5    B    a    1    1    6
6    B    a    1    2    5
7    B    b    1    2    3
I want to return (i) the number of groups (i.e. 4), (ii) the number in each group (i.e. c(2,2,2,1)), and (iii) to relabel the rows so that I can further analyse the data frame according to group membership (e.g. for ANOVA and MANOVA), i.e.:
Group <- as.factor(c(1,1,2,2,3,3,4))
Data <- cbind(Group, DF[,-1:-2])
  Group Dat1 Dat2 Dat3
1     1    1    1    2
2     1    1    2    2
3     2    1    1    4
4     2    1    2    4
5     3    1    1    6
6     3    1    2    5
7     4    1    2    3
So far all I have managed is to get the number of groups, and I'm suspicious that there's a better way to do even this:
nrow(unique(DF[,1:2]))
I was hoping to avoid for-loops as I am concerned about the function being slow.
I have tried converting to a data matrix so that I could concatenate the row values but I couldn't get that to work either.
Many thanks
You could try the following. Create the Group column from the unique level combinations of Loc1 and Loc2:
indx <- paste(DF[,1], DF[,2])
DF$Group <- as.numeric(factor(indx, unique(indx))) # query (iii)
DF1 <- DF[-(1:2)][, c(4, 1:3)]
#  Group Dat1 Dat2 Dat3
#1     1    1    1    2
#2     1    1    2    2
#3     2    1    1    4
#4     2    1    2    4
#5     3    1    1    6
#6     3    1    2    5
#7     4    1    2    3
table(DF$Group) # (ii)
#1 2 3 4
#2 2 2 1
length(unique(DF$Group)) # (i)
#[1] 4
Then, if you need to subset the dataset by group, you could split it on Group to create a list with 4 elements:
split(DF1, DF1$Group)
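Each element can then be pulled out by its group label:
by_group <- split(DF1, DF1$Group)
by_group[["3"]] # rows belonging to group 3
#  Group Dat1 Dat2 Dat3
#5     3    1    1    6
#6     3    1    2    5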
Update
If you have multiple columns, you could still try:
ColstoGroup <- 1:2
indx <- apply(DF[,ColstoGroup], 1, paste, collapse="")
as.numeric(factor(indx, unique(indx)))
#[1] 1 1 2 2 3 3 4
You could create a function;
fun1 <- function(dat, GroupCols) {
  FactGroup <- dat[, GroupCols]
  if (length(GroupCols) == 1) {
    dat$Group <- as.numeric(factor(FactGroup, levels = unique(FactGroup)))
  } else {
    indx <- apply(FactGroup, 1, paste, collapse = "")
    dat$Group <- as.numeric(factor(indx, unique(indx)))
  }
  dat
}
fun1(DF, "Loc1")
fun1(DF, c("Loc1", "Loc2"))
This answers all three of your queries. Begin with a table of the first two columns, then work with that data.
> (tab <- table(DF$Loc1, DF$Loc2))
#
#     a b
#   A 2 2
#   B 2 1
#
> (ct <- c(tab)) ## (ii)
# [1] 2 2 2 1
> length(unlist(dimnames(tab))) ## (i)
# [1] 4
> cbind(Group = rep(seq_along(ct), ct), DF[-c(1, 2)]) ## (iii)
#   Group Dat1 Dat2 Dat3
# 1     1    1    1    2
# 2     1    1    2    2
# 3     2    1    1    4
# 4     2    1    2    4
# 5     3    1    1    6
# 6     3    1    2    5
# 7     4    1    2    3
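Two caveats worth noting (my additions, not from the original answer): length(unlist(dimnames(tab))) counts row labels plus column labels (2 + 2 here), which equals the number of groups only by coincidence, and rep(seq_along(ct), ct) assumes DF is already sorted by Loc1/Loc2. Counting non-empty cells is more robust:
sum(tab > 0) # number of observed Loc1/Loc2 combinations
# [1] 4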
Borrowing a bit from this answer and using some dplyr idioms:
library(dplyr)
Loc1 <- c("A","A","A","A","B","B","B")
Loc2 <- c("a","a","b","b","a","a","b")
Dat1 <- c(1,1,1,1,1,1,1)
Dat2 <- c(1,2,1,2,1,2,2)
Dat3 <- c(2,2,4,4,6,5,3)
DF <- data.frame(Loc1, Loc2, Dat1, Dat2, Dat3)
emitID <- local({
  idCounter <- -1L
  function() {
    idCounter <<- idCounter + 1L
  }
})
DF %>% group_by(Loc1, Loc2) %>% mutate(Group=emitID())
##   Loc1 Loc2 Dat1 Dat2 Dat3 Group
## 1    A    a    1    1    2     0
## 2    A    a    1    2    2     0
## 3    A    b    1    1    4     1
## 4    A    b    1    2    4     1
## 5    B    a    1    1    6     2
## 6    B    a    1    2    5     2
## 7    B    b    1    2    3     3
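Current dplyr (1.0+) has this built in as cur_group_id(), which replaces the counter closure and yields 1-based group ids:
DF %>%
  group_by(Loc1, Loc2) %>%
  mutate(Group = cur_group_id()) %>%
  ungroup()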
