Related
So let's say I have two dataframes that look like this
df1 <- data.frame(ID = c("A","B","F","G","B","B","A","G","G","F","A","A","A","B","F"),
code = c(1,2,2,3,3,1,2,2,1,1,3,2,2,1,1),
class = c(2,4,5,5,2,3,2,5,1,2,4,5,3,2,1))
df2 <- data.frame(ID = c("G","F","C","F","B","A","F","C","A","B","A","B","C","A","G"),
code = c(1,2,2,3,3,1,2,2,1,1,3,2,2,1,1),
class = c(2,4,5,5,2,3,2,5,1,2,4,5,3,2,1))
I want to check the duplicates in df1$ID and df2$ID and remove all the rows from df2 if the IDs are not present in df1 so the new dataframe would look like this:
df3 <- data.frame(ID = c("G","F","F","B","A","F","A","B","A","B","A","G"),
code = c(1,2,3,3,1,2,1,1,3,2,1,1),
class = c(2,4,5,2,3,2,1,2,4,5,2,1))
With %in%:
df2[df2$ID %in% df1$ID, ]
ID code class
1 G 1 2
2 F 2 4
4 F 3 5
5 B 3 2
6 A 1 3
7 F 2 2
9 A 1 1
10 B 1 2
11 A 3 4
12 B 2 5
14 A 1 2
15 G 1 1
You can use the 'intersect' function to tackle the issue.
common_ids <- intersect(df1$ID, df2$ID)
df3 <- df2[df2$ID %in% common_ids, ]
ID code class
1 G 1 2
2 F 2 4
4 F 3 5
5 B 3 2
6 A 1 3
7 F 2 2
9 A 1 1
10 B 1 2
11 A 3 4
12 B 2 5
14 A 1 2
15 G 1 1
I want to throw semi_join in.
library(tidyverse)
df_test <- df2 |> semi_join(df1, by = "ID")
all.equal(df3, df_test)
#> [1] TRUE
I have to subset the data of 6 rows every time. How to do that in R?
data:
col1 : 1,2,3,4,5,6,7,8,9,10
col2 : a1,a2,a3,a4,a5,a6,a7,a8,a9,a10
I want to do subset of 6 rows every time. First subset of the rows will have 1:6 ,next subset of the rows will have 7:nrow(data). I have tried using seq function.
seqData <- seq(1,nrow(data),6)
output: It is giving 1 and 7th row but I want 1 to 6 rows first, next onwards 7 to nrow(data).
How to get output like that.
Will this work:
set.seed(1)
dat <- data.frame(c1 = sample(1:5,12,T),
c2 = sample(1:5,12,T))
dat
c1 c2
1 1 2
2 4 2
3 1 1
4 2 5
5 5 5
6 3 1
7 2 1
8 3 5
9 3 5
10 1 2
11 5 2
12 5 1
split(dat, rep(1:ceiling(nrow(dat)/6), each = 6))
$`1`
c1 c2
1 1 2
2 4 2
3 1 1
4 2 5
5 5 5
6 3 1
$`2`
c1 c2
7 2 1
8 3 5
9 3 5
10 1 2
11 5 2
12 5 1
The function below creates a numeric vector with integers increasing by 1 unit every n rows. And uses this vector to split the data as needed.
data <- data.frame(col1 = 1:10, col2 = paste0("a", 1:10))
split_nrows <- function(x, n){
f <- c(1, rep(0, n - 1))
f <- rep(f, length.out = NROW(x))
f <- cumsum(f)
split(x, f)
}
split_nrows(data, 6)
Here's a simple example with mtcars that yields a list of 6 subset dfs.
nrows <- nrow(mtcars)
breaks <- seq(1, nrows, 6)
listdfs <- lapply(breaks, function(x) mtcars[x:(x+5), ]) # increment by 5 not 6
listdfs[[6]] <- listdfs[[6]][1:2, ] #last df: remove 4 NA rows (36 - 32)
I have a data frame containing following columns:-
sample.data
a_b_c d_b_e r_f_g c_b_a
1 1 1 1 1
2 2 2 2 2
3 3 3 3 3
4 4 4 4 4
How do I select only columns that contain both let's say "a" and "c" in the column name?
To select variables that contain a and c we could do:
library(dplyr)
df %>%
select(matches("(a.*c)|(c.*a)"))
a_b_c c_b_a
1 1 1
2 2 2
3 3 3
4 4 4
Note that var a_a_e is not selected because it doesn't contain c and var c_f_g is not selected because it doesn't contain a. Column names with two a's and two c's will not be selected either as seen with var a_a_e.
We could also use str_subset:
library(dplyr)
library(stringr)
df %>%
select(str_subset(names(df), "(a.*c)|(c.*a)"))
Data:
df <- data.frame(
a_b_c = 1:4,
a_a_e = 1:4,
c_f_g = 1:4,
c_b_a = 1:4
)
Try df %>% dplyr::select(matches("(a|c)"))
library(dplyr)
df <- data.frame(
a_b_c=1:4,
d_b_e=1:4,
r_f_g=1:4,
c_b_a=1:4
)
Results
> df %>% dplyr::select(matches("(a|c)"))
a_b_c c_b_a
1 1 1
2 2 2
3 3 3
4 4 4
If you want to see how it works under the hood, use the following function:
contain_both <- function(data_frame, letter_a, letter_b) {
j <- 0
keep_columns <- NULL
for(i in 1:ncol(data_frame)) {
has_letters <- unlist(strsplit(names(data_frame)[i], '_'))
if(is.element(letter_a, has_letters) && is.element(letter_b, has_letters)) {
j <- j + 1
keep_columns[j] <- i
}
}
return(data_frame[, keep_columns])
}
Data:
df <- data.frame(seq(1:4), seq(1:4), seq(1:4), seq(1:4))
names(df) <- c('a_b_c', 'd_b_e', 'r_f_g', 'c_b_a')
Just pass in your data frame, along with your 2 letter choices:
Usage:
contain_both(df, 'b', 'c')
Hope this is what you are looking for:
a_b_c <- c(1,2,3,4)
d_b_e <- c(1,2,3,4)
yy <- cbind(a_b_c, d_b_e)
> yy
a_b_c d_b_e
[1,] 1 1
[2,] 2 2
[3,] 3 3
[4,] 4 4
yy <- as.data.frame(yy)
yy
a_b_c d_b_e
1 1 1
2 2 2
3 3 3
4 4 4
y <- yy[which(names(yy) %in% "a_b_c")]
> y
a_b_c
1 1
2 2
3 3
4 4
In your example, you can use this:
y <- sample.data[which(names(sample.data) %in% c("a_b_c","c_b_a" )]
Let's say I have this simple data frame:
df <- data.frame(x=c(1,3,3,1,3,1), y = c(2,2,2,2,2,2),z = c('a','b','c','d','e','f'))
> df
x y z
1 1 2 a
2 3 2 b
3 3 2 c
4 1 2 d
5 3 2 e
6 1 2 f
I would like to subset where x= 3, return only column x and y and include a calculated colum x+y.
I can get the first 2 things done, but I can't get the caclulated column to also appear.
df[df$x==3,c("x","y")]
How can I do that, but using base R only.
Staying in base, just do a rowSums before your subset.
df$xy <- rowSums(df[, c("x", "y")])
df[df$x == 3, c("x", "y", "xy")]
# x y xy
# 2 3 2 5
# 3 3 2 5
# 5 3 2 5
Personally, I do prefer the dplyr approach, which #akrun commented on your question.
You can also do like this
df <- data.frame(x=c(1,3,3,1,3,1), y = c(2,2,2,2,2,2),z = c('a','b','c','d','e','f'))
df$z <- ifelse(df$x == 3, (df$x + df$y), df$y)
df
x y z
1 1 2 2
2 3 2 5
3 3 2 5
4 1 2 2
5 3 2 5
6 1 2 2
I am wanting to write a function so that a (potentially large) dataframe can be subsetted according to group membership, where a 'group' is a unique combination of a set of column values.
For example, I would like to subset the following data frame according to unique combination of the first two columns (Loc1 and Loc2).
Loc1 <- c("A","A","A","A","B","B","B")
Loc2 <- c("a","a","b","b","a","a","b")
Dat1 <- c(1,1,1,1,1,1,1)
Dat2 <- c(1,2,1,2,1,2,2)
Dat3 <- c(2,2,4,4,6,5,3)
DF=data.frame(Loc1,Loc2,Dat1,Dat2,Dat3)
Loc1 Loc2 Dat1 Dat2 Dat3
1 A a 1 1 2
2 A a 1 2 2
3 A b 1 1 4
4 A b 1 2 4
5 B a 1 1 6
6 B a 1 2 5
7 B b 1 2 3
I want to return (i) the number of groups (i.e. 4), (ii) the number in each group (i.e. c(2,2,2,1), and (iii) to relabel the rows so that I can further analyse the data frame according to group membership (e.g. for ANOVA and MANOVA) (i.e.
Group<-as.factor(c(1,1,2,2,3,3,4))
Data <- cbind(Group,DF[,-1:-2])
Group Dat1 Dat2 Dat3
1 1 1 1 2
2 1 1 2 2
3 2 1 1 4
4 2 1 2 4
5 3 1 1 6
6 3 1 2 5
7 4 1 2 3
).
So far all I have managed is to get the number of groups, and I'm suspicious that there's a better way to do even this:
nrow(unique(DF[,1:2]))
I was hoping to avoid for-loops as I am concerned about the function being slow.
I have tried converting to a data matrix so that I could concatenate the row values but I couldn't get that to work either.
Many thanks
You could try:
Create Group column by using unique level combination of Loc1 and Loc2.
indx <- paste(DF[,1], DF[,2])
DF$Group <- as.numeric(factor(indx, unique(indx))) #query No (iii)
DF1 <- DF[-(1:2)][,c(4,1:3)]
# Group Dat1 Dat2 Dat3
#1 1 1 1 2
#2 1 1 2 2
#3 2 1 1 4
#4 2 1 2 4
#5 3 1 1 6
#6 3 1 2 5
#7 4 1 2 3
table(DF$Group) #(No. ii)
#1 2 3 4
#2 2 2 1
length(unique(DF$Group)) #(i)
#[1] 4
Then, if you need to subset the datasets by group, you could split the dataset using the Group to create a list of 4 list elements
split(DF1, DF1$Group)
Update
If you have multiple columns, you could still try:
ColstoGroup <- 1:2
indx <- apply(DF[,ColstoGroup], 1, paste, collapse="")
as.numeric(factor(indx, unique(indx)))
#[1] 1 1 2 2 3 3 4
You could create a function;
fun1 <- function(dat, GroupCols){
FactGroup <- dat[, GroupCols]
if(length(GroupCols)==1){
dat$Group <- as.numeric(factor(FactGroup, levels=unique(FactGroup)))
}
else {
indx <- apply(FactGroup, 1, paste, collapse="")
dat$Group <- as.numeric(factor(indx, unique(indx)))
}
dat
}
fun1(DF, "Loc1")
fun1(DF, c("Loc1", "Loc2"))
This gets all three of your queries.
Begin with a table of the first two columns and then work with that data.
> (tab <- table(DF$Loc1, DF$Loc2))
#
# a b
# A 2 2
# B 2 1
#
> (ct <- c(tab)) ## (ii)
# [1] 2 2 2 1
> length(unlist(dimnames(tab))) ## (i)
# [1] 4
> cbind(Group = rep(seq_along(ct), ct), DF[-c(1,2)]) ## (iii)
# Group Dat1 Dat2 Dat3
# 1 1 1 1 2
# 2 1 1 2 2
# 3 2 1 1 4
# 4 2 1 2 4
# 5 3 1 1 6
# 6 3 1 2 5
# 7 4 1 2 3
Borrowing a bit from this answer and using some dplyr idioms:
library(dplyr)
Loc1 <- c("A","A","A","A","B","B","B")
Loc2 <- c("a","a","b","b","a","a","b")
Dat1 <- c(1,1,1,1,1,1,1)
Dat2 <- c(1,2,1,2,1,2,2)
Dat3 <- c(2,2,4,4,6,5,3)
DF <- data.frame(Loc1, Loc2, Dat1, Dat2, Dat3)
emitID <- local({
idCounter <- -1L
function(){
idCounter <<- idCounter + 1L
}
})
DF %>% group_by(Loc1, Loc2) %>% mutate(Group=emitID())
## Loc1 Loc2 Dat1 Dat2 Dat3 Group
## 1 A a 1 1 2 0
## 2 A a 1 2 2 0
## 3 A b 1 1 4 1
## 4 A b 1 2 4 1
## 5 B a 1 1 6 2
## 6 B a 1 2 5 2
## 7 B b 1 2 3 3