Conditional Subsetting based on column numbers - r

I need to subset data where columns don't match. For example, if I have an identifier in the first column X, such as 1, then all of the corresponding values in column Y should match:
X <- rep(1:4, times=2, each=2)
Y <- rep(c("Dave","Sam","Sam","Sam"))
Z <- as.data.frame(cbind(X,Y))
head(Z)
So in this example I would like to subset the data where X = 1 and X = 3, since column Y doesn't fully agree within those groups (and not, for example, the X = 2 group, where it does). It would be great to have a function that performs this kind of subsetting on a larger data frame.
Thanks,

With dplyr:
df <- data.frame(x = rep(1:4, times = 2, each = 2),
                 y = rep(c("Dave", "Sam", "Sam", "Sam")))
library(dplyr)
df %>%
  group_by(x) %>%
  filter(any(y != lag(y), na.rm = TRUE))
#> Source: local data frame [8 x 2]
#> Groups: x [2]
#>
#> x y
#> <int> <fctr>
#> 1 1 Dave
#> 2 1 Sam
#> 3 3 Dave
#> 4 3 Sam
#> 5 1 Dave
#> 6 1 Sam
#> 7 3 Dave
#> 8 3 Sam
I tested some cases, but I'm not sure how well this holds up against edge cases.
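A formulation that may hold up better, assuming "doesn't fully agree" simply means more than one distinct value of y per group, is to count distinct values with n_distinct():
library(dplyr)
# keep only the groups whose y values are not all identical
df %>%
  group_by(x) %>%
  filter(n_distinct(y) > 1) %>%
  ungroup()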

This is the way I would do it, though there may be a more elegant way. Is this what you need?
X <- rep(1:4, times=2, each=2)
Y <- rep(c("Dave","Sam","Sam","Sam"))
Z <- as.data.frame(cbind(X,Y))
head(Z)
# First create a concatenated column
Z$XY <- paste(Z$X, Z$Y)
# Eliminate all duplicate rows
Z_unique <- unique(Z)
# Find the number of occurrences of each X value
n_occur <- data.frame(table(Z_unique$X))
# Pull only those that occur more than once
n_occur[n_occur$Freq > 1,]
# Subset the output to only those values
output <- Z[Z$X %in% n_occur$Var1[n_occur$Freq > 1],]
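For comparison, the same selection can be written in a couple of lines with tapply() (a sketch of the same idea; keep and output are my own names):
# X values whose Y values are not all identical
keep <- names(which(tapply(Z$Y, Z$X, function(v) length(unique(v)) > 1)))
output <- Z[Z$X %in% keep, ]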

We can use data.table
library(data.table)
setDT(df)[, .SD[any(y != shift(y), na.rm = TRUE)], x]
# x y
#1: 1 Dave
#2: 1 Sam
#3: 1 Dave
#4: 1 Sam
#5: 3 Dave
#6: 3 Sam
#7: 3 Dave
#8: 3 Sam
data
df <- data.frame(x = rep(1:4, times = 2, each = 2),
                 y = rep(c("Dave", "Sam", "Sam", "Sam")))
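The distinct-count idea also translates directly to data.table via uniqueN(), which sidesteps the NA that shift() produces on each group's first row:
library(data.table)
# keep groups with more than one distinct y value
setDT(df)[, .SD[uniqueN(y) > 1], by = x]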

Extract first Non NA value over multiple columns

I'm still learning R and was wondering if there is an elegant way of manipulating the below df to achieve df2.
I'm not sure if a loop is what's needed here, but basically, for each pair of columns, I want to extract the first non-NA "X_No" value if the "X_No" value in the first row is NA. This is perhaps best described through an example going from df to the desired df2.
A_ID <- c('A','B','I','N')
A_No <- c(11,NA,15,NA)
B_ID <- c('B','C','D','J')
B_No <- c(NA,NA,12,NA)
C_ID <- c('E','F','G','P')
C_No <- c(NA,13,14,20)
D_ID <- c('J','K','L','M')
D_No <- c(NA,NA,NA,40)
E_ID <- c('W','X','Y','Z')
E_No <- c(50,32,48,40)
df <- data.frame(A_ID,A_No,B_ID,B_No,C_ID,C_No,D_ID,D_No,E_ID,E_No)
ID <- c('A','D','F','M','W')
No <- c(11,12,13,40,50)
df2 <- data.frame(ID,No)
I'm hoping for an elegant solution, as there are over 1,000 columns similar to those in the example provided.
I've looked all over the web for a similar example, but to no avail.
Your help is very much appreciated.
Thank you
I don't know if I'd call it "elegant", but here is a potential solution:
library(tidyverse)
A_ID <- c('A','B','I','N')
A_No <- c(11,NA,15,NA)
B_ID <- c('B','C','D','J')
B_No <- c(NA,NA,12,NA)
C_ID <- c('E','F','G','P')
C_No <- c(NA,13,14,20)
D_ID <- c('J','K','L','M')
D_No <- c(NA,NA,NA,40)
E_ID <- c('W','X','Y','Z')
E_No <- c(50,32,48,40)
df <- data.frame(A_ID,A_No,B_ID,B_No,C_ID,C_No,D_ID,D_No,E_ID,E_No)
ID <- c('A','D','F','M','W')
No <- c(11,12,13,40,50)
df2 <- data.frame(ID,No)
output <- df %>%
  pivot_longer(everything(),
               names_sep = "_",
               names_to = c("Col", ".value")) %>%
  drop_na() %>%
  group_by(Col) %>%
  slice_head(n = 1) %>%
  ungroup() %>%
  select(-Col)
df2
#> ID No
#> 1 A 11
#> 2 D 12
#> 3 F 13
#> 4 M 40
#> 5 W 50
output
#> # A tibble: 5 × 2
#> ID No
#> <chr> <dbl>
#> 1 A 11
#> 2 D 12
#> 3 F 13
#> 4 M 40
#> 5 W 50
all_equal(df2, output)
#> [1] TRUE
Created on 2023-02-08 with reprex v2.0.2
Using base R with max.col (assuming the columns are alternating with ID, No)
# transpose the No columns so each row is one letter group,
# then locate the first non-NA position per group
ind <- max.col(!is.na(t(df[c(FALSE, TRUE)])), "first")
# build a (row, column) index matrix from those positions
m1 <- cbind(seq_along(ind), ind)
# use the same index to pull the matching ID and No values
data.frame(ID = t(df[c(TRUE, FALSE)])[m1], No = t(df[c(FALSE, TRUE)])[m1])
ID No
1 A 11
2 D 12
3 F 13
4 M 40
5 W 50
Here is a data.table solution that should scale well to a (very) large dataset.
Functionally:
1. Split the data.frame into a list of column chunks based on their names, so all columns starting with A_ go to the first element, all columns starting with B_ to the second, and so on.
2. Put these list elements on top of each other using data.table::rbindlist, ignoring the column names (this only works if every group A_, B_, ... has the same number of columns).
3. Get the first non-NA No value for each ID.
code
library(data.table)
# split based on what comes after the underscore
L <- split.default(df, f = gsub("(.*)_.*", "\\1", names(df)))
# bind together again; the columns keep the first chunk's names (A_ID, A_No)
DT <- rbindlist(L, use.names = FALSE)
# for each ID, extract the first non-NA No value
DT[!is.na(A_No), .(No = A_No[1]), keyby = .(ID = A_ID)]
# ID No
# 1: A 11
# 2: D 12
# 3: F 13
# 4: G 14
# 5: I 15
# 6: M 40
# 7: P 20
# 8: W 50
# 9: X 32
#10: Y 48
#11: Z 40

Initialise a dataframe where a column references another column

I wonder if there is a way to do:
df <- data.frame(x = 1:3)
df$y = df$x + 5
yielding:
x y
1 1 6
2 2 7
3 3 8
in one line of code where the y column refers to the x column? For example:
data.frame(x = 1:3, y = self$x + 5) # doesn't work
(I won't accept answers that ignore the x column, for example, data.frame(x = 1:3, y = 6:8) :-))
This is possible using tibble() from the tibble library, which works because tibble() evaluates its arguments sequentially, so later columns can refer to earlier ones. Credit to @DaveArmstrong from the comments.
library(tibble)
tibble(x = 1:3, y = x + 5)
# A tibble: 3 × 2
x y
<int> <dbl>
1 1 6
2 2 7
3 3 8
Here's a base R method that does not need an external package (e.g. tibble).
We can use outer to add 5 to each element of x, then cbind the result with the x column.
setNames(data.frame(cbind(1:3, outer(1:3, 5, `+`))), c("x", "y"))
# or to expand your code
setNames(cbind(data.frame(x = 1:3), outer(1:3, 5, `+`)), c("x", "y"))
x y
1 1 6
2 2 7
3 3 8
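As a footnote, base R's transform() also manages this in one line, since its arguments are evaluated with the columns of the incoming data frame in scope:
transform(data.frame(x = 1:3), y = x + 5)
#   x y
# 1 1 6
# 2 2 7
# 3 3 8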

Get random sample from subset of other dataframe

I have a large data frame with hundreds of thousands of rows, and I want to add a column whose value is a random sample from a subset of another data frame, based on the columns common to both data frames. It might be easier to explain with examples...
largeDF <- data.frame(colA = c('a', 'b', 'b', 'a', 'a', 'b'),
                      colB = c('x', 'y', 'y', 'x', 'y', 'y'),
                      colC = 1:6)
sampleDF <- data.frame(colA = c('a','a','a','a','b','b','b','b','b','b'),
                       colB = c('x','x','y','y','x','y','y','y','y','y'),
                       sample = 1:10)
I then want to add a new column sample to largeDF, which is a random sample of the sample column in sampleDF for the appropriate subset of colA and colB.
For example, for the first row the values are a and x, so the value will be a random sample of 1 or 2, for the next row (b and y) it will be a random sample of 6, 7, 8, 9 or 10.
So we could end up with something like:
colA colB colC sample
1 a x 1 2
2 b y 2 9
3 b y 3 7
4 a x 4 2
5 a y 5 4
6 b y 6 8
Any help would be appreciated!
Using dplyr... (This throws a few warnings, but appears to work anyway.)
library(dplyr)
largeDF <- largeDF %>% group_by(colA, colB) %>%
  mutate(sample = sample(sampleDF$sample[sampleDF$colA == colA & sampleDF$colB == colB],
                         size = n(), replace = TRUE))
largeDF
colA colB colC sample
<fctr> <fctr> <int> <int>
1 a x 1 2
2 b y 2 6
3 b y 3 9
4 a x 4 1
5 a y 5 4
6 b y 6 9
You could do something like this:
largeDF$sample <- apply(largeDF, 1, function(a)
  with(sampleDF, sample(sampleDF[colA == a[1] & colB == a[2], ]$sample, 1)))
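One caveat: if a colA/colB combination matches exactly one row of sampleDF, sample(x, 1) on a single number n draws from 1:n instead of returning n. The resample() helper from the examples in ?sample avoids that surprise:
resample <- function(x, ...) x[sample.int(length(x), ...)]
largeDF$sample <- apply(largeDF, 1, function(a)
  with(sampleDF, resample(sampleDF[colA == a[1] & colB == a[2], ]$sample, 1)))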
I do not quite understand the question, but it seems that you are just adding a new column to the large data frame that is a sampled version of the "sample" column from a subsample...
See if the following code gives you an idea of the functionality you need:
cbind.data.frame(largeDF, sample = sample(sampleDF$sample, nrow(largeDF)))
# colA colB colC sample
#1 a x 1 9
#2 b y 2 10
#3 b y 3 1
#4 a x 4 3
#5 a y 5 6
#6 b y 6 7
I think this is one possible solution for you...
library(dplyr)
largeDF_sample <- sapply(1:nrow(largeDF), function(x) {
  sampleDF_part <- filter(sampleDF, colA == largeDF$colA[x] & colB == largeDF$colB[x])
  return(sample(sampleDF_part$sample)[1])
})
largeDF$sample <- largeDF_sample
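Since largeDF has hundreds of thousands of rows, subsetting sampleDF once per row can get slow. A sketch of a lookup-based variant (the pools and key helpers are my own, not from the answers above):
# one pool of candidate values per colA/colB combination
pools <- split(sampleDF$sample, paste(sampleDF$colA, sampleDF$colB))
key <- paste(largeDF$colA, largeDF$colB)
largeDF$sample <- sapply(seq_along(key), function(i) {
  pool <- pools[[key[i]]]
  pool[sample.int(length(pool), 1)]  # safe even when the pool has one element
})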

Bind data frames on longer identifiers R

I've got two data frames in which the unique identifiers common to both frames differ in their number of observations. I would like to create a data frame from both in which the observations for each common identifier are taken from whichever frame has more of them. For example:
f1 <- data.frame(x = c("a", "a", "b", "c", "c", "c"), y = c(1,1,2,3,3,3))
f2 <- data.frame(x = c("a","b", "b", "c", "c"), y = c(4,5,5,6,6))
I would like this to generate a merge based on the longer x such that it produces:
x y
a 1
a 1
b 5
b 5
c 3
c 3
c 3
Any and all thoughts would be great.
Here's a solution using split
# stack both frames, tagging each row with its source
dd <- rbind(cbind(f1, s = "f1"), cbind(f2, s = "f2"))
# for each x, flag the rows that come from the most frequent source
keep <- unsplit(lapply(split(dd$s, dd$x), FUN = function(x) {
  y <- table(x)
  x == names(y[which.max(y)])
}), dd$x)
dd <- dd[keep, ]
Normally I'd prefer to use the ave function here, but because I'm changing data types from a factor to a logical, it wasn't as appropriate, so I basically copied the idea that ave uses and used split.
dplyr solution
library(dplyr)
First we combine the data with rbind() and introduce a new variable called ref to record where each observation came from:
both <- rbind(f1, f2)
both$ref <- rep(c("f1", "f2"), c(nrow(f1), nrow(f2)))
Then count the observations: make another new variable that holds the number of observations for each ref and x combination:
both_with_counts <- both %>%
  group_by(ref, x) %>%
  mutate(counts = n())
Then filter for the largest count:
both_with_counts %>% group_by(x) %>% filter(counts == max(counts))
note: you could also select only the x and y cols with select(x,y)...
this gives:
## Source: local data frame [7 x 4]
## Groups: x
##
## x y ref counts
## 1 a 1 f1 2
## 2 a 1 f1 2
## 3 c 3 f1 3
## 4 c 3 f1 3
## 5 c 3 f1 3
## 6 b 5 f2 2
## 7 b 5 f2 2
Altogether now...
what_I_want <-
  rbind(cbind(f1, ref = "f1"), cbind(f2, ref = "f2")) %>%
  group_by(ref, x) %>%
  mutate(counts = n()) %>%
  group_by(x) %>%
  filter(counts == max(counts)) %>%
  select(x, y)
and thus:
> what_I_want
# Source: local data frame [7 x 2]
# Groups: x
#
# x y
# 1 a 1
# 2 a 1
# 3 c 3
# 4 c 3
# 5 c 3
# 6 b 5
# 7 b 5
Not an elegant answer, but it still gives the desired result. Hope this helps.
# count occurrences of each x in both frames
f1table <- data.frame(table(f1$x))
colnames(f1table) <- c("x", "freq")
f1new <- merge(f1, f1table)
f2table <- data.frame(table(f2$x))
colnames(f2table) <- c("x", "freq")
f2new <- merge(f2, f2table)
# keep, for each x, the source with the higher count
table <- rbind(f1table, f2table)
table <- table[with(table, order(x, -freq)), ]
table <- table[!duplicated(table$x), ]
# join back to the stacked data and drop the freq column
data <- rbind(f1new, f2new)
merge(data, table, by = c("x", "freq"))[, c(1, 3)]
x y
1 a 1
2 a 1
3 b 5
4 b 5
5 c 3
6 c 3
7 c 3
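For completeness, here is a base R take on the same count-and-keep-the-larger idea using ave (a sketch; the src label column is my own, and ties in the counts would keep rows from both frames):
dd <- rbind(cbind(f1, src = "f1"), cbind(f2, src = "f2"))
# size of each (x, src) group
cnt <- ave(seq_len(nrow(dd)), dd$x, dd$src, FUN = length)
# keep the rows whose source has the winning size for that x
dd[cnt == ave(cnt, dd$x, FUN = max), c("x", "y")]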

dplyr filter: Get rows with minimum of variable, but only the first if multiple minima

I want to make a grouped filter using dplyr, in a way that within each group only that row is returned which has the minimum value of variable x.
My problem is: As expected, in the case of multiple minima all rows with the minimum value are returned. But in my case, I only want the first row if multiple minima are present.
Here's an example:
df <- data.frame(
  A = c("A", "A", "A", "B", "B", "B", "C", "C", "C"),
  x = c(1, 1, 2, 2, 3, 4, 5, 5, 5),
  y = rnorm(9)
)
library(dplyr)
df.g <- group_by(df, A)
filter(df.g, x == min(x))
As expected, all minima are returned:
Source: local data frame [6 x 3]
Groups: A
A x y
1 A 1 -1.04584335
2 A 1 0.97949399
3 B 2 0.79600971
4 C 5 -0.08655151
5 C 5 0.16649962
6 C 5 -0.05948012
With ddply, I would have approached the task this way:
library(plyr)
ddply(df, .(A), function(z) {
z[z$x == min(z$x), ][1, ]
})
... which works:
A x y
1 A 1 -1.04584335
2 B 2 0.79600971
3 C 5 -0.08655151
Q: Is there a way to approach this in dplyr? (For speed reasons)
Update
With dplyr >= 0.3 you can use the slice function in combination with which.min, which would be my favorite approach for this task:
df %>% group_by(A) %>% slice(which.min(x))
#Source: local data frame [3 x 3]
#Groups: A
#
# A x y
#1 A 1 0.2979772
#2 B 2 -1.1265265
#3 C 5 -1.1952004
Original answer
For the sample data, it is also possible to apply two filters one after the other:
group_by(df, A) %>%
  filter(x == min(x)) %>%
  filter(1:n() == 1)
Just for completeness: Here's the final dplyr solution, derived from the comments of @hadley and @Arun:
library(dplyr)
df.g <- group_by(df, A)
filter(df.g, rank(x, ties.method="first")==1)
For what it's worth, here's a data.table solution, to those who may be interested:
# approach with setting keys
dt <- as.data.table(df)
setkey(dt, A,x)
dt[J(unique(A)), mult="first"]
# without using keys
dt <- as.data.table(df)
dt[dt[, .I[which.min(x)], by=A]$V1]
This can be accomplished by using row_number combined with group_by. row_number handles ties by assigning a rank not only by the value but also by the relative order within the vector. To get the first row of each group with the minimum value of x:
df.g <- group_by(df, A)
filter(df.g, row_number(x) == 1)
For more information see the dplyr vignette on window functions.
dplyr offers the slice_min() function, which does the job with the argument with_ties = FALSE:
library(dplyr)
df %>%
  group_by(A) %>%
  slice_min(x, with_ties = FALSE)
Output:
# A tibble: 3 x 3
# Groups: A [3]
A x y
<fct> <dbl> <dbl>
1 A 1 0.273
2 B 2 -0.462
3 C 5 1.08
Another way to do it:
set.seed(1)
x <- data.frame(a = rep(1:2, each = 10), b = rnorm(20))
x <- dplyr::arrange(x, a, b)
dplyr::filter(x, !duplicated(a))
Result:
a b
1 1 -0.8356286
2 2 -2.2146999
It could also easily be adapted to get the row in each group with the maximum value, as sketched below.
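For the maximum, one would flip the sort order of b before deduplicating (a sketch of that adaptation):
x <- dplyr::arrange(x, a, dplyr::desc(b))
dplyr::filter(x, !duplicated(a))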
In case you are looking to filter the minima of x and then the minima of y, an intuitive way to do it is just using filter:
> df
A x y
1 A 1 1.856368296
2 A 1 -0.298284187
3 A 2 0.800047796
4 B 2 0.107289719
5 B 3 0.641819999
6 B 4 0.650542284
7 C 5 0.422465687
8 C 5 0.009819306
9 C 5 -0.482082635
df %>% group_by(A) %>%
  filter(x == min(x), y == min(y))
# A tibble: 3 x 3
# Groups: A [3]
A x y
<chr> <dbl> <dbl>
1 A 1 -0.298
2 B 2 0.107
3 C 5 -0.482
This code filters the rows where both x and y equal their group minima.
You can also do a double filter, which reads even more naturally. Note the subtle difference, though: the double filter takes the minimum of y among the rows that already have the minimum x, while the single combined filter requires y to equal the group-wide minimum, so the two can disagree when the group's smallest y falls on a row that does not have the smallest x:
df %>% group_by(A) %>%
  filter(x == min(x)) %>%
  filter(y == min(y))
# A tibble: 3 x 3
# Groups: A [3]
A x y
<chr> <dbl> <dbl>
1 A 1 -0.298
2 B 2 0.107
3 C 5 -0.482
I like sqldf for its simplicity... (SQLite's bare-column behavior in aggregate queries means the reported y comes from a row where min(X) was found):
sqldf("select A,min(X),y from 'df.g' group by A")
Output:
A min(X) y
1 A 1 -1.4836989
2 B 2 0.3755771
3 C 5 0.9284441
For the sake of completeness, here's the base R answer:
df[with(df, ave(x, A, FUN = \(x) rank(x, ties.method = "first")) == 1), ]
# A x y
#1 A 1 0.1076158
#4 B 2 -1.3909084
#7 C 5 0.3511618
