Identify which row of a data.frame exactly matches a vector

Given this data.frame:
var1 <- c(1, 2)
var2 <- c(3, 4)
var3 <- c(5, 6)
df <- expand.grid(var1 = var1, var2 = var2, var3 = var3)
  var1 var2 var3
1    1    3    5
2    2    3    5
3    1    4    5
4    2    4    5
5    1    3    6
6    2    3    6
7    1    4    6
8    2    4    6
I would like to identify the data.frame row number matching this vector (4 is the answer in this case):
vec <- c(var1 = 2, var2 = 4, var3 = 5)
var1 var2 var3 
   2    4    5 
I can't seem to sort out a simple subsetting method. The best I have been able to come up with is the following:
working <- apply(df, 2, match, vec)
which(apply(working, 1, anyNA) == FALSE)
This seems less straightforward than I'd expect; is there a simpler solution?

We can transpose the data frame, compare it with vec, and select the row where all of the values match.
which(colSums(t(df) == vec) == ncol(df))
#[1] 4
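An equivalent base R sketch that skips the transpose, comparing each column to the matching element of vec and AND-ing the results row-wise (this assumes vec's names match df's column names):
# Compare column by column, then combine the logical vectors with &
which(Reduce(`&`, Map(`==`, df, vec[names(df)])))
#[1] 4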

For the sake of completeness, subsetting can be implemented using data.table's join:
library(data.table)
setDT(df)[as.list(vec), on = names(vec), which = TRUE]
[1] 4
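Note that setDT() converts df to a data.table in place; if you would rather keep df as a plain data.frame, a non-destructive variant of the same join:
library(data.table)
as.data.table(df)[as.list(vec), on = names(vec), which = TRUE]
[1] 4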

This can be solved using the prodlim library:
> library(prodlim)
> row.match(vec, df)
[1] 4

Here is a dplyr option:
library(dplyr)
library(magrittr)
df %>%
  mutate(new = paste0(var1, var2, var3), num = row_number()) %>%
  filter(new == "245") %>%
  select(num) %>%
  as.integer()
[1] 4
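Note that the paste0() key is fragile with multi-digit values: for example, c(2, 45, 5) and c(24, 5, 5) both paste to "2455". A sketch of a more robust dplyr variant that compares the columns directly:
library(dplyr)
df %>%
  mutate(num = row_number()) %>%
  filter(var1 == vec[["var1"]], var2 == vec[["var2"]], var3 == vec[["var3"]]) %>%
  pull(num)
[1] 4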

How do I add a column to a data frame consisting of minimum values from other columns?

How do I add a column to a data frame consisting of the minimum values from other columns? So in this case, to create a third column that will have the values 1, 2 and 2?
df = data.frame(A = 1:3, B = 4:2)
You can use the apply() function to do this. See below.
df$C <- apply(df, 1, min)
The second argument (MARGIN) selects the dimension over which min is applied; here 1 applies min across the columns of each row separately.
You can choose specific columns from the dataframe, as follows:
df$newCol <- apply(df[c('A','B')], 1, min)
You can call the parallel minimum function with do.call to apply it on all your columns:
df$C <- do.call(pmin, df)
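One caveat: do.call(pmin, df) takes the minimum across every column of df, so if df has picked up extra columns it must be restricted first, e.g. (a sketch):
# Row-wise minimum over the columns of interest only
df$C <- do.call(pmin, df[c("A", "B")])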
A dplyr rowwise() alternative:
df %>%
  rowwise() %>%
  mutate(C = min(A, B))
# A tibble: 3 × 3
# Rowwise: 
      A     B     C
  <int> <int> <int>
1     1     4     1
2     2     3     2
3     3     2     2
Using longer input, including a row where the two columns are equal:
df = data.frame(A = 1:10, B = 11:2)
df %>%
  rowwise() %>%
  mutate(C = min(A, B))
# A tibble: 10 × 3
# Rowwise: 
       A     B     C
   <int> <int> <int>
 1     1    11     1
 2     2    10     2
 3     3     9     3
 4     4     8     4
 5     5     7     5
 6     6     6     6
 7     7     5     5
 8     8     4     4
 9     9     3     3
10    10     2     2
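Note that rowwise() evaluates min() once per row, which can be slow on large data; the vectorized pmin() from the earlier answer gives the same result inside mutate() (a sketch):
library(dplyr)
df %>% mutate(C = pmin(A, B))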
You can simply do:
df$C <- apply(X = df, MARGIN = 1, FUN = min)
Or:
df[, "C"] <- apply(X = df, MARGIN = 1, FUN = min)
or:
df["C"] <- apply(X = df, MARGIN = 1, FUN = min)
Instead of apply, you could also use data.frame(t(df)), where t transposes df: sapply traverses a data frame column-wise, applying the given function, so the rows must first be turned into columns. Since t always outputs a matrix, you need to wrap the result in data.frame() again.
df$C <- sapply(data.frame(t(df)), min)
Or one could use the fact that ifelse is vectorized:
df$C <- with(df, ifelse(A<B,A,B))
Or:
df$C <- ifelse(df$A < df$B, df$A, df$B)
matrixStats
# install.packages("matrixStats")
matrixStats::rowMins(as.matrix(df))
According to this SO answer it is the fastest option; apply-type functions loop over rows in R and tend to be comparatively slow.
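If speed matters for your data, it is easy to check directly. A rough benchmark sketch, assuming the microbenchmark package is installed:
library(microbenchmark)
df <- data.frame(A = 1:3, B = 4:2)  # the question's data
m <- as.matrix(df)
microbenchmark(
  apply   = apply(df, 1, min),
  pmin    = do.call(pmin, df),
  rowMins = matrixStats::rowMins(m)
)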
You can use transform() to add the min column as the output of pmin(A, B) and access the elements of df without indexing:
df <- transform(df, min = pmin(A, B))
Or, in data.table:
library(data.table)
DT = data.table(a = 1:3, b = 4:2)
DT[, min := pmin(a, b)]

Group data by factor level, then transform to a data frame with column names being the levels?

Here is my problem, which I can't solve:
Data:
df <- data.frame(f1 = c("a", "a", "b", "b", "c", "c", "c"),
                 v1 = c(10, 11, 4, 5, 0, 1, 2))
In this data frame, f1 is a factor:
f1 v1
 a 10
 a 11
 b  4
 b  5
 c  0
 c  1
 c  2
# What I want: fetch the levels that have exactly 2 elements,
# then reshape them into a data frame with the levels as column names:
 a b
10 4
11 5
Thanks in advance!
I might be missing something simple here, but the approach below using dplyr works.
library(dplyr)
library(tidyr)  # for spread()

nlevels <- 2
df1 <- df %>%
  add_count(f1) %>%
  filter(n == nlevels) %>%
  select(-n) %>%
  mutate(rn = row_number()) %>%
  spread(f1, v1) %>%
  select(-rn)
This gives
#      a     b
#  <int> <int>
#1    10    NA
#2    11    NA
#3    NA     4
#4    NA     5
Now, if you want to remove the NAs, we can do:
do.call("cbind.data.frame", lapply(df1, function(x) x[!is.na(x)]))
# a b
#1 10 4
#2 11 5
As we have filtered the data frame down to levels with exactly nlevels observations, each column in the final data frame has the same number of rows.
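With current tidyr, spread() is superseded by pivot_wider(), and numbering the rows within each group avoids the NA padding (and hence the second step) entirely. A sketch assuming tidyr >= 1.0:
library(dplyr)
library(tidyr)
df %>%
  add_count(f1) %>%
  filter(n == 2) %>%
  select(-n) %>%
  group_by(f1) %>%
  mutate(rn = row_number()) %>%
  ungroup() %>%
  pivot_wider(names_from = f1, values_from = v1) %>%
  select(-rn)
#      a     b
#1    10     4
#2    11     5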
split might be useful here to split df$v1 into parts corresponding to df$f1. Since you are always extracting equal-length chunks, the result can then simply be combined back into a data.frame:
spl <- split(df$v1, df$f1)
data.frame(spl[lengths(spl)==2])
# a b
#1 10 4
#2 11 5
Or do it all in one call by combining this with Filter:
data.frame(Filter(function(x) length(x)==2, split(df$v1, df$f1)))
# a b
#1 10 4
#2 11 5
Here is a solution using unstack:
unstack(
  droplevels(df[ave(df$v1, df$f1, FUN = function(x) length(x) == 2) == 1, ]),
  v1 ~ f1)
# a b
# 1 10 4
# 2 11 5
A variant, similar to @thelatemail's solution:
data.frame(Filter(function(x) length(x) == 2, unstack(df,v1 ~ f1)))
My tidyverse solution would be:
library(tidyverse)
df %>%
  group_by(f1) %>%
  filter(n() == 2) %>%
  mutate(i = row_number()) %>%
  spread(f1, v1) %>%
  select(-i)
# # A tibble: 2 x 2
#       a     b
# * <dbl> <dbl>
# 1    10     4
# 2    11     5
Or, mixing approaches:
as_tibble(keep(unstack(df,v1 ~ f1), ~length(.x) == 2))
Using all base functions (but you should use tidyverse):
x <- df
# Add a count of instances per level
x$len <- ave(x$v1, x$f1, FUN = length)
# Filter to levels with 2 instances, drop the count
x <- x[x$len == 2, c('f1', 'v1')]
# Hacky pivot
result <- data.frame(
  lapply(unique(x$f1), FUN = function(y) x$v1[x$f1 == y])
)
colnames(result) <- unique(x$f1)
> result
   a b
1 10 4
2 11 5
I'd code it like this; maybe it helps:
library(reshape2)
library(dplyr)
# Note: relative to the question, the column roles are swapped here
# (v1 holds the letters, f1 the values)
aa <- data.frame(v1 = c('a','a','b','b','c','c','c'), f1 = c(10, 11, 4, 5, 0, 1, 2))
cc <- aa %>% group_by(v1) %>% summarise(id = n())  # count elements per level
dd <- merge(aa, cc)                                # attach the counts
ee <- dd[dd$id == 2, ]                             # keep levels with exactly 2 elements
ee$id <- rep(c(1, 2), nrow(ee) / 2)                # reset index like (1,2,1,2)
dcast(ee, id ~ v1, value.var = 'f1')
all done!

Getting rows whose values are greater than the group mean

I have a data frame where column "A" has 6 distinct values. Column "B" has float values. By using dplyr, I can group by column "A" and find mean of column "B" of each group as follows:
mydf %>% group_by(A) %>% summarize(Mean = mean(B, na.rm=TRUE))
My ultimate aim is to find the rows in each group whose "B" values are higher than the group average. How can I achieve this (using base R or dplyr)?
A simple alternative with base R ave would be
df[df$b > ave(df$b, df$a) , ]
# a b
#4 1 4
#5 1 5
#9 2 9
#10 2 10
The default FUN for ave is mean, so there is no need to mention it explicitly. If there are NA values present in b, modify the call to:
df[df$b > ave(df$b, df$a, FUN = function(x) mean(x, na.rm = TRUE)), ]
Another solution with subset and ave, as suggested by @Onyambu:
subset(df,b>ave(b,a))
# a b
#4 1 4
#5 1 5
#9 2 9
#10 2 10
data
df <- data.frame(a = rep(c(1, 2), each = 5), b = 1:10)
df
# a b
#1 1 1
#2 1 2
#3 1 3
#4 1 4
#5 1 5
#6 2 6
#7 2 7
#8 2 8
#9 2 9
#10 2 10
You can just group and then filter:
mydf %>%
  group_by(A) %>%
  filter(B > mean(B, na.rm = TRUE)) %>%
  ungroup()
Using base R, I would go for this. It is not as elegant as dplyr.
mean.df <- aggregate(mydf$B, by = list(A = mydf$A), FUN = mean)
names(mean.df)[2] <- "mean"
mydf <- merge(mydf, mean.df, by = "A")
# Rows whose values are higher than the group mean
new.df <- subset(mydf, B > mean, select = -mean)
I like working with data.tables, so a data.table solution would be:
library(data.table)
mydt <- data.table(mydf)
mydt[, mean := mean(B), by = A]
new.dt <- mydt[B > mean, !"mean"]
Another way to do it using base R and tapply:
mydf <- cbind.data.frame(A = sample(6, 20, replace = TRUE), B = runif(20))
mydf.ave <- tapply(mydf$B, mydf$A, mean)
newdf <- mydf[mydf$B > mydf.ave[as.character(mydf$A)], ]
(Thus the one-liner would be: mydf[mydf$B > tapply(mydf$B, mydf$A, mean)[as.character(mydf$A)], ].)
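As a quick sanity check, the ave() filter shown earlier and this tapply() lookup pick out exactly the same rows (a sketch using the mydf above):
identical(mydf[mydf$B > ave(mydf$B, mydf$A), ],
          mydf[mydf$B > tapply(mydf$B, mydf$A, mean)[as.character(mydf$A)], ])
#[1] TRUE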

convert data frame of counts to proportions by conditions in R

I would like to expand on this question: convert data frame of counts to proportions in R
I need to calculate proportions by one condition while retaining the rest of the information in the dataset.
Reproducible example:
ID <- rep(c(1,2,3), each=3)
trial <- rep("a", 9)
variable1 <- sample(1:10, 9)
variable2 <- sample(1:10, 9)
variable3 <- sample(1:10, 9)
condition <- rep(c("i","j","k"), 3)
dat <- data.frame(cbind(ID, trial, variable1, variable2, variable3, condition))
# (note: cbind() here coerces everything to character; see the corrected Data block below)
For each variable I would like to have the proportion by ID (i.e. computed separately for each of the 3 IDs).
Ideally the new variables would be stored in the same data frame, e.g. as dat$variable1_p.
I know how to do this with a series of for loops, but I would like to learn how to use the apply family, and to be able to expand the approach to more conditions if necessary.
We can use adply from the plyr package:
library(plyr)
adply(dat, 1, function(x)
  c('variable1_p' = x$variable1 / sum(dat[x$ID == dat$ID, ]$variable1)))
# ID trial variable1 variable2 variable3 condition variable1_p
# 1 1 a 3 5 4 i 0.20000000
# 2 1 a 8 9 9 j 0.53333333
# 3 1 a 4 4 8 k 0.26666667
# 4 2 a 7 10 5 i 0.50000000
# 5 2 a 6 8 10 j 0.42857143
# 6 2 a 1 1 7 k 0.07142857
# 7 3 a 10 6 3 i 0.47619048
# 8 3 a 9 7 6 j 0.42857143
# 9 3 a 2 3 2 k 0.09523810
Another option is to use dplyr, which would handle cases where there is more than one row per condition per ID:
library(dplyr)
dat %>%
  group_by(ID, condition) %>%
  mutate(sum_v1_cond = sum(variable1)) %>%
  ungroup() %>%
  group_by(ID) %>%
  mutate(variable1_p = sum_v1_cond / sum(variable1)) %>%
  select(-sum_v1_cond)
Edit - here's a full solution for variable1, variable2, and variable3:
adply(dat, 1, function(x)
  c('variable1_p' = x$variable1 / sum(dat[x$ID == dat$ID, ]$variable1),
    'variable2_p' = x$variable2 / sum(dat[x$ID == dat$ID, ]$variable2),
    'variable3_p' = x$variable3 / sum(dat[x$ID == dat$ID, ]$variable3)))
Data:
set.seed(123)
ID <- rep(c(1,2,3), each=3)
trial <- rep("a", 9)
variable1 <- sample(1:10, 9)
variable2 <- sample(1:10, 9)
variable3 <- sample(1:10, 9)
condition <- rep(c("i","j","k"), 3)
dat <- data.frame(ID, trial, variable1, variable2, variable3, condition,
                  stringsAsFactors = FALSE)
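For all three variables at once, grouped arithmetic with across() reproduces the adply result in a single mutate(); a sketch assuming dplyr >= 1.0:
library(dplyr)
dat %>%
  group_by(ID) %>%
  mutate(across(variable1:variable3, ~ .x / sum(.x), .names = "{.col}_p")) %>%
  ungroup()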

Removing rows whose values are flipped across two columns

Considering the following data frame:
df <- data.frame(var1 = 1:5, var2 = c(5,6,7,8,1))
> df
  var1 var2
1    1    5
2    2    6
3    3    7
4    4    8
5    5    1
I'd like to remove all rows whose values are flipped across the two columns. In this case, that would be row 1 and row 5, as the values 1 and 5 in row 1 are flipped to 5 and 1 in row 5. Both of these rows should be removed.
I hope it's clear what I am asking for :-)
Kind regards!
Perhaps something like this could work too:
df <- data.frame(var1 = 1:5, var2 = c(5,6,7,8,1))
df[!do.call(paste, df) %in% do.call(paste, rev(df)), ]
  var1 var2
2    2    6
3    3    7
4    4    8
I'd have to test it on a few more cases, but the general idea is to use rev to reverse the order of the columns in "df", paste the columns together, and compare the result with the pasted columns of the original "df".
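Another idiom along the same lines, assuming both columns are numeric: canonicalize each row with pmin()/pmax() so that flipped pairs get the same key, then drop every row whose key occurs more than once:
key <- paste(pmin(df$var1, df$var2), pmax(df$var1, df$var2))
df[!(duplicated(key) | duplicated(key, fromLast = TRUE)), ]
#  var1 var2
#2    2    6
#3    3    7
#4    4    8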
Here's a simple but not especially elegant way: make a reversed data frame with a flag, and then merge it on to df:
# Make a reversed dataset
fd <- data.frame(var1 = df$var2, var2 = df$var1, flag = TRUE)
# Merge it onto your original df, then drop the matched rows and the flag var
df.sub <- subset(merge(x = df, y = fd, by = c("var1", "var2"), all.x = TRUE),
                 subset = is.na(flag),
                 select = c("var1", "var2"))
Using a bit of maths: two rows are the same up to a permutation if their sum and absolute difference are equal:
df[with(df, !duplicated(data.frame(var1 + var2, abs(var1 - var2)), fromLast = TRUE)), ]
#  var1 var2
#2    2    6
#3    3    7
#4    4    8
#5    5    1
Edit: I should've read the question more carefully; to remove both duplicates, follow @Ananda's suggestion:
df.ind = with(df, data.frame(var1 + var2, abs(var1 - var2)))
df[!duplicated(df.ind) & !duplicated(df.ind, fromLast = TRUE),]
#  var1 var2
#2    2    6
#3    3    7
#4    4    8
If creating a copy doesn't cause memory issues, then this works as well:
df <- data.frame(var1 = 1:5, var2 = c(5,6,7,8,1))
df2 <- data.frame(var12 = 1:5, var22 = c(5,6,7,8,1))
df3 <- merge(df,df2, by.x = 'var2', by.y = 'var12', all.x = TRUE)
df3 <- subset(
  df3,
  is.na(var22),
  select = c('var1', 'var2')
)
Output:
> df3
  var1 var2
3    2    6
4    3    7
5    4    8
I tried merging df with itself, but that gives a warning about the column var2 being duplicated. Does anybody know a way around that?
If you can assume there are no duplicates in the data frame, here's a one-line answer (still not especially concise):
library(data.table)
df[!duplicated(rbindlist(list(df, df[, 2:1])))[nrow(df) + 1:nrow(df)], ]
##   var1 var2
## 2    2    6
## 3    3    7
## 4    4    8
rbindlist is necessary here because rbind(df, df[, 2:1]) would match by column name rather than by position, so the other option is something like rbind(df, setnames(df[, 2:1], names(df))). If you want to keep duplicates from the original, this gets even more unpleasant:
> df <- data.frame(var1 = 1:5, var2 = c(5,6,7,8,1))
> df <- rbind(df, c(2, 6))
> df[!duplicated(rbindlist(list(df, df[, 2:1])))[nrow(df) + 1:nrow(df)], ]
  var1 var2
2    2    6
3    3    7
4    4    8
> df[!duplicated(rbindlist(list(df, df[, 2:1])))[nrow(df) + 1:nrow(df)] | duplicated(df), ]
  var1 var2
2    2    6
3    3    7
4    4    8
6    2    6
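For completeness, a base-R-only sketch of the same stacking idea (assuming no duplicates in df): realign the names of the column-swapped copy so rbind() matches by position, then keep the original rows whose swapped counterparts are not duplicated:
df <- data.frame(var1 = 1:5, var2 = c(5, 6, 7, 8, 1))
swapped <- setNames(df[, 2:1], names(df))  # swap columns, keep original names
df[!duplicated(rbind(df, swapped))[nrow(df) + seq_len(nrow(df))], ]
#  var1 var2
#2    2    6
#3    3    7
#4    4    8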
