Related
I have a data frame representing equivalences between members from two sets:
print(x)
G S
1 g1 s2
2 g1 s1
3 g2 s3
4 g3 s3
5 g4 s3
Does someone know of a function or a useful data structure for grouping the objects into equivalence classes? In the example above, the result should be two equivalence classes
{g1, s1, s2}, {g2, g3, g4, s3}
An option is to use igraph to extract vertices from clusters:
library(igraph)
# Every row of `x` is an edge G -- S; the equivalence classes are the
# connected components of that graph, so direction is irrelevant.
g <- graph_from_data_frame(x, directed = FALSE)
# components() is the current name of the deprecated clusters().
m <- components(g)$membership
# Collect the vertex names of each component, sorted alphabetically.
tapply(names(m), m, sort)
output:
$`1`
[1] "g1" "s1" "s2"
$`2`
[1] "g2" "g3" "g4" "s3"
data:
x <- read.table(text="G S
g1 s2
g1 s1
g2 s3
g3 s3
g4 s3", header=TRUE, stringsAsFactors=FALSE)
You can test for equality using outer and combine them with | or. From this matrix get the unique lines and then use apply to return a list of the groups.
# tt[i, j] is TRUE when rows i and j of `x` share their G or their S value.
tt <- outer(x$G, x$G, "==") | outer(x$S, x$S, "==")
# Rows with the same link pattern describe the same group; keep one of each.
tt <- unique(tt)
# lapply() always returns a list. apply() would collapse the result into a
# matrix whenever all groups happen to contain the same number of members.
# NOTE(review): links are only followed one step (no transitive closure), so
# chains such as g1-s1, g2-s1, g2-s2 are reported as overlapping groups.
lapply(seq_len(nrow(tt)), function(k) {
  unique(unlist(x[tt[k, ], ], use.names = FALSE))
})
#[[1]]
#[1] "g1" "s2" "s1"
#
#[[2]]
#[1] "g2" "g3" "g4" "s3"
Another option which is looping over the vector instead of expanding it as outer is doing:
# Group the rows of `x` by repeatedly pulling in rows that share a column-1
# or column-2 value with the values added in the previous pass.
y <- unique(x)
# t1/t2 accumulate all column-1/column-2 values of the current group;
# tt1/tt2 hold only the values added most recently (the search frontier).
t1 <- tt1 <- y[1,1]
t2 <- tt2 <- y[1,2]
# `y` keeps the rows that have not been assigned to a group yet.
y <- y[-1,]
n <- 1
# Placeholder element; it is overwritten when the first group is stored.
res <- list(0)
repeat {
# Unassigned rows linked to the frontier through either column.
i <- y[,1] %in% tt1 | y[,2] %in% tt2
tt <- y[i,]
y <- y[!i,]
# New frontier: values of the matched rows that were not in the old frontier.
# tt2 is computed against the old tt2, which the line above does not modify.
tt1 <- unique(tt[!tt[,1] %in% tt1,1])
tt2 <- unique(tt[!tt[,2] %in% tt2,2])
if(length(tt1) + length(tt2) > 0) {
# The group is still growing: record the new members and search again.
t1 <- c(t1, tt1)
t2 <- c(t2, tt2)
} else {
# Nothing new was found: the current group is complete.
res[[n]] <- unique(c(t1, t2))
if(nrow(y) == 0) break;
# Seed the next group with the first remaining row.
n <- n + 1
t1 <- tt1 <- y[1,1]
t2 <- tt2 <- y[1,2]
y <- y[-1,]
}
}
res
#[[1]]
#[1] "g1" "s2" "s1"
#
#[[2]]
#[1] "g2" "g3" "g4" "s3"
Data:
x <- structure(list(G = c("g1", "g1", "g2", "g3", "g4"), S = c("s2",
"s1", "s3", "s3", "s3")), class = "data.frame", row.names = c(NA, -5L))
You can apply the following code for grouping
# function to categorize incoming `v` within existing `lst`
grp <- function(lst, v) {
  # Merge the vector `v` into the list of groups `lst`.
  #
  # lst: list of vectors, each one an existing group (may be NULL or empty).
  # v:   vector of members that belong together.
  # Returns the updated list of groups with duplicated groups removed.
  if (length(lst) == 0) {
    return(c(lst, list(v)))
  }
  # Which existing groups share at least one member with `v`?
  overlaps <- vapply(lst, function(members) any(v %in% members), logical(1))
  hits <- which(overlaps)
  if (length(hits) > 0) {
    # Every touched group is replaced by the union of all of them plus `v`;
    # the resulting identical copies are collapsed by unique() below.
    merged <- union(unlist(lst[hits]), v)
    lst[hits] <- list(merged)
  } else {
    lst <- c(lst, list(v))
  }
  unique(lst)
}
# generate grouping results
df <- unique(df)
# One character vector per row of `df`. seq_len() keeps this correct for a
# zero-row `df`, where seq(nrow(df)) would yield c(1, 0).
pairs <- lapply(seq_len(nrow(df)), function(i) as.character(unlist(df[i, ])))
# Fold the rows into groups, starting from an empty (NULL) list of groups.
res <- Reduce(grp, c(list(NULL), pairs), accumulate = FALSE)
Application Examples
given input data df <- data.frame(G = c("g1","g1","g2","g3","g4"), S = c("s2","s1","s3","s3","s3"))
then
> df
G S
1 g1 s2
2 g1 s1
3 g2 s3
4 g3 s3
5 g4 s3
> res
[[1]]
[1] "g1" "s2" "s1"
[[2]]
[1] "g2" "s3" "g3" "g4"
given input data df <- data.frame(G = sprintf("g%i", c(2,3,4,2,2)), S = sprintf("s%i", c(3,3,2,4,3)))
then
> df
G S
1 g2 s3
2 g3 s3
3 g4 s2
4 g2 s4
> res
[[1]]
[1] "g2" "s3" "g3" "s4"
[[2]]
[1] "g4" "s2"
UPDATE: the above solution becomes rather slow when dealing with a huge dataset. An improved solution is given below:
G2S <- function(df, g) {
  # S values of all rows of `df` whose G value is contained in `g`.
  in_g <- df$G %in% g
  df$S[in_g]
}
S2G <- function(df, s) {
  # G values of all rows of `df` whose S value is contained in `s`.
  in_s <- df$S %in% s
  df$G[in_s]
}
grpFun <- function(df, g) {
  # Starting from the G value(s) `g`, alternate between G2S() and S2G()
  # until the number of G entries returned stops changing.
  # Returns list(G = ..., S = ...) with one entry per matching row of `df`,
  # so both vectors may contain duplicates.
  repeat {
    s <- G2S(df, g)
    gt <- S2G(df, s)
    if (length(gt) == length(g)) {
      return(list(G = gt, S = s))
    }
    g <- gt
  }
}
# Peel off one group per pass until every G value has been assigned.
res <- NULL
Gpool <- x$G
while (length(Gpool) > 0) {
  # Group reachable from the first G value that is still unassigned.
  grp <- grpFun(x, Gpool[1])
  Gpool <- setdiff(Gpool, grp$G)
  res <- c(res, list(union(unique(grp$G), unique(grp$S))))
}
To compare the runtime of the three answers by #GKi, #chinsoon12, and #ThomasisCoding, I have created random sets of different size n and measured the runtime (as "elapsed" from proc.time).
From the results, I conclude that the method relying on igraph's connected component decomposition is the fastest:
n chinsoon12 ThomasisCoding GKi
500 0.002 0.054 0.030
2500 0.010 0.203 0.416
5000 0.020 0.379 1.456
7500 0.033 0.670 3.351
10000 0.044 0.832 5.837
Edit (2019-11-19): Upon request of #GKI, here is the code I used for comparing the runtime of the three algorithms. Beware that all functions work on the global variable x, because R only supports call-by-value, which would add unwanted overhead in this runtime estimation:
library(igraph)
# solution by chinsson12: CC decomposition from igraph
method.A <- function() {
  # Connected-component decomposition of the graph whose edges are the rows
  # of the global data frame `x`.
  # Returns an array of character vectors, one per component, each sorted.
  g <- graph_from_data_frame(x, directed = FALSE)
  # components() replaces the deprecated clusters().
  m <- components(g)$membership
  tapply(names(m), m, sort)
}
# solution by ThomasisCoding
# Builds comma-separated descriptions of the mappings found in the global
# data frame `x` and returns them as one character vector.
method.B <- function() {
# find 1-to-1 mapping
# Split `x` by each of its columns and keep the sub-data-frames that occur
# in every split.
r <- Reduce(intersect,lapply(names(x), function(v) split(x,x[v])))
r1map <- unlist(Map(toString,Map(unlist,r)))
# remove one-to-one mapping and find N-to-1 mapping
if (length(r1map) >0) {
# Drop the rows already covered by `r`, identified through their row names.
# NOTE(review): this assumes the row names of `x` are its row numbers - confirm.
xx <- x[-as.numeric(rownames(Reduce(rbind,r))),]
} else {
xx <- x
}
rNmap <- c()
if (nrow(xx)> 0) {
rNmap <- sapply(names(xx),
function(v) {
# Groups of rows that share the same value in column `v`.
z <- split(xx,xx[v])
# Keep only the groups that contain more than one row.
u <- z[unlist(Map(nrow,z))>1]
# NOTE(review): only u[[1]] is used for the values; any further multi-row
# groups in `u` contribute just their names - confirm this is intended.
ifelse(length(u)==0, NA, toString(c(names(u),as.vector(u[[1]][,setdiff(names(xx),v)]))))
},USE.NAMES = F)
rNmap <- rNmap[!is.na(rNmap)]
}
# combine both 1-to-1 and n-to-1 mappings
res <- c(r1map,rNmap)
return(res)
}
# solution by GKi: with outer product
method.C <- function() {
  # Groups the rows of the global data frame `x` (columns G and S) that
  # share a G or an S value. Returns a list with one vector per group.
  # tt[i, j] is TRUE when rows i and j share their G or their S value.
  tt <- outer(x$G, x$G, "==") | outer(x$S, x$S, "==")
  tt <- unique(tt)
  # lapply() always returns a list. apply() would collapse the result into
  # a matrix whenever all groups have the same number of members.
  lapply(seq_len(nrow(tt)), function(k) {
    unique(unlist(x[tt[k, ], ], use.names = FALSE))
  })
}
# runtime results
# Time the three methods on random data of increasing size n.
rt_sizes <- seq(500, 10000, by = 500)
# Collect one result row per size and bind once at the end instead of
# growing a data frame with rbind() inside the loop.
rt_runs <- vector("list", length(rt_sizes))
for (k in seq_along(rt_sizes)) {
  n <- rt_sizes[k]
  # this won't work because of ambiguous node ids (see [answer by GKi][6]):
  # x <- data.frame(G = sample(1:n,n,replace=TRUE), S = sample(1:n,n,replace=TRUE))
  # therefore, make the node ids unique:
  x <- data.frame(G = sprintf("g%i", sample(1:n, n, replace = TRUE)),
                  S = sprintf("s%i", sample(1:n, n, replace = TRUE)))
  # The methods read the global `x`, so they are called without arguments.
  rt_runs[[k]] <- data.frame(
    n = n,
    t.A = system.time(method.A())[["elapsed"]],
    t.B = system.time(method.B())[["elapsed"]],
    t.C = system.time(method.C())[["elapsed"]]
  )
}
rt <- do.call(rbind, rt_runs)
print(rt)
# Derive the y-axis limits from all three series rather than assuming which
# method is fastest and which is slowest.
rt_ylim <- range(rt$t.A, rt$t.B, rt$t.C)
plot(rt$n, rt$t.C, xlab = "n", ylab = "run time [s]", ylim = rt_ylim, type = 'l')
lines(rt$n, rt$t.B, col = "red")
lines(rt$n, rt$t.A, col = "blue")
legend("topleft", c("GKi", "ThomasisCoding", "chinsoon12"),
       lty = c(1, 1, 1), col = c("black", "red", "blue"))
Comparison on results of the methods:
method.A()
#$`1`
#[1] "1" "2" "3" "4"
method.A2()
#$`1`
#[1] "3" "1" "4" "2"
#
#$`2`
#[1] "2" "3"
method.B()
#[[1]]
#[1] 3 1 4 2
#
#[[2]]
#[1] 2 3
method.C()
#[[1]]
#[[1]]$All
#[1] 3 1 4 2
#
#[[1]]$G
#[1] 3 1
#
#[[1]]$S
#[1] 4 2 1
#
#
#[[2]]
#[[2]]$All
#[1] 2 3
#
#[[2]]$G
#[1] 2
#
#[[2]]$S
#[1] 3
Methods:
library(igraph)
method.A <- function() {
  # Connected components of the graph whose edges are the rows of the
  # global data frame `x`. Vertex ids are taken as they are, so an id that
  # occurs in both columns denotes a single vertex.
  g <- graph_from_data_frame(x, directed = FALSE)
  # components() replaces the deprecated clusters().
  m <- components(g)$membership
  tapply(names(m), m, sort)
}
method.A2 <- function() {
  # Like method.A(), but every id is prefixed with its column name first so
  # that equal ids in different columns of the global `x` stay distinct.
  # Prefix column by column. apply() on a data frame goes through
  # as.matrix(), which can pad or reformat values when the column types
  # are mixed.
  tagged <- as.data.frame(
    Map(function(nm, col) paste0(nm, col), names(x), x),
    stringsAsFactors = FALSE
  )
  g <- graph_from_data_frame(tagged, directed = FALSE)
  # components() replaces the deprecated clusters().
  m <- components(g)$membership
  # Strip the prefix again; this assumes one-character column names.
  tapply(substring(names(m), 2), m, unique)
}
method.B <- function() {
  # Groups the G and S values of the global data frame `x` by alternately
  # following G -> S and S -> G links until a group stops growing.
  # Returns a list with one vector per group (NULL if `x` has no rows).

  # S values of the rows whose G is in `g`.
  G2S <- function(df, g) df$S[df$G %in% g]
  # G values of the rows whose S is in `s`.
  S2G <- function(df, s) df$G[df$S %in% s]
  grpFun <- function(df, g) {
    repeat {
      s <- G2S(df, g)
      gt <- S2G(df, s)
      # Both vectors hold one entry per matching row, so an unchanged
      # length means no further rows were reached.
      if (length(gt) == length(g)) {
        return(list(G = gt, S = s))
      }
      g <- gt
    }
  }
  res <- NULL
  Gpool <- x$G
  while (length(Gpool) > 0) {
    grp <- grpFun(x, Gpool[1])
    Gpool <- setdiff(Gpool, grp$G)
    res <- c(res, list(union(unique(grp$G), unique(grp$S))))
  }
  res
}
method.C <- function() {
  # Groups the rows of the global data frame `x` by repeatedly pulling in
  # rows that share a column-1 or column-2 value with the latest additions.
  # Returns a list with one element per group: list(All, G, S).
  remaining <- unique(x)
  # all_* collect every member of the current group; new_* hold only the
  # values added in the most recent pass.
  all_g <- new_g <- remaining[1, 1]
  all_s <- new_s <- remaining[1, 2]
  remaining <- remaining[-1, ]
  k <- 1
  res <- list(0)
  repeat {
    linked <- remaining[, 1] %in% new_g | remaining[, 2] %in% new_s
    found <- remaining[linked, ]
    remaining <- remaining[!linked, ]
    new_g <- unique(found[!found[, 1] %in% new_g, 1])
    new_s <- unique(found[!found[, 2] %in% new_s, 2])
    if (length(new_g) + length(new_s) > 0) {
      # The group is still growing; search again from the new values.
      all_g <- c(all_g, new_g)
      all_s <- c(all_s, new_s)
      next
    }
    # Nothing new was reached: store the finished group.
    res[[k]] <- list(All = unique(c(all_g, all_s)), G = unique(all_g), S = unique(all_s))
    if (nrow(remaining) == 0) {
      break
    }
    # Seed the next group with the first row that is still unassigned.
    k <- k + 1
    all_g <- new_g <- remaining[1, 1]
    all_s <- new_s <- remaining[1, 2]
    remaining <- remaining[-1, ]
  }
  res
}
Data:
x <- data.frame(G = c(3,1,1,2,3), S=c(4,1,2,3,2))
x
# G S
#1 3 4
#2 1 1
#3 1 2
#4 2 3
#5 3 2
UPDATE: performance comparison based on latest updates by #GKi, #chinsoon12, and #ThomasisCoding
code for comparison
library(igraph)
method.A <- function() {
  # igraph solution: the groups are the connected components of the graph
  # whose edges are the rows of the global data frame `x`.
  g <- graph_from_data_frame(x, directed = FALSE)
  # components() replaces the deprecated clusters().
  m <- components(g)$membership
  # One sorted character vector of vertex names per component.
  tapply(names(m), m, sort)
}
method.B <- function() {
  # Groups the G and S values of the global data frame `x` by alternately
  # following G -> S and S -> G links until a group stops growing.
  # Returns a list with one vector per group (NULL if `x` has no rows).

  # S values of the rows whose G is in `g`.
  G2S <- function(df, g) df$S[df$G %in% g]
  # G values of the rows whose S is in `s`.
  S2G <- function(df, s) df$G[df$S %in% s]
  grpFun <- function(df, g) {
    repeat {
      s <- G2S(df, g)
      gt <- S2G(df, s)
      # One entry per matching row: an unchanged length means the group
      # reached no further rows.
      if (length(gt) == length(g)) {
        return(list(G = gt, S = s))
      }
      g <- gt
    }
  }
  res <- NULL
  Gpool <- x$G
  while (length(Gpool) > 0) {
    grp <- grpFun(x, Gpool[1])
    Gpool <- setdiff(Gpool, grp$G)
    res <- c(res, list(union(unique(grp$G), unique(grp$S))))
  }
  res
}
method.C <- function() {
  # Groups the rows of the global data frame `x` by repeatedly pulling in
  # rows that share a column-1 or column-2 value with the latest additions.
  # Returns a list with one element per group: list(All, G, S).
  remaining <- unique(x)
  # all_* collect every member of the current group; new_* hold only the
  # values added in the most recent pass.
  all_g <- new_g <- remaining[1, 1]
  all_s <- new_s <- remaining[1, 2]
  remaining <- remaining[-1, ]
  k <- 1
  res <- list(0)
  repeat {
    linked <- remaining[, 1] %in% new_g | remaining[, 2] %in% new_s
    found <- remaining[linked, ]
    remaining <- remaining[!linked, ]
    new_g <- unique(found[!found[, 1] %in% new_g, 1])
    new_s <- unique(found[!found[, 2] %in% new_s, 2])
    if (length(new_g) + length(new_s) > 0) {
      # The group is still growing; search again from the new values.
      all_g <- c(all_g, new_g)
      all_s <- c(all_s, new_s)
      next
    }
    # Nothing new was reached: store the finished group.
    res[[k]] <- list(All = unique(c(all_g, all_s)), G = unique(all_g), S = unique(all_s))
    if (nrow(remaining) == 0) {
      break
    }
    # Seed the next group with the first row that is still unassigned.
    k <- k + 1
    all_g <- new_g <- remaining[1, 1]
    all_s <- new_s <- remaining[1, 2]
    remaining <- remaining[-1, ]
  }
  res
}
# runtime results
# Time the three methods on random data of increasing size n.
rt_sizes <- seq(500, 10000, by = 500)
# Collect one result row per size and bind once at the end instead of
# growing a data frame with rbind() inside the loop.
rt_runs <- vector("list", length(rt_sizes))
for (k in seq_along(rt_sizes)) {
  n <- rt_sizes[k]
  # this won't work because of ambiguous node ids (see [answer by GKi][6]):
  # x <- data.frame(G = sample(1:n,n,replace=TRUE), S = sample(1:n,n,replace=TRUE))
  # therefore, make the node ids unique:
  x <- data.frame(G = sprintf("g%i", sample(1:n, n, replace = TRUE)),
                  S = sprintf("s%i", sample(1:n, n, replace = TRUE)))
  # The methods read the global `x`, so they are called without arguments.
  rt_runs[[k]] <- data.frame(
    n = n,
    t.A = system.time(method.A())[["elapsed"]],
    t.B = system.time(method.B())[["elapsed"]],
    t.C = system.time(method.C())[["elapsed"]]
  )
}
rt <- do.call(rbind, rt_runs)
print(rt)
# Derive the y-axis limits from all three series rather than assuming which
# method is fastest and which is slowest.
rt_ylim <- range(rt$t.A, rt$t.B, rt$t.C)
plot(rt$n, rt$t.C, xlab = "n", ylab = "run time [s]", ylim = rt_ylim, type = 'l')
lines(rt$n, rt$t.B, col = "red")
lines(rt$n, rt$t.A, col = "blue")
legend("topleft", c("GKi", "ThomasisCoding", "chinsoon12"),
       lty = c(1, 1, 1), col = c("black", "red", "blue"))
runtime of three methods:
n t.A t.B t.C
1 500 0.00 0.16 0.26
2 1000 0.02 0.31 0.53
3 1500 0.02 0.51 1.11
4 2000 0.03 0.90 1.47
5 2500 0.03 1.35 2.17
6 3000 0.04 2.08 3.14
7 3500 0.04 2.66 3.97
8 4000 0.07 3.38 4.92
9 4500 0.07 4.38 6.35
10 5000 0.06 5.41 7.58
11 5500 0.08 6.79 9.55
12 6000 0.08 7.81 10.91
13 6500 0.10 9.03 12.06
14 7000 0.09 10.06 14.20
15 7500 0.11 11.76 15.65
16 8000 0.13 13.41 17.84
17 8500 0.11 14.87 20.67
18 9000 0.13 16.88 23.52
19 9500 0.14 18.38 25.57
20 10000 0.14 22.81 30.05
visualization of runtime
Additional (thanks to a comment by #GKi): When the dataset is kept as integers, the runtime of the grouping process for the non-igraph methods is largely reduced:
n t.A t.B t.C
1 500 0.00 0.09 0.13
2 1000 0.01 0.15 0.23
3 1500 0.01 0.22 0.38
4 2000 0.03 0.31 0.50
5 2500 0.05 0.45 0.76
6 3000 0.07 0.51 0.77
7 3500 0.06 0.67 0.97
8 4000 0.07 0.85 1.20
9 4500 0.07 0.90 1.39
10 5000 0.09 1.23 1.55
11 5500 0.09 1.30 1.78
12 6000 0.09 1.51 1.94
13 6500 0.11 1.77 2.20
14 7000 0.13 2.18 2.55
15 7500 0.12 2.37 2.79
16 8000 0.13 2.56 2.96
17 8500 0.14 2.76 3.39
18 9000 0.15 3.03 3.54
19 9500 0.15 3.54 4.23
20 10000 0.16 3.76 4.32
I would like to speed up my calculations and obtain results without using loop in function m. Reproducible example:
N <- 2500
n <- 500
r <- replicate(1000, sample(N, n))
m <- function(r, N) {
  # Pairwise co-occurrence counts.
  #
  # r: matrix whose columns each hold a set of indices in 1..N.
  # N: number of distinct elements.
  # Returns an N x N matrix whose entry [a, b] is the number of columns of
  # `r` that contain both a and b.
  ic <- matrix(0, nrow = N, ncol = N)
  # seq_len() yields an empty loop for a zero-column `r`; 1:ncol(r) would
  # iterate over c(1, 0) and fail on r[, 1].
  for (i in seq_len(ncol(r))) {
    p <- r[, i]
    ic[p, p] <- ic[p, p] + 1
  }
  ic
}
system.time(ic <- m(r, N))
# user system elapsed
# 6.25 0.51 6.76
isSymmetric(ic)
# [1] TRUE
In every iteration of the for loop we are dealing with a matrix rather than a vector, so how could this be vectorized?
#joel.wilson The purpose of this function is to calculate pairwise frequencies of elements. So afterwards we could estimate pairwise inclusion probabilities.
Thanks to #Khashaa and #alexis_laz. Benchmarks:
> require(rbenchmark)
> benchmark(m(r, N),
+ m1(r, N),
+ mvec(r, N),
+ alexis(r, N),
+ replications = 10, order = "elapsed")
test replications elapsed relative user.self sys.self user.child sys.child
4 alexis(r, N) 10 4.73 1.000 4.63 0.11 NA NA
3 mvec(r, N) 10 5.36 1.133 5.18 0.18 NA NA
2 m1(r, N) 10 5.48 1.159 5.29 0.19 NA NA
1 m(r, N) 10 61.41 12.983 60.43 0.90 NA NA
This should be significantly faster as it avoids operations on double indexing
m1 <- function(r, N) {
  # Pairwise co-occurrence counts via an incidence matrix.
  #
  # r: matrix whose columns each hold a set of indices in 1..N.
  # N: number of distinct elements.
  # Returns the N x N matrix ic %*% t(ic), where ic[a, i] is 1 when column
  # i of `r` contains a.
  ic <- matrix(0, nrow = N, ncol = ncol(r))
  # seq_len() keeps the loop empty for a zero-column `r`.
  for (i in seq_len(ncol(r))) {
    ic[r[, i], i] <- 1
  }
  tcrossprod(ic)
}
system.time(ic1 <- m1(r, N))
# user system elapsed
# 0.53 0.01 0.55
all.equal(ic, ic1)
# [1] TRUE
Simple "counting/adding" operations can almost always be vectorized
mvec <- function(r, N) {
  # Fully vectorised pairwise co-occurrence counts.
  #
  # r: matrix whose columns each hold a set of indices in 1..N.
  # N: number of distinct elements.
  # Returns the N x N matrix ic %*% t(ic) of the incidence matrix `ic`.
  ic <- matrix(0, nrow = N, ncol = ncol(r))
  # Column number for every entry of `r`, in the order of as.vector(r).
  # seq_len() gives an empty index for a zero-column `r`; 1:ncol(r) would
  # produce c(1, 0) and corrupt `ic`.
  col_id <- rep(seq_len(ncol(r)), each = nrow(r))
  # Two-column matrix indexing sets ic[r[k, i], i] <- 1 in a single step.
  ic[cbind(as.vector(r), col_id)] <- 1
  tcrossprod(ic)
}
Say I have a data.frame like this:
X1 X2 X3
1 A B A
2 A C B
3 B A B
4 A A C
I would like to count the occurrences of A, B, C, etc. in each column, and return the result as
A_count B_count C_count
X1 3 1 0
X2 2 1 1
X3 1 2 1
I'm sure this question has a thousand duplicates, but I can't seem to find an answer that works for me :(
By running
apply(mydata, 2, table)
I get something like
$X1
B A
1 3
$X2
A C B
2 1 1
But it's not exactly what I want and if I try to build it back into a data frame, it doesn't work because I don't get the same number of columns for every row (like $X1 above where there are no C's).
What am I missing?
Many thanks!
You can refactor to include the factor levels common to each column, then tabulate. I would also recommend using lapply() instead of apply(), as apply() is for matrices.
df <- read.table(text = "X1 X2 X3
1 A B A
2 A C B
3 B A B
4 A A C", header = TRUE)
# Collect the values that occur anywhere in the data. Going through
# as.character() works for factor and character columns alike, whereas
# levels(unlist(df)) is NULL when the columns are character (the default
# of read.table() since R 4.0).
all_levels <- sort(unique(unlist(lapply(df, as.character), use.names = FALSE)))
# Tabulate every column against the common levels so that all rows of the
# result have the same columns, then stack the tables.
counts <- do.call(
  rbind,
  lapply(df, function(x) table(factor(x, levels = all_levels)))
)
counts
# A B C
# X1 3 1 0
# X2 2 1 1
# X3 1 2 1
Suppose your data frame is x, I would simply do:
# All values of `x`, column after column.
values <- unlist(x, use.names = FALSE)
# table() only reports the values present in each column unless the data
# carry common factor levels. Character columns are the data.frame default
# since R 4.0, so convert them to a factor first.
if (!is.factor(values)) {
  values <- factor(values)
}
# Tabulate per column (the grouping index is the column number) and stack.
do.call(rbind, tapply(values,
                      rep(seq_len(ncol(x)), each = nrow(x)),
                      table))
# A B C
#1 3 1 0
#2 2 1 1
#3 1 2 1
Benchmarking
# a function to generate toy data
# `k` factor levels
# `n` row
# `p` columns
datsim <- function(n, p, k) {
  # Simulate a data frame of `n` rows and `p` factor columns named X1..Xp,
  # each drawn with replacement from the first `k` capital letters.
  pool <- LETTERS[1:k]
  columns <- lapply(seq_len(p), function(j) sample(pool, n, TRUE))
  as.data.frame(columns,
                col.names = paste0("X", 1:p),
                stringsAsFactors = TRUE)
}
# try `n = 100`, `p = 500` and `k = 3`
x <- datsim(100, 500, 3)
## DirtySockSniffer's answer
system.time(do.call(rbind, lapply(x, function(u) table(factor(u, levels=levels(unlist(x)))))))
# user system elapsed
# 21.240 0.068 21.365
## my answer
system.time(do.call(rbind, tapply(unlist(x, use.names = FALSE), rep(1:ncol(x), each = nrow(x)), table)))
# user system elapsed
# 0.108 0.000 0.111
Dirty's answer can be improved, by:
## improved DirtySockSniffer's answer
system.time({clevels <- levels(unlist(x, use.names = FALSE));
do.call(rbind, lapply(x, function(u) table(factor(u, levels=clevels))))})
# user system elapsed
# 0.108 0.000 0.108
Also consider user20650's answer:
## Let's try a large `n`, `p`, `k`
x <- datsim(200, 5000, 5)
system.time(t(table(stack(lapply(x, as.character)))))
# user system elapsed
# 0.592 0.052 0.646
While my answer does:
system.time(do.call(rbind, tapply(unlist(x, use.names = FALSE), rep(1:ncol(x), each = nrow(x)), table)))
# user system elapsed
# 1.844 0.056 1.904
Improved Dirty's answer does:
system.time({clevels <- levels(unlist(x, use.names = FALSE));
do.call(rbind, lapply(x, function(u) table(factor(u, levels=clevels))))})
# user system elapsed
# 1.240 0.012 1.263
Is there an elegant/fastR way to combine all pairs of columns in a data.frame?
For example, using mapply() and paste() we can turn this data.frame:
mydf <- data.frame(a.1 = letters, a.2 = 26:1, b.1 = letters, b.2 = 1:26)
head(mydf)
a.1 a.2 b.1 b.2
1 a 26 a 1
2 b 25 b 2
3 c 24 c 3
4 d 23 d 4
5 e 22 e 5
6 f 21 f 6
into this data.frame:
mydf2 <- mapply(function(x, y) {
paste(x, y, sep = ".")},
mydf[ ,seq(1, ncol(mydf), by = 2)],
mydf[ ,seq(2, ncol(mydf), by = 2)])
head(mydf2)
a.1 b.1
[1,] "a.26" "a.1"
[2,] "b.25" "b.2"
[3,] "c.24" "c.3"
[4,] "d.23" "d.4"
[5,] "e.22" "e.5"
[6,] "f.21" "f.6"
However, this feels clumsy and is a bit slow when applied to big datasets. Any suggestions, perhaps using a Hadley package?
EDIT:
The ideal solution would easily scale to large numbers of columns, such that the names of the columns would not need to be included in the function call. Thanks!
It's amusing to note that the OP's solution appears to be the fastest one:
f1 <- function(mydf) {
  # Paste each pair of adjacent columns (1 & 2, 3 & 4, ...) with ".".
  #
  # mydf: data frame with an even number of columns.
  # Returns a character matrix with one column per pair, named after the
  # first column of each pair.
  #
  # With an odd number of columns mapply() would silently recycle the
  # shorter list and pair the wrong columns, so reject that input.
  stopifnot(ncol(mydf) %% 2 == 0)
  odd <- seq(1, by = 2, length.out = ncol(mydf) / 2)
  # Select the columns from a plain list. `mydf[, odd]` drops to a bare
  # vector when only one pair exists, which made mapply() iterate over
  # single values instead of whole columns.
  cols <- as.list(mydf)
  mapply(
    function(x, y) paste(x, y, sep = "."),
    cols[odd],
    cols[odd + 1]
  )
}
f.thelatemail <- function(mydf) {
  # Paste every odd-numbered column to the column that follows it.
  # The logical mask is recycled across the columns of `mydf`.
  is_first <- c(TRUE, FALSE)
  firsts <- mydf[is_first]
  seconds <- mydf[!is_first]
  mapply(paste, firsts, seconds, sep = ".")
}
require(dplyr)
f.on_the_shores_of_linux_sea <- function(mydf) {
  # dplyr variant with the column names a.1, a.2, b.1 and b.2 hard-coded;
  # only the two pasted columns x1 and x2 are kept.
  transmute(
    mydf,
    x1 = paste(a.1, a.2, sep = '.'),
    x2 = paste(b.1, b.2, sep = '.')
  )
}
f.jazurro <- function(mydf) {
  # Paste each odd-numbered column to its right-hand neighbour and bind the
  # resulting character vectors into a matrix (one column per pair).
  first_of_pair <- seq(1, ncol(mydf), 2)
  pasted <- lapply(first_of_pair, function(k) {
    paste(mydf[, k], mydf[, k + 1], sep = ".")
  })
  pasted %>% do.call(cbind, .)
}
library(data.table)
f.akrun <- function(mydf) {
  # Paste each pair of adjacent columns into a new data.table.
  #
  # mydf: data frame or data.table with the paired columns next to each
  #       other.
  # Returns a data.table with one character column per pair.
  #
  # The input is deliberately not passed to setDT(). setDT() converts by
  # reference, so it would turn the caller's data frame into a data.table
  # and change how later code (e.g. `mydf[, cols]`) behaves. `[[` reads
  # columns from both classes.
  res <- as.data.table(
    matrix(NA_character_, ncol = ncol(mydf) / 2, nrow = nrow(mydf))
  )
  indx <- seq(1, ncol(mydf), 2)
  for (j in seq_along(indx)) {
    # set() fills column j of `res` by reference.
    set(res, i = NULL, j = j,
        value = paste(mydf[[indx[j]]], mydf[[indx[j] + 1]], sep = '.'))
  }
  res
}
mydf <- data.frame(a.1 = letters, a.2 = 26:1, b.1 = letters, b.2 = 1:26)
mydf <- mydf[rep(1:nrow(mydf),5000),]
library(rbenchmark)
benchmark(f1(mydf),f.thelatemail(mydf),f.on_the_shores_of_linux_sea(mydf),f.jazurro(mydf),f.akrun(mydf))
Results:
# test replications elapsed relative user.self sys.self user.child sys.child
# 5 f.akrun(mydf) 100 14.000 75.269 13.673 0.296 0 0
# 4 f.jazurro(mydf) 100 0.388 2.086 0.314 0.071 0 0
# 3 f.on_the_shores_of_linux_sea(mydf) 100 15.585 83.790 15.293 0.280 0 0
# 2 f.thelatemail(mydf) 100 26.416 142.022 25.736 0.639 0 0
# 1 f1(mydf) 100 0.186 1.000 0.169 0.017 0 0
[Updated Benchmark]
I've added one solution from #thelatemail, which I missed in the original answer, and one solution from #akrun:
f.thelatemail2 <- function(mydf) {
  # Same pairing as f.thelatemail(), but the pasted columns are collected
  # with Map() and returned as a data frame instead of a matrix.
  pick_first <- c(TRUE, FALSE)
  joined <- Map(paste, mydf[pick_first], mydf[!pick_first], sep = ".")
  data.frame(joined)
}
f.akrun2 <- function(mydf) {
  # Paste each pair of adjacent columns, overwriting the first column of
  # every pair in a private copy of the input.
  #
  # mydf: data frame or data.table with the paired columns next to each
  #       other.
  # Returns a data.table holding only the pasted columns.
  indx <- as.integer(seq(1, ncol(mydf), 2))
  # Copy first and convert the copy. Calling setDT() on `mydf` itself
  # would convert the caller's object by reference.
  mydf2 <- copy(mydf)
  setDT(mydf2)
  for (j in indx) {
    set(mydf2, i = NULL, j = j,
        value = paste(mydf2[[j]], mydf2[[j + 1]], sep = "."))
  }
  mydf2[, indx, with = FALSE]
}
Benchmark:
library(rbenchmark)
benchmark(f1(mydf),f.thelatemail(mydf), f.thelatemail2(mydf), f.on_the_shores_of_linux_sea(mydf),f.jazurro(mydf),f.akrun(mydf),f.akrun2(mydf))
# test replications elapsed relative user.self sys.self user.child sys.child
# 6 f.akrun(mydf) 100 13.247 69.356 12.897 0.340 0 0
# 7 f.akrun2(mydf) 100 12.746 66.733 12.405 0.339 0 0
# 5 f.jazurro(mydf) 100 0.327 1.712 0.254 0.073 0 0
# 4 f.on_the_shores_of_linux_sea(mydf) 100 16.347 85.586 15.838 0.445 0 0
# 2 f.thelatemail(mydf) 100 26.307 137.733 25.536 0.708 0 0
# 3 f.thelatemail2(mydf) 100 15.938 83.445 15.136 0.750 0 0
# 1 f1(mydf) 100 0.191 1.000 0.156 0.036 0 0
I'm not sure this is the best approach. See if the below code gives any speed improvement
require(dplyr)
transmute(mydf,x1=paste0( a.1,'.', a.2),x2=paste0( b.1,'.', b.2))
Answer updated based on comment :-)
An option using set from data.table. It should be fast for large datasets as it modifies by reference and the overhead of [.data.table is avoided. Assuming that the columns are ordered for each pair of columns.
library(data.table)
res <- as.data.table(matrix(, ncol=ncol(mydf)/2, nrow=nrow(mydf)))
indx <- seq(1, ncol(mydf), 2)
setDT(mydf)
for(j in seq_along(indx)){
set(res, i=NULL, j=j, value= paste(mydf[[indx[j]]],
mydf[[indx[j]+1]], sep='.'))
}
head(res)
# V1 V2
#1: a.26 a.1
#2: b.25 b.2
#3: c.24 c.3
#4: d.23 d.4
#5: e.22 e.5
#6: f.21 f.6
Instead of creating a new result dataset, we can also update the same or a copy of the original dataset. There will be some warnings about type conversion, but I guess this would be a bit faster (not benchmarked)
setDT(mydf)
mydf2 <- copy(mydf)
for(j in indx){
set(mydf2, i=NULL, j=j, value= paste(mydf2[[j]],
mydf2[[j+1]], sep="."))
}
mydf2[,indx, with=FALSE]
Benchmarks
I tried the benchmarks on a slightly bigger data with many columns.
data
set.seed(24)
d1 <- as.data.frame(matrix(sample(letters,500*10000, replace=TRUE),
ncol=500), stringsAsFactors=FALSE)
set.seed(4242)
d2 <- as.data.frame(matrix(sample(1:200,500*10000,
replace=TRUE), ncol=500))
d3 <- cbind(d1,d2)
mydf <- d3[,order(c(1:ncol(d1), 1:ncol(d2)))]
mydf1 <- copy(mydf)
Compared f1, f.jazurro (fastest) (from #Marat Talipov's post) with f.akrun2
microbenchmark(f1(mydf), f.jazurro(mydf), f.akrun2(mydf1),
unit='relative', times=20L)
#Unit: relative
# expr min lq mean median uq max neval
# f1(mydf) 3.420448 2.3217708 2.714495 2.653178 2.819952 2.736376 20
#f.jazurro(mydf) 1.000000 1.0000000 1.000000 1.000000 1.000000 1.000000 20
#f.akrun2(mydf1) 1.204488 0.8015648 1.031248 1.042262 1.097136 1.066671 20
#cld
#b
#a
#a
In this, f.jazurro is slightly better than f.akrun2. I think if I increase the group size, nrows etc., it would be an interesting comparison.
For what it's worth seven years later, here is a trick using the glue package and eval() + parse(). I don't know how it compares to other answers, but it works pretty darn well for me.
mydf <- data.frame(a.1 = letters, a.2 = 26:1, b.1 = letters, b.2 = 1:26)
mydf2 <- mydf
vars <- c('a', 'b')
# Unite `<v>.1` and `<v>.2` into a single column `<v>` for every prefix.
# A plain loop replaces eval(parse(text = ...)): no code is assembled from
# strings, and the column names are passed as data.
for (v in vars) {
  mydf2 <- tidyr::unite(
    mydf2,
    !!v,                                      # name of the new column
    tidyr::all_of(paste0(v, c(".1", ".2"))),  # the two columns to combine
    na.rm = TRUE,
    sep = "."
  )
}
mydf2
I want to reshape my data frame from long to wide format, but I lose some data that I'd like to keep.
For the following example:
df <- data.frame(Par1 = unlist(strsplit("AABBCCC","")),
Par2 = unlist(strsplit("DDEEFFF","")),
ParD = unlist(strsplit("foo,bar,baz,qux,bla,xyz,meh",",")),
Type = unlist(strsplit("pre,post,pre,post,pre,post,post",",")),
Val = c(10,20,30,40,50,60,70))
# Par1 Par2 ParD Type Val
# 1 A D foo pre 10
# 2 A D bar post 20
# 3 B E baz pre 30
# 4 B E qux post 40
# 5 C F bla pre 50
# 6 C F xyz post 60
# 7 C F meh post 70
dfw <- dcast(df,
formula = Par1 + Par2 ~ Type,
value.var = "Val",
fun.aggregate = mean)
# Par1 Par2 post pre
# 1 A D 20 10
# 2 B E 40 30
# 3 C F 65 50
this is almost what I need but I would like to have
some field keeping data from ParD field (for example, as single merged string),
number of observations used for aggregation.
i.e. I would like the resulting data.frame to be as follows:
# Par1 Par2 post pre Num.pre Num.post ParD
# 1 A D 20 10 1 1 foo_bar
# 2 B E 40 30 1 1 baz_qux
# 3 C F 65 50 1 2 bla_xyz_meh
I would be grateful for any ideas. For example, I tried to solve the second task by writing in dcast: fun.aggregate=function(x) c(Val=mean(x),Num=length(x)) - but this causes an error.
Late to the party, but here's another alternative using data.table:
require(data.table)
dt <- data.table(df, key=c("Par1", "Par2"))
dt[, list(pre=mean(Val[Type == "pre"]),
post=mean(Val[Type == "post"]),
pre.num=length(Val[Type == "pre"]),
post.num=length(Val[Type == "post"]),
ParD = paste(ParD, collapse="_")),
by=list(Par1, Par2)]
# Par1 Par2 pre post pre.num post.num ParD
# 1: A D 10 20 1 1 foo_bar
# 2: B E 30 40 1 1 baz_qux
# 3: C F 50 65 1 2 bla_xyz_meh
[from Matthew] +1 Some minor improvements to save repeating the same ==, and to demonstrate local variables inside j.
dt[, list(pre=mean(Val[.pre <- Type=="pre"]), # save .pre
post=mean(Val[.post <- Type=="post"]), # save .post
pre.num=sum(.pre), # reuse .pre
post.num=sum(.post), # reuse .post
ParD = paste(ParD, collapse="_")),
by=list(Par1, Par2)]
# Par1 Par2 pre post pre.num post.num ParD
# 1: A D 10 20 1 1 foo_bar
# 2: B E 30 40 1 1 baz_qux
# 3: C F 50 65 1 2 bla_xyz_meh
dt[, { .pre <- Type=="pre" # or save .pre and .post up front
.post <- Type=="post"
list(pre=mean(Val[.pre]),
post=mean(Val[.post]),
pre.num=sum(.pre),
post.num=sum(.post),
ParD = paste(ParD, collapse="_")) }
, by=list(Par1, Par2)]
# Par1 Par2 pre post pre.num post.num ParD
# 1: A D 10 20 1 1 foo_bar
# 2: B E 30 40 1 1 baz_qux
# 3: C F 50 65 1 2 bla_xyz_meh
And if a list column is ok rather than a paste, then this should be faster :
dt[, { .pre <- Type=="pre"
.post <- Type=="post"
list(pre=mean(Val[.pre]),
post=mean(Val[.post]),
pre.num=sum(.pre),
post.num=sum(.post),
ParD = list(ParD)) } # list() faster than paste()
, by=list(Par1, Par2)]
# Par1 Par2 pre post pre.num post.num ParD
# 1: A D 10 20 1 1 foo,bar
# 2: B E 30 40 1 1 baz,qux
# 3: C F 50 65 1 2 bla,xyz,meh
Solution in 2 steps using ddply (I am not happy with it, but I get the result)
dat <- ddply(df,.(Par1,Par2),function(x){
data.frame(ParD=paste(paste(x$ParD),collapse='_'),
Num.pre =length(x$Type[x$Type =='pre']),
Num.post = length(x$Type[x$Type =='post']))
})
merge(dfw,dat)
Par1 Par2 post pre ParD Num.pre Num.post
1 A D 2.0 1 foo_bar 1 1
2 B E 4.0 3 baz_qux 1 1
3 C F 6.5 5 bla_xyz_meh 1 2
You could do a merge of two dcasts and an aggregate, here all wrapped into one large expression mostly to avoid having intermediate objects hanging around afterwards:
Reduce(merge, list(
dcast(df, formula = Par1+Par2~Type, value.var="Val",
fun.aggregate=mean),
setNames(dcast(df, formula = Par1+Par2~Type, value.var="Val",
fun.aggregate=length), c("Par1", "Par2", "Num.post",
"Num.pre")),
aggregate(df["ParD"], df[c("Par1", "Par2")], paste, collapse="_")
))
I'll post but agstudy's puts me to shame:
step1 <- with(df, split(df, list(Par1, Par2)))
step2 <- step1[sapply(step1, nrow) > 0]
step3 <- lapply(step2, function(x) {
piece1 <- tapply(x$Val, x$Type, mean)
piece2 <- tapply(x$Type, x$Type, length)
names(piece2) <- paste0("Num.", names(piece2))
out <- x[1, 1:2]
out[, 3:6] <- c(piece1, piece2)
names(out)[3:6] <- names(c(piece1, piece2))
out$ParD <- paste(unique(x$ParD), collapse="_")
out
})
data.frame(do.call(rbind, step3), row.names=NULL)
Yielding:
Par1 Par2 post pre Num.post Num.pre ParD
1 A D 2.0 1 1 1 foo_bar
2 B E 4.0 3 1 1 baz_qux
3 C F 6.5 5 2 1 bla_xyz_meh
What a great opportunity to benchmark!
Below are some runs of the plyr method (as suggested by #agstudy) compared with the data.table method (as suggested by #Arun)
using different sample sizes (N = 900, 2700, 10800)
Summary:
The data.table method outperforms the plyr method by a factor of 7.5
#-------------------#
# M E T H O D S #
#-------------------#
# additional methods below, in the updates
# Method 1 -- suggested by #agstudy
plyrMethod <- quote({
dfw<-dcast(df,
formula = Par1+Par2~Type,
value.var="Val",
fun.aggregate=mean)
dat <- ddply(df,.(Par1,Par2),function(x){
data.frame(ParD=paste(paste(x$ParD),collapse='_'),
Num.pre =length(x$Type[x$Type =='pre']),
Num.post = length(x$Type[x$Type =='post']))
})
merge(dfw,dat)
})
# Method 2 -- suggested by #Arun
dtMethod <- quote(
dt[, list(pre=mean(Val[Type == "pre"]),
post=mean(Val[Type == "post"]),
Num.pre=length(Val[Type == "pre"]),
Num.post=length(Val[Type == "post"]),
ParD = paste(ParD, collapse="_")),
by=list(Par1, Par2)]
)
# Method 3 -- suggested by #regetz
reduceMethod <- quote(
Reduce(merge, list(
dcast(df, formula = Par1+Par2~Type, value.var="Val",
fun.aggregate=mean),
setNames(dcast(df, formula = Par1+Par2~Type, value.var="Val",
fun.aggregate=length), c("Par1", "Par2", "Num.post",
"Num.pre")),
aggregate(df["ParD"], df[c("Par1", "Par2")], paste, collapse="_")
))
)
# Method 4 -- suggested by #Ramnath
castddplyMethod <- quote(
reshape::cast(Par1 + Par2 + ParD ~ Type,
data = ddply(df, .(Par1, Par2), transform,
ParD = paste(ParD, collapse = "_")),
fun = c(mean, length)
)
)
# SAMPLE DATA #
#-------------#
library(data.table)
library(plyr)
library(reshape2)
library(rbenchmark)
# for Par1, ParD
LLL <- apply(expand.grid(LETTERS, LETTERS, LETTERS, stringsAsFactors=FALSE), 1, paste0, collapse="")
lll <- apply(expand.grid(letters, letters, letters, stringsAsFactors=FALSE), 1, paste0, collapse="")
# max size is 17568 with current sample data setup, ie: floor(length(LLL) / 18) * 18
size <- 17568
size <- 10800
size <- 900
set.seed(1)
df<-data.frame(Par1=rep(LLL[1:(size/2)], times=rep(c(2,2,3), size)[1:(size/2)])[1:(size)]
, Par2=rep(lll[1:(size/2)], times=rep(c(2,2,3), size)[1:(size/2)])[1:(size)]
, ParD=sample(unlist(lapply(c("f", "b"), paste0, lll)), size, FALSE)
, Type=rep(c("pre","post"), size/2)
, Val =sample(seq(10,100,10), size, TRUE)
)
dt <- data.table(df, key=c("Par1", "Par2"))
# Confirming Same Results #
#-------------------------#
# Evaluate
DF1 <- eval(plyrMethod)
DF2 <- eval(dtMethod)
# Convert to DF and sort columns and sort ParD levels, for use in identical
colOrder <- sort(names(DF1))
DF1 <- DF1[, colOrder]
DF2 <- as.data.frame(DF2)[, colOrder]
DF2$ParD <- factor(DF2$ParD, levels=levels(DF1$ParD))
identical((DF1), (DF2))
# [1] TRUE
#-------------------------#
RESULTS
#--------------------#
# BENCHMARK #
#--------------------#
benchmark(plyr=eval(plyrMethod), dt=eval(dtMethod), reduce=eval(reduceMethod), castddply=eval(castddplyMethod),
replications=5, columns=c("relative", "test", "elapsed", "user.self", "sys.self", "replications"),
order="relative")
# SAMPLE SIZE = 900
relative test elapsed user.self sys.self replications
1.000 reduce 0.392 0.375 0.018 5
1.003 dt 0.393 0.377 0.016 5
7.064 plyr 2.769 2.721 0.047 5
8.003 castddply 3.137 3.030 0.106 5
# SAMPLE SIZE = 2,700
relative test elapsed user.self sys.self replications
1.000 dt 1.371 1.327 0.090 5
2.205 reduce 3.023 2.927 0.102 5
7.291 plyr 9.996 9.644 0.377 5
# SAMPLE SIZE = 10,800
relative test elapsed user.self sys.self replications
1.000 dt 8.678 7.168 1.507 5
2.769 reduce 24.029 23.231 0.786 5
6.946 plyr 60.277 52.298 7.947 5
13.796 castddply 119.719 113.333 10.816 5
# SAMPLE SIZE = 17,568
relative test elapsed user.self sys.self replications
1.000 dt 27.421 13.042 14.470 5
4.030 reduce 110.498 75.853 34.922 5
5.414 plyr 148.452 105.776 43.156 5
Update : Added results for baseMethod1
# Used only sample size of 90, as it was taking long
relative test elapsed user.self sys.self replications
1.000 dt 0.044 0.043 0.001 5
7.773 plyr 0.342 0.339 0.003 5
65.614 base1 2.887 2.866 0.028 5
Where
baseMethod1 <- quote({
step1 <- with(df, split(df, list(Par1, Par2)))
step2 <- step1[sapply(step1, nrow) > 0]
step3 <- lapply(step2, function(x) {
piece1 <- tapply(x$Val, x$Type, mean)
piece2 <- tapply(x$Type, x$Type, length)
names(piece2) <- paste0("Num.", names(piece2))
out <- x[1, 1:2]
out[, 3:6] <- c(piece1, piece2)
names(out)[3:6] <- names(c(piece1, piece2))
out$ParD <- paste(unique(x$ParD), collapse="_")
out
})
data.frame(do.call(rbind, step3), row.names=NULL)
})
Update 2: Added keying the DT as part of the metric
Adding the indexing step to the benchmark for fairness as per #MatthewDowle s comment.
However, presumably, if data.table is used, it will be in place of the data.frame and
hence the indexing will occur once and not simply for this procedure
dtMethod.withkey <- quote({
dt <- data.table(df, key=c("Par1", "Par2"))
dt[, list(pre=mean(Val[Type == "pre"]),
post=mean(Val[Type == "post"]),
Num.pre=length(Val[Type == "pre"]),
Num.post=length(Val[Type == "post"]),
ParD = paste(ParD, collapse="_")),
by=list(Par1, Par2)]
})
# SAMPLE SIZE = 10,800
relative test elapsed user.self sys.self replications
1.000 dt 9.155 7.055 2.137 5
1.043 dt.withkey 9.553 7.245 2.353 5
3.567 reduce 32.659 31.196 1.586 5
6.703 plyr 61.364 54.080 7.600 5
Update 3: Benchmarking #MD's edits to #Arun's original answer
dtMethod.MD1 <- quote(
dt[, list(pre=mean(Val[.pre <- Type=="pre"]), # save .pre
post=mean(Val[.post <- Type=="post"]), # save .post
pre.num=sum(.pre), # reuse .pre
post.num=sum(.post), # reuse .post
ParD = paste(ParD, collapse="_")),
by=list(Par1, Par2)]
)
dtMethod.MD2 <- quote(
dt[, { .pre <- Type=="pre" # or save .pre and .post up front
.post <- Type=="post"
list(pre=mean(Val[.pre]),
post=mean(Val[.post]),
pre.num=sum(.pre),
post.num=sum(.post),
ParD = paste(ParD, collapse="_")) }
, by=list(Par1, Par2)]
)
dtMethod.MD3 <- quote(
dt[, { .pre <- Type=="pre"
.post <- Type=="post"
list(pre=mean(Val[.pre]),
post=mean(Val[.post]),
pre.num=sum(.pre),
post.num=sum(.post),
ParD = list(ParD)) } # list() faster than paste()
, by=list(Par1, Par2)]
)
benchmark(dt.M1=eval(dtMethod.MD1), dt.M2=eval(dtMethod.MD2), dt.M3=eval(dtMethod.MD3), dt=eval(dtMethod),
replications=5, columns=c("relative", "test", "elapsed", "user.self", "sys.self", "replications"),
order="relative")
#--------------------#
Comparing the different data.table methods amongst themselves
# SAMPLE SIZE = 900
relative test elapsed user.self sys.self replications
1.000 dt.M3 0.198 0.197 0.001 5 <~~~ "list()" Method
1.242 dt.M1 0.246 0.243 0.004 5
1.253 dt.M2 0.248 0.242 0.007 5
1.884 dt 0.373 0.367 0.007 5
# SAMPLE SIZE = 17,568
relative test elapsed user.self sys.self replications
1.000 dt.M3 33.492 24.487 9.122 5 <~~~ "list()" Method
1.086 dt.M1 36.388 11.442 25.086 5
1.086 dt.M2 36.388 10.845 25.660 5
1.126 dt 37.701 13.256 24.535 5
Comparing MD3 ("list" method) with MD1 (best of DT non-list methods)
Using a clean session (ie, removing string cache)
_Note: Ran the following twice, fresh session each time, with practically identical results
Then re-ran in the *same* session, with reps=5. Results very different._
benchmark(dt.M1=eval(dtMethod.MD1), dt.M3=eval(dtMethod.MD3), replications=1, columns=c("relative", "test", "elapsed", "user.self", "sys.self", "replications"), order="relative")
# SAMPLE SIZE=17,568; CLEAN SESSION
relative test elapsed user.self sys.self replications
1.000 dt.M1 8.885 4.260 4.617 1
1.633 dt.M3 14.506 12.821 1.677 1
# SAMPLE SIZE=17,568; *SAME* SESSION
relative test elapsed user.self sys.self replications
1.000 dt.M1 33.443 10.200 23.226 5
1.048 dt.M3 35.060 26.127 8.915 5
#--------------------#
New benchmarks against previous methods
_Note: Not using the "list method" as results are not the same as other methods_
# SAMPLE SIZE = 900
relative test elapsed user.self sys.self replications
1.000 dt.M1 0.254 0.247 0.008 5
1.705 reduce 0.433 0.425 0.010 5
11.280 plyr 2.865 2.842 0.031 5
# SAMPLE SIZE = 17,568
relative test elapsed user.self sys.self replications
1.000 dt.M1 24.826 10.427 14.458 5
4.348 reduce 107.935 70.107 38.314 5
5.942 plyr 147.508 106.958 41.083 5
One Step solution combining reshape::cast with plyr::ddply
cast(Par1 + Par2 + ParD ~ Type, data = ddply(df, .(Par1, Par2), transform,
ParD = paste(ParD, collapse = "_")), fun = c(mean, length))
NOTE that the dcast function in reshape2 does not allow multiple aggregate functions to be passed, while the cast function in reshape does.
I believe this base R solution is comparable with #Arun's data table solution. (Which isn't to say I would prefer it; that code is much simpler!)
# Base-R aggregation of `df` by (Par1, Par2), stored as a quoted expression
# so it can be timed with eval(baseMethod2).
# It reads the variable `df` (columns Par1, Par2, ParD, Type, Val) from the
# evaluation environment and yields a data frame with one row per group:
# Par1, Par2, post, pre, Num.pre, Num.post, ParD.
baseMethod2 <- quote({
  # Row numbers of each (Par1, Par2) group. seq_len() is safe for a
  # zero-row `df`, where 1:nrow(df) would be c(1, 0).
  rows_by_group <- unname(split(seq_len(nrow(df)), with(df, paste(Par1, Par2, sep="\b"))))
  # vapply() fixes the result type, so an empty `df` gives integer(0)
  # instead of the list() that sapply() returns.
  first_row <- vapply(rows_by_group, `[`, integer(1), 1)
  out <- with(df, data.frame(Par1=Par1[first_row], Par2=Par2[first_row]))
  # Within each group, the row numbers split by Type ("pre" / "post").
  rows_by_type <- lapply(rows_by_group, function(i) split(i, df$Type[i]))
  # A group without rows of a given Type yields NaN for its mean.
  out$post <- vapply(rows_by_type, function(j) mean(df$Val[j$post]), numeric(1))
  out$pre <- vapply(rows_by_type, function(j) mean(df$Val[j$pre]), numeric(1))
  out$Num.pre <- vapply(rows_by_type, function(j) length(j$pre), integer(1))
  out$Num.post <- vapply(rows_by_type, function(j) length(j$post), integer(1))
  out$ParD <- vapply(rows_by_group, function(i) paste(df$ParD[i], collapse="_"), character(1))
  out
})
Using #RicardoSaporta's timing code with 900, 2700, and 10,800, respectively:
> relative test elapsed user.self sys.self replications
3 1.000 baseMethod2 0.230 0.229 0 5
1 1.130 dt 0.260 0.257 0 5
2 8.752 plyr 2.013 2.006 0 5
> relative test elapsed user.self sys.self replications
3 1.000 baseMethod2 0.877 0.872 0 5
1 1.068 dt 0.937 0.934 0 5
2 8.060 plyr 7.069 7.043 0 5
> relative test elapsed user.self sys.self replications
1 1.000 dt 6.232 6.178 0.031 5
3 1.085 baseMethod2 6.763 6.683 0.054 5
2 7.263 plyr 45.261 44.983 0.104 5
Trying to wrap different aggregation expressions into a self-contained function (expressions should yield atomic values)...
multi.by <- function(X, INDEX, ...) {
  # Evaluate several aggregation expressions per group of rows of `X`.
  #
  # X:     data frame whose columns the expressions may refer to.
  # INDEX: grouping key with one entry per row of `X`, given either as a
  #        vector or as a data frame of key columns.
  # ...:   named expressions, each evaluated once per group with the
  #        group's rows of `X` as the data; each should yield one value.
  # Returns one row per distinct key in order of first appearance: a list
  # matrix with the keys as row names for a vector INDEX, or the key
  # columns bound to the results for a data frame INDEX.
  expressions <- as.list(substitute(list(...)))[-1L]
  duplicates <- duplicated(INDEX)
  # Map every row to the position of its key among the distinct keys.
  # cumsum(!duplicates) only works when equal keys are adjacent; for keys
  # such as c("a", "b", "a") it would put the third row into group "b".
  key <- if (is.data.frame(INDEX)) {
    do.call(paste, c(unname(as.list(INDEX)), sep = "\r"))
  } else {
    INDEX
  }
  group <- match(key, key[!duplicates])
  parts <- split(X, group, drop = TRUE)
  res <- do.call(rbind, lapply(parts, function(part) {
    lapply(expressions, eval, part)
  }))
  if (is.data.frame(INDEX)) {
    # drop = FALSE keeps a one-column INDEX as a data frame.
    res <- cbind(INDEX[!duplicates, , drop = FALSE], res)
  } else {
    rownames(res) <- INDEX[!duplicates]
  }
  res
}
multi.by(df,df[,1:2],
pre=mean(Val[Type=="pre"]),
post=mean(Val[Type=="post"]),
Num.pre=sum(Type=="pre"),
Num.post=sum(Type=="post"),
ParD=paste(ParD, collapse="_"))