cbind function in a for loop

# readline() returns a character string, so convert the inputs to integers first
c <- as.integer(readline(prompt = "Enter an integer: "))
b <- as.integer(readline(prompt = "Enter an integer: "))
for (i in 1:c) {
  assign(paste("a", i, sep = ""), i)
}
This creates the variables a1, a2, ..., ac, containing 1, 2, ..., c.
How can I use cbind to group these variables based on the value of b? For example, take the following:
# assume b = 3 and c = 12:
t1 <- cbind(a1,a2,a3)
t2 <- cbind(a4,a5,a6)
t3 <- cbind(a7,a8,a9)
t4 <- cbind(a10,a11,a12)
# assume b = 4 and c = 12:
t1 <- cbind(a1,a2,a3,a4)
t2 <- cbind(a5,a6,a7,a8)
t3 <- cbind(a9,a10,a11,a12)
Another example to clarify: assume b = 3, c=6
a1 <- c(3,5,2)
a2 <- c(4,7,3)
a3 <- c(3,5,2)
a4 <- c(4,5,3)
a5 <- c(5,5,5)
a6 <- c(4,3,1)
t1 <- cbind(a1,a2,a3)
t2 <- cbind(a4,a5,a6)
Expected value of t1:
3 4 3
5 7 5
2 3 2

I am making some assumptions about your data; in particular, that the variables you are trying to cbind already have values assigned to them.
a <- 12
b <- 3
test <- NULL
index <- NULL
for (i in 1:a) {
  test[i] <- paste0("n_", i)
  index[i] <- paste(i)
}
start <- seq(1, a - b + 1, by = b)
end <- seq(b, a, by = b)
s <- list()
k <- 1
for (k in 1:length(start)) {
  cbind_list <- start[k]:end[k]
  s[[k]] <- rbind(test[seq(cbind_list[1], cbind_list[length(cbind_list)], by = 1)])
}
list_cols <- do.call(rbind, s)
n_1 <- rep(1,4)
n_2 <- rep(2,4)
n_3 <- rep(3,4)
n_4 <- rep(4,4)
n_5 <- rep(5,4)
n_6 <- rep(6,4)
n_7 <- rep(7,4)
n_8 <- rep(8,4)
n_9 <- rep(9,4)
n_10 <- rep(10,4)
n_11 <- rep(11,4)
n_12 <- rep(12,4)
df <- data.frame(n_1,n_2,n_3,n_4,n_5,n_6,n_7,n_8,n_9,n_10,n_11,n_12)
t <- list()
for (p in 1:nrow(list_cols)) {
  nam <- paste0("t", p)
  assign(nam, cbind(df[, match(list_cols[p, ], colnames(df))]))
}
OUTPUT:
> t1
n_1 n_2 n_3
1 1 2 3
2 1 2 3
3 1 2 3
4 1 2 3
UPDATED:
a <- 6
b <- 3
test <- NULL
index <- NULL
for (i in 1:a) {
  test[i] <- paste0("n_", i)
  index[i] <- paste(i)
}
start <- seq(1, a - b + 1, by = b)
end <- seq(b, a, by = b)
s <- list()
k <- 1
for (k in 1:length(start)) {
  cbind_list <- start[k]:end[k]
  s[[k]] <- rbind(test[seq(cbind_list[1], cbind_list[length(cbind_list)], by = 1)])
}
list_cols <- do.call(rbind, s)
n_1 <- c(3,5,2)
n_2 <- c(4,7,3)
n_3 <- c(3,5,2)
n_4 <- c(4,5,3)
n_5 <- c(5,5,5)
n_6 <- c(4,3,1)
df <- data.frame(n_1,n_2,n_3,n_4,n_5,n_6)
t <- list()
p <- 1
for (p in 1:nrow(list_cols)) {
  nam <- paste0("t", p)
  assign(nam, cbind(df[, match(list_cols[p, ], colnames(df))]))
}
OUTPUT:
> t1
n_1 n_2 n_3
1 3 4 3
2 5 7 5
3 2 3 2
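A more compact route to the same grouping (my addition, a hedged sketch rather than part of the original answer): split the column indices of df into consecutive blocks of b and subset directly, which avoids assign() altogether.
# Hedged sketch, reusing the df and b defined above:
# group the columns 1..ncol(df) into consecutive blocks of b columns each
blocks <- split(seq_len(ncol(df)), ceiling(seq_len(ncol(df)) / b))
t_list <- lapply(blocks, function(cols) df[, cols, drop = FALSE])
t_list[[1]]  # corresponds to t1 above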

Related

Grouping factors or integers into equivalence classes in R

I have a data frame representing equivalences between members from two sets:
print(x)
G S
1 g1 s2
2 g1 s1
3 g2 s3
4 g3 s3
5 g4 s3
Does someone know of a function or a useful data structure for grouping the objects into equivalence classes? In the example above, the result should be two equivalence classes
{g1, s1, s2}, {g2, g3, g4, s3}
An option is to use igraph to extract vertices from clusters:
library(igraph)
g <- graph_from_data_frame(x)
m <- clusters(g)$membership
tapply(names(m), m, sort)
output:
$`1`
[1] "g1" "s1" "s2"
$`2`
[1] "g2" "g3" "g4" "s3"
data:
x <- read.table(text="G S
g1 s2
g1 s1
g2 s3
g3 s3
g4 s3", header=TRUE, stringsAsFactors=FALSE)
You can test for equality using outer and combine the two results with | (logical OR). From this matrix keep the unique rows and then use apply to return a list of the groups.
tt <- outer(x$G, x$G, "==") | outer(x$S, x$S, "==")
tt <- unique(tt)
apply(tt, 1, function(i) unique(unlist(x[i,])))
#[[1]]
#[1] "g1" "s2" "s1"
#
#[[2]]
#[1] "g2" "g3" "g4" "s3"
Another option, which loops over the vectors instead of expanding them the way outer does:
y <- unique(x)
t1 <- tt1 <- y[1, 1]
t2 <- tt2 <- y[1, 2]
y <- y[-1, ]
n <- 1
res <- list(0)
repeat {
  i <- y[, 1] %in% tt1 | y[, 2] %in% tt2
  tt <- y[i, ]
  y <- y[!i, ]
  tt1 <- unique(tt[!tt[, 1] %in% tt1, 1])
  tt2 <- unique(tt[!tt[, 2] %in% tt2, 2])
  if (length(tt1) + length(tt2) > 0) {
    t1 <- c(t1, tt1)
    t2 <- c(t2, tt2)
  } else {
    res[[n]] <- unique(c(t1, t2))
    if (nrow(y) == 0) break
    n <- n + 1
    t1 <- tt1 <- y[1, 1]
    t2 <- tt2 <- y[1, 2]
    y <- y[-1, ]
  }
}
res
#[[1]]
#[1] "g1" "s2" "s1"
#
#[[2]]
#[1] "g2" "g3" "g4" "s3"
Data:
x <- structure(list(G = c("g1", "g1", "g2", "g3", "g4"), S = c("s2",
"s1", "s3", "s3", "s3")), class = "data.frame", row.names = c(NA, -5L))
You can apply the following code for grouping
# function to categorize incoming `v` within existing `lst`
grp <- function(lst, v) {
  if (length(lst) == 0) return(c(lst, list(v)))
  idx <- which(unlist(Map(function(x) any(!is.na(match(v, x))), lst)))
  if (length(idx) == 0) {
    lst <- c(lst, list(v))
  } else {
    lst[idx] <- list(union(unlist(lst[idx]), v))
  }
  return(unique(lst))
}
# generate grouping results
df <- unique(df)
res <- Reduce(function(lst, x) grp(lst, x),
              c(list(NULL), unname(Map(function(x) as.character(unlist(x)), split(df, seq(nrow(df)))))),
              accumulate = FALSE)
Application Examples
given input data df <- data.frame(G = c("g1","g1","g2","g3","g4"), S = c("s2","s1","s3","s3","s3"))
then
> df
G S
1 g1 s2
2 g1 s1
3 g2 s3
4 g3 s3
5 g4 s3
> res
[[1]]
[1] "g1" "s2" "s1"
[[2]]
[1] "g2" "s3" "g3" "g4"
given input data df <- data.frame(G = sprintf("g%i", c(2,3,4,2,2)), S = sprintf("s%i", c(3,3,2,4,3)))
then
> df
G S
1 g2 s3
2 g3 s3
3 g4 s2
4 g2 s4
> res
[[1]]
[1] "g2" "s3" "g3" "s4"
[[2]]
[1] "g4" "s2"
UPDATE: the solution above becomes rather slow when dealing with huge datasets. An improved solution is given below:
G2S <- function(df, g) {
  df[df$G %in% g, ]$S
}
S2G <- function(df, s) {
  df[df$S %in% s, ]$G
}
grpFun <- function(df, g) {
  repeat {
    gt <- S2G(df, (s <- G2S(df, g)))
    if (length(gt) == length(g)) return(list(G = gt, S = s))
    g <- gt
  }
}
res <- c()
Gpool <- x$G
repeat {
  if (length(Gpool) == 0) break
  grp <- grpFun(x, Gpool[1])
  Gpool <- setdiff(Gpool, grp$G)
  res <- c(res, list(union(unique(grp$G), unique(grp$S))))
}
To compare the runtime of the three answers by #GKi, #chinsoon12, and #ThomasisCoding, I created random sets of different sizes n and measured the runtime (as "elapsed" from proc.time).
From the results, I conclude that the method relying on igraph's connected-component decomposition is the fastest:
n chinsoon12 ThomasisCoding GKi
500 0.002 0.054 0.030
2500 0.010 0.203 0.416
5000 0.020 0.379 1.456
7500 0.033 0.670 3.351
10000 0.044 0.832 5.837
Edit (2019-11-19): Upon request of #GKi, here is the code I used for comparing the runtimes of the three algorithms. Beware that all functions work on the global variable x, because passing x by value would add unwanted overhead to this runtime estimation:
library(igraph)
# solution by chinsoon12: CC decomposition from igraph
method.A <- function() {
  g <- graph_from_data_frame(x)
  m <- clusters(g)$membership
  res <- tapply(names(m), m, sort)
  return(res)
}
# solution by ThomasisCoding
method.B <- function() {
  # find 1-to-1 mapping
  r <- Reduce(intersect, lapply(names(x), function(v) split(x, x[v])))
  r1map <- unlist(Map(toString, Map(unlist, r)))
  # remove one-to-one mappings and find N-to-1 mappings
  if (length(r1map) > 0) {
    xx <- x[-as.numeric(rownames(Reduce(rbind, r))), ]
  } else {
    xx <- x
  }
  rNmap <- c()
  if (nrow(xx) > 0) {
    rNmap <- sapply(names(xx),
                    function(v) {
                      z <- split(xx, xx[v])
                      u <- z[unlist(Map(nrow, z)) > 1]
                      ifelse(length(u) == 0, NA, toString(c(names(u), as.vector(u[[1]][, setdiff(names(xx), v)]))))
                    }, USE.NAMES = F)
    rNmap <- rNmap[!is.na(rNmap)]
  }
  # combine both 1-to-1 and n-to-1 mappings
  res <- c(r1map, rNmap)
  return(res)
}
# solution by GKi: with outer product
method.C <- function() {
  tt <- outer(x$G, x$G, "==") | outer(x$S, x$S, "==")
  tt <- unique(tt)
  res <- apply(tt, 1, function(i) unique(unlist(x[i, ])))
  return(res)
}
# runtime results
rt <- data.frame()
for (n in seq(500, 10000, by = 500)) {
  # this won't work because of ambiguous node ids (see the answer by GKi):
  #x <- data.frame(G = sample(1:n,n,replace=TRUE), S = sample(1:n,n,replace=TRUE))
  # therefore, make the node ids unique:
  x <- data.frame(G = sprintf("g%i", sample(1:n,n,replace=TRUE)), S = sprintf("s%i", sample(1:n,n,replace=TRUE)))
  t1 <- proc.time()
  method.A()
  tA <- proc.time() - t1
  t1 <- proc.time()
  method.B()
  tB <- proc.time() - t1
  t1 <- proc.time()
  method.C()
  tC <- proc.time() - t1
  rt <- rbind(rt, data.frame(n = n, t.A = tA[["elapsed"]], t.B = tB[["elapsed"]], t.C = tC[["elapsed"]]))
}
print(rt)
plot(rt$n, rt$t.C, xlab = "n", ylab = "run time [s]", ylim = c(min(rt$t.A), max(rt$t.C)), type = 'l')
lines(rt$n, rt$t.B, col = "red")
lines(rt$n, rt$t.A, col = "blue")
legend("topleft", c("GKi", "ThomasisCoding", "chinsoon12"), lty = c(1, 1, 1), col = c("black", "red", "blue"))
Comparison of the results of the three methods:
method.A()
#$`1`
#[1] "1" "2" "3" "4"
method.A2()
#$`1`
#[1] "3" "1" "4" "2"
#
#$`2`
#[1] "2" "3"
method.B()
#[[1]]
#[1] 3 1 4 2
#
#[[2]]
#[1] 2 3
method.C()
#[[1]]
#[[1]]$All
#[1] 3 1 4 2
#
#[[1]]$G
#[1] 3 1
#
#[[1]]$S
#[1] 4 2 1
#
#
#[[2]]
#[[2]]$All
#[1] 2 3
#
#[[2]]$G
#[1] 2
#
#[[2]]$S
#[1] 3
Methods:
library(igraph)
method.A <- function() {
  g <- graph_from_data_frame(x)
  m <- clusters(g)$membership
  res <- tapply(names(m), m, sort)
  return(res)
}
method.A2 <- function() {
  g <- graph_from_data_frame(t(apply(x, 1, function(x) paste0(names(x), x))))
  m <- clusters(g)$membership
  res <- tapply(substring(names(m), 2), m, unique)
  return(res)
}
method.B <- function() {
  G2S <- function(df, g) {
    df[df$G %in% g, ]$S
  }
  S2G <- function(df, s) {
    df[df$S %in% s, ]$G
  }
  grpFun <- function(df, g) {
    repeat {
      gt <- S2G(df, (s <- G2S(df, g)))
      if (length(gt) == length(g)) return(list(G = gt, S = s))
      g <- gt
    }
  }
  res <- c()
  Gpool <- x$G
  repeat {
    if (length(Gpool) == 0) break
    grp <- grpFun(x, Gpool[1])
    Gpool <- setdiff(Gpool, grp$G)
    res <- c(res, list(union(unique(grp$G), unique(grp$S))))
  }
  return(res)
}
method.C <- function() {
  y <- unique(x)
  t1 <- tt1 <- y[1, 1]
  t2 <- tt2 <- y[1, 2]
  y <- y[-1, ]
  n <- 1
  res <- list(0)
  repeat {
    i <- y[, 1] %in% tt1 | y[, 2] %in% tt2
    tt <- y[i, ]
    y <- y[!i, ]
    tt1 <- unique(tt[!tt[, 1] %in% tt1, 1])
    tt2 <- unique(tt[!tt[, 2] %in% tt2, 2])
    if (length(tt1) + length(tt2) > 0) {
      t1 <- c(t1, tt1)
      t2 <- c(t2, tt2)
    } else {
      res[[n]] <- list(All = unique(c(t1, t2)), G = unique(t1), S = unique(t2))
      if (nrow(y) == 0) break
      n <- n + 1
      t1 <- tt1 <- y[1, 1]
      t2 <- tt2 <- y[1, 2]
      y <- y[-1, ]
    }
  }
  res
}
Data:
x <- data.frame(G = c(3,1,1,2,3), S=c(4,1,2,3,2))
x
# G S
#1 3 4
#2 1 1
#3 1 2
#4 2 3
#5 3 2
UPDATE: performance comparison based on latest updates by #GKi, #chinsoon12, and #ThomasisCoding
code for comparison
library(igraph)
method.A <- function() {
  g <- graph_from_data_frame(x)
  m <- clusters(g)$membership
  res <- tapply(names(m), m, sort)
  return(res)
}
method.B <- function() {
  G2S <- function(df, g) {
    df[df$G %in% g, ]$S
  }
  S2G <- function(df, s) {
    df[df$S %in% s, ]$G
  }
  grpFun <- function(df, g) {
    repeat {
      gt <- S2G(df, (s <- G2S(df, g)))
      if (length(gt) == length(g)) return(list(G = gt, S = s))
      g <- gt
    }
  }
  res <- c()
  Gpool <- x$G
  repeat {
    if (length(Gpool) == 0) break
    grp <- grpFun(x, Gpool[1])
    Gpool <- setdiff(Gpool, grp$G)
    res <- c(res, list(union(unique(grp$G), unique(grp$S))))
  }
  return(res)
}
method.C <- function() {
  y <- unique(x)
  t1 <- tt1 <- y[1, 1]
  t2 <- tt2 <- y[1, 2]
  y <- y[-1, ]
  n <- 1
  res <- list(0)
  repeat {
    i <- y[, 1] %in% tt1 | y[, 2] %in% tt2
    tt <- y[i, ]
    y <- y[!i, ]
    tt1 <- unique(tt[!tt[, 1] %in% tt1, 1])
    tt2 <- unique(tt[!tt[, 2] %in% tt2, 2])
    if (length(tt1) + length(tt2) > 0) {
      t1 <- c(t1, tt1)
      t2 <- c(t2, tt2)
    } else {
      res[[n]] <- list(All = unique(c(t1, t2)), G = unique(t1), S = unique(t2))
      if (nrow(y) == 0) break
      n <- n + 1
      t1 <- tt1 <- y[1, 1]
      t2 <- tt2 <- y[1, 2]
      y <- y[-1, ]
    }
  }
  res
}
# runtime results
rt <- data.frame()
for (n in seq(500, 10000, by = 500)) {
  # this won't work because of ambiguous node ids (see the answer by GKi):
  #x <- data.frame(G = sample(1:n,n,replace=TRUE), S = sample(1:n,n,replace=TRUE))
  # therefore, make the node ids unique:
  x <- data.frame(G = sprintf("g%i", sample(1:n,n,replace=TRUE)), S = sprintf("s%i", sample(1:n,n,replace=TRUE)))
  t1 <- proc.time()
  method.A()
  tA <- proc.time() - t1
  t1 <- proc.time()
  method.B()
  tB <- proc.time() - t1
  t1 <- proc.time()
  method.C()
  tC <- proc.time() - t1
  rt <- rbind(rt, data.frame(n = n, t.A = tA[["elapsed"]], t.B = tB[["elapsed"]], t.C = tC[["elapsed"]]))
}
print(rt)
plot(rt$n, rt$t.C, xlab = "n", ylab = "run time [s]", ylim = c(min(rt$t.A), max(rt$t.C)), type = 'l')
lines(rt$n, rt$t.B, col = "red")
lines(rt$n, rt$t.A, col = "blue")
legend("topleft", c("GKi", "ThomasisCoding", "chinsoon12"), lty = c(1, 1, 1), col = c("black", "red", "blue"))
Runtime of the three methods (elapsed seconds):
n t.A t.B t.C
1 500 0.00 0.16 0.26
2 1000 0.02 0.31 0.53
3 1500 0.02 0.51 1.11
4 2000 0.03 0.90 1.47
5 2500 0.03 1.35 2.17
6 3000 0.04 2.08 3.14
7 3500 0.04 2.66 3.97
8 4000 0.07 3.38 4.92
9 4500 0.07 4.38 6.35
10 5000 0.06 5.41 7.58
11 5500 0.08 6.79 9.55
12 6000 0.08 7.81 10.91
13 6500 0.10 9.03 12.06
14 7000 0.09 10.06 14.20
15 7500 0.11 11.76 15.65
16 8000 0.13 13.41 17.84
17 8500 0.11 14.87 20.67
18 9000 0.13 16.88 23.52
19 9500 0.14 18.38 25.57
20 10000 0.14 22.81 30.05
Visualization of the runtimes (plot produced by the code above, not shown here).
Additional (thanks to a comment by #GKi): when the dataset is kept as integers, the runtimes of the non-igraph grouping methods drop considerably:
n t.A t.B t.C
1 500 0.00 0.09 0.13
2 1000 0.01 0.15 0.23
3 1500 0.01 0.22 0.38
4 2000 0.03 0.31 0.50
5 2500 0.05 0.45 0.76
6 3000 0.07 0.51 0.77
7 3500 0.06 0.67 0.97
8 4000 0.07 0.85 1.20
9 4500 0.07 0.90 1.39
10 5000 0.09 1.23 1.55
11 5500 0.09 1.30 1.78
12 6000 0.09 1.51 1.94
13 6500 0.11 1.77 2.20
14 7000 0.13 2.18 2.55
15 7500 0.12 2.37 2.79
16 8000 0.13 2.56 2.96
17 8500 0.14 2.76 3.39
18 9000 0.15 3.03 3.54
19 9500 0.15 3.54 4.23
20 10000 0.16 3.76 4.32

Splitting data.frame inside and outside an R function

I have 3 data.frames (A, B1 and B2). I split each by variable study.name and get my desired output shown as out1, out2, out3:
J <- split(A, A$study.name); out1 <- do.call(rbind, c(J, make.row.names = F))
M <- split(B1, B1$study.name); out2 <- do.call(rbind, c(M, make.row.names = F))
N <- split(B2, B2$study.name); out3 <- do.call(rbind, c(N, make.row.names = F))
But I'm wondering why I can't achieve the same output from my function foo? (see below)
A <- read.csv("https://raw.githubusercontent.com/izeh/m/master/irr.csv", h = T) ## data A
B1 <- read.csv('https://raw.githubusercontent.com/izeh/m/master/irr2.csv', h = T) ## data B1
B2 <- read.csv("https://raw.githubusercontent.com/izeh/m/master/irr4.csv", h = T) ## data B2
foo <- function(...){ ## The unsuccessful function `foo`
  r <- list(...)
  ## r <- Can we HERE delete rows and columns that are ALL `NA` or EMPTY in `r`?
  J <- unlist(lapply(seq_along(r), function(i) split(r[[i]], r[[i]]$study.name)), recursive = FALSE)
  lapply(seq_along(J), function(i) do.call(rbind, c(J[[i]], make.row.names = FALSE)))
}
foo(B1, B2) # Example without success
We can do the cleaning of rows/columns before doing the split
foo <- function(...){
  r <- list(...)
  lapply(r, function(dat) {
    m1 <- is.na(dat) | dat == ""
    i1 <- rowSums(m1) < ncol(m1)
    j1 <- colSums(m1) < nrow(m1)
    dat1 <- dat[i1, j1]
    facColumns <- sapply(dat1, is.factor)
    dat1[facColumns] <- lapply(dat1[facColumns], as.character)
    dat1$study.name <- factor(dat1$study.name, levels = unique(dat1$study.name))
    l1 <- split(dat1, dat1$study.name)
    do.call(rbind, c(l1, make.row.names = FALSE))
  })
}
lapply(foo(B1, B2), head, 2)
#[[1]]
# study.name group.name outcome ESL prof scope type
#1 Shin.Ellis ME.short 1 1 2 1 1
#2 Shin.Ellis ME.long 1 1 2 1 1
#[[2]]
# study.name group.name outcome ESL prof scope type
#1 Shin.Ellis ME.short 1 1 2 1 1
#2 Shin.Ellis ME.long 1 1 2 1 1
or using a single object as argument
lapply(foo(A), head, 2)
#[[1]]
# study.name group.name outcome ESL prof scope type ESL.1 prof.1 scope.1 type.1
#1 Shin.Ellis ME.short 1 1 2 1 1 1 2 1 1
#2 Shin.Ellis ME.long 1 1 2 1 1 1 2 1 1
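Since foo() is variadic, it can also take all three data frames in one call (my addition, a hedged usage sketch assuming A, B1, and B2 are loaded as above):
out <- foo(A, B1, B2)
lapply(out, head, 2)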

variables not recognized in for-loop nested within lapply

I have the following data
library(data.table)
set.seed(42)
dat <- list(data.table(id = 1:10, group = rep(1:2, each = 5), x = rnorm(10)),
            data.table(id = 1:10, group = rep(1:2, each = 5), x = rnorm(10)))
to which I would like to apply this function element by element and group by group.
subs = function(x, ..., verbose=FALSE){
  L = substitute(list(...))[-1]
  mon = data.table(cond = as.character(L))[, skip := FALSE]
  for (i in seq_along(L)){
    d = eval( substitute(x[cond, verbose=v], list(cond = L[[i]], v = verbose)) )
    if (nrow(d)){
      x = d
    } else {
      mon[i, skip := TRUE]
    }
  }
  #print(mon)
  return(x)
}
However, when I run this code
# works
out <- lapply(1:2, function(h){
  res <- list()
  d <- dat[[h]]
  for(k in 1:2){
    g <- d[group==k]
    cutoff <- 1
    print(cutoff)
    res[[k]] <- subs(g, x>cutoff)
  }
  res
})
I receive the error that object cutoff cannot be found, although it is printed correctly. However, when I apply the same for-loop outside of the lapply(), it appears to work.
d1 <- dat[[1]]
s <- list()
for(k in 1:2){
  g <- d1[group==k]
  cutoff <- 1
  s[[k]] <- subs(g, x>cutoff)
}
> s
[[1]]
id group x
1: 1 1 1.370958
[[2]]
id group x
1: 7 2 1.511522
2: 9 2 2.018424
This leads me to suspect that it's the inclusion in the lapply() that causes the error, but I find it hard to see what the error is, let alone how to fix it.
Edit
Data with two variables:
set.seed(42)
dat <- list(data.table(id=1:10, group=rep(1:2, each=5), x=rnorm(10), y=11:20),
data.table(id=1:10, group=rep(1:2, each=5), x=rnorm(10), y=11:20))
with expected result
[[1]]
id group x y
1: 9 2 2.0184237 19
2: 1 1 1.3709584 11
3: 2 1 -0.5646982 12
4: 3 1 0.3631284 13
5: 4 1 0.6328626 14
6: 5 1 0.4042683 15
[[2]]
id group x y
1: 2 1 2.2866454 12
2: 10 2 1.3201133 20
If you use non-standard evaluation you always pay a price. Here it is a scoping issue.
It works like this:
subs = function(x, ..., verbose=FALSE){
  L = substitute(list(...))[-1]
  mon = data.table(cond = as.character(L))[, skip := FALSE]
  for (i in seq_along(L)){
    d = eval( substitute(x[cond,, #needed to add this comma, don't know why
                           verbose=v], list(cond = L[[i]], v = verbose)))
    if (nrow(d)){
      x = d
    } else {
      mon[i, skip := TRUE]
    }
  }
  #print(mon)
  return(x)
}
out <- lapply(1:2, function(h){
  res <- list()
  d <- dat[[h]]
  for(k in 1:2){
    g <- d[group==k]
    cutoff <- 1
    res[[k]] <- eval(substitute(subs(g, x>cutoff), list(cutoff = cutoff)))
  }
  res
})
#works
Is there a particular reason for not using data.table's by parameter?
Edit:
Background: The point of subs() is to apply multiple conditions (if
multiple are passed to it) unless one would result in an empty subset.
In that case I would use a different approach:
subs = function(x, ..., verbose=FALSE){
  L = substitute(list(...))[-1]
  for (i in seq_along(L)){
    d = eval( substitute(x[cond, , verbose=v], list(cond = L[[i]], v = verbose)))
    x <- rbind(d, x[!d, on = "group"])
  }
  return(x)
}
out <- lapply(dat, function(d){
  cutoff <- 2 #to get empty groups
  eval(substitute(subs(d, x>cutoff), list(cutoff = cutoff)))
})
#[[1]]
# id group x
#1: 9 2 2.0184237
#2: 1 1 1.3709584
#3: 2 1 -0.5646982
#4: 3 1 0.3631284
#5: 4 1 0.6328626
#6: 5 1 0.4042683
#
#[[2]]
# id group x
#1: 2 1 2.2866454
#2: 6 2 0.6359504
#3: 7 2 -0.2842529
#4: 8 2 -2.6564554
#5: 9 2 -2.4404669
#6: 10 2 1.3201133
Beware that this does not retain the ordering.
Another option that retains the ordering:
subs = function(x, ..., verbose=FALSE){
  L = substitute(list(...))[-1]
  for (i in seq_along(L)){
    x = eval( substitute(x[, {
      res <- .SD[cond]
      if (nrow(res) > 0) res else .SD
    }, by = "group", verbose=v], list(cond = L[[i]], v = verbose)))
  }
  return(x)
}
The by variable could be passed as a function parameter and then substituted in together with the condition; a sketch of that follows below.
I haven't done benchmarks comparing the efficiency of these two.
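A minimal sketch of that idea (my addition; the name subs_by and the by_col parameter are assumptions, not part of the answer):
# Hedged sketch: substitute both the condition and the grouping column
subs_by = function(x, ..., by_col = "group", verbose = FALSE){
  L = substitute(list(...))[-1]
  for (i in seq_along(L)){
    x = eval(substitute(x[, {
      res <- .SD[cond]
      if (nrow(res) > 0) res else .SD
    }, by = b, verbose = v], list(cond = L[[i]], b = by_col, v = verbose)))
  }
  return(x)
}
# usage, with dat and cutoff as in the examples above
cutoff <- 1
eval(substitute(subs_by(dat[[1]], x > cutoff, by_col = "group"), list(cutoff = cutoff)))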

Summing Multiple Times in a Column

I have a dataframe, df, of two columns, x and y. I am trying to sum values within column y and put the sums into another dataframe. The summing only occurs for a section of column y between NA values. There are multiple sections of column y that must be summed but I want each sum to be a separate value in the new data frame.
df <- data.frame(x = c(1966,0.1,0.2,0.3,0.4,5622,0.9,0.8,0.7,0.6,7889),
y = c(NA,1,2,3,4,NA,9,8,7,6,NA))
The answer should be in the format of a data frame with one column of two rows:
df <- data.frame(x = c(10,30))
I thought of solving this using some for loop and if statements for values between values of NA in column y. Any ideas?
So far, I have the following code, but I ultimately want it to work for a column with a series of more than two summations:
NAs <- which(is.na(df$y))
L1 <- length(NAs)
L0 <- dim(df)[1]
soln1 <- data.frame(matrix(nrow = L1-1, ncol = 1))
for(i in 1:L0){
  for(j in 1:L1){
    if (j == L1){
      break
    } else {
      soln1[j, 1] <- sum(df[NAs[j] + 1, 2]:df[NAs[j+1] - 1, 2])
    }
  }
}
I took a stab at it with some fake data:
df <- data.frame(x = c(1,1,3,1,3,1,1,1,1,1,3,1,1,1,1,1),
y = c(1,2,NA,4,5,NA,7,8,NA,10,11,NA,13,14,NA,16))
# df
# x y
#1 1 1
#2 1 2
#3 3 NA
#4 1 4
#5 3 5
#6 1 NA
#7 1 7
#8 1 8
#9 1 NA
#10 1 10
#11 3 11
#12 1 NA
#13 1 13
#14 1 14
#15 1 NA
#16 1 16
The magic function:
# sum rows in y if section is between NA values & before a value in column x that is > 2
specialSum <- function(x, y){
  starting <- which(c(NA, x[-length(x)]) > 2 & is.na(y))
  NAs <- which(is.na(y))
  L <- length(starting)
  ending <- sapply(1:L, function(z) NAs[NAs[-starting] > starting[z]][1])
  output <- matrix(NA, nrow = L)
  naming <- rep("", L)
  for(i in 1:L){
    output[i] <- sum(y[starting[i]:ending[i]], na.rm = T)
    naming[i] <- paste0(starting[i] + 1, ":", ending[i] - 1)
  }
  dimnames(output) <- list(naming, "specialSum")
  output
}
specialSum(df$x, df$y)
# specialSum
#7:8 15
#13:14 27
EDIT:
df <- data.frame(x = c(1966,0.1,0.2,0.3,0.4,5622,0.9,0.8,0.7,0.6,7889),
y = c(NA,1,2,3,4,NA,9,8,7,6,NA))
specialSum <- function(y){
  NAs <- which(is.na(y))
  starting <- NAs[-length(NAs)] + 1
  ending <- NAs[-1] - 1
  L <- length(starting)
  sums <- matrix(NA, nrow = L) ; naming <- rep("", L) # initialize for speed
  for(i in 1:L){
    sums[i] <- sum(y[starting[i]:ending[i]], na.rm = T)
    naming[i] <- paste0(starting[i], ":", ending[i])
  }
  sums <- sums[sums != 0, , drop = F] # in case there are multiple NAs in a row
  data.frame(specialSum = sums, row.names = naming)
}
specialSum(df$y)
# specialSum
#2:5 10
#7:10 30
EDIT#2:
NAs <- which(is.na(df$y))
sumlist <- vector("list", length(NAs)-1)
count <- 0
for(i in 1:nrow(df)){
  if(i %in% NAs){
    count = count + 1
  } else {
    sumlist[[count]] <- append(sumlist[[count]], df$y[i])
  }
}
data.frame(specialSum = unlist(lapply(sumlist, sum))) # less pretty output
# specialSum
#1 10
#2 30
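For completeness (my addition, not part of the answer): the same per-section sums can be computed without an explicit loop by labelling each stretch between NAs with cumsum(is.na(y)). This hedged sketch assumes, as in the question's data, that column y starts and ends with NA:
runs <- cumsum(is.na(df$y))  # run label for each row
keep <- !is.na(df$y)         # drop the NA separators themselves
data.frame(specialSum = as.vector(tapply(df$y[keep], runs[keep], sum)))
# specialSum
#1 10
#2 30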

How to add columns to data.frame based on vector length

I have a function runBootstrap whose output result is a vector of variable length (depending on # of values for cat, which itself is a product of test). Apologies that this isn't "minimal".
require(dplyr)
test <- function(combo) {
  if(combo[1] == 4) {
    cat <- 4
  } else if((combo[1] == 3 & combo[2] == 2) | (combo[1] == 2 & combo[2] == 2)) {
    cat <- 3
  } else if((combo[1] == 2 & combo[2] == 1) | (combo[1] == 1 & combo[2] == 2)) {
    cat <- 2
  } else {
    cat <- 1
  }
}
arg1.freqs <- c(0.5, 0.2, 0.1, 0.1)
arg2.freqs <- c(0.8, 0.2)
runBootstrap <- function(arg1.freqs, arg2.freqs) {
  sim.df <- data.frame(x1 = 1:10000, y1 = NA)
  sim.df$x1 <- sample(1:4, 10000, replace = TRUE,
                      prob = arg1.freqs)
  sim.df$y1 <- sample(1:2, 10000, replace = TRUE,
                      prob = arg2.freqs)
  sim.df$cat <- NA
  for(i in 1:nrow(sim.df)) {
    combo <- c(sim.df[i, 1], sim.df[i, 2])
    sim.df$cat[i] <- test(combo)
  }
  sim.df <- sim.df %>%
    select(cat) %>%
    group_by(cat) %>%
    summarise(n = n()) %>%
    mutate(freq = n / sum(n))
  sim.df <- as.data.frame(sim.df)
  result <- c(sim.df[1, 3], sim.df[2, 3])
}
In this current version there are only two values for cat so result is a vector of length 2; in a future version I will adjust code so that length(result) will equal # values of cat.
When using the function in a for loop, I would like to use the vector values to create new columns in an already existing data.frame df1. The code I've tried thus far is as follows:
df1$result <- NA
for (i in 1:nrow(df1)) {
  df1$result[i] <- runBootstrap(arg1.freqs, arg2.freqs)
}
This clearly doesn't work unless the result vector has length 1. But I don't know the length of the vector until the function runs (although once it runs, it will be the same length on each iteration).
What I would like to achieve is the following:
Example 1: if length(result) == 2
df1.col x1 x2
1 1 1 1
2 2 2 2
3 3 3 3
4 4 4 4
5 5 5 5
6 6 6 6
Example 2: if length(result) == 3
df1.col x1 x2 x3
1 1 1 1 1
2 2 2 2 2
3 3 3 3 3
4 4 4 4 4
5 5 5 5 5
6 6 6 6 6
Thanks for any advice or direction.
edited for clarification
UPDATE - edited with solution
I got it to work as I wanted by creating a blank list, populating, then using rbind as follows:
appendResults <- function(df1, arg1, arg2) {
  my.list <- vector("list", nrow(df1))
  for (i in 1:nrow(df1)) {
    arg1.freqs <- as.numeric(arg1[i, 3:6])
    arg2.freqs <- as.numeric(arg2[i, 3:4])
    my.list[[i]] <- runBootstrap(arg1.freqs, arg2.freqs)
  }
  result.df <- do.call(rbind, my.list)
  df2 <- do.call(cbind, list(df1, result.df))
}
Check this one; I am not sure what the final result should look like, but this creates empty columns, as many as the length of res, filled with NAs:
# fake data frame
df1 <- data.frame(x = c(1,2,3), y = c("a", "b", "c"))
# say result has length 3
res <- c(5,6,7)
# make columns with names x1, ..., x + length of res
# and assign NA values to those column
df1[ , paste("x", 1:length(res), sep = "")] <- NA
