Apply a family of functions over nested list -R - r

I need to apply a family of functions of form (a*x +b+ c) to
a nested list, e.g. in following form
map_function <- function(x,y){
return(linear_function(x[1],x[2],x[3],y))
}
linear_function <- function(x1,x2,x3,y){
g <- sapply(y, function(x){x1*x+x2+x3})%>% min(.)
return(g)
}
over two lists e.g. so that when map_function is passed on all arguments
pr_list <- list(c(1,2,3),c(4,5,6)) and
f_list <- list(c(234,34),c(456,34,567),c(111,222))
It will generate a nested list/matrix of 3 with 2 values in each. What is the R-way to do it other than using for loop?
e.g. if the output is a matrix, examples of the elements will be
M11 <- linear_function(pr_list[[1]][1],pr_list[[1]][2],pr_list[[1]][3],f_list[[1]] )
M12 <- linear_function(pr_list[[1]][1],pr_list[[1]][2],pr_list[[1]][3],f_list[[2]] )
M13 <- linear_function(pr_list[[1]][1],pr_list[[1]][2],pr_list[[1]][3],f_list[[3]] )
M21 <- linear_function(pr_list[[2]][1],pr_list[[2]][2],pr_list[[2]][3],f_list[[1]] )
M22 <- linear_function(pr_list[[2]][1],pr_list[[2]][2],pr_list[[2]][3],f_list[[2]] )
M23 <- linear_function(pr_list[[2]][1],pr_list[[2]][2],pr_list[[2]][3],f_list[[3]] )
M <- list(c(M11,M21),c(M12,M22),c(M13,M23))
print(M)
[[1]]
[1] 39 147
[[2]]
[1] 39 147
[[3]]
[1] 116 455

This is my best guess for you.
x <- data.frame(x1 = c(1,2,3), x2 = c(4,5,6))
y <- data.frame(y1 = c(234,34,NA),y2 = c(456,34,567), y3 = c(111,222,NA))
linear_function <- function(x, y){x[[1]]*y +x[[2]]+x[[3]]}
Which when applied like this, results in the following.
> linear_function(x$x1, y)
y1 y2 y3
1 239 461 116
2 39 39 227
3 NA 572 NA
> linear_function(x$x2, y)
y1 y2 y3
1 947 1835 455
2 147 147 899
3 NA 2279 NA
If you want a single object.
> z <- lapply(x, linear_function, y)
> z
$x1
y1 y2 y3
1 239 461 116
2 39 39 227
3 NA 572 NA
$x2
y1 y2 y3
1 947 1835 455
2 147 147 899
3 NA 2279 NA

Related

How to make a data frame of a list containing vectors and a matrix in R

I am struggling with transformation of a list to a data frame in R. I would like to make a data frame from a list containing two vectors and a matrix (x, y and z). The data looks like this:
x <- c(1,2,3,4,5)
y <- c(0,1,2,3,4)
z <- matrix(as.numeric(sample(100:200, 25)), nrow=5, ncol=5, byrow = TRUE)
lst <- list(x,y,z)
I would like to convert this to a data frame that looks like this:
x1 y1 z
x1 y2 z
x1 y3 z
x1 y4 z
x1 y5 z
x2 y1 z
etc...
With the column names x, y and z, respectively. There are a lot of questions already on Stackoverflow about conversion of a list to data frame, however I could not find an option for a list containing vectors as well as a data frame. The options I have tried so far did not succeed.
I tried:
#Option 1:
as.data.frame(lst)
#Option 2:
enframe(lst) #from the tidyverse package
#Option 3:
melt(lst) #from the reshape2 package
What did I miss and how can I fix it?
x <- c(1,2,3,4,5)
y <- c(0,1,2,3,4)
set.seed(123)
z <- matrix(as.numeric(sample(100:200, 25)), nrow=5, ncol=5, byrow = TRUE)
df <- data.frame( z = as.vector( z ) )
df$x <- rep( x, each = nrow( df ) / length( x ) )
df$y <- rep( y, times = nrow( df ) / length( y ) )
head(df, 10)
# z x y
# 1 129 1 0
# 2 104 1 1
# 3 187 1 2
# 4 177 1 3
# 5 172 1 4
# 6 178 2 0
# 7 150 2 1
# 8 198 2 2
# 9 120 2 3
# 10 155 2 4
Another option is expand.grid(x, y) - which returns a dataframe - to which we add z after we converted it to a vector.
setNames(`[<-`(expand.grid(y, x), "z", value = c(z)), c("y", "x", "z"))
# y x z
#1 0 1 129
#2 1 1 104
#3 2 1 187
#4 3 1 177
#5 4 1 172
#6 0 2 178
# ...

An error about vectorization in R

My R code is as follows. The main task is to calculate the row number of repetitions.
library(plyr)
data<-data.frame(1,2,3);
x <- read.table(text = "ID1 ID2 n m
13 156 12 15
94 187 14 16
66 297 41 48
29 89 42 49
78 79 51 79", header= TRUE)
distfunc <- function(data,ID1,ID2,n,m){
X1<-ID1; ################
X2<-ID2; ################
X3<-unlist(mapply(':', n, m));
data<-rbind(data,data.frame(X1,X2,X3));
return(data);
}
data<-distfunc(data,x$ID1, x$ID2,x$n, x$m)
data<-data[-1,]
plyr::count(data, names(data)); ## Calculates the row number of repetitions
The error message I get:
Error in data.frame(X1, X2, X3) :
arguments imply differing number of rows: 5, 52
I try to fix it by R Error: “In numerical expression has 19 elements: only the first used”, but it failed and the result is wrong. This probelm is not the same as that probelm.
I suppose you want to do:
# library(plyr)
# data<-data.frame(1,2,3);
x <- read.table(header=TRUE, text =
"ID1 ID2 n m
13 156 12 15
94 187 14 16
66 297 41 48
29 89 42 49
78 79 51 79")
#distfunc <- function(data, ID1, ID2, n, m) {
# X1 <- ID1 ################
# X2 <- ID2 ################
# X3 <- unlist(mapply(':', n, m))
# data <- rbind(data, data.frame(X1,X2,X3))
#}
#data <- distfunc(data, x$ID1, x$ID2, x$n, x$m)
L <- apply(x, 1, function(x) data.frame(X1=x[1], X2=x[2], X3=x[3]:x[4], row.names=NULL))
data <- L[[1]]
for (i in 2:length(L)) data <- rbind(data, L[[i]])
or with a better readable function in apply():
L <- apply(x, 1, function(r) data.frame(X1=r["ID1"], X2=r["ID2"], X3=r["n"]:r["m"], row.names=NULL))
data <- L[[1]]; for (i in 2:length(L)) data <- rbind(data, L[[i]])
Here is a simpler variant:
data <- data.frame(X1=x$ID1[1], X2=x$ID2[1], X3=x$n[1]:x$m[1])
for (i in 2:nrow(x)) data <- rbind(data, data.frame(X1=x$ID1[i], X2=x$ID2[i], X3=x$n[i]:x$m[i]))
I just fixed it.
distfunc <- function(data, ID1, ID2, n, m) {
X1 <- ID1
X2 <- ID2
X3 <- unlist(mapply(':', n, m))
data <- rbind(data,data.frame(X1, X2, X3))
return(data)
}

NA issues with Linear Regression

I want to regress y1 with x, then y2 with x and so on and save the slope,intercept,r2 values ,p values etc. to a vector such that the final vector should contain values for y1...n even if the value is NA.
The following is how my data looks
y1 y2 y3 y4 y5 x
NA 89 86 91 69 1
NA 86 NA 88 NA 2
NA 86 83 88 66 3
NA 100 NA 102 80 4
Using the following code, the slope values will not be calculated for those columns where all the values of y[,i] are NA but will still be calculated if one of the values is a NA.
slope<-0
for(i in 1:length(names(y))){
if (all(is.na(y[,i]))==FALSE) {
m <- lm(y[,i] ~ time)
slope <- c(slope, m$coefficients[2])
}
}
However,I still cannot figure out a way by which I maintain the positional information of all y[,i] such that my final vector output would look something like this
slope
1 NA
2 9.362637e-01
3 8.461538e-01
4 3.450549e-01
5 6.593407e-01
ANy help will be much appreciated
sapply over the non-x columns of DF returning the coefficients if there are any non-NAs in the dependent variable (y) and returning NAs otherwise:
t(sapply(DF[-6], function(y) if (any(!is.na(y))) coef(lm(y ~ x, DF)) else c(NA, NA)))
This gives the following where column 1 is the intercepts and column 2 is the slopes:
[,1] [,2]
y1 NA NA
y2 82.00000 3.300000
y3 87.50000 -1.500000
y4 84.00000 3.300000
y5 63.85714 2.928571
If only the slopes are needed then:
matrix(sapply(DF[-6], function(y) if (any(!is.na(y))) coef(lm(y ~ x, DF))[2] else NA))
#This is for the slope only.
nn<-lapply(paste0("y",1:5),function(i){
if (all(is.na(y[[i]]))==FALSE) {bb<-lm(y[[i]]~x,data=y)
return(bb[[1]][2])
}else{
return(NA)
}
})
do.call(rbind,kk)
x
[1,] NA
[2,] 3.300000
[3,] -1.500000
[4,] 3.300000
[5,] 2.928571
do.call(rbind,nn)

matrix operations and component-wise addition using data.table

What is the best way to do component-wise matrix addition if the number of matrices to be summed is not known in advance? More generally, is there a good way to perform matrix (or multi-dimensional array) operations in the context of data.table? I use data.table for its efficiency at sorting and grouping data by several fixed variables, or categories, each comprising a different number of observations.
For example:
Find the outer product of vector components given in each observation (row) of the data, returning a matrix for each row.
Sum the resulting matrices component-wise over all rows of each grouping of data categories.
Here illustrated with 2x2 matrices and only one category:
library(data.table)
# example data, number of rows differs by category t
N <- 5
dt <- data.table(t = rep(c("a", "b"), each = 3, len = N),
x1 = rep(1:2, len = N), x2 = rep(3:5, len = N),
y1 = rep(1:3, len = N), y2 = rep(2:5, len = N))
setkey(dt, t)
> dt
t x1 x2 y1 y2
1: a 1 3 1 2
2: a 2 4 2 3
3: a 1 5 3 4
4: b 2 3 1 5
5: b 1 4 2 2
I attempted a function to compute matrix sum on outer product, %o%
mat_sum <- function(x1, x2, y1, y2){
x <- c(x1, x2) # x vector
y <- c(y1, y2) # y vector
xy <- x %o% y # outer product (i.e. 2x2 matrix)
sum(xy) # <<< THIS RETURNS A SINGLE VALUE, NOT WHAT I WANT.
}
which, of course, does not work because sum adds up all the elements across the arrays.
I saw this answer using Reduce('+', .list) but that seems to require already having a list of all the matrices to be added. I haven't figured out how to do that within data.table, so instead I've got a cumbersome work-around:
# extract each outer product component first...
mat_comps <- function(x1, x2, y1, y2){
x <- c(x1, x2) # x vector
y <- c(y1, y2) # y vector
xy <- x %o% y # outer product (i.e. 2x2 matrix)
xy11 <- xy[1,1]
xy21 <- xy[2,1]
xy12 <- xy[1,2]
xy22 <- xy[2,2]
return(c(xy11, xy21, xy12, xy22))
}
# ...then running this function on dt,
# taking extra step (making column 'n') to apply it row-by-row...
dt[, n := 1:nrow(dt)]
dt[, c("xy11", "xy21", "xy12", "xy22") := as.list(mat_comps(x1, x2, y1, y2)),
by = n]
# ...then sum them individually, now grouping by t
s <- dt[, list(s11 = sum(xy11),
s21 = sum(xy21),
s12 = sum(xy12),
s22 = sum(xy22)),
by = key(dt)]
> s
t s11 s21 s12 s22
1: a 8 26 12 38
2: b 4 11 12 23
and that gives the summed components, which can finally be converted back to matrices.
In general, data.table is designed to work with columns. The more you transform your problem to col-wise operations, the more you can get out of data.table.
Here's an attempt at accomplishing this operation col-wise. Probably there are better ways. This is intended more as a template, to provide an idea on approaching the problem (even though I understand it may not be possible in all cases).
xcols <- grep("^x", names(dt))
ycols <- grep("^y", names(dt))
combs <- CJ(ycols, xcols)
len <- seq_len(nrow(combs))
cols = paste("V", len, sep="")
for (i in len) {
c1 = combs$V2[i]
c2 = combs$V1[i]
set(dt, i=NULL, j=cols[i], value = dt[[c1]] * dt[[c2]])
}
# t x1 x2 y1 y2 V1 V2 V3 V4
# 1: a 1 3 1 2 1 3 2 6
# 2: a 2 4 2 3 4 8 6 12
# 3: a 1 5 3 4 3 15 4 20
# 4: b 2 3 1 5 2 3 10 15
# 5: b 1 4 2 2 2 8 2 8
This basically applies the outer product col-wise. Now it's just a matter of aggregating it.
dt[, lapply(.SD, sum), by=t, .SDcols=cols]
# t V1 V2 V3 V4
# 1: a 8 26 12 38
# 2: b 4 11 12 23
HTH
Edit: Modified cols, c1, c2 a bit to get the output with the correct order for V2 and V3.
EDIT:
For not only 2 elements in "x"s and "y"s, a modified function could be:
ff2 = function(x_ls, y_ls)
{
combs_ls = lapply(seq_along(x_ls[[1]]),
function(i) list(sapply(x_ls, "[[", i),
sapply(y_ls, "[[", i)))
rowSums(sapply(combs_ls, function(x) as.vector(do.call(outer, x))))
}
where, "x_ls" and "y_ls" are lists of the respective vectors.
Using it:
dt[, as.list(ff2(list(x1, x2), list(y1, y2))), by = t]
# t V1 V2 V3 V4
#1: a 8 26 12 38
#2: b 4 11 12 23
And on other "data.frames/tables":
set.seed(101)
DF = data.frame(group = rep(letters[1:3], c(4, 2, 3)),
x1 = sample(1:20, 9, T), x2 = sample(1:20, 9, T),
x3 = sample(1:20, 9, T), x4 = sample(1:20, 9, T),
y1 = sample(1:20, 9, T), y2 = sample(1:20, 9, T),
y3 = sample(1:20, 9, T), y4 = sample(1:20, 9, T))
DT = as.data.table(DF)
DT[, as.list(ff2(list(x1, x2, x3, x4),
list(y1, y2, y3, y4))), by = group]
# group V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16
#1: a 338 661 457 378 551 616 652 468 460 773 536 519 416 766 442 532
#2: b 108 261 171 99 29 77 43 29 154 386 238 146 161 313 287 121
#3: c 345 351 432 293 401 421 425 475 492 558 621 502 510 408 479 492
I don't know, though, how would one in "data.table" not state explicitly which columns to use inside the function; i.e. how you could do the equivalent of:
do.call(rbind, lapply(split(DF[-1], DF$group),
function(x)
do.call(ff2, c(list(x[grep("^x", names(x))]),
list(x[grep("^y", names(x))])))))
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16]
#a 338 661 457 378 551 616 652 468 460 773 536 519 416 766 442 532
#b 108 261 171 99 29 77 43 29 154 386 238 146 161 313 287 121
#c 345 351 432 293 401 421 425 475 492 558 621 502 510 408 479 492
OLD ANSWER:
Perhaps you could define your function like:
ff1 = function(x1, x2, y1, y2)
rowSums(sapply(seq_along(x1),
function(i) as.vector(c(x1[i], x2[i]) %o% c(y1[i], y2[i]))))
dt[, as.list(ff1(x1, x2, y1, y2)), by = list(t)]
# t V1 V2 V3 V4
#1: a 8 26 12 38
#2: b 4 11 12 23

R - List of combinations with outer() and expand.grid()

I have a list of prime numbers with I multiply using outer() and upper.tri() to get a unique set of numbers.
primes <- c(2, 3, 5, 7, 11, 13, 17, 19, 23, 29)
m <- outer(primes, primes, "*")
unq <- m[which(upper.tri(m))]
> unq
6 10 15 14 21 35 22 33 55 77 26 39 65 91 143 34 51 85 119 187 221 38 57 95 133 209 247 323 46 69 115 161 253 299 391 437 58 87 145 203 319 377 493 551 667
Each of the original prime numbers represents a set of two numbers:
a2 <- c(1,1)
a3 <- c(1,2)
a5 <- c(2,2)
a7 <- c(1,3)
a11 <- c(1,4)
a13 <- c(2,3)
a17 <- c(2,4)
a19 <- c(3,3)
a23 <- c(3,4)
a29 <- c(4,4)
The combination of the two sets of two numbers produces 4 numbers
expand.grid(a2,a3)
1 1
1 1
1 2
1 2
So what I would like to do is have a kind of a list of lists, with each prime number having all 4 possible combinations.
I tried something like this, but I am missing some fundamentals here:
outer(a ,a , "expand.grid")
So the result would look something like this for the first prime:
6 c(11, 11, 12, 12)
I'm not sure I understand correctly, but I hope this helps:
#function to `outer`
fun <- function(x, y)
{
a1 <- get(paste0("a", x))
a2 <- get(paste0("a", y))
res <- apply(expand.grid(a1, a2), 1, paste, collapse = "")
res2 <- paste(res, collapse = ";")
return(res2)
}
#`outer` a vectorized `fun`
m2 <- outer(primes, primes, Vectorize(fun))
#select `upper.tri`
unq2 <- m2[upper.tri(m2)]
#combine to a list
myls <- lapply(as.list(unq2), function(x) as.numeric(unlist(strsplit(x, ";"))))
names(myls) <- unq
myls
#$`6`
#[1] 11 11 12 12
#$`10`
#[1] 12 12 12 12
#$`15`
#[1] 12 22 12 22
#$`14`
#[1] 11 11 13 13
#...

Resources