Preferred performant procedure for R data.table row-wise operations?

Does the following code represent the preferred procedure for traversing the rows of an R data.table and passing the values found at each row to a function? Or is there a more performant way to do this?
library(data.table)
set.seed(2)
n <- 100
b <- c(0.5, 1.5, -1)
phi <- 0.8
X <- cbind(1, matrix(rnorm(n*2, 0, 1), ncol = 2))
y <- X %*% matrix(b, ncol = 1) + rnorm(n, 0, phi)
d <- data.table(y, X)
setnames(d, c("y", "x0", "x1", "x2"))
logpost <- function(d, b1, b2, b3, phi, mub = 1, taub = 10, a = 0.5, z = 0.7){
  N <- nrow(d)
  mu <- b1 + b2 * d$x1 + b3 * d$x2
  lp <- -N * log(phi) -
    (1/(2*phi^2)) * sum( (d$y-mu)^2 ) -
    (1/(2*taub^2))*( (b1-mub)^2 + (b2-mub)^2 + (b3-mub)^2 ) -
    (a+1)*log(phi) - (z/phi)
  lp
}
nn <- 21
grid <- data.table(
  expand.grid(b1 = seq(0, 1, len = nn),
              b2 = seq(1, 2, len = nn),
              b3 = seq(-1.5, -0.5, len = nn),
              phi = seq(0.4, 1.2, len = nn)))
grid[, id := 1:.N]
setkey(grid, id)
wraplogpost <- function(dd){
  logpost(d, dd$b1, dd$b2, dd$b3, dd$phi)
}
start <- Sys.time()
grid[, lp := wraplogpost(.SD), by = seq_len(nrow(grid))]
difftime(Sys.time(), start)
# Time difference of 2.081544 secs
Edit: display first few records
> head(grid)
b1 b2 b3 phi id lp
1: 0.00 1 -1.5 0.4 1 -398.7618
2: 0.05 1 -1.5 0.4 2 -380.3674
3: 0.10 1 -1.5 0.4 3 -363.5356
4: 0.15 1 -1.5 0.4 4 -348.2663
5: 0.20 1 -1.5 0.4 5 -334.5595
6: 0.25 1 -1.5 0.4 6 -322.4152
I have tried using set, but that approach seems inferior:
start <- Sys.time()
grid[, lp := NA_real_]
for(i in 1:nrow(grid)){
  llpp <- wraplogpost(grid[i])
  set(grid, i, "lp", llpp)
}
difftime(Sys.time(), start)
# Time difference of 21.71291 secs
Edit: display first few records
> head(grid)
b1 b2 b3 phi id lp
1: 0.00 1 -1.5 0.4 1 -398.7618
2: 0.05 1 -1.5 0.4 2 -380.3674
3: 0.10 1 -1.5 0.4 3 -363.5356
4: 0.15 1 -1.5 0.4 4 -348.2663
5: 0.20 1 -1.5 0.4 5 -334.5595
6: 0.25 1 -1.5 0.4 6 -322.4152
Suggestions or pointers to the relevant docs would be appreciated.
Edit: per comments:
start <- Sys.time()
grid[, lp := wraplogpost(.SD), by = .I]
difftime(Sys.time(), start)
Warning messages:
1: In b2 * d$x1 :
longer object length is not a multiple of shorter object length
2: In b3 * d$x2 :
longer object length is not a multiple of shorter object length
3: In d$y - mu :
longer object length is not a multiple of shorter object length
> difftime(Sys.time(), start)
Time difference of 0.01199317 secs
>
> head(grid)
b1 b2 b3 phi id lp
1: 0.00 1 -1.5 0.4 1 -620977.2
2: 0.05 1 -1.5 0.4 2 -620977.2
3: 0.10 1 -1.5 0.4 3 -620977.2
4: 0.15 1 -1.5 0.4 4 -620977.2
5: 0.20 1 -1.5 0.4 5 -620977.2
6: 0.25 1 -1.5 0.4 6 -620977.2
which generates the wrong values for lp: the recycling warnings indicate that by = .I did not split the table into one-row groups here, so wraplogpost received entire columns at once.
Edit: thank you for the comments and responses. I am aware that this scenario could be addressed using alternative methods; my interest is in the preferred way to do this when using data.table.
Edit: thank you again for the responses. As none of them address how to do this explicitly with data.table, for the moment I am assuming that there is no ideal way to achieve this without turning to base R.
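For reference, one pattern that keeps the update in data.table's := idiom while avoiding per-row grouping is to iterate with base R's mapply over the parameter columns inside j. A minimal sketch, assuming logpost stays as defined above:
# a sketch: mapply does the row-wise iteration, := does the assignment
grid[, lp := mapply(function(b1, b2, b3, phi) logpost(d, b1, b2, b3, phi),
                    b1, b2, b3, phi)]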

If you want better performance (in time), you can rewrite the row-wise function as a matrix calculation.
start <- Sys.time()
grid_mat <- as.matrix(grid[, list(b1, b2, b3, 1)])
# function parameters
N <- nrow(d); mub = 1; taub = 10; a = 0.5; z = 0.7
d$const <- 1
# combining d$y - mu in this step already
mu_op <- matrix(c(-d$const, -d$x1, -d$x2, d$y), nrow = 4, byrow = TRUE)
mu_mat <- grid_mat %*% mu_op
mub_mat <- (grid_mat[, c("b1", "b2", "b3")] - mub)^2
# just to save one calculation of the log
phi <- grid$phi
log_phi <- log(grid$phi)
grid$lp2 <- -N * log_phi -
  (1/(2*phi^2)) * rowSums(mu_mat^2) -
  (1/(2*taub^2))*( rowSums(mub_mat) ) -
  (a+1)*log_phi - (z/phi)
head(grid)
difftime(Sys.time(), start)
The first rows:
b1 b2 b3 phi id lp lp2
1: 0.00 1 -1.5 0.4 1 -398.7618 -398.7618
2: 0.05 1 -1.5 0.4 2 -380.3674 -380.3674
3: 0.10 1 -1.5 0.4 3 -363.5356 -363.5356
4: 0.15 1 -1.5 0.4 4 -348.2663 -348.2663
5: 0.20 1 -1.5 0.4 5 -334.5595 -334.5595
6: 0.25 1 -1.5 0.4 6 -322.4152 -322.4152
For the timing:
# on your code on my pc:
Time difference of 4.390684 secs
# my code on my pc:
Time difference of 0.680476 secs

I think you can use matrix multiplication and other vectorization techniques to simplify your code, which lets you avoid running the function logpost row-wise.
Below is a vectorized version of logpost, i.e., logpost2:
logpost2 <- function(d, dd, mub = 1, taub = 10, a = 0.5, z = 0.7) {
  bmat <- as.matrix(dd[, .(b1, b2, b3)])
  xmat <- cbind(1, as.matrix(d[, .(x1, x2)]))
  phi <- dd$phi
  phi_log <- log(phi)
  lp <- -(a + nrow(d) + 1) * phi_log -
    (1 / (2 * phi^2)) * colSums((d$y - tcrossprod(xmat, bmat))^2) -
    (1 / (2 * taub^2)) * rowSums((bmat - mub)^2) - (z / phi)
  lp
}
and you will see
> start <- Sys.time()
> grid[, lp := logpost2(d, .SD)]
> difftime(Sys.time(), start)
Time difference of 0.1966231 secs
and
> head(grid)
b1 b2 b3 phi id lp
1: 0.00 1 -1.5 0.4 1 -398.7618
2: 0.05 1 -1.5 0.4 2 -380.3674
3: 0.10 1 -1.5 0.4 3 -363.5356
4: 0.15 1 -1.5 0.4 4 -348.2663
5: 0.20 1 -1.5 0.4 5 -334.5595
6: 0.25 1 -1.5 0.4 6 -322.4152
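As a quick sanity check (a sketch using the objects defined earlier), the vectorized result can be compared against the row-wise version:
# a sketch: confirm the vectorized logpost2 matches the row-wise computation
grid[, lp_row := wraplogpost(.SD), by = seq_len(nrow(grid))]
all.equal(grid$lp, grid$lp_row) # expected: TRUE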

Related

Computation with different combinations of parameters using for loop

I am trying to implement a for loop in R to fill a data frame with some combinations of learning rates and decays used in machine learning. The idea is to try several learning rates and decays, calculate error metrics for these combinations, and save them in a dataset, so I can point out which combination is best.
Below is the code and my result. I don't understand why I get this result.
learning_rate = c(0.01, 0.02)
decay = c(0, 1e-1)
combinations = length(learning_rate) * length(decay)
df <- data.frame(Combination=character(combinations),
                 lr=double(combinations),
                 decay=double(combinations),
                 result=character(combinations),
                 stringsAsFactors=FALSE)
for (i in 1:combinations) {
  for (lr in learning_rate) {
    for (dc in decay) {
      df[i, 1] = i
      df[i, 2] = lr
      df[i, 3] = dc
      df[i, 4] = 10*lr + dc*4 # Here I'd do some machine learning; this simple equation is just an example
    }
  }
}
The result I get is below. It seems that only the Combination column worked well. What did I do wrong?
Combination lr decay result
1 0.02 0.1 0.6
2 0.02 0.1 0.6
3 0.02 0.1 0.6
4 0.02 0.1 0.6
I expected this result
Combination lr decay result
1 0.01 0 0.1
2 0.01 1e-1 0.5
3 0.02 0 0.2
4 0.02 1e-1 0.6
The problem with the original code is that, for every value of i, the two inner loops run through all (lr, dc) combinations, so row i is overwritten repeatedly and keeps only the last pair. Let the loops themselves drive the filling instead. Tuning with a for loop:
df <- data.frame()
for (lr in learning_rate) {
  for (dc in decay) {
    df <- rbind(df, data.frame(
      lr = lr,
      decay = dc,
      result = 10*lr + dc*4
    ))
  }
}
df
# lr decay result
# 1 0.01 0.0 0.1
# 2 0.01 0.1 0.5
# 3 0.02 0.0 0.2
# 4 0.02 0.1 0.6
Tuning with mapply():
df <- expand.grid(lr = learning_rate, decay = decay)
ML.fun <- function(lr, dc) 10*lr + dc*4
df$result <- mapply(ML.fun, lr = df$lr, dc = df$decay)
df
# lr decay result
# 1 0.01 0.0 0.1
# 2 0.02 0.0 0.2
# 3 0.01 0.1 0.5
# 4 0.02 0.1 0.6
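Since the placeholder ML.fun here is itself vectorized, the mapply() call could also be replaced by a direct call; mapply() remains the general tool when the real model-fitting code only accepts scalars. A minimal sketch using the same objects:
df$result <- ML.fun(df$lr, df$decay) # same result, because 10*lr + dc*4 is vectorized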

Looping Through R data to replace all values

I'm a basic-level R programmer trying to re-calibrate data using a weighted effect and another value. In particular, I want to: 1) if the weighted effect is negative, take the row value of x and subtract the person's value; or 2) if the weighted effect is positive, take the person's value and subtract x.
Mock data:
p1 <- c(0.4,0.7,0.3,0.2)
p2 <- c(0.8,0.4,0.5,0.1)
p3 <- c(0.6,0.5,0.4,0.3)
wef <- c(1.5,-1.2,1.8,-1.3)
x <- c(0.5,0.4,0.6,0.2)
df <- data.frame(p1, p2, p3, wef, x) # build the data frame before printing
print(df)
p1 p2 p3 wef x
1 0.4 0.8 0.6 1.5 0.5
2 0.7 0.4 0.5 -1.2 0.4
3 0.3 0.5 0.4 1.8 0.6
4 0.2 0.1 0.3 -1.3 0.2
I attempted this (which did nothing and likely would be inefficient with for loops):
for(row in 1:nrow(df)) {
  for(col in 1:ncol(df)) {
    ifelse(weightef[row] < 0, df[row,col]==(df$x[row]-df[row,col]),
           df[row,col]==df[row,col]-df$x[row])
  }
}
My desired output, in case the above was too hard to follow, is this:
person1 person2 person3 weightef x
1 -0.1 0.3 0.1 1.5 0.5
2 -0.3 0.0 -0.1 -1.2 0.4
3 -0.3 -0.1 -0.2 1.8 0.6
4 0.0 0.1 -0.1 -1.3 0.2
You can use the apply and ifelse functions in R. This is just a one-line function, and you are not required to understand grep. The second line of code just puts everything back into a data frame.
result <- apply(df[, 1:3], 2, FUN = function(y) with(df, ifelse(wef < 0, x - y, y - x)))
df <- as.data.frame(cbind(result, wef, x))
p1 p2 p3 wef x
1 -0.1 0.3 0.1 1.5 0.5
2 -0.3 0.0 -0.1 -1.2 0.4
3 -0.3 -0.1 -0.2 1.8 0.6
4 0.0 0.1 -0.1 -1.3 0.2
We can do this without a loop in R:
nm1 <- grep("^p\\d+", names(df), value = TRUE)
i1 <- df$wef > 0
df[i1, nm1] <- df[i1, nm1] - df$x[i1]
df[!i1, nm1] <- df$x - df[!i1, nm1]
data
df <- data.frame(p1, p2, p3, wef, x)
If you want to use for loops, you can do it this way:
#Create data frame
df = data.frame(p1, p2, p3, wef, x)
#Loop over the length of vector wef
for (w in 1:length(df$wef))
{
  #Check for positive or negative weight
  if (df$wef[w] >= 0)
  {
    #subtracting
    df$p1[w] = df$p1[w] - df$x[w]
    df$p2[w] = df$p2[w] - df$x[w]
    df$p3[w] = df$p3[w] - df$x[w]
  }
  else
  {
    #subtracting
    df$p1[w] = df$x[w] - df$p1[w]
    df$p2[w] = df$x[w] - df$p2[w]
    df$p3[w] = df$x[w] - df$p3[w]
  }
}
#print result
print(df)
p1 p2 p3 wef x
1 -0.1 0.3 0.1 1.5 0.5
2 -0.3 0.0 -0.1 -1.2 0.4
3 -0.3 -0.1 -0.2 1.8 0.6
4 0.0 0.1 -0.1 -1.3 0.2
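As a compact vectorized alternative (a sketch, not from the original answers): the two branches differ only in sign, so sign() of the weighted effect can replace the branching, assuming wef is never exactly 0:
# a sketch: the two cases are (p - x) and -(p - x), so sign(wef) covers both
df[c("p1", "p2", "p3")] <- sign(df$wef) * (df[c("p1", "p2", "p3")] - df$x)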

Calculations as per the formula

I need to do some calculations as per the formula below:
B1 = A1 + (1-A1) * B1
example (each column updates using the already-updated value of the column to its left):
B1 = 0.2 + (1 - 0.2) * 0.4
   = 0.52
C1 = 0.52 + (1 - 0.52) * 0.8
   = 0.904
D1 = 0.904 + (1 - 0.904) * 0.5
   = 0.952
The same logic applies to the other rows and columns; there are 11 columns in total.
dataframe:
df
A B C D
0.2 0.4 0.8 0.5
0.4 0.5 0.6 0.2
0.8 0.1 0.5 0.4
0.3 0.4 0.1 0.8
Expected output:
A B C D
0.2 0.52 0.904 0.952
0.4 0.7 0.88 0.904
0.8 0.82 0.91 0.946
0.3 0.58 0.622 0.9244
I tried it for one step with the code below:
Df <- df[-ncol(df)] + (1 - df[-ncol(df)]) * df[-1]
I was able to get column B as per the output, but it's not working for the rest of the columns.
Please help, thanks. BM.
You can do this recursively as follows:
do.call(cbind, Reduce(f = function(A1, B1) A1 + (1 - A1) * B1,
                      x = df,
                      accumulate = TRUE))
Explanation:
Since df is a data.frame, which is a list of vectors, Reduce applies your function column by column, carrying each result forward, and accumulate = TRUE keeps every intermediate column. Then do.call(cbind, ...) combines the results.
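To see what accumulate = TRUE produces step by step, here is a minimal sketch on the first row of the example data:
# a sketch: Reduce walks left to right, feeding each result into the next step
Reduce(function(A1, B1) A1 + (1 - A1) * B1,
       c(0.2, 0.4, 0.8, 0.5),
       accumulate = TRUE)
# [1] 0.200 0.520 0.904 0.952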

Find column number that satisfies condition

I have two columns where the sum of each row is 1 (they are the probability of one of two classes). I need to find the column number where a condition is met.
C1 C2
0.4 0.6
0.3 0.7
1 0
0.7 0.3
0.1 0.9
For example, if I need to find the column where the number is >= 0.6,
in the table above it should result in:
2
2
1
1
2
Thanks for this interesting question. Here is an idea using apply.
apply(dat, 1, function(x) which(x >= 0.6))
# [1] 2 2 1 1 2
DATA
dat <- read.table(textConnection("C1 C2
0.4 0.6
0.3 0.7
1 0
0.7 0.3
0.1 0.9"), header = T)
Benchmarking
I conducted the benchmark on the original data frame dat and on a data frame with 5000 rows, dat2. The results are as follows. I feel a little bit embarrassed that my apply method is the slowest.
If anyone has any idea how to improve the way I conducted the benchmark, please let me know.
library(microbenchmark)
# Benchmark 1
perf <- microbenchmark(m1 = {apply(dat, 1, function(x) which(x >= 0.6))},
                       m2 = {ifelse(dat$C1 <= 0.4, 2, 1)},
                       m3 = {(dat$C2 >= 0.6) + 1},
                       m4 = {(which(t(dat) >= 0.6) + 1) %% ncol(dat) + 1},
                       m5 = {((dat >= 0.6) %*% c(1,2))[, 1]},
                       m6 = {m <- which(dat >= 0.6, arr.ind = TRUE)
                             m[order(m[, 1]), ][, 2]},
                       m7 = {max.col(dat >= 0.6)})
perf
# Unit: microseconds
# expr min lq mean median uq max neval
# m1 58.602 65.0280 88.34563 67.5985 70.6825 1746.246 100
# m2 9.253 12.8515 15.45772 13.8790 14.9080 49.349 100
# m3 4.112 5.6540 6.59015 6.1690 7.1970 23.132 100
# m4 30.844 35.7270 40.29682 38.0405 40.8670 134.683 100
# m5 23.647 26.7310 30.13404 27.7590 29.8160 77.109 100
# m6 49.863 53.4620 61.31148 56.5460 59.8875 168.610 100
# m7 37.012 40.0960 45.36537 42.1530 45.2370 97.671 100
# Benchmark 2
dat2 <- dat[rep(1:5, 1000), ]
perf2 <- microbenchmark(m1 = {apply(dat2, 1, function(x) which(x >= 0.6))},
                        m2 = {ifelse(dat2$C1 <= 0.4, 2, 1)},
                        m3 = {(dat2$C2 >= 0.6) + 1},
                        m4 = {(which(t(dat2) >= 0.6) + 1) %% ncol(dat2) + 1},
                        m5 = {((dat2 >= 0.6) %*% c(1,2))[, 1]},
                        m6 = {m <- which(dat2 >= 0.6, arr.ind = TRUE)
                              m[order(m[, 1]), ][, 2]},
                        m7 = {max.col(dat2 >= 0.6)})
perf2
# Unit: microseconds
# expr min lq mean median uq max neval
# m1 13842.995 14830.2380 17173.18941 15716.2125 16551.8095 165431.735 100
# m2 133.140 146.7630 168.86722 160.6420 179.9195 314.602 100
# m3 22.104 25.7030 31.93827 28.0160 33.9280 67.341 100
# m4 156.787 179.6620 212.97310 210.5055 234.6665 320.257 100
# m5 131.598 148.8195 173.42179 164.2410 189.9440 286.843 100
# m6 403.019 439.2600 496.25370 472.6735 549.0110 791.646 100
# m7 140.337 156.7870 270.48048 179.4055 208.9635 8631.503 100
You can make use of the fact that TRUE = 1 and FALSE = 0:
> df <- read.table(textConnection("C1 C2
+ 0.4 0.6
+ 0.3 0.7
+ 1 0
+ 0.7 0.3
+ 0.1 0.9"), header = T)
> (df$C2 >= 0.6) + 1
[1] 2 2 1 1 2
This uses matrix multiplication:
(dt>=0.6)%*%c(1,2)
[,1]
[1,] 2
[2,] 2
[3,] 1
[4,] 1
[5,] 2
If we consider the case where more than one column can satisfy the condition, then which will be the better option.
I have modified the data so that both columns 1 and 2 satisfy the condition in row 3.
# Data
df <- read.table(text = "C1 C2
0.4 0.6
0.3 0.7
1 1
0.7 0.3
0.1 0.9", header = T, stringsAsFactors = F)
# Use of which with arr.ind = TRUE
which(df >= 0.6, arr.ind = TRUE)
# Result shows row number 3 twice
# row col
#[1,] 3 1
#[2,] 4 1
#[3,] 1 2
#[4,] 2 2
#[5,] 3 2
#[6,] 5 2
Use the vectorized ifelse function:
ifelse(df$C1 <= 0.4, 2, 1)
#[1] 2 2 1 1 2
Here is another possibility using the modulo operator %%. which(t(df) >= 0.6) returns linear indices in row-major order of df, so with two columns the parity of each index encodes the column number:
(which(t(df) >= 0.6) + 1) %% ncol(df) + 1
#[1] 2 2 1 1 2
Sample data
df <- read.table(text =
"C1 C2
0.4 0.6
0.3 0.7
1 0
0.7 0.3
0.1 0.9", header = T)
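One more option, which appears as m7 in the benchmark above but not as a standalone answer: max.col is fast and row-oriented (a sketch; ties.method = "first" makes any ties deterministic):
# a sketch of the benchmark's m7: logical matrix in, column index of the max out
max.col(df >= 0.6, ties.method = "first")
# [1] 2 2 1 1 2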

How to plot the min and max of each row on the y-axis?

I'm new to R and am having a problem plotting the min and max values of a matrix.
The matrix is something like this:
X Y1 Y2 Y3 Y4 Y5
1 0.5 0.6 0.3 0.3 0.2
2 0.3 0.4 0.1 0.7 0.4
3 0.4 0.3 0.5 0.6 0.3
Now I would like to plot the first column (X) on the x-axis, pick out the min and max values of each row (e.g. X=1, Ymin=0.2, Ymax=0.6 in the first row), and plot them on the y-axis.
Could someone help me to figure it out?
Here is one possibility, considering you want a scatterplot.
#reading your data
table = read.table(header=TRUE, text="
X Y1 Y2 Y3 Y4 Y5
1 0.5 0.6 0.3 0.3 0.2
2 0.3 0.4 0.1 0.7 0.4
3 0.4 0.3 0.5 0.6 0.3", sep= " ")
#using a for loop to collect only the data used in the plot (X, min_Y, max_Y)
df = data.frame(X=NA, min_Y=NA, max_Y=NA)
for (i in 1:nrow(table)) { #iterate over the rows of table (the original 1:length(df) only worked by coincidence)
  X = table[i,1] #X value from table
  min_Y = min(table[i,c(2:6)]) #minimum value in columns 2 to 6 of row i
  max_Y = max(table[i,c(2:6)]) #maximum value in columns 2 to 6 of row i
  df = rbind(df, c(X, min_Y, max_Y)) #append X, min_Y, max_Y to df
}
df = df[-1,]
df #df results
X min_Y max_Y
2 1 0.2 0.6
3 2 0.1 0.7
4 3 0.3 0.6
#produce scatterplot with R package ggplot2
library(ggplot2)
ggplot(df) +
  geom_point(aes(x=X, y=min_Y), colour="red") +
  geom_point(aes(x=X, y=max_Y), colour="blue") +
  ylab("Y") +
  theme_bw()
A solution with rbind and two apply calls (one for min, one for max), surely not the best though:
mat <- as.matrix(read.table(header = T, text = "X Y1 Y2 Y3 Y4 Y5
1 0.5 0.6 0.3 0.3 0.2
2 0.3 0.4 0.1 0.7 0.4
3 0.4 0.3 0.5 0.6 0.3"))
mat2 <- t(rbind(X = mat[ ,1], Ymin = apply(mat[ ,-1], 1, min), Ymax = apply(mat[ ,-1], 1, max)))
matplot(mat2[ ,1], mat2[ ,-1], pch = 20, cex = 1.5)
For example using pmin and pmax:
mn = Reduce(pmin,as.list(dat[,-1]))
mx = Reduce(pmax,as.list(dat[,-1]))
library(lattice)
xyplot(mn + mx ~ x, data.frame(x = dat[,1], mn = mn, mx = mx),
       type = 'l', auto.key = T,
       ylab = list(label = "max and min"))
where dat is:
dat <-
read.table(text='
X Y1 Y2 Y3 Y4 Y5
1 0.5 0.6 0.3 0.3 0.2
2 0.3 0.4 0.1 0.7 0.4
3 0.4 0.3 0.5 0.6 0.3',header=TRUE)
So here is (another) way to get the row-wise min and max (using m as your matrix).
z <- t(apply(m, 1,
             function(x) c(x[1], min = min(x[2:length(x)]), max = max(x[2:length(x)]))))
z <- data.frame(z)
z
# X min max
# [1,] 1 0.2 0.6
# [2,] 2 0.1 0.7
# [3,] 3 0.3 0.6
From here, plotting is straightforward.
plot(z$X, z$max, ylim=c(min(z$min),max(z$max)),col="blue")
points(z$X, z$min, col="red")
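A compact base-R variant of the same idea (a sketch, using dat from the lattice answer above): range() returns both extremes at once:
# a sketch: row-wise range(), then matplot draws min and max against X
rng <- t(apply(dat[, -1], 1, range)) # two columns: row min, row max
matplot(dat[, 1], rng, pch = 20, col = c("red", "blue"),
        xlab = "X", ylab = "min and max")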
