Calculations as per the formula - r

I need to do some calculation as per the below formula:
B1 = A1 + (1-A1) * B1
example:
B1 = 0.2 + (1 - 0.2) * 0.4
= 0.52
C1 = 0.4 + (1 - 0.4) * 0.8
= 0.904
D1 = 0.8 + (1 - 0.8) * 0.5
= 0.952
Same logic applied for other rows and other columns, there are total 11.
dataframe:
df
A B C D
0.2 0.4 0.8 0.5
0.4 0.5 0.6 0.2
0.8 0.1 0.5 0.4
0.3 0.4 0.1 0.8
Expected output:
A B C D
0.2 0.52 0.904 0.952
0.4 0.7 0.88 0.904
0.8 0.82 0.91 0.946
0.3 0.58 0.622 0.9244
I tried it for 1 with the below code:
Df <- df[-ncol(df)] + ( 1 – df[-ncol(df)]) * df[-1]
I was able to get the column B as per the output, but not working for rest of the column.
Please help, thanks. BM.

You can do this recursively as follows:
do.call(cbind, Reduce(f = function(A1, B1) A1+(1-A1)*B1,
x = df,
accumulate = TRUE))
Explanation:
Since df is a data.frame which is a list of vectors, Reduce will take each vector and apply your function. Then do.call(cbind,...) combine the results into a data.frame.

Related

Preferred performant procedure for R data.table row-wise operations?

Does the following code represent the preferred procedure for traversing the rows of an R data.table and passing the values found at each row to a function? Or is there a more performant way to do this?
library(data.table)
set.seed(2)
n <- 100
b <- c(0.5, 1.5, -1)
phi <- 0.8
X <- cbind(1, matrix(rnorm(n*2, 0, 1), ncol = 2))
y <- X %*% matrix(b, ncol = 1) + rnorm(n, 0, phi)
d <- data.table(y, X)
setnames(d, c("y", "x0", "x1", "x2"))
logpost <- function(d, b1, b2, b3, phi, mub = 1, taub = 10, a = 0.5, z = 0.7){
N <- nrow(d)
mu <- b1 + b2 * d$x1 + b3 * d$x2
lp <- -N * log(phi) -
(1/(2*phi^2)) * sum( (d$y-mu)^2 ) -
(1/(2*taub^2))*( (b1-mub)^2 + (b2-mub)^2 + (b3-mub)^2 ) -
(a+1)*log(phi) - (z/phi)
lp
}
nn <- 21
grid <- data.table(
expand.grid(b1 = seq(0, 1, len = nn),
b2 = seq(1, 2, len = nn),
b3 = seq(-1.5, -0.5, len = nn),
phi = seq(0.4, 1.2, len = nn)))
grid[, id := 1:.N]
setkey(grid, id)
wraplogpost <- function(dd){
logpost(d, dd$b1, dd$b2, dd$b3, dd$phi)
}
start <- Sys.time()
grid[, lp := wraplogpost(.SD), by = seq_len(nrow(grid))]
difftime(Sys.time(), start)
# Time difference of 2.081544 secs
Edit: display first few records
> head(grid)
b1 b2 b3 phi id lp
1: 0.00 1 -1.5 0.4 1 -398.7618
2: 0.05 1 -1.5 0.4 2 -380.3674
3: 0.10 1 -1.5 0.4 3 -363.5356
4: 0.15 1 -1.5 0.4 4 -348.2663
5: 0.20 1 -1.5 0.4 5 -334.5595
6: 0.25 1 -1.5 0.4 6 -322.4152
I have tried using set but that approach seems inferior
start <- Sys.time()
grid[, lp := NA_real_]
for(i in 1:nrow(grid)){
llpp <- wraplogpost(grid[i])
set(grid, i, "lp", llpp)
}
difftime(Sys.time(), start)
# Time difference of 21.71291 secs
Edit: display first few records
> head(grid)
b1 b2 b3 phi id lp
1: 0.00 1 -1.5 0.4 1 -398.7618
2: 0.05 1 -1.5 0.4 2 -380.3674
3: 0.10 1 -1.5 0.4 3 -363.5356
4: 0.15 1 -1.5 0.4 4 -348.2663
5: 0.20 1 -1.5 0.4 5 -334.5595
6: 0.25 1 -1.5 0.4 6 -322.4152
Suggestions or pointers to the relevant docs would be appreciated.
Edit: per comments:
start <- Sys.time()
grid[, lp := wraplogpost(.SD), by = .I]
difftime(Sys.time(), start)
Warning messages:
1: In b2 * d$x1 :
longer object length is not a multiple of shorter object length
2: In b3 * d$x2 :
longer object length is not a multiple of shorter object length
3: In d$y - mu :
longer object length is not a multiple of shorter object length
> difftime(Sys.time(), start)
Time difference of 0.01199317 secs
>
> head(grid)
b1 b2 b3 phi id lp
1: 0.00 1 -1.5 0.4 1 -620977.2
2: 0.05 1 -1.5 0.4 2 -620977.2
3: 0.10 1 -1.5 0.4 3 -620977.2
4: 0.15 1 -1.5 0.4 4 -620977.2
5: 0.20 1 -1.5 0.4 5 -620977.2
6: 0.25 1 -1.5 0.4 6 -620977.2
which generates the wrong values for lp.
Edit thank you for the comments and responses. I am aware that this scenario could be addressed by using alternative methods, my interest is in what the preferred way to do this is when using data.table.
Edit thank you for the responses again. As there have been none that address the question of how to do this explicitly with data.table, at the moment, I am assuming that there is no ideal way to achieve this without turning to base R.
If you want to have a better performance (time) you could rewrite the rowwise function to a calculation with matrices.
start <- Sys.time()
grid_mat <- as.matrix(grid[, list(b1, b2, b3, 1)])
# function parameters
N <- nrow(d); mub = 1; taub = 10; a = 0.5; z = 0.7
d$const <- 1
# combining d$y - mu in this step already
mu_op <- matrix(c(-d$const, -d$x1, -d$x2, d$y), nrow = 4, byrow = TRUE)
mu_mat <- grid_mat %*% mu_op
mub_mat <- (grid_mat[, c("b1", "b2", "b3")] - mub)^2
# just to save one calculation of the log
phi <- grid$phi
log_phi <- log(grid$phi)
grid$lp2 <- -N * log_phi -
(1/(2*phi^2)) * rowSums(mu_mat^2) -
(1/(2*taub^2))*( rowSums(mub_mat) ) -
(a+1)*log_phi - (z/phi)
head(grid)
difftime(Sys.time(), start)
The first rows:
b1 b2 b3 phi id lp lp2
1: 0.00 1 -1.5 0.4 1 -398.7618 -398.7618
2: 0.05 1 -1.5 0.4 2 -380.3674 -380.3674
3: 0.10 1 -1.5 0.4 3 -363.5356 -363.5356
4: 0.15 1 -1.5 0.4 4 -348.2663 -348.2663
5: 0.20 1 -1.5 0.4 5 -334.5595 -334.5595
6: 0.25 1 -1.5 0.4 6 -322.4152 -322.4152
For the timing:
# on your code on my pc:
Time difference of 4.390684 secs
# my code on my pc:
Time difference of 0.680476 secs
I think you can use matrix multiplication and other vectorization techniques to simplify your code, which helps you avoid running function logpost in a row-wise manner.
Below is a vectorized version of logpost, i.e., logpost2
logpost2 <- function(d, dd, mub = 1, taub = 10, a = 0.5, z = 0.7) {
bmat <- as.matrix(dd[, .(b1, b2, b3)])
xmat <- cbind(1, as.matrix(d[, .(x1, x2)]))
phi <- dd$phi
phi_log <- log(phi)
lp <- -(a + nrow(d) + 1) * phi_log -
(1 / (2 * phi^2)) * colSums((d$y - tcrossprod(xmat, bmat))^2) -
(1 / (2 * taub^2)) * rowSums((bmat - mub)^2) - (z / phi)
lp
}
and you will see
> start <- Sys.time()
> grid[, lp := logpost2(d, .SD)]
> difftime(Sys.time(), start)
Time difference of 0.1966231 secs
and
> head(grid)
b1 b2 b3 phi id lp
1: 0.00 1 -1.5 0.4 1 -398.7618
2: 0.05 1 -1.5 0.4 2 -380.3674
3: 0.10 1 -1.5 0.4 3 -363.5356
4: 0.15 1 -1.5 0.4 4 -348.2663
5: 0.20 1 -1.5 0.4 5 -334.5595
6: 0.25 1 -1.5 0.4 6 -322.4152

How split a heatmap at the specific rows? Error invalid for atomic vectors

I am trying to build a heatmap in R. I wanted to split the heatmap at specific rows. For example my matrix is as:
ID A B C
FD_1 0.3 0.2 1
FD_2 0.4 1 0.9
FD_3 0.6 0.8 0.2
FS_1 0.3 0.2 1
FS_2 0.4 1 0.9
FS_3 0.6 0.8 0.2
FS_4 0.4 1 0.9
FS_5 0.6 0.8 0.2
FE_1 0.3 0.2 1
FE_2 0.4 1 0.9
FE_3 0.6 0.8 0.2
FE_4 0.4 1 0.9
I need to make a heatmap that includes 3 slice: one for 3 FD, one for 5 FS and one for 4 FE. And label each slice with their name as FD, FS and FE.
I'm using this code:
Heatmap(M_matrix, name = "level", row_split = M_matrix$ID)
But I'm getting this error:
Error in M_matrix$ID : $ operator is invalid for atomic vectors
Any suggestion?
Thanks
You can define the splits based on your ID column:
library(ComplexHeatmap)
ID=c(paste0("FD_", 1:3), paste0("FS_", 1:5), paste0("FE_", 1:4))
df <- data.frame(ID=ID,
matrix(rnorm(3*12, mean = 3), ncol=3,
dimnames=list(ID, LETTERS[1:3])),
stringsAsFactors = FALSE)
splits <- factor(gsub("_.*", "", ID))
Heatmap(matrix=as.matrix(df[,-1] ), row_split = splits, cluster_row_slices = FALSE)
If you want list of dataframes based on ID, we can use split.
list_df <- split(df, sub("_.*", "", df$ID))
list_df
#$FD
# ID A B C
#1 FD_1 0.3 0.2 1.0
#2 FD_2 0.4 1.0 0.9
#3 FD_3 0.6 0.8 0.2
#$FE
# ID A B C
#9 FE_1 0.3 0.2 1.0
#10 FE_2 0.4 1.0 0.9
#11 FE_3 0.6 0.8 0.2
#12 FE_4 0.4 1.0 0.9
#$FS
# ID A B C
#4 FS_1 0.3 0.2 1.0
#5 FS_2 0.4 1.0 0.9
#6 FS_3 0.6 0.8 0.2
#7 FS_4 0.4 1.0 0.9
#8 FS_5 0.6 0.8 0.2
We can use group_split
library(dplyr)
library(stringr)
list_df <- df %>%
group_split(grp = str_remove(ID, "_.*"), keep = FALSE)

Looping Through R data to replace all values

Basic level R programmer trying to re-calibrate data using a weighted effect and some other value. In particular I want to 1) if the weighted effect is negative take the row value of X and subtract the person's value or 2) if the weighted effect is positive take the person's value and subtract X.
Mock data:
p1 <- c(0.4,0.7,0.3,0.2)
p2 <- c(0.8,0.4,0.5,0.1)
p3 <- c(0.6,0.5,0.4,0.3)
wef <- c(1.5,-1.2,1.8,-1.3)
x <- c(0.5,0.4,0.6,0.2)
print(df)
p1 p2 p3 wef x
1 0.4 0.8 0.6 1.5 0.5
2 0.7 0.4 0.5 -1.2 0.4
3 0.3 0.5 0.4 1.8 0.6
4 0.2 0.1 0.3 -1.3 0.2
I attempted this (which did nothing and likely would be inefficient with for loops):
for(row in 1:nrow(df)) {
for(col in 1:ncol(df)) {
ifelse(weightef[row] < 0, df[row,col]==(df$x[row]-df[row,col]),
df[row,col]==df[row,col]-df$x[row])
}
}
my desired output in case the above was to hard to follow is this
person1 person2 person3 weightef x
1 -0.1 0.3 0.1 1.5 0.5
2 -0.3 0.0 -0.1 -1.2 0.4
3 -0.3 -0.1 -0.2 1.8 0.6
4 0.0 0.1 -0.1 -1.3 0.2
You can using apply and ifelse function in R. This is just one line function, and you are not required to understand grep. The second line of code just put everything into data frame.
result <- apply(df[, 1:3], 2, FUN = function(y) with(df, ifelse(wef < 0, x - y, y - x)))
df <- as.data.frame(cbind(result, wef, x))
p1 p2 p3 wef x
1 -0.1 0.3 0.1 1.5 0.5
2 -0.3 0.0 -0.1 -1.2 0.4
3 -0.3 -0.1 -0.2 1.8 0.6
4 0.0 0.1 -0.1 -1.3 0.2
We can do this without a loop in R
nm1 <- grep("^p\\d+", names(df), value = TRUE)
i1 <- df$wef > 0
df[i1, nm1] <- df[i1, nm1] - df$x[i1]
df[!i1, nm1] <- df$x - df[!i1, nm1]
data
df <- data.frame(p1, p2, p3, wef, x)
If you want to use for loops you can do it in this way
#Create dataframe
df = data.frame(p1, p2, p3, wef, x)
#looping lenght of vector wef
for (w in 1:length(df$wef))
{
#Checking positive or negative weight
if (df$wef[w] >= 0)
{
#subtracting
df$p1[w] = df$p1[w] - df$x[w]
df$p2[w] = df$p2[w] - df$x[w]
df$p3[w] = df$p3[w] - df$x[w]
}
else
{
#subtracting
df$p1[w] = df$x[w] - df$p1[w]
df$p2[w] = df$x[w] - df$p2[w]
df$p3[w] = df$x[w] - df$p3[w]
}
}
#print result
print(df)
p1 p2 p3 wef x
1 -0.1 0.3 0.1 1.5 0.5
2 -0.3 0.0 -0.1 -1.2 0.4
3 -0.3 -0.1 -0.2 1.8 0.6
4 0.0 0.1 -0.1 -1.3 0.2

Defining a function that includes for loops

I have 2 data frames.
One (df1) has columns for slopes and intercepts, and the other (df2) has an index column (i.e., row numbers).
I wish to apply a function based on parameters from df1 to the entire index column in df2. I don't want the function to mix and match slopes and intercepts (i.e., I want to make sure that the function always uses slopes and intercepts from the same columns in df1).
I tried to do this
my_function <- function(x) {for (i in df1$slope) for (j in df1$intercept) {((i*x)+j)}}
df3 <- for (k in df2$Index) {my_function(k)}
df3
but it didn't work.
Here are sample data:
> df1
thermocouple slope intercept
1 1 0.01 0.5
2 2 -0.01 0.4
3 3 0.03 0.2
> df2
index t_1 t_2 t_3
1 1 0.3 0.2 0.2
2 2 0.5 0.2 0.3
3 3 0.3 0.9 0.1
4 4 1.2 1.8 0.4
5 5 2.3 3.1 1.2
Here would be the output I need:
index baseline_t_1 baseline_t_2 baseline_t_3
1 0.51 0.39 0.23
2 0.52 0.38 0.26
3 0.53 0.37 0.29
4 0.54 0.36 0.32
5 0.55 0.35 0.35
What am I doing wrong?
Thanks!
Try this:
By passing three arguments with values at the same time to a anonymous function defined in Map.
Map( function(index, slope, intercept) (index * slope ) + intercept,
index = df2$Index, slope = df1$slope, intercept = df1$intercept)
May be this: I am not sure which one you prefer given there is no data and expected output in the question.
lapply( df2$index, function(index){
unlist( Map( function(slope, intercept) (index * slope ) + intercept,
slope = df1$slope, intercept = df1$intercept) )
})

How to plot the min and max of each row in y-axis?

I'm new and learning R and got a problem plotting the min and max value of a matrix.
The matrix is something like this:
X Y1 Y2 Y3 Y4 Y5
1 0.5 0.6 0.3 0.3 0.2
2 0.3 0.4 0.1 0.7 0.4
3 0.4 0.3 0.5 0.6 0.3
Now I would like to plot the first column(X) as x-axis, and pick out the min and max values of each row (e.g. X=1, Ymin=0.2, Ymax=0.6 in the first row), and plot them as the y-axis.
Could someone help me to figure it out?
Here is one possibility, considering you want a scatterplot.
#reading your data
table = read.table(header=TRUE, text="
X Y1 Y2 Y3 Y4 Y5
1 0.5 0.6 0.3 0.3 0.2
2 0.3 0.4 0.1 0.7 0.4
3 0.4 0.3 0.5 0.6 0.3", sep= " ")
#using a for loop to filter only data to be used in the plot (X, Min_Y, Max_Y)
df = data.frame(X=NA,min_Y=NA,max_Y=NA)
for (i in c(1:length(df))) {
X = table[i,1] #X values from table
min_Y = c(min(table[i,c(2:6)])) #minimum values inside table columns 2 to 6
max_Y = c(max(table[i,c(2:6)])) #maximum values inside table columns 2 to 6
df = rbind(df,c(X,min_Y,max_Y)) #new df with X, Min_Y, Max_Y
}
df = df[-1,]
df #df results
X min_Y max_Y
2 1 0.2 0.6
3 2 0.1 0.7
4 3 0.3 0.6
#produce scatterplot with R package ggplot2
library(ggplot2)
ggplot(df) +
geom_point(aes(x=X,y=min_Y),colour="red") +
geom_point(aes(x=X,y=max_Y),colour="blue") +
ylab("Y") +
theme_bw()
A solution with rbind and 2 apply functions (for min and max) (surely not the best tho) :
mat <- as.matrix(read.table(header = T, text = "X Y1 Y2 Y3 Y4 Y5
1 0.5 0.6 0.3 0.3 0.2
2 0.3 0.4 0.1 0.7 0.4
3 0.4 0.3 0.5 0.6 0.3"))
mat2 <- t(rbind(X = mat[ ,1], Ymin = apply(mat[ ,-1], 1, min), Ymax = apply(mat[ ,-1], 1, max)))
matplot(mat2[ ,1], mat2[ ,-1], pch = 20, cex = 1.5)
For example using pmin and pmax:
mn = Reduce(pmin,as.list(dat[,-1]))
mx = Reduce(pmax,as.list(dat[,-1]))
library(lattice)
xyplot(mn+mx~x,data.frame(x= dat[,1],mn=mn,mx=mx),
type='l',auto.key=T,
ylab=list(label="max and min"))
Where dat is :
dat <-
read.table(text='
X Y1 Y2 Y3 Y4 Y5
1 0.5 0.6 0.3 0.3 0.2
2 0.3 0.4 0.1 0.7 0.4
3 0.4 0.3 0.5 0.6 0.3',header=TRUE)
So here is (another...) way to get the column-wise min and max (using m as your matrix).
z <- t(apply(m,1,
function(x)return(c(x[1],min=min(x[2:length(x)]),max=max(x[2:length(x)])))))
z <- data.frame(z)
z
# X min max
# [1,] 1 0.2 0.6
# [2,] 2 0.1 0.7
# [3,] 3 0.3 0.6
From here, plotting is straightforward.
plot(z$X, z$max, ylim=c(min(z$min),max(z$max)),col="blue")
points(z$X, z$min, col="red")

Resources