I have two columns whose values in each row sum to 1 (they are the probabilities of two classes). I need to find the column number where a condition is met.
C1 C2
0.4 0.6
0.3 0.7
1 0
0.7 0.3
0.1 0.9
For example, if I need to find the column where the number is >= 0.6,
in the table above it should result in:
2
2
1
1
2
Thanks for this interesting question. Here is an idea using apply.
apply(dat, 1, function(x) which(x >= 0.6))
# [1] 2 2 1 1 2
DATA
dat <- read.table(textConnection("C1 C2
0.4 0.6
0.3 0.7
1 0
0.7 0.3
0.1 0.9"), header = T)
Benchmarking
I ran the benchmark on the original data frame dat and on dat2, a data frame with 5000 rows. The results are as follows. I feel a little embarrassed that my apply method is the slowest, though that is expected: apply coerces the data frame to a matrix and loops over the rows, while the other solutions are vectorized.
If anyone has an idea how to improve the way I conducted the benchmark, please let me know.
library(microbenchmark)
# Benchmark 1
perf <- microbenchmark(m1 = {apply(dat, 1, function(x) which(x >= 0.6))},
m2 = {ifelse(dat$C1 <= 0.4, 2, 1)},
m3 = {(dat$C2 >= 0.6) + 1},
m4 = {(which(t(dat) >= 0.6) + 1) %% ncol(dat) + 1},
m5 = {((dat>=0.6) %*% c(1,2))[, 1]},
m6 = {m <- which(dat >= 0.6, arr.ind = TRUE)
m[order(m[, 1]), ][, 2]},
m7 = {max.col(dat >= 0.6)})
perf
# Unit: microseconds
# expr min lq mean median uq max neval
# m1 58.602 65.0280 88.34563 67.5985 70.6825 1746.246 100
# m2 9.253 12.8515 15.45772 13.8790 14.9080 49.349 100
# m3 4.112 5.6540 6.59015 6.1690 7.1970 23.132 100
# m4 30.844 35.7270 40.29682 38.0405 40.8670 134.683 100
# m5 23.647 26.7310 30.13404 27.7590 29.8160 77.109 100
# m6 49.863 53.4620 61.31148 56.5460 59.8875 168.610 100
# m7 37.012 40.0960 45.36537 42.1530 45.2370 97.671 100
# Benchmark 2
dat2 <- dat[rep(1:5, 1000), ]
perf2 <- microbenchmark(m1 = {apply(dat2, 1, function(x) which(x >= 0.6))},
m2 = {ifelse(dat2$C1 <= 0.4, 2, 1)},
m3 = {(dat2$C2 >= 0.6) + 1},
m4 = {(which(t(dat2) >= 0.6) + 1) %% ncol(dat2) + 1},
m5 = {((dat2 >= 0.6) %*% c(1,2))[, 1]},
m6 = {m <- which(dat2 >= 0.6, arr.ind = TRUE)
m[order(m[, 1]), ][, 2]},
m7 = {max.col(dat2 >= 0.6)})
perf2
# Unit: microseconds
# expr min lq mean median uq max neval
# m1 13842.995 14830.2380 17173.18941 15716.2125 16551.8095 165431.735 100
# m2 133.140 146.7630 168.86722 160.6420 179.9195 314.602 100
# m3 22.104 25.7030 31.93827 28.0160 33.9280 67.341 100
# m4 156.787 179.6620 212.97310 210.5055 234.6665 320.257 100
# m5 131.598 148.8195 173.42179 164.2410 189.9440 286.843 100
# m6 403.019 439.2600 496.25370 472.6735 549.0110 791.646 100
# m7 140.337 156.7870 270.48048 179.4055 208.9635 8631.503 100
You can make use of the fact that TRUE = 1 and FALSE = 0:
> df <- read.table(textConnection("C1 C2
+ 0.4 0.6
+ 0.3 0.7
+ 1 0
+ 0.7 0.3
+ 0.1 0.9"), header = T)
> (df$C2 >= 0.6) + 1
[1] 2 2 1 1 2
This uses matrix multiplication: each row of the logical matrix df >= 0.6 picks out its matching column's weight from c(1, 2).
(df >= 0.6) %*% c(1, 2)
[,1]
[1,] 2
[2,] 2
[3,] 1
[4,] 1
[5,] 2
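To get a plain vector instead of a one-column matrix, drop the dimension (this is the m5 variant benchmarked above):
((df >= 0.6) %*% c(1, 2))[, 1]
# 1 2 3 4 5
# 2 2 1 1 2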
If more than one column can satisfy the condition, then which is the better option.
I have modified the data so that both columns 1 and 2 satisfy the condition in row 3.
# Data
df <- read.table(text = "C1 C2
0.4 0.6
0.3 0.7
1 1
0.7 0.3
0.1 0.9", header = T, stringsAsFactors = F)
# Use of which with arr.ind = TRUE
which(df >= 0.6, arr.ind = TRUE)
# Result shows row number 3 twice
# row col
#[1,] 3 1
#[2,] 4 1
#[3,] 1 2
#[4,] 2 2
#[5,] 3 2
#[6,] 5 2
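If a per-row list of the matching columns is needed, the arr.ind result can be split by row (a small sketch building on the call above):
m <- which(df >= 0.6, arr.ind = TRUE)
split(unname(m[, "col"]), m[, "row"])
# $`1`
# [1] 2
# ...
# $`3`
# [1] 1 2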
Use the vectorized ifelse. Since each row sums to 1, C1 <= 0.4 is equivalent to C2 >= 0.6:
ifelse(df$C1 <= 0.4, 2, 1)
#[1] 2 2 1 1 2
Here is another possibility using the modulo operator %%. Transposing df makes which return one linear index per row (in row order), and the modular arithmetic maps each index back to its column number:
(which(t(df) >= 0.6) + 1) %% ncol(df) + 1
#[1] 2 2 1 1 2
Sample data
df <- read.table(text =
"C1 C2
0.4 0.6
0.3 0.7
1 0
0.7 0.3
0.1 0.9", header = T)
Related
I have two dataframes A and B:
A
x y
1 0.0 0.0000000
2 0.5 0.8000000
3 -0.5 0.8000000
4 -1.0 0.0000000
5 -0.5 -0.8000000
6 0.5 -0.8000000
7 1.0 0.0000000
8 1.5 0.8000000
B
x y
1 -1.0 0.0000000
2 0.5 -0.8000000
3 3.0 0.0000000
I want to extract just the row indexes in A that exist in B so that the final result will be:
c(4,6)
How should I go about doing this?
interaction can be used to apply %in% across multiple columns by collapsing each row into a single factor level.
which(interaction(A) %in% interaction(B))
#[1] 4 6
Data
A <- read.table(header=TRUE, text=" x y
1 0.0 0.0000000
2 0.5 0.8000000
3 -0.5 0.8000000
4 -1.0 0.0000000
5 -0.5 -0.8000000
6 0.5 -0.8000000
7 1.0 0.0000000
8 1.5 0.8000000")
B <- read.table(header=TRUE, text=" x y
1 -1.0 0.0000000
2 0.5 -0.8000000
3 3.0 0.0000000")
Add another column to A that is just a row sequence, and then merge (note that merge sorts the result by the join columns):
> A$c=1:nrow(A)
> merge(A,B)$c
[1] 4 6
Using the join.keys function from plyr, which assigns matching integer keys to the rows of both data frames:
library(plyr)
with(join.keys(A, B), which(x %in% y))
Output:
[1] 4 6
One possible way is to test each column with %in% and keep the rows where both tests are TRUE, i.e., where the sum of the two logicals equals 2. Note that this checks column membership independently, so it can over-match when matching x and y values come from different rows of B. If the data frames get wider, you can map over the columns (see the sketch below).
which((A$x %in% B$x) + (A$y %in% B$y) == 2)
# [1] 4 6
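A sketch of the same idea generalized with Map and Reduce, assuming A and B have identical column names and order (the same over-matching caveat applies):
# one membership test per column, then require every column to match
which(Reduce(`+`, Map(`%in%`, A, B)) == ncol(A))
# [1] 4 6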
Using outer
which(rowSums(outer(1:nrow(A), 1:nrow(B), Vectorize(\(i, j) all(A[i, ] == B[j, ])))) == 1)
# [1] 4 6
or a for loop
r <- c()
for (j in seq_len(nrow(B))) {
  for (i in seq_len(nrow(A))) {
    if (all(A[i, ] == B[j, ])) r <- c(r, i)
  }
}
r
# [1] 4 6
library(data.table)
setDT(A)
setDT(B)
A[fintersect(A, B), on = names(A), which = TRUE]
# [1] 4 6
library(dplyr)
A |>
mutate(row = row_number()) |>
inner_join(B, by = names(A)) |>
pull(row)
# [1] 4 6
Data
A = data.frame(
x = c(0, 0.5, -0.5, -1, -0.5, 0.5, 1, 1.5),
y = c(0, 0.8, 0.8, 0, -0.8, -0.8, 0, 0.8))
B = data.frame(
x = c(-1, 0.5, 3), y = c(0, -0.8, 0)
)
I would like to create a function in R that rounds numeric dataframes (or columns in a dataframe) depending on the number. If the number is less than 1, round to 1 decimal, but if it is greater than 1, round to 0 decimals.
This is what I have
data <- data.frame(x = c(1.111, 0.809, 5.55555, 0.567), y = c(0.235, 0.777, 4.55555555, 393.55))
round0 <- function(x) format(round(x, digits=0), nsmall = 0, trim = TRUE)
round0(data)
x y
1 1 0
2 1 1
3 6 5
4 1 394
# What I want
x y
1 1 0.2
2 0.8 1
3 6 5
4 1 394
> round0 <- function(x) ifelse(x<1,round(x,1),round(x))
> sapply(data,round0)
x y
[1,] 1.0 0.2
[2,] 0.8 0.8
[3,] 6.0 5.0
[4,] 0.6 394.0
You can use:
round0 <- function(x) ifelse(x < 1, format(round(x, 1), nsmall = 1), round(x))
data[] <- lapply(data, round0)
data
# x y
#1 1 0.2
#2 0.8 0.8
#3 6 5
#4 0.6 394
Note that this is only for display purposes: the columns are now of type character. If you want to perform any mathematical calculation on them, you need to convert them back to numeric.
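For example, to get numeric columns back (giving up the display formatting again):
data[] <- lapply(data, as.numeric)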
Does the following code represent the preferred procedure for traversing the rows of an R data.table and passing the values found at each row to a function? Or is there a more performant way to do this?
library(data.table)
set.seed(2)
n <- 100
b <- c(0.5, 1.5, -1)
phi <- 0.8
X <- cbind(1, matrix(rnorm(n*2, 0, 1), ncol = 2))
y <- X %*% matrix(b, ncol = 1) + rnorm(n, 0, phi)
d <- data.table(y, X)
setnames(d, c("y", "x0", "x1", "x2"))
logpost <- function(d, b1, b2, b3, phi, mub = 1, taub = 10, a = 0.5, z = 0.7){
  N <- nrow(d)
  mu <- b1 + b2 * d$x1 + b3 * d$x2
  lp <- -N * log(phi) -
    (1/(2*phi^2)) * sum( (d$y-mu)^2 ) -
    (1/(2*taub^2))*( (b1-mub)^2 + (b2-mub)^2 + (b3-mub)^2 ) -
    (a+1)*log(phi) - (z/phi)
  lp
}
nn <- 21
grid <- data.table(
  expand.grid(b1 = seq(0, 1, len = nn),
              b2 = seq(1, 2, len = nn),
              b3 = seq(-1.5, -0.5, len = nn),
              phi = seq(0.4, 1.2, len = nn)))
grid[, id := 1:.N]
setkey(grid, id)
wraplogpost <- function(dd){
  logpost(d, dd$b1, dd$b2, dd$b3, dd$phi)
}
start <- Sys.time()
grid[, lp := wraplogpost(.SD), by = seq_len(nrow(grid))]
difftime(Sys.time(), start)
# Time difference of 2.081544 secs
Edit: display first few records
> head(grid)
b1 b2 b3 phi id lp
1: 0.00 1 -1.5 0.4 1 -398.7618
2: 0.05 1 -1.5 0.4 2 -380.3674
3: 0.10 1 -1.5 0.4 3 -363.5356
4: 0.15 1 -1.5 0.4 4 -348.2663
5: 0.20 1 -1.5 0.4 5 -334.5595
6: 0.25 1 -1.5 0.4 6 -322.4152
I have tried using set, but that approach seems inferior:
start <- Sys.time()
grid[, lp := NA_real_]
for(i in 1:nrow(grid)){
  llpp <- wraplogpost(grid[i])
  set(grid, i, "lp", llpp)
}
difftime(Sys.time(), start)
# Time difference of 21.71291 secs
Edit: display first few records
> head(grid)
b1 b2 b3 phi id lp
1: 0.00 1 -1.5 0.4 1 -398.7618
2: 0.05 1 -1.5 0.4 2 -380.3674
3: 0.10 1 -1.5 0.4 3 -363.5356
4: 0.15 1 -1.5 0.4 4 -348.2663
5: 0.20 1 -1.5 0.4 5 -334.5595
6: 0.25 1 -1.5 0.4 6 -322.4152
Suggestions or pointers to the relevant docs would be appreciated.
Edit: per comments:
start <- Sys.time()
grid[, lp := wraplogpost(.SD), by = .I]
difftime(Sys.time(), start)
Warning messages:
1: In b2 * d$x1 :
longer object length is not a multiple of shorter object length
2: In b3 * d$x2 :
longer object length is not a multiple of shorter object length
3: In d$y - mu :
longer object length is not a multiple of shorter object length
> difftime(Sys.time(), start)
Time difference of 0.01199317 secs
>
> head(grid)
b1 b2 b3 phi id lp
1: 0.00 1 -1.5 0.4 1 -620977.2
2: 0.05 1 -1.5 0.4 2 -620977.2
3: 0.10 1 -1.5 0.4 3 -620977.2
4: 0.15 1 -1.5 0.4 4 -620977.2
5: 0.20 1 -1.5 0.4 5 -620977.2
6: 0.25 1 -1.5 0.4 6 -620977.2
which generates the wrong values for lp: the recycling warnings show that logpost received the full columns in a single group rather than one row at a time.
Edit: thank you for the comments and responses. I am aware that this scenario could be addressed with alternative methods; my interest is in the preferred way to do this when using data.table.
Edit: thank you again for the responses. As none address how to do this explicitly with data.table, for the moment I am assuming that there is no ideal way to achieve it without turning to base R.
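(A small data.table-native sketch, not from the answers below: assuming id stays a unique row key, grouping by it evaluates the wrapper once per row, equivalent to the by = seq_len(nrow(grid)) call above.)
# row-wise grouping via the unique id key; .SD carries b1, b2, b3, phi
grid[, lp := wraplogpost(.SD), by = id]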
If you want better performance (time), you can rewrite the row-wise function as a calculation with matrices.
start <- Sys.time()
grid_mat <- as.matrix(grid[, list(b1, b2, b3, 1)])
# function parameters
N <- nrow(d); mub = 1; taub = 10; a = 0.5; z = 0.7
d$const <- 1
# combining d$y - mu in this step already
mu_op <- matrix(c(-d$const, -d$x1, -d$x2, d$y), nrow = 4, byrow = TRUE)
mu_mat <- grid_mat %*% mu_op
mub_mat <- (grid_mat[, c("b1", "b2", "b3")] - mub)^2
# just to save one calculation of the log
phi <- grid$phi
log_phi <- log(grid$phi)
grid$lp2 <- -N * log_phi -
(1/(2*phi^2)) * rowSums(mu_mat^2) -
(1/(2*taub^2))*( rowSums(mub_mat) ) -
(a+1)*log_phi - (z/phi)
head(grid)
difftime(Sys.time(), start)
The first rows:
b1 b2 b3 phi id lp lp2
1: 0.00 1 -1.5 0.4 1 -398.7618 -398.7618
2: 0.05 1 -1.5 0.4 2 -380.3674 -380.3674
3: 0.10 1 -1.5 0.4 3 -363.5356 -363.5356
4: 0.15 1 -1.5 0.4 4 -348.2663 -348.2663
5: 0.20 1 -1.5 0.4 5 -334.5595 -334.5595
6: 0.25 1 -1.5 0.4 6 -322.4152 -322.4152
For the timing:
# on your code on my pc:
Time difference of 4.390684 secs
# my code on my pc:
Time difference of 0.680476 secs
I think you can use matrix multiplication and other vectorization techniques to simplify your code, which lets you avoid running logpost row by row.
Below is a vectorized version of logpost, i.e., logpost2:
logpost2 <- function(d, dd, mub = 1, taub = 10, a = 0.5, z = 0.7) {
  bmat <- as.matrix(dd[, .(b1, b2, b3)])
  xmat <- cbind(1, as.matrix(d[, .(x1, x2)]))
  phi <- dd$phi
  phi_log <- log(phi)
  lp <- -(a + nrow(d) + 1) * phi_log -
    (1 / (2 * phi^2)) * colSums((d$y - tcrossprod(xmat, bmat))^2) -
    (1 / (2 * taub^2)) * rowSums((bmat - mub)^2) - (z / phi)
  lp
}
and you will see
> start <- Sys.time()
> grid[, lp := logpost2(d, .SD)]
> difftime(Sys.time(), start)
Time difference of 0.1966231 secs
and
> head(grid)
b1 b2 b3 phi id lp
1: 0.00 1 -1.5 0.4 1 -398.7618
2: 0.05 1 -1.5 0.4 2 -380.3674
3: 0.10 1 -1.5 0.4 3 -363.5356
4: 0.15 1 -1.5 0.4 4 -348.2663
5: 0.20 1 -1.5 0.4 5 -334.5595
6: 0.25 1 -1.5 0.4 6 -322.4152
I have a dataset like this:
library(data.table)
library(EnvStats)
DT <- data.table(MEAN = c(0.5,0.7,0.9,0.8),MIN = c(0.4,0.6,0.8,0.8),MAX = c(0.6,0.8,1,0.8),REF = rnorm(4,1000,200))
I compute a variable containing a list of simulated values from the variables MEAN, MIN and MAX, but I need to return NA values if MEAN <= MIN or MEAN >= MAX.
Something like this:
DT[, Sim_rtri := Map(function(w, x, y, z)
  ifelse(z <= x | z >= y, NA,
         w * (1 + rtri(n = 10000, min = x, max = y, mode = z))),
  REF, MIN, MAX, MEAN)]
But I get an output with just a single value in all cases. Like this:
DT
MEAN MIN MAX REF Sim_rtri
1: 0.5 0.4 0.6 1274.1917 1957.572
2: 0.7 0.6 0.8 887.0604 1503.418
3: 0.9 0.8 1.0 1072.6257 2011.683
4: 0.8 0.8 0.8 1126.5725 NA
Instead my desired output is:
DT
MEAN MIN MAX REF Sim_rtri
1: 0.5 0.4 0.6 1274.1917 1946.223,1849.996,1933.170,1940.845,1905.784,1943.204,...
2: 0.7 0.6 0.8 887.0604 1512.938,1530.315,1480.203,1542.298,1500.740,1513.961,...
3: 0.9 0.8 1.0 1072.6257 2055.113,2085.123,1991.335,2022.209,2010.288,1984.313,...
4: 0.8 0.8 0.8 1126.5725 NA, NA, NA, NA, NA,....
How can I get this?
Here, we need if/else instead of ifelse, because ifelse returns a result with the same shape as its test argument. Since we loop over the rows with Map, each test is of length 1, so ifelse truncates the 10000 simulated values from rtri to a single value.
DT[, Sim_rtri := Map(function(w, x, y, z)
  if (z <= x | z >= y) NA
  else w * (1 + rtri(n = 10000, min = x, max = y, mode = z)),
  REF, MIN, MAX, MEAN)]
DT
# MEAN MIN MAX REF Sim_rtri
#1: 0.5 0.4 0.6 1292.650 1894.088,1943.958,1935.992,1881.802,1918.530,1975.114,...
#2: 0.7 0.6 0.8 1037.545 1817.918,1742.029,1719.068,1753.283,1786.834,1803.231,...
#3: 0.9 0.8 1.0 1204.405 2327.954,2306.122,2198.317,2223.934,2235.752,2328.857,...
#4: 0.8 0.8 0.8 881.633 NA
Also, if we need n NAs instead of a single one, just replicate the NA in the if branch:
if(z <= x | z >= y) rep(NA, 10000)
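Putting the pieces together, a sketch of the full call with replicated NAs (using NA_real_ so the list elements stay numeric):
DT[, Sim_rtri := Map(function(w, x, y, z)
  if (z <= x | z >= y) rep(NA_real_, 10000)
  else w * (1 + rtri(n = 10000, min = x, max = y, mode = z)),
  REF, MIN, MAX, MEAN)]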
Recently, I have found that I am using the following pattern over and over again. The process is:
1. cross-tabulate the numeric variable by the factor using table
2. create a data frame from the resulting table
3. add the original numeric values to the data frame (from the row names (!))
4. remove the row names
5. reorder the columns of the aggregated data frame
In R, it looks like this:
# Sample data
df <- data.frame(x = round(runif(100), 1),
y = factor(ifelse(runif(100) > .5, 1, 0),
labels = c('failure', 'success'))
)
# Get frequencies
dfSummary <- as.data.frame.matrix(table(df$x, df$y))
# Add column of original values from rownames
dfSummary$x <- as.numeric(rownames(dfSummary))
# Remove rownames
rownames(dfSummary) <- NULL
# Reorder columns
dfSummary <- dfSummary[, c(3, 1, 2)]
Is there anything more elegant in R, preferably using base functions? I know I can use SQL to do this in a single command, so it should be possible to achieve similar behavior in R.
sqldf solution:
library(sqldf)
dfSummary <- sqldf("select
x,
sum(y = 'failure') as failure,
sum(y = 'success') as success
from df group by x")
An alternative with base R could be:
aggregate(. ~ x, transform(df, success = y == "success",
          failure = y == "failure", y = NULL), sum)
# x success failure
#1 0.0 2 4
#2 0.1 6 8
#3 0.2 1 7
#4 0.3 5 4
#5 0.4 6 6
#6 0.5 3 3
#7 0.6 4 6
#8 0.7 6 6
#9 0.8 4 5
#10 0.9 6 7
#11 1.0 1 0
Your code, wrapped in a function, is efficient compared to the other base R solutions (so far). If you want the code in one line, a "reshape/table" combo from base R can be used.
reshape(as.data.frame(table(df)), idvar='x', timevar='y',
direction='wide')
# x Freq.failure Freq.success
#1 0 3 2
#2 0.1 3 9
#3 0.2 5 5
#4 0.3 8 7
#5 0.4 5 3
#6 0.5 9 4
#7 0.6 3 6
#8 0.7 7 6
#9 0.8 3 1
#10 0.9 4 3
#11 1 0 4
In case you want to try data.table
library(data.table)
dcast.data.table(setDT(df), x~y)
# x failure success
# 1: 0.0 3 2
# 2: 0.1 3 9
# 3: 0.2 5 5
# 4: 0.3 8 7
# 5: 0.4 5 3
# 6: 0.5 9 4
# 7: 0.6 3 6
# 8: 0.7 7 6
# 9: 0.8 3 1
#10: 0.9 4 3
#11: 1.0 0 4
Update
I didn't notice that as.data.frame(table(...)) converts the columns to factors (thanks to @Hadley's comment). A workaround is:
res <- transform(reshape(as.data.frame(table(df), stringsAsFactors=FALSE),
idvar='x', timevar='y', direction='wide'), x= as.numeric(x))
data
set.seed(24)
df <- data.frame(x = round(runif(100), 1),
y = factor(ifelse(runif(100) > .5, 1, 0),
labels = c('failure', 'success'))
)
Benchmarks
set.seed(24)
df <- data.frame(x = round(runif(1e6), 1),
y = factor(ifelse(runif(1e6) > .5, 1, 0),
labels = c('failure', 'success'))
)
tomas <- function(){
  dfSummary <- as.data.frame.matrix(table(df$x, df$y))
  dfSummary$x <- as.numeric(rownames(dfSummary))
  dfSummary <- dfSummary[, c(3, 1, 2)]
}
doc <- function(){
  aggregate(. ~ x, transform(df,
    success = y == "success", failure = y == "failure",
    y = NULL), sum)
}
akrun <- function(){
  reshape(as.data.frame(table(df)),
    idvar='x', timevar='y', direction='wide')
}
library(microbenchmark)
microbenchmark(tomas(), doc(), akrun(), unit='relative', times=20L)
Unit: relative
#expr min lq mean median uq max neval cld
#tomas() 1.000000 1.0000000 1.000000 1.000000 1.0000000 1.000000 20 a
#doc() 13.451037 11.5050997 13.082074 13.043584 12.8048306 19.715535 20 b
#akrun() 1.019977 0.9522809 1.012332 1.007569 0.9993835 1.533191 20 a
Updated with dcast.data.table
df1 <- copy(df)
akrun2 <- function() {dcast.data.table(setDT(df1), x~y)}
microbenchmark(tomas(), akrun2(), unit='relative', times=20L)
# Unit: relative
# expr min lq mean median uq max neval cld
# tomas() 6.493231 6.345752 6.410853 6.51594 6.502044 5.591753 20 b
# akrun2() 1.000000 1.000000 1.000000 1.00000 1.000000 1.000000 20 a
This should be relatively efficient. You cannot really suppress row names in a data frame, since they are a requirement of a valid data frame.
X <- table(df$x,df$y)
cbind( data.frame(x=rownames(X)), unclass(X) )
x failure success
0 0 5 3
0.1 0.1 6 1
0.2 0.2 7 8
0.3 0.3 7 3
0.4 0.4 6 6
0.5 0.5 6 4
0.6 0.6 2 5
0.7 0.7 2 7
0.8 0.8 3 7
0.9 0.9 4 6
1 1 2 0