Large raster frequency table / counts

Large raster frequency table / counts - r

I try to calculate the frequency/count of pixel values of a raster in R using freq().
Create two example rasters for comparison:
library(raster)
RastSmall <- raster(nrow=70, ncol=70)
RastBig <- raster(nrow=7000, ncol=7000)
set.seed(0)
RastSmall[] <- round(runif(1:ncell(r_hr), 1, 5))
RastBig[] <- round(runif(1:ncell(r_hr), 1, 5))
Get the pixel count using freq()
freq(RastSmall)
value count
[1,] 1 6540000
[2,] 2 12150000
[3,] 3 12140000
[4,] 4 11720000
[5,] 5 6450000
However, it is a fairly large file and takes extremely long, i.e. up to hours. Is there a faster way in R?
Here the speed difference for a small and a large raster:
system.time(freq(RastSmall))
user system elapsed
0.008 0.000 0.004
system.time(freq(RastBig))
user system elapsed
40.484 0.964 41.445
Is there a way to speed this up? Alternatively can this be done in the command line using something like gdal tools?

I did exactly that last week, however I couldn't find other faster ways to do it in R. I've tried to do it with the rqgis package by calling the r.report of GRASS. It works but was slower than the R native function. Maybe you'll have a better luck. Here is my code with grass in case you want to try it:
library(RQGIS)
monqgis <- set_env("C:\\Mrnmicro\\Applic\\OSGeo4W")
find_algorithms(search_term = "report", qgis_env = monqgis)
get_usage(alg = "grass7:r.report", qgis_env = monqgis)
params <- get_args_man(alg = "grass7:r.report", qgis_env = monqgis)
get_usage(alg = "grass7:r.report", qgis_env = monqgis)
params$map <- classif
params$units <- 5
params$rawoutput <- "C:\\temp\\outputRQGIS_raw"
params$html <- "C:\\temp\\outputRQGIS"
system.time(asas <- run_qgis(alg = "grass7:r.report", params=params,load_output = params$OUTPUT, qgis_env = monqgis))

not an amazing saving but if you getValues from your raster and then run the base::table function, it saves about 20%. My raster was c.500m cells.
# read in raster to obtain frequency table
r <- raster("./path/myraster.tif")
# perform tests; traditional freq() vs. getValues() & table()
require(microbenchmark)
mbm <- microbenchmark(
Freq = {freqf <- freq(r,useNA="no");
freq.df <- data.frame(CODE=freqf[,1], N=freqf[,2]},
GetVals = {v <- getValues(r);
vt <- table(v);
getval.df <- data.frame(CODE=as.numeric(names(vt)),N=as.numeric(as.matrix(vt)))},
times=5
)
mbm
Unit: seconds
expr min lq mean median uq max neval
Freq 191.1649 191.8001 198.8567 192.5256 193.0986 225.6942 5
GetVals 153.5552 154.8776 156.9173 157.0539 159.0400 160.0598 5
# check the routines have identical results
identical(freq.df,getval.df)
[1] TRUE
bit of a saving i guess
(N.B. the reason i make the data frames is that I go on to process the data that comes out of the frequency analysis)

I think the most effective way to calculate that is by using GetHistogram( ) from GDAL. Unfortunately, I can't find a way to use it from R. The closest approach is by using gdalUtilities::gdalinfo from R, and use the flag -hist, or hist = TRUE, but is limited the calculations between 0 - 255.
Another option is using rasterDT::freqDT, which is faster than regular options. Here an example:
library(gdalUtilities)
library(raster)
library(rasterDT)
library(microbenchmark)
RastBig <- raster(nrow=7000, ncol=7000)
set.seed(0)
RastBig[] <- round(runif(1:ncell(RastBig), 1, 5))
writeRaster(RastBig, filename = 'C:/temp/RastBig.tif')
mbm <- microbenchmark(times = 50,
freq1 = freq(RastBig),
freq2 = table(RastBig[]),
freq3 = freqDT(RastBig),
freq4 = ({
gdalLog <- capture.output(gdalUtilities::gdalinfo(datasetname = 'C:/temp/RastBig.tif', hist = TRUE));
(bucxml <- as.numeric(sub('buckets.+', '', grep('buckets ', gdalLog, value = TRUE))));
(minxml <- as.numeric(gsub('.+from | to.+', '', grep('buckets ', gdalLog, value = TRUE)) ));
(maxxml <- as.numeric(gsub('.+to |:', '', grep('buckets ', gdalLog, value = TRUE))));
(histxml <- as.numeric(strsplit(split = '[[:space:]]', gsub("^ |^ ", "", gdalLog[grep('buckets', gdalLog)+1]))[[1]]));
labs <- seq(from = minxml, to = maxxml, length.out = bucxml);
df <- data.frame(labs, nwlab = c(ceiling(labs[1]),
round(labs[2:(bucxml-1)]),
floor(labs[bucxml])),
val = histxml);
hist <- aggregate(df$val, by = list(df$nwlab), sum)})
)
Results:
> freq1
value count
[1,] 1 6127755
[2,] 2 12251324
[3,] 3 12249376
[4,] 4 12248938
[5,] 5 6122607
> freq2
1 2 3 4 5
6127755 12251324 12249376 12248938 6122607
> freq3
ID freq
1: 1 6127755
2: 2 12251324
3: 3 12249376
4: 4 12248938
5: 5 6122607
> freq4
Group.1 x
1 1 6127755
2 2 12251324
3 3 12249376
4 4 12248938
5 5 6122607
Unit: milliseconds
expr min lq mean median uq max neval
freq1 58628.486301 59100.539302 59400.304887 59383.913701 59650.412 60841.3975 50
freq2 55912.170401 56663.025202 56954.032395 56919.905051 57202.001 58307.9500 50
freq3 3785.767301 4006.858102 4288.699531 4292.447250 4536.382 4996.0598 50
freq4 7.892201 8.883102 9.255641 9.154001 9.483 15.6072 50
EDIT: using this is quite faster than option 3:
rB <- raster('C:/temp/RastBig.tif')
freq3B <- freqDT(rB)

Related

How to optimzie my function by dropping loops

I have the following function that uses nested loops and honestly I'm not sure how to proceed with making the code run more efficient. It runs fine for 100 sims in my opinion but when I ran for 2000 sims it took almost 12 seconds.
This code will generate any n Brownian Motion simulations and works well, the issue is once the simulation size is increased to say 500+ then it starts to bog down, and when it hits 2k then it's pretty slow ie 12.
Here is the function:
ts_brownian_motion <- function(.time = 100, .num_sims = 10, .delta_time = 1,
.initial_value = 0) {
# TidyEval ----
T <- as.numeric(.time)
N <- as.numeric(.num_sims)
delta_t <- as.numeric(.delta_time)
initial_value <- as.numeric(.initial_value)
# Checks ----
if (!is.numeric(T) | !is.numeric(N) | !is.numeric(delta_t) | !is.numeric(initial_value)){
rlang::abort(
message = "All parameters must be numeric values.",
use_cli_format = TRUE
)
}
# Initialize empty data.frame to store the simulations
sim_data <- data.frame()
# Generate N simulations
for (i in 1:N) {
# Initialize the current simulation with a starting value of 0
sim <- c(initial_value)
# Generate the brownian motion values for each time step
for (t in 1:(T / delta_t)) {
sim <- c(sim, sim[t] + rnorm(1, mean = 0, sd = sqrt(delta_t)))
}
# Bind the time steps, simulation values, and simulation number together in a data.frame and add it to the result
sim_data <- rbind(
sim_data,
data.frame(
t = seq(0, T, delta_t),
y = sim,
sim_number = i
)
)
}
# Clean up
sim_data <- sim_data %>%
dplyr::as_tibble() %>%
dplyr::mutate(sim_number = forcats::as_factor(sim_number)) %>%
dplyr::select(sim_number, t, y)
# Return ----
attr(sim_data, ".time") <- .time
attr(sim_data, ".num_sims") <- .num_sims
attr(sim_data, ".delta_time") <- .delta_time
attr(sim_data, ".initial_value") <- .initial_value
return(sim_data)
}
Here is some output of the function:
> ts_brownian_motion(.time = 10, .num_sims = 25)
# A tibble: 275 × 3
sim_number t y
<fct> <dbl> <dbl>
1 1 0 0
2 1 1 -2.13
3 1 2 -1.08
4 1 3 0.0728
5 1 4 0.562
6 1 5 0.255
7 1 6 -1.28
8 1 7 -1.76
9 1 8 -0.770
10 1 9 -0.536
# … with 265 more rows
# ℹ Use `print(n = ...)` to see more rows

As suggested in the comments, if you want speed, you should use cumsum. You need to be clear what type of Brownian Motion you want (arithmetic, geometric). For geometric Brownian motion, you'll need to correct the approximation error by adjusting the mean. As an example, the NMOF package (which I maintain), contains a function gbm that implements geometric Brownian Motion through cumsum. Here is an example call for 2000 paths with 100 timesteps each.
library("NMOF")
library("zoo") ## for plotting
timesteps <- 100
system.time(b <- NMOF::gbm(2000, tau = 1, timesteps = 100, r = 0, v = 1))
## user system elapsed
## 0.013 0.000 0.013
dim(b) ## each column is one path, starting at time zero
## [1] 101 2000
plot(zoo(b[, 1:5], 0:timesteps), plot.type = "single")

Calculate a percentile of dataframe column efficiently

This question is an extension to the StackOverflow question asked and answered here.
My circumstances are different in that I want to calculate the percentile of each value within a vector of 50,000 (or more!) values. For example --
df <- data.frame(val = rnorm(n = 50000, mean = 50, sd = 20))
df$val.percentile <- sapply(X = df$val, function(x) ecdf(df$val)(x))
head(df)
Is there a good way to optimize the process for calculating the percentile for each value? Essentially I'd like to make it as efficient as possible so the run time is as small as possible.

ecdf is already vectorized, there is no reason to use an apply function. You can simply run:
df$val.percentile <- ecdf(df$val)(df$val)

You can implement dplyr::percent_rank() to rank each value based on the percentile. This is different, however, from determining the rank based on a cumulative distribution function dplyr::cume_dist() (Proportion of all values less than or equal to the current rank).
Reproducible example:
set.seed(1)
df <- data.frame(val = rnorm(n = 1000000, mean = 50, sd = 20))
Show that percent_rank() differs from cume_dist() and that cume_dist() is the same as ecdf(x)(x):
library(tidyverse)
head(df) %>%
mutate(pr = percent_rank(val),
cd = ecdf(val)(val),
cd2 = cume_dist(val))
val pr cd cd2
1 37.47092 0.4 0.5000000 0.5000000
2 53.67287 0.6 0.6666667 0.6666667
3 33.28743 0.0 0.1666667 0.1666667
4 81.90562 1.0 1.0000000 1.0000000
5 56.59016 0.8 0.8333333 0.8333333
6 33.59063 0.2 0.3333333 0.3333333
Speed of each approach for this example dataset is roughly similar, not exceeding a factor of 2:
library(microbenchmark)
mbm <- microbenchmark(
pr_dplyr = mutate(df, pr = percent_rank(val)),
cd_dplyr = mutate(df, pr = percent_rank(val)),
cd_base = mutate(df, pr = ecdf(val)(val)),
times = 20
)
autoplot(mbm)

Using rollapply and lm over multiple columns of data

I have a data frame similar to the following with a total of 500 columns:
Probes <- data.frame(Days=seq(0.01, 4.91, 0.01), B1=5:495,B2=-100:390, B3=10:500,B4=-200:290)
I would like to calculate a rolling window linear regression where my window size is 12 data points and each sequential regression is separated by 6 data points. For each regression, "Days" will always be the x component of the model, and the y's would be each of the other columns (B1, followed by B2, B3, etc). I would then like to save the co-efficients as a dataframe with the existing column titles (B1, B2, etc).
I think my code is close, but is not quite working. I used rollapply from the zoo library.
slopedata<-rollapply(zoo(Probes), width=12, function(Probes) {
coef(lm(formula=y~Probes$Days, data = Probes))[2]
}, by = 6, by.column=TRUE, align="right")
If possible, I would also like to have the "xmins" saved to a vector to add to the dataframe. This would mean the smallest x value used in each regression (basically it would be every 6 numbers in the "Days" column.)
Thanks for your help.

1) Define a zoo object z whose data contains Probes and whose index is taken from the first column of Probes, i.e. Days. Noting that lm allows y to be a matrix define a coefs function which computes the regression coefficients. Finally rollapply over z. Note that the index of the returned object gives xmin.
library(zoo)
z <- zoo(Probes, Probes[[1]])
coefs <- function(z) c(unlist(as.data.frame(coef(lm(z[,-1] ~ z[,1])))))
rz <- rollapply(z, 12, by = 6, coefs, by.column = FALSE, align = "left")
giving:
> head(rz)
B11 B12 B21 B22 B31 B32 B41 B42
0.01 4 100 -101 100 9 100 -201 100
0.07 4 100 -101 100 9 100 -201 100
0.13 4 100 -101 100 9 100 -201 100
0.19 4 100 -101 100 9 100 -201 100
0.25 4 100 -101 100 9 100 -201 100
0.31 4 100 -101 100 9 100 -201 100
Note that DF <- fortify.zoo(rz) could be used if you needed a data frame representation of rz.
2) An alternative somewhat similar approch would be to rollaplly over the row numbers:
library(zoo)
y <- as.matrix(Probes[-1])
Days <- Probes$Days
n <- nrow(Probes)
coefs <- function(ix) c(unlist(as.data.frame(coef(lm(y ~ Days, subset = ix)))),
xmins = Days[ix][1])
r <- rollapply(1:n, 12, by = 6, coefs)

try this:
# here are the xmin values you wanted
xmins <- Probes$Days[seq(1,nrow(Probes),6)]
# here we build a function that will run regressions across the columns
# y1 vs x, y2 vs x, y3 vs x...
# you enter the window and by (12/6) in order to limit the interval being
# regressed. this is later called in do.call
runreg <- function(Probes,m,window=12,by=6){
# beg,end are used to specify the interval
beg <- seq(1,nrow(Probes),by)[m]
end <- beg+window-1
# this is used to go through all the columns
N <- ncol(Probes)-1
tmp <- numeric(N)
# go through each column and store the coefficients in tmp
for(i in 1:N){
y <- Probes[[i+1]][beg:end]
x <- Probes$Days[beg:end]
tmp[i] <- coef(lm(y~x))[2][[1]]
}
# put all our column regressions into a dataframe
res <- rbind('coeff'=tmp)
colnames(res) <- colnames(Probes)[-1]
return(res)
}
# now that we've built the function to do the column regressions
# we just need to go through all the window-ed regressions (row regressions)
res <- do.call(rbind,lapply(1:length(xmins),function(m) runreg(Probes,m)))
# these rownames are the index of the xmin values
rownames(res) <- seq(1,nrow(Probes),6)
res <- data.frame(res,xmins)

You can also use the rollRegres package as follows
# setup data
Probes <- data.frame(
# I changed the days to be intergers
Days=seq(1L, 491L, 1L),
B1=5:495, B2=-100:390, B3=10:500 , B4=-200:290)
# setup grp argument
grp_arg <- as.integer((Probes$Days - 1L) %/% 6)
# estimate coefs. width argument is realtive in grp units
library(rollRegres)
X <- cbind(1, Probes$Days / 100)
Ys <- as.matrix(Probes[, 2:5])
out <- lapply(1:ncol(Ys), function(i)
roll_regres.fit(x = X, y = Ys[, i], width = 2L, grp = grp_arg)$coefs)
out <- do.call(cbind, out)
# only keep the complete.cases and the unique values
colnames(out) <- sapply(1:4, function(i) paste0("B", i, 0:1))
out <- out[c(T, grp_arg[-1] != head(grp_arg, -1)), ]
out <- out[complete.cases(out), ]
head(out)
#R B10 B11 B20 B21 B30 B31 B40 B41
#R [1,] 4 100 -101 100 9 100 -201 100
#R [2,] 4 100 -101 100 9 100 -201 100
#R [3,] 4 100 -101 100 9 100 -201 100
#R [4,] 4 100 -101 100 9 100 -201 100
#R [5,] 4 100 -101 100 9 100 -201 100
#R [6,] 4 100 -101 100 9 100 -201 100
The solution is a lot faster than e.g., the zoo solution
library(zoo) coefs <- function(z) c(unlist(as.data.frame(coef(lm(z[,-1] ~ z[,1]))))) microbenchmark::microbenchmark( rollapply = {
z <- zoo(Probes, Probes[[1]])
rz <- rollapply(z, 12, by = 6, coefs, by.column = FALSE, align = "left") }, roll_regres = {
grp_arg <- as.integer((Probes$Days - 1L) %/% 6)
X <- cbind(1, Probes$Days / 100)
Ys <- as.matrix(Probes[, 2:5])
out <- lapply(1:ncol(Ys), function(i)
roll_regres.fit(x = X, y = Ys[, i], width = 2L, grp = grp_arg)$coefs)
out <- do.call(cbind, out)
colnames(out) <- sapply(1:4, function(i) paste0("B", i, 0:1))
out <- out[c(T, grp_arg[-1] != head(grp_arg, -1)), ]
out <- out[complete.cases(out), ]
head(out) } )
#R Unit: microseconds
#R expr min lq mean median uq max neval
#R rollapply 53392.614 56330.492 59793.106 58363.2825 60902.938 119206.76 100
#R roll_regres 865.186 920.297 1074.161 983.9015 1047.705 5071.41 100
At present you though need to install the package from Github due to an error in the validation in version 0.1.0. Thus, run
devtools::install_github("boennecd/rollRegres", upgrade_dependencies = FALSE,
build_vignettes = TRUE)

trying to perform a t.test for each row and count all rows where p-value is less than 0.05

I've been wrecking my head for the past four hours trying to find the solution to an R problem, which is driving me nuts. I've searching everywhere for a decent answer but so far I've been hitting wall after wall. I am now appealing to your good will of this fine community for help.
Consider the following dataset:
set.seed(2112)
DataSample <- matrix(rnorm(24000),nrow=1000)
colnames(DataSample) <- c(paste("Trial",1:12,sep=""),paste("Control",13:24,sep=""))
I need to perform a t-test for every row in DataSample in order to find out if groups TRIAL and CONTROL differ (equal variance applies).
Then I need to count the number of rows with a p-value equal to, or lower than 0.05.
So here is the code I tried, which I know is wrong:
set.seed(2112)
DataSample <- matrix(rnorm(24000),nrow=1000)
colnames(DataSample) <- c(paste("Trial",1:12,sep=""),paste("Control",13:24,sep=""))
pValResults <- apply(
DataSample[,1:12],1,function(x) t.test(x,DataSample[,13:24], var.equal=T)$p.value
)
sum(pValResults < 0.05) # Returns the wrong answer (so I was told)
I did try looking at many similar questions around stackoverflow, but I would often end-up with syntax errors or a dimensional mismatch. The code above is the best I could get without returning me an R error -- but I since the code is returning the wrong answer I have nothing to feel proud of.
Any advice will be greatly appreciated! Thanks in advance for your time.

One option is to loop over the data set calculating the t test for each row, but it is not as elegant.
set.seed(2112)
DataSample <- matrix(rnorm(24000),nrow=1000)
colnames(DataSample) <- c(paste("Trial",1:12,sep=""),paste("Control",13:24,sep=""))
# initialize vector of stored p-values
pvalue <- rep(0,nrow(DataSample))
for (i in 1:nrow(DataSample)){
pvalue[i] <- t.test(DataSample[i,1:12],DataSample[i,13:24])$p.value
}
# finding number that are significant
sum(pvalue < 0.05)

I converted to a data.table, and the answer I got was 45:
DataSample.dt <- as.data.table(DataSample)
sum(sapply(seq_len(nrow(DataSample.dt)), function(x)
t.test(DataSample.dt[x, paste0('Trial', 1:12), with=F],
DataSample.dt[x, paste0('Control', 13:24), with=F],
var.equal=T)$p.value) < 0.05)

To do a paired T test, you need to supply the paired = TRUE parameter. The t.test function isn't vectorised, but it's quite simple to do t tests a whole matrix at a time. Here's three methods (including using apply):
library("genefilter")
library("matrixStats")
library("microbenchmark")
dd <- DataSample[, 1:12] - DataSample[, 13:24]
microbenchmark::microbenchmark(
manual = {ps1 <- 2 * pt(-abs(rowMeans(dd) / sqrt(rowVars(dd) / ncol(dd))), ncol(dd) - 1)},
apply = {ps2 <- apply(DataSample, 1, function(x) t.test(x[1:12], x[13:24], paired=TRUE)$p.value)},
rowttests = {ps3 <- rowttests(dd)[, "p.value"]})
#Unit: milliseconds
# expr min lq mean median uq max
# manual 1.611808 1.641783 1.677010 1.663122 1.709401 1.852347
# apply 390.869635 398.720930 404.391487 401.508382 405.715668 634.932675
# rowttests 2.368823 2.417837 2.639671 2.574320 2.757870 7.207135
# neval
# 100
# 100
# 100
You can see the manual method is over 200x faster than apply.
If you actually meant an unpaired test, here's the equivalent comparison:
microbenchmark::microbenchmark(
manual = {x <- DataSample[, 1:12]; y <- DataSample[, 13:24]; ps1 <- 2 * pt(-abs((rowMeans(x) - rowMeans(y)) / sqrt((rowVars(x) + rowVars(y)) / ncol(x))), ncol(DataSample) - 2)},
apply = { ps2 <- apply(DataSample, 1, function(x) t.test(x[1:12], x[13:24], var.equal = TRUE)$p.value)},
rowttests = {ps3 <- rowttests(DataSample, factor(rep(1:2, each = 12)))[, "p.value"]})
Note the manual method assumes that the two groups are the same sizes.

Adding an alternative using an external library.
Performing the test:
library(matrixTests)
res <- row_t_equalvar(DataSample[,1:12], DataSample[,13:24])
Format of the result:
res
obs.x obs.y obs.tot mean.x mean.y mean.diff var.x var.y var.pooled stderr df statistic pvalue conf.low conf.high alternative mean.null conf.level
1 12 12 24 0.30569721 0.160622830 0.145074376 0.5034806 1.0769678 0.7902242 0.3629105 22 0.399752487 0.69319351 -0.6075559 0.89770469 two.sided 0 0.95
2 12 12 24 -0.27463354 -0.206396781 -0.068236762 0.8133311 0.2807800 0.5470556 0.3019535 22 -0.225984324 0.82329990 -0.6944500 0.55797651 two.sided 0 0.95
3 12 12 24 -0.19805092 -0.023207888 -0.174843032 0.4278359 0.5604078 0.4941219 0.2869733 22 -0.609265949 0.54858909 -0.7699891 0.42030307 two.sided 0 0.95
Number of rows with p <= 0.05:
> sum(res$pvalue <= 0.05)
[1] 4

R - streamline subsampling procedure

I have a dataset composed of values obtained from studies and experiments. Experiments are nested within studies. I want to subsample the dataset so that only 1 experiment is represented for each study. I want to repeat this procedure 10,000 times, randomly drawing the 1 experiment each time, and then calculate some summary statistics for the values. Here is an example dataset:
df=data.frame(study=c(1,1,2,2,2,3,4,4),expt=c(1,2,1,2,3,1,1,2),value=runif(8))
I wrote the following function to do the above, but it is taking forever. Does anyone have any suggestions for streamlining this code? Thanks!
subsample=function(x,A) {
subsample.list=sapply(1:A,function(m) {
idx=ddply(x,c("study"),function(i) sample(1:nrow(i),1)) #Sample one experiment from each study
x[paste(x$study,x$expt,sep="-") %in% paste(idx$study,idx$V1,sep="-"),"value"] } ) #Match the study-experiment combinations and retrieve values
means.list=ldply(subsample.list,mean) #Calculate the mean of 'values' for each iteration
c(quantile(means.list$V1,0.025),mean(means.list$V1),upper=quantile(means.list$V1,0.975)) } #Calculate overall means and 95% CIs

You can vectorise this way more (even using plyr), and go much much faster:
function=yoursummary(x)c(quantile(x,0.025),mean(x),upper=quantile(x,0.975))
subsampleX=function(x,M)
yoursummary(
aaply(
daply(.drop_o=F,df,.(study),
function(x)sample(x$value,M,replace=T)
),1,mean
)
)
The trick here is to do all the sampling up front. If we want to sample M times, why not do all that while you have access to the study.
Original code:
> system.time(subsample(df,20000))
user system elapsed
123.23 0.06 124.74
New vectorised code:
> system.time(subsampleX(df,20000))
user system elapsed
0.24 0.00 0.25
That's about 500x faster.

Here's a base R solution which avoids ddply for speed reasons:
df=data.frame(study=c(1,1,2,2,2,3,4,4),expt=c(1,2,1,2,3,1,1,2),value=runif(8))
sample.experiments <- function(df) {
r <- rle(df$study)
samp <- sapply( r$lengths , function(x) sample(seq(x),1) )
start.idx <- c(0,cumsum(r$lengths)[1:(length(r$lengths)-1)] )
df[samp + start.idx,]
}
> sample.experiments(df)
study expt value
1 1 1 0.6113196
4 2 2 0.5026527
6 3 1 0.2803080
7 4 1 0.9824377
Benchmarks
> m <- microbenchmark(
+ ddply(df,.(study),function(i) i[sample(1:nrow(i),1),]) ,
+ sample.experiments(df)
+ )
> m
Unit: microseconds
expr min lq median uq max
1 ddply(df, .(study), function(i) i[sample(1:nrow(i), 1), ]) 3808.652 3883.632 3936.805 4022.725 6530.506
2 sample.experiments(df) 337.327 350.734 357.644 365.915 580.097

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Large raster frequency table / counts - r

Related

How to optimzie my function by dropping loops

Calculate a percentile of dataframe column efficiently

Using rollapply and lm over multiple columns of data

trying to perform a t.test for each row and count all rows where p-value is less than 0.05

R - streamline subsampling procedure

Categories

Resources