create data frame containing rows that add up to 100 - r

This is my first stab at this:
library(dplyr)
step_size <- 5
grid <- expand.grid(
x1 = seq(0, 100, step_size)
, x2 = seq(0, 100, step_size)
, x3 = seq(0, 100, step_size)
)
grid$sum = grid$x1 + grid$x2 + grid$x3
grid$x1 <- (grid$x1 / grid$sum) * 100
grid$x2 <- (grid$x2 / grid$sum) * 100
grid$x3 <- (grid$x3 / grid$sum) * 100
grid$sum <- grid$x1 + grid$x2 + grid$x3
nrow(grid)
result <- distinct(grid) %>% filter(!is.na(sum))
head(result, 20)
nrow(result)
Basically, I want to create a data frame containing as many rows as possible that add up to 100 and are uniformly distributed.
Is there an easier or better approach in R? Thanks!
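One direct construction, sketched here as an aside rather than taken from the answers below: enumerate x1 and x2 on the grid and derive x3 as the remainder, so every kept row sums to exactly 100.
step_size <- 5
grid <- expand.grid(x1 = seq(0, 100, step_size),
                    x2 = seq(0, 100, step_size))
grid$x3 <- 100 - grid$x1 - grid$x2   # remainder; drop rows where it goes negative
result <- grid[grid$x3 >= 0, ]
nrow(result)  # 231 rows for step_size = 5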

Using data.table...
library(data.table)
grid <- expand.grid(
x1 = seq(0, 100)
, x2 = seq(0, 100)
, x3 = seq(0, 100)
)
setDT(grid)
res <- grid[grid[, rowSums(.SD) == 100], ]
res[, summation := rowSums(.SD)]
Result:
> res[, unique(summation)]
[1] 100
This can also be done in base R, but data.table is faster:
library(data.table)
grid <- expand.grid(
x1 = seq(0, 100)
, x2 = seq(0, 100)
, x3 = seq(0, 100)
)
grid2 <- expand.grid(
x1 = seq(0, 100)
, x2 = seq(0, 100)
, x3 = seq(0, 100)
)
setDT(grid)
microbenchmark::microbenchmark(
data.table = {
res <- grid[grid[, rowSums(.SD) == 100], ]
},
base = {
res2 <- grid2[rowSums(grid2) == 100, ]
}
)
Unit: milliseconds
expr min lq mean median uq max neval cld
data.table 59.41157 89.6700 109.0462 107.7415 124.2675 183.9730 100 a
base 65.70521 109.6471 154.1312 125.4238 156.9168 611.0169 100 b

Here's a simple function. You can specify how many rows/columns you want, and what each row sums to.
func <- function(cols = 3, rows = 10, rowTotal = 100) {
  dt1 <- replicate(n = cols, runif(n = rows))  # rows x cols matrix of uniform draws
  dt1 <- data.frame(apply(X = dt1, MARGIN = 2, FUN = function(x) x / rowSums(dt1) * rowTotal))  # rescale each row to sum to rowTotal
  return(dt1)
}
rowSums(func()) # default values (3 cols, 10 rows, each row sums to 100)
rowSums(func(cols = 5, rows = 10, rowTotal = 50)) # 5 cols, 10 rows, each row sums to 50


Sample from a 0:Vector[i]

In R:
I have a vector:
y = sample(0:200, 1e4, replace = TRUE)
I want to create a variable ‘x’ such that:
x = sample(0:y[i], 1e4, replace = TRUE)
where y[i] are the values y[1], y[2], …, y[1e4] created by the sample() call above. For example, if y[1] = 50 then I would like the first entry of x to be drawn from 0:50, and so on. However, I am not sure how to do this. I have tried for loops but have gotten nowhere.
Any help is much appreciated!
How about
x <- as.integer(runif(1e4)*sample(201, 1e4, TRUE))
Here sample(201, 1e4, TRUE) stands in for y + 1, and truncating runif() * (y + 1) gives an integer uniform on 0:y, so this draws y and x in a single vectorised pass.
Benchmarking:
f1 <- function() sapply(sample(0:200, 1e4, replace = TRUE), function(i) sample(0:i, size = 1))
f2 <- function() as.integer(runif(1e4)*sample(201, 1e4, TRUE))
microbenchmark::microbenchmark(f1 = f1(),
f2 = f2())
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> f1 38877.3 47070.50 49294.770 48625.00 50175.35 97045.0 100
#> f2 508.2 522.05 555.602 531.45 549.45 2080.8 100
This should work:
y = sample(0:200, 1e4, replace = TRUE)
x = sapply(y, \(i) sample(0:i, size = 1))
Or the equivalent using a for loop:
x = numeric(length(y))
for(i in seq_along(y)) {
x[i] = sample(0:y[i], size = 1)
}
If efficiency matters, this might be a bit faster on very long input:
x = floor(runif(length(y), min = 0, max = y + 1))
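A quick sanity check (my addition) that each draw stays in range when x is built from an existing y:
all(x >= 0 & x <= y)  # should be TRUE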

Using app function from {terra} package on raster stacks? (in parallel)

I have four high-resolution rasters for a country. I have split each raster into tiles and done some other processing to them. I now want to apply a function to each cell of each 'stack' of the raster tiles to produce one set of output tiles. The function is a little complex. I have tried to synthesise some data below to reproduce my current approach. It works (ish), but I'm convinced that there's a better way to do this. To use parallel processing on my unix box, I simply swap mapply for mcmapply, but I haven't done that in the example below as I presume many will be working on Windows machines. I'd welcome ideas on my approach, and particularly on optimisation.
library("terra")
library("glue")
## Make some toy data
dir.create("temp_folder")
dir.create("result_folder")
x <- rast(ncols = 10, nrows = 10)
a <- rast(ncol = 100, nrow = 100)
some_values <- as.integer(runif(10000, min = 1, max = 100))
ind <- which(some_values %in% sample(some_values, 15))
some_values[ind] <- NA
values(a) <- some_values
a_tiles <- makeTiles(a, x, glue("temp_folder/tile_a_{1:100}.tif"), overwrite = TRUE)
b <- rast(ncol = 100, nrow = 100)
some_values <- as.integer(runif(10000, min = 1, max = 100))
ind <- which(some_values %in% sample(some_values, 15))
some_values[ind] <- NA
values(b) <- some_values
b_tiles <- makeTiles(b, x, glue("temp_folder/tile_b_{1:100}.tif"), overwrite = TRUE)
c <- rast(ncol = 100, nrow = 100)
some_values <- as.integer(runif(10000, min = 1, max = 100))
ind <- which(some_values %in% sample(some_values, 15))
some_values[ind] <- NA
values(c) <- some_values
c_tiles <- makeTiles(c, x, glue("temp_folder/tile_c_{1:100}.tif"), overwrite = TRUE)
d <- rast(ncol = 100, nrow = 100)
some_values <- as.integer(runif(10000, min = 1, max = 100))
ind <- which(some_values %in% sample(some_values, 15))
some_values[ind] <- NA
values(d) <- some_values
d_tiles <- makeTiles(d, x, glue("temp_folder/tile_d_{1:100}.tif"), overwrite = TRUE)
## Outer function so that this can be used in parallel? But maybe this is a silly way to do it?
outer_function <- function(a_tiles, b_tiles, c_tiles, d_tiles, output_files) {
one_a_tile <- rast(unlist(a_tiles))
one_b_tile <- rast(unlist(b_tiles))
one_c_tile <- rast(unlist(c_tiles))
one_d_tile <- rast(unlist(d_tiles))
output_file <- output_files
# I replace any NAs with 0 as an NA will break my 'if' statement of the inner_function.
# I get Error in if (z["a"] <= z["b"]) { : missing value where TRUE/FALSE needed
one_a_tile[is.na(one_a_tile)] <- 0
one_b_tile[is.na(one_b_tile)] <- 0
one_c_tile[is.na(one_c_tile)] <- 0
one_d_tile[is.na(one_d_tile)] <- 0
z <- sds(one_a_tile, one_b_tile, one_c_tile, one_d_tile)
## Inner function that actually does the work I want doing
inner_function <- function(z) {
names(z) <- c('a', 'b', 'c', 'd')
if (z['a'] <= z['b']) {
y <- rowSums(cbind((z['c'] + z['a'] * 10),
(z['c'] + z['a'] * 20)))
}
if (z['a'] >= z['b']) {
y <- rowSums(cbind((z['c'] + z['a'] * 40),
(z['c'] + z['a'] * 10)))
}
if (z['a'] == z['b']) {
y <- rowSums(cbind((z['c'] + z['a'] * 60),
(z['c'] + z['a'] * 10)))
}
y <- ifelse(y == 0, NA, y)
return(y)
}
app(z,
inner_function,
filename = output_file,
overwrite = TRUE,
wopt = list(datatype = "INT4U"))
return(output_file)
}
results <- mapply(outer_function,
a_tiles = a_tiles,
b_tiles = b_tiles,
c_tiles = c_tiles,
d_tiles = d_tiles,
output_files = output_files <- glue("result_folder/result_tile_{1:length(d_tiles)}.tif"))
names(results) <- NULL
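As noted in the question, on a unix box the mapply call above can be run in parallel simply by swapping in mcmapply; a minimal sketch (the mc.cores value is an assumption, pick one suited to your machine):
library(parallel)
results <- mcmapply(outer_function,
                    a_tiles = a_tiles,
                    b_tiles = b_tiles,
                    c_tiles = c_tiles,
                    d_tiles = d_tiles,
                    output_files = output_files,
                    mc.cores = 4)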
unlink("temp_folder", recursive = TRUE)
unlink("result_folder", recursive = TRUE)

Why does function return NULL?

A beginner in R over here, so apologies for the basic question.
Why does ATE return a null vector instead of saving the values of the difference of the means?
fun.cluster <- function(M, N){
set.seed(02139)
J <- 1:M # vector J_i
df <- as.data.frame(matrix(data=1:N, nrow = N, ncol = 1)) #data frame of all original values
df$cluster <- cut(df$V1, M, labels = 1:M) #breaking the dataframe into clusters
df$cluster <- as.numeric(df$cluster)
Y1 <- as.vector(sample(J, 5)) # assigning treatment
df$treatment <- ifelse(df$cluster %in% Y1, df$treatment <- 1, df$treatment <- 0)
#Inducing intracluster correlation:
mu_0j <- runif(n = 50, min = -1, max = 1)
df$V1[df$treatment==0] <- mu_0j
mu_1j <- runif(n=50, min = -0.5, max = 1.5)
df$V1[df$treatment==0] <- mu_1j
# drawing values
y_0i <- rnorm(n = 50, mean = mu_0j, sd = 1)
y_1i <- rnorm(n = 50, mean = mu_1j, sd = 1)
D_i <- as.vector(c(y_0i, y_1i))
# calculating ATE:
ATE[i] <- mean(y_1i - y_0i)
}
ATE <- c()
for(i in 1:10){
fun.cluster(M = 10, N = 100)
}
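The usual fix (a sketch, keeping the simulation body unchanged): have the function return the estimate as its last expression, and assign that value inside the loop. Assigning to ATE[i] inside the function only modifies a local copy, so the global ATE created with c() stays NULL.
fun.cluster <- function(M, N){
  # ... same body as above, up to drawing y_0i and y_1i ...
  mean(y_1i - y_0i)  # the last evaluated expression is the return value
}
ATE <- numeric(10)
for(i in 1:10){
  ATE[i] <- fun.cluster(M = 10, N = 100)
}
# Note: set.seed() inside the function makes every iteration identical;
# move it outside the function if the draws should differ across iterations.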

Fastest way to map multiple character columns to numerical values

I have an algorithm that at each iteration calculates means for certain groups (the groups do not change, only their values do).
The table of values (here N is 10000):
d1 <- data.frame(x = sample(LETTERS, N, replace = TRUE),
y1=rnorm(N))
head(d1)
# x y1
# 1 H -0.7852538
# 2 G -0.6739159
# 3 V -1.7783771
# 4 L -0.2849846
# 5 I -0.1760284
# 6 V -0.2785826
I can calculate the means (in several ways: dplyr, data.table and tapply). I have another data.frame consisting of two columns with the group names.
d2 <- data.frame('group.high' = sample(LETTERS, N * 2, replace = TRUE),
'group.low' = sample(LETTERS, N * 2, replace = TRUE))
head(d2)
# group.high group.low
# 1 U L
# 2 K J
# 3 C Q
# 4 Q A
# 5 Q U
# 6 K W
I want to add two columns, mean.high and mean.low, containing the mean value of each group based on d1.
So far I have tried two options, one with dplyr and one with data.table. I had to join twice in each of them. They are similar in speed.
microbenchmark(
dplyr = {
means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)
### Solution 1
dplyr.d2 <- left_join(d2,data.frame('group.high' = names(means),
'mean.high' = means, stringsAsFactors = FALSE) ) %>%
left_join(., data.frame('group.low' = names(means),
'mean.low' = means, stringsAsFactors = FALSE))},
data.table = {
### Solution 2
d1 <- as.data.table(d1)
d2 <- as.data.table(d2)
means <- d1[ ,.(means = mean(y1)), by = x]
new.d2 <- data.table::merge.data.table(x = d2, y = means, by.x = 'group.high', by.y = 'x')
data.table.d2 <- data.table::merge.data.table(x = new.d2, y = means, by.x = 'group.low', by.y = 'x')
}
)
Unit: milliseconds
expr min lq mean median uq max neval cld
dplyr 34.0837 36.88650 53.22239 42.9227 47.50660 231.5066 100 a
data.table 40.2071 47.70735 87.46804 51.2517 59.05385 258.4999 100 b
Is there a better way? How can I speed the calculation?
As mentioned in the comments, there is an iterative process of updating the values. Here is an example.
N <- 10000
iterFuncDplyr <- function(d1, d2) {
  means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)
  dplyr.d2 <- left_join(d2, data.frame('group.high' = names(means),
                                       'mean.high' = means, stringsAsFactors = FALSE)) %>%
    left_join(., data.frame('group.low' = names(means),
                            'mean.low' = means, stringsAsFactors = FALSE))
  return(var(d1$y1))
}
iterFuncData <- function(d1, d2) {
means <- d1[ ,.(means = mean(y1)), by = x]
new.d2 <- data.table:::merge.data.table(x = d2, y = means, by.x = 'group.high', by.y = 'x')
data.table.d2 <- data.table:::merge.data.table(x = new.d2, y = means, by.x = 'group.low', by.y = 'x')
return(var(d1$y1))
}
d1 <- data.frame(x = sample(LETTERS, N, replace = TRUE),
y1=rnorm(N))
d2 <- data.frame('group.high' = sample(LETTERS, N * 2, replace = TRUE),
'group.low' = sample(LETTERS, N * 2, replace = TRUE))
library(data.table)
library(dplyr)
microbenchmark::microbenchmark(dplyr = {
temp.val <- 0
for (i in 1:10) {
d1$y1 <- temp.val + rnorm(N)
temp.val <- iterFuncDplyr(d1, d2)
}},
data.table = {
d1 <- as.data.table(d1)
d2 <- as.data.table(d2)
temp.val <- 0
for (i in 1:10) {
d1$y1 <- temp.val + rnorm(N)
temp.val <- iterFuncData(d1, d2)
}
}
)
Unit: milliseconds
expr min lq mean median uq max neval
dplyr 46.22904 50.67959 52.78275 51.96358 53.34825 108.2874 100
data.table 63.81111 67.13257 70.85537 69.85712 72.72446 127.4228 100
You could subset the named vector means to create new columns and match your output:
means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)
d2$mean.high <- means[d2$group.high]
d2$mean.low <- means[d2$group.low]
identical(as.matrix(d2), as.matrix(d3)) #factor vs character, used d3 w/ benchmark
[1] TRUE
Unit: microseconds
expr min lq mean median uq max neval
dplyr 4868.2 5316.25 5787.123 5524.15 5892.70 12187.3 100
data.table 8254.4 9606.60 10438.424 10118.35 10771.75 20966.5 100
subset 481.2 529.40 651.194 550.35 582.55 7849.9 100
Benchmark code:
d3 <- d2
microbenchmark::microbenchmark( # N = 10000
dplyr = {
means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)
### Solution 1
dplyr.d2 <- left_join(d2,data.frame('group.high' = names(means),
'mean.high' = means, stringsAsFactors = FALSE) ) %>%
left_join(., data.frame('group.low' = names(means),
'mean.low' = means, stringsAsFactors = FALSE))},
data.table = {
### Solution 2
d1 <- as.data.table(d1)
d2 <- as.data.table(d2)
means <- d1[ ,.(means = mean(y1)), by = x]
new.d2 <- data.table::merge.data.table(x = d2, y = means, by.x = 'group.high', by.y = 'x')
data.table.d2 <- data.table::merge.data.table(x = new.d2, y = means, by.x = 'group.low', by.y = 'x')
},
subset = {
means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)
d3$mean.high <- means[d2$group.high]
d3$mean.low <- means[d2$group.low]
}
)
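For the iterative updating described in the question, the same named-vector lookup can replace both joins inside the loop; a minimal sketch (my addition, indexing with as.character() to be safe when the group columns are factors):
iterFuncSubset <- function(d1, d2) {
  means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)
  d2$mean.high <- means[as.character(d2$group.high)]
  d2$mean.low  <- means[as.character(d2$group.low)]
  return(var(d1$y1))
}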
Here is an answer very similar to Andrew's but relying on data.table instead of tapply() (which seems faster for very big N).
library(data.table)
# Create a named vector "means"
means <- setDT(d1)[, mean(y1), by = x][, setNames(V1, x)]
setDT(d2)[, c("mean.high.means", "mean.low.means") :=
.(means[as.character(group.high)], means[as.character(group.low)])]
Output:
group.high group.low mean.high.means mean.low.means
1: Z W 0.017032792 0.0091625547
2: A A 0.013796137 0.0137961371
3: V S -0.011570159 0.0004560325
4: D X 0.005475629 0.0200984250
5: U H -0.008249901 0.0054537833
---
199996: H K 0.005453783 0.0079905631
199997: A T 0.013796137 -0.0068537963
199998: W U 0.009162555 -0.0082499015
199999: T V -0.006853796 -0.0115701585
200000: G J 0.014829259 0.0206598470
Reproducible data:
N = 1e5
set.seed(1)
d1 <- data.frame(
x = sample(LETTERS, N, replace = TRUE),
y1 = rnorm(N)
)
d2 <- data.frame(
group.high = sample(LETTERS, N * 2, replace = TRUE),
group.low = sample(LETTERS, N * 2, replace = TRUE)
)

When is it advisable to use sparse matrices in R?

I have been tinkering with power simulations recently and I have the following code:
library(MASS)
library(Matrix)
simdat <- data.frame(mmm = rep(rep(factor(1:2,
labels=c("m1", "m2")),
each = 2),
times = 2800),
ttt = rep(factor(1:2,
labels = c("t1", "t2")),
times = 5600),
sss = rep(factor(1:70),
each = 160),
iii = rep(rep(factor(1:40),
each = 4),
times = 70))
beta <- c(1, 2)
X1 <- model.matrix(~ mmm,
data = simdat)
Z1 <- model.matrix(~ ttt,
data = simdat)
X1 and Z1 are 11200 x 2 matrices. With the help of Stack Overflow I managed to make my calculations a lot more efficient than they were before:
funab <- function(){
ran_sub <- mvrnorm(70, mu = c(0,0), Sigma = matrix(c(10, 3, 3, 2), ncol = 2))
ran_ite <- mvrnorm(40, mu = c(0,0), Sigma = matrix(c(10, 3, 3, 2), ncol = 2))
Mb <- as.vector(X1 %*% beta)
M1 <- rowSums(Z1 * ran_sub[rep(1:70,
each = 160),])
M2 <- rowSums(Z1 * ran_ite[rep(rep(1:40, each = 4),
times = 70),])
Mout <- Mb + M1 + M2
Y <- as.vector(Mout) + rnorm(length(Mout), mean = 0 , sd = 0.27)
}
Y will then be a vector of length 11200. I then replicate this function a lot (say 1000 times):
sim <- replicate(n = 1000,
                 expr = funab(),
                 simplify = FALSE)
sim will then be a list of 1000 vectors, each of length 11200. Given that I want to do this a lot more, and possibly include more code in funab(), I wonder whether it is advisable to use sparse matrices for X1 and Z1 in the calculations in funab() as they stand?
OK, I've tried to follow the advice given in the comments to my question and ran a test with the microbenchmark package. To make copying and pasting easier I will repeat the code from above:
library(MASS)
library(Matrix)
simdat <- data.frame(mmm = rep(rep(factor(1:2,
labels=c("m1", "m2")),
each = 2),
times = 2800),
ttt = rep(factor(1:2,
labels = c("t1", "t2")),
times = 5600),
sss = rep(factor(1:70),
each = 160),
iii = rep(rep(factor(1:40),
each = 4),
times = 70))
beta <- c(1, 2)
X1 <- model.matrix(~ mmm,
data = simdat)
Z1 <- model.matrix(~ ttt,
data = simdat)
I now create the same matrices as sparse matrices:
sparseX1 <- sparse.model.matrix(~ mmm,
data = simdat)
sparseZ1 <- sparse.model.matrix(~ ttt,
data = simdat)
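Before benchmarking, it is worth checking how sparse these model matrices actually are; a quick look (my addition), using Matrix::nnzero(). With an all-ones intercept column and a 0/1 dummy, most entries are non-zero, which already suggests little to gain from a sparse representation here.
Matrix::nnzero(sparseX1) / prod(dim(sparseX1))  # proportion of non-zero entries
Matrix::nnzero(sparseZ1) / prod(dim(sparseZ1))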
I then set up the two functions:
funab_sparse <- function(){
ran_sub <- mvrnorm(70, mu = c(0,0), Sigma = matrix(c(10, 3, 3, 2), ncol = 2))
ran_ite <- mvrnorm(40, mu = c(0,0), Sigma = matrix(c(10, 3, 3, 2), ncol = 2))
Mb <- as.vector(sparseX1 %*% beta)
M1 <- Matrix::rowSums(sparseZ1 * ran_sub[rep(1:70,
each = 160),])
M2 <- Matrix::rowSums(sparseZ1 * ran_ite[rep(rep(1:40, each = 4),
times = 70),])
Mout <- Mb + M1 + M2
Y <- as.vector(Mout) + rnorm(length(Mout), mean = 0 , sd = 0.27)
}
funab <- function(){
ran_sub <- mvrnorm(70, mu = c(0,0), Sigma = matrix(c(10, 3, 3, 2), ncol = 2))
ran_ite <- mvrnorm(40, mu = c(0,0), Sigma = matrix(c(10, 3, 3, 2), ncol = 2))
Mb <- as.vector(X1 %*% beta)
M1 <- rowSums(Z1 * ran_sub[rep(1:70,
each = 160),])
M2 <- rowSums(Z1 * ran_ite[rep(rep(1:40, each = 4),
times = 70),])
Mout <- Mb + M1 + M2
Y <- as.vector(Mout) + rnorm(length(Mout), mean = 0 , sd = 0.27)
}
library(microbenchmark)
res <- microbenchmark(funab(), funab_sparse(), times = 1000)
and get the results:
> res <- microbenchmark(funab(), funab_sparse(), times = 1000)
> res
Unit: milliseconds
expr min lq median uq max neval
funab() 2.200342 2.277006 2.309587 2.481627 69.99895 1000
funab_sparse() 8.419564 8.568157 9.666248 9.874024 75.88907 1000
Assuming that I did not make any substantial mistakes, I can conclude that, for this particular way of doing the calculations, using sparse matrices will not speed up my code.
