R code iteration - r

My goal is to generate this vector in R using iter:
0 + 1 = 1,
1 + 2 = 3,
3 + 3 = 6,
6 + 4 = 10
I tried the code below, but it didn't give me the right numbers:
iter <- 4
w_vector <- rep(0, iter)
for(i in 1:iter) {w_vector[i] <-sum(i, i-1)
print(w_vector[i])
}
I'll truly appreciate it if you can help me fix my code.

You can do:
w_vector <- cumsum(seq(iter))
w_vector
# [1] 1 3 6 10
Otherwise, using a for loop, your code should look something like:
iter <- 4
w_vector <- rep(0, iter)
w_vector[1] <- 1
for(i in 2:iter) {
w_vector[i] <- w_vector[i-1] + i
}
w_vector
# [1] 1 3 6 10

Related

Group numeric vector by predefined maximal group sum

I have a numeric vector like this x <- c(1, 23, 7, 10, 9, 2, 4) and I want to group the elements from left to right with the constrain that each group sum must not exceed 25. Thus, here the first group is c(1, 23), the second is c(7, 10) and the last c(9, 2, 4). the expected output is a dataframe with a second column containing the groups:
data.frame(x= c(1, 23, 7, 10, 9, 2, 4), group= c(1, 1, 2, 2, 3, 3, 3))
I have tried different things with cumsum but am not able to kind of dynamically restart cumsum for the new group once the limit sum of 25 for the last group is reached.
I think cpp function is the fastest way:
library(Rcpp)
cppFunction(
"IntegerVector GroupBySum(const NumericVector& x, const double& max_sum = 25)
{
double sum = 0;
int cnt = 0;
int period = 1;
IntegerVector res(x.size());
for (int i = 0; i < x.size(); ++i)
{
++cnt;
sum += x[i];
if (sum > max_sum)
{
sum = x[i];
if (cnt > 1)
++period;
cnt = 1;
}
res[i] = period;
}
return res;
}"
)
GroupBySum(c(1, 23, 7, 10, 9, 2, 4), 25)
We can try this as a programming practice if you like :)
f1 <- function(x) {
group <- c()
while (length(x)) {
idx <- cumsum(x) <= 25
x <- x[!idx]
group <- c(group, rep(max(group, 0) + 1, sum(idx)))
}
group
}
or
f2 <- function(x) {
group <- c()
g <- 0
while (length(x)) {
cnt <- s <- 0
for (i in seq_along(x)) {
s <- s + x[i]
if (s <= 25) {
cnt <- cnt + 1
} else {
break
}
}
g <- g + 1
group <- c(group, rep(g, cnt))
x <- x[-(1:cnt)]
}
group
}
or
f3 <- function(x) {
s <- cumsum(x)
r <- c()
grp <- 1
while (length(s)) {
idx <- (s <= 25)
r <- c(r, rep(grp, sum(idx)))
grp <- grp + 1
s <- s[!idx] - tail(s[idx], 1)
}
r
}
which gives
[1] 1 1 2 2 3 3 3
and benchmarking among them looks like
set.seed(1)
set.seed(1)
x <- runif(1e3, 0, 25)
bm <- microbenchmark(
f1(x),
f2(x),
f3(x),
check = "equivalent"
)
autoplot(bm)
Recursion version
Another option is using recursion (based on f1())
f <- function(x, res = c()) {
if (!length(x)) {
return(res)
}
idx <- cumsum(x) <= 25
Recall(x[!idx], res = c(res, list(x[idx])))
}
and you will see
> f(x)
[[1]]
[1] 1 23
[[2]]
[1] 7 10
[[3]]
[1] 9 2 4
You can use the cumsumbinning built-in function from the MESS package:
# install.packages("MESS")
MESS::cumsumbinning(x, 25, cutwhenpassed = F)
# [1] 1 1 2 2 3 3 3
Or it can be done with purrr::accumulate:
cumsum(x == accumulate(x, ~ifelse(.x + .y <= 25, .x + .y, .y)))
# [1] 1 1 2 2 3 3 3
output
group <- MESS::cumsumbinning(x, 25, cutwhenpassed = F)
data.frame(x= c(1, 23, 7, 10, 9, 2, 4),
group = group)
x group
1 1 1
2 23 1
3 7 2
4 10 2
5 9 3
6 2 3
7 4 3
Quick benchmark:
x<- c(1, 23, 7, 10, 9, 2, 4)
bm <- microbenchmark(
fThomas(x),
fThomasRec(x),
fJKupzig(x),
fCumsumbinning(x),
fAccumulate(x),
fReduce(x),
fRcpp(x),
times = 100L,
setup = gc(FALSE)
)
autoplot(bm)
Егор Шишунов's Rcpp is the fastest, closely followed by MESS::cumsumbinning and ThomasIsCoding's both functions.
With n = 100, the gap gets bigger but Rcpp and cumsumbinning are still the top choices and the while loop option is no longer efficient (I had to remove ThomasIsCoding's functions because the execution time was too long):
x = runif(100, 1, 50)
In base R you could also use Reduce:
do.call(rbind, Reduce(\(x,y) if((z<-x[1] + y) > 25) c(y, x[2]+1)
else c(z, x[2]), x[-1], init = c(x[1], 1), accumulate = TRUE))
[,1] [,2]
[1,] 1 1
[2,] 24 1
[3,] 7 2
[4,] 17 2
[5,] 9 3
[6,] 11 3
[7,] 15 3
Breaking it down:
f <- function(x, y){
z <- x[1] + y
if(z > 25) c(y, x[2] + 1)
else c(z, x[2])
}
do.call(rbind, Reduce(f, x[-1], init = c(x[1], 1), accumulate = TRUE))
if using accumulate
library(tidyverse)
accumulate(x[-1], f, .init = c(x[1], 1)) %>%
invoke(rbind, .)
[,1] [,2]
[1,] 1 1
[2,] 24 1
[3,] 7 2
[4,] 17 2
[5,] 9 3
[6,] 11 3
[7,] 15 3
Here is a solution using base R and cumsum (and lapply for iteration):
id <- c(seq(1, length(x),1)[!duplicated(cumsum(x) %/% 25)], length(x)+1)
id2 <- 1:length(id)
group <- unlist(lapply(1:(length(id)-1), function(x) rep(id2[x], diff(id)[x])))
data.frame(x=x, group=group)
x group
1 1 1
2 23 1
3 7 2
4 10 2
5 9 3
6 2 3
7 4 3
Edit: New Approach using recursive function
Here is a new more efficient approach that should also cover the special case which #ЕгорШишунов considered and should work efficiently because it's written as a recursive function.
recursiveFunction<- function(x, maxN=25, sumX=0, period=1, period2return=c()){
sumX <- sumX + x[1]
if (sumX >= maxN) { sumX=x[1]; period = period + 1}
period2return <- c(period2return, period)
if (length(x) == 1) { return(period2return)}
return(recursiveFunction(x[-1], 25, sumX, period, period2return))
}
recursiveFunction(x, maxN=25)
Note that you should not change the entries for the last three function parameters (sumX=0, period=1, period2return=c()) because they are only important during the recursive call of the function.

How to store for loop outputs of varying sizes in an initialized vector by indices

Problem Statement
Let's say you have the following data:
df <- data.frame(x = rep(0, 10),
batch = rep(1:3,c(4,2,4)))
x batch
1 0 1
2 0 1
3 0 1
4 0 1
5 0 2
6 0 2
7 0 3
8 0 3
9 0 3
10 0 3
You want to loop over the number of unique batches in your dataset and within each batch, apply an algorithm to generate a vector of 1's and 0's. The algorithm is quite long, so for example's sake, let's say it's a random sample:
set.seed(2021)
for(i in seq_len(length(unique(df$batch)))){
batch_val <- d[which(df$batch == i),]$batch
#some algorithm to generate 1's and 0's, but using sample() here
out_x <- sample(c(0,1), length(batch_val), replace = T)
}
You then want to save out_x into the correct indices in df$x. My current rudimentary approach is to explicitly specify indices:
idxb <- 1
idxe <- length(df[which(df$batch == 1),]$batch)
set.seed(2021)
for(i in seq_len(length(unique(df$batch)))){
batch_val <- d[which(df$batch == i),]$batch
#some algorithm to generate 1's and 0's, but using sample() here
out_x <- sample(c(0,1), length(batch_val), replace = T)
print(out_x)
#save output
df$x[idxb:idxe] <- out_x
#update indices
idxb <- idxb + length(out_X)
if(i < length(unique(df$batch))) {
idxe <- idxe + length(df[which(df$batch == i+1),]$batch)
}
}
Output
The result should look like this:
x batch
1 0 1
2 1 1
3 1 1
4 0 1
5 1 2
6 1 2
7 1 3
8 0 3
9 1 3
10 1 3
where each iteration of out_x looks like this:
[1] 0 1 1 0
[1] 1 1
[1] 1 0 1 1
Question
What is a faster way to implement this while still using base R?
What about using tapply?
out_x <- tapply(df$batch, df$batch, function(x) sample(c(0,1), length(x), replace = T))
#------
$`1`
[1] 0 1 1 1
$`2`
[1] 0 1
$`3`
[1] 1 1 1 1
And then to reassign to df
df$x <- unlist(out_x)
A timing test:
microbenchmark::microbenchmark(f_loop(), f_apply())
#---------
Unit: microseconds
expr min lq mean median uq max neval
f_loop() 399.895 425.1975 442.7077 437.754 450.690 612.969 100
f_apply() 100.449 106.9185 160.5557 110.913 114.909 4867.603 100
Where the functions are defined as
f_loop <- function(){
idxb <- 1
idxe <- length(df[which(df$batch == 1),]$batch)
for(i in seq_len(length(unique(df$batch)))){
batch_val <- df[which(df$batch == i),]$batch
#some algorithm to generate 1's and 0's, but using sample() here
out_x <- sample(c(0,1), length(batch_val), replace = T)
#print(out_x)
#save output
df$x[idxb:idxe] <- out_x
#update indices
idxb <- idxb + length(out_x)
if(i < length(unique(df$batch))) {
idxe <- idxe + length(df[which(df$batch == i+1),]$batch)
}
}
return(df$x)
}
f_apply <- function() {
unlist(tapply(df$batch, df$batch, function(x) sample(c(0,1), length(x), replace = T)))
}
One solution is to remind myself that I can index a vector with a vector!
set.seed(2021)
for(i in seq_len(length(unique(df$batch)))){
batch_val <- d[which(df$batch == i),]$batch
#some algorithm to generate 1's and 0's, but using sample() here
out_x <- sample(c(0,1), length(batch_val), replace = T)
print(out_x)
#save output
idx <- which(df$batch == i)
df$x[idx] <- out_x
}

Why R resets dataframe column after function call?

I made simple R script and run:
f <- data.frame(x = c(1,1))
f$y <- NA
test <- function(n) {
f$y[1] <- 1
print(f)
}
test(0)
print(f)
the result is
> test(0)
x y
1 1 1
2 1 NA
> print(f)
x y
1 1 NA
2 1 NA
>
I would expect in the final output (1, 1) (1, NA) but for some reason f$y[1] had been reset to NA.
When I inline the function, the result are ok.
Why function call has this side effect on dataframe column?
Is it avoidable?
How about this:
f <- data.frame(x = c(1,1))
f$y <- NA
test <- function(f) {
f$y[1] <- 1
return(f)
}
f <- test(f)
print(f)
x y
1 1 1
2 1 NA

storing results of a for function in list or

add <- c( 2,3,4)
for (i in add){
a <- i +3
b <- a + 3
z <- a + b
print(z)
}
# Result
[1] 13
[1] 15
[1] 17
In R, it can print the result, but I want to save the results for further computation in a vector, data frame or list
Thanks in advance
Try something like:
add <- c(2, 3, 4)
z <- rep(0, length(add))
idx = 1
for(i in add) {
a <- i + 3
b <- a + 3
z[idx] <- a + b
idx <- idx + 1
}
print(z)
This is simple algebra, no need in a for loop at all
res <- (add + 3)*2 + 3
res
## [1] 13 15 17
Or if you want a data.frame
data.frame(a = add + 3, b = add + 6, c = (add + 3)*2 + 3)
# a b c
# 1 5 8 13
# 2 6 9 15
# 3 7 10 17
Though in general, when you are trying to something like that, it is better to create a function, for example
myfunc <- function(x) {
a <- x + 3
b <- a + 3
z <- a + b
z
}
myfunc(add)
## [1] 13 15 17
In cases when a loop is actually needed (unlike in your example) and you want to store its results, it is better to use *apply family for such tasks. For example, use lapply if you want a list back
res <- lapply(add, myfunc)
res
# [[1]]
# [1] 13
#
# [[2]]
# [1] 15
#
# [[3]]
# [1] 17
Or use sapply if you want a vector back
res <- sapply(add, myfunc)
res
## [1] 13 15 17
For a data.frame to keep all the info
add <- c( 2,3,4)
results <- data.frame()
for (i in add){
a <- i +3
b <- a + 3
z <- a + b
#print(z)
results <- rbind(results, cbind(a,b,z))
}
results
a b z
1 5 8 13
2 6 9 15
3 7 10 17
If you just want z then use a vector, no need for lists
add <- c( 2,3,4)
results <- vector()
for (i in add){
a <- i +3
b <- a + 3
z <- a + b
#print(z)
results <- c(results, z)
}
results
[1] 13 15 17
It might be instructive to compare these two results with those of #dugar:
> sapply(add, function(x) c(a=x+3, b=a+3, z=a+b) )
[,1] [,2] [,3]
a 5 6 7
b 10 10 10
z 17 17 17
That is the result of lazy evaluation and sometimes trips us up when computing with intermediate values. This next one should give a slightly more expected result:
> sapply(add, function(x) c(a=x+3, b=(x+3)+3, z=(x+3)+((x+3)+3)) )
[,1] [,2] [,3]
a 5 6 7
b 8 9 10
z 13 15 17
Those results are the transpose of #dugar. Using sapply or lapply often saves you the effort off setting up a zeroth case object and then incrementing counters.
> lapply(add, function(x) c(a=x+3, b=(x+3)+3, z=(x+3)+((x+3)+3)) )
[[1]]
a b z
5 8 13
[[2]]
a b z
6 9 15
[[3]]
a b z
7 10 17

How do you make relative references in R?

I have this data frame:
A <- c(10, 20, 30, 40, 5)
B <- c(5, 0, 0, 0, 0)
df = data.frame(A, B)
And I want to replace the 0's in B with the sum of A and B[i-1]. I have searched everywhere, but I feel like I am missing something really basic. This is my desired result:
A B
1 10 5
2 20 25
3 30 55
4 40 95
5 5 100
I have tried this, but it didn't work:
for(i in 2:length(df)){
df$B <- A[i] + B[i-1]
}
In Excel, this would be something like B$2 = A$2 + B$1. I cannot figure out how to do this in R. Any help would be greatly appreciated since I feel like I am missing something basic. Thanks!
You were very close. Try this:
for(i in 2:nrow(df)){
df$B[i] <- df$A[i] + df$B[i-1]
}
And to expand to those comments, could something like this work?
for(i in 2:nrow(df)){
if((df$A[i] + df$B[i-1]) > 60) df$B[i] <- df$B[i-1] else{
df$B[i] <- df$A[i] + df$B[i-1]}
}
# Data
# I changed one of the later values of B to non-zero to confirm that only
# the zero values of B were getting changed
A <- c(10, 20, 30, 40, 5)
B <- c(5, 0, 0, 10, 0)
(df = data.frame(A, B))
# A B
# 1 10 5
# 2 20 0
# 3 30 0
# 4 40 0
# 5 5 10
for(i in 2:nrow(df)) {
if(df$B[i]==0) df$B[i] <- df$A[i] + df$B[i-1]
if(df$B[i] >= 60) df$B[i] <- df$B[i-1]
}
df
# A B
# 1 10 5
# 2 20 25
# 3 30 55
# 4 40 55
# 5 5 10

Resources