Can an if statement have more than one then statements?
# this works
a <- c(1,2,3,4,5,6,7,8,9,10)
b <- c(4,3,5,2,8,9,1,2,2,4)
c <- c(9,9,9,5,5,5,2,2,2,1)
for(i in 1:10) { if(c[i]==2) a[i]= 100; if(c[i]==2) b[i]= -99 }
# this does not work
d <- c(1,2,3,4,5,6,7,8,9,10)
e <- c(4,3,5,2,8,9,1,2,2,4)
f <- c(9,9,9,5,5,5,2,2,2,1)
for(i in 1:10) { if(f[i]==2) (d[i]= 100 & e[i]= -99) }
You need to have each statement in a separate line (delimited by ;) and the whole execution block enclosed within curly braces
for(i in 1:10) { if(f[i]==2) {d[i]= 100; e[i]= -99} }
You're probably confusing if-expressions (a.k.a. ternary operators) with if-statements. In the latter, you usually have a brace-enclosed block of statements, which are delimited by semicolons or newlines:
R> for(i in 1:10) if(f[i]==2) { d[i]= 100; e[i]= -99 }
R> d
[1] 1 2 3 4 5 6 100 100 100 10
R> e
[1] 4 3 5 2 8 9 -99 -99 -99 4
Also, here is a somewhat faster equivalent:
a[which(c==2)] = 100
b[which(c==2)] = -99
Yes you can as others have mentioned. It's also more clear if you place things on new lines and use some indentation convention. For instance I might write your code like
a <- 1:10
b <- c(4, 3, 5, 2, 8, 9, 1, 2, 2, 4)
c <- c(9, 9, 9, 5, 5, 5, 2, 2, 2, 1)
for(i in 1:10){
if(c[i] == 2){
a[i] <- 100
b[i] <- -99
}
}
Related
I really need some help to write a recursion in R.
The function that I want changes a certain observation according to a set of comparisons between different rows in a data frame, which I shall call g. One of these comparisons depends on the previous value of this same observation.
Suppose first that I want to update the value of column index, row i in my data df in the following way:
j <- 1:4
g <- (df$dom[i] > 0 &
abs(df$V2009[i] - df$V2009[j]) <= w) |
df$index[i] == df$index[j]
df$index[i] <- ifelse(any(g), which(g)[[1]], df$index[[i]])
The thing is, the object w is actually a list:
w = list(0, 1, 2, df$age[i])
So, as you can see, I want to create a function foo() that updates df$index iteratively. It changes it by looping through w and comparisons depend on updated values.
Here is some data:
df <- data.frame(dom = c(0, 0, 6, 6),
V2009 = c(9, 11, 9, 11),
index = c(1, 2, 1, 2),
age = c(2, 2, 2, 2))
I am not sure if a recursive function is actually needed or if something like reduce or map would do it.
Thank you!
The following function uses a double for loop to change the values of column index according to the condition defining g. It accepts a data.frame as input and returns the updated data.frame.
foo <- function(x){
change_index <- function(x, i, w){
j <- seq_len(nrow(x))
(x$dom[i] > 0 & abs(x$V2009[i] - x$V2009[j]) <= w) |
x$index[i] == x$index[j]
}
for(i in seq_len(nrow(x))){
W <- list(0, 1, 2, x$age[i])
for(w in W){
g <- change_index(x, i, w)
if(any(g)) x$index[i] <- which(g)[1]
}
}
x
}
foo(df)
# dom V2009 index age
#1 0 9 1 2
#2 0 11 2 2
#3 6 9 1 2
#4 6 11 1 2
One can define w inside a function and use lexical scoping (closure).
Using your instructions, the function index_value calculates for any given i the index value.
correct_index_col returns the corrected df.
df <- data.frame(dom = c(0, 0, 6, 6),
V2009 = c(9, 11, 9, 11),
index = c(1, 2, 1, 2),
age = c(2, 2, 2, 2))
index_value <- function(df, i) {
j <- nrow(df)
w <- c(0, 1, 2, df$age[i])
g <- (df$dom[i] > 0 & abs(df$V2009[i] - df$V2009[j]) <= w) |
df$index[i] == df$index[j]
ifelse(any(g), which(g)[[1]], df$index[[i]])
}
correct_index_col <- function(df) {
indexes <- Vectorize(function(i) {
index_value(df, i)
})
df$index <- indexes(1:nrow(df))
df
}
# > correct_index_col(df)
# dom V2009 index age
# 1 0 9 1 2
# 2 0 11 1 2
# 3 6 9 3 2
# 4 6 11 1 2
#
If you want to really update (mutate) your df, then you have to do
df <- correct_index_col(df).
Here is an attempt of my own. I guess I figured out a way to use recursion over mutate:
test <- function(i, df, k){
j <- 1:nrow(df)
w <- list(0, 1, 2, df$age[i])
g <- (df$dom[i] > 0 & abs(df$V2009[i] - df$V2009[j]) <= w[k]) |
df$index[i] == df$index[j]
l <- ifelse(any(g), which(g)[1], df$index[i])
return(l)
}
loop <- function(data,
k = 1) {
data <- data %>%
mutate(index = map_dbl(seq(n()),
~ test(.x, df = cur_data(), k)))
if (k == 4) {
return(data)
} else {
return(loop(data, k + 1))
}
}
df %>% loop()
I welcome any comments in case this is inefficient considering large datasets
Assume one wished to calculate a cumulative sum based on a multivariate condition, all(Z[i] <= x), for all i over a multivariate grid x. One may obviously implement this naively
cSums <- numeric(nrow(x))
for(i in seq(nrow(x))){
for(j in seq(nrow(Z))){
if(all(Z[j, ] <= x[i, ]))
cSums[i] <- cSums[i] + R[j] # <== R is a single vector to be summed
}
}
which would be somewhere around O((n*p)^2), or slightly faster by iteratively subsetting the columns
cSums <- numeric(nrow(x))
for(i in seq(nrow(x))){
indx <- seq(nrow(Z))
for(j in seq(ncol(Z))){
indx <- indx[which(Z[indx, j] <= x[i, j])]
}
cSums[i] <- sum(R[indx])
}
but this still worst-case as slow as the naive-implementation. How could one improve this to achieve faster performance, while still allowing an undefined number of columns to be compared?
Dummy data and Reproducible example
var1 <- c(3,3,3,5,5,5,4,4,4,6)
var2 <- rep(seq(1,5), each = 2)
Z <- cbind(var1, var2)
x <- Z
R <- rep(1, nrow(x))
# Result using either method.
#[1] 2 2 3 4 6 6 5 5 6 10
outer is your friend, just Vectorize your comparison. colSums yields the desired result then. Should be fast.
f <- Vectorize(function(k, l) all(Z[k, ] <= x[l, ]))
res <- colSums(outer(1:nrow(Z), 1:nrow(x), f))
res
# [1] 2 2 3 4 6 6 5 5 6 10
Data
x <- Z <- structure(c(3, 3, 3, 5, 5, 5, 4, 4, 4, 6, 1, 1, 2, 2, 3, 3, 4,
4, 5, 5), .Dim = c(10L, 2L), .Dimnames = list(NULL, c("var1",
"var2")))
We can use apply row-wise and compare every row with every other row and count how many of them satidy the criteria.
apply(Z, 1, function(x) sum(rowSums(Z <= as.list(x)) == length(x)))
#[1] 2 2 3 4 6 6 5 5 6 10
Similar approach can also be performed using sapply + split
sapply(split(Z, seq_len(nrow(Z))), function(x)
sum(rowSums(Z <= as.list(x)) == length(x)))
data
var1 <- c(3,3,3,5,5,5,4,4,4,6)
var2 <- rep(seq(1,5), each = 2)
Z <- data.frame(var1, var2)
I have a vector:
as <- c(1,2,3,4,5,9)
I need to extract the first continunous sequence in the vector, starting at index 1, such that the output is the following:
1 2 3 4 5
Is there a smart function for doing this, or do I have to do something not so elegant like this:
a <- c(1,2,3,4,5,9)
is_continunous <- c()
for (i in 1:length(a)) {
if(a[i+1] - a[i] == 1) {
is_continunous <- c(is_continunous, i)
} else {
break
}
}
continunous_numbers <- c()
if(is_continunous[1] == 1) {
is_continunous <- c(is_continunous, length(is_continunous)+1)
continunous_numbers <- a[is_continunous]
}
It does the trick, but I would expect that there is a function that can already do this.
It isn't clear what you need if the index of the continuous sequence only if it starts at index one or the first sequence, whatever the beginning index is.
In both case, you need to start by checking the difference between adjacent elements:
d_as <- diff(as)
If you need the first sequence only if it starts at index 1:
if(d_as[1]==1) 1:(rle(d_as)$lengths[1]+1) else NULL
# [1] 1 2 3 4 5
rle permits to know lengths and values for each consecutive sequence of same value.
If you need the first continuous sequence, whatever the starting index is:
rle_d_as <- rle(d_as)
which(d_as==1)[1]+(0:(rle_d_as$lengths[rle_d_as$values==1][1]))
Examples (for the second option):
as <- c(1,2,3,4,5,9)
d_as <- diff(as)
rle_d_as <- rle(d_as)
which(d_as==1)[1]+(0:(rle_d_as$lengths[rle_d_as$values==1][1]))
#[1] 1 2 3 4 5
as <- c(4,3,1,2,3,4,5,9)
d_as <- diff(as)
rle_d_as <- rle(d_as)
which(d_as==1)[1]+(0:(rle_d_as$lengths[rle_d_as$values==1][1]))
# [1] 3 4 5 6 7
as <- c(1, 2, 3, 6, 7, 8)
d_as <- diff(as)
rle_d_as <- rle(d_as)
which(d_as==1)[1]+(0:(rle_d_as$lengths[rle_d_as$values==1][1]))
# [1] 1 2 3
A simple way to catch the sequence would be to find the diff of your vector and grab all elements with diff == 1 plus the very next element, i.e.
d1<- which(diff(as) == 1)
as[c(d1, d1[length(d1)]+1)]
NOTE
This will only work If you only have one sequence in your vector. However If we want to make it more general, then I 'd suggest creating a function as so,
get_seq <- function(vec){
d1 <- which(diff(as) == 1)
if(all(diff(d1) == 1)){
return(c(d1, d1[length(d1)]+1))
}else{
d2 <- split(d1, cumsum(c(1, diff(d1) != 1)))[[1]]
return(c(d2, d2[length(d2)]+1))
}
}
#testing it
as <- c(3, 5, 1, 2, 3, 4, 9, 7, 5, 4, 5, 6, 7, 8)
get_seq(as)
#[1] 3 4 5 6
as <- c(8, 9, 10, 11, 1, 2, 3, 4, 7, 8, 9, 10)
get_seq(as)
#[1] 1 2 3 4
as <- c(1, 2, 3, 4, 5, 6, 11)
get_seq(as)
#[1] 1 2 3 4 5 6
I have two vectors, A and B. For every element in A I want to find the index of the first element in B that is greater and has higher index. The length of A and B are the same.
So for vectors:
A <- c(10, 5, 3, 4, 7)
B <- c(4, 8, 11, 1, 5)
I want a result vector:
R <- c(3, 3, 5, 5, NA)
Of course I can do it with two loops, but it's very slow, and I don't know how to use apply() in this situation, when the indices matter. My data set has vectors of length 20000, so the speed is really important in this case.
A few bonus questions:
What if I have a sequence of numbers (like seq = 2:10), and I want to find the first number in B that is higher than a+s for every a of A and every s of seq.
Like with question 1), but I want to know the first greater, and the first lower value, and create a matrix, which stores which one was first. So for example I have a of A, and 10 from seq. I want to find the first value of B, which is higher than a+10, or lower than a-10, and then store it's index and value.
sapply(sapply(seq_along(a),function(x) which(b[-seq(x)]>a[x])+x),"[",1)
[1] 3 3 5 5 NA
This is a great example of when sapply is less efficient than loops.
Although the sapply does make the code look neater, you are paying for that neatness with time.
Instead you can wrap a while loop inside a for loop inside a nice, neat function.
Here are benchmarks comparing a nested-apply loop against nested for-while loop (and a mixed apply-while loop, for good measure). Update: added the vapply..match.. mentioned in comments. Faster than sapply, but still much slower than while loop.
BENCHMARK:
test elapsed relative
1 for.while 0.069 1.000
2 sapply.while 0.080 1.159
3 vapply.match 0.101 1.464
4 nested.sapply 0.104 1.507
Notice you save a third of your time; The savings will likely be larger when you start adding the sequences to A.
For the second part of your question:
If you have this all wrapped up in an nice function, it is easy to add a seq to A
# Sample data
A <- c(10, 5, 3, 4, 7, 100, 2)
B <- c(4, 8, 11, 1, 5, 18, 20)
# Sample sequence
S <- seq(1, 12, 3)
# marix with all index values (with names cleaned up)
indexesOfB <- t(sapply(S, function(s) findIndx(A+s, B)))
dimnames(indexesOfB) <- list(S, A)
Lastly, if you want to instead find values of B less than A, just swap the operation in the function.
(You could include an if-clause in the function and use only a single function. I find it more efficient
to have two separate functions)
findIndx.gt(A, B) # [1] 3 3 5 5 6 NA 8 NA NA
findIndx.lt(A, B) # [1] 2 4 4 NA 8 7 NA NA NA
Then you can wrap it up in one nice pacakge
rangeFindIndx(A, B, S)
# A S indxB.gt indxB.lt
# 10 1 3 2
# 5 1 3 4
# 3 1 5 4
# 4 1 5 NA
# 7 1 6 NA
# 100 1 NA NA
# 2 1 NA NA
# 10 4 6 4
# 5 4 3 4
# ...
FUNCTIONS
(Notice they depend on reshape2)
rangeFindIndx <- function(A, B, S) {
# For each s in S, and for each a in A,
# find the first value of B, which is higher than a+s, or lower than a-s
require(reshape2)
# Create gt & lt matricies; add dimnames for melting function
indexesOfB.gt <- sapply(S, function(s) findIndx.gt(A+s, B))
indexesOfB.lt <- sapply(S, function(s) findIndx.lt(A-s, B))
dimnames(indexesOfB.gt) <- dimnames(indexesOfB.gt) <- list(A, S)
# melt the matricies and combine into one
gtltMatrix <- cbind(melt(indexesOfB.gt), melt(indexesOfB.lt)$value)
# clean up their names
names(gtltMatrix) <- c("A", "S", "indxB.gt", "indxB.lt")
return(gtltMatrix)
}
findIndx.gt <- function(A, B) {
lng <- length(A)
ret <- integer(0)
b <- NULL
for (j in seq(lng-1)) {
i <- j + 1
while (i <= lng && ((b <- B[[i]]) < A[[j]]) ) {
i <- i + 1
}
ret <- c(ret, ifelse(i<lng, i, NA))
}
c(ret, NA)
}
findIndx.lt <- function(A, B) {
lng <- length(A)
ret <- integer(0)
b <- NULL
for (j in seq(lng-1)) {
i <- j + 1
while (i <= lng && ((b <- B[[i]]) > A[[j]]) ) { # this line contains the only difference from findIndx.gt
i <- i + 1
}
ret <- c(ret, ifelse(i<lng, i, NA))
}
c(ret, NA)
}
Let I have an array like
a <- seq(1, 100, 1)
and I want to select just the elements that occur each 3 steps with a for() loop starting from the second one, e.g. 2, 5, 8, 11 and so on.
How should I use for() in this case?
b <- NULL
# for(i in 1:length(a)) { # Is there any additional argument?
# b[i] <- a[...] # Or I can just multiply 'i' by some integer?
# }
Thanks,
Use 3 as the value for by in seq
for (i in seq(2, length(a), by=3)) {}
> seq(2, 11, 3)
[1] 2 5 8 11
Why use for ?
b <- a[seq(2,length(a),3)]