Let's imagine that we have a data frame with descriptions of several individuals:
des <- c('mad', 'crazy', 'stupid', 'crazy', 'wise', 'dumb', 'mad', 'furious')
id <- c(1, 2, 3, 4, 5, 6, 7, 8)
d <- data.frame(id, des)
d$dangerous <- NA
dan <-c('mad','crazy','furious')
We want to match d$des against the descriptions in the vector dan.
I prepared the following loop:
for (i in 1:nrow(d)){
  for (j in 1:length(dan)){
    if (d$des[i] == dan[j]) {
      d$dangerous[i] <- 1
    }
  }
}
d
id des dangerous
1 1 mad 1
2 2 crazy 1
3 3 stupid NA
4 4 crazy 1
5 5 wise NA
6 6 dumb NA
7 7 mad 1
8 8 furious 1
The code works well; however, I wonder how to optimize it so that it can handle longer vectors and larger data frames. Any ideas?
Here are timings of the several solutions, plus a solution of mine.
I have timed the functions with the original data.frame d and with a bigger one, since the OP says this is an optimization problem.
OP <- function(DF, dan){
  DF$dangerous <- NA
  for (i in 1:nrow(DF)){
    for (j in 1:length(dan)){
      if (DF$des[i] == dan[j]) DF$dangerous[i] <- 1
    }
  }
  DF
}
Carles <- function(DF, dan){
  DF$dangerous <- ifelse(DF$des %in% dan, 1, NA)
  DF
}
arg0naut91_1 <- function(DF, dan){
  DF$dangerous <- NA
  transform(DF, dangerous = replace(dangerous, des %in% dan, 1))
}
arg0naut91_2 <- function(DF, dan){
  DF$dangerous <- NA
  DF$dangerous[DF$des %in% dan] <- 1
  DF
}
Rui <- function(DF, dan){
  DF$dangerous <- c(NA, 1)[(DF$des %in% dan) + 1]
  DF
}
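As an aside, the Rui variant works by turning the logical test into indices: FALSE + 1 is 1 and TRUE + 1 is 2, so each element picks either NA or 1 out of a two-element lookup vector, with no branching. A minimal sketch of the trick:
flags <- c(TRUE, FALSE, TRUE)
flags + 1            # [1] 2 1 2 -- logicals coerced to indices
c(NA, 1)[flags + 1]  # [1]  1 NA  1 -- TRUE selects 1, FALSE selects NA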
library(microbenchmark)
mb <- microbenchmark(
OP = OP(d, dan),
Carles = Carles(d, dan),
Rui = Rui(d, dan),
arg0naut91_1 = arg0naut91_1(d, dan),
arg0naut91_2 = arg0naut91_2(d, dan)
)
print(mb, order = "median")
#Unit: microseconds
# expr min lq mean median uq max neval cld
# Rui 22.623 25.1865 82.73746 27.2510 31.6630 5441.491 100 a
# Carles 31.740 34.4120 76.82339 36.9385 42.1760 3753.407 100 a
# arg0naut91_2 34.131 36.7140 89.10827 39.5925 46.6930 4577.938 100 a
# arg0naut91_1 226.237 230.1020 296.23198 234.6225 243.3040 4847.553 100 a
# OP 757.831 770.1875 926.88995 781.5630 818.2745 10992.040 100 b
e <- d
for(i in 1:10) e <- rbind(e, e)
mb2 <- microbenchmark(
OP = OP(e, dan),
Carles = Carles(e, dan),
Rui = Rui(e, dan),
arg0naut91_1 = arg0naut91_1(e, dan),
arg0naut91_2 = arg0naut91_2(e, dan),
times = 10
)
print(mb2, order = "median")
#Unit: microseconds
# expr min lq mean median uq max neval cld
# Rui 291.090 294.690 346.3638 298.9580 301.238 776.769 10 a
# arg0naut91_2 288.123 292.236 312.6684 311.2435 314.495 388.212 10 a
# Carles 427.500 430.120 447.7170 450.2570 453.884 480.424 10 a
# arg0naut91_1 513.059 517.822 611.0255 666.7095 670.059 688.023 10 a
# OP 898781.320 909717.469 911988.3906 914269.7245 916975.858 919223.886 10 b
Using ifelse() with %in% will do the trick:
d$dangerous <- ifelse(d$des %in% dan, 1, NA)
> d
id des dangerous
1 1 mad 1
2 2 crazy 1
3 3 stupid NA
4 4 crazy 1
5 5 wise NA
6 6 dumb NA
7 7 mad 1
8 8 furious 1
Another option:
transform(d, dangerous = replace(dangerous, des %in% dan, 1))
id des dangerous
1 1 mad 1
2 2 crazy 1
3 3 stupid NA
4 4 crazy 1
5 5 wise NA
6 6 dumb NA
7 7 mad 1
8 8 furious 1
Or:
d$dangerous[d$des %in% dan] <- 1
I would like to replace up to n consecutive NA values in a vector with the latest non-NA value.
For example, if:
a <- c(1,NA,NA,NA,NA,NA,2,NA,1,NA,NA,NA)
n <- 2
I would like to obtain:
c(1,1,1,NA,NA,NA,2,2,1,1,1,NA)
(n is the maximum number of NA values that can be replaced by a given element.)
I know the na.locf() function from the zoo package, but I don't know how to set the limit n. Is it possible to do this?
Here's an option using na.locf and rle:
library(zoo)
r <- rle(is.na(a))
a <- na.locf(a)
is.na(a) <- sequence(r$lengths) > n & rep(r$values, r$lengths)
a
# [1] 1 1 1 NA NA NA 2 2 1 1 1 NA
So here I first computed the run lengths of the elements in a (including the NA entries), then replaced all NAs using na.locf, and finally turned those elements back into NA where the run length was greater than n and the element was originally NA.
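To make the mechanics concrete, here is a walk-through of the intermediates on a fresh copy of the example vector, with n = 2:
a <- c(1, NA, NA, NA, NA, NA, 2, NA, 1, NA, NA, NA)
n <- 2
r <- rle(is.na(a))
r$lengths                 # [1] 1 5 1 1 1 3 -- run lengths, NA runs included
sequence(r$lengths)       # [1] 1 1 2 3 4 5 1 1 1 1 2 3 -- position within each run
rep(r$values, r$lengths)  # TRUE wherever the original element was NA
# An element is turned back to NA when it was NA originally AND sits more
# than n positions deep in its run:
sequence(r$lengths) > n & rep(r$values, r$lengths)
# [1] FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE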
As another idea, we can find, for each position of "a", the index of the most recent non-NA element:
is = seq_along(a)
i = cummax((!is.na(a)) * is)
i
# [1] 1 1 1 1 1 1 7 7 9 9 9 9
Replace the last non-NA index with the current index if last non-NA is more than "n" steps away:
wh = (is - i) > n
i[wh] = is[wh]
i
# [1] 1 1 1 4 5 6 7 7 9 9 9 12
And subset "a":
a[i]
# [1] 1 1 1 NA NA NA 2 2 1 1 1 NA
You could do this using split and replace in base R:
f <- function(a, n) {
  # split the vector based on the position of non-NA values
  l <- split(a, cumsum(seq_along(a) %in% which(!is.na(a))))
  unlist(lapply(l, function(r) replace(r, 1:(n+1), r[1])[seq_along(r)]), use.names = FALSE)
}
f(a, n = 2)
#[1] 1 1 1 NA NA NA 2 2 1 1 1 NA
f(a, n = 3)
#[1] 1 1 1 1 NA NA 2 2 1 1 1 1
Benchmarking (randomly generated vector of size 7467):
library(microbenchmark)
library(dplyr)
library(zoo)
set.seed(123)
a <- unlist(replicate(1000, c(sample(10, 2), rep(NA, sample.int(10, 1)))))
length(a)
# [1] 7467
n <- 3
f_989 <- function(a, n) {
  # split the vector based on the position of non-NA values
  l <- split(a, cumsum(seq_along(a) %in% which(!is.na(a))))
  unlist(lapply(l, function(r) replace(r, 1:(n+1), r[1])[seq_along(r)]), use.names = FALSE)
}
f_zx8754 <- function(a, n)
  data.frame(a) %>%
    mutate(gr = cumsum(!is.na(a))) %>%
    group_by(gr) %>%
    mutate(res = if_else(row_number() <= n + 1, na.locf(a), a)) %>%
    .$res
f_docendo_discimus <- function(a, n){
  r <- rle(is.na(a))
  a <- na.locf(a)
  is.na(a) <- sequence(r$lengths) > n & rep(r$values, r$lengths)
  a
}
f_akrun <- function(a, n)
  ave(a, cumsum(!is.na(a)), FUN = function(x) replace(x, pmin(length(x), seq(n+1)), x[1]))
f_alexis_laz <- function(a, n){
  is = seq_along(a)
  i = cummax((!is.na(a)) * is)
  wh = (is - i) > n
  i[wh] = is[wh]
  a[i]
}
r <- f_989(a,n)
identical(r, f_zx8754(a,n))
# [1] TRUE
identical(r, f_docendo_discimus(a,n))
# [1] TRUE
identical(r, f_akrun(a,n))
# [1] TRUE
identical(r, f_alexis_laz(a,n))
# [1] TRUE
res <- microbenchmark("f1"=f_989(a,n), "f2"=f_zx8754(a,n),
"f3"=f_docendo_discimus(a,n), "f4"=f_akrun(a,n), "f5"=f_alexis_laz(a,n))
print(res, order="mean")
# Unit: microseconds
# expr min lq mean median uq max neval
# f5 129.804 137.014 161.106 141.6715 151.7375 1031.511 100
# f3 1249.351 1354.215 1471.478 1392.9750 1482.2140 2553.086 100
# f1 4736.895 5093.852 5630.367 5345.3450 6069.9260 8848.513 100
# f4 22165.601 23936.866 24660.990 24485.6725 24883.6440 29453.177 100
# f2 205854.339 215582.174 221524.448 218643.9540 224211.0435 261512.922 100
We can use a base R approach: create a grouping variable with cumsum and diff, then use that grouping variable in ave to replace the NA values according to the condition given by 'n':
ave(a, cumsum(c(TRUE, diff(is.na(a)) < 0)),
FUN = function(x) replace(x, is.na(x) & seq_along(x) <= n + 1, x[1]))
#[1] 1 1 1 NA NA NA 2 2 1 1 1 NA
Or a more compact option:
ave(a, cumsum(!is.na(a)), FUN = function(x) replace(x, pmin(length(x), seq(n+1)), x[1]))
#[1] 1 1 1 NA NA NA 2 2 1 1 1 NA
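The pmin() part matters for groups shorter than n + 1: seq(n + 1) on its own would index past the end of such a group, so the indices are clamped to the group length. A quick illustration on a group of length 2 with n = 3:
x <- c(2, NA)
pmin(length(x), seq(3 + 1))                    # [1] 1 2 2 2 -- clamped to length(x)
replace(x, pmin(length(x), seq(3 + 1)), x[1])  # [1] 2 2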
Using dplyr::group_by and zoo::na.locf:
library(dplyr)
library(zoo)
data.frame(a) %>%
  mutate(gr = cumsum(!is.na(a))) %>%
  group_by(gr) %>%
  mutate(res = if_else(row_number() <= n + 1, na.locf(a), a)) %>%
  .$res
# [1] 1 1 1 NA NA NA 2 2 1 1 1 NA
I have a dataframe with three columns:
set.seed(123)
df <- data.frame(x = abs(rnorm(10)), y = abs(rnorm(10)), z = abs(rnorm(10)))
df
x y z
1 0.56047565 1.2240818 1.0678237
2 0.23017749 0.3598138 0.2179749
3 1.55870831 0.4007715 1.0260044
4 0.07050839 0.1106827 0.7288912
5 0.12928774 0.5558411 0.6250393
6 1.71506499 1.7869131 1.6866933
7 0.46091621 0.4978505 0.8377870
8 1.26506123 1.9666172 0.1533731
9 0.68685285 0.7013559 1.1381369
10 0.44566197 0.4727914 1.2538149
I want to construct a data frame with the same number of rows, where each row contains the column names of df ordered by the corresponding row values in df. I have a for-loop-based approach that works but is too slow for a large data frame; I am looking for a faster, vectorized approach. Here is the for-loop-based approach:
df_names <- df
df_names[,] <- NA
df_names
x y z
1 NA NA NA
2 NA NA NA
3 NA NA NA
4 NA NA NA
5 NA NA NA
6 NA NA NA
7 NA NA NA
8 NA NA NA
9 NA NA NA
10 NA NA NA
for (r in 1:nrow(df)) {
  sorted_row <- sort(df[r, ], decreasing = TRUE)
  df_names[r, ] <- colnames(sorted_row)
}
df_names
x y z
1 y z x
2 y x z
3 x z y
4 z y x
5 z y x
6 y x z
7 z y x
8 y x z
9 z y x
10 z y x
How do I do this faster using the apply family or vectorization?
Revised: I merged all attempts and corrections by @rawr; @rawr's approach was the best so far, with a 30x speedup. @989 then added a much faster approach. See the accepted answer by @989.
library(microbenchmark)
set.seed(123)
df <- data.frame(x = abs(rnorm(1000)), y = abs(rnorm(1000)), z = abs(rnorm(1000)))
get_name_df_with_for = function(df) {
  df_names <- df
  df_names[,] <- NA
  for (r in 1:nrow(df)) {
    df_names[r, ] <- colnames(sort(df[r, ], decreasing = TRUE))
  }
  return(df_names)
}
get_name_df_with_apply = function(df) {
  df_names <- data.frame(t(apply(df, 1, function(row) names(sort(row, decreasing = TRUE)))))
  return(df_names)
}
get_name_df_with_apply_names = function(df) {
  df_names <- data.frame(t(apply(df, 1, function(row) names(row)[order(row, decreasing = TRUE)])))
  return(df_names)
}
get_name_df_double_t = function(df) {
  df_names <- data.frame(t(apply(t(df), 2, function(col) names(sort(col, decreasing = TRUE)))))
  return(df_names)
}
microbenchmark(
"for" = get_name_df_with_for(df),
"double_transpose" = get_name_df_double_t(df),
"apply" = get_name_df_with_apply(df),
"apply_with_names" = get_name_df_with_apply_names(df),
times = 10
)
Unit: milliseconds
expr min lq mean median uq max neval
for 417.08341 424.37019 446.00655 451.67451 459.64900 480.33351 10
double_transpose 28.46577 29.96637 32.44685 33.02763 33.51309 36.77468 10
apply 27.54800 28.27331 38.02239 30.36667 37.29727 71.46596 10
apply_with_names 12.35264 12.59502 14.16868 13.92946 15.80656 17.22005 10
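A plausible explanation for why apply_with_names wins: order() only has to rank the bare numeric values, while sort() on each extracted row (a named vector) must also reorder the names attribute on every call. Compared on a single row:
row <- c(x = 0.56, y = 1.22, z = 1.07)
names(sort(row, decreasing = TRUE))        # [1] "y" "z" "x"
names(row)[order(row, decreasing = TRUE)]  # [1] "y" "z" "x" -- same result, less work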
If the number of columns in your df is just three, here is a faster solution using max.col. It is about 8x faster than the fastest solution proposed in the other answer when nrow(df) = 100.
The case in which nrow(df) = 100:
library(microbenchmark)
set.seed(123)
size <- 100
df <- data.frame(x = abs(rnorm(size)), y = abs(rnorm(size)), z = abs(rnorm(size)))
f1 <- function(df){
  vec <- unlist(t(df))
  sq <- seq(0, (nrow(df) - 1) * 3, 3)
  m1 <- max.col(df)
  # -----------------------
  vec[sq + m1] <- -Inf
  m2 <- max.col(matrix(vec, ncol = 3, byrow = TRUE))
  vec[sq + m2] <- -Inf
  # -----------------------
  m3 <- max.col(matrix(vec, ncol = 3, byrow = TRUE))
  nm <- names(df)
  cbind(nm[m1], nm[m2], nm[m3])
}
all(f1(df)==get_name_df_with_for(df))
# [1] TRUE
all(f1(df)==get_name_df_with_apply(df))
# [1] TRUE
all(f1(df)==get_name_df_with_apply_names(df))
# [1] TRUE
all(f1(df)==get_name_df_double_t(df))
# [1] TRUE
microbenchmark(f1(df), "f2"=get_name_df_with_for(df), "f3"=get_name_df_with_apply(df),
"f4"=get_name_df_with_apply_names(df), "f5"=get_name_df_double_t(df))
# Unit: microseconds
# expr min lq mean median uq max neval
# f1(df) 395.643 458.0905 470.8278 472.633 492.7355 701.464 100
# f2 59262.146 61773.0865 63098.5840 62963.223 64309.4780 74246.953 100
# f3 5491.521 5637.1605 6754.3912 5801.619 5956.4545 90457.611 100
# f4 3392.689 3463.9055 3603.1546 3569.125 3707.2795 4237.012 100
# f5 5513.335 5636.3045 5954.9277 5781.089 5971.2115 8622.017 100
Significantly faster when nrow(df) = 1000:
# Unit: microseconds
# expr min lq mean median uq max neval
# f1(df) 693.765 769.8995 878.3698 815.6655 846.4615 3559.929 100
# f2 627876.429 646057.8155 671925.4799 657768.6270 694047.9940 797900.142 100
# f3 49570.397 52038.3515 54334.0501 53838.8465 56181.0515 62517.965 100
# f4 28892.611 30046.8180 31961.4085 31262.4040 33057.5525 48694.850 100
# f5 49866.379 51491.7235 54413.8287 53705.3970 55962.0575 75287.600 100
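For reference, the idea behind f1 is repeated masking: max.col() finds the column of each row's largest value in one vectorized pass; overwriting those cells with -Inf and calling max.col() again yields the column of the second-largest value, and so on. A minimal illustration on a 2 x 3 matrix:
m <- matrix(c(0.2, 0.9, 0.5,
              0.7, 0.1, 0.3), nrow = 2, byrow = TRUE)
m1 <- max.col(m)                 # [1] 2 1 -- column of the largest value per row
m[cbind(1:nrow(m), m1)] <- -Inf  # mask the winners
m2 <- max.col(m)                 # [1] 3 3 -- column of the second-largest value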
I am a beginner in R and here's my code:
for (i in 1:7){
  testing <- vector(length = (length(yy) - 3))
  if (all(yy[i:(i+2)] == 0))
    testing[i] <- 1
  else
    testing[i] <- NA
}
yy refers to the following vector of length 10:
> yy
[1] 1 0 0 0 0 1 0 1 0 1
testing is like a predict function's output: it predicts a 1 if the previous 3 elements in yy are all 0; if not, it predicts nothing, hence NA. Since yy has a total of 10 elements, testing will have a total of 7 elements (therefore length 7). However, instead of giving me an output of 1s and NAs, it is giving this:
> testing
[1] FALSE FALSE FALSE FALSE FALSE FALSE NA
I cannot figure out why; some help would be great. Thank you.
You should define testing outside the loop. As written, testing is re-created on every iteration as a fresh logical vector (whose elements default to FALSE), so the assignments from all earlier iterations are thrown away and only the last one survives:
testing <- vector(length = (length(yy) - 3))
for (i in 1:7){
  if (all(yy[i:(i+2)] == 0))
    testing[i] <- 1
  else
    testing[i] <- NA
}
testing
[1] NA 1 1 NA NA NA NA
For this task you could also use rollapply from zoo (note that it returns one value per window, i.e. 8 values for this yy, whereas the loop above produces length(yy) - 3 = 7 values):
library(zoo)
rollapply(yy, 3, function(x) ifelse(all(x == 0), 1, NA))
[1] NA 1 1 NA NA NA NA NA
Here are some additional vectorized ways of solving this.
Base R stats::filter
N <- 3
NA^(stats::filter(yy == 0, rep(1, N), sides = 1)[-(1:N-1)] != N)
# [1] NA 1 1 NA NA NA NA NA
data.table::shift
NA^(Reduce(`+`, data.table::shift(yy == 0, 0:(N-1)))[-(1:N-1)] != N)
# [1] NA 1 1 NA NA NA NA NA
RcppRoll::roll_sum
NA^(RcppRoll::roll_sum(yy == 0, N) != N)
# [1] NA 1 1 NA NA NA NA NA
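All three rely on the same arithmetic trick: NA^0 is 1 and NA^1 is NA, so raising NA to a logical maps FALSE to 1 and TRUE to NA. Here the logical is "the window's zero-count differs from N", i.e. the window is not all zeros:
NA^c(FALSE, TRUE)   # [1]  1 NA
sums <- c(3, 2, 3)  # say, rolling counts of zeros in windows of width N = 3
NA^(sums != 3)      # [1]  1 NA  1 -- 1 exactly where the window was all zeros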
Some benchmarks (I've also added a compiled version of the for loop using compiler::cmpfun and two more efficient zoo solutions):
ForLoop <- function(yy, N){
  testing <- vector(length = (length(yy) - N))
  for (i in 1:length(testing)){
    if (all(yy[i:(i + (N - 1))] == 0))
      testing[i] <- 1
    else
      testing[i] <- NA
  }
  testing
}
ForLoopBin <- compiler::cmpfun(ForLoop)
ZOO <- function(yy, N) zoo::rollapply(yy, N, function(x) ifelse(all(x == 0), 1, NA))
ZOO2 <- function(yy, N) NA^!zoo::rollapply(yy == 0, N, all)
ZOO3 <- function(yy, N) NA^(zoo::rollsum(yy == 0, N) != N)
RCPPROLL <- function(yy, N) NA^(RcppRoll::roll_sum(yy == 0, N) != N)
BaseFilter <- function(yy, N) NA^(stats::filter(yy == 0, rep(1, N), sides = 1)[-(1:N-1)] != N)
DTShift <- function(yy, N) NA^(Reduce(`+`, data.table::shift(yy == 0, 0:(N-1)))[-(1:N-1)] != N)
set.seed(123)
yy <- sample(0:1, 1e4, replace = TRUE)
N <- 3
library(microbenchmark)
microbenchmark(
"for loop" = ForLoop(yy, N),
"Compiled for loop" = ForLoopBin(yy, N),
"zoo1" = ZOO(yy, N),
"zoo2" = ZOO2(yy, N),
"zoo3" = ZOO3(yy, N),
"Rcpproll" = RCPPROLL(yy, N),
"stats::filter" = BaseFilter(yy, N),
"data.table::shift" = DTShift(yy, N)
)
# Unit: microseconds
# expr min lq mean median uq max neval cld
# for loop 25917.837 26858.936 30157.3927 28546.2595 29334.2430 110135.205 100 d
# Compiled for loop 7559.837 8208.142 9709.7256 8882.6875 9428.9155 22683.347 100 c
# zoo1 101699.548 107857.014 112210.5929 110402.3985 113335.7745 171537.068 100 f
# zoo2 72265.949 77788.709 81275.9028 80292.8135 81917.8985 153197.948 100 e
# zoo3 4584.861 4734.778 4939.3528 4785.9770 4853.6560 13228.514 100 b
# Rcpproll 216.636 246.076 290.7211 290.0745 311.3540 663.667 100 a
# stats::filter 425.912 475.350 536.0757 509.5900 544.6295 1497.568 100 a
# data.table::shift 334.394 365.593 443.2138 409.4325 424.6320 1944.279 100 a
I have a vector in R,
a = c(2,3,4,9,10,2,4,19)
let us say I want to efficiently insert the following vectors, b and d,
b = c(2,1)
d = c(0,1)
right after the 3rd and 7th positions (the "4" entries), resulting in:
e = c(2,3,4,2,1,9,10,2,4,0,1,19)
How would I do this efficiently in R, without repeatedly calling cbind() or the like?
I found the package R.basic, but it's not on CRAN, so I thought about using a supported alternative.
Try this:
result <- vector("list",5)
result[c(TRUE,FALSE)] <- split(a, cumsum(seq_along(a) %in% (c(3,7)+1)))
result[c(FALSE,TRUE)] <- list(b,d)
f <- unlist(result)
identical(f, e)
#[1] TRUE
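The key step is the split(): the grouping variable bumps by one right after each insertion point, cutting a into the chunks that the insertions are then interleaved with:
cumsum(seq_along(a) %in% (c(3,7)+1))
# [1] 0 0 0 1 1 1 1 2
split(a, cumsum(seq_along(a) %in% (c(3,7)+1)))
# $`0`
# [1] 2 3 4
# $`1`
# [1]  9 10  2  4
# $`2`
# [1] 19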
EDIT: generalization to arbitrary number of insertions is straightforward:
insert.at <- function(a, pos, ...){
  dots <- list(...)
  stopifnot(length(dots) == length(pos))
  result <- vector("list", 2*length(pos) + 1)
  result[c(TRUE, FALSE)] <- split(a, cumsum(seq_along(a) %in% (pos + 1)))
  result[c(FALSE, TRUE)] <- dots
  unlist(result)
}
> insert.at(a, c(3,7), b, d)
[1] 2 3 4 2 1 9 10 2 4 0 1 19
> insert.at(1:10, c(4,7,9), 11, 12, 13)
[1] 1 2 3 4 11 5 6 7 12 8 9 13 10
> insert.at(1:10, c(4,7,9), 11, 12)
Error: length(dots) == length(pos) is not TRUE
Note the bonus error checking when the number of positions and insertions do not match.
You can use the following function,
ins(a, list(b, d), pos = c(3, 7))
# [1] 2 3 4 2 1 9 10 2 4 0 1 19
where:
ins <- function(a, to.insert = list(), pos = c()) {
  c(a[seq(pos[1])],
    to.insert[[1]],
    a[seq(pos[1] + 1, pos[2])],
    to.insert[[2]],
    a[seq(pos[2] + 1, length(a))]
  )
}
Here's another function, using Ricardo's syntax, Ferdinand's split, and @Arun's interleaving trick from another question:
ins2 <- function(a, bs, pos){
  as <- split(a, cumsum(seq(a) %in% (pos + 1)))
  idx <- order(c(seq_along(as), seq_along(bs)))
  unlist(c(as, bs)[idx])
}
The advantage is that this should extend to more insertions. However, it may produce weird output when passed invalid arguments, e.g., with any(pos > length(a)) or length(bs)!=length(pos).
You can change the last line to unname(unlist(... if you don't want a's items named.
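For completeness, a quick check that ins2 reproduces the target vector from the question (using a, b, and d as defined there):
unname(ins2(a, list(b, d), pos = c(3, 7)))
# [1]  2  3  4  2  1  9 10  2  4  0  1 19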
The straightforward approach:
b.pos <- 3
d.pos <- 7
c(a[1:b.pos],b,a[(b.pos+1):d.pos],d,a[(d.pos+1):length(a)])
[1] 2 3 4 2 1 9 10 2 4 0 1 19
Note the importance of parenthesis for the boundaries of the : operator.
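To see why the parentheses matter: the : operator binds more tightly than +, so a[b.pos+1:d.pos] would be parsed as a[b.pos + (1:d.pos)] rather than a[(b.pos+1):d.pos]:
b.pos + 1:d.pos    # [1]  4  5  6  7  8  9 10 -- b.pos added to each of 1:7
(b.pos + 1):d.pos  # [1] 4 5 6 7 -- the intended index range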
After using Ferdinand's function, I tried to write my own, and surprisingly it is far more efficient.
Here's mine:
insertElems = function(vect, pos, elems) {
  l = length(vect)
  j = 0
  for (i in 1:length(pos)){
    if (pos[i] == 1)
      vect = c(elems[j+1], vect)
    else if (pos[i] == length(vect) + 1)
      vect = c(vect, elems[j+1])
    else
      vect = c(vect[1:(pos[i] - 1 + j)], elems[j+1], vect[(pos[i] + j):(l + j)])
    j = j + 1
  }
  return(vect)
}
tmp = c(seq(1:5))
insertElems(tmp, c(2,4,5), c(NA,NA,NA))
# [1] 1 NA 2 3 NA 4 NA 5
insert.at(tmp, c(2,4,5), c(NA,NA,NA))
# [1] 1 NA 2 3 NA 4 NA 5
And here's the benchmark result:
> microbenchmark(insertElems(tmp, c(2,4,5), c(NA,NA,NA)), insert.at(tmp, c(2,4,5), c(NA,NA,NA)), times = 10000)
Unit: microseconds
expr min lq mean median uq max neval
insertElems(tmp, c(2, 4, 5), c(NA, NA, NA)) 9.660 11.472 13.44247 12.68 13.585 1630.421 10000
insert.at(tmp, c(2, 4, 5), c(NA, NA, NA)) 58.866 62.791 70.36281 64.30 67.923 2475.366 10000
My code also works better in some cases:
> insert.at(tmp, c(1,4,5), c(NA,NA,NA))
# [1] 1 2 3 NA 4 NA 5 NA 1 2 3
# Warning message:
# In result[c(TRUE, FALSE)] <- split(a, cumsum(seq_along(a) %in% (pos))) :
# number of items to replace is not a multiple of replacement length
> insertElems(tmp, c(1,4,5), c(NA,NA,NA))
# [1] NA 1 2 3 NA 4 NA 5
Here's an alternative that uses append. It's fine for small vectors, but I can't imagine it being efficient for large vectors since a new vector is created upon each iteration of the loop (which is, obviously, bad). The trick is to reverse the vector of things that need to be inserted to get append to insert them in the correct place relative to the original vector.
a = c(2,3,4,9,10,2,4,19)
b = c(2,1)
d = c(0,1)
pos <- c(3, 7)
z <- setNames(list(b, d), pos)
z <- z[order(names(z), decreasing=TRUE)]
for (i in seq_along(z)) {
a <- append(a, z[[i]], after = as.numeric(names(z)[[i]]))
}
a
# [1] 2 3 4 2 1 9 10 2 4 0 1 19