tidy functional way of interweaving rows of a data frame - r

I have written the code below to take two data frames and interweave them by row, based on this example. I believe it uses Bresenham's line algorithm, which evenly disperses the rows of the shorter data frame within the longer one.
library(tibble)  # tibble(), add_row()
library(dplyr)   # slice()
library(purrr)   # reduce(), used below

interleave_rows <- function(x, y) {
  m <- nrow(x)
  yi <- 1
  len <- m + nrow(y)
  err <- len %/% 2
  res <- x
  for (i in 1:len) {
    err <- err - m
    if (err < 0) { err <- err + len } else {
      res <- add_row(res, !!! slice(y, yi), .before = i)
      yi <- yi + 1
    }
  }
  res
}
l <- list(
  a = tibble(n = 1:3, l = letters[1:3]),
  b = tibble(n = 4:9, l = letters[4:9]),
  c = tibble(n = 10:11, l = letters[10:11])
)
reduce(l, interleave_rows)
I'm using this in a Shiny app as part of a reduce and it's a little slow. I also don't find it a very tidy or functional approach to the problem. I haven't been able to wrap my head around how to do this without the loop and the reassignment of variables, but I suspect doing so would be faster. Is there a better approach?

I think the problem with your function is that it inserts one row at a time into the data frame. It would be better to create interleaving indices, call rbind once, and subset by those indices.
This function works by effectively calculating the quantile of the row number within each data frame, then finding the order of the quantiles:
interleave_rows <- function(df_a, df_b)
{
  if (nrow(df_b) > nrow(df_a)) return(interleave_rows(df_b, df_a))
  a <- seq(nrow(df_a))
  b <- seq(nrow(df_b))
  rbind(df_a, df_b)[order(c(a, length(a) * b / (length(b) + 1))), ]
}
You can see how this works clearly with these two data frames:
df_a <- data.frame(came_from = rep("A", 10), value = 1:10)
df_b <- data.frame(came_from = rep("B", 4), value = 101:104)
interleave_rows(df_a, df_b)
#> came_from value
#> 1 A 1
#> 2 A 2
#> 11 B 101
#> 3 A 3
#> 4 A 4
#> 12 B 102
#> 5 A 5
#> 6 A 6
#> 13 B 103
#> 7 A 7
#> 8 A 8
#> 14 B 104
#> 9 A 9
#> 10 A 10
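To make the index arithmetic explicit, here is the ordering vector computed for these two data frames (a quick check of my own, not part of the original answer):
a <- seq(10)                              # row positions within df_a
b <- seq(4)                               # row positions within df_b
pos_b <- length(a) * b / (length(b) + 1)  # 2 4 6 8: target positions for the df_b rows
order(c(a, pos_b))
#>  [1]  1  2 11  3  4 12  5  6 13  7  8 14  9 10
Rows 11-14 (the df_b rows after rbind) are slotted in at evenly spaced points among rows 1-10.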
And on your own data you would get:
l <- list(
  a = tibble(n = 1:3, l = letters[1:3]),
  b = tibble(n = 4:9, l = letters[4:9]),
  c = tibble(n = 10:11, l = letters[10:11])
)
reduce(l, interleave_rows)
#> # A tibble: 11 x 2
#> n l
#> <int> <chr>
#> 1 4 d
#> 2 1 a
#> 3 5 e
#> 4 10 j
#> 5 6 f
#> 6 2 b
#> 7 7 g
#> 8 11 k
#> 9 3 c
#> 10 8 h
#> 11 9 i
In terms of timing, even on small data frames this is over 10 times faster than the original. I suspect the difference would be more marked on larger data frames:
microbenchmark::microbenchmark(reduce(l, interleave_rows),
                               reduce(l, interleave_rows_OP))
#> Unit: milliseconds
#>                           expr     min       lq      mean   median       uq     max neval
#>    reduce(l, interleave_rows)  2.6741  2.94680  3.610404  3.05115  3.22800 21.5097   100
#> reduce(l, interleave_rows_OP) 36.2170 37.82645 40.005754 38.90145 40.03415 57.3965   100

From Allan Cameron's excellent answer I was able to do exactly what I wanted. I'm adding this answer just for reference on how to do this in the tidyverse.
interleave_rows_tidy <- function(df_a, df_b) {
  if (nrow(df_b) > nrow(df_a)) return(interleave_rows_tidy(df_b, df_a))
  a <- df_a %>% nrow() %>% seq()
  b <- df_b %>% nrow() %>% seq()
  bind_rows(df_a, df_b) %>% arrange(c(a, length(a) * b / (length(b) + 1)))
}
The key feature for me was how to calculate the sequence. In case anyone is wondering, here are the microbenchmarks.
> microbenchmark::microbenchmark(reduce(l, interleave_rows_tidy), reduce(l, interleave_rows_SO))
Unit: microseconds
expr min lq mean median uq max neval
reduce(l, interleave_rows_tidy) 852.904 1088.5170 2586.924 1742.8185 4013.212 7401.947 100
reduce(l, interleave_rows_SO) 504.500 636.9975 1251.016 769.7465 1357.512 4738.728 100
It looks like the tidy version is a little slower on this test data. Both of these are much faster than my original loop that added the rows one-by-one.

Related

Append first row by group to original data

I have data with a grouping variable "ID" and some values:
ID, Value
1, 1
1, 2
1, 3
1, 4
2, 5
2, 6
2, 7
2, 8
Within each group, I want to append the first row after the last row.
Desired result:
ID, Value
1, 1
1, 2
1, 3
1, 4
1, 1 # First row of ID 1 inserted as last row in the group
2, 5
2, 6
2, 7
2, 8
2, 5 # First row of ID 2 inserted as last row in the group
I have 5000 rows of this.
Within tidyverse, you could use add_row with do (now deprecated) or group_modify (experimental):
dat |>
  group_by(ID) |>
  do(add_row(., ID = unique(.$ID), Value = first(.$Value))) |>
  ungroup()

dat |>
  group_by(ID) |>
  group_modify(~ add_row(., Value = first(.$Value))) |>
  ungroup()
Or bind_rows with summarize (my variation of #Gregor Thomas, thanks):
dat |>
  group_by(ID) |>
  summarize(bind_rows(cur_data(), head(cur_data(), 1))) |>
  ungroup()
Or by applying the same logic as #Henrik, using bind_rows, filter, and arrange:
dat |>
  bind_rows(dat |> filter(!duplicated(ID))) |>
  arrange(ID)
Output:
# A tibble: 10 × 2
ID Value
<int> <dbl>
1 1 1
2 1 2
3 1 3
4 1 4
5 1 1
6 2 5
7 2 6
8 2 7
9 2 8
10 2 5
Thanks to #SamR for the data.
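A compact alternative that is not in the answer above (my addition) indexes rows 1..n and then row 1 again within each group using dplyr::slice():
dat |>
  group_by(ID) |>
  slice(c(seq_len(n()), 1L)) |>
  ungroup()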
And with data.table:
library(data.table)
dt <- data.table(ID = rep(1:2, each = 4), Value = 1:8)
dt[,.(Value = c(Value, first(Value))), ID]
#> ID Value
#> 1: 1 1
#> 2: 1 2
#> 3: 1 3
#> 4: 1 4
#> 5: 1 1
#> 6: 2 5
#> 7: 2 6
#> 8: 2 7
#> 9: 2 8
#> 10: 2 5
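The c(Value, first(Value)) form handles a single value column; with several columns, the same idea (a sketch of mine, not from the original answer) can index .SD instead:
dt[, .SD[c(seq_len(.N), 1L)], by = ID]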
Benchmarking with a 5000-row table:
library(dplyr)
dt <- data.table(ID = rep(1:1250, each = 4), Value = 1:5e3)
f1 <- function(dt) dt[,.(Value = c(Value, first(Value))), ID]
# base R
f2 <- function(dt) do.call(rbind, lapply(split(dt, by = "ID"), function(x) rbind(x, x[1,])))
# tidyverse
f3 <- function(dt) {
  dt %>%
    group_by(ID) %>%
    do(add_row(., ID = unique(.$ID), Value = first(.$Value))) %>%
    ungroup()
}
f4 <- function(dt) {
  dt %>%
    group_by(ID) %>%
    group_modify(~ add_row(., Value = first(.$Value))) %>%
    ungroup()
}
microbenchmark::microbenchmark(data.table = f1(dt),
                               "base R" = f2(dt),
                               tidyverse1 = f3(dt),
                               tidyverse2 = f4(dt),
                               times = 10)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> data.table 3.4989 3.6844 4.16619 4.3085 4.4623 5.3020 10
#> base R 245.1397 263.1636 283.61131 284.8053 307.7105 310.3901 10
#> tidyverse1 761.7097 773.3705 791.05115 787.9463 808.5416 821.5321 10
#> tidyverse2 711.9593 716.4959 752.20273 728.2170 782.6474 837.1926 10
If speed is really important, this simple Rcpp function provides a very fast solution:
Rcpp::cppFunction(
  "IntegerVector firstLast(const IntegerVector& x) {
    // x is the (contiguously grouped) ID column; returns 1-based row indices
    // consisting of each group's rows followed by its first row again
    const int n = x.size();
    IntegerVector idxOut(2 * n);   // upper bound on the output length
    int i0 = 1;                    // index of the current group's first row
    int idx = 0;
    idxOut(0) = 1;
    for (int i = 1; i < n; i++) {
      if (x(i) != x(i - 1)) {      // group boundary: emit previous group's first row
        idxOut(++idx) = i0;
        i0 = i + 1;
      }
      idxOut(++idx) = i + 1;
    }
    idxOut(++idx) = i0;            // close the last group
    return idxOut[Rcpp::Range(0, idx)];
  }"
)
Benchmarking against the fastest solution from this answer (on a much larger dataset):
dt = data.table(ID = rep(1:125e4, each = 4), Value = 1:5e6)
microbenchmark::microbenchmark(
f_uniq_dt = setorder(rbindlist(list(dt, unique(dt, by = "ID"))), ID),
f_Rcpp = dt[firstLast(dt$ID)],
check = "equal"
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> f_uniq_dt 78.6056 83.71345 95.42876 85.80720 90.03685 175.8867 100
#> f_Rcpp 49.1485 53.38275 60.96322 55.44925 58.01485 121.3637 100
Use !duplicated to get the first row by "ID" (more efficient than "by group" operations), rbind with the original data, and order the result:
df = data.frame(ID = rep(1:2, each = 4), Value = 1:8)
d2 = rbind(df, df[!duplicated(df$ID), ])
d2[order(d2$ID), ]
# ID Value
# 1 1 1
# 2 1 2
# 3 1 3
# 4 1 4
# 11 1 1
# 5 2 5
# 6 2 6
# 7 2 7
# 8 2 8
# 51 2 5
Same idea with data.table::duplicated:
d = as.data.table(df)
d2 = rbindlist(list(d, d[!duplicated(d, by = "ID")]))
setorder(d2, ID)
More straightforward with data.table::unique:
d2 = rbindlist(list(d, unique(d, by = "ID")))
setorder(d2, ID)
Also data.table::rowid:
d2 = rbindlist(list(d, d[rowid(ID) == 1]))
setorder(d2, ID)
By avoiding "by group" operations, the !duplicated, unique and rowid alternatives are all faster than the by-based data.table solution (the clear winner in the previous benchmark) on a 5000-row data set (see OP):
df = data.frame(ID = rep(1:1250, each = 4), Value = 1:5e3)
d = as.data.table(df)
library(microbenchmark)
microbenchmark(
  f_by = {
    d1 = d[, .(Value = c(Value, first(Value))), by = ID]
  },
  f_dupl_df = {
    d2 = rbind(df, df[!duplicated(df$ID), ])
    d2 = d2[order(d2$ID), ]
  },
  f_dupl_dt = {
    d3 = rbindlist(list(d, d[!duplicated(d, by = "ID")]))
    setorder(d3, ID)
  },
  f_uniq_dt = {
    d4 = rbindlist(list(d, unique(d, by = "ID")))
    setorder(d4, ID)
  },
  f_rowid = {
    d5 = rbindlist(list(d, d[rowid(ID) == 1]))
    setorder(d5, ID)
  },
  times = 10L)
# Unit: milliseconds
# expr min lq mean median uq max
# f_by 8.5167 9.1397 11.01410 9.90925 12.3327 15.9134
# f_dupl_df 6.8337 7.0901 8.31057 7.56810 8.4899 13.9278
# f_dupl_dt 2.4742 2.6687 3.24932 3.18670 3.7993 4.3318
# f_uniq_dt 2.2059 2.4225 3.50756 3.36250 4.4590 5.6632
# f_rowid 2.2963 2.4295 3.43876 2.74345 4.8035 5.9278
all.equal(d1, as.data.table(d2))
all.equal(d1, d3)
all.equal(d1, d4)
all.equal(d1, d5)
# [1] TRUE
However, a sub-second benchmark isn't very informative, so try it on larger data with many groups. The base solution loses ground, while the data.table::duplicated, unique and rowid alternatives scale better and are now about 20 times faster than the by solution, with data.table::unique the fastest.
df = data.frame(ID = rep(1:1250000, each = 4), Value = 1:5e6)
d = as.data.table(df)
# Unit: milliseconds
# expr min lq mean median uq max neval
# f_by 6834.5959 7157.1686 12273.2399 7775.3919 8850.5324 35339.0262 10
# f_dupl_df 10732.1536 11035.4886 19440.4964 11691.5347 37956.6961 38387.4927 10
# f_dupl_dt 174.5640 183.8399 391.8605 381.8920 401.4929 962.4948 10
# f_uniq_dt 156.1267 161.9555 212.3472 180.7912 209.3905 406.7780 10
# f_rowid 192.1106 197.1564 380.0023 234.5851 474.5024 1172.6529 10
For completeness, a binary search, with mult = "first" to select the first match:
d[.(unique(ID)), on = .(ID), mult = "first"]
However, in both timings above it took about twice as long as the unique alternative.
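Plugged into the same rbindlist/setorder pattern, it would look something like this (my sketch, not benchmarked; I use unique(d$ID) explicitly so the lookup vector is unambiguous):
d2 = rbindlist(list(d, d[.(unique(d$ID)), on = .(ID), mult = "first"]))
setorder(d2, ID)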
Here is a base R method:
do.call(rbind,
lapply(split(dat, dat$ID), \(id_df) rbind(id_df, id_df[1,]))
)
# ID Value
# 1.1 1 1
# 1.2 1 2
# 1.3 1 3
# 1.4 1 4
# 1.5 1 1
# 2.5 2 5
# 2.6 2 6
# 2.7 2 7
# 2.8 2 8
# 2.51 2 5
It does give you slightly strange row names - if you care about that you can wrap it in (or pipe it to) tibble::as_tibble(), which removes row names altogether.
Alternatively you could do data.table::rbindlist(lapply(split(dat, dat$ID), \(id_df) rbind(id_df, id_df[1,]))), because data.table also does not use row names.
Data
dat <- read.csv(text = "ID, Value
1, 1
1, 2
1, 3
1, 4
2, 5
2, 6
2, 7
2, 8", h=T)
You can try the following data.table option
> setDT(df)[, .SD[(seq(1 + .N) - 1) %% .N + 1], ID]
ID Value
1: 1 1
2: 1 2
3: 1 3
4: 1 4
5: 1 1
6: 2 5
7: 2 6
8: 2 7
9: 2 8
10: 2 5
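The index expression is easier to see in isolation: for a group with .N = 4 it produces the row numbers 1..4 followed by 1 again (a quick check of my own):
(seq(1 + 4) - 1) %% 4 + 1
# [1] 1 2 3 4 1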

What's a tidyverse approach to iterating over rows in a data frame when vectorisation is not feasible?

I want to know the best way to iterate over rows of a data frame when the value of a variable at row n depends on the value of variable(s) at row n-1 and/or n-2. Ideally I would like to do this in a "tidyverse" way, perhaps with purrr::pmap().
For example, say I have this data frame:
library(dplyr)
x <- tibble(t = c(1:10),
            a = c(seq(100, 140, 10), rep(NA_real_, 5)),
            b = c(runif(5), rep(NA_real_, 5)),
            c = c(runif(5), rep(NA_real_, 5)))
x
#> # A tibble: 10 x 4
#> t a b c
#> <int> <dbl> <dbl> <dbl>
#> 1 1 100 0.750 0.900
#> 2 2 110 0.898 0.657
#> 3 3 120 0.731 0.000137
#> 4 4 130 0.208 0.696
#> 5 5 140 0.670 0.882
#> 6 6 NA NA NA
#> 7 7 NA NA NA
#> 8 8 NA NA NA
#> 9 9 NA NA NA
#> 10 10 NA NA NA
I have known values up to time (t) = 5. Beyond that, I wish to project values, using the following formulae:
a = lag(a) * 1.1
b = a * lag(b)
c = b * lag(a, 2)
This code achieves the desired output, but it's a clunky, horrible for loop that scales poorly to larger datasets:
for (i in 1:nrow(x)) {
  x <- x %>%
    mutate(a = if_else(!is.na(a), a, lag(a, 1) * 1.1),
           b = if_else(!is.na(b), b, a * lag(b, 1)),
           c = if_else(!is.na(c), c, b * lag(a, 2)))
}
x
#> # A tibble: 10 x 4
#> t a b c
#> <int> <dbl> <dbl> <dbl>
#> 1 1 100 7.50e- 1 9.00e- 1
#> 2 2 110 8.98e- 1 6.57e- 1
#> 3 3 120 7.31e- 1 1.37e- 4
#> 4 4 130 2.08e- 1 6.96e- 1
#> 5 5 140 6.70e- 1 8.82e- 1
#> 6 6 154 1.03e+ 2 1.34e+ 4
#> 7 7 169. 1.75e+ 4 2.45e+ 6
#> 8 8 186. 3.26e+ 6 5.02e+ 8
#> 9 9 205. 6.68e+ 8 1.13e+11
#> 10 10 225. 1.51e+11 2.80e+13
I think that for this sort of intrinsically iterative process it is genuinely hard to beat a for loop. The method proposed by #Shree depends on the NAs being continuous and starting in a known spot.
Here is my mild improvement on your loop, which I think is more readable, runs at about 2.5 times the speed, and will probably scale better than your approach, which mixes vectorised operations with the loop. By moving out of the tidyverse altogether and using a row-wise loop that really works on one row at a time, we gain on both counts:
method_peter <- function(x){
  for (i in 2:nrow(x)) {
    x[i, "a"] <- ifelse(is.na(x[i, "a"]), x[i - 1, "a"] * 1.1, x[i, "a"])
    x[i, "b"] <- ifelse(is.na(x[i, "b"]), x[i, "a"] * x[i - 1, "b"], x[i, "b"])
    x[i, "c"] <- ifelse(is.na(x[i, "c"]), x[i, "b"] * x[i - 2, "a"], x[i, "c"])
  }
  return(x)
}
There are doubtless more efficiencies possible, and of course this is an ideal candidate for rewriting in C++ :).
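To illustrate that C++ idea, here is a minimal Rcpp sketch of the same recurrence (my addition, not benchmarked; it assumes the columns are passed as numeric vectors and that at least the first two rows are observed):
Rcpp::cppFunction('
DataFrame fill_forward(NumericVector a, NumericVector b, NumericVector c) {
  // copy so the original columns are not modified in place
  NumericVector a2 = clone(a), b2 = clone(b), c2 = clone(c);
  int n = a2.size();
  for (int i = 2; i < n; i++) {
    if (NumericVector::is_na(a2[i])) a2[i] = a2[i - 1] * 1.1;
    if (NumericVector::is_na(b2[i])) b2[i] = a2[i] * b2[i - 1];
    if (NumericVector::is_na(c2[i])) c2[i] = b2[i] * a2[i - 2];
  }
  return DataFrame::create(_["a"] = a2, _["b"] = b2, _["c"] = c2);
}')
# usage sketch: cbind(x["t"], fill_forward(x$a, x$b, x$c))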
The R loop (method_peter) is about twice as fast as your method, as seen here:
method_matt <- function(x){
  for (i in 1:nrow(x)) {
    x <- x %>%
      mutate(a = if_else(!is.na(a), a, lag(a, 1) * 1.1),
             b = if_else(!is.na(b), b, a * lag(b, 1)),
             c = if_else(!is.na(c), c, b * lag(a, 2)))
  }
  return(x)
}
set.seed(123)
x <- tibble(t = c(1:10),
            a = c(seq(100, 140, 10), rep(NA_real_, 5)),
            b = c(runif(5), rep(NA_real_, 5)),
            c = c(runif(5), rep(NA_real_, 5)))
stopifnot(identical(method_matt(x), method_peter(x)))
library(microbenchmark)
microbenchmark(
  method_matt(x),
  method_peter(x)
)
which returns:
Unit: milliseconds
expr min lq mean median uq max neval
method_matt(x) 24.1975 25.50925 30.64438 26.33310 31.8681 74.5093 100
method_peter(x) 10.0005 10.56050 13.33751 11.06495 13.5913 42.0568 100
#Shree's method is much faster again and is ideal for the example data, but I'm not sure it is flexible enough to work in all your use cases.
I would like to see a tidyverse solution if there is one.
Edit: Added tidyverse approach
Here's a readable and flexible tidyverse approach. The downside is that it is very slow.
accumutate <- function(df, ...){
  df %>%
    group_by(row_number()) %>%
    nest() %>%
    pull(data) %>%
    accumulate(function(x, y) { bind_rows(x, y) %>% mutate(!!!enquos(...)) }) %>%
    .[[length(.)]]
}
x %>%
  accumutate(a = ifelse(is.na(a), 1.1 * lag(a, 1), a)) %>%
  accumutate(b = ifelse(is.na(b), a * lag(b), b)) %>%
  accumutate(c = ifelse(is.na(c), b * lag(a, 2), c))
#> # A tibble: 10 x 4
#> t a b c
#> <int> <dbl> <dbl> <dbl>
#> 1 1 100 2.88e- 1 4.56e- 2
#> 2 2 110 7.88e- 1 5.28e- 1
#> 3 3 120 4.09e- 1 8.92e- 1
#> 4 4 130 8.83e- 1 5.51e- 1
#> 5 5 140 9.40e- 1 4.57e- 1
#> 6 6 154 1.45e+ 2 1.88e+ 4
#> 7 7 169. 2.45e+ 4 3.43e+ 6
#> 8 8 186. 4.57e+ 6 7.04e+ 8
#> 9 9 205. 9.37e+ 8 1.59e+11
#> 10 10 225. 2.11e+11 3.94e+13
Created on 2020-10-07 by the reprex package (v0.3.0)
Here's another approach that you might find interesting. It's not concise or especially readable, but it's tidyverse (or at least functionally) inspired. And it performs fairly well.
It uses a semigroup pattern, converting the mutate expressions into binary functions, creating corresponding lists and then using accumulate.
library(tidyverse)
library(dplyr)
library(microbenchmark)
options(width = 100)
set.seed(123)
# Create the data frame
x <- tibble(t = c(1:100),
            a = c(seq(100, 140, 10), rep(NA_real_, 100 - 5)),
            b = c(runif(5), rep(NA_real_, 100 - 5)),
            c = c(runif(5), rep(NA_real_, 100 - 5)))
a_mappend <- function(a1, a2) {
  ifelse(is.na(a2), a1 * 1.1, a2)
}
b_mappend <- function(ab1, ab2) {
  list(a = ab2$a, b = ifelse(is.na(ab2$b), ab2$a * ab1$b, ab2$b))
}
c_mappend <- function(abc12, abc23) {
  list(abc1 = list(a = abc12$abc2$a, b = abc12$abc2$b, c = abc12$abc2$c),
       abc2 = list(a = abc23$abc2$a, b = abc23$abc2$b,
                   c = ifelse(is.na(abc23$abc2$c), abc12$abc1$a * abc23$abc2$b, abc23$abc2$c)))
}
method_ian <- function(x) {
  x %>%
    mutate(a = accumulate(a, a_mappend)) %>%
    mutate(b = list(a, b) %>%
             pmap(~ list(a = .x, b = .y)) %>%
             accumulate(b_mappend) %>% map_dbl(~ .x$b)) %>%
    mutate(c = list(a, b, c, c(a[-1], NA), c(b[-1], NA), c(c[-1], NA)) %>%
             pmap(~ list(abc1 = list(a = ..1, b = ..2, c = ..3),
                         abc2 = list(a = ..4, b = ..5, c = ..6))) %>%
             accumulate(c_mappend) %>% map_dbl(~ .x$abc1$c))
}
method_matt <- function(x){
  for (i in 1:nrow(x)) {
    x <- x %>%
      mutate(a = if_else(!is.na(a), a, lag(a, 1) * 1.1),
             b = if_else(!is.na(b), b, a * lag(b, 1)),
             c = if_else(!is.na(c), c, b * lag(a, 2)))
  }
  return(x)
}
method_peter <- function(x){
  for (i in 2:nrow(x)) {
    x[i, "a"] <- ifelse(is.na(x[i, "a"]), x[i - 1, "a"] * 1.1, x[i, "a"])
    x[i, "b"] <- ifelse(is.na(x[i, "b"]), x[i, "a"] * x[i - 1, "b"], x[i, "b"])
    x[i, "c"] <- ifelse(is.na(x[i, "c"]), x[i, "b"] * x[i - 2, "a"], x[i, "c"])
  }
  return(x)
}
stopifnot(identical(method_matt(x), method_ian(x)))
microbenchmark( method_matt(x), method_peter(x), method_ian(x))
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> method_matt(x) 324.90086 330.93192 337.46518 334.55447 338.38461 426.30457 100
#> method_peter(x) 208.27498 211.60526 213.59438 212.66088 214.36421 242.59854 100
#> method_ian(x) 13.06774 13.43105 14.30003 13.86428 14.32263 19.54843 100
Created on 2020-10-06 by the reprex package (v0.3.0)
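The accumulate step is easiest to see on the a column alone: each observed value is kept, and each NA is replaced by 1.1 times the running value (a quick illustration of my own, using a_mappend() as defined above, with purrr loaded via library(tidyverse)):
accumulate(c(100, 110, NA, NA), a_mappend)
#> [1] 100.0 110.0 121.0 133.1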
I don't think there's any simple way in the tidyverse to do calculations with row dependencies. Something with Reduce or gather + spread could be possible, but I don't expect them to score points on readability.
Anyways, on the bright side, your calculations are vectorizable using the dplyr and zoo packages -
library(zoo)  # for na.locf()
x %>%
  mutate(
    a = ifelse(is.na(a), na.locf(a) * 1.1^(t - 5), a),
    b = ifelse(is.na(b), na.locf(b) * c(rep(1, 5), cumprod(a[6:n()])), b),
    c = ifelse(is.na(c), b * lag(a, 2), c)
  )
# A tibble: 10 x 4
t a b c
<int> <dbl> <dbl> <dbl>
1 1 100 1.85e- 1 9.43e- 1
2 2 110 7.02e- 1 1.29e- 1
3 3 120 5.73e- 1 8.33e- 1
4 4 130 1.68e- 1 4.68e- 1
5 5 140 9.44e- 1 5.50e- 1
6 6 154 1.45e+ 2 1.89e+ 4
7 7 169. 2.46e+ 4 3.45e+ 6
8 8 186. 4.59e+ 6 7.07e+ 8
9 9 205. 9.40e+ 8 1.59e+11
10 10 225. 2.12e+11 3.95e+13
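The cumprod term works because the recurrence b[n] = a[n] * b[n-1] unrolls to b[n] = b[5] * prod(a[6:n]); a quick sanity check with made-up numbers (my own, purely illustrative):
a_proj <- c(154, 169.4, 186.34)  # hypothetical projected a values from t = 6 on
b5     <- 0.944                  # hypothetical last observed b
b_rec  <- Reduce(function(prev, a_i) a_i * prev, a_proj, init = b5, accumulate = TRUE)[-1]
b_vec  <- b5 * cumprod(a_proj)   # closed form used in the answer
all.equal(b_rec, b_vec)          # TRUE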
Data -
set.seed(2)
x <- tibble(t = c(1:10),
            a = c(seq(100, 140, 10), rep(NA_real_, 5)),
            b = c(runif(5), rep(NA_real_, 5)),
            c = c(runif(5), rep(NA_real_, 5)))

Finding the first time a value shows up in a list efficiently

I was trying to solve a problem where there was a long list that had a variable number of values at each index. The goal was to say what was the earliest index at which every number appeared. So if 15 shows up at indices 45 and 78 in the list, then I should return that 15 is located first at index 45. In the original problem the list had length 10,000, so doing this fast was helpful.
Originally I tried to work with the existing list structure and did something like this, which at 10,000 entries was very slow.
set.seed(1)
x <- replicate(100, sample(100, sample(10, 1)))
cbind(value = 1:100,
      index = sapply(1:100, function(i) which.max(sapply(x, function(x) i %in% x))))
Eventually I tried converting the data into a data.table, which worked much better, but I always wondered if there was a better way to go about solving the problem. Was the default list structure inherently inefficient, or was there a better way I could have worked with it?
set.seed(1)
x <- replicate(100, sample(100, sample(10, 1)))
dt <- data.table(index = rep(1:100, sapply(x, length)), value = unlist(x))
dt[,.(index = first(index)),value][order(value)]
Here's the full dataset from the original problem if that's helpful.
library(RcppAlgos)
library(memoise)
library(data.table)
jgo <- function(n) {
  if (isPrimeRcpp(n) | n == 1) return(n)
  div <- divisorsRcpp(n)
  div <- div[-c(1, length(div))]
  div <- Map(function(a, b) c(a, b), div, rev(div))
  div2 <- lapply(div, function(x) lapply(jgo(x[1]), c, x[2]))
  unique(lapply(c(div, unlist(div2, recursive = FALSE)), sort))
}
jgo <- memoise(jgo)
x <- lapply(1:12500, function(x) x - sapply(jgo(x), sum) + sapply(jgo(x), length))
Here is another approach that uses match to find the first indices. It slightly outperforms the other suggested approaches and produces output similar to that in the OP's question:
## dummy data
set.seed(1)
x <- replicate(100, sample(100, sample(10, 1)))
## use match to find first indices
first_indices_match <- function(x) {
  seq_x <- 1:length(x)
  matrix(c(seq_x, rep(seq_x, lengths(x))[match(seq_x, unlist(x))]),
         ncol = 2, dimnames = list(NULL, c("value", "index")))
}
head(first_indices_match(x))
#> value index
#> [1,] 1 1
#> [2,] 2 7
#> [3,] 3 45
#> [4,] 4 38
#> [5,] 5 31
#> [6,] 6 7
## data.table approach
library(data.table)
first_indices_dt <- function(x) {
  dt <- data.table(index = rep(seq_along(x), sapply(x, length)), value = unlist(x))
  dt[, .(index = first(index)), value][order(value)]
}
head(first_indices_dt(x))
#> value index
#> 1: 1 1
#> 2: 2 7
#> 3: 3 45
#> 4: 4 38
#> 5: 5 31
#> 6: 6 7
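To see why the match() trick works, a tiny example of my own (not from the benchmark data): rep() labels each flattened element with its list index, and match() finds the first flat position of each value:
x_small <- list(c(2, 3), c(1, 3), c(2))
flat  <- unlist(x_small)                            # 2 3 1 3 2
owner <- rep(seq_along(x_small), lengths(x_small))  # 1 1 2 2 3
owner[match(1:3, flat)]
#> [1] 2 1 1   # values 1, 2, 3 first appear in list elements 2, 1, 1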
Benchmarks
## stack + remove duplicate approach
first_indices_shree <- function(x) {
  names(x) <- seq_len(length(x))
  (d <- stack(x))[!duplicated(d$values), ]
}
## benchmarks several list sizes
bnch <- bench::press(
  n_size = c(100, 1E3, 1E4),
  {
    x <- replicate(n_size, sample(n_size, sample(10, 1)))
    bench::mark(
      match = first_indices_match(x),
      shree = first_indices_shree(x),
      dt = first_indices_dt(x),
      check = FALSE
    )
  }
)
#> # A tibble: 9 x 7
#> expression n_size min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <dbl> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 match 100 18.17µs 21.2µs 45639. 637.3KB 27.4
#> 2 shree 100 361.88µs 411.06µs 2307. 106.68KB 11.1
#> 3 dt 100 759.17µs 898.26µs 936. 264.58KB 8.51
#> 4 match 1000 158.34µs 169.9µs 5293. 164.15KB 30.8
#> 5 shree 1000 1.54ms 1.71ms 567. 412.52KB 13.2
#> 6 dt 1000 1.19ms 1.4ms 695. 372.13KB 10.7
#> 7 match 10000 3.09ms 3.69ms 255. 1.47MB 15.9
#> 8 shree 10000 18.06ms 18.95ms 51.5 4.07MB 12.9
#> 9 dt 10000 5.65ms 6.33ms 149. 2.79MB 20.5
You can simply stack the list into a dataframe and remove duplicated values. This will give you the first index for all values in the list.
set.seed(1)
x <- replicate(100, sample(100, sample(10, 1)))
names(x) <- seq_along(x)
first_indices <- (d <- stack(x))[!duplicated(d$values), ]
head(first_indices)
values ind
1 38 1
2 57 1
3 90 1
5 94 2
6 65 2
7 7 3
You can now look up the index for any value you want using %in% -
subset(first_indices, values %in% c(37,48))
values ind
11 37 3
40 48 8
Benchmarks -
set.seed(1)
x <- replicate(1000, sample(1000, sample(10, 1)))
microbenchmark::microbenchmark(
  Shree = first_indices(x),
  JamesB = cbind(value = 1:1000,
                 index = sapply(1:1000, function(i) which.max(sapply(x, function(x) i %in% x))))
)
Unit: milliseconds
expr min lq mean median uq max neval
Shree 2.3488 2.74855 4.171323 3.0577 4.7752 17.0743 100
JamesB 1750.4806 1904.79150 2519.912936 1994.9814 3282.5957 5966.1011 100

Counting amount of zeros within a "melted" data frame

Hi, I'm learning R and I'm trying to count how many zeros I have within melted data. So, I want to know how many zeros correspond to columns b and c and print the two results out.
I generated an example:
library(reshape)
library(plyr)
library(dplyr)
id = c(1,2,3,4,5,6,7,8,9,10)
b = c(0,0,5,6,3,7,2,8,1,8)
c = c(0,4,9,87,0,87,0,4,5,0)
test = data.frame(id,b,c)
test_melt = melt(test, id.vars = "id")
test_melt
I imagine I should create an if statement for that, something like
if (test$value == 0) { print() }, but how can I tell R to count zeros for columns that have been melted?
With your data:
test_melt %>%
  group_by(variable) %>%
  summarize(zeroes = sum(value == 0))
# # A tibble: 2 x 2
# variable zeroes
# <fctr> <int>
# 1 b 2
# 2 c 4
Base R:
aggregate(test_melt$value, by = list(variable = test_melt$variable),
          FUN = function(x) sum(x == 0))
# variable x
# 1 b 2
# 2 c 4
... and for curiosity:
library(microbenchmark)
microbenchmark(
  dplyr = group_by(test_melt, variable) %>% summarize(zeroes = sum(value == 0)),
  base1 = aggregate(test_melt$value, by = list(variable = test_melt$variable), FUN = function(x) sum(x == 0)),
  # #PankajKaundal's suggested "formula" notation reads easier
  base2 = aggregate(value ~ variable, test_melt, function(x) sum(x == 0))
)
# Unit: microseconds
# expr min lq mean median uq max neval
# dplyr 916.421 986.985 1069.7000 1022.1760 1094.7460 2272.636 100
# base1 647.658 682.302 783.2065 715.3045 765.9940 1905.411 100
# base2 813.219 867.737 950.3247 897.0930 959.8175 2017.001 100
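Another compact base option (my addition, not benchmarked above) is tapply() on the logical vector:
with(test_melt, tapply(value == 0, variable, sum))
# b c
# 2 4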
sum(test_melt$value==0)
This should do it.
This might help. Is this what you're looking for?
> test_melt[4] <- 1
> test_melt2 <- aggregate(V4 ~ value + variable, test_melt, sum)
> test_melt2
value variable V4
1 0 b 2
2 1 b 1
3 2 b 1
4 3 b 1
5 5 b 1
6 6 b 1
7 7 b 1
8 8 b 2
9 0 c 4
10 4 c 2
11 5 c 1
12 9 c 1
13 87 c 2
V4 is the count

dplyr: max value in a group, excluding the value in each row?

I have a data frame that looks as follows:
> df <- data_frame(g = c('A', 'A', 'B', 'B', 'B', 'C'), x = c(7, 3, 5, 9, 2, 4))
> df
Source: local data frame [6 x 2]
g x
1 A 7
2 A 3
3 B 5
4 B 9
5 B 2
6 C 4
I know how to add a column with the maximum x value for each group g:
> df %>% group_by(g) %>% mutate(x_max = max(x))
Source: local data frame [6 x 3]
Groups: g
g x x_max
1 A 7 7
2 A 3 7
3 B 5 9
4 B 9 9
5 B 2 9
6 C 4 4
But what I would like is to get is the maximum x value for each group g, excluding the x value in each row.
For the given example, the desired output would look like this:
Source: local data frame [6 x 3]
Groups: g
g x x_max x_max_exclude
1 A 7 7 3
2 A 3 7 7
3 B 5 9 9
4 B 9 9 5
5 B 2 9 9
6 C 4 4 NA
I thought I might be able to use row_number() to remove particular elements and take the max of what remained, but hit warning messages and got incorrect -Inf output:
> df %>% group_by(g) %>% mutate(x_max = max(x), r = row_number(), x_max_exclude = max(x[-r]))
Source: local data frame [6 x 5]
Groups: g
g x x_max r x_max_exclude
1 A 7 7 1 -Inf
2 A 3 7 2 -Inf
3 B 5 9 1 -Inf
4 B 9 9 2 -Inf
5 B 2 9 3 -Inf
6 C 4 4 1 -Inf
Warning messages:
1: In max(c(4, 9, 2)[-1:3]) :
no non-missing arguments to max; returning -Inf
2: In max(c(4, 9, 2)[-1:3]) :
no non-missing arguments to max; returning -Inf
3: In max(c(4, 9, 2)[-1:3]) :
no non-missing arguments to max; returning -Inf
What is the most {readable, concise, efficient} way to get this output in dplyr? Any insight into why my attempt using row_number() doesn't work would also be much appreciated. Thanks for the help.
You could try:
df %>%
  group_by(g) %>%
  arrange(desc(x)) %>%
  mutate(max = ifelse(x == max(x), x[2], max(x)))
Which gives:
#Source: local data frame [6 x 3]
#Groups: g
#
# g x max
#1 A 7 3
#2 A 3 7
#3 B 9 5
#4 B 5 9
#5 B 2 9
#6 C 4 NA
Benchmark
I've tried the solutions so far on the benchmark:
df <- data.frame(g = sample(LETTERS, 10e5, replace = TRUE),
                 x = sample(1:10, 10e5, replace = TRUE))
library(microbenchmark)
mbm <- microbenchmark(
  steven = df %>%
    group_by(g) %>%
    arrange(desc(x)) %>%
    mutate(max = ifelse(x == max(x), x[2], max(x))),
  eric = df %>%
    group_by(g) %>%
    mutate(x_max = max(x),
           x_max2 = sort(x, decreasing = TRUE)[2],
           x_max_exclude = ifelse(x == x_max, x_max2, x_max)) %>%
    select(-x_max2),
  arun = setDT(df)[order(x), x_max_exclude := c(rep(x[.N], .N-1L), x[.N-1L]), by = g],
  times = 50
)
#Arun's data.table solution is the fastest:
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# steven 158.58083 163.82669 197.28946 210.54179 212.1517 260.1448 50 b
# eric 223.37877 228.98313 262.01623 274.74702 277.1431 284.5170 50 c
# arun 44.48639 46.17961 54.65824 47.74142 48.9884 102.3830 50 a
Interesting problem. Here's one way using data.table:
require(data.table)
setDT(df)[order(x), x_max_exclude := c(rep(x[.N], .N-1L), x[.N-1L]), by=g]
The idea is to order by column x and, on those indices, group by g. Since we have the ordered indices, for the first .N-1 rows the max value is the value at .N, and for the .N-th row it's the value at the (.N-1)-th row.
.N is a special variable that holds the number of observations in each group.
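For one group this is easy to check by hand (my illustration, not from the original answer): with group B ordered by x we have x = c(2, 5, 9) and .N = 3, so
x_ord <- c(2, 5, 9)
c(rep(x_ord[3], 3 - 1), x_ord[3 - 1])  # c(rep(x[.N], .N-1L), x[.N-1L]) for this group
# [1] 9 9 5
i.e. the maximum of the other rows for x = 2, 5 and 9 respectively, assigned back in x-order.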
I'll leave it to you and/or the dplyr experts to translate this (or answer with another approach).
This is the best I've come up with so far. Not sure if there's a better way.
df %>%
  group_by(g) %>%
  mutate(x_max = max(x),
         x_max2 = sort(x, decreasing = TRUE)[2],
         x_max_exclude = ifelse(x == x_max, x_max2, x_max)) %>%
  select(-x_max2)
Another way with a functional:
df %>% group_by(g) %>% mutate(x_max_exclude = max_exclude(x))
Source: local data frame [6 x 3]
Groups: g
g x x_max_exclude
1 A 7 3
2 A 3 7
3 B 5 9
4 B 9 5
5 B 2 9
6 C 4 NA
We write a function called max_exclude that does the operation that you describe.
max_exclude <- function(v) {
  res <- c()
  for (i in seq_along(v)) {
    res[i] <- suppressWarnings(max(v[-i]))
  }
  res <- ifelse(!is.finite(res), NA, res)
  as.numeric(res)
}
It works with base R too:
df$x_max_exclude <- with(df, ave(x, g, FUN=max_exclude))
Source: local data frame [6 x 3]
g x x_max_exclude
1 A 7 3
2 A 3 7
3 B 5 9
4 B 9 5
5 B 2 9
6 C 4 NA
Benchmark
Here's a lesson kids, beware of for loops!
big.df <- data.frame(g = rep(LETTERS[1:4], each = 1e3), x = sample(10, 4e3, replace = T))
microbenchmark(
  plafort_dplyr = big.df %>% group_by(g) %>% mutate(x_max_exclude = max_exclude(x)),
  plafort_ave = big.df$x_max_exclude <- with(big.df, ave(x, g, FUN = max_exclude)),
  StevenB = (big.df %>%
               group_by(g) %>%
               mutate(max = ifelse(row_number(desc(x)) == 1, x[row_number(desc(x)) == 2], max(x)))
  ),
  Eric = df %>%
    group_by(g) %>%
    mutate(x_max = max(x),
           x_max2 = sort(x, decreasing = TRUE)[2],
           x_max_exclude = ifelse(x == x_max, x_max2, x_max)) %>%
    select(-x_max2),
  Arun = setDT(df)[order(x), x_max_exclude := c(rep(x[.N], .N-1L), x[.N-1L]), by = g]
)
Unit: milliseconds
expr min lq mean median uq max neval
plafort_dplyr 75.219042 85.207442 89.247409 88.203225 90.627663 179.553166 100
plafort_ave 75.907798 84.604180 87.136122 86.961251 89.431884 104.884294 100
StevenB 4.436973 4.699226 5.207548 4.931484 5.364242 11.893306 100
Eric 7.233057 8.034092 8.921904 8.414720 9.060488 15.946281 100
Arun 1.789097 2.037235 2.410915 2.226988 2.423638 9.326272 100
